Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
26  vendor/ruvector/crates/rvf/rvf-import/Cargo.toml  (vendored, normal file)
@@ -0,0 +1,26 @@
[package]
name = "rvf-import"
version = "0.1.0"
edition = "2021"
description = "Import tools for migrating data from JSON, CSV, and NumPy formats into RVF stores"
license = "MIT OR Apache-2.0"
repository = "https://github.com/ruvnet/ruvector"
homepage = "https://github.com/ruvnet/ruvector"
readme = "README.md"
categories = ["database-implementations", "command-line-utilities"]
keywords = ["rvf", "import", "json", "csv", "numpy"]

[[bin]]
name = "rvf-import"
path = "src/bin/rvf_import.rs"

[dependencies]
rvf-runtime = { version = "0.2.0", path = "../rvf-runtime", features = ["std"] }
rvf-types = { version = "0.2.0", path = "../rvf-types", features = ["std"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
clap = { version = "4", features = ["derive"] }
csv = "1"

[dev-dependencies]
tempfile = "3"
46  vendor/ruvector/crates/rvf/rvf-import/README.md  (vendored, normal file)
@@ -0,0 +1,46 @@
# rvf-import

Data import tools for migrating vectors from JSON, CSV, and NumPy formats into RVF stores.

## What It Does

`rvf-import` provides both a library API and a CLI binary for importing vector data from common formats into `.rvf` files. It supports automatic ID generation, metadata extraction, and batch ingestion.

## Supported Formats

| Format | Extension | Features |
|--------|-----------|----------|
| **JSON** | `.json` | Array-of-objects or HNSW dump layout, optional per-record metadata |
| **CSV** | `.csv` | Index-based column mapping, optional header row, configurable delimiter |
| **TSV** | `.tsv` | Same as CSV, with a tab delimiter |
| **NumPy** | `.npy` | Direct binary array loading, auto-dimension detection |

## Library Usage

```rust
use rvf_import::json::parse_json_file;
use std::path::Path;

let records = parse_json_file(Path::new("vectors.json"))?;
```
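
The CSV importer takes an explicit config (fields as defined in `src/csv_import.rs`; a minimal sketch):

```rust
use rvf_import::csv_import::{parse_csv_file, CsvConfig};
use std::path::Path;

// id in column 0, vector components from column 1 on (the defaults)
let config = CsvConfig { dimension: Some(128), ..Default::default() };
let records = parse_csv_file(Path::new("vectors.csv"), &config)?;
```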

## CLI Usage

```bash
rvf-import --input data.npy --output vectors.rvf --format npy
rvf-import --input data.csv --output vectors.rvf --format csv --dimension 128
rvf-import --input data.json --output vectors.rvf --format json
```
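
TSV files use the same options with `--format tsv`:

```bash
rvf-import --input data.tsv --output vectors.rvf --format tsv --dimension 128
```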

## Tests

```bash
cargo test -p rvf-import
```

## License

MIT OR Apache-2.0
167  vendor/ruvector/crates/rvf/rvf-import/src/bin/rvf_import.rs  (vendored, normal file)
@@ -0,0 +1,167 @@
//! CLI binary for importing data into RVF stores.
//!
//! Usage examples:
//!   rvf-import --format json --input vectors.json --output data.rvf --dimension 384
//!   rvf-import --format csv --input data.csv --output data.rvf --id-column 0 --vector-start 1
//!   rvf-import --format npy --input embeddings.npy --output data.rvf

use clap::Parser;
use rvf_import::progress::StderrProgress;
use std::path::PathBuf;
use std::process;

#[derive(Parser)]
#[command(name = "rvf-import", about = "Import vectors into an RVF store")]
struct Cli {
    /// Input format: json, csv, tsv, or npy.
    #[arg(long)]
    format: String,

    /// Path to the input file.
    #[arg(long)]
    input: PathBuf,

    /// Path to the output .rvf file (will be created).
    #[arg(long)]
    output: PathBuf,

    /// Vector dimension. If omitted, it is inferred from the first parsed record.
    #[arg(long)]
    dimension: Option<u16>,

    /// (CSV) Column index for the vector ID (0-based, default 0).
    #[arg(long, default_value_t = 0)]
    id_column: usize,

    /// (CSV) Column index where vector components start (0-based, default 1).
    #[arg(long, default_value_t = 1)]
    vector_start: usize,

    /// (CSV) Disable header row detection.
    #[arg(long)]
    no_header: bool,

    /// (NPY) Starting ID for auto-assigned vector IDs.
    #[arg(long, default_value_t = 0)]
    start_id: u64,

    /// Batch size for ingestion (default 1000).
    #[arg(long, default_value_t = 1000)]
    batch_size: usize,

    /// Suppress progress output.
    #[arg(long)]
    quiet: bool,
}

fn main() {
    let cli = Cli::parse();

    let records = match cli.format.as_str() {
        "json" => match rvf_import::json::parse_json_file(&cli.input) {
            Ok(r) => r,
            Err(e) => {
                eprintln!("error: {e}");
                process::exit(1);
            }
        },
        "csv" => {
            let config = rvf_import::csv_import::CsvConfig {
                id_column: cli.id_column,
                vector_start: cli.vector_start,
                delimiter: b',',
                has_header: !cli.no_header,
                dimension: cli.dimension.map(|d| d as usize),
            };
            match rvf_import::csv_import::parse_csv_file(&cli.input, &config) {
                Ok(r) => r,
                Err(e) => {
                    eprintln!("error: {e}");
                    process::exit(1);
                }
            }
        }
        "tsv" => {
            let config = rvf_import::csv_import::CsvConfig {
                id_column: cli.id_column,
                vector_start: cli.vector_start,
                delimiter: b'\t',
                has_header: !cli.no_header,
                dimension: cli.dimension.map(|d| d as usize),
            };
            match rvf_import::csv_import::parse_csv_file(&cli.input, &config) {
                Ok(r) => r,
                Err(e) => {
                    eprintln!("error: {e}");
                    process::exit(1);
                }
            }
        }
        "npy" => {
            let config = rvf_import::numpy::NpyConfig {
                start_id: cli.start_id,
            };
            match rvf_import::numpy::parse_npy_file(&cli.input, &config) {
                Ok(r) => r,
                Err(e) => {
                    eprintln!("error: {e}");
                    process::exit(1);
                }
            }
        }
        other => {
            eprintln!("error: unknown format '{other}'. Use: json, csv, tsv, npy");
            process::exit(1);
        }
    };

    if records.is_empty() {
        eprintln!("warning: no records parsed from input file");
        process::exit(0);
    }

    // Determine dimension: prefer --dimension, otherwise infer from the first record.
    let dimension = match cli.dimension {
        Some(d) => d,
        None => {
            let inferred = records[0].vector.len() as u16;
            if inferred == 0 {
                eprintln!("error: cannot infer dimension (first vector is empty). Use --dimension");
                process::exit(1);
            }
            eprintln!("info: inferred dimension = {inferred} from first record");
            inferred
        }
    };

    let progress: Option<&dyn rvf_import::progress::ProgressReporter> = if cli.quiet {
        None
    } else {
        Some(&StderrProgress)
    };

    match rvf_import::import_to_new_store(
        &cli.output,
        dimension,
        &records,
        cli.batch_size,
        progress,
    ) {
        Ok(result) => {
            if !cli.quiet {
                eprintln!();
            }
            eprintln!(
                "done: imported {} vectors, rejected {}, in {} batches -> {}",
                result.total_imported,
                result.total_rejected,
                result.batches,
                cli.output.display()
            );
        }
        Err(e) => {
            eprintln!("\nerror: import failed: {e}");
            process::exit(1);
        }
    }
}
209  vendor/ruvector/crates/rvf/rvf-import/src/csv_import.rs  (vendored, normal file)
@@ -0,0 +1,209 @@
//! CSV/TSV importer for RVF stores.
//!
//! Expects a CSV where one column contains the vector ID and a contiguous
//! range of columns holds the vector components (as f32).
//!
//! Example CSV (id_column=0, vector_start=1, dimension=3):
//! ```text
//! id,x0,x1,x2
//! 1,0.1,0.2,0.3
//! 2,0.4,0.5,0.6
//! ```
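//!
//! Parsing a file laid out like the example above, with the default
//! config defined below (a sketch; the file name is hypothetical):
//!
//! ```no_run
//! use rvf_import::csv_import::{parse_csv_file, CsvConfig};
//! use std::path::Path;
//!
//! # fn run() -> Result<(), String> {
//! let _records = parse_csv_file(Path::new("data.csv"), &CsvConfig::default())?;
//! # Ok(())
//! # }
//! ```
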
use crate::VectorRecord;
use std::io::Read;
use std::path::Path;

/// Configuration for CSV parsing.
#[derive(Clone, Debug)]
pub struct CsvConfig {
    /// Column index (0-based) that holds the vector ID.
    pub id_column: usize,
    /// Column index (0-based) where vector components begin.
    pub vector_start: usize,
    /// Expected vector dimensionality. If `None`, it is inferred from the
    /// first data row as `num_columns - vector_start`.
    pub dimension: Option<usize>,
    /// Field delimiter. Defaults to `,`.
    pub delimiter: u8,
    /// Whether the first row is a header row (skipped).
    pub has_header: bool,
}

impl Default for CsvConfig {
    fn default() -> Self {
        Self {
            id_column: 0,
            vector_start: 1,
            dimension: None,
            delimiter: b',',
            has_header: true,
        }
    }
}

/// Parse CSV from a reader with the given config.
pub fn parse_csv<R: Read>(reader: R, config: &CsvConfig) -> Result<Vec<VectorRecord>, String> {
    let mut csv_reader = csv::ReaderBuilder::new()
        .delimiter(config.delimiter)
        .has_headers(config.has_header)
        .from_reader(reader);

    let mut records = Vec::new();
    let mut inferred_dim: Option<usize> = config.dimension;

    for (row_idx, result) in csv_reader.records().enumerate() {
        let record = result.map_err(|e| format!("CSV row {}: {e}", row_idx + 1))?;

        let id: u64 = record
            .get(config.id_column)
            .ok_or_else(|| {
                format!(
                    "row {}: missing id column {}",
                    row_idx + 1,
                    config.id_column
                )
            })?
            .trim()
            .parse()
            .map_err(|e| format!("row {}: bad id: {e}", row_idx + 1))?;

        let dim = match inferred_dim {
            Some(d) => d,
            None => {
                let d = record.len().saturating_sub(config.vector_start);
                if d == 0 {
                    return Err(format!(
                        "row {}: no vector columns after index {}",
                        row_idx + 1,
                        config.vector_start
                    ));
                }
                inferred_dim = Some(d);
                d
            }
        };

        let end = config.vector_start + dim;
        if record.len() < end {
            return Err(format!(
                "row {}: expected {} columns for vector, got {}",
                row_idx + 1,
                dim,
                record.len().saturating_sub(config.vector_start)
            ));
        }

        let mut vector = Vec::with_capacity(dim);
        for col in config.vector_start..end {
            let val: f32 = record
                .get(col)
                .ok_or_else(|| format!("row {}: missing column {col}", row_idx + 1))?
                .trim()
                .parse()
                .map_err(|e| format!("row {}, col {col}: bad float: {e}", row_idx + 1))?;
            vector.push(val);
        }

        records.push(VectorRecord {
            id,
            vector,
            metadata: Vec::new(),
        });
    }

    Ok(records)
}

/// Parse CSV from a file path.
pub fn parse_csv_file(path: &Path, config: &CsvConfig) -> Result<Vec<VectorRecord>, String> {
    let file =
        std::fs::File::open(path).map_err(|e| format!("cannot open {}: {e}", path.display()))?;
    let reader = std::io::BufReader::new(file);
    parse_csv(reader, config)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_basic_csv() {
        let data = "id,x0,x1,x2\n1,0.1,0.2,0.3\n2,0.4,0.5,0.6\n";
        let config = CsvConfig::default();
        let records = parse_csv(data.as_bytes(), &config).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].id, 1);
        assert_eq!(records[0].vector, vec![0.1, 0.2, 0.3]);
        assert_eq!(records[1].id, 2);
        assert_eq!(records[1].vector, vec![0.4, 0.5, 0.6]);
    }

    #[test]
    fn parse_tsv() {
        let data = "id\tx0\tx1\n10\t1.0\t2.0\n20\t3.0\t4.0\n";
        let config = CsvConfig {
            delimiter: b'\t',
            ..Default::default()
        };
        let records = parse_csv(data.as_bytes(), &config).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].id, 10);
        assert_eq!(records[0].vector, vec![1.0, 2.0]);
    }

    #[test]
    fn parse_no_header() {
        let data = "1,0.1,0.2\n2,0.3,0.4\n";
        let config = CsvConfig {
            has_header: false,
            dimension: Some(2),
            ..Default::default()
        };
        let records = parse_csv(data.as_bytes(), &config).unwrap();
        assert_eq!(records.len(), 2);
    }

    #[test]
    fn parse_custom_columns() {
        let data = "x0,x1,id\n0.1,0.2,100\n0.3,0.4,200\n";
        let config = CsvConfig {
            id_column: 2,
            vector_start: 0,
            dimension: Some(2),
            ..Default::default()
        };
        let records = parse_csv(data.as_bytes(), &config).unwrap();
        assert_eq!(records[0].id, 100);
        assert_eq!(records[0].vector, vec![0.1, 0.2]);
    }

    #[test]
    fn parse_empty_csv() {
        let data = "id,x0\n";
        let config = CsvConfig::default();
        let records = parse_csv(data.as_bytes(), &config).unwrap();
        assert!(records.is_empty());
    }

    #[test]
    fn bad_float_gives_error() {
        let data = "id,x0\n1,notanumber\n";
        let config = CsvConfig::default();
        let result = parse_csv(data.as_bytes(), &config);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("bad float"));
    }

    #[test]
    fn infer_dimension_from_first_row() {
        let data = "id,a,b,c,d\n1,0.1,0.2,0.3,0.4\n2,0.5,0.6,0.7,0.8\n";
        let config = CsvConfig {
            dimension: None,
            ..Default::default()
        };
        let records = parse_csv(data.as_bytes(), &config).unwrap();
        assert_eq!(records[0].vector.len(), 4);
        assert_eq!(records[1].vector.len(), 4);
    }
}
203  vendor/ruvector/crates/rvf/rvf-import/src/json.rs  (vendored, normal file)
@@ -0,0 +1,203 @@
//! JSON importer for RVF stores.
//!
//! Supports two JSON layouts:
//!
//! 1. **Array of objects** (the common case):
//!    ```json
//!    [
//!      {"id": 1, "vector": [0.1, 0.2, ...], "metadata": {"key": "value"}},
//!      {"id": 2, "vector": [0.3, 0.4, ...]}
//!    ]
//!    ```
//!
//! 2. **HNSW dump format**:
//!    ```json
//!    {
//!      "vectors": [
//!        {"id": 1, "vector": [0.1, 0.2, ...]},
//!        ...
//!      ],
//!      "graph": { ... }
//!    }
//!    ```
//!
//! The `graph` field in HNSW dumps is ignored; only vector data is imported.
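//!
//! Both layouts go through the same entry point (a sketch; the file name
//! is hypothetical):
//!
//! ```no_run
//! use rvf_import::json::parse_json_file;
//! use std::path::Path;
//!
//! # fn run() -> Result<(), String> {
//! let _records = parse_json_file(Path::new("vectors.json"))?;
//! # Ok(())
//! # }
//! ```
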
use crate::VectorRecord;
use rvf_runtime::{MetadataEntry, MetadataValue};
use serde::Deserialize;
use std::collections::HashMap;
use std::io::Read;
use std::path::Path;

/// A single vector entry as it appears in JSON.
#[derive(Deserialize)]
struct JsonVectorEntry {
    id: u64,
    vector: Vec<f32>,
    #[serde(default)]
    metadata: Option<HashMap<String, serde_json::Value>>,
}

/// HNSW dump envelope.
#[derive(Deserialize)]
struct HnswDump {
    vectors: Vec<JsonVectorEntry>,
    // `graph` is intentionally ignored during import.
}

/// Intermediate deserialization target that handles both layouts.
#[derive(Deserialize)]
#[serde(untagged)]
enum JsonInput {
    Array(Vec<JsonVectorEntry>),
    HnswDump(HnswDump),
}

fn convert_metadata(map: &HashMap<String, serde_json::Value>) -> Vec<MetadataEntry> {
    let mut entries = Vec::new();
    // Note: field IDs follow HashMap iteration order, which is not stable
    // across runs; the original string keys are not persisted.
    for (i, (_key, value)) in map.iter().enumerate() {
        let field_id = i as u16;
        match value {
            serde_json::Value::Number(n) => {
                if let Some(u) = n.as_u64() {
                    entries.push(MetadataEntry {
                        field_id,
                        value: MetadataValue::U64(u),
                    });
                } else if let Some(i) = n.as_i64() {
                    entries.push(MetadataEntry {
                        field_id,
                        value: MetadataValue::I64(i),
                    });
                } else if let Some(f) = n.as_f64() {
                    entries.push(MetadataEntry {
                        field_id,
                        value: MetadataValue::F64(f),
                    });
                }
            }
            serde_json::Value::String(s) => {
                entries.push(MetadataEntry {
                    field_id,
                    value: MetadataValue::String(s.clone()),
                });
            }
            _ => {
                // Arrays, objects, bools, null: store as a JSON string.
                entries.push(MetadataEntry {
                    field_id,
                    value: MetadataValue::String(value.to_string()),
                });
            }
        }
    }
    entries
}

fn entries_to_records(entries: Vec<JsonVectorEntry>) -> Vec<VectorRecord> {
    entries
        .into_iter()
        .map(|e| {
            let metadata = e
                .metadata
                .as_ref()
                .map(convert_metadata)
                .unwrap_or_default();
            VectorRecord {
                id: e.id,
                vector: e.vector,
                metadata,
            }
        })
        .collect()
}

/// Parse JSON from a reader. Handles both array-of-objects and HNSW dump formats.
pub fn parse_json<R: Read>(reader: R) -> Result<Vec<VectorRecord>, String> {
    let input: JsonInput =
        serde_json::from_reader(reader).map_err(|e| format!("JSON parse error: {e}"))?;

    let entries = match input {
        JsonInput::Array(arr) => arr,
        JsonInput::HnswDump(dump) => dump.vectors,
    };

    Ok(entries_to_records(entries))
}

/// Parse JSON from a file path.
pub fn parse_json_file(path: &Path) -> Result<Vec<VectorRecord>, String> {
    let file =
        std::fs::File::open(path).map_err(|e| format!("cannot open {}: {e}", path.display()))?;
    let reader = std::io::BufReader::new(file);
    parse_json(reader)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_array_format() {
        let json = r#"[
            {"id": 1, "vector": [0.1, 0.2, 0.3]},
            {"id": 2, "vector": [0.4, 0.5, 0.6], "metadata": {"category": "test", "score": 42}}
        ]"#;

        let records = parse_json(json.as_bytes()).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].id, 1);
        assert_eq!(records[0].vector, vec![0.1, 0.2, 0.3]);
        assert!(records[0].metadata.is_empty());

        assert_eq!(records[1].id, 2);
        assert_eq!(records[1].vector, vec![0.4, 0.5, 0.6]);
        assert_eq!(records[1].metadata.len(), 2);
    }

    #[test]
    fn parse_hnsw_dump_format() {
        let json = r#"{
            "vectors": [
                {"id": 10, "vector": [1.0, 2.0]},
                {"id": 20, "vector": [3.0, 4.0]}
            ],
            "graph": {"layers": 3, "nodes": []}
        }"#;

        let records = parse_json(json.as_bytes()).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].id, 10);
        assert_eq!(records[1].id, 20);
    }

    #[test]
    fn parse_empty_array() {
        let json = "[]";
        let records = parse_json(json.as_bytes()).unwrap();
        assert!(records.is_empty());
    }

    #[test]
    fn parse_invalid_json() {
        let json = "not json at all";
        let result = parse_json(json.as_bytes());
        assert!(result.is_err());
    }

    #[test]
    fn metadata_types() {
        let json = r#"[
            {"id": 1, "vector": [0.1], "metadata": {
                "name": "hello",
                "count": 99,
                "neg": -5,
                "score": 3.14
            }}
        ]"#;

        let records = parse_json(json.as_bytes()).unwrap();
        assert_eq!(records[0].metadata.len(), 4);
    }
}
101  vendor/ruvector/crates/rvf/rvf-import/src/lib.rs  (vendored, normal file)
@@ -0,0 +1,101 @@
//! rvf-import: Migration tools for importing data into RVF stores.
//!
//! Supports JSON, CSV/TSV, and NumPy `.npy` formats. Each importer
//! parses the source format and batch-ingests vectors into an
//! [`rvf_runtime::RvfStore`].
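//!
//! A minimal end-to-end sketch (file names are hypothetical; assumes the
//! store error type implements `Display`, as the CLI's error printing
//! relies on):
//!
//! ```no_run
//! use rvf_import::{import_to_new_store, json::parse_json_file};
//! use std::path::Path;
//!
//! # fn run() -> Result<(), String> {
//! let records = parse_json_file(Path::new("vectors.json"))?;
//! // Infer the store dimension from the first record (panics on empty input).
//! let dim = records[0].vector.len() as u16;
//! import_to_new_store(Path::new("out.rvf"), dim, &records, 1000, None)
//!     .map_err(|e| e.to_string())?;
//! # Ok(())
//! # }
//! ```
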
pub mod csv_import;
pub mod json;
pub mod numpy;
pub mod progress;

use rvf_runtime::{MetadataEntry, RvfOptions, RvfStore};
use rvf_types::RvfError;
use std::path::Path;

/// A single vector record ready for ingestion.
#[derive(Clone, Debug)]
pub struct VectorRecord {
    /// Unique identifier for this vector.
    pub id: u64,
    /// The embedding / feature vector.
    pub vector: Vec<f32>,
    /// Optional key-value metadata entries.
    pub metadata: Vec<MetadataEntry>,
}

/// Result summary returned after an import completes.
#[derive(Clone, Debug)]
pub struct ImportResult {
    /// Total records successfully ingested.
    pub total_imported: u64,
    /// Total records that failed validation (wrong dimension, etc.).
    pub total_rejected: u64,
    /// Number of batches written.
    pub batches: u32,
}

/// Batch-ingest a slice of [`VectorRecord`]s into an [`RvfStore`].
///
/// Records whose vector length does not match `dimension` are silently
/// rejected by the store. Returns an [`ImportResult`] summarising the
/// operation.
pub fn ingest_records(
    store: &mut RvfStore,
    records: &[VectorRecord],
    batch_size: usize,
    progress: Option<&dyn progress::ProgressReporter>,
) -> Result<ImportResult, RvfError> {
    let batch_size = batch_size.max(1);
    let mut total_imported = 0u64;
    let mut total_rejected = 0u64;
    let mut batches = 0u32;

    for chunk in records.chunks(batch_size) {
        let vec_data: Vec<Vec<f32>> = chunk.iter().map(|r| r.vector.clone()).collect();
        let vec_refs: Vec<&[f32]> = vec_data.iter().map(|v| v.as_slice()).collect();
        let ids: Vec<u64> = chunk.iter().map(|r| r.id).collect();

        let has_metadata = chunk.iter().any(|r| !r.metadata.is_empty());
        let metadata: Option<Vec<MetadataEntry>> = if has_metadata {
            Some(chunk.iter().flat_map(|r| r.metadata.clone()).collect())
        } else {
            None
        };

        let result = store.ingest_batch(&vec_refs, &ids, metadata.as_deref())?;

        total_imported += result.accepted;
        total_rejected += result.rejected;
        batches += 1;

        if let Some(p) = progress {
            p.report(total_imported, total_rejected, records.len() as u64);
        }
    }

    Ok(ImportResult {
        total_imported,
        total_rejected,
        batches,
    })
}

/// Create a new RVF store at `path` with the given dimension, then
/// ingest all `records` into it.
pub fn import_to_new_store(
    path: &Path,
    dimension: u16,
    records: &[VectorRecord],
    batch_size: usize,
    progress: Option<&dyn progress::ProgressReporter>,
) -> Result<ImportResult, RvfError> {
    let options = RvfOptions {
        dimension,
        ..Default::default()
    };
    let mut store = RvfStore::create(path, options)?;
    let result = ingest_records(&mut store, records, batch_size, progress)?;
    store.close()?;
    Ok(result)
}
251  vendor/ruvector/crates/rvf/rvf-import/src/numpy.rs  (vendored, normal file)
@@ -0,0 +1,251 @@
//! NumPy `.npy` importer for RVF stores.
//!
//! Parses the NumPy v1/v2 `.npy` format (little-endian float32 only).
//! The shape `(N, D)` is read from the header; IDs are assigned
//! sequentially starting from `start_id` (default 0).
//!
//! Reference: <https://numpy.org/devdocs/reference/generated/numpy.lib.format.html>
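//!
//! On-disk preamble, as consumed by `parse_npy_header` below:
//!
//! ```text
//! \x93NUMPY | major, minor (2 bytes) | header_len (u16 LE for v1, u32 LE for v2+) | header dict
//! ```
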
use crate::VectorRecord;
use std::io::Read;
use std::path::Path;

/// Configuration for NumPy import.
#[derive(Clone, Debug, Default)]
pub struct NpyConfig {
    /// Starting ID for auto-assigned vector IDs.
    pub start_id: u64,
}

/// Parsed header from a `.npy` file.
#[derive(Debug)]
struct NpyHeader {
    /// Number of rows (vectors).
    rows: usize,
    /// Number of columns (dimensions per vector).
    cols: usize,
}

/// Parse the `.npy` header from a reader, returning the shape and
/// advancing the reader past the header.
fn parse_npy_header<R: Read>(reader: &mut R) -> Result<NpyHeader, String> {
    // Magic: \x93NUMPY
    let mut magic = [0u8; 6];
    reader
        .read_exact(&mut magic)
        .map_err(|e| format!("failed to read npy magic: {e}"))?;
    if magic[0] != 0x93 || &magic[1..6] != b"NUMPY" {
        return Err("not a valid .npy file (bad magic)".to_string());
    }

    // Version
    let mut version = [0u8; 2];
    reader
        .read_exact(&mut version)
        .map_err(|e| format!("failed to read npy version: {e}"))?;
    let major = version[0];

    // Header length: u16 for v1, u32 for v2+
    let header_len: usize = if major <= 1 {
        let mut buf = [0u8; 2];
        reader
            .read_exact(&mut buf)
            .map_err(|e| format!("failed to read header length: {e}"))?;
        u16::from_le_bytes(buf) as usize
    } else {
        let mut buf = [0u8; 4];
        reader
            .read_exact(&mut buf)
            .map_err(|e| format!("failed to read header length: {e}"))?;
        u32::from_le_bytes(buf) as usize
    };

    // Read the header dict string
    let mut header_bytes = vec![0u8; header_len];
    reader
        .read_exact(&mut header_bytes)
        .map_err(|e| format!("failed to read header dict: {e}"))?;
    let header_str =
        std::str::from_utf8(&header_bytes).map_err(|e| format!("header is not utf8: {e}"))?;

    // Validate dtype is float32
    if !header_str.contains("'<f4'") && !header_str.contains("'float32'") {
        return Err(format!(
            "unsupported dtype in npy header (only float32/<f4 supported): {header_str}"
        ));
    }

    // Parse shape: look for 'shape': (N, D) or 'shape': (N,)
    parse_shape(header_str)
}

fn parse_shape(header: &str) -> Result<NpyHeader, String> {
    // Find the shape tuple in the header dict
    let shape_start = header
        .find("'shape':")
        .or_else(|| header.find("\"shape\":"))
        .ok_or_else(|| format!("no 'shape' key in npy header: {header}"))?;

    let after_key = &header[shape_start..];
    let paren_open = after_key
        .find('(')
        .ok_or_else(|| "no opening paren in shape".to_string())?;
    let paren_close = after_key
        .find(')')
        .ok_or_else(|| "no closing paren in shape".to_string())?;

    let shape_content = &after_key[paren_open + 1..paren_close];
    let parts: Vec<&str> = shape_content
        .split(',')
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .collect();

    match parts.len() {
        1 => {
            let rows: usize = parts[0]
                .parse()
                .map_err(|e| format!("bad shape dim: {e}"))?;
            // 1-D array: treat each element as a 1-dimensional vector
            Ok(NpyHeader { rows, cols: 1 })
        }
        2 => {
            let rows: usize = parts[0]
                .parse()
                .map_err(|e| format!("bad shape row: {e}"))?;
            let cols: usize = parts[1]
                .parse()
                .map_err(|e| format!("bad shape col: {e}"))?;
            Ok(NpyHeader { rows, cols })
        }
        _ => Err(format!(
            "unsupported shape rank {}: {shape_content}",
            parts.len()
        )),
    }
}

/// Parse a `.npy` file from a reader.
pub fn parse_npy<R: Read>(mut reader: R, config: &NpyConfig) -> Result<Vec<VectorRecord>, String> {
    let header = parse_npy_header(&mut reader)?;

    let total_floats = header.rows * header.cols;
    let total_bytes = total_floats * 4;
    let mut raw = vec![0u8; total_bytes];
    reader
        .read_exact(&mut raw)
        .map_err(|e| format!("failed to read npy data ({total_bytes} bytes expected): {e}"))?;

    let mut records = Vec::with_capacity(header.rows);
    for i in 0..header.rows {
        let offset = i * header.cols * 4;
        let mut vector = Vec::with_capacity(header.cols);
        for j in 0..header.cols {
            let byte_offset = offset + j * 4;
            let bytes: [u8; 4] = [
                raw[byte_offset],
                raw[byte_offset + 1],
                raw[byte_offset + 2],
                raw[byte_offset + 3],
            ];
            vector.push(f32::from_le_bytes(bytes));
        }
        records.push(VectorRecord {
            id: config.start_id + i as u64,
            vector,
            metadata: Vec::new(),
        });
    }

    Ok(records)
}

/// Parse a `.npy` file from a file path.
pub fn parse_npy_file(path: &Path, config: &NpyConfig) -> Result<Vec<VectorRecord>, String> {
    let file =
        std::fs::File::open(path).map_err(|e| format!("cannot open {}: {e}", path.display()))?;
    let reader = std::io::BufReader::new(file);
    parse_npy(reader, config)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal valid .npy file in memory with the given shape and f32 data.
    fn build_npy(rows: usize, cols: usize, data: &[f32]) -> Vec<u8> {
        let header_dict =
            format!("{{'descr': '<f4', 'fortran_order': False, 'shape': ({rows}, {cols}), }}");
        // Pad header to 64-byte alignment (magic=6 + version=2 + header_len=2 + dict)
        let preamble_len = 6 + 2 + 2;
        let total_header = preamble_len + header_dict.len();
        let padding = (64 - (total_header % 64)) % 64;
        let padded_dict_len = header_dict.len() + padding;

        let mut buf = Vec::new();
        // Magic
        buf.push(0x93);
        buf.extend_from_slice(b"NUMPY");
        // Version 1.0
        buf.push(1);
        buf.push(0);
        // Header length (u16 LE)
        buf.extend_from_slice(&(padded_dict_len as u16).to_le_bytes());
        // Dict
        buf.extend_from_slice(header_dict.as_bytes());
        // Padding (spaces + newline)
        buf.extend(std::iter::repeat_n(b' ', padding.saturating_sub(1)));
        if padding > 0 {
            buf.push(b'\n');
        }
        // Data
        for &val in data {
            buf.extend_from_slice(&val.to_le_bytes());
        }
        buf
    }

    #[test]
    fn parse_2d_npy() {
        let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
        let npy = build_npy(2, 3, &data);

        let records = parse_npy(npy.as_slice(), &NpyConfig::default()).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].id, 0);
        assert_eq!(records[0].vector, vec![1.0, 2.0, 3.0]);
        assert_eq!(records[1].id, 1);
        assert_eq!(records[1].vector, vec![4.0, 5.0, 6.0]);
    }

    #[test]
    fn parse_npy_custom_start_id() {
        let data = vec![0.5f32, 0.6];
        let npy = build_npy(1, 2, &data);

        let config = NpyConfig { start_id: 100 };
        let records = parse_npy(npy.as_slice(), &config).unwrap();
        assert_eq!(records[0].id, 100);
    }

    #[test]
    fn bad_magic_rejected() {
        let bad = b"NOT_NUMPY_DATA";
        let result = parse_npy(bad.as_slice(), &NpyConfig::default());
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("bad magic"));
    }

    #[test]
    fn shape_parsing() {
        let h = parse_shape("{'descr': '<f4', 'shape': (100, 384), }").unwrap();
        assert_eq!(h.rows, 100);
        assert_eq!(h.cols, 384);

        let h = parse_shape("{'descr': '<f4', 'shape': (50,), }").unwrap();
        assert_eq!(h.rows, 50);
        assert_eq!(h.cols, 1);
    }
}
54  vendor/ruvector/crates/rvf/rvf-import/src/progress.rs  (vendored, normal file)
@@ -0,0 +1,54 @@
//! Progress reporting for long-running imports.
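//!
//! Any type implementing [`ProgressReporter`] can receive the per-batch
//! callbacks; a minimal sketch (the `CountingReporter` name is
//! illustrative, not part of this crate):
//!
//! ```no_run
//! use rvf_import::progress::ProgressReporter;
//!
//! struct CountingReporter;
//!
//! impl ProgressReporter for CountingReporter {
//!     fn report(&self, imported: u64, rejected: u64, total: u64) {
//!         eprintln!("{imported} imported, {rejected} rejected of {total}");
//!     }
//! }
//! ```
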
use std::io::Write;

/// Trait for receiving import progress callbacks.
pub trait ProgressReporter {
    /// Called after each batch with cumulative counts.
    fn report(&self, imported: u64, rejected: u64, total: u64);
}

/// A reporter that prints progress to stderr.
pub struct StderrProgress;

impl ProgressReporter for StderrProgress {
    fn report(&self, imported: u64, rejected: u64, total: u64) {
        if total > 0 {
            let pct = (imported + rejected) as f64 / total as f64 * 100.0;
            eprint!("\r imported: {imported}, rejected: {rejected}, total: {total} ({pct:.1}%)");
            let _ = std::io::stderr().flush();
        }
    }
}

/// A reporter that collects reports for testing.
pub struct CollectingProgress {
    reports: std::sync::Mutex<Vec<(u64, u64, u64)>>,
}

impl Default for CollectingProgress {
    fn default() -> Self {
        Self {
            reports: std::sync::Mutex::new(Vec::new()),
        }
    }
}

impl CollectingProgress {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn reports(&self) -> Vec<(u64, u64, u64)> {
        self.reports.lock().unwrap().clone()
    }
}

impl ProgressReporter for CollectingProgress {
    fn report(&self, imported: u64, rejected: u64, total: u64) {
        self.reports
            .lock()
            .unwrap()
            .push((imported, rejected, total));
    }
}