Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

ruv · 2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions


@@ -0,0 +1,167 @@
//! CLI binary for importing data into RVF stores.
//!
//! Usage examples:
//! rvf-import --format json --input vectors.json --output data.rvf --dimension 384
//! rvf-import --format csv --input data.csv --output data.rvf --id-column 0 --vector-start 1
//! rvf-import --format npy --input embeddings.npy --output data.rvf
use clap::Parser;
use rvf_import::progress::StderrProgress;
use std::path::PathBuf;
use std::process;
#[derive(Parser)]
#[command(name = "rvf-import", about = "Import vectors into an RVF store")]
struct Cli {
/// Input format: json, csv, tsv, or npy.
#[arg(long)]
format: String,
/// Path to the input file.
#[arg(long)]
input: PathBuf,
/// Path to the output .rvf file (will be created).
#[arg(long)]
output: PathBuf,
    /// Vector dimension. If omitted, it is inferred from the first parsed record.
#[arg(long)]
dimension: Option<u16>,
/// (CSV) Column index for the vector ID (0-based, default 0).
#[arg(long, default_value_t = 0)]
id_column: usize,
/// (CSV) Column index where vector components start (0-based, default 1).
#[arg(long, default_value_t = 1)]
vector_start: usize,
/// (CSV) Disable header row detection.
#[arg(long)]
no_header: bool,
/// (NPY) Starting ID for auto-assigned vector IDs.
#[arg(long, default_value_t = 0)]
start_id: u64,
/// Batch size for ingestion (default 1000).
#[arg(long, default_value_t = 1000)]
batch_size: usize,
/// Suppress progress output.
#[arg(long)]
quiet: bool,
}
fn main() {
let cli = Cli::parse();
let records = match cli.format.as_str() {
"json" => match rvf_import::json::parse_json_file(&cli.input) {
Ok(r) => r,
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
},
"csv" => {
let config = rvf_import::csv_import::CsvConfig {
id_column: cli.id_column,
vector_start: cli.vector_start,
delimiter: b',',
has_header: !cli.no_header,
dimension: cli.dimension.map(|d| d as usize),
};
match rvf_import::csv_import::parse_csv_file(&cli.input, &config) {
Ok(r) => r,
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
}
}
"tsv" => {
let config = rvf_import::csv_import::CsvConfig {
id_column: cli.id_column,
vector_start: cli.vector_start,
delimiter: b'\t',
has_header: !cli.no_header,
dimension: cli.dimension.map(|d| d as usize),
};
match rvf_import::csv_import::parse_csv_file(&cli.input, &config) {
Ok(r) => r,
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
}
}
"npy" => {
let config = rvf_import::numpy::NpyConfig {
start_id: cli.start_id,
};
match rvf_import::numpy::parse_npy_file(&cli.input, &config) {
Ok(r) => r,
Err(e) => {
eprintln!("error: {e}");
process::exit(1);
}
}
}
other => {
eprintln!("error: unknown format '{other}'. Use: json, csv, tsv, npy");
process::exit(1);
}
};
if records.is_empty() {
eprintln!("warning: no records parsed from input file");
process::exit(0);
}
// Determine dimension
let dimension = match cli.dimension {
Some(d) => d,
None => {
let inferred = records[0].vector.len() as u16;
if inferred == 0 {
eprintln!("error: cannot infer dimension (first vector is empty). Use --dimension");
process::exit(1);
}
eprintln!("info: inferred dimension = {inferred} from first record");
inferred
}
};
let progress: Option<&dyn rvf_import::progress::ProgressReporter> = if cli.quiet {
None
} else {
Some(&StderrProgress)
};
match rvf_import::import_to_new_store(
&cli.output,
dimension,
&records,
cli.batch_size,
progress,
) {
Ok(result) => {
if !cli.quiet {
eprintln!();
}
eprintln!(
"done: imported {} vectors, rejected {}, in {} batches -> {}",
result.total_imported,
result.total_rejected,
result.batches,
cli.output.display()
);
}
Err(e) => {
eprintln!("\nerror: import failed: {e}");
process::exit(1);
}
}
}


@@ -0,0 +1,209 @@
//! CSV/TSV importer for RVF stores.
//!
//! Expects a CSV where one column contains the vector ID and a contiguous
//! range of columns holds the vector components (as f32).
//!
//! Example CSV (id_column=0, vector_start=1, dimension=3):
//! ```text
//! id,x0,x1,x2
//! 1,0.1,0.2,0.3
//! 2,0.4,0.5,0.6
//! ```
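//!
//! A minimal parse of that layout, mirroring the tests below:
//!
//! ```
//! use rvf_import::csv_import::{parse_csv, CsvConfig};
//!
//! let data = "id,x0,x1,x2\n1,0.1,0.2,0.3\n2,0.4,0.5,0.6\n";
//! let records = parse_csv(data.as_bytes(), &CsvConfig::default()).unwrap();
//! assert_eq!(records.len(), 2);
//! assert_eq!(records[0].vector, vec![0.1, 0.2, 0.3]);
//! ```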
use crate::VectorRecord;
use std::io::Read;
use std::path::Path;
/// Configuration for CSV parsing.
#[derive(Clone, Debug)]
pub struct CsvConfig {
/// Column index (0-based) that holds the vector ID.
pub id_column: usize,
/// Column index (0-based) where vector components begin.
pub vector_start: usize,
    /// Expected vector dimensionality. If `None`, it is inferred from the
    /// first data row as `num_columns - vector_start`. Set it explicitly when
    /// the ID column sits at or after `vector_start`, since the inference
    /// would otherwise count the ID column as a vector component.
pub dimension: Option<usize>,
/// Field delimiter. Defaults to `,`.
pub delimiter: u8,
/// Whether the first row is a header row (skipped).
pub has_header: bool,
}
impl Default for CsvConfig {
fn default() -> Self {
Self {
id_column: 0,
vector_start: 1,
dimension: None,
delimiter: b',',
has_header: true,
}
}
}
/// Parse CSV from a reader with the given config.
pub fn parse_csv<R: Read>(reader: R, config: &CsvConfig) -> Result<Vec<VectorRecord>, String> {
let mut csv_reader = csv::ReaderBuilder::new()
.delimiter(config.delimiter)
.has_headers(config.has_header)
.from_reader(reader);
let mut records = Vec::new();
let mut inferred_dim: Option<usize> = config.dimension;
for (row_idx, result) in csv_reader.records().enumerate() {
let record = result.map_err(|e| format!("CSV row {}: {e}", row_idx + 1))?;
let id: u64 = record
.get(config.id_column)
.ok_or_else(|| {
format!(
"row {}: missing id column {}",
row_idx + 1,
config.id_column
)
})?
.trim()
.parse()
.map_err(|e| format!("row {}: bad id: {e}", row_idx + 1))?;
let dim = match inferred_dim {
Some(d) => d,
None => {
let d = record.len().saturating_sub(config.vector_start);
if d == 0 {
return Err(format!(
"row {}: no vector columns after index {}",
row_idx + 1,
config.vector_start
));
}
inferred_dim = Some(d);
d
}
};
let end = config.vector_start + dim;
if record.len() < end {
return Err(format!(
"row {}: expected {} columns for vector, got {}",
row_idx + 1,
dim,
record.len().saturating_sub(config.vector_start)
));
}
let mut vector = Vec::with_capacity(dim);
for col in config.vector_start..end {
let val: f32 = record
.get(col)
.ok_or_else(|| format!("row {}: missing column {col}", row_idx + 1))?
.trim()
.parse()
.map_err(|e| format!("row {}, col {col}: bad float: {e}", row_idx + 1))?;
vector.push(val);
}
records.push(VectorRecord {
id,
vector,
metadata: Vec::new(),
});
}
Ok(records)
}
/// Parse CSV from a file path.
pub fn parse_csv_file(path: &Path, config: &CsvConfig) -> Result<Vec<VectorRecord>, String> {
let file =
std::fs::File::open(path).map_err(|e| format!("cannot open {}: {e}", path.display()))?;
let reader = std::io::BufReader::new(file);
parse_csv(reader, config)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_basic_csv() {
let data = "id,x0,x1,x2\n1,0.1,0.2,0.3\n2,0.4,0.5,0.6\n";
let config = CsvConfig::default();
let records = parse_csv(data.as_bytes(), &config).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].id, 1);
assert_eq!(records[0].vector, vec![0.1, 0.2, 0.3]);
assert_eq!(records[1].id, 2);
assert_eq!(records[1].vector, vec![0.4, 0.5, 0.6]);
}
#[test]
fn parse_tsv() {
let data = "id\tx0\tx1\n10\t1.0\t2.0\n20\t3.0\t4.0\n";
let config = CsvConfig {
delimiter: b'\t',
..Default::default()
};
let records = parse_csv(data.as_bytes(), &config).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].id, 10);
assert_eq!(records[0].vector, vec![1.0, 2.0]);
}
#[test]
fn parse_no_header() {
let data = "1,0.1,0.2\n2,0.3,0.4\n";
let config = CsvConfig {
has_header: false,
dimension: Some(2),
..Default::default()
};
let records = parse_csv(data.as_bytes(), &config).unwrap();
assert_eq!(records.len(), 2);
}
#[test]
fn parse_custom_columns() {
let data = "x0,x1,id\n0.1,0.2,100\n0.3,0.4,200\n";
let config = CsvConfig {
id_column: 2,
vector_start: 0,
dimension: Some(2),
..Default::default()
};
let records = parse_csv(data.as_bytes(), &config).unwrap();
assert_eq!(records[0].id, 100);
assert_eq!(records[0].vector, vec![0.1, 0.2]);
}
#[test]
fn parse_empty_csv() {
let data = "id,x0\n";
let config = CsvConfig::default();
let records = parse_csv(data.as_bytes(), &config).unwrap();
assert!(records.is_empty());
}
#[test]
fn bad_float_gives_error() {
let data = "id,x0\n1,notanumber\n";
let config = CsvConfig::default();
let result = parse_csv(data.as_bytes(), &config);
assert!(result.is_err());
assert!(result.unwrap_err().contains("bad float"));
}
#[test]
fn infer_dimension_from_first_row() {
let data = "id,a,b,c,d\n1,0.1,0.2,0.3,0.4\n2,0.5,0.6,0.7,0.8\n";
let config = CsvConfig {
dimension: None,
..Default::default()
};
let records = parse_csv(data.as_bytes(), &config).unwrap();
assert_eq!(records[0].vector.len(), 4);
assert_eq!(records[1].vector.len(), 4);
}
}


@@ -0,0 +1,203 @@
//! JSON importer for RVF stores.
//!
//! Supports two JSON layouts:
//!
//! 1. **Array of objects** (the common case):
//! ```json
//! [
//! {"id": 1, "vector": [0.1, 0.2, ...], "metadata": {"key": "value"}},
//! {"id": 2, "vector": [0.3, 0.4, ...]}
//! ]
//! ```
//!
//! 2. **HNSW dump format**:
//! ```json
//! {
//! "vectors": [
//! {"id": 1, "vector": [0.1, 0.2, ...]},
//! ...
//! ],
//! "graph": { ... }
//! }
//! ```
//!
//! The `graph` field in HNSW dumps is ignored — only vector data is imported.
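//!
//! A minimal parse of the array layout, mirroring the tests below:
//!
//! ```
//! use rvf_import::json::parse_json;
//!
//! let json = r#"[{"id": 1, "vector": [0.1, 0.2]}]"#;
//! let records = parse_json(json.as_bytes()).unwrap();
//! assert_eq!(records[0].id, 1);
//! assert_eq!(records[0].vector, vec![0.1, 0.2]);
//! ```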
use crate::VectorRecord;
use rvf_runtime::{MetadataEntry, MetadataValue};
use serde::Deserialize;
use std::collections::HashMap;
use std::io::Read;
use std::path::Path;
/// A single vector entry as it appears in JSON.
#[derive(Deserialize)]
struct JsonVectorEntry {
id: u64,
vector: Vec<f32>,
#[serde(default)]
metadata: Option<HashMap<String, serde_json::Value>>,
}
/// HNSW dump envelope.
#[derive(Deserialize)]
struct HnswDump {
vectors: Vec<JsonVectorEntry>,
// `graph` is intentionally ignored during import.
}
/// Intermediate deserialization target that handles both layouts.
#[derive(Deserialize)]
#[serde(untagged)]
enum JsonInput {
Array(Vec<JsonVectorEntry>),
HnswDump(HnswDump),
}
fn convert_metadata(map: &HashMap<String, serde_json::Value>) -> Vec<MetadataEntry> {
    // Sort keys before assigning field ids: `HashMap` iteration order is
    // unspecified, so enumerating it directly could map the same key to
    // different field ids across records and runs.
    let mut keys: Vec<&String> = map.keys().collect();
    keys.sort();
    let mut entries = Vec::new();
    for (i, key) in keys.into_iter().enumerate() {
        let value = &map[key];
        let field_id = i as u16;
        match value {
serde_json::Value::Number(n) => {
if let Some(u) = n.as_u64() {
entries.push(MetadataEntry {
field_id,
value: MetadataValue::U64(u),
});
} else if let Some(i) = n.as_i64() {
entries.push(MetadataEntry {
field_id,
value: MetadataValue::I64(i),
});
} else if let Some(f) = n.as_f64() {
entries.push(MetadataEntry {
field_id,
value: MetadataValue::F64(f),
});
}
}
serde_json::Value::String(s) => {
entries.push(MetadataEntry {
field_id,
value: MetadataValue::String(s.clone()),
});
}
_ => {
// Arrays, objects, bools, null — store as JSON string
entries.push(MetadataEntry {
field_id,
value: MetadataValue::String(value.to_string()),
});
}
}
}
entries
}
fn entries_to_records(entries: Vec<JsonVectorEntry>) -> Vec<VectorRecord> {
entries
.into_iter()
.map(|e| {
let metadata = e
.metadata
.as_ref()
.map(convert_metadata)
.unwrap_or_default();
VectorRecord {
id: e.id,
vector: e.vector,
metadata,
}
})
.collect()
}
/// Parse JSON from a reader. Handles both array-of-objects and HNSW dump formats.
pub fn parse_json<R: Read>(reader: R) -> Result<Vec<VectorRecord>, String> {
let input: JsonInput =
serde_json::from_reader(reader).map_err(|e| format!("JSON parse error: {e}"))?;
let entries = match input {
JsonInput::Array(arr) => arr,
JsonInput::HnswDump(dump) => dump.vectors,
};
Ok(entries_to_records(entries))
}
/// Parse JSON from a file path.
pub fn parse_json_file(path: &Path) -> Result<Vec<VectorRecord>, String> {
let file =
std::fs::File::open(path).map_err(|e| format!("cannot open {}: {e}", path.display()))?;
let reader = std::io::BufReader::new(file);
parse_json(reader)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_array_format() {
let json = r#"[
{"id": 1, "vector": [0.1, 0.2, 0.3]},
{"id": 2, "vector": [0.4, 0.5, 0.6], "metadata": {"category": "test", "score": 42}}
]"#;
let records = parse_json(json.as_bytes()).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].id, 1);
assert_eq!(records[0].vector, vec![0.1, 0.2, 0.3]);
assert!(records[0].metadata.is_empty());
assert_eq!(records[1].id, 2);
assert_eq!(records[1].vector, vec![0.4, 0.5, 0.6]);
assert_eq!(records[1].metadata.len(), 2);
}
#[test]
fn parse_hnsw_dump_format() {
let json = r#"{
"vectors": [
{"id": 10, "vector": [1.0, 2.0]},
{"id": 20, "vector": [3.0, 4.0]}
],
"graph": {"layers": 3, "nodes": []}
}"#;
let records = parse_json(json.as_bytes()).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].id, 10);
assert_eq!(records[1].id, 20);
}
#[test]
fn parse_empty_array() {
let json = "[]";
let records = parse_json(json.as_bytes()).unwrap();
assert!(records.is_empty());
}
#[test]
fn parse_invalid_json() {
let json = "not json at all";
let result = parse_json(json.as_bytes());
assert!(result.is_err());
}
#[test]
fn metadata_types() {
let json = r#"[
{"id": 1, "vector": [0.1], "metadata": {
"name": "hello",
"count": 99,
"neg": -5,
"score": 3.14
}}
]"#;
let records = parse_json(json.as_bytes()).unwrap();
assert_eq!(records[0].metadata.len(), 4);
}
}


@@ -0,0 +1,101 @@
//! rvf-import: Migration tools for importing data into RVF stores.
//!
//! Supports JSON, CSV/TSV, and NumPy `.npy` formats. Each importer
//! parses the source format and batch-ingests vectors into an
//! [`rvf_runtime::RvfStore`].
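//!
//! A minimal end-to-end sketch (the path, dimension, and record are
//! illustrative):
//!
//! ```no_run
//! use rvf_import::{import_to_new_store, VectorRecord};
//!
//! let records = vec![VectorRecord { id: 1, vector: vec![0.1, 0.2, 0.3], metadata: Vec::new() }];
//! let result = import_to_new_store(std::path::Path::new("out.rvf"), 3, &records, 1000, None)
//!     .expect("import failed");
//! println!("imported {} vectors", result.total_imported);
//! ```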
pub mod csv_import;
pub mod json;
pub mod numpy;
pub mod progress;
use rvf_runtime::{MetadataEntry, RvfOptions, RvfStore};
use rvf_types::RvfError;
use std::path::Path;
/// A single vector record ready for ingestion.
#[derive(Clone, Debug)]
pub struct VectorRecord {
/// Unique identifier for this vector.
pub id: u64,
/// The embedding / feature vector.
pub vector: Vec<f32>,
/// Optional key-value metadata entries.
pub metadata: Vec<MetadataEntry>,
}
/// Result summary returned after an import completes.
#[derive(Clone, Debug)]
pub struct ImportResult {
/// Total records successfully ingested.
pub total_imported: u64,
/// Total records that failed validation (wrong dimension, etc.).
pub total_rejected: u64,
/// Number of batches written.
pub batches: u32,
}
/// Batch-ingest a slice of [`VectorRecord`]s into an [`RvfStore`].
///
/// Records whose vector length does not match `dimension` are rejected by
/// the store without aborting the import; rejection counts are reported in
/// the returned [`ImportResult`].
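///
/// A sketch of ingesting into an already-open store (`demo.rvf` is
/// illustrative):
///
/// ```no_run
/// # use rvf_import::{ingest_records, VectorRecord};
/// # use rvf_runtime::{RvfOptions, RvfStore};
/// # fn run() -> Result<(), rvf_types::RvfError> {
/// let options = RvfOptions { dimension: 3, ..Default::default() };
/// let mut store = RvfStore::create(std::path::Path::new("demo.rvf"), options)?;
/// let records = vec![VectorRecord { id: 7, vector: vec![0.1, 0.2, 0.3], metadata: Vec::new() }];
/// let summary = ingest_records(&mut store, &records, 1000, None)?;
/// println!("imported {}", summary.total_imported);
/// store.close()?;
/// # Ok(()) }
/// ```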
pub fn ingest_records(
store: &mut RvfStore,
records: &[VectorRecord],
batch_size: usize,
progress: Option<&dyn progress::ProgressReporter>,
) -> Result<ImportResult, RvfError> {
let batch_size = batch_size.max(1);
let mut total_imported = 0u64;
let mut total_rejected = 0u64;
let mut batches = 0u32;
for chunk in records.chunks(batch_size) {
        // Borrow vector data directly from the records; no per-chunk copies needed.
        let vec_refs: Vec<&[f32]> = chunk.iter().map(|r| r.vector.as_slice()).collect();
let ids: Vec<u64> = chunk.iter().map(|r| r.id).collect();
let has_metadata = chunk.iter().any(|r| !r.metadata.is_empty());
let metadata: Option<Vec<MetadataEntry>> = if has_metadata {
Some(chunk.iter().flat_map(|r| r.metadata.clone()).collect())
} else {
None
};
let result = store.ingest_batch(&vec_refs, &ids, metadata.as_deref())?;
total_imported += result.accepted;
total_rejected += result.rejected;
batches += 1;
if let Some(p) = progress {
p.report(total_imported, total_rejected, records.len() as u64);
}
}
Ok(ImportResult {
total_imported,
total_rejected,
batches,
})
}
/// Create a new RVF store at `path` with the given dimension, then
/// ingest all `records` into it.
pub fn import_to_new_store(
path: &Path,
dimension: u16,
records: &[VectorRecord],
batch_size: usize,
progress: Option<&dyn progress::ProgressReporter>,
) -> Result<ImportResult, RvfError> {
let options = RvfOptions {
dimension,
..Default::default()
};
let mut store = RvfStore::create(path, options)?;
let result = ingest_records(&mut store, records, batch_size, progress)?;
store.close()?;
Ok(result)
}


@@ -0,0 +1,251 @@
//! NumPy `.npy` importer for RVF stores.
//!
//! Parses the NumPy v1/v2 `.npy` format (little-endian float32, C-order only).
//! The shape `(N, D)` is read from the header; IDs are assigned
//! sequentially starting from `start_id` (default 0).
//!
//! Reference: <https://numpy.org/devdocs/reference/generated/numpy.lib.format.html>
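//!
//! Preamble layout as consumed by this parser, for reference:
//!
//! ```text
//! \x93NUMPY | major: u8 | minor: u8 | header_len: u16 LE (v1) / u32 LE (v2+) | header dict | raw f32 data
//! ```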
use crate::VectorRecord;
use std::io::Read;
use std::path::Path;
/// Configuration for NumPy import.
#[derive(Clone, Debug, Default)]
pub struct NpyConfig {
/// Starting ID for auto-assigned vector IDs.
pub start_id: u64,
}
/// Parsed header from a `.npy` file.
#[derive(Debug)]
struct NpyHeader {
/// Number of rows (vectors).
rows: usize,
/// Number of columns (dimensions per vector).
cols: usize,
}
/// Parse the `.npy` header from a reader, returning the shape and
/// advancing the reader past the header.
fn parse_npy_header<R: Read>(reader: &mut R) -> Result<NpyHeader, String> {
// Magic: \x93NUMPY
let mut magic = [0u8; 6];
reader
.read_exact(&mut magic)
.map_err(|e| format!("failed to read npy magic: {e}"))?;
if magic[0] != 0x93 || &magic[1..6] != b"NUMPY" {
return Err("not a valid .npy file (bad magic)".to_string());
}
// Version
let mut version = [0u8; 2];
reader
.read_exact(&mut version)
.map_err(|e| format!("failed to read npy version: {e}"))?;
let major = version[0];
// Header length
let header_len: usize = if major <= 1 {
let mut buf = [0u8; 2];
reader
.read_exact(&mut buf)
.map_err(|e| format!("failed to read header length: {e}"))?;
u16::from_le_bytes(buf) as usize
} else {
let mut buf = [0u8; 4];
reader
.read_exact(&mut buf)
.map_err(|e| format!("failed to read header length: {e}"))?;
u32::from_le_bytes(buf) as usize
};
// Read the header dict string
let mut header_bytes = vec![0u8; header_len];
reader
.read_exact(&mut header_bytes)
.map_err(|e| format!("failed to read header dict: {e}"))?;
let header_str =
std::str::from_utf8(&header_bytes).map_err(|e| format!("header is not utf8: {e}"))?;
    // Validate dtype is float32
    if !header_str.contains("'<f4'") && !header_str.contains("'float32'") {
        return Err(format!(
            "unsupported dtype in npy header (only float32/<f4 supported): {header_str}"
        ));
    }
    // Reject Fortran-order (column-major) arrays: the row-wise reader below
    // assumes C-order and would otherwise return transposed vectors.
    if header_str.contains("'fortran_order': True") {
        return Err("unsupported layout: fortran_order=True (C-order only)".to_string());
    }
// Parse shape: look for 'shape': (N, D) or 'shape': (N,)
let shape = parse_shape(header_str)?;
Ok(shape)
}
fn parse_shape(header: &str) -> Result<NpyHeader, String> {
// Find the shape tuple in the header dict
let shape_start = header
.find("'shape':")
.or_else(|| header.find("\"shape\":"))
.ok_or_else(|| format!("no 'shape' key in npy header: {header}"))?;
let after_key = &header[shape_start..];
let paren_open = after_key
.find('(')
.ok_or_else(|| "no opening paren in shape".to_string())?;
let paren_close = after_key
.find(')')
.ok_or_else(|| "no closing paren in shape".to_string())?;
let shape_content = &after_key[paren_open + 1..paren_close];
let parts: Vec<&str> = shape_content
.split(',')
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.collect();
match parts.len() {
1 => {
let rows: usize = parts[0]
.parse()
.map_err(|e| format!("bad shape dim: {e}"))?;
// 1-D array: each element is a 1-d vector
Ok(NpyHeader { rows, cols: 1 })
}
2 => {
let rows: usize = parts[0]
.parse()
.map_err(|e| format!("bad shape row: {e}"))?;
let cols: usize = parts[1]
.parse()
.map_err(|e| format!("bad shape col: {e}"))?;
Ok(NpyHeader { rows, cols })
}
_ => Err(format!(
"unsupported shape rank {}: {shape_content}",
parts.len()
)),
}
}
/// Parse a `.npy` file from a reader.
pub fn parse_npy<R: Read>(mut reader: R, config: &NpyConfig) -> Result<Vec<VectorRecord>, String> {
let header = parse_npy_header(&mut reader)?;
let total_floats = header.rows * header.cols;
let total_bytes = total_floats * 4;
let mut raw = vec![0u8; total_bytes];
reader
.read_exact(&mut raw)
.map_err(|e| format!("failed to read npy data ({total_bytes} bytes expected): {e}"))?;
let mut records = Vec::with_capacity(header.rows);
for i in 0..header.rows {
let offset = i * header.cols * 4;
let mut vector = Vec::with_capacity(header.cols);
for j in 0..header.cols {
let byte_offset = offset + j * 4;
let bytes: [u8; 4] = [
raw[byte_offset],
raw[byte_offset + 1],
raw[byte_offset + 2],
raw[byte_offset + 3],
];
vector.push(f32::from_le_bytes(bytes));
}
records.push(VectorRecord {
id: config.start_id + i as u64,
vector,
metadata: Vec::new(),
});
}
Ok(records)
}
/// Parse a `.npy` file from a file path.
pub fn parse_npy_file(path: &Path, config: &NpyConfig) -> Result<Vec<VectorRecord>, String> {
let file =
std::fs::File::open(path).map_err(|e| format!("cannot open {}: {e}", path.display()))?;
let reader = std::io::BufReader::new(file);
parse_npy(reader, config)
}
#[cfg(test)]
mod tests {
use super::*;
/// Build a minimal valid .npy file in memory with the given shape and f32 data.
fn build_npy(rows: usize, cols: usize, data: &[f32]) -> Vec<u8> {
let header_dict =
format!("{{'descr': '<f4', 'fortran_order': False, 'shape': ({rows}, {cols}), }}");
// Pad header to 64-byte alignment (magic=6 + version=2 + header_len=2 + dict)
let preamble_len = 6 + 2 + 2;
let total_header = preamble_len + header_dict.len();
let padding = (64 - (total_header % 64)) % 64;
let padded_dict_len = header_dict.len() + padding;
let mut buf = Vec::new();
// Magic
buf.push(0x93);
buf.extend_from_slice(b"NUMPY");
// Version 1.0
buf.push(1);
buf.push(0);
// Header length (u16 LE)
buf.extend_from_slice(&(padded_dict_len as u16).to_le_bytes());
// Dict
buf.extend_from_slice(header_dict.as_bytes());
// Padding (spaces + newline)
buf.extend(std::iter::repeat_n(b' ', padding.saturating_sub(1)));
if padding > 0 {
buf.push(b'\n');
}
// Data
for &val in data {
buf.extend_from_slice(&val.to_le_bytes());
}
buf
}
#[test]
fn parse_2d_npy() {
let data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
let npy = build_npy(2, 3, &data);
let records = parse_npy(npy.as_slice(), &NpyConfig::default()).unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].id, 0);
assert_eq!(records[0].vector, vec![1.0, 2.0, 3.0]);
assert_eq!(records[1].id, 1);
assert_eq!(records[1].vector, vec![4.0, 5.0, 6.0]);
}
#[test]
fn parse_npy_custom_start_id() {
let data = vec![0.5f32, 0.6];
let npy = build_npy(1, 2, &data);
let config = NpyConfig { start_id: 100 };
let records = parse_npy(npy.as_slice(), &config).unwrap();
assert_eq!(records[0].id, 100);
}
#[test]
fn bad_magic_rejected() {
let bad = b"NOT_NUMPY_DATA";
let result = parse_npy(bad.as_slice(), &NpyConfig::default());
assert!(result.is_err());
assert!(result.unwrap_err().contains("bad magic"));
}
#[test]
fn shape_parsing() {
let h = parse_shape("{'descr': '<f4', 'shape': (100, 384), }").unwrap();
assert_eq!(h.rows, 100);
assert_eq!(h.cols, 384);
let h = parse_shape("{'descr': '<f4', 'shape': (50,), }").unwrap();
assert_eq!(h.rows, 50);
assert_eq!(h.cols, 1);
}
}


@@ -0,0 +1,54 @@
//! Progress reporting for long-running imports.
use std::io::Write;
/// Trait for receiving import progress callbacks.
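///
/// A sketch of a custom implementation (`CountingReporter` is illustrative):
///
/// ```
/// use rvf_import::progress::ProgressReporter;
/// use std::sync::atomic::{AtomicU64, Ordering};
///
/// struct CountingReporter(AtomicU64);
///
/// impl ProgressReporter for CountingReporter {
///     fn report(&self, imported: u64, _rejected: u64, _total: u64) {
///         self.0.store(imported, Ordering::Relaxed);
///     }
/// }
/// ```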
pub trait ProgressReporter {
/// Called after each batch with cumulative counts.
fn report(&self, imported: u64, rejected: u64, total: u64);
}
/// A reporter that prints progress to stderr.
pub struct StderrProgress;
impl ProgressReporter for StderrProgress {
fn report(&self, imported: u64, rejected: u64, total: u64) {
if total > 0 {
let pct = (imported + rejected) as f64 / total as f64 * 100.0;
eprint!("\r imported: {imported}, rejected: {rejected}, total: {total} ({pct:.1}%)");
let _ = std::io::stderr().flush();
}
}
}
/// A reporter that collects reports for testing.
pub struct CollectingProgress {
reports: std::sync::Mutex<Vec<(u64, u64, u64)>>,
}
impl Default for CollectingProgress {
fn default() -> Self {
Self {
reports: std::sync::Mutex::new(Vec::new()),
}
}
}
impl CollectingProgress {
pub fn new() -> Self {
Self::default()
}
pub fn reports(&self) -> Vec<(u64, u64, u64)> {
self.reports.lock().unwrap().clone()
}
}
impl ProgressReporter for CollectingProgress {
fn report(&self, imported: u64, rejected: u64, total: u64) {
self.reports
.lock()
.unwrap()
.push((imported, rejected, total));
}
}