//! Collection types and operations use ruvector_core::types::{DistanceMetric, HnswConfig, QuantizationConfig}; use ruvector_core::vector_db::VectorDB; use serde::{Deserialize, Serialize}; use crate::error::{CollectionError, Result}; /// Configuration for creating a collection #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CollectionConfig { /// Vector dimensions pub dimensions: usize, /// Distance metric for similarity calculation pub distance_metric: DistanceMetric, /// HNSW index configuration pub hnsw_config: Option, /// Quantization configuration pub quantization: Option, /// Whether to store payload data on disk pub on_disk_payload: bool, } impl CollectionConfig { /// Validate the configuration pub fn validate(&self) -> Result<()> { if self.dimensions == 0 { return Err(CollectionError::InvalidConfiguration { message: "Dimensions must be greater than 0".to_string(), }); } if self.dimensions > 100_000 { return Err(CollectionError::InvalidConfiguration { message: "Dimensions exceeds maximum of 100,000".to_string(), }); } // Validate HNSW config if present if let Some(ref hnsw_config) = self.hnsw_config { if hnsw_config.m == 0 { return Err(CollectionError::InvalidConfiguration { message: "HNSW M parameter must be greater than 0".to_string(), }); } if hnsw_config.ef_construction < hnsw_config.m { return Err(CollectionError::InvalidConfiguration { message: "HNSW ef_construction must be >= M".to_string(), }); } if hnsw_config.ef_search == 0 { return Err(CollectionError::InvalidConfiguration { message: "HNSW ef_search must be greater than 0".to_string(), }); } } Ok(()) } /// Create a default configuration for the given dimensions pub fn with_dimensions(dimensions: usize) -> Self { Self { dimensions, distance_metric: DistanceMetric::Cosine, hnsw_config: Some(HnswConfig::default()), quantization: Some(QuantizationConfig::Scalar), on_disk_payload: true, } } } /// A collection of vectors with its own configuration pub struct Collection { /// Collection name pub name: String, /// Collection configuration pub config: CollectionConfig, /// Underlying vector database pub db: VectorDB, /// When the collection was created (Unix timestamp in seconds) pub created_at: i64, /// When the collection was last updated (Unix timestamp in seconds) pub updated_at: i64, } impl std::fmt::Debug for Collection { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Collection") .field("name", &self.name) .field("config", &self.config) .field("created_at", &self.created_at) .field("updated_at", &self.updated_at) .field("db", &"") .finish() } } impl Collection { /// Create a new collection pub fn new(name: String, config: CollectionConfig, storage_path: String) -> Result { // Validate configuration config.validate()?; // Create VectorDB with the configuration let db_options = ruvector_core::types::DbOptions { dimensions: config.dimensions, distance_metric: config.distance_metric, storage_path, hnsw_config: config.hnsw_config.clone(), quantization: config.quantization.clone(), }; let db = VectorDB::new(db_options)?; let now = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_secs() as i64; Ok(Self { name, config, db, created_at: now, updated_at: now, }) } /// Get collection statistics pub fn stats(&self) -> Result { let vectors_count = self.db.len()?; Ok(CollectionStats { vectors_count, segments_count: 1, // Single segment for now disk_size_bytes: 0, // TODO: Implement disk size calculation ram_size_bytes: 0, // TODO: Implement RAM size calculation }) } /// Update the last modified timestamp pub fn touch(&mut self) { self.updated_at = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() .as_secs() as i64; } } /// Statistics about a collection #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CollectionStats { /// Number of vectors in the collection pub vectors_count: usize, /// Number of segments (partitions) in the collection pub segments_count: usize, /// Total disk space used (bytes) pub disk_size_bytes: u64, /// Total RAM used (bytes) pub ram_size_bytes: u64, } impl CollectionStats { /// Check if the collection is empty pub fn is_empty(&self) -> bool { self.vectors_count == 0 } /// Get human-readable disk size pub fn disk_size_human(&self) -> String { format_bytes(self.disk_size_bytes) } /// Get human-readable RAM size pub fn ram_size_human(&self) -> String { format_bytes(self.ram_size_bytes) } } /// Format bytes into human-readable size fn format_bytes(bytes: u64) -> String { const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"]; if bytes == 0 { return "0 B".to_string(); } let mut size = bytes as f64; let mut unit_idx = 0; while size >= 1024.0 && unit_idx < UNITS.len() - 1 { size /= 1024.0; unit_idx += 1; } format!("{:.2} {}", size, UNITS[unit_idx]) } #[cfg(test)] mod tests { use super::*; #[test] fn test_collection_config_validation() { // Valid config let config = CollectionConfig::with_dimensions(384); assert!(config.validate().is_ok()); // Invalid: zero dimensions let config = CollectionConfig { dimensions: 0, distance_metric: DistanceMetric::Cosine, hnsw_config: None, quantization: None, on_disk_payload: true, }; assert!(config.validate().is_err()); // Invalid: dimensions too large let config = CollectionConfig { dimensions: 200_000, distance_metric: DistanceMetric::Cosine, hnsw_config: None, quantization: None, on_disk_payload: true, }; assert!(config.validate().is_err()); } #[test] fn test_format_bytes() { assert_eq!(format_bytes(0), "0 B"); assert_eq!(format_bytes(512), "512.00 B"); assert_eq!(format_bytes(1024), "1.00 KB"); assert_eq!(format_bytes(1536), "1.50 KB"); assert_eq!(format_bytes(1048576), "1.00 MB"); assert_eq!(format_bytes(1073741824), "1.00 GB"); } }