Files
wifi-densepose/vendor/ruvector/crates/ruvector-collections/src/collection.rs

254 lines
7.2 KiB
Rust

//! Collection types and operations
use ruvector_core::types::{DistanceMetric, HnswConfig, QuantizationConfig};
use ruvector_core::vector_db::VectorDB;
use serde::{Deserialize, Serialize};
use crate::error::{CollectionError, Result};
/// Configuration for creating a collection.
///
/// The values are checked by `CollectionConfig::validate`, which
/// `Collection::new` calls before it builds the underlying database.
/// The type is serializable via serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionConfig {
/// Vector dimensions. `validate` rejects `0` and anything above `100_000`.
pub dimensions: usize,
/// Distance metric for similarity calculation
pub distance_metric: DistanceMetric,
/// HNSW index configuration. When present, `validate` requires `m > 0`,
/// `ef_construction >= m` and `ef_search > 0`; `None` skips those checks.
pub hnsw_config: Option<HnswConfig>,
/// Quantization configuration (not inspected by `validate`).
pub quantization: Option<QuantizationConfig>,
/// Whether to store payload data on disk.
// NOTE(review): `Collection::new` in this file does not forward this flag to
// `DbOptions` — confirm where (or whether) it is consumed.
pub on_disk_payload: bool,
}
impl CollectionConfig {
    /// Check that this configuration can be used to build a collection.
    ///
    /// Checks run in a fixed order and the first failure is returned:
    /// 1. `dimensions` must be non-zero,
    /// 2. `dimensions` must not exceed 100,000,
    /// 3. if an HNSW configuration is present: `m > 0`,
    ///    `ef_construction >= m`, and `ef_search > 0`.
    ///
    /// # Errors
    ///
    /// Returns `CollectionError::InvalidConfiguration` carrying a message that
    /// describes the first violated rule.
    pub fn validate(&self) -> Result<()> {
        // Every failure is reported through the same error variant; only the text differs.
        let reject = |reason: &str| -> Result<()> {
            Err(CollectionError::InvalidConfiguration {
                message: reason.to_string(),
            })
        };

        match self.dimensions {
            0 => return reject("Dimensions must be greater than 0"),
            dims if dims > 100_000 => return reject("Dimensions exceeds maximum of 100,000"),
            _ => {}
        }

        // HNSW parameters are only checked when an index configuration was supplied.
        if let Some(hnsw) = self.hnsw_config.as_ref() {
            if hnsw.m == 0 {
                return reject("HNSW M parameter must be greater than 0");
            }
            if hnsw.ef_construction < hnsw.m {
                return reject("HNSW ef_construction must be >= M");
            }
            if hnsw.ef_search == 0 {
                return reject("HNSW ef_search must be greater than 0");
            }
        }

        Ok(())
    }

    /// Build a configuration for vectors of the given dimensionality using
    /// the stock settings: cosine distance, the default HNSW configuration,
    /// scalar quantization, and on-disk payload enabled.
    ///
    /// The result is not validated here; call [`CollectionConfig::validate`]
    /// to check `dimensions`.
    pub fn with_dimensions(dimensions: usize) -> Self {
        let hnsw_config = Some(HnswConfig::default());
        let quantization = Some(QuantizationConfig::Scalar);

        Self {
            dimensions,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config,
            quantization,
            on_disk_payload: true,
        }
    }
}
/// A collection of vectors with its own configuration.
///
/// Bundles a name, the [`CollectionConfig`] it was created with, the backing
/// [`VectorDB`], and creation / last-update timestamps. `Debug` is implemented
/// by hand in this file and prints a placeholder for `db`.
pub struct Collection {
/// Collection name
pub name: String,
/// Collection configuration
pub config: CollectionConfig,
/// Underlying vector database
pub db: VectorDB,
/// When the collection was created (Unix timestamp in seconds)
pub created_at: i64,
/// When the collection was last updated (Unix timestamp in seconds).
/// Set equal to `created_at` by `Collection::new` and refreshed by `touch`.
pub updated_at: i64,
}
/// Hand-written `Debug`: the database handle is rendered as a fixed
/// placeholder string instead of being formatted itself.
impl std::fmt::Debug for Collection {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructure so that adding a field to `Collection` forces this impl to be revisited.
        let Self {
            name,
            config,
            created_at,
            updated_at,
            db: _,
        } = self;

        let mut out = f.debug_struct("Collection");
        out.field("name", name);
        out.field("config", config);
        out.field("created_at", created_at);
        out.field("updated_at", updated_at);
        out.field("db", &"<VectorDB>");
        out.finish()
    }
}
impl Collection {
    /// Create a new collection.
    ///
    /// Validates `config`, then constructs the backing [`VectorDB`] from the
    /// configuration's dimensions, distance metric, HNSW and quantization
    /// settings together with `storage_path`. `created_at` and `updated_at`
    /// are both set to the current time.
    ///
    /// # Errors
    ///
    /// Returns an error if `config.validate()` rejects the configuration or if
    /// `VectorDB::new` fails.
    pub fn new(name: String, config: CollectionConfig, storage_path: String) -> Result<Self> {
        // Reject bad configurations before touching storage.
        config.validate()?;

        // NOTE(review): `config.on_disk_payload` is not forwarded to `DbOptions` here.
        let db_options = ruvector_core::types::DbOptions {
            dimensions: config.dimensions,
            distance_metric: config.distance_metric,
            storage_path,
            hnsw_config: config.hnsw_config.clone(),
            quantization: config.quantization.clone(),
        };
        let db = VectorDB::new(db_options)?;

        let now = Self::unix_timestamp_secs();
        Ok(Self {
            name,
            config,
            db,
            created_at: now,
            updated_at: now,
        })
    }

    /// Get collection statistics.
    ///
    /// `vectors_count` comes from the database; `segments_count` is fixed at
    /// `1`, and the disk / RAM sizes are placeholders (`0`) until implemented.
    ///
    /// # Errors
    ///
    /// Returns an error if querying the database length fails.
    pub fn stats(&self) -> Result<CollectionStats> {
        let vectors_count = self.db.len()?;
        Ok(CollectionStats {
            vectors_count,
            segments_count: 1,  // Single segment for now
            disk_size_bytes: 0, // TODO: Implement disk size calculation
            ram_size_bytes: 0,  // TODO: Implement RAM size calculation
        })
    }

    /// Update the last modified timestamp to the current time.
    pub fn touch(&mut self) {
        self.updated_at = Self::unix_timestamp_secs();
    }

    /// Current wall-clock time as whole seconds relative to the Unix epoch.
    ///
    /// Never panics: a system clock set before 1970 yields a negative value
    /// instead of aborting, and out-of-range values saturate rather than wrap.
    fn unix_timestamp_secs() -> i64 {
        // Clamp before the cast so the narrowing conversion cannot wrap around.
        let clamp = |secs: u64| secs.min(i64::MAX as u64) as i64;

        match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
            Ok(since_epoch) => clamp(since_epoch.as_secs()),
            // The error carries how far *before* the epoch the clock is.
            Err(before_epoch) => -clamp(before_epoch.duration().as_secs()),
        }
    }
}
/// Statistics about a collection.
///
/// Produced by `Collection::stats`. As implemented in this file, only
/// `vectors_count` is measured: `segments_count` is always `1` and both size
/// fields are `0` placeholders.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionStats {
/// Number of vectors in the collection
pub vectors_count: usize,
/// Number of segments (partitions) in the collection
pub segments_count: usize,
/// Total disk space used (bytes)
pub disk_size_bytes: u64,
/// Total RAM used (bytes)
pub ram_size_bytes: u64,
}
impl CollectionStats {
    /// Returns `true` when the collection holds no vectors.
    pub fn is_empty(&self) -> bool {
        matches!(self.vectors_count, 0)
    }

    /// Disk usage rendered as a human-readable string (see `format_bytes`).
    pub fn disk_size_human(&self) -> String {
        let on_disk = self.disk_size_bytes;
        format_bytes(on_disk)
    }

    /// RAM usage rendered as a human-readable string (see `format_bytes`).
    pub fn ram_size_human(&self) -> String {
        let in_memory = self.ram_size_bytes;
        format_bytes(in_memory)
    }
}
/// Format a byte count as a human-readable size.
///
/// Uses binary multiples (1 KB = 1024 B) and two decimal places, e.g.
/// `1536` becomes `"1.50 KB"`. Zero is special-cased as `"0 B"`.
///
/// The unit table extends to `EB`, which covers the entire `u64` range
/// (`u64::MAX` renders as `"16.00 EB"`), so large values are not reported as
/// thousands of terabytes.
fn format_bytes(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB", "PB", "EB"];
    const STEP: f64 = 1024.0;

    if bytes == 0 {
        return "0 B".to_string();
    }

    // Precision lost above 2^53 is far below the two decimals that are displayed.
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    // Compare the value as it will be *displayed* (rounded to two decimals):
    // otherwise 1_048_575 bytes would print as "1024.00 KB" instead of "1.00 MB".
    while unit_idx < UNITS.len() - 1 && (size * 100.0).round() / 100.0 >= STEP {
        size /= STEP;
        unit_idx += 1;
    }

    format!("{:.2} {}", size, UNITS[unit_idx])
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a config with the given dimensionality and no optional
    /// index / quantization settings.
    fn bare_config(dimensions: usize) -> CollectionConfig {
        CollectionConfig {
            dimensions,
            distance_metric: DistanceMetric::Cosine,
            hnsw_config: None,
            quantization: None,
            on_disk_payload: true,
        }
    }

    #[test]
    fn test_collection_config_validation() {
        // The stock configuration for a typical embedding size is accepted.
        assert!(CollectionConfig::with_dimensions(384).validate().is_ok());

        // Zero dimensions are rejected.
        assert!(bare_config(0).validate().is_err());

        // Dimensions above the supported maximum are rejected.
        assert!(bare_config(200_000).validate().is_err());
    }

    #[test]
    fn test_format_bytes() {
        let cases: [(u64, &str); 6] = [
            (0, "0 B"),
            (512, "512.00 B"),
            (1024, "1.00 KB"),
            (1536, "1.50 KB"),
            (1048576, "1.00 MB"),
            (1073741824, "1.00 GB"),
        ];

        for &(bytes, expected) in &cases {
            assert_eq!(format_bytes(bytes), expected);
        }
    }
}