Files
wifi-densepose/crates/ruvector-core/src/storage.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

447 lines
15 KiB
Rust

//! Storage layer with redb for metadata and memory-mapped vectors
//!
//! This module is only available when the "storage" feature is enabled.
//! For WASM builds, use the in-memory storage backend instead.
#[cfg(feature = "storage")]
use crate::error::{Result, RuvectorError};
#[cfg(feature = "storage")]
use crate::types::{DbOptions, VectorEntry, VectorId};
#[cfg(feature = "storage")]
use bincode::config;
#[cfg(feature = "storage")]
use once_cell::sync::Lazy;
#[cfg(feature = "storage")]
use parking_lot::Mutex;
#[cfg(feature = "storage")]
use redb::{Database, ReadableTable, ReadableTableMetadata, TableDefinition};
#[cfg(feature = "storage")]
use serde_json;
#[cfg(feature = "storage")]
use std::collections::HashMap;
#[cfg(feature = "storage")]
use std::path::{Path, PathBuf};
#[cfg(feature = "storage")]
use std::sync::Arc;
/// redb table mapping a vector ID to its bincode-encoded vector bytes.
#[cfg(feature = "storage")]
const VECTORS_TABLE: TableDefinition<&str, &[u8]> = TableDefinition::new("vectors");
/// redb table mapping a vector ID to its metadata, stored as a JSON string.
// NOTE(review): only `VECTORS_TABLE` above carries `#[cfg(feature = "storage")]`;
// this constant and the ones below are ungated even though `TableDefinition` is
// imported under that feature. Presumably the whole module is gated by its
// parent — confirm, or gate these items consistently.
const METADATA_TABLE: TableDefinition<&str, &str> = TableDefinition::new("metadata");
/// redb table holding database-level configuration values as JSON strings.
const CONFIG_TABLE: TableDefinition<&str, &str> = TableDefinition::new("config");
/// Key used to store database configuration in CONFIG_TABLE
const DB_CONFIG_KEY: &str = "__ruvector_db_config__";
// Global database connection pool to allow multiple VectorDB instances
// to share the same underlying database file.
//
// Keyed by the database file path as resolved in `VectorStorage::new`, which
// looks up and inserts entries while holding the mutex. No code visible in
// this file removes entries, so a pooled `Database` handle is kept alive by
// this map for as long as the static exists.
static DB_POOL: Lazy<Mutex<HashMap<PathBuf, Arc<Database>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
/// Storage backend for vector database
///
/// Holds a reference-counted handle to a redb `Database` together with the
/// vector length that inserts are validated against.
pub struct VectorStorage {
/// Handle to the redb database; obtained from (or registered in) `DB_POOL`
/// by `VectorStorage::new`, so it may be shared with other instances.
db: Arc<Database>,
/// Required length of every vector passed to `insert` / `insert_batch`.
dimensions: usize,
}
impl VectorStorage {
/// Create or open a vector storage at the given path
///
/// This method uses a global connection pool to allow multiple VectorDB
/// instances to share the same underlying database file, fixing the
/// "Database already open. Cannot acquire lock" error.
pub fn new<P: AsRef<Path>>(path: P, dimensions: usize) -> Result<Self> {
// SECURITY: Validate path to prevent directory traversal attacks
let path_ref = path.as_ref();
// Create parent directories if they don't exist (needed for canonicalize)
if let Some(parent) = path_ref.parent() {
if !parent.as_os_str().is_empty() && !parent.exists() {
std::fs::create_dir_all(parent).map_err(|e| {
RuvectorError::InvalidPath(format!("Failed to create directory: {}", e))
})?;
}
}
// Convert to absolute path first, then validate
let path_buf = if path_ref.is_absolute() {
path_ref.to_path_buf()
} else {
std::env::current_dir()
.map_err(|e| RuvectorError::InvalidPath(format!("Failed to get cwd: {}", e)))?
.join(path_ref)
};
// SECURITY: Check for path traversal attempts (e.g., "../../../etc/passwd")
// Only reject paths that contain ".." components trying to escape
let path_str = path_ref.to_string_lossy();
if path_str.contains("..") {
// Verify the resolved path doesn't escape intended boundaries
// For absolute paths, we allow them as-is (user explicitly specified)
// For relative paths with "..", check they don't escape cwd
if !path_ref.is_absolute() {
if let Ok(cwd) = std::env::current_dir() {
// Normalize the path by resolving .. components
let mut normalized = cwd.clone();
for component in path_ref.components() {
match component {
std::path::Component::ParentDir => {
if !normalized.pop() || !normalized.starts_with(&cwd) {
return Err(RuvectorError::InvalidPath(
"Path traversal attempt detected".to_string(),
));
}
}
std::path::Component::Normal(c) => normalized.push(c),
_ => {}
}
}
}
}
}
// Check if we already have a Database instance for this path
let db = {
let mut pool = DB_POOL.lock();
if let Some(existing_db) = pool.get(&path_buf) {
// Reuse existing database connection
Arc::clone(existing_db)
} else {
// Create new database and add to pool
let new_db = Arc::new(Database::create(&path_buf)?);
// Initialize tables
let write_txn = new_db.begin_write()?;
{
let _ = write_txn.open_table(VECTORS_TABLE)?;
let _ = write_txn.open_table(METADATA_TABLE)?;
let _ = write_txn.open_table(CONFIG_TABLE)?;
}
write_txn.commit()?;
pool.insert(path_buf, Arc::clone(&new_db));
new_db
}
};
Ok(Self { db, dimensions })
}
/// Insert a vector entry, replacing any entry already stored under the same ID.
///
/// When `entry.id` is `None` a fresh UUID v4 string is generated. The vector
/// and its metadata are written in a single write transaction. If the entry
/// carries no metadata, any metadata previously stored under the ID is
/// removed so that a later `get` reflects exactly what was inserted.
///
/// # Errors
///
/// Returns `RuvectorError::DimensionMismatch` when the vector length differs
/// from the storage dimensions, `RuvectorError::SerializationError` when the
/// vector or metadata cannot be encoded, and propagates redb errors.
pub fn insert(&self, entry: &VectorEntry) -> Result<VectorId> {
    if entry.vector.len() != self.dimensions {
        return Err(RuvectorError::DimensionMismatch {
            expected: self.dimensions,
            actual: entry.vector.len(),
        });
    }

    let id = entry
        .id
        .clone()
        .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());

    // Serialize up front so the write transaction (which holds the
    // database's write lock) is only opened once nothing else can fail
    // before the actual table writes.
    let vector_data = bincode::encode_to_vec(&entry.vector, config::standard())
        .map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
    let metadata_json = match &entry.metadata {
        Some(metadata) => Some(
            serde_json::to_string(metadata)
                .map_err(|e| RuvectorError::SerializationError(e.to_string()))?,
        ),
        None => None,
    };

    let write_txn = self.db.begin_write()?;
    {
        let mut table = write_txn.open_table(VECTORS_TABLE)?;
        table.insert(id.as_str(), vector_data.as_slice())?;

        let mut meta_table = write_txn.open_table(METADATA_TABLE)?;
        match &metadata_json {
            Some(json) => {
                meta_table.insert(id.as_str(), json.as_str())?;
            }
            None => {
                // Drop metadata left behind by an earlier entry with this ID.
                meta_table.remove(id.as_str())?;
            }
        }
    }
    write_txn.commit()?;

    Ok(id)
}
/// Insert multiple vectors in a single write transaction.
///
/// Returns the IDs in the same order as `entries`; entries without an ID get
/// a fresh UUID v4 string. Existing entries with the same ID are replaced,
/// and an entry without metadata clears any metadata previously stored under
/// its ID. Either every entry is committed or none is.
///
/// # Errors
///
/// Returns `RuvectorError::DimensionMismatch` for the first entry whose
/// vector length differs from the storage dimensions (checked before the
/// database is touched), `RuvectorError::SerializationError` when encoding
/// fails, and propagates redb errors.
pub fn insert_batch(&self, entries: &[VectorEntry]) -> Result<Vec<VectorId>> {
    // Validate the whole batch before taking the write lock so a bad entry
    // never opens (and then has to abort) a write transaction.
    if let Some(bad) = entries
        .iter()
        .find(|entry| entry.vector.len() != self.dimensions)
    {
        return Err(RuvectorError::DimensionMismatch {
            expected: self.dimensions,
            actual: bad.vector.len(),
        });
    }

    // Nothing to write: skip the transaction entirely.
    if entries.is_empty() {
        return Ok(Vec::new());
    }

    let write_txn = self.db.begin_write()?;
    let mut ids = Vec::with_capacity(entries.len());
    {
        let mut table = write_txn.open_table(VECTORS_TABLE)?;
        let mut meta_table = write_txn.open_table(METADATA_TABLE)?;

        for entry in entries {
            let id = entry
                .id
                .clone()
                .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());

            // Serialize and insert vector
            let vector_data = bincode::encode_to_vec(&entry.vector, config::standard())
                .map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
            table.insert(id.as_str(), vector_data.as_slice())?;

            match &entry.metadata {
                Some(metadata) => {
                    let metadata_json = serde_json::to_string(metadata)
                        .map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
                    meta_table.insert(id.as_str(), metadata_json.as_str())?;
                }
                None => {
                    // Drop metadata left behind by an earlier entry with this ID.
                    meta_table.remove(id.as_str())?;
                }
            }

            ids.push(id);
        }
    }
    // Returning early above drops `write_txn` without committing, so a
    // failure part-way through leaves the database unchanged.
    write_txn.commit()?;

    Ok(ids)
}
/// Look up a stored entry by its ID.
///
/// Returns `Ok(None)` when no vector is stored under `id`. Otherwise the
/// vector is decoded from its bincode bytes and, if a metadata row exists for
/// the same ID, the metadata is parsed from JSON.
///
/// # Errors
///
/// Returns `RuvectorError::SerializationError` when the stored vector or
/// metadata cannot be decoded, and propagates redb errors.
pub fn get(&self, id: &str) -> Result<Option<VectorEntry>> {
    let txn = self.db.begin_read()?;

    let vectors = txn.open_table(VECTORS_TABLE)?;
    let raw_vector = match vectors.get(id)? {
        Some(guard) => guard,
        None => return Ok(None),
    };
    // bincode also reports the number of bytes consumed; it is not needed here.
    let (vector, _bytes_read): (Vec<f32>, usize) =
        bincode::decode_from_slice(raw_vector.value(), config::standard())
            .map_err(|e| RuvectorError::SerializationError(e.to_string()))?;

    // Metadata is optional: a missing row simply yields `None`.
    let metadata_rows = txn.open_table(METADATA_TABLE)?;
    let metadata = match metadata_rows.get(id)? {
        None => None,
        Some(guard) => Some(
            serde_json::from_str(guard.value())
                .map_err(|e| RuvectorError::SerializationError(e.to_string()))?,
        ),
    };

    let entry = VectorEntry {
        id: Some(id.to_string()),
        vector,
        metadata,
    };
    Ok(Some(entry))
}
/// Remove the vector stored under `id`, together with its metadata row.
///
/// Returns `Ok(true)` when a vector existed and was removed, `Ok(false)`
/// when there was nothing stored under `id`. Both removals happen in one
/// write transaction.
pub fn delete(&self, id: &str) -> Result<bool> {
    let txn = self.db.begin_write()?;

    let existed = {
        let mut vectors = txn.open_table(VECTORS_TABLE)?;
        let had_vector = vectors.remove(id)?.is_some();

        // The metadata row is removed regardless; whether one existed does
        // not affect the return value.
        let mut metadata_rows = txn.open_table(METADATA_TABLE)?;
        let _ = metadata_rows.remove(id)?;

        had_vector
    };

    txn.commit()?;
    Ok(existed)
}
/// Number of rows currently in the vectors table.
///
/// The count reported by redb is converted to `usize` with an `as` cast.
pub fn len(&self) -> Result<usize> {
    let txn = self.db.begin_read()?;
    let vectors = txn.open_table(VECTORS_TABLE)?;
    let row_count = vectors.len()?;
    Ok(row_count as usize)
}
/// Whether the vectors table holds no rows.
///
/// Delegates to `len`, so any error from reading the table is returned.
pub fn is_empty(&self) -> Result<bool> {
    self.len().map(|count| count == 0)
}
/// Collect the ID of every stored vector.
///
/// IDs are returned in the order the table iterator yields them. The first
/// row that fails to read aborts the collection and its error is returned.
pub fn all_ids(&self) -> Result<Vec<VectorId>> {
    let txn = self.db.begin_read()?;
    let vectors = txn.open_table(VECTORS_TABLE)?;
    let rows = vectors.iter()?;

    rows.map(|row| -> Result<VectorId> {
        // Only the key is needed; the encoded vector bytes are ignored.
        let (key, _encoded_vector) = row?;
        Ok(key.value().to_string())
    })
    .collect()
}
/// Persist the database options as JSON under `DB_CONFIG_KEY`.
///
/// Any previously saved configuration is overwritten.
///
/// # Errors
///
/// Returns `RuvectorError::SerializationError` when `options` cannot be
/// serialized, and propagates redb errors.
pub fn save_config(&self, options: &DbOptions) -> Result<()> {
    // Serialize first so a failure never opens a write transaction.
    let serialized = serde_json::to_string(options)
        .map_err(|e| RuvectorError::SerializationError(e.to_string()))?;

    let txn = self.db.begin_write()?;
    {
        let mut config_table = txn.open_table(CONFIG_TABLE)?;
        config_table.insert(DB_CONFIG_KEY, serialized.as_str())?;
    }
    txn.commit().map_err(RuvectorError::from)
}
/// Load database configuration from persistent storage
pub fn load_config(&self) -> Result<Option<DbOptions>> {
let read_txn = self.db.begin_read()?;
// Try to open config table - may not exist in older databases
let table = match read_txn.open_table(CONFIG_TABLE) {
Ok(t) => t,
Err(_) => return Ok(None),
};
let Some(config_data) = table.get(DB_CONFIG_KEY)? else {
return Ok(None);
};
let config: DbOptions = serde_json::from_str(config_data.value())
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
Ok(Some(config))
}
/// Get the stored dimensions
pub fn dimensions(&self) -> usize {
self.dimensions
}
}
// Add uuid dependency
use uuid;
#[cfg(test)]
mod tests {
use super::*;
use tempfile::tempdir;
/// A vector inserted under an explicit ID can be read back unchanged.
#[test]
fn test_insert_and_get() -> Result<()> {
    let tmp = tempdir().unwrap();
    let db_file = tmp.path().join("test.db");
    let storage = VectorStorage::new(db_file, 3)?;

    let stored_id = storage.insert(&VectorEntry {
        id: Some("test1".to_string()),
        vector: vec![1.0, 2.0, 3.0],
        metadata: None,
    })?;
    // An explicit ID is returned as-is rather than replaced by a UUID.
    assert_eq!(stored_id, "test1");

    let fetched = storage.get("test1")?;
    assert!(fetched.is_some());
    assert_eq!(fetched.unwrap().vector, vec![1.0, 2.0, 3.0]);

    Ok(())
}
/// A batch of ID-less entries yields one generated ID per entry and stores them all.
#[test]
fn test_batch_insert() -> Result<()> {
    let tmp = tempdir().unwrap();
    let storage = VectorStorage::new(tmp.path().join("test.db"), 3)?;

    // Two entries without explicit IDs, so the storage must generate them.
    let batch: Vec<VectorEntry> = [[1.0_f32, 2.0, 3.0], [4.0, 5.0, 6.0]]
        .iter()
        .map(|values| VectorEntry {
            id: None,
            vector: values.to_vec(),
            metadata: None,
        })
        .collect();

    let generated_ids = storage.insert_batch(&batch)?;
    assert_eq!(generated_ids.len(), 2);
    assert_eq!(storage.len()?, 2);

    Ok(())
}
/// Deleting an existing ID reports `true` and empties the storage.
#[test]
fn test_delete() -> Result<()> {
    let tmp = tempdir().unwrap();
    let storage = VectorStorage::new(tmp.path().join("test.db"), 3)?;

    storage.insert(&VectorEntry {
        id: Some("test1".to_string()),
        vector: vec![1.0, 2.0, 3.0],
        metadata: None,
    })?;
    assert_eq!(storage.len()?, 1);

    let was_present = storage.delete("test1")?;
    assert!(was_present);
    assert_eq!(storage.len()?, 0);

    Ok(())
}
/// Regression test for the database locking bug: two `VectorStorage`
/// handles opened on the same file must both work and see each other's writes.
#[test]
fn test_multiple_instances_same_path() -> Result<()> {
    let tmp = tempdir().unwrap();
    let db_path = tmp.path().join("shared.db");

    let make_entry = |id: &str, vector: Vec<f32>| VectorEntry {
        id: Some(id.to_string()),
        vector,
        metadata: None,
    };

    // First handle writes one record.
    let first = VectorStorage::new(&db_path, 3)?;
    first.insert(&make_entry("test1", vec![1.0, 2.0, 3.0]))?;

    // Opening the SAME path again must not fail.
    let second = VectorStorage::new(&db_path, 3)?;
    for handle in [&first, &second] {
        assert_eq!(handle.len()?, 1);
    }

    // A write through the second handle is visible through both.
    second.insert(&make_entry("test2", vec![4.0, 5.0, 6.0]))?;
    for handle in [&first, &second] {
        assert_eq!(handle.len()?, 2);
    }

    // Each handle can read the record written by the other side.
    assert!(first.get("test1")?.is_some());
    assert!(second.get("test2")?.is_some());

    Ok(())
}
}