Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
446
crates/ruvector-core/src/storage.rs
Normal file
446
crates/ruvector-core/src/storage.rs
Normal file
@@ -0,0 +1,446 @@
|
||||
//! Storage layer with redb for metadata and memory-mapped vectors
|
||||
//!
|
||||
//! This module is only available when the "storage" feature is enabled.
|
||||
//! For WASM builds, use the in-memory storage backend instead.
|
||||
|
||||
#[cfg(feature = "storage")]
|
||||
use crate::error::{Result, RuvectorError};
|
||||
#[cfg(feature = "storage")]
|
||||
use crate::types::{DbOptions, VectorEntry, VectorId};
|
||||
#[cfg(feature = "storage")]
|
||||
use bincode::config;
|
||||
#[cfg(feature = "storage")]
|
||||
use once_cell::sync::Lazy;
|
||||
#[cfg(feature = "storage")]
|
||||
use parking_lot::Mutex;
|
||||
#[cfg(feature = "storage")]
|
||||
use redb::{Database, ReadableTable, ReadableTableMetadata, TableDefinition};
|
||||
#[cfg(feature = "storage")]
|
||||
use serde_json;
|
||||
#[cfg(feature = "storage")]
|
||||
use std::collections::HashMap;
|
||||
#[cfg(feature = "storage")]
|
||||
use std::path::{Path, PathBuf};
|
||||
#[cfg(feature = "storage")]
|
||||
use std::sync::Arc;
|
||||
|
||||
/// redb table mapping a vector ID to its bincode-encoded vector bytes.
#[cfg(feature = "storage")]
const VECTORS_TABLE: TableDefinition<&str, &[u8]> = TableDefinition::new("vectors");

/// redb table mapping a vector ID to its JSON-encoded metadata.
#[cfg(feature = "storage")]
const METADATA_TABLE: TableDefinition<&str, &str> = TableDefinition::new("metadata");

/// redb table holding database-level configuration entries.
#[cfg(feature = "storage")]
const CONFIG_TABLE: TableDefinition<&str, &str> = TableDefinition::new("config");

/// Key used to store database configuration in CONFIG_TABLE
#[cfg(feature = "storage")]
const DB_CONFIG_KEY: &str = "__ruvector_db_config__";

/// Global database connection pool, keyed by absolute database path.
///
/// Allows multiple VectorDB instances to share the same underlying database
/// file instead of each trying to open (and lock) it separately.
// Gated like the imports above: every item here depends on names that are
// only imported when the "storage" feature is enabled.
#[cfg(feature = "storage")]
static DB_POOL: Lazy<Mutex<HashMap<PathBuf, Arc<Database>>>> =
    Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
|
||||
/// Storage backend for vector database
|
||||
pub struct VectorStorage {
|
||||
db: Arc<Database>,
|
||||
dimensions: usize,
|
||||
}
|
||||
|
||||
impl VectorStorage {
|
||||
/// Create or open a vector storage at the given path
|
||||
///
|
||||
/// This method uses a global connection pool to allow multiple VectorDB
|
||||
/// instances to share the same underlying database file, fixing the
|
||||
/// "Database already open. Cannot acquire lock" error.
|
||||
pub fn new<P: AsRef<Path>>(path: P, dimensions: usize) -> Result<Self> {
|
||||
// SECURITY: Validate path to prevent directory traversal attacks
|
||||
let path_ref = path.as_ref();
|
||||
|
||||
// Create parent directories if they don't exist (needed for canonicalize)
|
||||
if let Some(parent) = path_ref.parent() {
|
||||
if !parent.as_os_str().is_empty() && !parent.exists() {
|
||||
std::fs::create_dir_all(parent).map_err(|e| {
|
||||
RuvectorError::InvalidPath(format!("Failed to create directory: {}", e))
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to absolute path first, then validate
|
||||
let path_buf = if path_ref.is_absolute() {
|
||||
path_ref.to_path_buf()
|
||||
} else {
|
||||
std::env::current_dir()
|
||||
.map_err(|e| RuvectorError::InvalidPath(format!("Failed to get cwd: {}", e)))?
|
||||
.join(path_ref)
|
||||
};
|
||||
|
||||
// SECURITY: Check for path traversal attempts (e.g., "../../../etc/passwd")
|
||||
// Only reject paths that contain ".." components trying to escape
|
||||
let path_str = path_ref.to_string_lossy();
|
||||
if path_str.contains("..") {
|
||||
// Verify the resolved path doesn't escape intended boundaries
|
||||
// For absolute paths, we allow them as-is (user explicitly specified)
|
||||
// For relative paths with "..", check they don't escape cwd
|
||||
if !path_ref.is_absolute() {
|
||||
if let Ok(cwd) = std::env::current_dir() {
|
||||
// Normalize the path by resolving .. components
|
||||
let mut normalized = cwd.clone();
|
||||
for component in path_ref.components() {
|
||||
match component {
|
||||
std::path::Component::ParentDir => {
|
||||
if !normalized.pop() || !normalized.starts_with(&cwd) {
|
||||
return Err(RuvectorError::InvalidPath(
|
||||
"Path traversal attempt detected".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
std::path::Component::Normal(c) => normalized.push(c),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we already have a Database instance for this path
|
||||
let db = {
|
||||
let mut pool = DB_POOL.lock();
|
||||
|
||||
if let Some(existing_db) = pool.get(&path_buf) {
|
||||
// Reuse existing database connection
|
||||
Arc::clone(existing_db)
|
||||
} else {
|
||||
// Create new database and add to pool
|
||||
let new_db = Arc::new(Database::create(&path_buf)?);
|
||||
|
||||
// Initialize tables
|
||||
let write_txn = new_db.begin_write()?;
|
||||
{
|
||||
let _ = write_txn.open_table(VECTORS_TABLE)?;
|
||||
let _ = write_txn.open_table(METADATA_TABLE)?;
|
||||
let _ = write_txn.open_table(CONFIG_TABLE)?;
|
||||
}
|
||||
write_txn.commit()?;
|
||||
|
||||
pool.insert(path_buf, Arc::clone(&new_db));
|
||||
new_db
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self { db, dimensions })
|
||||
}
|
||||
|
||||
/// Insert a vector entry
|
||||
pub fn insert(&self, entry: &VectorEntry) -> Result<VectorId> {
|
||||
if entry.vector.len() != self.dimensions {
|
||||
return Err(RuvectorError::DimensionMismatch {
|
||||
expected: self.dimensions,
|
||||
actual: entry.vector.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let id = entry
|
||||
.id
|
||||
.clone()
|
||||
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||
|
||||
let write_txn = self.db.begin_write()?;
|
||||
{
|
||||
let mut table = write_txn.open_table(VECTORS_TABLE)?;
|
||||
|
||||
// Serialize vector data
|
||||
let vector_data = bincode::encode_to_vec(&entry.vector, config::standard())
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
|
||||
table.insert(id.as_str(), vector_data.as_slice())?;
|
||||
|
||||
// Store metadata if present
|
||||
if let Some(metadata) = &entry.metadata {
|
||||
let mut meta_table = write_txn.open_table(METADATA_TABLE)?;
|
||||
let metadata_json = serde_json::to_string(metadata)
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
meta_table.insert(id.as_str(), metadata_json.as_str())?;
|
||||
}
|
||||
}
|
||||
write_txn.commit()?;
|
||||
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
/// Insert multiple vectors in a batch
|
||||
pub fn insert_batch(&self, entries: &[VectorEntry]) -> Result<Vec<VectorId>> {
|
||||
let write_txn = self.db.begin_write()?;
|
||||
let mut ids = Vec::with_capacity(entries.len());
|
||||
|
||||
{
|
||||
let mut table = write_txn.open_table(VECTORS_TABLE)?;
|
||||
let mut meta_table = write_txn.open_table(METADATA_TABLE)?;
|
||||
|
||||
for entry in entries {
|
||||
if entry.vector.len() != self.dimensions {
|
||||
return Err(RuvectorError::DimensionMismatch {
|
||||
expected: self.dimensions,
|
||||
actual: entry.vector.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let id = entry
|
||||
.id
|
||||
.clone()
|
||||
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||
|
||||
// Serialize and insert vector
|
||||
let vector_data = bincode::encode_to_vec(&entry.vector, config::standard())
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
table.insert(id.as_str(), vector_data.as_slice())?;
|
||||
|
||||
// Insert metadata if present
|
||||
if let Some(metadata) = &entry.metadata {
|
||||
let metadata_json = serde_json::to_string(metadata)
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
meta_table.insert(id.as_str(), metadata_json.as_str())?;
|
||||
}
|
||||
|
||||
ids.push(id);
|
||||
}
|
||||
}
|
||||
|
||||
write_txn.commit()?;
|
||||
Ok(ids)
|
||||
}
|
||||
|
||||
/// Get a vector by ID
|
||||
pub fn get(&self, id: &str) -> Result<Option<VectorEntry>> {
|
||||
let read_txn = self.db.begin_read()?;
|
||||
let table = read_txn.open_table(VECTORS_TABLE)?;
|
||||
|
||||
let Some(vector_data) = table.get(id)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let (vector, _): (Vec<f32>, usize) =
|
||||
bincode::decode_from_slice(vector_data.value(), config::standard())
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
|
||||
// Try to get metadata
|
||||
let meta_table = read_txn.open_table(METADATA_TABLE)?;
|
||||
let metadata = if let Some(meta_data) = meta_table.get(id)? {
|
||||
let meta_str = meta_data.value();
|
||||
Some(
|
||||
serde_json::from_str(meta_str)
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(Some(VectorEntry {
|
||||
id: Some(id.to_string()),
|
||||
vector,
|
||||
metadata,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Delete a vector by ID
|
||||
pub fn delete(&self, id: &str) -> Result<bool> {
|
||||
let write_txn = self.db.begin_write()?;
|
||||
let deleted;
|
||||
|
||||
{
|
||||
let mut table = write_txn.open_table(VECTORS_TABLE)?;
|
||||
deleted = table.remove(id)?.is_some();
|
||||
|
||||
let mut meta_table = write_txn.open_table(METADATA_TABLE)?;
|
||||
let _ = meta_table.remove(id)?;
|
||||
}
|
||||
|
||||
write_txn.commit()?;
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
/// Get the number of vectors stored
|
||||
pub fn len(&self) -> Result<usize> {
|
||||
let read_txn = self.db.begin_read()?;
|
||||
let table = read_txn.open_table(VECTORS_TABLE)?;
|
||||
Ok(table.len()? as usize)
|
||||
}
|
||||
|
||||
/// Check if storage is empty
|
||||
pub fn is_empty(&self) -> Result<bool> {
|
||||
Ok(self.len()? == 0)
|
||||
}
|
||||
|
||||
/// Get all vector IDs
|
||||
pub fn all_ids(&self) -> Result<Vec<VectorId>> {
|
||||
let read_txn = self.db.begin_read()?;
|
||||
let table = read_txn.open_table(VECTORS_TABLE)?;
|
||||
|
||||
let mut ids = Vec::new();
|
||||
let iter = table.iter()?;
|
||||
for item in iter {
|
||||
let (key, _) = item?;
|
||||
ids.push(key.value().to_string());
|
||||
}
|
||||
|
||||
Ok(ids)
|
||||
}
|
||||
|
||||
/// Save database configuration to persistent storage
|
||||
pub fn save_config(&self, options: &DbOptions) -> Result<()> {
|
||||
let config_json = serde_json::to_string(options)
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
|
||||
let write_txn = self.db.begin_write()?;
|
||||
{
|
||||
let mut table = write_txn.open_table(CONFIG_TABLE)?;
|
||||
table.insert(DB_CONFIG_KEY, config_json.as_str())?;
|
||||
}
|
||||
write_txn.commit()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load database configuration from persistent storage
|
||||
pub fn load_config(&self) -> Result<Option<DbOptions>> {
|
||||
let read_txn = self.db.begin_read()?;
|
||||
|
||||
// Try to open config table - may not exist in older databases
|
||||
let table = match read_txn.open_table(CONFIG_TABLE) {
|
||||
Ok(t) => t,
|
||||
Err(_) => return Ok(None),
|
||||
};
|
||||
|
||||
let Some(config_data) = table.get(DB_CONFIG_KEY)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let config: DbOptions = serde_json::from_str(config_data.value())
|
||||
.map_err(|e| RuvectorError::SerializationError(e.to_string()))?;
|
||||
|
||||
Ok(Some(config))
|
||||
}
|
||||
|
||||
/// Get the stored dimensions
|
||||
pub fn dimensions(&self) -> usize {
|
||||
self.dimensions
|
||||
}
|
||||
}
|
||||
|
||||
// Add uuid dependency
|
||||
use uuid;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Builds a metadata-free three-component entry with an optional explicit ID.
    fn entry(id: Option<&str>, vector: [f32; 3]) -> VectorEntry {
        VectorEntry {
            id: id.map(str::to_string),
            vector: vector.to_vec(),
            metadata: None,
        }
    }

    /// A vector inserted under an explicit ID can be read back unchanged.
    #[test]
    fn test_insert_and_get() -> Result<()> {
        let dir = tempdir().unwrap();
        let storage = VectorStorage::new(dir.path().join("test.db"), 3)?;

        let id = storage.insert(&entry(Some("test1"), [1.0, 2.0, 3.0]))?;
        assert_eq!(id, "test1");

        let retrieved = storage.get("test1")?;
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().vector, vec![1.0, 2.0, 3.0]);

        Ok(())
    }

    /// A batch of ID-less entries yields one generated ID per entry.
    #[test]
    fn test_batch_insert() -> Result<()> {
        let dir = tempdir().unwrap();
        let storage = VectorStorage::new(dir.path().join("test.db"), 3)?;

        let entries = vec![
            entry(None, [1.0, 2.0, 3.0]),
            entry(None, [4.0, 5.0, 6.0]),
        ];

        let ids = storage.insert_batch(&entries)?;
        assert_eq!(ids.len(), 2);
        assert_eq!(storage.len()?, 2);

        Ok(())
    }

    /// Deleting an existing ID reports success and shrinks the store.
    #[test]
    fn test_delete() -> Result<()> {
        let dir = tempdir().unwrap();
        let storage = VectorStorage::new(dir.path().join("test.db"), 3)?;

        storage.insert(&entry(Some("test1"), [1.0, 2.0, 3.0]))?;
        assert_eq!(storage.len()?, 1);

        let deleted = storage.delete("test1")?;
        assert!(deleted);
        assert_eq!(storage.len()?, 0);

        Ok(())
    }

    /// Regression test for the database locking bug: multiple VectorStorage
    /// instances must be able to share the same database file.
    #[test]
    fn test_multiple_instances_same_path() -> Result<()> {
        let dir = tempdir().unwrap();
        let db_path = dir.path().join("shared.db");

        // First instance writes a record.
        let storage1 = VectorStorage::new(&db_path, 3)?;
        storage1.insert(&entry(Some("test1"), [1.0, 2.0, 3.0]))?;

        // Opening the SAME path a second time must NOT fail.
        let storage2 = VectorStorage::new(&db_path, 3)?;

        // Both instances see the record written through the first one.
        assert_eq!(storage1.len()?, 1);
        assert_eq!(storage2.len()?, 1);

        // A write through the second instance is visible to both as well.
        storage2.insert(&entry(Some("test2"), [4.0, 5.0, 6.0]))?;
        assert_eq!(storage1.len()?, 2);
        assert_eq!(storage2.len()?, 2);

        // Each record is readable through the instance that wrote it.
        let retrieved1 = storage1.get("test1")?;
        assert!(retrieved1.is_some());

        let retrieved2 = storage2.get("test2")?;
        assert!(retrieved2.is_some());

        Ok(())
    }
}
|
||||
Reference in New Issue
Block a user