Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
# Crate manifest for ruvector-snapshot: point-in-time snapshots and backup
# for Ruvector vector databases.
[package]
name = "ruvector-snapshot"
# Version, edition, license, authors, and repository are inherited from the
# workspace-level [workspace.package] table.
version.workspace = true
edition.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "Point-in-time snapshots and backup for Ruvector vector databases"
[dependencies]
# Core vector database engine (sibling crate in this workspace).
ruvector-core = { version = "2.0", path = "../ruvector-core" }
serde = { workspace = true }
serde_json = { workspace = true }
# bincode with serde interop: the binary on-disk snapshot payload format.
bincode = { workspace = true, features = ["serde"] }
thiserror = { workspace = true }
# Snapshot IDs are UUIDs.
uuid = { workspace = true }
# "serde" feature lets snapshot timestamps (de)serialize in metadata JSON.
chrono = { workspace = true, features = ["serde"] }
# GZIP compression of snapshot payloads.
flate2 = "1.0"
# SHA-256 integrity checksums.
sha2 = "0.10"
# Async filesystem I/O for the local storage backend.
tokio = { workspace = true, features = ["fs", "io-util"] }
async-trait = "0.1"

View File

@@ -0,0 +1,217 @@
# Ruvector Snapshot
[![Crates.io](https://img.shields.io/crates/v/ruvector-snapshot.svg)](https://crates.io/crates/ruvector-snapshot)
[![Documentation](https://docs.rs/ruvector-snapshot/badge.svg)](https://docs.rs/ruvector-snapshot)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Rust](https://img.shields.io/badge/rust-1.77%2B-orange.svg)](https://www.rust-lang.org)
**Point-in-time snapshots and backup for Ruvector vector databases.**
`ruvector-snapshot` provides efficient snapshot creation, storage, and restoration for Ruvector databases. Supports incremental snapshots, compression, and integrity verification. Part of the [Ruvector](https://github.com/ruvnet/ruvector) ecosystem.
## Why Ruvector Snapshot?
- **Point-in-Time Recovery**: Restore to any snapshot
- **Incremental Snapshots**: Only store changed data
- **Compression**: GZIP compression for storage efficiency
- **Integrity Verification**: SHA-256 checksums
- **Async I/O**: Non-blocking snapshot operations
## Features
### Core Capabilities
- **Full Snapshots**: Complete database backup
- **Incremental Snapshots**: Delta-based backups
- **Compression**: GZIP compression support
- **Checksums**: SHA-256 integrity verification
- **Async Operations**: Tokio-based async I/O
### Advanced Features
- **Snapshot Scheduling**: Automated snapshot creation
- **Retention Policies**: Automatic cleanup of old snapshots
- **Remote Storage**: S3/GCS compatible storage (planned)
- **Streaming Restore**: Progressive restoration
- **Parallel Processing**: Multi-threaded snapshot creation
## Installation
Add `ruvector-snapshot` to your `Cargo.toml`:
```toml
[dependencies]
ruvector-snapshot = "0.1.1"
```
## Quick Start
### Create Snapshot
```rust
use ruvector_snapshot::{SnapshotManager, SnapshotConfig};
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Configure snapshot manager
let config = SnapshotConfig {
snapshot_dir: "./snapshots".into(),
compression: true,
verify_checksum: true,
..Default::default()
};
let manager = SnapshotManager::new(config)?;
// Create a full snapshot (`db` is an already-opened Ruvector database handle)
let snapshot = manager.create_snapshot(&db, "backup-2024-01").await?;
println!("Created snapshot: {} ({} bytes)",
snapshot.id,
snapshot.size_bytes
);
Ok(())
}
```
### Restore from Snapshot
```rust
use ruvector_snapshot::SnapshotManager;
// List available snapshots
let snapshots = manager.list_snapshots().await?;
for snapshot in &snapshots {
println!("{}: {} ({})",
snapshot.id,
snapshot.created_at,
snapshot.size_bytes
);
}
// Restore from snapshot
let restored_db = manager.restore_snapshot(&snapshots[0].id).await?;
println!("Restored {} vectors", restored_db.len()?);
```
### Incremental Snapshots
```rust
use ruvector_snapshot::{SnapshotManager, SnapshotType};
// Create base snapshot
let base = manager.create_snapshot(&db, "base").await?;
// ... database modifications ...
// Create incremental snapshot
let incremental = manager.create_incremental_snapshot(
&db,
"incremental-1",
&base.id
).await?;
println!("Incremental snapshot: {} bytes (vs {} full)",
incremental.size_bytes,
base.size_bytes
);
```
## API Overview
### Core Types
```rust
// Snapshot configuration
pub struct SnapshotConfig {
pub snapshot_dir: PathBuf,
pub compression: bool,
pub compression_level: u32,
pub verify_checksum: bool,
pub max_concurrent_io: usize,
}
// Snapshot metadata
pub struct Snapshot {
pub id: String,
pub created_at: DateTime<Utc>,
pub size_bytes: u64,
pub checksum: String,
pub snapshot_type: SnapshotType,
pub vector_count: usize,
pub metadata: serde_json::Value,
}
// Snapshot types
pub enum SnapshotType {
Full,
Incremental { base_id: String },
}
```
### Manager Operations
```rust
impl SnapshotManager {
pub fn new(config: SnapshotConfig) -> Result<Self>;
// Snapshot creation
pub async fn create_snapshot(&self, db: &VectorDB, name: &str) -> Result<Snapshot>;
pub async fn create_incremental_snapshot(
&self,
db: &VectorDB,
name: &str,
base_id: &str
) -> Result<Snapshot>;
// Listing and info
pub async fn list_snapshots(&self) -> Result<Vec<Snapshot>>;
pub async fn get_snapshot(&self, id: &str) -> Result<Option<Snapshot>>;
// Restoration
pub async fn restore_snapshot(&self, id: &str) -> Result<VectorDB>;
pub async fn verify_snapshot(&self, id: &str) -> Result<bool>;
// Management
pub async fn delete_snapshot(&self, id: &str) -> Result<()>;
pub async fn cleanup_old_snapshots(&self, keep: usize) -> Result<usize>;
}
```
## Snapshot Format
```
snapshot-{id}/
├── metadata.json # Snapshot metadata
├── vectors.bin.gz # Compressed vector data
├── index.bin.gz # HNSW index data
├── metadata.bin.gz # Vector metadata
└── checksum.sha256 # Integrity checksum
```
## Related Crates
- **[ruvector-core](../ruvector-core/)** - Core vector database engine
- **[ruvector-replication](../ruvector-replication/)** - Data replication
## Documentation
- **[Main README](../../README.md)** - Complete project overview
- **[API Documentation](https://docs.rs/ruvector-snapshot)** - Full API reference
- **[GitHub Repository](https://github.com/ruvnet/ruvector)** - Source code
## License
**MIT License** - see [LICENSE](../../LICENSE) for details.
---
<div align="center">
**Part of [Ruvector](https://github.com/ruvnet/ruvector) - Built by [rUv](https://ruv.io)**
[![Star on GitHub](https://img.shields.io/github/stars/ruvnet/ruvector?style=social)](https://github.com/ruvnet/ruvector)
[Documentation](https://docs.rs/ruvector-snapshot) | [Crates.io](https://crates.io/crates/ruvector-snapshot) | [GitHub](https://github.com/ruvnet/ruvector)
</div>

View File

@@ -0,0 +1,52 @@
use thiserror::Error;
/// Result type for snapshot operations
pub type Result<T> = std::result::Result<T, SnapshotError>;
/// Errors that can occur during snapshot operations
#[derive(Error, Debug)]
pub enum SnapshotError {
    /// No snapshot with the requested ID exists in storage.
    #[error("Snapshot not found: {0}")]
    SnapshotNotFound(String),
    /// Snapshot data failed validation (e.g. damaged or incomplete files).
    #[error("Corrupted snapshot: {0}")]
    CorruptedSnapshot(String),
    /// A storage-backend operation failed; the message describes the cause.
    #[error("Storage error: {0}")]
    StorageError(String),
    /// GZIP compression or decompression failed.
    #[error("Compression error: {0}")]
    CompressionError(String),
    /// Underlying I/O failure (converted automatically via `From`).
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
    /// Binary (de)serialization of snapshot data failed.
    #[error("Serialization error: {0}")]
    SerializationError(String),
    /// JSON (de)serialization of metadata failed (converted via `From`).
    #[error("JSON error: {0}")]
    JsonError(#[from] serde_json::Error),
    /// Snapshot data did not match the checksum recorded in its metadata.
    #[error("Invalid checksum: expected {expected}, got {actual}")]
    InvalidChecksum { expected: String, actual: String },
    /// A collection-level operation failed.
    #[error("Collection error: {0}")]
    CollectionError(String),
}
impl SnapshotError {
/// Create a storage error with a custom message
pub fn storage<S: Into<String>>(msg: S) -> Self {
SnapshotError::StorageError(msg.into())
}
/// Create a corrupted snapshot error with a custom message
pub fn corrupted<S: Into<String>>(msg: S) -> Self {
SnapshotError::CorruptedSnapshot(msg.into())
}
/// Create a compression error with a custom message
pub fn compression<S: Into<String>>(msg: S) -> Self {
SnapshotError::CompressionError(msg.into())
}
}

View File

@@ -0,0 +1,27 @@
//! Snapshot and restore functionality for rUvector collections
//!
//! This crate provides backup and restore capabilities for vector collections,
//! including compression, checksums, and multiple storage backends.
mod error;
mod manager;
mod snapshot;
mod storage;
pub use error::{Result, SnapshotError};
pub use manager::SnapshotManager;
pub use snapshot::{Snapshot, SnapshotData, SnapshotMetadata, VectorRecord};
pub use storage::{LocalStorage, SnapshotStorage};
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: every primary re-export must be nameable from the crate
    /// root, so a typo in the `pub use` lines fails this test at compile time.
    #[test]
    fn test_module_exports() {
        let _err: Option<SnapshotError> = None;
        let _mgr: Option<SnapshotManager> = None;
        let _snap: Option<Snapshot> = None;
    }
}

View File

@@ -0,0 +1,294 @@
use crate::error::{Result, SnapshotError};
use crate::snapshot::{Snapshot, SnapshotData};
use crate::storage::SnapshotStorage;
/// Manages snapshot operations for collections
pub struct SnapshotManager {
    // Pluggable storage backend (e.g. `LocalStorage`), boxed as a trait
    // object so the backend can be chosen at runtime.
    storage: Box<dyn SnapshotStorage>,
}
impl SnapshotManager {
    /// Create a new snapshot manager with the given storage backend
    pub fn new(storage: Box<dyn SnapshotStorage>) -> Self {
        Self { storage }
    }

    /// Create a snapshot of a collection
    ///
    /// Validates the data (non-empty, consistent dimensions) before handing
    /// it to the storage backend.
    ///
    /// # Arguments
    /// * `snapshot_data` - The complete snapshot data including vectors and configuration
    ///
    /// # Returns
    /// * `Snapshot` - Metadata about the created snapshot
    ///
    /// # Errors
    /// Returns a `StorageError` if the collection is empty or any vector's
    /// length differs from `config.dimension`; backend failures propagate.
    pub async fn create_snapshot(&self, snapshot_data: SnapshotData) -> Result<Snapshot> {
        if snapshot_data.vectors.is_empty() {
            return Err(SnapshotError::storage(
                "Cannot create snapshot of empty collection",
            ));
        }
        // Verify every vector matches the configured dimension.
        let expected_dim = snapshot_data.config.dimension;
        for (idx, vector) in snapshot_data.vectors.iter().enumerate() {
            if vector.vector.len() != expected_dim {
                return Err(SnapshotError::storage(format!(
                    "Vector {} has dimension {} but expected {}",
                    idx,
                    vector.vector.len(),
                    expected_dim
                )));
            }
        }
        self.storage.save(&snapshot_data).await
    }

    /// Restore a snapshot by ID
    ///
    /// # Arguments
    /// * `id` - The unique snapshot identifier
    ///
    /// # Returns
    /// * `SnapshotData` - The complete snapshot data including vectors and configuration
    ///
    /// # Errors
    /// Rejects empty IDs; the backend reports `SnapshotNotFound` for
    /// unknown IDs.
    pub async fn restore_snapshot(&self, id: &str) -> Result<SnapshotData> {
        if id.is_empty() {
            return Err(SnapshotError::storage("Snapshot ID cannot be empty"));
        }
        self.storage.load(id).await
    }

    /// List all available snapshots
    ///
    /// # Returns
    /// * `Vec<Snapshot>` - All snapshot metadata in the backend's listing
    ///   order (the bundled `LocalStorage` returns newest first).
    pub async fn list_snapshots(&self) -> Result<Vec<Snapshot>> {
        self.storage.list().await
    }

    /// List snapshots for a specific collection
    ///
    /// # Arguments
    /// * `collection_name` - Name of the collection to filter by
    ///
    /// # Returns
    /// * `Vec<Snapshot>` - List of snapshots for the specified collection
    pub async fn list_snapshots_for_collection(
        &self,
        collection_name: &str,
    ) -> Result<Vec<Snapshot>> {
        let all_snapshots = self.storage.list().await?;
        Ok(all_snapshots
            .into_iter()
            .filter(|s| s.collection_name == collection_name)
            .collect())
    }

    /// Delete a snapshot by ID
    ///
    /// # Arguments
    /// * `id` - The unique snapshot identifier
    pub async fn delete_snapshot(&self, id: &str) -> Result<()> {
        if id.is_empty() {
            return Err(SnapshotError::storage("Snapshot ID cannot be empty"));
        }
        self.storage.delete(id).await
    }

    /// Get snapshot metadata by ID
    ///
    /// # Arguments
    /// * `id` - The unique snapshot identifier
    ///
    /// # Returns
    /// * `Snapshot` - Metadata about the snapshot
    ///
    /// # Errors
    /// `SnapshotNotFound` if no snapshot with `id` exists.
    pub async fn get_snapshot_info(&self, id: &str) -> Result<Snapshot> {
        let snapshots = self.storage.list().await?;
        snapshots
            .into_iter()
            .find(|s| s.id == id)
            .ok_or_else(|| SnapshotError::SnapshotNotFound(id.to_string()))
    }

    /// Delete old snapshots, keeping only the N most recent
    ///
    /// Snapshots are ordered by `created_at` locally, so this does not rely
    /// on the storage backend returning a sorted listing (the trait makes no
    /// ordering guarantee).
    ///
    /// # Arguments
    /// * `collection_name` - Name of the collection
    /// * `keep_count` - Number of recent snapshots to keep
    ///
    /// # Returns
    /// * `usize` - Number of snapshots deleted
    pub async fn cleanup_old_snapshots(
        &self,
        collection_name: &str,
        keep_count: usize,
    ) -> Result<usize> {
        let mut snapshots = self.list_snapshots_for_collection(collection_name).await?;
        if snapshots.len() <= keep_count {
            return Ok(0);
        }
        // Sort newest-first before truncating so the N most recent survive
        // regardless of the backend's listing order.
        snapshots.sort_by(|a, b| b.created_at.cmp(&a.created_at));
        let mut deleted = 0;
        for snapshot in &snapshots[keep_count..] {
            // Best-effort: skip snapshots that fail to delete rather than abort.
            if self.storage.delete(&snapshot.id).await.is_ok() {
                deleted += 1;
            }
        }
        Ok(deleted)
    }

    /// Get the total size of all snapshots in bytes
    pub async fn total_size(&self) -> Result<u64> {
        let snapshots = self.storage.list().await?;
        Ok(snapshots.iter().map(|s| s.size_bytes).sum())
    }

    /// Get the total size of snapshots for a specific collection
    pub async fn collection_size(&self, collection_name: &str) -> Result<u64> {
        let snapshots = self.list_snapshots_for_collection(collection_name).await?;
        Ok(snapshots.iter().map(|s| s.size_bytes).sum())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::snapshot::{CollectionConfig, DistanceMetric, VectorRecord};
    use crate::storage::LocalStorage;
    // NOTE(review): `PathBuf` appears unused in this module — the tests get
    // `PathBuf`s from `std::env::temp_dir().join(..)` directly. Confirm and
    // consider removing the import.
    use std::path::PathBuf;
    // Build an in-memory fixture: `vector_count` three-dimensional vectors
    // (the dimension is fixed at 3 to match the CollectionConfig below).
    fn create_test_snapshot_data(name: &str, vector_count: usize) -> SnapshotData {
        let config = CollectionConfig {
            dimension: 3,
            metric: DistanceMetric::Cosine,
            hnsw_config: None,
        };
        let vectors = (0..vector_count)
            .map(|i| {
                VectorRecord::new(
                    format!("v{}", i),
                    vec![i as f32, (i + 1) as f32, (i + 2) as f32],
                    None,
                )
            })
            .collect();
        SnapshotData::new(name.to_string(), config, vectors)
    }
    // Round-trip a snapshot through the local backend: create, restore,
    // then remove the on-disk artifacts.
    #[tokio::test]
    async fn test_create_and_restore_snapshot() {
        // NOTE(review): fixed temp-dir name; concurrent runs of this suite
        // could collide on leftover files.
        let temp_dir = std::env::temp_dir().join("ruvector-manager-test");
        let storage = Box::new(LocalStorage::new(temp_dir.clone()));
        let manager = SnapshotManager::new(storage);
        let snapshot_data = create_test_snapshot_data("test-collection", 5);
        // The ID is generated at construction; capture it before the move.
        let id = snapshot_data.id().to_string();
        // Create snapshot
        let snapshot = manager.create_snapshot(snapshot_data).await.unwrap();
        assert_eq!(snapshot.id, id);
        assert_eq!(snapshot.vectors_count, 5);
        // Restore snapshot
        let restored = manager.restore_snapshot(&id).await.unwrap();
        assert_eq!(restored.id(), id);
        assert_eq!(restored.vectors_count(), 5);
        // Cleanup (best effort; failures are ignored)
        let _ = manager.delete_snapshot(&id).await;
        let _ = std::fs::remove_dir_all(temp_dir);
    }
    // Listing should return every stored snapshot and support filtering by
    // collection name.
    #[tokio::test]
    async fn test_list_snapshots() {
        let temp_dir = std::env::temp_dir().join("ruvector-list-test");
        let storage = Box::new(LocalStorage::new(temp_dir.clone()));
        let manager = SnapshotManager::new(storage);
        // Create multiple snapshots
        let snapshot1 = create_test_snapshot_data("collection-1", 3);
        let snapshot2 = create_test_snapshot_data("collection-2", 5);
        let id1 = snapshot1.id().to_string();
        let id2 = snapshot2.id().to_string();
        manager.create_snapshot(snapshot1).await.unwrap();
        manager.create_snapshot(snapshot2).await.unwrap();
        // List all (>= 2 tolerates leftovers from earlier runs in the
        // shared temp directory)
        let all_snapshots = manager.list_snapshots().await.unwrap();
        assert!(all_snapshots.len() >= 2);
        // List by collection
        let collection1_snapshots = manager
            .list_snapshots_for_collection("collection-1")
            .await
            .unwrap();
        assert_eq!(collection1_snapshots.len(), 1);
        // Cleanup
        let _ = manager.delete_snapshot(&id1).await;
        let _ = manager.delete_snapshot(&id2).await;
        let _ = std::fs::remove_dir_all(temp_dir);
    }
    // Retention: of 5 snapshots, keeping 2 must delete exactly 3.
    #[tokio::test]
    async fn test_cleanup_old_snapshots() {
        let temp_dir = std::env::temp_dir().join("ruvector-cleanup-test");
        let storage = Box::new(LocalStorage::new(temp_dir.clone()));
        let manager = SnapshotManager::new(storage);
        // Create multiple snapshots for the same collection; the sleep
        // spaces out `created_at` so the ordering is unambiguous.
        for i in 0..5 {
            let snapshot_data = create_test_snapshot_data("test-collection", i + 1);
            manager.create_snapshot(snapshot_data).await.unwrap();
            tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
        }
        // Cleanup, keeping only 2 most recent
        let deleted = manager
            .cleanup_old_snapshots("test-collection", 2)
            .await
            .unwrap();
        assert_eq!(deleted, 3);
        // Verify only 2 remain
        let remaining = manager
            .list_snapshots_for_collection("test-collection")
            .await
            .unwrap();
        assert_eq!(remaining.len(), 2);
        // Cleanup
        let _ = std::fs::remove_dir_all(temp_dir);
    }
    // create_snapshot must reject an empty collection.
    #[tokio::test]
    async fn test_snapshot_validation() {
        let temp_dir = std::env::temp_dir().join("ruvector-validation-test");
        let storage = Box::new(LocalStorage::new(temp_dir.clone()));
        let manager = SnapshotManager::new(storage);
        // Test empty collection
        let config = CollectionConfig {
            dimension: 3,
            metric: DistanceMetric::Cosine,
            hnsw_config: None,
        };
        let empty_data = SnapshotData::new("empty".to_string(), config, vec![]);
        let result = manager.create_snapshot(empty_data).await;
        assert!(result.is_err());
        // Cleanup
        let _ = std::fs::remove_dir_all(temp_dir);
    }
}

View File

@@ -0,0 +1,195 @@
use bincode::{Decode, Encode};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use serde_json::Value;
/// Snapshot metadata and information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Snapshot {
    /// Unique snapshot identifier
    pub id: String,
    /// Name of the collection this snapshot represents
    pub collection_name: String,
    /// Timestamp when the snapshot was created
    pub created_at: DateTime<Utc>,
    /// Number of vectors in the snapshot
    pub vectors_count: usize,
    /// SHA-256 checksum of the snapshot data
    /// (computed over the serialized bytes before compression)
    pub checksum: String,
    /// Size of the snapshot in bytes (compressed)
    pub size_bytes: u64,
}
/// Complete snapshot data including metadata and vectors
///
/// Derives both serde and bincode `Encode`/`Decode`; storage backends
/// persist this type as a bincode payload.
#[derive(Debug, Serialize, Deserialize, Encode, Decode)]
pub struct SnapshotData {
    /// Snapshot metadata
    pub metadata: SnapshotMetadata,
    /// Collection configuration
    pub config: CollectionConfig,
    /// All vectors in the collection
    pub vectors: Vec<VectorRecord>,
}
impl SnapshotData {
    /// Build a snapshot with a freshly generated UUID v4 identifier, the
    /// current UTC time (RFC3339), and this crate's version as the format
    /// version.
    pub fn new(
        collection_name: String,
        config: CollectionConfig,
        vectors: Vec<VectorRecord>,
    ) -> Self {
        let metadata = SnapshotMetadata {
            id: uuid::Uuid::new_v4().to_string(),
            collection_name,
            created_at: Utc::now().to_rfc3339(),
            version: env!("CARGO_PKG_VERSION").to_string(),
        };
        Self {
            metadata,
            config,
            vectors,
        }
    }

    /// Number of vectors contained in this snapshot.
    pub fn vectors_count(&self) -> usize {
        self.vectors.len()
    }

    /// Unique identifier of this snapshot.
    pub fn id(&self) -> &str {
        self.metadata.id.as_str()
    }

    /// Name of the collection this snapshot was taken from.
    pub fn collection_name(&self) -> &str {
        self.metadata.collection_name.as_str()
    }
}
/// Snapshot metadata
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct SnapshotMetadata {
    /// Unique snapshot identifier (a UUID v4 string; see `SnapshotData::new`)
    pub id: String,
    /// Name of the collection
    pub collection_name: String,
    /// Creation timestamp (RFC3339 format)
    pub created_at: String,
    /// Version of the snapshot format (set to the crate version at creation)
    pub version: String,
}
/// Collection configuration stored in snapshot
///
/// Captured alongside the vectors so a restore can rebuild the collection
/// with the same settings.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct CollectionConfig {
    /// Vector dimension
    pub dimension: usize,
    /// Distance metric
    pub metric: DistanceMetric,
    /// HNSW configuration (`None` when no index settings were recorded)
    pub hnsw_config: Option<HnswConfig>,
}
/// Distance metric for vector similarity
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub enum DistanceMetric {
    /// Cosine similarity
    Cosine,
    /// Euclidean (L2) distance
    Euclidean,
    /// Dot-product similarity
    DotProduct,
}
/// HNSW index configuration
///
/// NOTE(review): field meanings below are assumed from standard HNSW
/// terminology — confirm against ruvector-core's index implementation.
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct HnswConfig {
    /// Assumed: maximum number of links per graph node
    pub m: usize,
    /// Assumed: candidate-list size during index construction
    pub ef_construction: usize,
    /// Assumed: candidate-list size during search
    pub ef_search: usize,
}
/// Individual vector record in a snapshot
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct VectorRecord {
    /// Unique vector identifier
    pub id: String,
    /// Vector data
    pub vector: Vec<f32>,
    /// Optional metadata payload (stored as JSON string for bincode compatibility)
    ///
    /// NOTE(review): `#[serde(skip)]` omits this field from the *derived*
    /// serde (de)serialization of the struct, while `#[bincode(with_serde)]`
    /// still encodes it in the bincode payload used by storage backends —
    /// so payloads survive snapshot round-trips but would be dropped by a
    /// direct serde/JSON serialization of a `VectorRecord`. Confirm this is
    /// intentional.
    #[serde(skip)]
    #[bincode(with_serde)]
    payload_json: Option<String>,
}
impl VectorRecord {
    /// Create a new vector record; the optional payload is stored internally
    /// as a JSON string (a payload that fails to serialize is dropped).
    pub fn new(id: String, vector: Vec<f32>, payload: Option<Value>) -> Self {
        Self {
            payload_json: payload.and_then(|v| serde_json::to_string(&v).ok()),
            id,
            vector,
        }
    }

    /// Parse the stored payload back into a `serde_json::Value`, if any
    /// (unparsable stored JSON yields `None`).
    pub fn payload(&self) -> Option<Value> {
        match self.payload_json.as_deref() {
            Some(json) => serde_json::from_str(json).ok(),
            None => None,
        }
    }

    /// Replace the payload; passing `None` clears it.
    pub fn set_payload(&mut self, payload: Option<Value>) {
        self.payload_json = payload.and_then(|v| serde_json::to_string(&v).ok());
    }

    /// Dimensionality (component count) of the stored vector.
    pub fn dimension(&self) -> usize {
        self.vector.len()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh record reports its id and dimension correctly.
    #[test]
    fn test_vector_record_creation() {
        let record = VectorRecord::new("test-1".to_string(), vec![1.0, 2.0, 3.0], None);
        assert_eq!(record.dimension(), 3);
        assert_eq!(record.id, "test-1");
    }

    /// `SnapshotData::new` generates a non-empty id and preserves the
    /// collection name and vector count.
    #[test]
    fn test_snapshot_data_creation() {
        let vectors: Vec<VectorRecord> = [("v1", [1.0, 0.0, 0.0]), ("v2", [0.0, 1.0, 0.0])]
            .into_iter()
            .map(|(id, v)| VectorRecord::new(id.to_string(), v.to_vec(), None))
            .collect();
        let data = SnapshotData::new(
            "test-collection".to_string(),
            CollectionConfig {
                dimension: 3,
                metric: DistanceMetric::Cosine,
                hnsw_config: None,
            },
            vectors,
        );
        assert!(!data.id().is_empty());
        assert_eq!(data.collection_name(), "test-collection");
        assert_eq!(data.vectors_count(), 2);
    }
}

View File

@@ -0,0 +1,276 @@
use async_trait::async_trait;
use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use flate2::Compression;
use sha2::{Digest, Sha256};
use std::io::{Read, Write};
use std::path::PathBuf;
use tokio::fs;
use crate::error::{Result, SnapshotError};
use crate::snapshot::{Snapshot, SnapshotData};
/// Trait for snapshot storage backends
///
/// Implementations persist complete `SnapshotData` payloads and expose the
/// recorded `Snapshot` metadata. All operations are async and fallible.
#[async_trait]
pub trait SnapshotStorage: Send + Sync {
    /// Save a snapshot to storage
    async fn save(&self, snapshot: &SnapshotData) -> Result<Snapshot>;
    /// Load a snapshot from storage
    ///
    /// Returns `SnapshotError::SnapshotNotFound` for unknown IDs.
    async fn load(&self, id: &str) -> Result<SnapshotData>;
    /// List all available snapshots
    ///
    /// NOTE(review): the trait itself guarantees no ordering; the bundled
    /// `LocalStorage` sorts newest-first. Callers should not rely on order.
    async fn list(&self) -> Result<Vec<Snapshot>>;
    /// Delete a snapshot from storage
    async fn delete(&self, id: &str) -> Result<()>;
}
/// Local filesystem storage backend
///
/// Each snapshot is stored as two files under `base_path`:
/// `{id}.snapshot.gz` (gzip-compressed bincode payload) and
/// `{id}.metadata.json` (pretty-printed `Snapshot` metadata).
pub struct LocalStorage {
    // Directory holding all snapshot data and metadata files.
    base_path: PathBuf,
}
impl LocalStorage {
    /// Create a new local storage instance
    ///
    /// The base directory is created lazily on first save/list, not here.
    pub fn new(base_path: PathBuf) -> Self {
        Self { base_path }
    }

    /// Get the path for a snapshot's compressed data file (`{id}.snapshot.gz`)
    fn snapshot_path(&self, id: &str) -> PathBuf {
        self.base_path.join(format!("{}.snapshot.gz", id))
    }

    /// Get the path for a snapshot's metadata file (`{id}.metadata.json`)
    fn metadata_path(&self, id: &str) -> PathBuf {
        self.base_path.join(format!("{}.metadata.json", id))
    }

    /// Compress data using gzip at the default compression level
    fn compress(data: &[u8]) -> Result<Vec<u8>> {
        let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
        encoder
            .write_all(data)
            .map_err(|e| SnapshotError::compression(format!("Compression failed: {}", e)))?;
        encoder
            .finish()
            .map_err(|e| SnapshotError::compression(format!("Finish compression failed: {}", e)))
    }

    /// Decompress gzip data
    fn decompress(data: &[u8]) -> Result<Vec<u8>> {
        let mut decoder = GzDecoder::new(data);
        let mut decompressed = Vec::new();
        decoder
            .read_to_end(&mut decompressed)
            .map_err(|e| SnapshotError::compression(format!("Decompression failed: {}", e)))?;
        Ok(decompressed)
    }

    /// Calculate the SHA-256 checksum of `data` as a lowercase hex string
    fn calculate_checksum(data: &[u8]) -> String {
        let mut hasher = Sha256::new();
        hasher.update(data);
        format!("{:x}", hasher.finalize())
    }

    /// Ensure the base directory exists
    async fn ensure_dir(&self) -> Result<()> {
        // `create_dir_all` succeeds when the directory already exists, so
        // call it unconditionally — a separate exists() pre-check would be
        // racy if two tasks initialize the same directory concurrently.
        fs::create_dir_all(&self.base_path).await?;
        Ok(())
    }
}
#[async_trait]
impl SnapshotStorage for LocalStorage {
    /// Serialize, checksum, compress, and persist a snapshot, then write a
    /// JSON metadata file alongside the data file.
    async fn save(&self, snapshot_data: &SnapshotData) -> Result<Snapshot> {
        self.ensure_dir().await?;
        let id = snapshot_data.id().to_string();
        let snapshot_path = self.snapshot_path(&id);
        let metadata_path = self.metadata_path(&id);
        // Serialize snapshot data with bincode's standard configuration.
        let config = bincode::config::standard();
        let serialized = bincode::encode_to_vec(snapshot_data, config)
            .map_err(|e| SnapshotError::SerializationError(e.to_string()))?;
        // Checksum the uncompressed bytes: load() verifies after decompression.
        let checksum = Self::calculate_checksum(&serialized);
        // Compress data
        let compressed = Self::compress(&serialized)?;
        let size_bytes = compressed.len() as u64;
        // Write compressed data
        fs::write(&snapshot_path, &compressed).await?;
        // Re-parse the RFC3339 timestamp recorded at snapshot creation.
        let created_at = chrono::DateTime::parse_from_rfc3339(&snapshot_data.metadata.created_at)
            .map_err(|e| SnapshotError::storage(format!("Invalid timestamp: {}", e)))?
            .with_timezone(&chrono::Utc);
        let snapshot = Snapshot {
            id: id.clone(),
            collection_name: snapshot_data.collection_name().to_string(),
            created_at,
            vectors_count: snapshot_data.vectors_count(),
            checksum,
            size_bytes,
        };
        // Write metadata
        let metadata_json = serde_json::to_string_pretty(&snapshot)?;
        fs::write(&metadata_path, metadata_json).await?;
        Ok(snapshot)
    }

    /// Load a snapshot, verifying its SHA-256 checksum against the metadata.
    ///
    /// # Errors
    /// * `SnapshotNotFound` - the data file does not exist
    /// * `CorruptedSnapshot` - the metadata file is missing
    /// * `InvalidChecksum` - the data does not match the recorded checksum
    async fn load(&self, id: &str) -> Result<SnapshotData> {
        let snapshot_path = self.snapshot_path(id);
        let metadata_path = self.metadata_path(id);
        // Check if files exist
        if !snapshot_path.exists() {
            return Err(SnapshotError::SnapshotNotFound(id.to_string()));
        }
        // A data file without its metadata cannot be checksum-verified:
        // report corruption instead of surfacing a bare I/O error.
        if !metadata_path.exists() {
            return Err(SnapshotError::corrupted(format!(
                "Snapshot {} is missing its metadata file",
                id
            )));
        }
        // Load and verify metadata
        let metadata_json = fs::read_to_string(&metadata_path).await?;
        let snapshot: Snapshot = serde_json::from_str(&metadata_json)?;
        // Load compressed data
        let compressed = fs::read(&snapshot_path).await?;
        // Decompress
        let decompressed = Self::decompress(&compressed)?;
        // Verify integrity before attempting to deserialize.
        let actual_checksum = Self::calculate_checksum(&decompressed);
        if actual_checksum != snapshot.checksum {
            return Err(SnapshotError::InvalidChecksum {
                expected: snapshot.checksum,
                actual: actual_checksum,
            });
        }
        // Deserialize
        let config = bincode::config::standard();
        let (snapshot_data, _): (SnapshotData, usize) =
            bincode::decode_from_slice(&decompressed, config)
                .map_err(|e| SnapshotError::SerializationError(e.to_string()))?;
        Ok(snapshot_data)
    }

    /// List snapshots by scanning for `{id}.metadata.json` files, newest
    /// first. Metadata files that fail to parse are skipped silently.
    async fn list(&self) -> Result<Vec<Snapshot>> {
        self.ensure_dir().await?;
        let mut snapshots = Vec::new();
        let mut entries = fs::read_dir(&self.base_path).await?;
        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            // Match files named `{id}.metadata.json`: extension "json" and a
            // file stem ending in ".metadata".
            if let Some(extension) = path.extension() {
                if extension == "json" {
                    if let Some(file_name) = path.file_stem() {
                        let file_name_str = file_name.to_string_lossy();
                        if file_name_str.ends_with(".metadata") {
                            let contents = fs::read_to_string(&path).await?;
                            if let Ok(snapshot) = serde_json::from_str::<Snapshot>(&contents) {
                                snapshots.push(snapshot);
                            }
                        }
                    }
                }
            }
        }
        // Sort by creation date (newest first)
        snapshots.sort_by(|a, b| b.created_at.cmp(&a.created_at));
        Ok(snapshots)
    }

    /// Delete a snapshot's data and metadata files.
    async fn delete(&self, id: &str) -> Result<()> {
        let snapshot_path = self.snapshot_path(id);
        let metadata_path = self.metadata_path(id);
        if !snapshot_path.exists() {
            return Err(SnapshotError::SnapshotNotFound(id.to_string()));
        }
        // Delete both files; the metadata file may already be gone, so only
        // remove it when present.
        fs::remove_file(&snapshot_path).await?;
        if metadata_path.exists() {
            fs::remove_file(&metadata_path).await?;
        }
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::snapshot::{CollectionConfig, DistanceMetric, VectorRecord};
    // Compress then decompress must reproduce the input exactly.
    #[test]
    fn test_compression_roundtrip() {
        let data = b"Hello, World! This is test data for compression.";
        let compressed = LocalStorage::compress(data).unwrap();
        let decompressed = LocalStorage::decompress(&compressed).unwrap();
        assert_eq!(data.to_vec(), decompressed);
    }
    // The checksum helper must emit a full SHA-256 hex digest.
    #[test]
    fn test_checksum_calculation() {
        let data = b"test data";
        let checksum = LocalStorage::calculate_checksum(data);
        assert_eq!(checksum.len(), 64); // SHA-256 produces 64 hex characters
    }
    // End-to-end save/list/load/delete against the real filesystem.
    // NOTE(review): fixed temp-dir name; concurrent runs of this suite
    // could collide on leftover files.
    #[tokio::test]
    async fn test_local_storage_roundtrip() {
        let temp_dir = std::env::temp_dir().join("ruvector-snapshot-test");
        let storage = LocalStorage::new(temp_dir.clone());
        let config = CollectionConfig {
            dimension: 3,
            metric: DistanceMetric::Cosine,
            hnsw_config: None,
        };
        let vectors = vec![
            VectorRecord::new("v1".to_string(), vec![1.0, 0.0, 0.0], None),
            VectorRecord::new("v2".to_string(), vec![0.0, 1.0, 0.0], None),
        ];
        let snapshot_data = SnapshotData::new("test-collection".to_string(), config, vectors);
        // The ID is generated at construction; capture it for later lookups.
        let id = snapshot_data.id().to_string();
        // Save
        let snapshot = storage.save(&snapshot_data).await.unwrap();
        assert_eq!(snapshot.id, id);
        assert_eq!(snapshot.vectors_count, 2);
        // List
        let snapshots = storage.list().await.unwrap();
        assert!(!snapshots.is_empty());
        // Load
        let loaded = storage.load(&id).await.unwrap();
        assert_eq!(loaded.id(), id);
        assert_eq!(loaded.vectors_count(), 2);
        // Delete
        storage.delete(&id).await.unwrap();
        // Cleanup (best effort)
        let _ = std::fs::remove_dir_all(temp_dir);
    }
}