Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
[package]
name = "ruvector-replication"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "Data replication and synchronization for ruvector"
[dependencies]
ruvector-core = { version = "2.0.1", path = "../ruvector-core" }
tokio = { workspace = true, features = ["time"] }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
dashmap = { workspace = true }
parking_lot = { workspace = true }
uuid = { workspace = true }
chrono = { workspace = true, features = ["serde"] }
futures = { workspace = true }
rand = { workspace = true }
bincode = { workspace = true }
[dev-dependencies]
tokio = { workspace = true, features = ["rt-multi-thread", "macros", "test-util"] }

View File

@@ -0,0 +1,226 @@
# ruvector-replication
[![Crates.io](https://img.shields.io/crates/v/ruvector-replication.svg)](https://crates.io/crates/ruvector-replication)
[![docs.rs](https://docs.rs/ruvector-replication/badge.svg)](https://docs.rs/ruvector-replication)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Rust](https://img.shields.io/badge/rust-1.77%2B-orange.svg)](https://www.rust-lang.org)
**Multi-master vector replication with quorum writes, vector clocks, and automatic conflict resolution.**
```toml
[dependencies]
ruvector-replication = "0.1.1"
```
When your vector database runs on more than one node, you need a way to keep data in sync without losing writes or slowing down queries. ruvector-replication handles that: it replicates vectors across nodes, resolves conflicts automatically, and lets you trade off consistency versus speed per-write. It plugs into the [RuVector](https://github.com/ruvnet/ruvector) ecosystem alongside Raft consensus and auto-sharding.
| | Single-node vector DB | ruvector-replication |
|---|---|---|
| **Availability** | One node goes down, everything stops | Replicas serve reads and accept writes |
| **Write scaling** | One writer | Multi-master -- write to any node |
| **Conflict handling** | N/A | Vector clocks, last-write-wins, or CRDTs |
| **Consistency control** | N/A | Per-write: One, Quorum, or All |
| **Sync efficiency** | N/A | Incremental deltas with compression |
| **Recovery** | Manual restore from backup | Automatic replica recovery |
## Quick Start
```rust
use ruvector_replication::{Replicator, ReplicationConfig, ConsistencyLevel};
use std::time::Duration;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let config = ReplicationConfig {
replication_factor: 3,
consistency_level: ConsistencyLevel::Quorum,
sync_interval: Duration::from_millis(100),
batch_size: 1000,
compression: true,
..Default::default()
};
let replicator = Replicator::new(config).await?;
replicator.start().await?;
Ok(())
}
```
## Key Features
| Feature | What It Does | Why It Matters |
|---------|-------------|----------------|
| **Multi-master replication** | Write to any node in the cluster | No single point of failure for writes |
| **Configurable consistency** | Choose One, Quorum, or All per write | Trade latency for safety on a per-operation basis |
| **Vector clock conflict resolution** | Track causal ordering across nodes | Detect and resolve concurrent writes correctly |
| **CRDT support** | Conflict-free replicated data types | Guaranteed convergence without coordination |
| **Change streams** | Real-time replication event stream | Monitor sync status and react to changes |
| **Incremental sync with compression** | Only send deltas, compressed on the wire | Minimize bandwidth between nodes |
| **Automatic recovery** | Replicas catch up after failures | No manual intervention on node restart |
| **Bandwidth throttling** | Cap replication throughput | Protect production traffic from replication storms |
### Write with Replication
```rust
use ruvector_replication::{Replicator, WriteOptions, ConsistencyLevel};
use std::time::Duration;
// Write with quorum consistency
let options = WriteOptions {
consistency: ConsistencyLevel::Quorum,
timeout: Duration::from_secs(5),
};
replicator.write(vector_entry, options).await?;
// Write with eventual consistency (faster)
let options = WriteOptions {
consistency: ConsistencyLevel::One,
..Default::default()
};
replicator.write(vector_entry, options).await?;
```
### Monitor Replication
```rust
// Get replication lag
let lag = replicator.lag().await?;
println!("Replication lag: {:?}", lag);
// Get replica status
for replica in replicator.replicas().await? {
println!("{}: {} (lag: {}ms)",
replica.id,
replica.status,
replica.lag_ms
);
}
// Subscribe to replication events
let mut stream = replicator.events().await?;
while let Some(event) = stream.next().await {
match event {
ReplicationEvent::Synced { node_id, entries } => {
println!("Synced {} entries to {}", entries, node_id);
}
ReplicationEvent::Conflict { key, resolution } => {
println!("Conflict on {}: {:?}", key, resolution);
}
_ => {}
}
}
```
## API Overview
### Core Types
```rust
// Replication configuration
pub struct ReplicationConfig {
pub replication_factor: usize,
pub consistency_level: ConsistencyLevel,
pub sync_interval: Duration,
pub batch_size: usize,
pub compression: bool,
pub conflict_resolution: ConflictResolution,
}
// Consistency levels
pub enum ConsistencyLevel {
One, // Write to one replica
Quorum, // Write to majority
All, // Write to all replicas
}
// Conflict resolution strategies
pub enum ConflictResolution {
LastWriteWins,
VectorClock,
Custom(Box<dyn ConflictResolver>),
}
// Replica information
pub struct ReplicaInfo {
pub id: NodeId,
pub status: ReplicaStatus,
pub lag_ms: u64,
pub last_sync: DateTime<Utc>,
}
```
### Replicator Operations
```rust
impl Replicator {
pub async fn new(config: ReplicationConfig) -> Result<Self>;
pub async fn start(&self) -> Result<()>;
pub async fn stop(&self) -> Result<()>;
// Write operations
pub async fn write(&self, entry: VectorEntry, options: WriteOptions) -> Result<()>;
pub async fn write_batch(&self, entries: Vec<VectorEntry>, options: WriteOptions) -> Result<()>;
// Monitoring
pub async fn lag(&self) -> Result<Duration>;
pub async fn replicas(&self) -> Result<Vec<ReplicaInfo>>;
pub async fn events(&self) -> Result<impl Stream<Item = ReplicationEvent>>;
// Management
pub async fn add_replica(&self, node_id: NodeId) -> Result<()>;
pub async fn remove_replica(&self, node_id: NodeId) -> Result<()>;
pub async fn force_sync(&self, node_id: NodeId) -> Result<()>;
}
```
## Architecture
```
┌─────────────────────────────────────────────────────────┐
│ Replication Flow │
│ │
│ Client │
│ │ │
│ ▼ │
│ ┌──────────┐ Quorum Write ┌──────────┐ │
│ │ Primary │────────────────────▶│ Replica 1│ │
│ │ │ │ │ │
│ │ Vectors │────────────────────▶│ Vectors │ │
│ └──────────┘ └──────────┘ │
│ │ │
│ │ Async Replication │
│ └──────────────────────────▶┌──────────┐ │
│ │ Replica 2│ │
│ │ │ │
│ │ Vectors │ │
│ └──────────┘ │
└─────────────────────────────────────────────────────────┘
```
## Related Crates
- **[ruvector-core](../ruvector-core/)** - Core vector database engine
- **[ruvector-cluster](../ruvector-cluster/)** - Clustering and sharding
- **[ruvector-raft](../ruvector-raft/)** - Raft consensus
## Documentation
- **[Main README](../../README.md)** - Complete project overview
- **[API Documentation](https://docs.rs/ruvector-replication)** - Full API reference
- **[GitHub Repository](https://github.com/ruvnet/ruvector)** - Source code
## License
**MIT License** - see [LICENSE](../../LICENSE) for details.
---
<div align="center">
**Part of [RuVector](https://github.com/ruvnet/ruvector) - Built by [rUv](https://ruv.io)**
[![Star on GitHub](https://img.shields.io/github/stars/ruvnet/ruvector?style=social)](https://github.com/ruvnet/ruvector)
[Documentation](https://docs.rs/ruvector-replication) | [Crates.io](https://crates.io/crates/ruvector-replication) | [GitHub](https://github.com/ruvnet/ruvector)
</div>

View File

@@ -0,0 +1,395 @@
//! Conflict resolution strategies for distributed replication
//!
//! Provides vector clocks for causality tracking and various
//! conflict resolution strategies including Last-Write-Wins
//! and custom merge functions.
use crate::{ReplicationError, Result};
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fmt;
/// Vector clock for tracking causality in distributed systems.
///
/// Each replica owns one component of the clock: it increments its own
/// component on local events and merges remote clocks on receipt, which
/// allows any two versions to be classified as ordered or concurrent.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct VectorClock {
    /// Map of replica ID to that replica's logical timestamp.
    /// Absent entries are treated as timestamp 0.
    clock: HashMap<String, u64>,
}
impl VectorClock {
/// Create a new vector clock
pub fn new() -> Self {
Self {
clock: HashMap::new(),
}
}
/// Increment the clock for a replica
pub fn increment(&mut self, replica_id: &str) {
let counter = self.clock.entry(replica_id.to_string()).or_insert(0);
*counter += 1;
}
/// Get the timestamp for a replica
pub fn get(&self, replica_id: &str) -> u64 {
self.clock.get(replica_id).copied().unwrap_or(0)
}
/// Update with another vector clock (taking max of each component)
pub fn merge(&mut self, other: &VectorClock) {
for (replica_id, &timestamp) in &other.clock {
let current = self.clock.entry(replica_id.clone()).or_insert(0);
*current = (*current).max(timestamp);
}
}
/// Check if this clock happens-before another clock
pub fn happens_before(&self, other: &VectorClock) -> bool {
let mut less = false;
let mut equal = true;
// Check all replicas in self
for (replica_id, &self_ts) in &self.clock {
let other_ts = other.get(replica_id);
if self_ts > other_ts {
return false;
}
if self_ts < other_ts {
less = true;
equal = false;
}
}
// Check replicas only in other
for (replica_id, &other_ts) in &other.clock {
if !self.clock.contains_key(replica_id) && other_ts > 0 {
less = true;
equal = false;
}
}
less || equal
}
/// Compare vector clocks for causality
pub fn compare(&self, other: &VectorClock) -> ClockOrdering {
if self == other {
return ClockOrdering::Equal;
}
if self.happens_before(other) {
return ClockOrdering::Before;
}
if other.happens_before(self) {
return ClockOrdering::After;
}
ClockOrdering::Concurrent
}
/// Check if two clocks are concurrent (conflicting)
pub fn is_concurrent(&self, other: &VectorClock) -> bool {
matches!(self.compare(other), ClockOrdering::Concurrent)
}
}
impl Default for VectorClock {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for VectorClock {
    /// Formats the clock as `{replica: ts, ...}`.
    ///
    /// Entries are sorted by replica ID so the output is deterministic —
    /// `HashMap` iteration order is unspecified and would otherwise make
    /// log lines and test output unstable.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut entries: Vec<_> = self.clock.iter().collect();
        entries.sort_by(|a, b| a.0.cmp(b.0));
        write!(f, "{{")?;
        for (i, (replica, ts)) in entries.into_iter().enumerate() {
            if i > 0 {
                write!(f, ", ")?;
            }
            write!(f, "{}: {}", replica, ts)?;
        }
        write!(f, "}}")
    }
}
/// Ordering relationship between two vector clocks.
///
/// Unlike a total order, vector clocks form a partial order: two clocks may
/// be `Concurrent`, meaning neither causally precedes the other.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClockOrdering {
    /// Clocks are equal (same causal history).
    Equal,
    /// First clock happens before second (first is causally older).
    Before,
    /// First clock happens after second (first is causally newer).
    After,
    /// Clocks are concurrent (conflicting); conflict resolution is required.
    Concurrent,
}
/// A value paired with the vector clock describing its causal history.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Versioned<T> {
    /// The wrapped value.
    pub value: T,
    /// Vector clock for this version; advanced on every update.
    pub clock: VectorClock,
    /// ID of the replica that created (and locally updates) this version.
    pub replica_id: String,
}
impl<T> Versioned<T> {
/// Create a new versioned value
pub fn new(value: T, replica_id: String) -> Self {
let mut clock = VectorClock::new();
clock.increment(&replica_id);
Self {
value,
clock,
replica_id,
}
}
/// Update the version with a new value
pub fn update(&mut self, value: T) {
self.value = value;
self.clock.increment(&self.replica_id);
}
/// Compare versions for causality
pub fn compare(&self, other: &Versioned<T>) -> ClockOrdering {
self.clock.compare(&other.clock)
}
}
/// Trait for conflict resolution strategies.
///
/// Implementors define how two conflicting versions of a value are
/// reconciled into a single surviving version.
pub trait ConflictResolver<T: Clone>: Send + Sync {
    /// Resolve a conflict between two versions, producing the winner.
    fn resolve(&self, v1: &Versioned<T>, v2: &Versioned<T>) -> Result<Versioned<T>>;

    /// Resolve multiple conflicting versions by folding [`Self::resolve`]
    /// pairwise over them, left to right.
    ///
    /// # Errors
    ///
    /// Returns `ReplicationError::ConflictResolution` when `versions` is
    /// empty, and propagates any error from `resolve`.
    fn resolve_many(&self, versions: Vec<Versioned<T>>) -> Result<Versioned<T>> {
        let mut iter = versions.into_iter();
        // Take the first version by value rather than cloning it out of the
        // vec; this also handles the empty and single-element cases.
        let mut result = iter.next().ok_or_else(|| {
            ReplicationError::ConflictResolution("No versions to resolve".to_string())
        })?;
        for version in iter {
            result = self.resolve(&result, &version)?;
        }
        Ok(result)
    }
}
/// Last-Write-Wins conflict resolution strategy.
pub struct LastWriteWins;

impl<T: Clone> ConflictResolver<T> for LastWriteWins {
    /// Keeps the causally-later version.
    ///
    /// NOTE(review): there is no wall-clock timestamp here — when the
    /// versions are `Concurrent` the tie-break arbitrarily favors `v2`
    /// (the incoming version), so "last write" is determined by argument
    /// order rather than real time.
    fn resolve(&self, v1: &Versioned<T>, v2: &Versioned<T>) -> Result<Versioned<T>> {
        match v1.compare(v2) {
            ClockOrdering::Before | ClockOrdering::Concurrent => Ok(v2.clone()),
            ClockOrdering::After | ClockOrdering::Equal => Ok(v1.clone()),
        }
    }
}
/// Custom merge function for conflict resolution.
///
/// Wraps a user-supplied closure that combines two concurrent values into
/// one; causally-ordered versions are resolved without calling the closure.
pub struct MergeFunction<T, F>
where
    F: Fn(&T, &T) -> T + Send + Sync,
{
    /// Closure invoked only for concurrent (conflicting) versions.
    merge_fn: F,
    // Marks the value type T without storing one.
    _phantom: std::marker::PhantomData<T>,
}

impl<T, F> MergeFunction<T, F>
where
    F: Fn(&T, &T) -> T + Send + Sync,
{
    /// Create a new merge function resolver around `merge_fn`.
    pub fn new(merge_fn: F) -> Self {
        Self {
            merge_fn,
            _phantom: std::marker::PhantomData,
        }
    }
}
impl<T: Clone + Send + Sync, F> ConflictResolver<T> for MergeFunction<T, F>
where
    F: Fn(&T, &T) -> T + Send + Sync,
{
    /// Keeps the causally-later version; for concurrent versions, combines
    /// the values with the merge closure and unions the clocks.
    ///
    /// The merged version keeps `v1`'s `replica_id` as its owner.
    fn resolve(&self, v1: &Versioned<T>, v2: &Versioned<T>) -> Result<Versioned<T>> {
        match v1.compare(v2) {
            ClockOrdering::Equal | ClockOrdering::Before => Ok(v2.clone()),
            ClockOrdering::After => Ok(v1.clone()),
            ClockOrdering::Concurrent => {
                let merged_value = (self.merge_fn)(&v1.value, &v2.value);
                // Union of both clocks so the merged version causally
                // dominates both inputs.
                let mut merged_clock = v1.clock.clone();
                merged_clock.merge(&v2.clock);
                Ok(Versioned {
                    value: merged_value,
                    clock: merged_clock,
                    replica_id: v1.replica_id.clone(),
                })
            }
        }
    }
}
/// CRDT-inspired merge for numeric values (takes the maximum).
///
/// Taking the max is commutative, associative, and idempotent, so replicas
/// converge regardless of merge order.
pub struct MaxMerge;

impl ConflictResolver<i64> for MaxMerge {
    /// Keeps the causally-later version; for concurrent versions, keeps the
    /// larger value and unions the clocks (owner stays `v1.replica_id`).
    fn resolve(&self, v1: &Versioned<i64>, v2: &Versioned<i64>) -> Result<Versioned<i64>> {
        match v1.compare(v2) {
            ClockOrdering::Equal | ClockOrdering::Before => Ok(v2.clone()),
            ClockOrdering::After => Ok(v1.clone()),
            ClockOrdering::Concurrent => {
                let merged_value = v1.value.max(v2.value);
                let mut merged_clock = v1.clock.clone();
                merged_clock.merge(&v2.clock);
                Ok(Versioned {
                    value: merged_value,
                    clock: merged_clock,
                    replica_id: v1.replica_id.clone(),
                })
            }
        }
    }
}
/// CRDT-inspired merge for sets represented as `Vec` (takes the union).
pub struct SetUnion;

impl<T: Clone + Eq + std::hash::Hash> ConflictResolver<Vec<T>> for SetUnion {
    /// Keeps the causally-later version; for concurrent versions, produces
    /// the union: all of `v1`'s elements in their original order, followed
    /// by each element of `v2` not already present.
    fn resolve(&self, v1: &Versioned<Vec<T>>, v2: &Versioned<Vec<T>>) -> Result<Versioned<Vec<T>>> {
        match v1.compare(v2) {
            ClockOrdering::Equal | ClockOrdering::Before => Ok(v2.clone()),
            ClockOrdering::After => Ok(v1.clone()),
            ClockOrdering::Concurrent => {
                let mut merged_value = v1.value.clone();
                // Track membership in a HashSet (T is already Eq + Hash) so
                // the union is O(n + m) instead of the O(n * m) cost of a
                // `Vec::contains` scan per element.
                let mut seen: std::collections::HashSet<&T> = v1.value.iter().collect();
                for item in &v2.value {
                    // `insert` returns true only for unseen items, which
                    // also deduplicates repeats within `v2` itself — the
                    // same behavior the original `contains` check had.
                    if seen.insert(item) {
                        merged_value.push(item.clone());
                    }
                }
                let mut merged_clock = v1.clock.clone();
                merged_clock.merge(&v2.clock);
                Ok(Versioned {
                    value: merged_value,
                    clock: merged_clock,
                    replica_id: v1.replica_id.clone(),
                })
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Clock with more local events on the same replica is causally After.
    #[test]
    fn test_vector_clock() {
        let mut clock1 = VectorClock::new();
        clock1.increment("r1");
        clock1.increment("r1");
        let mut clock2 = VectorClock::new();
        clock2.increment("r1");
        assert_eq!(clock1.compare(&clock2), ClockOrdering::After);
        assert_eq!(clock2.compare(&clock1), ClockOrdering::Before);
    }
    // Events on disjoint replicas are concurrent (neither dominates).
    #[test]
    fn test_concurrent_clocks() {
        let mut clock1 = VectorClock::new();
        clock1.increment("r1");
        let mut clock2 = VectorClock::new();
        clock2.increment("r2");
        assert_eq!(clock1.compare(&clock2), ClockOrdering::Concurrent);
        assert!(clock1.is_concurrent(&clock2));
    }
    // Merge takes the component-wise maximum of both clocks.
    #[test]
    fn test_clock_merge() {
        let mut clock1 = VectorClock::new();
        clock1.increment("r1");
        clock1.increment("r1");
        let mut clock2 = VectorClock::new();
        clock2.increment("r2");
        clock2.increment("r2");
        clock2.increment("r2");
        clock1.merge(&clock2);
        assert_eq!(clock1.get("r1"), 2);
        assert_eq!(clock1.get("r2"), 3);
    }
    // Versioned::update replaces the value and bumps the owner's component.
    #[test]
    fn test_versioned() {
        let mut v1 = Versioned::new(100, "r1".to_string());
        v1.update(200);
        assert_eq!(v1.value, 200);
        assert_eq!(v1.clock.get("r1"), 2);
    }
    // Causally-later version (v2 has the higher r1 timestamp) wins.
    #[test]
    fn test_last_write_wins() {
        let v1 = Versioned::new(100, "r1".to_string());
        let mut v2 = Versioned::new(200, "r1".to_string());
        v2.clock.increment("r1");
        let resolver = LastWriteWins;
        let result = resolver.resolve(&v1, &v2).unwrap();
        assert_eq!(result.value, 200);
    }
    // Concurrent versions are combined by the user-supplied closure.
    #[test]
    fn test_merge_function() {
        let v1 = Versioned::new(100, "r1".to_string());
        let v2 = Versioned::new(200, "r2".to_string());
        let resolver = MergeFunction::new(|a, b| a + b);
        let result = resolver.resolve(&v1, &v2).unwrap();
        assert_eq!(result.value, 300);
    }
    // Concurrent numeric versions resolve to the maximum value.
    #[test]
    fn test_max_merge() {
        let v1 = Versioned::new(100, "r1".to_string());
        let v2 = Versioned::new(200, "r2".to_string());
        let resolver = MaxMerge;
        let result = resolver.resolve(&v1, &v2).unwrap();
        assert_eq!(result.value, 200);
    }
    // Union of {1,2,3} and {3,4,5} has 5 distinct elements.
    #[test]
    fn test_set_union() {
        let v1 = Versioned::new(vec![1, 2, 3], "r1".to_string());
        let v2 = Versioned::new(vec![3, 4, 5], "r2".to_string());
        let resolver = SetUnion;
        let result = resolver.resolve(&v1, &v2).unwrap();
        assert_eq!(result.value.len(), 5);
        assert!(result.value.contains(&1));
        assert!(result.value.contains(&4));
    }
}

View File

@@ -0,0 +1,443 @@
//! Automatic failover and high availability
//!
//! Provides failover management with health monitoring,
//! quorum-based decision making, and split-brain prevention.
use crate::{Replica, ReplicaRole, ReplicaSet, ReplicationError, Result};
use chrono::{DateTime, Utc};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::time::Duration;
use tokio::time::interval;
/// Health status of a replica as observed by the failover manager.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealthStatus {
    /// Replica is healthy; resets its failure count.
    Healthy,
    /// Replica is degraded but operational; does not count toward failover.
    Degraded,
    /// Replica is unhealthy; counts toward the failure threshold.
    Unhealthy,
    /// Replica is not responding at all; counts toward the failure threshold.
    Unresponsive,
}
/// Result of a single health probe against one replica.
#[derive(Debug, Clone)]
pub struct HealthCheck {
    /// ID of the probed replica.
    pub replica_id: String,
    /// Observed health status.
    pub status: HealthStatus,
    /// Response time in milliseconds (0 when the probe failed).
    pub response_time_ms: u64,
    /// Error message when the replica is unhealthy or unresponsive.
    pub error: Option<String>,
    /// Timestamp at which the check was recorded.
    pub timestamp: DateTime<Utc>,
}
impl HealthCheck {
/// Create a healthy check result
pub fn healthy(replica_id: String, response_time_ms: u64) -> Self {
Self {
replica_id,
status: HealthStatus::Healthy,
response_time_ms,
error: None,
timestamp: Utc::now(),
}
}
/// Create an unhealthy check result
pub fn unhealthy(replica_id: String, error: String) -> Self {
Self {
replica_id,
status: HealthStatus::Unhealthy,
response_time_ms: 0,
error: Some(error),
timestamp: Utc::now(),
}
}
/// Create an unresponsive check result
pub fn unresponsive(replica_id: String) -> Self {
Self {
replica_id,
status: HealthStatus::Unresponsive,
response_time_ms: 0,
error: Some("No response".to_string()),
timestamp: Utc::now(),
}
}
}
/// Failover policy configuration.
#[derive(Debug, Clone)]
pub struct FailoverPolicy {
    /// Enable automatic failover when the failure threshold is reached.
    pub auto_failover: bool,
    /// Interval between health-check sweeps over all replicas.
    pub health_check_interval: Duration,
    /// Per-replica timeout for a single health check.
    pub health_check_timeout: Duration,
    /// Number of consecutive failed checks before failover is triggered.
    pub failure_threshold: usize,
    /// Minimum quorum size required to proceed with a failover.
    pub min_quorum: usize,
    /// Enable split-brain prevention.
    /// NOTE(review): this flag is not read anywhere in this file — confirm
    /// it is consumed elsewhere before relying on it.
    pub prevent_split_brain: bool,
}
impl Default for FailoverPolicy {
    /// Conservative defaults: probe every 5s with a 2s timeout, and fail
    /// over automatically after 3 consecutive failures.
    fn default() -> Self {
        Self {
            auto_failover: true,
            health_check_interval: Duration::from_secs(5),
            health_check_timeout: Duration::from_secs(2),
            failure_threshold: 3,
            min_quorum: 2,
            prevent_split_brain: true,
        }
    }
}
/// Manages automatic failover and health monitoring.
///
/// All state is held behind `Arc<RwLock<...>>` so the background monitoring
/// task spawned by `start_monitoring` can share it with callers.
pub struct FailoverManager {
    /// The replica set being monitored.
    replica_set: Arc<RwLock<ReplicaSet>>,
    /// Failover policy; may be swapped at runtime via `set_policy`.
    policy: Arc<RwLock<FailoverPolicy>>,
    /// Rolling health-check history (trimmed to the most recent entries).
    health_history: Arc<RwLock<Vec<HealthCheck>>>,
    /// Consecutive-failure counts keyed by replica ID.
    failure_counts: Arc<RwLock<std::collections::HashMap<String, usize>>>,
    /// Guard preventing overlapping failover attempts.
    failover_in_progress: Arc<RwLock<bool>>,
}
impl FailoverManager {
/// Create a new failover manager
pub fn new(replica_set: Arc<RwLock<ReplicaSet>>) -> Self {
Self {
replica_set,
policy: Arc::new(RwLock::new(FailoverPolicy::default())),
health_history: Arc::new(RwLock::new(Vec::new())),
failure_counts: Arc::new(RwLock::new(std::collections::HashMap::new())),
failover_in_progress: Arc::new(RwLock::new(false)),
}
}
/// Create with custom policy
pub fn with_policy(replica_set: Arc<RwLock<ReplicaSet>>, policy: FailoverPolicy) -> Self {
Self {
replica_set,
policy: Arc::new(RwLock::new(policy)),
health_history: Arc::new(RwLock::new(Vec::new())),
failure_counts: Arc::new(RwLock::new(std::collections::HashMap::new())),
failover_in_progress: Arc::new(RwLock::new(false)),
}
}
/// Set the failover policy
pub fn set_policy(&self, policy: FailoverPolicy) {
*self.policy.write() = policy;
}
/// Get the current policy
pub fn policy(&self) -> FailoverPolicy {
self.policy.read().clone()
}
/// Start health monitoring
pub async fn start_monitoring(&self) {
let policy = self.policy.read().clone();
let replica_set = self.replica_set.clone();
let health_history = self.health_history.clone();
let failure_counts = self.failure_counts.clone();
let failover_in_progress = self.failover_in_progress.clone();
let manager_policy = self.policy.clone();
tokio::spawn(async move {
let mut interval_timer = interval(policy.health_check_interval);
loop {
interval_timer.tick().await;
let replica_ids = {
let set = replica_set.read();
set.replica_ids()
};
for replica_id in replica_ids {
let health = Self::check_replica_health(
&replica_set,
&replica_id,
policy.health_check_timeout,
)
.await;
// Record health check
health_history.write().push(health.clone());
// Update failure count and check if failover is needed
// Use a scope to ensure lock is dropped before any await
let should_failover = {
let mut counts = failure_counts.write();
let count = counts.entry(replica_id.clone()).or_insert(0);
match health.status {
HealthStatus::Healthy => {
*count = 0;
false
}
HealthStatus::Degraded => {
// Don't increment for degraded
false
}
HealthStatus::Unhealthy | HealthStatus::Unresponsive => {
*count += 1;
// Check if failover is needed
let current_policy = manager_policy.read();
*count >= current_policy.failure_threshold
&& current_policy.auto_failover
}
}
}; // Lock is dropped here
// Trigger failover if needed (after lock is dropped)
if should_failover {
if let Err(e) =
Self::trigger_failover(&replica_set, &failover_in_progress).await
{
tracing::error!("Failover failed: {}", e);
}
}
}
// Trim health history to last 1000 entries
let mut history = health_history.write();
let len = history.len();
if len > 1000 {
history.drain(0..len - 1000);
}
}
});
}
/// Check health of a specific replica
async fn check_replica_health(
replica_set: &Arc<RwLock<ReplicaSet>>,
replica_id: &str,
timeout: Duration,
) -> HealthCheck {
// In a real implementation, this would make a network call
// For now, we simulate health checks based on replica status
let replica = {
let set = replica_set.read();
set.get_replica(replica_id)
};
match replica {
Some(replica) => {
if replica.is_timed_out(timeout) {
HealthCheck::unresponsive(replica_id.to_string())
} else if replica.is_healthy() {
HealthCheck::healthy(replica_id.to_string(), 10)
} else {
HealthCheck::unhealthy(replica_id.to_string(), "Replica is lagging".to_string())
}
}
None => HealthCheck::unhealthy(replica_id.to_string(), "Replica not found".to_string()),
}
}
/// Trigger failover to a healthy secondary
async fn trigger_failover(
replica_set: &Arc<RwLock<ReplicaSet>>,
failover_in_progress: &Arc<RwLock<bool>>,
) -> Result<()> {
// Check if failover is already in progress
{
let mut in_progress = failover_in_progress.write();
if *in_progress {
return Ok(());
}
*in_progress = true;
}
tracing::warn!("Initiating failover");
// Find candidate within a scope to drop the lock before await
let candidate_id = {
let set = replica_set.read();
// Check quorum
if !set.has_quorum() {
*failover_in_progress.write() = false;
return Err(ReplicationError::QuorumNotMet {
needed: set.get_quorum_size(),
available: set.get_healthy_replicas().len(),
});
}
// Find best candidate for promotion
let candidate = Self::select_failover_candidate(&set)?;
candidate.id.clone()
}; // Lock is dropped here
// Promote the candidate (lock re-acquired inside promote_to_primary)
let result = {
let mut set = replica_set.write();
set.promote_to_primary(&candidate_id)
};
match &result {
Ok(()) => tracing::info!("Failover completed: promoted {} to primary", candidate_id),
Err(e) => tracing::error!("Failover failed: {}", e),
}
// Clear failover flag
*failover_in_progress.write() = false;
result
}
/// Select the best candidate for failover
fn select_failover_candidate(replica_set: &ReplicaSet) -> Result<Replica> {
let mut candidates: Vec<Replica> = replica_set
.get_healthy_replicas()
.into_iter()
.filter(|r| r.role == ReplicaRole::Secondary)
.collect();
if candidates.is_empty() {
return Err(ReplicationError::FailoverFailed(
"No healthy secondary replicas available".to_string(),
));
}
// Sort by priority (highest first), then by lowest lag
candidates.sort_by(|a, b| b.priority.cmp(&a.priority).then(a.lag_ms.cmp(&b.lag_ms)));
Ok(candidates[0].clone())
}
/// Manually trigger failover
pub async fn manual_failover(&self, target_replica_id: Option<String>) -> Result<()> {
let mut set = self.replica_set.write();
// Check quorum
if !set.has_quorum() {
return Err(ReplicationError::QuorumNotMet {
needed: set.get_quorum_size(),
available: set.get_healthy_replicas().len(),
});
}
let target = if let Some(id) = target_replica_id {
set.get_replica(&id)
.ok_or_else(|| ReplicationError::ReplicaNotFound(id))?
} else {
Self::select_failover_candidate(&set)?
};
set.promote_to_primary(&target.id)?;
tracing::info!(
"Manual failover completed: promoted {} to primary",
target.id
);
Ok(())
}
/// Get health check history
pub fn health_history(&self) -> Vec<HealthCheck> {
self.health_history.read().clone()
}
/// Get recent health status for a replica
pub fn recent_health(&self, replica_id: &str, limit: usize) -> Vec<HealthCheck> {
let history = self.health_history.read();
history
.iter()
.rev()
.filter(|h| h.replica_id == replica_id)
.take(limit)
.cloned()
.collect()
}
/// Check if failover is currently in progress
pub fn is_failover_in_progress(&self) -> bool {
*self.failover_in_progress.read()
}
/// Get failure count for a replica
pub fn failure_count(&self, replica_id: &str) -> usize {
self.failure_counts
.read()
.get(replica_id)
.copied()
.unwrap_or(0)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    // Constructor helpers set the matching status and error fields.
    #[test]
    fn test_health_check() {
        let check = HealthCheck::healthy("r1".to_string(), 15);
        assert_eq!(check.status, HealthStatus::Healthy);
        assert_eq!(check.response_time_ms, 15);
        let check = HealthCheck::unhealthy("r2".to_string(), "Error".to_string());
        assert_eq!(check.status, HealthStatus::Unhealthy);
        assert!(check.error.is_some());
    }
    // Default policy enables auto-failover after 3 consecutive failures.
    #[test]
    fn test_failover_policy() {
        let policy = FailoverPolicy::default();
        assert!(policy.auto_failover);
        assert_eq!(policy.failure_threshold, 3);
    }
    // A fresh manager starts with no failover in progress.
    #[test]
    fn test_failover_manager() {
        let mut replica_set = ReplicaSet::new("cluster-1");
        replica_set
            .add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        replica_set
            .add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        let manager = FailoverManager::new(Arc::new(RwLock::new(replica_set)));
        assert!(!manager.is_failover_in_progress());
    }
    // Candidate selection returns a healthy secondary, never the primary.
    #[test]
    fn test_candidate_selection() {
        let mut replica_set = ReplicaSet::new("cluster-1");
        replica_set
            .add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        replica_set
            .add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        replica_set
            .add_replica("r3", "127.0.0.1:9003", ReplicaRole::Secondary)
            .unwrap();
        let candidate = FailoverManager::select_failover_candidate(&replica_set).unwrap();
        assert!(candidate.role == ReplicaRole::Secondary);
        assert!(candidate.is_healthy());
    }
}

View File

@@ -0,0 +1,104 @@
//! Data replication and synchronization for ruvector
//!
//! This crate provides comprehensive replication capabilities including:
//! - Multi-node replica management
//! - Synchronous, asynchronous, and semi-synchronous replication modes
//! - Conflict resolution with vector clocks and CRDTs
//! - Change data capture and streaming
//! - Automatic failover and split-brain prevention
//!
//! # Examples
//!
//! ```no_run
//! use ruvector_replication::{ReplicaSet, ReplicaRole, SyncMode, SyncManager, ReplicationLog};
//! use std::sync::Arc;
//!
//! fn example() -> Result<(), Box<dyn std::error::Error>> {
//! // Create a replica set
//! let mut replica_set = ReplicaSet::new("cluster-1");
//!
//! // Add replicas
//! replica_set.add_replica("replica-1", "192.168.1.10:9001", ReplicaRole::Primary)?;
//! replica_set.add_replica("replica-2", "192.168.1.11:9001", ReplicaRole::Secondary)?;
//!
//! // Create sync manager and configure synchronization
//! let log = Arc::new(ReplicationLog::new("replica-1"));
//! let manager = SyncManager::new(Arc::new(replica_set), log);
//! manager.set_sync_mode(SyncMode::SemiSync { min_replicas: 1 });
//! Ok(())
//! }
//! ```
pub mod conflict;
pub mod failover;
pub mod replica;
pub mod stream;
pub mod sync;
pub use conflict::{ConflictResolver, LastWriteWins, MergeFunction, VectorClock};
pub use failover::{FailoverManager, FailoverPolicy, HealthStatus};
pub use replica::{Replica, ReplicaRole, ReplicaSet, ReplicaStatus};
pub use stream::{ChangeEvent, ChangeOperation, ReplicationStream};
pub use sync::{LogEntry, ReplicationLog, SyncManager, SyncMode};
use thiserror::Error;
/// Result type for replication operations.
pub type Result<T> = std::result::Result<T, ReplicationError>;
/// Errors that can occur during replication operations.
#[derive(Error, Debug)]
pub enum ReplicationError {
    /// A replica with the given ID is not part of the replica set.
    #[error("Replica not found: {0}")]
    ReplicaNotFound(String),
    /// No primary replica is available to accept writes.
    #[error("No primary replica available")]
    NoPrimary,
    /// A replication operation exceeded its deadline.
    #[error("Replication timeout: {0}")]
    Timeout(String),
    /// Synchronization between replicas failed.
    #[error("Synchronization failed: {0}")]
    SyncFailed(String),
    /// Conflicting versions could not be resolved.
    #[error("Conflict resolution failed: {0}")]
    ConflictResolution(String),
    /// A failover attempt did not complete.
    #[error("Failover failed: {0}")]
    FailoverFailed(String),
    /// A network-level error occurred while contacting a replica.
    #[error("Network error: {0}")]
    Network(String),
    /// Fewer healthy replicas were available than the required quorum.
    #[error("Quorum not met: needed {needed}, got {available}")]
    QuorumNotMet { needed: usize, available: usize },
    /// Two replicas both believe they are primary.
    #[error("Split-brain detected")]
    SplitBrain,
    /// A replica was found in a state the operation cannot handle.
    #[error("Invalid replica state: {0}")]
    InvalidState(String),
    /// Failed to encode a value with bincode.
    #[error("Serialization encode error: {0}")]
    SerializationEncode(#[from] bincode::error::EncodeError),
    /// Failed to decode a value with bincode.
    #[error("Serialization decode error: {0}")]
    SerializationDecode(#[from] bincode::error::DecodeError),
    /// An underlying I/O operation failed.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}
#[cfg(test)]
mod tests {
    use super::*;
    // Pins the thiserror-generated Display format for a structured variant.
    #[test]
    fn test_error_display() {
        let err = ReplicationError::QuorumNotMet {
            needed: 2,
            available: 1,
        };
        assert_eq!(err.to_string(), "Quorum not met: needed 2, got 1");
    }
}

View File

@@ -0,0 +1,378 @@
//! Replica management and coordination
//!
//! Provides structures and logic for managing distributed replicas,
//! including role management, health tracking, and promotion/demotion.
use crate::{ReplicationError, Result};
use chrono::{DateTime, Utc};
use dashmap::DashMap;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::time::Duration;
use uuid::Uuid;
/// Role of a replica in the replication topology.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ReplicaRole {
    /// Primary replica that handles writes.
    Primary,
    /// Secondary replica that replicates from the primary and may be
    /// promoted during failover.
    Secondary,
    /// Witness replica that participates in quorum without replicating data.
    Witness,
}
/// Current status of a replica
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ReplicaStatus {
    /// Replica is online and healthy
    Healthy,
    /// Replica is lagging behind (still readable, but not counted healthy)
    Lagging,
    /// Replica is offline or unreachable (neither readable nor writable)
    Offline,
    /// Replica is recovering; entered from `Offline` when a heartbeat arrives
    Recovering,
}
/// Represents a single replica in the replication topology
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Replica {
    /// Unique identifier for the replica
    pub id: String,
    /// Network address of the replica (opaque string; not validated here)
    pub address: String,
    /// Current role of the replica
    pub role: ReplicaRole,
    /// Current status of the replica
    pub status: ReplicaStatus,
    /// Replication lag in milliseconds
    pub lag_ms: u64,
    /// Last known position in the replication log
    pub log_position: u64,
    /// Last heartbeat timestamp (UTC)
    pub last_heartbeat: DateTime<Utc>,
    /// Priority for failover (higher is better; `new` defaults this to 100)
    pub priority: u32,
}
impl Replica {
    /// Replication lag (in milliseconds) at or above which a replica is
    /// considered lagging rather than healthy.
    ///
    /// Shared by [`Self::is_healthy`] and [`Self::update_lag`] so the two
    /// agree at the boundary. Previously `is_healthy` used `lag_ms < 5000`
    /// while `update_lag` used `lag_ms > 5000`, so a replica at exactly
    /// 5000 ms kept status `Healthy` while `is_healthy` reported `false`.
    pub const MAX_HEALTHY_LAG_MS: u64 = 5000;

    /// Create a new replica that starts out `Healthy` with zero lag and
    /// the default failover priority (100).
    pub fn new(id: impl Into<String>, address: impl Into<String>, role: ReplicaRole) -> Self {
        Self {
            id: id.into(),
            address: address.into(),
            role,
            status: ReplicaStatus::Healthy,
            lag_ms: 0,
            log_position: 0,
            last_heartbeat: Utc::now(),
            priority: 100,
        }
    }

    /// Check if the replica is healthy: status is `Healthy` and lag is
    /// below [`Self::MAX_HEALTHY_LAG_MS`].
    pub fn is_healthy(&self) -> bool {
        self.status == ReplicaStatus::Healthy && self.lag_ms < Self::MAX_HEALTHY_LAG_MS
    }

    /// Check if the replica is available for reads (healthy or merely lagging).
    pub fn is_readable(&self) -> bool {
        matches!(self.status, ReplicaStatus::Healthy | ReplicaStatus::Lagging)
    }

    /// Check if the replica is available for writes (healthy primary only).
    pub fn is_writable(&self) -> bool {
        self.role == ReplicaRole::Primary && self.status == ReplicaStatus::Healthy
    }

    /// Update the replica's lag and derive its status from the new value.
    ///
    /// Lag at or beyond the threshold marks the replica `Lagging`; falling
    /// back under the threshold restores `Healthy` only from `Lagging`
    /// (an `Offline`/`Recovering` replica is not revived by a lag update).
    pub fn update_lag(&mut self, lag_ms: u64) {
        self.lag_ms = lag_ms;
        if lag_ms >= Self::MAX_HEALTHY_LAG_MS {
            self.status = ReplicaStatus::Lagging;
        } else if self.status == ReplicaStatus::Lagging {
            self.status = ReplicaStatus::Healthy;
        }
    }

    /// Update the replica's last known replication-log position.
    pub fn update_position(&mut self, position: u64) {
        self.log_position = position;
    }

    /// Record a heartbeat at the current time.
    ///
    /// An `Offline` replica transitions to `Recovering`; it is not marked
    /// `Healthy` directly, since it still has to catch up.
    pub fn heartbeat(&mut self) {
        self.last_heartbeat = Utc::now();
        if self.status == ReplicaStatus::Offline {
            self.status = ReplicaStatus::Recovering;
        }
    }

    /// Check if the replica has missed heartbeats for longer than `timeout`.
    pub fn is_timed_out(&self, timeout: Duration) -> bool {
        // A negative signed duration (clock skew) fails `to_std`; treat it
        // as "maximally elapsed" so skewed clocks count as timed out.
        let elapsed = Utc::now()
            .signed_duration_since(self.last_heartbeat)
            .to_std()
            .unwrap_or(Duration::MAX);
        elapsed > timeout
    }
}
/// Manages a set of replicas
///
/// All fields use interior mutability (`DashMap` / `parking_lot::RwLock`)
/// behind `Arc`s, so read accessors take `&self`.
pub struct ReplicaSet {
    /// Cluster identifier
    cluster_id: String,
    /// Map of replica ID to replica
    replicas: Arc<DashMap<String, Replica>>,
    /// Current primary replica ID (`None` when no primary is registered)
    primary_id: Arc<RwLock<Option<String>>>,
    /// Minimum number of replicas for quorum (recomputed on add/remove)
    quorum_size: Arc<RwLock<usize>>,
}
impl ReplicaSet {
    /// Create a new, empty replica set with an initial quorum size of 1.
    pub fn new(cluster_id: impl Into<String>) -> Self {
        Self {
            cluster_id: cluster_id.into(),
            replicas: Arc::new(DashMap::new()),
            primary_id: Arc::new(RwLock::new(None)),
            quorum_size: Arc::new(RwLock::new(1)),
        }
    }
    /// Add a replica to the set.
    ///
    /// Fails with `InvalidState` when a primary is added while one already
    /// exists. Recomputes the quorum size afterwards.
    pub fn add_replica(
        &mut self,
        id: impl Into<String>,
        address: impl Into<String>,
        role: ReplicaRole,
    ) -> Result<()> {
        let id = id.into();
        let replica = Replica::new(id.clone(), address, role);
        if role == ReplicaRole::Primary {
            let mut primary = self.primary_id.write();
            if primary.is_some() {
                return Err(ReplicationError::InvalidState(
                    "Primary replica already exists".to_string(),
                ));
            }
            *primary = Some(id.clone());
        }
        self.replicas.insert(id, replica);
        self.update_quorum_size();
        Ok(())
    }
    /// Remove a replica from the set; clears the primary pointer when the
    /// removed replica was the primary.
    pub fn remove_replica(&mut self, id: &str) -> Result<()> {
        let (_, removed) = self
            .replicas
            .remove(id)
            .ok_or_else(|| ReplicationError::ReplicaNotFound(id.to_string()))?;
        if removed.role == ReplicaRole::Primary {
            *self.primary_id.write() = None;
        }
        self.update_quorum_size();
        Ok(())
    }
    /// Get a snapshot (clone) of a replica by ID.
    pub fn get_replica(&self, id: &str) -> Option<Replica> {
        self.replicas.get(id).map(|r| r.clone())
    }
    /// Get a snapshot of the current primary replica, if any.
    pub fn get_primary(&self) -> Option<Replica> {
        let primary_id = self.primary_id.read();
        primary_id
            .as_ref()
            .and_then(|id| self.replicas.get(id).map(|r| r.clone()))
    }
    /// Get snapshots of all secondary replicas.
    pub fn get_secondaries(&self) -> Vec<Replica> {
        self.replicas
            .iter()
            .filter(|r| r.role == ReplicaRole::Secondary)
            .map(|r| r.clone())
            .collect()
    }
    /// Get snapshots of all healthy replicas (any role).
    pub fn get_healthy_replicas(&self) -> Vec<Replica> {
        self.replicas
            .iter()
            .filter(|r| r.is_healthy())
            .map(|r| r.clone())
            .collect()
    }
    /// Promote a secondary to primary, demoting the current primary (if
    /// any) to secondary.
    ///
    /// No-op when the replica is already primary; fails for witnesses and
    /// unknown IDs.
    ///
    /// Fix: the previous implementation held a `DashMap` guard for the
    /// target replica across a second `get_mut` on the old primary.
    /// `DashMap` documents that holding any reference into the map while
    /// locking another entry may deadlock (same-shard lock re-entry), so
    /// every guard is now dropped before any further map access.
    pub fn promote_to_primary(&mut self, id: &str) -> Result<()> {
        // Validate the target inside a scope so the map guard is released
        // before we touch any other entry or lock.
        {
            let replica = self
                .replicas
                .get(id)
                .ok_or_else(|| ReplicationError::ReplicaNotFound(id.to_string()))?;
            if replica.role == ReplicaRole::Primary {
                return Ok(());
            }
            if replica.role == ReplicaRole::Witness {
                return Err(ReplicationError::InvalidState(
                    "Cannot promote witness to primary".to_string(),
                ));
            }
        }
        // Demote the current primary if one exists.
        let old_primary_id = self.primary_id.write().take();
        if let Some(old_id) = old_primary_id {
            if let Some(mut old_primary) = self.replicas.get_mut(&old_id) {
                old_primary.role = ReplicaRole::Secondary;
            }
        }
        // Promote the new primary; the guard is dropped before the
        // primary_id lock is taken again.
        if let Some(mut replica) = self.replicas.get_mut(id) {
            replica.role = ReplicaRole::Primary;
        }
        *self.primary_id.write() = Some(id.to_string());
        tracing::info!("Promoted replica {} to primary", id);
        Ok(())
    }
    /// Demote a primary to secondary; no-op when the replica is not primary.
    pub fn demote_to_secondary(&mut self, id: &str) -> Result<()> {
        // Scope the DashMap guard so it is dropped before primary_id is
        // locked (same deadlock-hygiene rule as promote_to_primary).
        {
            let mut replica = self
                .replicas
                .get_mut(id)
                .ok_or_else(|| ReplicationError::ReplicaNotFound(id.to_string()))?;
            if replica.role != ReplicaRole::Primary {
                return Ok(());
            }
            replica.role = ReplicaRole::Secondary;
        }
        *self.primary_id.write() = None;
        tracing::info!("Demoted replica {} to secondary", id);
        Ok(())
    }
    /// Check if quorum is available among healthy, non-witness replicas.
    pub fn has_quorum(&self) -> bool {
        let healthy_count = self
            .replicas
            .iter()
            .filter(|r| r.is_healthy() && r.role != ReplicaRole::Witness)
            .count();
        healthy_count >= *self.quorum_size.read()
    }
    /// Get the required quorum size.
    pub fn get_quorum_size(&self) -> usize {
        *self.quorum_size.read()
    }
    /// Set the quorum size manually.
    ///
    /// NOTE: the value is overwritten by the next `add_replica` /
    /// `remove_replica`, which recompute a majority quorum.
    pub fn set_quorum_size(&self, size: usize) {
        *self.quorum_size.write() = size;
    }
    /// Recompute quorum as a strict majority of non-witness replicas.
    fn update_quorum_size(&self) {
        let replica_count = self
            .replicas
            .iter()
            .filter(|r| r.role != ReplicaRole::Witness)
            .count();
        *self.quorum_size.write() = (replica_count / 2) + 1;
    }
    /// Get all replica IDs (unordered).
    pub fn replica_ids(&self) -> Vec<String> {
        self.replicas.iter().map(|r| r.id.clone()).collect()
    }
    /// Get replica count.
    pub fn replica_count(&self) -> usize {
        self.replicas.len()
    }
    /// Get the cluster ID.
    pub fn cluster_id(&self) -> &str {
        &self.cluster_id
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_replica_creation() {
        // A fresh primary starts Healthy with zero lag, so it is writable.
        let replica = Replica::new("r1", "127.0.0.1:9001", ReplicaRole::Primary);
        assert_eq!(replica.id, "r1");
        assert_eq!(replica.role, ReplicaRole::Primary);
        assert!(replica.is_healthy());
        assert!(replica.is_writable());
    }
    #[test]
    fn test_replica_set() {
        let mut set = ReplicaSet::new("cluster-1");
        set.add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        set.add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        assert_eq!(set.replica_count(), 2);
        assert!(set.get_primary().is_some());
        assert_eq!(set.get_secondaries().len(), 1);
    }
    #[test]
    fn test_promotion() {
        let mut set = ReplicaSet::new("cluster-1");
        set.add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        set.add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        // Promoting r2 must also demote the old primary r1.
        set.promote_to_primary("r2").unwrap();
        let primary = set.get_primary().unwrap();
        assert_eq!(primary.id, "r2");
        assert_eq!(primary.role, ReplicaRole::Primary);
    }
    #[test]
    fn test_quorum() {
        let mut set = ReplicaSet::new("cluster-1");
        set.add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        set.add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        set.add_replica("r3", "127.0.0.1:9003", ReplicaRole::Secondary)
            .unwrap();
        // Majority of 3 data-bearing replicas is 2; all are healthy.
        assert_eq!(set.get_quorum_size(), 2);
        assert!(set.has_quorum());
    }
}

View File

@@ -0,0 +1,403 @@
//! Change data capture and streaming for replication
//!
//! Provides mechanisms for streaming changes from the replication log
//! with support for checkpointing, resumption, and backpressure handling.
use crate::{LogEntry, ReplicationError, ReplicationLog, Result};
use chrono::{DateTime, Utc};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::mpsc;
use uuid::Uuid;
/// Type of change operation
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChangeOperation {
    /// Insert operation
    Insert,
    /// Update operation
    Update,
    /// Delete operation
    Delete,
    /// Bulk operation (batched changes of mixed kinds)
    Bulk,
}
/// A change event in the replication stream
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChangeEvent {
    /// Unique identifier for this event
    pub id: Uuid,
    /// Sequence number in the stream (mirrors the replication-log sequence)
    pub sequence: u64,
    /// Timestamp of the change (UTC)
    pub timestamp: DateTime<Utc>,
    /// Type of operation
    pub operation: ChangeOperation,
    /// Collection/table name
    pub collection: String,
    /// Document/vector ID affected
    pub document_id: String,
    /// Serialized data for the change
    pub data: Vec<u8>,
    /// Metadata for the change (`Value::Null` when none was attached)
    pub metadata: serde_json::Value,
}
impl ChangeEvent {
    /// Create a new change event
    ///
    /// Assigns a fresh random UUID, stamps the event with the current UTC
    /// time, and leaves `metadata` as JSON `null`.
    pub fn new(
        sequence: u64,
        operation: ChangeOperation,
        collection: String,
        document_id: String,
        data: Vec<u8>,
    ) -> Self {
        Self {
            id: Uuid::new_v4(),
            sequence,
            timestamp: Utc::now(),
            operation,
            collection,
            document_id,
            data,
            metadata: serde_json::Value::Null,
        }
    }
    /// Add metadata to the change event (builder-style; replaces any
    /// previously attached metadata)
    pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
        self.metadata = metadata;
        self
    }
    /// Convert from a log entry
    ///
    /// Reuses the entry's id, sequence and timestamp, clones its payload,
    /// and records the source replica and checksum in `metadata`.
    pub fn from_log_entry(
        entry: &LogEntry,
        operation: ChangeOperation,
        collection: String,
        document_id: String,
    ) -> Self {
        Self {
            id: entry.id,
            sequence: entry.sequence,
            timestamp: entry.timestamp,
            operation,
            collection,
            document_id,
            data: entry.data.clone(),
            metadata: serde_json::json!({
                "source_replica": entry.source_replica,
                "checksum": entry.checksum,
            }),
        }
    }
}
/// Checkpoint for resuming a replication stream
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Checkpoint {
    /// Last processed sequence number (resumption restarts after this)
    pub sequence: u64,
    /// Timestamp of the checkpoint (UTC, taken at construction)
    pub timestamp: DateTime<Utc>,
    /// Optional consumer group ID
    pub consumer_group: Option<String>,
    /// Consumer ID within the group
    pub consumer_id: String,
}
impl Checkpoint {
    /// Create a new checkpoint stamped with the current UTC time and no
    /// consumer group.
    pub fn new(sequence: u64, consumer_id: impl Into<String>) -> Self {
        Self {
            sequence,
            timestamp: Utc::now(),
            consumer_group: None,
            consumer_id: consumer_id.into(),
        }
    }
    /// Set the consumer group (builder-style)
    pub fn with_group(mut self, group: impl Into<String>) -> Self {
        self.consumer_group = Some(group.into());
        self
    }
}
/// Configuration for a replication stream
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Buffer size for the channel (batches buffered before backpressure)
    pub buffer_size: usize,
    /// Maximum number of events fetched and sent per batch
    pub batch_size: usize,
    /// Enable automatic checkpointing
    pub auto_checkpoint: bool,
    /// Checkpoint interval (number of delivered events between checkpoints)
    pub checkpoint_interval: usize,
}
impl Default for StreamConfig {
    /// Defaults: 1000-batch channel buffer, 100-event batches, automatic
    /// checkpointing every 100 events.
    fn default() -> Self {
        Self {
            buffer_size: 1000,
            batch_size: 100,
            auto_checkpoint: true,
            checkpoint_interval: 100,
        }
    }
}
/// Manages a replication stream
///
/// Wraps a [`ReplicationLog`] and delivers its entries as batched
/// [`ChangeEvent`]s over a tokio channel, tracking a resumable checkpoint.
pub struct ReplicationStream {
    /// The replication log
    log: Arc<ReplicationLog>,
    /// Stream configuration
    config: StreamConfig,
    /// Current checkpoint (`None` until one is set or auto-recorded)
    checkpoint: Arc<RwLock<Option<Checkpoint>>>,
    /// Consumer ID recorded into auto-created checkpoints
    consumer_id: String,
}
impl ReplicationStream {
    /// Create a new replication stream with the default [`StreamConfig`].
    pub fn new(log: Arc<ReplicationLog>, consumer_id: impl Into<String>) -> Self {
        Self {
            log,
            config: StreamConfig::default(),
            checkpoint: Arc::new(RwLock::new(None)),
            consumer_id: consumer_id.into(),
        }
    }
    /// Create with custom configuration
    pub fn with_config(
        log: Arc<ReplicationLog>,
        consumer_id: impl Into<String>,
        config: StreamConfig,
    ) -> Self {
        Self {
            log,
            config,
            checkpoint: Arc::new(RwLock::new(None)),
            consumer_id: consumer_id.into(),
        }
    }
    /// Start streaming from a given position
    ///
    /// Spawns a background task that reads batches of log entries strictly
    /// after `start_sequence`, converts them to [`ChangeEvent`]s, and sends
    /// each batch down the returned channel. When the log has no new
    /// entries the task sleeps 100 ms and polls again; the task exits only
    /// when the receiver is dropped.
    pub async fn stream_from(
        &self,
        start_sequence: u64,
    ) -> Result<mpsc::Receiver<Vec<ChangeEvent>>> {
        let (tx, rx) = mpsc::channel(self.config.buffer_size);
        // Clone everything the background task needs so it owns its state.
        let log = self.log.clone();
        let batch_size = self.config.batch_size;
        let checkpoint = self.checkpoint.clone();
        let auto_checkpoint = self.config.auto_checkpoint;
        let checkpoint_interval = self.config.checkpoint_interval;
        let consumer_id = self.consumer_id.clone();
        tokio::spawn(async move {
            let mut current_sequence = start_sequence;
            let mut events_since_checkpoint = 0;
            loop {
                // Get batch of entries (range is inclusive on both ends).
                let entries =
                    log.get_range(current_sequence + 1, current_sequence + batch_size as u64);
                if entries.is_empty() {
                    // No new entries, wait a bit
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                    continue;
                }
                // Convert to change events
                let mut events = Vec::new();
                for entry in &entries {
                    // In a real implementation, we would decode the operation type
                    // from the entry data. For now, we use a placeholder.
                    // NOTE(review): operation is hard-coded to Update and the
                    // document id is a fresh random UUID — downstream
                    // consumers cannot rely on either field yet.
                    let event = ChangeEvent::from_log_entry(
                        entry,
                        ChangeOperation::Update,
                        "default".to_string(),
                        Uuid::new_v4().to_string(),
                    );
                    events.push(event);
                }
                // Advance past the last entry actually read (gaps in the
                // log would otherwise stall progress).
                if let Some(last_entry) = entries.last() {
                    current_sequence = last_entry.sequence;
                }
                // Send batch
                if tx.send(events).await.is_err() {
                    // Receiver dropped, stop streaming
                    break;
                }
                events_since_checkpoint += entries.len();
                // Auto-checkpoint if enabled; progress is recorded only
                // after the batch was successfully delivered.
                if auto_checkpoint && events_since_checkpoint >= checkpoint_interval {
                    let cp = Checkpoint::new(current_sequence, consumer_id.clone());
                    *checkpoint.write() = Some(cp);
                    events_since_checkpoint = 0;
                }
            }
        });
        Ok(rx)
    }
    /// Resume streaming from the last checkpoint
    ///
    /// The checkpointed sequence counts as already processed: streaming
    /// restarts at `sequence + 1`. Without a checkpoint, the whole log is
    /// replayed from the start.
    pub async fn resume(&self) -> Result<mpsc::Receiver<Vec<ChangeEvent>>> {
        let checkpoint = self.checkpoint.read();
        let start_sequence = checkpoint.as_ref().map(|cp| cp.sequence).unwrap_or(0);
        // Release the read lock before awaiting.
        drop(checkpoint);
        self.stream_from(start_sequence).await
    }
    /// Get the current checkpoint
    pub fn get_checkpoint(&self) -> Option<Checkpoint> {
        self.checkpoint.read().clone()
    }
    /// Set a checkpoint manually
    pub fn set_checkpoint(&self, checkpoint: Checkpoint) {
        *self.checkpoint.write() = Some(checkpoint);
    }
    /// Clear the checkpoint
    pub fn clear_checkpoint(&self) {
        *self.checkpoint.write() = None;
    }
}
/// Manager for multiple replication streams (consumer groups)
pub struct StreamManager {
    /// The replication log shared by every stream this manager creates
    log: Arc<ReplicationLog>,
    /// Active streams in creation order
    /// (NOTE: a flat list, not keyed by consumer ID — duplicate consumer
    /// IDs are not prevented, and streams are never removed)
    streams: Arc<RwLock<Vec<Arc<ReplicationStream>>>>,
}
impl StreamManager {
/// Create a new stream manager
pub fn new(log: Arc<ReplicationLog>) -> Self {
Self {
log,
streams: Arc::new(RwLock::new(Vec::new())),
}
}
/// Create a new stream for a consumer
pub fn create_stream(&self, consumer_id: impl Into<String>) -> Arc<ReplicationStream> {
let stream = Arc::new(ReplicationStream::new(self.log.clone(), consumer_id));
self.streams.write().push(stream.clone());
stream
}
/// Create a stream with custom configuration
pub fn create_stream_with_config(
&self,
consumer_id: impl Into<String>,
config: StreamConfig,
) -> Arc<ReplicationStream> {
let stream = Arc::new(ReplicationStream::with_config(
self.log.clone(),
consumer_id,
config,
));
self.streams.write().push(stream.clone());
stream
}
/// Get all active streams
pub fn active_streams(&self) -> Vec<Arc<ReplicationStream>> {
self.streams.read().clone()
}
/// Get the number of active streams
pub fn stream_count(&self) -> usize {
self.streams.read().len()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_change_event_creation() {
        let event = ChangeEvent::new(
            1,
            ChangeOperation::Insert,
            "vectors".to_string(),
            "doc-1".to_string(),
            b"data".to_vec(),
        );
        assert_eq!(event.sequence, 1);
        assert_eq!(event.operation, ChangeOperation::Insert);
        assert_eq!(event.collection, "vectors");
    }
    #[test]
    fn test_checkpoint() {
        let cp = Checkpoint::new(100, "consumer-1").with_group("group-1");
        assert_eq!(cp.sequence, 100);
        assert_eq!(cp.consumer_id, "consumer-1");
        assert_eq!(cp.consumer_group, Some("group-1".to_string()));
    }
    #[tokio::test]
    async fn test_replication_stream() {
        let log = Arc::new(ReplicationLog::new("replica-1"));
        // Add some entries
        log.append(b"data1".to_vec());
        log.append(b"data2".to_vec());
        log.append(b"data3".to_vec());
        let stream = ReplicationStream::new(log.clone(), "consumer-1");
        let mut rx = stream.stream_from(0).await.unwrap();
        // The stream must deliver a non-empty first batch. The previous
        // `if let Some(events)` form passed vacuously when nothing arrived.
        let events = rx.recv().await.expect("expected a batch of change events");
        assert!(!events.is_empty());
    }
    #[test]
    fn test_stream_manager() {
        let log = Arc::new(ReplicationLog::new("replica-1"));
        let manager = StreamManager::new(log);
        // Underscore bindings: the handles exist only for their side effect
        // of registering with the manager (avoids unused-variable warnings).
        let _stream1 = manager.create_stream("consumer-1");
        let _stream2 = manager.create_stream("consumer-2");
        assert_eq!(manager.stream_count(), 2);
    }
    #[test]
    fn test_stream_config() {
        let config = StreamConfig {
            buffer_size: 2000,
            batch_size: 50,
            auto_checkpoint: false,
            checkpoint_interval: 200,
        };
        assert_eq!(config.buffer_size, 2000);
        assert_eq!(config.batch_size, 50);
        assert!(!config.auto_checkpoint);
    }
}

View File

@@ -0,0 +1,374 @@
//! Synchronization modes and replication log management
//!
//! Provides different replication modes (sync, async, semi-sync)
//! and manages the replication log for tracking changes.
use crate::{ReplicaSet, ReplicationError, Result};
use chrono::{DateTime, Utc};
use dashmap::DashMap;
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use std::time::Duration;
use tokio::time::timeout;
use uuid::Uuid;
/// Synchronization mode for replication
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SyncMode {
    /// Wait for all replicas to acknowledge
    Sync,
    /// Don't wait for replicas (fan-out happens in the background)
    Async,
    /// Wait for a minimum number of replicas
    /// (`min_replicas` = acknowledgments required before the write returns)
    SemiSync { min_replicas: usize },
}
/// Entry in the replication log
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LogEntry {
    /// Unique identifier for this entry
    pub id: Uuid,
    /// Sequence number in the log (assigned by `ReplicationLog::append`)
    pub sequence: u64,
    /// Timestamp when the entry was created (UTC)
    pub timestamp: DateTime<Utc>,
    /// The operation data (serialized)
    pub data: Vec<u8>,
    /// Checksum for data integrity (computed over `data` at creation)
    pub checksum: u64,
    /// ID of the replica that originated this entry
    pub source_replica: String,
}
impl LogEntry {
    /// Create a new log entry
    ///
    /// Assigns a random UUID, stamps the entry with the current UTC time,
    /// and computes an integrity checksum over `data`.
    pub fn new(sequence: u64, data: Vec<u8>, source_replica: String) -> Self {
        let checksum = Self::calculate_checksum(&data);
        Self {
            id: Uuid::new_v4(),
            sequence,
            timestamp: Utc::now(),
            data,
            checksum,
            source_replica,
        }
    }
    /// Calculate the checksum for data using 64-bit FNV-1a.
    ///
    /// Entries derive `Serialize` and are shipped between replicas, so the
    /// checksum must hash identically on every node. The previous
    /// implementation used `std::collections::hash_map::DefaultHasher`,
    /// whose algorithm is documented as NOT stable across Rust releases,
    /// so `verify()` could fail between nodes built with different
    /// toolchains. FNV-1a is a fixed, well-known 64-bit hash.
    fn calculate_checksum(data: &[u8]) -> u64 {
        const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
        const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
        data.iter().fold(FNV_OFFSET_BASIS, |hash, &byte| {
            (hash ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
        })
    }
    /// Verify data integrity
    ///
    /// Returns `true` when a fresh checksum of `data` matches the stored one.
    pub fn verify(&self) -> bool {
        Self::calculate_checksum(&self.data) == self.checksum
    }
}
/// Manages the replication log
///
/// An in-memory, append-only log of [`LogEntry`] values keyed by a
/// monotonically increasing sequence number (starting at 1).
pub struct ReplicationLog {
    /// Log entries indexed by sequence number
    entries: Arc<DashMap<u64, LogEntry>>,
    /// Current (highest assigned) sequence number; 0 means empty
    sequence: Arc<RwLock<u64>>,
    /// Replica ID recorded as the source of appended entries
    replica_id: String,
}
impl ReplicationLog {
/// Create a new replication log
pub fn new(replica_id: impl Into<String>) -> Self {
Self {
entries: Arc::new(DashMap::new()),
sequence: Arc::new(RwLock::new(0)),
replica_id: replica_id.into(),
}
}
/// Append an entry to the log
pub fn append(&self, data: Vec<u8>) -> LogEntry {
let mut seq = self.sequence.write();
*seq += 1;
let entry = LogEntry::new(*seq, data, self.replica_id.clone());
self.entries.insert(*seq, entry.clone());
entry
}
/// Get an entry by sequence number
pub fn get(&self, sequence: u64) -> Option<LogEntry> {
self.entries.get(&sequence).map(|e| e.clone())
}
/// Get entries in a range
pub fn get_range(&self, start: u64, end: u64) -> Vec<LogEntry> {
let mut entries = Vec::new();
for seq in start..=end {
if let Some(entry) = self.entries.get(&seq) {
entries.push(entry.clone());
}
}
entries
}
/// Get the current sequence number
pub fn current_sequence(&self) -> u64 {
*self.sequence.read()
}
/// Get entries since a given sequence
pub fn get_since(&self, since: u64) -> Vec<LogEntry> {
let current = self.current_sequence();
self.get_range(since + 1, current)
}
/// Truncate log before a given sequence
pub fn truncate_before(&self, before: u64) {
self.entries.retain(|seq, _| *seq >= before);
}
/// Get log size
pub fn size(&self) -> usize {
self.entries.len()
}
}
/// Manages synchronization across replicas
pub struct SyncManager {
    /// The replica set whose secondaries receive replicated entries
    replica_set: Arc<ReplicaSet>,
    /// Replication log (local source of truth; appended before fan-out)
    log: Arc<ReplicationLog>,
    /// Synchronization mode (defaults to `Async` in `new`)
    sync_mode: Arc<RwLock<SyncMode>>,
    /// Timeout for synchronous operations (`Sync` and `SemiSync` modes)
    sync_timeout: Duration,
}
impl SyncManager {
    /// Create a new sync manager, defaulting to async replication with a
    /// 5-second timeout for the synchronous modes.
    pub fn new(replica_set: Arc<ReplicaSet>, log: Arc<ReplicationLog>) -> Self {
        Self {
            replica_set,
            log,
            sync_mode: Arc::new(RwLock::new(SyncMode::Async)),
            sync_timeout: Duration::from_secs(5),
        }
    }
    /// Set the synchronization mode
    pub fn set_sync_mode(&self, mode: SyncMode) {
        *self.sync_mode.write() = mode;
    }
    /// Get the current synchronization mode
    pub fn sync_mode(&self) -> SyncMode {
        *self.sync_mode.read()
    }
    /// Set the timeout applied to `Sync` and `SemiSync` replication.
    pub fn set_sync_timeout(&mut self, timeout: Duration) {
        self.sync_timeout = timeout;
    }
    /// Append `data` to the local log and replicate it according to the
    /// current [`SyncMode`], returning the appended entry.
    ///
    /// In `Async` mode the fan-out runs in a background task and failures
    /// are only logged, never surfaced to the caller.
    pub async fn replicate(&self, data: Vec<u8>) -> Result<LogEntry> {
        // Append locally first; replication ships this entry to secondaries.
        let entry = self.log.append(data);
        match self.sync_mode() {
            SyncMode::Sync => {
                self.replicate_sync(&entry).await?;
            }
            SyncMode::Async => {
                // Fire and forget
                let entry_clone = entry.clone();
                let replica_set = self.replica_set.clone();
                tokio::spawn(async move {
                    if let Err(e) = Self::send_to_replicas(&replica_set, &entry_clone).await {
                        tracing::error!("Async replication failed: {}", e);
                    }
                });
            }
            SyncMode::SemiSync { min_replicas } => {
                self.replicate_semi_sync(&entry, min_replicas).await?;
            }
        }
        Ok(entry)
    }
    /// Synchronous replication - wait for all replicas, bounded by
    /// `sync_timeout`.
    async fn replicate_sync(&self, entry: &LogEntry) -> Result<()> {
        timeout(
            self.sync_timeout,
            Self::send_to_replicas(&self.replica_set, entry),
        )
        .await
        .map_err(|_| ReplicationError::Timeout("Sync replication timed out".to_string()))?
    }
    /// Semi-synchronous replication - wait for acknowledgments from at
    /// least `min_replicas` secondaries, bounded by `sync_timeout`.
    ///
    /// Fails fast with `QuorumNotMet` when fewer secondaries exist than
    /// required. (Cleanup: the previous version cloned the entry and the
    /// replica set into locals it never used.)
    async fn replicate_semi_sync(&self, entry: &LogEntry, min_replicas: usize) -> Result<()> {
        let secondaries = self.replica_set.get_secondaries();
        if secondaries.len() < min_replicas {
            return Err(ReplicationError::QuorumNotMet {
                needed: min_replicas,
                available: secondaries.len(),
            });
        }
        tracing::debug!("Semi-sync replicating entry {}", entry.sequence);
        timeout(self.sync_timeout, async move {
            // Simulate sending to replicas and waiting for acknowledgments.
            // In a real implementation, this would use network calls; here
            // every known secondary is assumed to acknowledge.
            let acks = secondaries.len();
            if acks >= min_replicas {
                Ok(())
            } else {
                Err(ReplicationError::QuorumNotMet {
                    needed: min_replicas,
                    available: acks,
                })
            }
        })
        .await
        .map_err(|_| ReplicationError::Timeout("Semi-sync replication timed out".to_string()))?
    }
    /// Send a log entry to all healthy secondary replicas.
    async fn send_to_replicas(replica_set: &ReplicaSet, entry: &LogEntry) -> Result<()> {
        // In a real implementation, this would send over the network.
        // For now, we simulate successful replication.
        for replica in replica_set.get_secondaries() {
            if replica.is_healthy() {
                tracing::debug!("Replicating entry {} to {}", entry.sequence, replica.id);
            }
        }
        Ok(())
    }
    /// Return the log entries a lagging replica is missing, i.e. everything
    /// strictly after `from_sequence`.
    pub async fn catchup(&self, replica_id: &str, from_sequence: u64) -> Result<Vec<LogEntry>> {
        // Verify the replica exists before shipping entries.
        if self.replica_set.get_replica(replica_id).is_none() {
            return Err(ReplicationError::ReplicaNotFound(replica_id.to_string()));
        }
        let current_sequence = self.log.current_sequence();
        if from_sequence >= current_sequence {
            // Already caught up.
            return Ok(Vec::new());
        }
        let entries = self.log.get_since(from_sequence);
        tracing::info!(
            "Catching up replica {} with {} entries (from {} to {})",
            replica_id,
            entries.len(),
            from_sequence + 1,
            current_sequence
        );
        Ok(entries)
    }
    /// Get the current log position
    pub fn current_position(&self) -> u64 {
        self.log.current_sequence()
    }
    /// Verify the checksum of the log entry at `sequence`.
    pub fn verify_entry(&self, sequence: u64) -> Result<bool> {
        let entry = self
            .log
            .get(sequence)
            .ok_or_else(|| ReplicationError::InvalidState("Log entry not found".to_string()))?;
        Ok(entry.verify())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ReplicaRole;
    #[test]
    fn test_log_entry_creation() {
        // The checksum computed at construction must verify against the data.
        let data = b"test data".to_vec();
        let entry = LogEntry::new(1, data, "replica-1".to_string());
        assert_eq!(entry.sequence, 1);
        assert!(entry.verify());
    }
    #[test]
    fn test_replication_log() {
        // Sequence numbers start at 1 and increase monotonically.
        let log = ReplicationLog::new("replica-1");
        let entry1 = log.append(b"data1".to_vec());
        let entry2 = log.append(b"data2".to_vec());
        assert_eq!(entry1.sequence, 1);
        assert_eq!(entry2.sequence, 2);
        assert_eq!(log.current_sequence(), 2);
        let entries = log.get_range(1, 2);
        assert_eq!(entries.len(), 2);
    }
    #[tokio::test]
    async fn test_sync_manager() {
        let mut replica_set = ReplicaSet::new("cluster-1");
        replica_set
            .add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        replica_set
            .add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        let log = Arc::new(ReplicationLog::new("r1"));
        let manager = SyncManager::new(Arc::new(replica_set), log);
        // Async mode returns as soon as the local append succeeds.
        manager.set_sync_mode(SyncMode::Async);
        let entry = manager.replicate(b"test".to_vec()).await.unwrap();
        assert_eq!(entry.sequence, 1);
    }
    #[tokio::test]
    async fn test_catchup() {
        let mut replica_set = ReplicaSet::new("cluster-1");
        replica_set
            .add_replica("r1", "127.0.0.1:9001", ReplicaRole::Primary)
            .unwrap();
        replica_set
            .add_replica("r2", "127.0.0.1:9002", ReplicaRole::Secondary)
            .unwrap();
        let log = Arc::new(ReplicationLog::new("r1"));
        let manager = SyncManager::new(Arc::new(replica_set), log.clone());
        // Add some entries
        log.append(b"data1".to_vec());
        log.append(b"data2".to_vec());
        log.append(b"data3".to_vec());
        // Catchup from position 1 is exclusive of position 1 itself.
        let entries = manager.catchup("r2", 1).await.unwrap();
        assert_eq!(entries.len(), 2); // Entries 2 and 3
    }
}