Files
wifi-densepose/crates/ruvector-replication/src/stream.rs
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

404 lines
11 KiB
Rust

//! Change data capture and streaming for replication
//!
//! Provides mechanisms for streaming changes from the replication log
//! with support for checkpointing, resumption, and backpressure handling.
use crate::{LogEntry, ReplicationError, ReplicationLog, Result};
use chrono::{DateTime, Utc};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::mpsc;
use uuid::Uuid;
/// Type of change operation
///
/// Stored in the `operation` field of every [`ChangeEvent`]. The type is
/// `Copy` and comparable, and it is serde-serializable via the derives below.
// NOTE(review): the streaming loop in `ReplicationStream::stream_from`
// currently tags every event as `Update` (placeholder); the other variants
// are only produced by callers that build events themselves.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChangeOperation {
/// Insert operation
Insert,
/// Update operation
Update,
/// Delete operation
Delete,
/// Bulk operation
// NOTE(review): presumably a batch of several changes carried in one
// event -- the payload layout is not defined in this file; confirm.
Bulk,
}
/// A change event in the replication stream
///
/// Events are created either from scratch with [`ChangeEvent::new`] or by
/// converting a replication log entry with [`ChangeEvent::from_log_entry`].
/// The struct is cloneable and serde-serializable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChangeEvent {
/// Unique identifier for this event
///
/// A random v4 UUID when built with `new`; the log entry's own id when
/// built with `from_log_entry`.
pub id: Uuid,
/// Sequence number in the stream
pub sequence: u64,
/// Timestamp of the change
///
/// `new` stamps the current UTC time; `from_log_entry` copies the entry's
/// timestamp.
pub timestamp: DateTime<Utc>,
/// Type of operation
pub operation: ChangeOperation,
/// Collection/table name
pub collection: String,
/// Document/vector ID affected
pub document_id: String,
/// Serialized data for the change
// NOTE(review): the encoding of these bytes is not defined in this file.
pub data: Vec<u8>,
/// Metadata for the change
///
/// `serde_json::Value::Null` unless set through `with_metadata`; events
/// converted from a log entry carry an object with the keys
/// `"source_replica"` and `"checksum"`.
pub metadata: serde_json::Value,
}
impl ChangeEvent {
/// Create a new change event
pub fn new(
sequence: u64,
operation: ChangeOperation,
collection: String,
document_id: String,
data: Vec<u8>,
) -> Self {
Self {
id: Uuid::new_v4(),
sequence,
timestamp: Utc::now(),
operation,
collection,
document_id,
data,
metadata: serde_json::Value::Null,
}
}
/// Add metadata to the change event
pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
self.metadata = metadata;
self
}
/// Convert from a log entry
pub fn from_log_entry(
entry: &LogEntry,
operation: ChangeOperation,
collection: String,
document_id: String,
) -> Self {
Self {
id: entry.id,
sequence: entry.sequence,
timestamp: entry.timestamp,
operation,
collection,
document_id,
data: entry.data.clone(),
metadata: serde_json::json!({
"source_replica": entry.source_replica,
"checksum": entry.checksum,
}),
}
}
}
/// Checkpoint for resuming a replication stream
///
/// `ReplicationStream::resume` reads the `sequence` stored here and restarts
/// streaming from it. The struct is cloneable and serde-serializable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Checkpoint {
/// Last processed sequence number
///
/// When resuming, streaming restarts with the entry *after* this sequence.
pub sequence: u64,
/// Timestamp of the checkpoint (set to the current UTC time by `new`)
pub timestamp: DateTime<Utc>,
/// Optional consumer group ID (`None` unless set via `with_group`)
pub consumer_group: Option<String>,
/// Consumer ID within the group
pub consumer_id: String,
}
impl Checkpoint {
/// Create a new checkpoint
pub fn new(sequence: u64, consumer_id: impl Into<String>) -> Self {
Self {
sequence,
timestamp: Utc::now(),
consumer_group: None,
consumer_id: consumer_id.into(),
}
}
/// Set the consumer group
pub fn with_group(mut self, group: impl Into<String>) -> Self {
self.consumer_group = Some(group.into());
self
}
}
/// Tunables for a single replication stream.
///
/// Use [`StreamConfig::default`] for the stock settings, or build the struct
/// literally to override individual values.
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Capacity of the channel that carries batches to the consumer
    pub buffer_size: usize,
    /// Upper bound on the number of sequence numbers requested per batch
    pub batch_size: usize,
    /// Whether the stream records checkpoints on its own
    pub auto_checkpoint: bool,
    /// Number of streamed events between two automatic checkpoints
    pub checkpoint_interval: usize,
}

impl Default for StreamConfig {
    /// Stock settings: a 1000-slot channel, batches of 100, and automatic
    /// checkpointing every 100 events.
    fn default() -> Self {
        const BUFFER_SIZE: usize = 1000;
        const BATCH_SIZE: usize = 100;
        const CHECKPOINT_INTERVAL: usize = 100;

        StreamConfig {
            buffer_size: BUFFER_SIZE,
            batch_size: BATCH_SIZE,
            auto_checkpoint: true,
            checkpoint_interval: CHECKPOINT_INTERVAL,
        }
    }
}
/// Manages a replication stream
///
/// Reads entries from a shared [`ReplicationLog`] and hands them to a
/// consumer as batches of [`ChangeEvent`]s over a tokio mpsc channel (see
/// `stream_from`), while tracking the consumer's latest [`Checkpoint`].
pub struct ReplicationStream {
/// The replication log
log: Arc<ReplicationLog>,
/// Stream configuration
config: StreamConfig,
/// Current checkpoint
///
/// Behind `Arc<RwLock<..>>` because the background task spawned by
/// `stream_from` holds a clone of the `Arc` and writes automatic
/// checkpoints through it. `None` until a checkpoint is set.
checkpoint: Arc<RwLock<Option<Checkpoint>>>,
/// Consumer ID
///
/// Stamped onto every automatically created checkpoint.
consumer_id: String,
}
impl ReplicationStream {
/// Create a new replication stream
pub fn new(log: Arc<ReplicationLog>, consumer_id: impl Into<String>) -> Self {
Self {
log,
config: StreamConfig::default(),
checkpoint: Arc::new(RwLock::new(None)),
consumer_id: consumer_id.into(),
}
}
/// Create with custom configuration
pub fn with_config(
log: Arc<ReplicationLog>,
consumer_id: impl Into<String>,
config: StreamConfig,
) -> Self {
Self {
log,
config,
checkpoint: Arc::new(RwLock::new(None)),
consumer_id: consumer_id.into(),
}
}
/// Start streaming from a given position
pub async fn stream_from(
&self,
start_sequence: u64,
) -> Result<mpsc::Receiver<Vec<ChangeEvent>>> {
let (tx, rx) = mpsc::channel(self.config.buffer_size);
let log = self.log.clone();
let batch_size = self.config.batch_size;
let checkpoint = self.checkpoint.clone();
let auto_checkpoint = self.config.auto_checkpoint;
let checkpoint_interval = self.config.checkpoint_interval;
let consumer_id = self.consumer_id.clone();
tokio::spawn(async move {
let mut current_sequence = start_sequence;
let mut events_since_checkpoint = 0;
loop {
// Get batch of entries
let entries =
log.get_range(current_sequence + 1, current_sequence + batch_size as u64);
if entries.is_empty() {
// No new entries, wait a bit
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
continue;
}
// Convert to change events
let mut events = Vec::new();
for entry in &entries {
// In a real implementation, we would decode the operation type
// from the entry data. For now, we use a placeholder.
let event = ChangeEvent::from_log_entry(
entry,
ChangeOperation::Update,
"default".to_string(),
Uuid::new_v4().to_string(),
);
events.push(event);
}
// Update current sequence
if let Some(last_entry) = entries.last() {
current_sequence = last_entry.sequence;
}
// Send batch
if tx.send(events).await.is_err() {
// Receiver dropped, stop streaming
break;
}
events_since_checkpoint += entries.len();
// Auto-checkpoint if enabled
if auto_checkpoint && events_since_checkpoint >= checkpoint_interval {
let cp = Checkpoint::new(current_sequence, consumer_id.clone());
*checkpoint.write() = Some(cp);
events_since_checkpoint = 0;
}
}
});
Ok(rx)
}
/// Resume streaming from the last checkpoint
pub async fn resume(&self) -> Result<mpsc::Receiver<Vec<ChangeEvent>>> {
let checkpoint = self.checkpoint.read();
let start_sequence = checkpoint.as_ref().map(|cp| cp.sequence).unwrap_or(0);
drop(checkpoint);
self.stream_from(start_sequence).await
}
/// Get the current checkpoint
pub fn get_checkpoint(&self) -> Option<Checkpoint> {
self.checkpoint.read().clone()
}
/// Set a checkpoint manually
pub fn set_checkpoint(&self, checkpoint: Checkpoint) {
*self.checkpoint.write() = Some(checkpoint);
}
/// Clear the checkpoint
pub fn clear_checkpoint(&self) {
*self.checkpoint.write() = None;
}
}
/// Manager for multiple replication streams (consumer groups)
///
/// Hands out [`ReplicationStream`]s that all read from the same log and
/// keeps a handle to every stream it has created.
pub struct StreamManager {
/// The replication log
log: Arc<ReplicationLog>,
/// Streams created through this manager, in creation order
///
/// This is a plain list, not a map keyed by consumer ID: nothing in this
/// file prevents two streams from sharing a consumer ID, and nothing
/// removes a stream once it has been added.
streams: Arc<RwLock<Vec<Arc<ReplicationStream>>>>,
}
impl StreamManager {
/// Create a new stream manager
pub fn new(log: Arc<ReplicationLog>) -> Self {
Self {
log,
streams: Arc::new(RwLock::new(Vec::new())),
}
}
/// Create a new stream for a consumer
pub fn create_stream(&self, consumer_id: impl Into<String>) -> Arc<ReplicationStream> {
let stream = Arc::new(ReplicationStream::new(self.log.clone(), consumer_id));
self.streams.write().push(stream.clone());
stream
}
/// Create a stream with custom configuration
pub fn create_stream_with_config(
&self,
consumer_id: impl Into<String>,
config: StreamConfig,
) -> Arc<ReplicationStream> {
let stream = Arc::new(ReplicationStream::with_config(
self.log.clone(),
consumer_id,
config,
));
self.streams.write().push(stream.clone());
stream
}
/// Get all active streams
pub fn active_streams(&self) -> Vec<Arc<ReplicationStream>> {
self.streams.read().clone()
}
/// Get the number of active streams
pub fn stream_count(&self) -> usize {
self.streams.read().len()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// `ChangeEvent::new` copies its arguments and starts with null metadata.
    #[test]
    fn test_change_event_creation() {
        let event = ChangeEvent::new(
            1,
            ChangeOperation::Insert,
            "vectors".to_string(),
            "doc-1".to_string(),
            b"data".to_vec(),
        );
        assert_eq!(event.sequence, 1);
        assert_eq!(event.operation, ChangeOperation::Insert);
        assert_eq!(event.collection, "vectors");
        assert_eq!(event.document_id, "doc-1");
        assert_eq!(event.data, b"data".to_vec());
        assert!(event.metadata.is_null());
    }

    /// The checkpoint builder keeps the sequence and consumer and adds the group.
    #[test]
    fn test_checkpoint() {
        let cp = Checkpoint::new(100, "consumer-1").with_group("group-1");
        assert_eq!(cp.sequence, 100);
        assert_eq!(cp.consumer_id, "consumer-1");
        assert_eq!(cp.consumer_group, Some("group-1".to_string()));
    }

    /// A stream over a non-empty log delivers a non-empty first batch.
    #[tokio::test]
    async fn test_replication_stream() {
        let log = Arc::new(ReplicationLog::new("replica-1"));
        // Add some entries
        log.append(b"data1".to_vec());
        log.append(b"data2".to_vec());
        log.append(b"data3".to_vec());
        let stream = ReplicationStream::new(log.clone(), "consumer-1");
        let mut rx = stream.stream_from(0).await.unwrap();

        // Fail loudly instead of passing silently on a closed channel or
        // hanging forever when no batch arrives.
        let events = tokio::time::timeout(Duration::from_secs(5), rx.recv())
            .await
            .expect("timed out waiting for the first batch")
            .expect("stream closed before delivering a batch");
        assert!(!events.is_empty());
    }

    /// Every created stream is registered and starts without a checkpoint.
    #[test]
    fn test_stream_manager() {
        let log = Arc::new(ReplicationLog::new("replica-1"));
        let manager = StreamManager::new(log);
        let stream1 = manager.create_stream("consumer-1");
        let stream2 = manager.create_stream("consumer-2");
        assert_eq!(manager.stream_count(), 2);
        assert_eq!(manager.active_streams().len(), 2);
        assert!(stream1.get_checkpoint().is_none());
        assert!(stream2.get_checkpoint().is_none());
    }

    /// A literally constructed config keeps every field as given.
    #[test]
    fn test_stream_config() {
        let config = StreamConfig {
            buffer_size: 2000,
            batch_size: 50,
            auto_checkpoint: false,
            checkpoint_interval: 200,
        };
        assert_eq!(config.buffer_size, 2000);
        assert_eq!(config.batch_size, 50);
        assert!(!config.auto_checkpoint);
        assert_eq!(config.checkpoint_interval, 200);
    }
}