//! # RuVector Data Discovery Framework
//!
//! Core traits and types for building dataset integrations with RuVector's
//! vector memory, graph structures, and dynamic minimum cut algorithms.
//!
//! ## Architecture
//!
//! The framework provides three core abstractions:
//!
//! 1. **DataIngester**: Streaming data ingestion with batched graph/vector updates
//! 2. **CoherenceEngine**: Real-time coherence signal computation using min-cut
//! 3. **DiscoveryEngine**: Pattern detection for emerging structures and anomalies
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use ruvector_data_framework::{
//!     DataIngester, CoherenceEngine, DiscoveryEngine,
//!     IngestionConfig, CoherenceConfig, DiscoveryConfig,
//! };
//!
//! // Configure the discovery pipeline
//! let ingester = DataIngester::new(ingestion_config);
//! let coherence = CoherenceEngine::new(coherence_config);
//! let discovery = DiscoveryEngine::new(discovery_config);
//!
//! // Stream data and detect patterns
//! let stream = ingester.stream_from_source(source).await?;
//! let signals = coherence.compute_signals(stream).await?;
//! let patterns = discovery.detect_patterns(signals).await?;
//! ```

#![warn(missing_docs)]
#![warn(clippy::all)]

pub mod academic_clients;
pub mod api_clients;
pub mod arxiv_client;
pub mod biorxiv_client;
pub mod coherence;
pub mod crossref_client;
pub mod discovery;
pub mod dynamic_mincut;
pub mod economic_clients;
pub mod export;
pub mod finance_clients;
pub mod forecasting;
pub mod genomics_clients;
pub mod geospatial_clients;
pub mod government_clients;
pub mod hnsw;
pub mod cut_aware_hnsw;
pub mod ingester;
pub mod mcp_server;
pub mod medical_clients;
pub mod ml_clients;
pub mod news_clients;
pub mod optimized;
pub mod patent_clients;
pub mod persistence;
pub mod physics_clients;
pub mod realtime;
pub mod ruvector_native;
pub mod semantic_scholar;
pub mod space_clients;
pub mod streaming;
pub mod transportation_clients;
pub mod utils;
pub mod visualization;
pub mod wiki_clients;

use std::collections::HashMap;
use std::sync::Arc;

use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use thiserror::Error;

// Re-exports
pub use academic_clients::{CoreClient, EricClient, UnpaywallClient};
pub use api_clients::{EdgarClient, Embedder, NoaaClient, OpenAlexClient, SimpleEmbedder};
#[cfg(feature = "onnx-embeddings")]
pub use api_clients::OnnxEmbedder;
#[cfg(feature = "onnx-embeddings")]
pub use ruvector_onnx_embeddings::{PretrainedModel, EmbedderConfig, PoolingStrategy};
pub use arxiv_client::ArxivClient;
pub use biorxiv_client::{BiorxivClient, MedrxivClient};
pub use crossref_client::CrossRefClient;
pub use economic_clients::{AlphaVantageClient, FredClient, WorldBankClient};
pub use finance_clients::{BlsClient, CoinGeckoClient, EcbClient, FinnhubClient, TwelveDataClient};
pub use genomics_clients::{EnsemblClient, GwasClient, NcbiClient, UniProtClient};
pub use geospatial_clients::{GeonamesClient, NominatimClient, OpenElevationClient, OverpassClient};
pub use government_clients::{
    CensusClient, DataGovClient, EuOpenDataClient, UkGovClient, UNDataClient,
    WorldBankClient as WorldBankGovClient,
};
pub use medical_clients::{ClinicalTrialsClient, FdaClient, PubMedClient};
pub use ml_clients::{
    HuggingFaceClient, HuggingFaceDataset, HuggingFaceModel, OllamaClient, OllamaModel,
    PapersWithCodeClient, PaperWithCodeDataset, PaperWithCodePaper, ReplicateClient,
    ReplicateModel, TogetherAiClient, TogetherModel,
};
pub use news_clients::{GuardianClient, HackerNewsClient, NewsDataClient, RedditClient};
pub use patent_clients::{EpoClient, UsptoPatentClient};
pub use physics_clients::{
    ArgoClient, CernOpenDataClient, GeoUtils, MaterialsProjectClient, UsgsEarthquakeClient,
};
pub use semantic_scholar::SemanticScholarClient;
pub use space_clients::{AstronomyClient, ExoplanetClient, NasaClient, SpaceXClient};
pub use transportation_clients::{
    GtfsClient, MobilityDatabaseClient, OpenChargeMapClient, OpenRouteServiceClient,
};
pub use wiki_clients::{WikidataClient, WikidataEntity, WikipediaClient};

pub use coherence::{
    CoherenceBoundary, CoherenceConfig, CoherenceEngine, CoherenceEvent, CoherenceSignal,
};
pub use cut_aware_hnsw::{
    CutAwareHNSW, CutAwareConfig, CutAwareMetrics, CoherenceZone,
    SearchResult as CutAwareSearchResult, EdgeUpdate as CutAwareEdgeUpdate, UpdateKind,
    LayerCutStats,
};
pub use discovery::{
    DiscoveryConfig, DiscoveryEngine, DiscoveryPattern, PatternCategory, PatternStrength,
};
pub use dynamic_mincut::{
    CutGatedSearch, CutWatcherConfig, DynamicCutWatcher, DynamicMinCutError,
    EdgeUpdate as DynamicEdgeUpdate, EdgeUpdateType, EulerTourTree, HNSWGraph, LocalCut,
    LocalMinCutProcedure, WatcherStats,
};
pub use export::{
    export_all, export_coherence_csv, export_dot, export_graphml, export_patterns_csv,
    export_patterns_with_evidence_csv, ExportFilter,
};
pub use forecasting::{CoherenceForecaster, CrossDomainForecaster, Forecast, Trend};
pub use ingester::{DataIngester, IngestionConfig, IngestionStats, SourceConfig};
pub use realtime::{FeedItem, FeedSource, NewsAggregator, NewsSource, RealTimeEngine};
pub use ruvector_native::{
    CoherenceHistoryEntry, CoherenceSnapshot, Domain, DiscoveredPattern, GraphExport,
    NativeDiscoveryEngine, NativeEngineConfig, SemanticVector,
};
pub use streaming::{StreamingConfig, StreamingEngine, StreamingEngineBuilder, StreamingMetrics};
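
// Note (illustrative, not from the crate docs): the `OnnxEmbedder` re-export above is
// only available when the crate is built with the `onnx-embeddings` feature, e.g. via
// a dependency line along these lines (crate name inferred from the docs above, and
// the version is a placeholder):
//
//     ruvector-data-framework = { version = "*", features = ["onnx-embeddings"] }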

/// Framework error types
#[derive(Error, Debug)]
pub enum FrameworkError {
    /// Data ingestion failed
    #[error("Ingestion error: {0}")]
    Ingestion(String),
    /// Coherence computation failed
    #[error("Coherence error: {0}")]
    Coherence(String),
    /// Discovery algorithm failed
    #[error("Discovery error: {0}")]
    Discovery(String),
    /// Network/API error
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),
    /// Serialization error
    #[error("Serialization error: {0}")]
    Serialization(#[from] serde_json::Error),
    /// Graph operation failed
    #[error("Graph error: {0}")]
    Graph(String),
    /// Configuration error
    #[error("Config error: {0}")]
    Config(String),
}

/// Result type for framework operations
pub type Result<T> = std::result::Result<T, FrameworkError>;

/// A timestamped data record from any source
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataRecord {
    /// Unique identifier
    pub id: String,
    /// Source dataset (e.g., "openalex", "noaa", "edgar")
    pub source: String,
    /// Record type within source (e.g., "work", "author", "filing")
    pub record_type: String,
    /// Timestamp when data was observed/published
    pub timestamp: DateTime<Utc>,
    /// Raw data payload
    pub data: serde_json::Value,
    /// Pre-computed embedding vector (optional)
    pub embedding: Option<Vec<f32>>,
    /// Relationships to other records
    pub relationships: Vec<Relationship>,
}

/// A relationship between two records
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Relationship {
    /// Target record ID
    pub target_id: String,
    /// Relationship type (e.g., "cites", "authored_by", "filed_by")
    pub rel_type: String,
    /// Relationship weight/strength
    pub weight: f64,
    /// Additional properties
    pub properties: HashMap<String, serde_json::Value>,
}
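
// Illustrative sketch: a citation edge from one `DataRecord` to another could be
// expressed as follows (the target ID is a made-up example identifier):
//
//     Relationship {
//         target_id: "openalex:W123".to_string(),
//         rel_type: "cites".to_string(),
//         weight: 1.0,
//         properties: HashMap::new(),
//     }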

/// Trait for data sources that can be ingested
#[async_trait]
pub trait DataSource: Send + Sync {
    /// Source identifier
    fn source_id(&self) -> &str;

    /// Fetch a batch of records starting from cursor
    async fn fetch_batch(
        &self,
        cursor: Option<String>,
        batch_size: usize,
    ) -> Result<(Vec<DataRecord>, Option<String>)>;

    /// Get total record count (if known)
    async fn total_count(&self) -> Result<Option<u64>>;

    /// Check if source is available
    async fn health_check(&self) -> Result<bool>;
}

/// Trait for computing embeddings from records
#[async_trait]
pub trait EmbeddingProvider: Send + Sync {
    /// Compute embedding for a single record
    async fn embed_record(&self, record: &DataRecord) -> Result<Vec<f32>>;

    /// Compute embeddings for a batch of records
    async fn embed_batch(&self, records: &[DataRecord]) -> Result<Vec<Vec<f32>>>;

    /// Embedding dimension
    fn dimension(&self) -> usize;
}

/// Trait for graph building from records
pub trait GraphBuilder: Send + Sync {
    /// Add a node from a data record
    fn add_node(&mut self, record: &DataRecord) -> Result<u64>;

    /// Add an edge between nodes
    fn add_edge(&mut self, source: u64, target: u64, weight: f64, rel_type: &str) -> Result<()>;

    /// Get node count
    fn node_count(&self) -> usize;

    /// Get edge count
    fn edge_count(&self) -> usize;
}

/// Temporal window for time-series analysis
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TemporalWindow {
    /// Window start
    pub start: DateTime<Utc>,
    /// Window end
    pub end: DateTime<Utc>,
    /// Window identifier (for sliding windows)
    pub window_id: u64,
}

impl TemporalWindow {
    /// Create a new temporal window
    pub fn new(start: DateTime<Utc>, end: DateTime<Utc>, window_id: u64) -> Self {
        Self {
            start,
            end,
            window_id,
        }
    }

    /// Duration in seconds
    pub fn duration_secs(&self) -> i64 {
        (self.end - self.start).num_seconds()
    }

    /// Check if timestamp falls within window
    pub fn contains(&self, timestamp: DateTime<Utc>) -> bool {
        timestamp >= self.start && timestamp < self.end
    }
}

/// Statistics for a discovery session
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DiscoveryStats {
    /// Records processed
    pub records_processed: u64,
    /// Nodes in graph
    pub nodes_created: u64,
    /// Edges in graph
    pub edges_created: u64,
    /// Coherence signals computed
    pub signals_computed: u64,
    /// Patterns discovered
    pub patterns_discovered: u64,
    /// Processing duration in milliseconds
    pub duration_ms: u64,
    /// Peak memory usage in bytes
    pub peak_memory_bytes: u64,
}

/// Configuration for the entire discovery pipeline
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Ingestion configuration
    pub ingestion: IngestionConfig,
    /// Coherence engine configuration
    pub coherence: CoherenceConfig,
    /// Discovery engine configuration
    pub discovery: DiscoveryConfig,
    /// Enable parallel processing
    pub parallel: bool,
    /// Checkpoint interval (records)
    pub checkpoint_interval: u64,
    /// Output directory for results
    pub output_dir: String,
}

impl Default for PipelineConfig {
    fn default() -> Self {
        Self {
            ingestion: IngestionConfig::default(),
            coherence: CoherenceConfig::default(),
            discovery: DiscoveryConfig::default(),
            parallel: true,
            checkpoint_interval: 10_000,
            output_dir: "./discovery_output".to_string(),
        }
    }
}
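
// Illustrative sketch: callers can start from the defaults and override individual
// fields with struct update syntax (the values below are arbitrary examples):
//
//     let config = PipelineConfig {
//         checkpoint_interval: 50_000,
//         output_dir: "./runs/example".to_string(),
//         ..PipelineConfig::default()
//     };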

/// Main discovery pipeline orchestrator
pub struct DiscoveryPipeline {
    config: PipelineConfig,
    ingester: DataIngester,
    coherence: CoherenceEngine,
    discovery: DiscoveryEngine,
    stats: Arc<std::sync::RwLock<DiscoveryStats>>,
}

impl DiscoveryPipeline {
    /// Create a new discovery pipeline
    pub fn new(config: PipelineConfig) -> Self {
        let ingester = DataIngester::new(config.ingestion.clone());
        let coherence = CoherenceEngine::new(config.coherence.clone());
        let discovery = DiscoveryEngine::new(config.discovery.clone());
        Self {
            config,
            ingester,
            coherence,
            discovery,
            stats: Arc::new(std::sync::RwLock::new(DiscoveryStats::default())),
        }
    }

    /// Run the discovery pipeline on a data source
    pub async fn run<S: DataSource>(&mut self, source: S) -> Result<Vec<DiscoveryPattern>> {
        let start_time = std::time::Instant::now();

        // Phase 1: Ingest data
        tracing::info!("Starting ingestion from source: {}", source.source_id());
        let records = self.ingester.ingest_all(&source).await?;
        {
            let mut stats = self.stats.write().unwrap();
            stats.records_processed = records.len() as u64;
        }

        // Phase 2: Build graph and compute coherence
        tracing::info!("Computing coherence signals over {} records", records.len());
        let signals = self.coherence.compute_from_records(&records)?;
        {
            let mut stats = self.stats.write().unwrap();
            stats.signals_computed = signals.len() as u64;
            stats.nodes_created = self.coherence.node_count() as u64;
            stats.edges_created = self.coherence.edge_count() as u64;
        }

        // Phase 3: Detect patterns
        tracing::info!("Detecting discovery patterns");
        let patterns = self.discovery.detect(&signals)?;
        {
            let mut stats = self.stats.write().unwrap();
            stats.patterns_discovered = patterns.len() as u64;
            stats.duration_ms = start_time.elapsed().as_millis() as u64;
        }

        tracing::info!(
            "Discovery complete: {} patterns found in {}ms",
            patterns.len(),
            start_time.elapsed().as_millis()
        );

        Ok(patterns)
    }

    /// Get current statistics
    pub fn stats(&self) -> DiscoveryStats {
        self.stats.read().unwrap().clone()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_temporal_window() {
        let start = Utc::now();
        let end = start + chrono::Duration::hours(1);
        let window = TemporalWindow::new(start, end, 1);

        assert_eq!(window.duration_secs(), 3600);
        assert!(window.contains(start + chrono::Duration::minutes(30)));
        assert!(!window.contains(start - chrono::Duration::minutes(1)));
        assert!(!window.contains(end + chrono::Duration::minutes(1)));
    }

    #[test]
    fn test_default_pipeline_config() {
        let config = PipelineConfig::default();
        assert!(config.parallel);
        assert_eq!(config.checkpoint_interval, 10_000);
    }

    #[test]
    fn test_data_record_serialization() {
        let record = DataRecord {
            id: "test-1".to_string(),
            source: "test".to_string(),
            record_type: "document".to_string(),
            timestamp: Utc::now(),
            data: serde_json::json!({"title": "Test"}),
            embedding: Some(vec![0.1, 0.2, 0.3]),
            relationships: vec![],
        };

        let json = serde_json::to_string(&record).unwrap();
        let parsed: DataRecord = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.id, record.id);
    }
}
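
// Illustrative sketch of implementing `DataSource` for a trivial in-memory source.
// `MemorySource` is hypothetical and not part of the crate's API; the offset-as-string
// cursor encoding is an arbitrary choice, and the test assumes `tokio` is available as
// the async test runtime.
#[cfg(test)]
mod data_source_example {
    use super::*;

    /// A toy source that serves pre-loaded records in fixed-size batches.
    struct MemorySource {
        records: Vec<DataRecord>,
    }

    #[async_trait]
    impl DataSource for MemorySource {
        fn source_id(&self) -> &str {
            "memory"
        }

        async fn fetch_batch(
            &self,
            cursor: Option<String>,
            batch_size: usize,
        ) -> Result<(Vec<DataRecord>, Option<String>)> {
            // The cursor is the numeric offset into `records`, encoded as a string.
            let start = cursor
                .as_deref()
                .map(str::parse::<usize>)
                .transpose()
                .map_err(|e| FrameworkError::Ingestion(e.to_string()))?
                .unwrap_or(0)
                .min(self.records.len());
            let end = (start + batch_size).min(self.records.len());
            let batch = self.records[start..end].to_vec();
            // A `None` cursor signals that the source is exhausted.
            let next = (end < self.records.len()).then(|| end.to_string());
            Ok((batch, next))
        }

        async fn total_count(&self) -> Result<Option<u64>> {
            Ok(Some(self.records.len() as u64))
        }

        async fn health_check(&self) -> Result<bool> {
            Ok(true)
        }
    }

    #[tokio::test]
    async fn paginates_in_order() {
        let source = MemorySource {
            records: (0..5)
                .map(|i| DataRecord {
                    id: format!("rec-{i}"),
                    source: "memory".to_string(),
                    record_type: "document".to_string(),
                    timestamp: Utc::now(),
                    data: serde_json::json!({ "index": i }),
                    embedding: None,
                    relationships: vec![],
                })
                .collect(),
        };

        let (first, cursor) = source.fetch_batch(None, 3).await.unwrap();
        assert_eq!(first.len(), 3);
        let (second, cursor) = source.fetch_batch(cursor, 3).await.unwrap();
        assert_eq!(second.len(), 2);
        assert!(cursor.is_none());
    }
}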