Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
[package]
|
||||
name = "sevensense-analysis"
|
||||
description = "Analysis bounded context for 7sense bioacoustics platform - clustering, motif detection, sequence analysis"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
license.workspace = true
|
||||
repository.workspace = true
|
||||
authors.workspace = true
|
||||
readme = "README.md"
|
||||
keywords = ["bioacoustics", "clustering", "motif-detection", "hdbscan", "markov"]
|
||||
categories = ["science", "algorithms"]
|
||||
|
||||
[dependencies]
|
||||
# Internal crates
|
||||
sevensense-core = { workspace = true, version = "0.1.0" }
|
||||
sevensense-vector = { workspace = true, version = "0.1.0" }
|
||||
|
||||
# Async runtime
|
||||
tokio = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
|
||||
# Numerical computing
|
||||
ndarray = { workspace = true }
|
||||
|
||||
# Graph algorithms
|
||||
petgraph = "0.6"
|
||||
|
||||
# Serialization
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
|
||||
# Error handling
|
||||
thiserror = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
|
||||
# Logging / tracing
|
||||
tracing = { workspace = true }
|
||||
|
||||
# Utilities
|
||||
uuid = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
ordered-float = "4.2"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { workspace = true, features = ["test-util", "macros", "rt-multi-thread"] }
|
||||
proptest = { workspace = true }
|
||||
test-case = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
parallel = []
|
||||
409
examples/vibecast-7sense/crates/sevensense-analysis/README.md
Normal file
409
examples/vibecast-7sense/crates/sevensense-analysis/README.md
Normal file
@@ -0,0 +1,409 @@
|
||||
# sevensense-analysis
|
||||
|
||||
[![Crates.io](https://img.shields.io/crates/v/sevensense-analysis.svg)](https://crates.io/crates/sevensense-analysis)
|
||||
[![Documentation](https://docs.rs/sevensense-analysis/badge.svg)](https://docs.rs/sevensense-analysis)
|
||||
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](../../LICENSE)
|
||||
|
||||
> Advanced acoustic analysis algorithms for bioacoustic pattern discovery.
|
||||
|
||||
**sevensense-analysis** provides sophisticated analysis tools for understanding bird vocalizations at scale. From clustering calls into groups, detecting recurring motifs, to modeling temporal patterns with Markov chains, it transforms raw embeddings into actionable ecological insights.
|
||||
|
||||
## Features
|
||||
|
||||
- **HDBSCAN Clustering**: Density-based clustering for call-type discovery
|
||||
- **Markov Models**: Temporal sequence analysis and prediction
|
||||
- **Motif Detection**: Find recurring vocal patterns
|
||||
- **Statistical Analysis**: Entropy, diversity indices, anomaly scores
|
||||
- **Temporal Patterns**: Diel rhythms, seasonal trends
|
||||
- **Multi-scale Analysis**: From milliseconds to months
|
||||
|
||||
## Use Cases
|
||||
|
||||
| Use Case | Description | Key Functions |
|
||||
|----------|-------------|---------------|
|
||||
| Call-Type Clustering | Group similar vocalizations | `hdbscan_cluster()` |
|
||||
| Sequence Analysis | Model call sequences | `MarkovChain::analyze()` |
|
||||
| Motif Discovery | Find repeated patterns | `detect_motifs()` |
|
||||
| Diversity Metrics | Shannon/Simpson indices | `diversity_index()` |
|
||||
| Periodicity | Detect rhythmic patterns | `detect_periodicity()` |
|
||||
| Anomaly Detection | Find unusual calls | `anomaly_score()` |
|
||||
|
||||
## Installation
|
||||
|
||||
Add to your `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
sevensense-analysis = "0.1"
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{HdbscanClusterer, HdbscanConfig};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Cluster embeddings by call type
|
||||
let config = HdbscanConfig {
|
||||
min_cluster_size: 5,
|
||||
min_samples: 3,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let clusterer = HdbscanClusterer::new(config);
|
||||
let labels = clusterer.fit(&embeddings)?;
|
||||
|
||||
// Count clusters (excluding noise = -1)
|
||||
let n_clusters = labels.iter().filter(|&&l| l >= 0).max().unwrap_or(&-1) + 1;
|
||||
println!("Found {} call types", n_clusters);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
<details>
|
||||
<summary><b>Tutorial: HDBSCAN Clustering</b></summary>
|
||||
|
||||
### Basic Clustering
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{HdbscanClusterer, HdbscanConfig};
|
||||
|
||||
let config = HdbscanConfig {
|
||||
min_cluster_size: 5, // Minimum points per cluster
|
||||
min_samples: 3, // Core point threshold
|
||||
epsilon: 0.0, // 0 = automatic selection
|
||||
metric: DistanceMetric::Euclidean,
|
||||
};
|
||||
|
||||
let clusterer = HdbscanClusterer::new(config);
|
||||
let result = clusterer.fit(&embeddings)?;
|
||||
|
||||
println!("Labels: {:?}", result.labels);
|
||||
println!("Probabilities: {:?}", result.probabilities);
|
||||
println!("Outlier scores: {:?}", result.outlier_scores);
|
||||
```
|
||||
|
||||
### Cluster Analysis
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{cluster_statistics, ClusterStats};
|
||||
|
||||
let stats = cluster_statistics(&embeddings, &labels)?;
|
||||
|
||||
for (cluster_id, stat) in stats.iter() {
|
||||
println!("Cluster {}:", cluster_id);
|
||||
println!(" Size: {}", stat.size);
|
||||
println!(" Centroid: {:?}", &stat.centroid[..5]); // First 5 dims
|
||||
println!(" Intra-cluster distance: {:.3}", stat.intra_distance);
|
||||
println!(" Silhouette score: {:.3}", stat.silhouette);
|
||||
}
|
||||
```
|
||||
|
||||
### Cluster Assignment for New Data
|
||||
|
||||
```rust
|
||||
// Assign new embeddings to existing clusters
|
||||
let new_embeddings = load_new_data()?;
|
||||
let assignments = clusterer.predict(&new_embeddings)?;
|
||||
|
||||
for (embedding, cluster) in new_embeddings.iter().zip(assignments.iter()) {
|
||||
if *cluster >= 0 {
|
||||
println!("Assigned to cluster {}", cluster);
|
||||
} else {
|
||||
println!("Classified as noise/outlier");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Tutorial: Markov Chain Analysis</b></summary>
|
||||
|
||||
### Building a Markov Model
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{MarkovChain, MarkovConfig};
|
||||
|
||||
// Sequences of cluster labels (call types)
|
||||
let sequences: Vec<Vec<i32>> = vec![
|
||||
vec![0, 1, 2, 0, 1], // Sequence 1
|
||||
vec![0, 1, 0, 2, 1], // Sequence 2
|
||||
vec![1, 2, 0, 1, 2], // Sequence 3
|
||||
];
|
||||
|
||||
let config = MarkovConfig {
|
||||
order: 1, // First-order Markov chain
|
||||
smoothing: 0.01, // Laplace smoothing
|
||||
};
|
||||
|
||||
let chain = MarkovChain::fit(&sequences, config)?;
|
||||
|
||||
// Get transition probabilities
|
||||
let probs = chain.transition_matrix();
|
||||
println!("P(1|0) = {:.3}", probs[(0, 1)]); // Probability of 1 given 0
|
||||
```
|
||||
|
||||
### Sequence Prediction
|
||||
|
||||
```rust
|
||||
// Predict next state
|
||||
let current_state = 0;
|
||||
let next_probs = chain.predict_next(current_state)?;
|
||||
|
||||
println!("Next state probabilities from state {}:", current_state);
|
||||
for (state, prob) in next_probs.iter().enumerate() {
|
||||
println!(" State {}: {:.3}", state, prob);
|
||||
}
|
||||
|
||||
// Generate synthetic sequence
|
||||
let generated = chain.generate(10, Some(0))?; // 10 states, starting from 0
|
||||
println!("Generated sequence: {:?}", generated);
|
||||
```
|
||||
|
||||
### Sequence Analysis
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::MarkovAnalysis;
|
||||
|
||||
let analysis = MarkovAnalysis::new(&chain);
|
||||
|
||||
// Stationary distribution
|
||||
let stationary = analysis.stationary_distribution()?;
|
||||
println!("Stationary distribution: {:?}", stationary);
|
||||
|
||||
// Entropy rate
|
||||
let entropy = analysis.entropy_rate()?;
|
||||
println!("Entropy rate: {:.3} bits", entropy);
|
||||
|
||||
// Expected hitting times
|
||||
let hitting_times = analysis.mean_hitting_times()?;
|
||||
println!("Mean hitting time 0→2: {:.2} steps", hitting_times[(0, 2)]);
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Tutorial: Motif Detection</b></summary>
|
||||
|
||||
### Finding Repeated Patterns
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{MotifDetector, MotifConfig};
|
||||
|
||||
let config = MotifConfig {
|
||||
min_length: 3, // Minimum motif length
|
||||
max_length: 10, // Maximum motif length
|
||||
similarity_threshold: 0.85,
|
||||
min_occurrences: 2,
|
||||
};
|
||||
|
||||
let detector = MotifDetector::new(config);
|
||||
let motifs = detector.detect(&embeddings)?;
|
||||
|
||||
for motif in &motifs {
|
||||
println!("Motif found:");
|
||||
println!(" Length: {} segments", motif.length);
|
||||
println!(" Occurrences: {}", motif.occurrences.len());
|
||||
println!(" Positions: {:?}", motif.positions());
|
||||
println!(" Average similarity: {:.3}", motif.avg_similarity);
|
||||
}
|
||||
```
|
||||
|
||||
### Motif Visualization
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::motif_to_sequence;
|
||||
|
||||
for motif in motifs.iter().take(5) {
|
||||
// Get the representative sequence
|
||||
let sequence = motif_to_sequence(&embeddings, motif)?;
|
||||
|
||||
println!("Motif #{} (len={})", motif.id, motif.length);
|
||||
println!(" Representative: {:?}", sequence);
|
||||
|
||||
// Show all occurrences
|
||||
for (i, occ) in motif.occurrences.iter().enumerate() {
|
||||
println!(" Occurrence {}: positions {}-{}",
|
||||
i, occ.start, occ.end);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Cross-Recording Motifs
|
||||
|
||||
```rust
|
||||
// Find motifs that appear across multiple recordings
|
||||
let recordings: Vec<(RecordingId, Vec<Embedding>)> = load_recordings()?;
|
||||
|
||||
let cross_motifs = detector.detect_cross_recording(&recordings)?;
|
||||
|
||||
for motif in cross_motifs {
|
||||
println!("Cross-recording motif:");
|
||||
println!(" Appears in {} recordings", motif.recording_ids.len());
|
||||
println!(" Total occurrences: {}", motif.total_occurrences);
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Tutorial: Statistical Analysis</b></summary>
|
||||
|
||||
### Diversity Indices
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{shannon_index, simpson_index, species_richness};
|
||||
|
||||
// Count species occurrences
|
||||
let species_counts = count_species(&labels)?;
|
||||
|
||||
let shannon = shannon_index(&species_counts);
|
||||
let simpson = simpson_index(&species_counts);
|
||||
let richness = species_richness(&species_counts);
|
||||
|
||||
println!("Shannon Index (H'): {:.3}", shannon);
|
||||
println!("Simpson Index (D): {:.3}", simpson);
|
||||
println!("Species Richness: {}", richness);
|
||||
```
|
||||
|
||||
### Entropy Analysis
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{sequence_entropy, normalized_entropy};
|
||||
|
||||
// Entropy of call sequences
|
||||
let sequence: Vec<i32> = vec![0, 1, 2, 0, 1, 0, 2, 1, 0];
|
||||
|
||||
let entropy = sequence_entropy(&sequence);
|
||||
let norm_entropy = normalized_entropy(&sequence);
|
||||
|
||||
println!("Sequence entropy: {:.3} bits", entropy);
|
||||
println!("Normalized entropy: {:.3}", norm_entropy); // 0-1 scale
|
||||
```
|
||||
|
||||
### Periodicity Detection
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{detect_periodicity, PeriodicityConfig};
|
||||
|
||||
let config = PeriodicityConfig {
|
||||
min_period: 2,
|
||||
max_period: 100,
|
||||
confidence_threshold: 0.7,
|
||||
};
|
||||
|
||||
let timestamps: Vec<f64> = get_call_timestamps()?;
|
||||
let periods = detect_periodicity(&timestamps, config)?;
|
||||
|
||||
for (period, confidence) in periods {
|
||||
println!("Period: {:.1}s (confidence: {:.2})", period, confidence);
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Tutorial: Temporal Analysis</b></summary>
|
||||
|
||||
### Diel Activity Patterns
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{DielAnalyzer, TimeOfDay};
|
||||
|
||||
let analyzer = DielAnalyzer::new();
|
||||
|
||||
// Analyze activity by time of day
|
||||
let pattern = analyzer.analyze(&timestamps)?;
|
||||
|
||||
println!("Dawn chorus: {} calls", pattern.count(TimeOfDay::Dawn));
|
||||
println!("Morning: {} calls", pattern.count(TimeOfDay::Morning));
|
||||
println!("Midday: {} calls", pattern.count(TimeOfDay::Midday));
|
||||
println!("Evening: {} calls", pattern.count(TimeOfDay::Evening));
|
||||
println!("Night: {} calls", pattern.count(TimeOfDay::Night));
|
||||
|
||||
// Peak activity time
|
||||
let peak = pattern.peak_hour();
|
||||
println!("Peak activity: {:02}:00", peak);
|
||||
```
|
||||
|
||||
### Seasonal Trends
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{SeasonalAnalyzer, Season};
|
||||
|
||||
let analyzer = SeasonalAnalyzer::new();
|
||||
let trend = analyzer.analyze(&dated_records)?;
|
||||
|
||||
println!("Spring activity: {:.1}%", trend.percentage(Season::Spring));
|
||||
println!("Breeding season peak: {:?}", trend.breeding_peak());
|
||||
println!("Migration periods: {:?}", trend.migration_windows());
|
||||
```
|
||||
|
||||
### Time Series Analysis
|
||||
|
||||
```rust
|
||||
use sevensense_analysis::{TimeSeries, Aggregation};
|
||||
|
||||
let series = TimeSeries::from_events(&events)?;
|
||||
|
||||
// Aggregate by hour
|
||||
let hourly = series.aggregate(Aggregation::Hourly)?;
|
||||
|
||||
// Detect anomalies
|
||||
let anomalies = series.detect_anomalies(3.0)?; // 3-sigma threshold
|
||||
|
||||
for anomaly in anomalies {
|
||||
println!("Anomaly at {}: {} calls (expected: {})",
|
||||
anomaly.timestamp, anomaly.actual, anomaly.expected);
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### HdbscanConfig Parameters
|
||||
|
||||
| Parameter | Default | Description |
|
||||
|-----------|---------|-------------|
|
||||
| `min_cluster_size` | 5 | Minimum cluster size |
|
||||
| `min_samples` | 3 | Core point threshold |
|
||||
| `epsilon` | 0.0 | Distance threshold (0=auto) |
|
||||
| `metric` | Euclidean | Distance metric |
|
||||
|
||||
### MarkovConfig Parameters
|
||||
|
||||
| Parameter | Default | Description |
|
||||
|-----------|---------|-------------|
|
||||
| `order` | 1 | Markov chain order |
|
||||
| `smoothing` | 0.01 | Laplace smoothing factor |
|
||||
|
||||
## Algorithms
|
||||
|
||||
| Algorithm | Complexity | Use Case |
|
||||
|-----------|------------|----------|
|
||||
| HDBSCAN | O(n log n) | Clustering with noise |
|
||||
| Markov Chain | O(n × s²) | Sequence modeling |
|
||||
| Motif Discovery | O(n² × m) | Pattern finding |
|
||||
| FFT Periodicity | O(n log n) | Rhythm detection |
|
||||
|
||||
## Links
|
||||
|
||||
- **Homepage**: [ruv.io](https://ruv.io)
|
||||
- **Repository**: [github.com/ruvnet/ruvector](https://github.com/ruvnet/ruvector)
|
||||
- **Crates.io**: [crates.io/crates/sevensense-analysis](https://crates.io/crates/sevensense-analysis)
|
||||
- **Documentation**: [docs.rs/sevensense-analysis](https://docs.rs/sevensense-analysis)
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see [LICENSE](../../LICENSE) for details.
|
||||
|
||||
---
|
||||
|
||||
*Part of the [7sense Bioacoustic Intelligence Platform](https://ruv.io) by rUv*
|
||||
@@ -0,0 +1,11 @@
|
||||
//! Application layer for the Analysis bounded context.
|
||||
//!
|
||||
//! Contains application services that orchestrate domain operations
|
||||
//! and coordinate with infrastructure components.
|
||||
|
||||
pub mod services;
|
||||
|
||||
// Re-export service types
|
||||
pub use services::{
|
||||
AnomalyDetectionService, ClusteringService, MotifDetectionService, SequenceAnalysisService,
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,692 @@
|
||||
//! Domain entities for the Analysis bounded context.
|
||||
//!
|
||||
//! This module contains the core domain entities representing clusters,
|
||||
//! prototypes, motifs, sequences, and anomalies in bioacoustic analysis.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Unique identifier for a cluster.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct ClusterId(Uuid);
|
||||
|
||||
impl ClusterId {
|
||||
/// Create a new random cluster ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create a cluster ID from a UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Noise cluster ID (used for HDBSCAN noise points).
|
||||
#[must_use]
|
||||
pub fn noise() -> Self {
|
||||
Self(Uuid::nil())
|
||||
}
|
||||
|
||||
/// Check if this is the noise cluster.
|
||||
#[must_use]
|
||||
pub fn is_noise(&self) -> bool {
|
||||
self.0.is_nil()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ClusterId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ClusterId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for ClusterId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for an embedding (from sevensense-embedding context).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct EmbeddingId(Uuid);
|
||||
|
||||
impl EmbeddingId {
|
||||
/// Create a new random embedding ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create from UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for EmbeddingId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EmbeddingId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for EmbeddingId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for a recording (from sevensense-audio context).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct RecordingId(Uuid);
|
||||
|
||||
impl RecordingId {
|
||||
/// Create a new random recording ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create from UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RecordingId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RecordingId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for RecordingId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for a segment (from sevensense-audio context).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct SegmentId(Uuid);
|
||||
|
||||
impl SegmentId {
|
||||
/// Create a new random segment ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create from UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SegmentId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for SegmentId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for SegmentId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// A cluster of acoustically similar call segments.
|
||||
///
|
||||
/// Clusters group embeddings that represent similar vocalizations,
|
||||
/// enabling pattern discovery and call type identification.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Cluster {
|
||||
/// Unique identifier for this cluster.
|
||||
pub id: ClusterId,
|
||||
|
||||
/// The prototype (representative) embedding ID for this cluster.
|
||||
pub prototype_id: EmbeddingId,
|
||||
|
||||
/// IDs of all embeddings belonging to this cluster.
|
||||
pub member_ids: Vec<EmbeddingId>,
|
||||
|
||||
/// Centroid vector (mean of all member embeddings).
|
||||
pub centroid: Vec<f32>,
|
||||
|
||||
/// Variance within the cluster (measure of spread).
|
||||
pub variance: f32,
|
||||
|
||||
/// Optional human-readable label for the cluster.
|
||||
pub label: Option<String>,
|
||||
|
||||
/// Timestamp when the cluster was created.
|
||||
pub created_at: DateTime<Utc>,
|
||||
|
||||
/// Timestamp when the cluster was last updated.
|
||||
pub updated_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Cluster {
|
||||
/// Create a new cluster with the given parameters.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
prototype_id: EmbeddingId,
|
||||
member_ids: Vec<EmbeddingId>,
|
||||
centroid: Vec<f32>,
|
||||
variance: f32,
|
||||
) -> Self {
|
||||
let now = Utc::now();
|
||||
Self {
|
||||
id: ClusterId::new(),
|
||||
prototype_id,
|
||||
member_ids,
|
||||
centroid,
|
||||
variance,
|
||||
label: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the number of members in this cluster.
|
||||
#[must_use]
|
||||
pub fn member_count(&self) -> usize {
|
||||
self.member_ids.len()
|
||||
}
|
||||
|
||||
/// Check if an embedding is a member of this cluster.
|
||||
#[must_use]
|
||||
pub fn contains(&self, embedding_id: &EmbeddingId) -> bool {
|
||||
self.member_ids.contains(embedding_id)
|
||||
}
|
||||
|
||||
/// Add a member to the cluster.
|
||||
pub fn add_member(&mut self, embedding_id: EmbeddingId) {
|
||||
if !self.member_ids.contains(&embedding_id) {
|
||||
self.member_ids.push(embedding_id);
|
||||
self.updated_at = Utc::now();
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove a member from the cluster.
|
||||
pub fn remove_member(&mut self, embedding_id: &EmbeddingId) -> bool {
|
||||
if let Some(pos) = self.member_ids.iter().position(|id| id == embedding_id) {
|
||||
self.member_ids.remove(pos);
|
||||
self.updated_at = Utc::now();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the centroid vector.
|
||||
pub fn update_centroid(&mut self, centroid: Vec<f32>, variance: f32) {
|
||||
self.centroid = centroid;
|
||||
self.variance = variance;
|
||||
self.updated_at = Utc::now();
|
||||
}
|
||||
|
||||
/// Set a human-readable label for this cluster.
|
||||
pub fn set_label(&mut self, label: impl Into<String>) {
|
||||
self.label = Some(label.into());
|
||||
self.updated_at = Utc::now();
|
||||
}
|
||||
}
|
||||
|
||||
/// A prototype (exemplar) embedding that best represents a cluster.
|
||||
///
|
||||
/// Prototypes are actual call segments that serve as the most representative
|
||||
/// examples of their cluster, useful for visualization and interpretation.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Prototype {
|
||||
/// The embedding ID of this prototype.
|
||||
pub id: EmbeddingId,
|
||||
|
||||
/// The cluster this prototype represents.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// Score indicating how well this exemplar represents the cluster.
|
||||
/// Higher scores indicate better representation.
|
||||
pub exemplar_score: f32,
|
||||
|
||||
/// Optional path to the spectrogram image for visualization.
|
||||
pub spectrogram_path: Option<PathBuf>,
|
||||
|
||||
/// Timestamp when this prototype was identified.
|
||||
pub created_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Prototype {
|
||||
/// Create a new prototype.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
id: EmbeddingId,
|
||||
cluster_id: ClusterId,
|
||||
exemplar_score: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
cluster_id,
|
||||
exemplar_score,
|
||||
spectrogram_path: None,
|
||||
created_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the spectrogram path for this prototype.
|
||||
pub fn set_spectrogram_path(&mut self, path: impl Into<PathBuf>) {
|
||||
self.spectrogram_path = Some(path.into());
|
||||
}
|
||||
}
|
||||
|
||||
/// A motif (recurring pattern) in vocalization sequences.
|
||||
///
|
||||
/// Motifs represent frequently occurring sequences of cluster assignments,
|
||||
/// indicating repeated vocal phrases or behavioral patterns.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Motif {
|
||||
/// Unique identifier for this motif.
|
||||
pub id: String,
|
||||
|
||||
/// The sequence of cluster IDs that define this motif.
|
||||
pub sequence: Vec<ClusterId>,
|
||||
|
||||
/// Number of times this motif occurs in the analyzed data.
|
||||
pub occurrences: usize,
|
||||
|
||||
/// Average duration of this motif in milliseconds.
|
||||
pub avg_duration_ms: f64,
|
||||
|
||||
/// Confidence score for this motif (0.0 to 1.0).
|
||||
pub confidence: f32,
|
||||
|
||||
/// All occurrences of this motif.
|
||||
pub occurrence_instances: Vec<MotifOccurrence>,
|
||||
|
||||
/// Timestamp when this motif was discovered.
|
||||
pub discovered_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Motif {
|
||||
/// Create a new motif.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
sequence: Vec<ClusterId>,
|
||||
occurrences: usize,
|
||||
avg_duration_ms: f64,
|
||||
confidence: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
sequence,
|
||||
occurrences,
|
||||
avg_duration_ms,
|
||||
confidence,
|
||||
occurrence_instances: Vec::new(),
|
||||
discovered_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the length of this motif (number of clusters).
|
||||
#[must_use]
|
||||
pub fn length(&self) -> usize {
|
||||
self.sequence.len()
|
||||
}
|
||||
|
||||
/// Add an occurrence instance to this motif.
|
||||
pub fn add_occurrence(&mut self, occurrence: MotifOccurrence) {
|
||||
self.occurrence_instances.push(occurrence);
|
||||
self.occurrences = self.occurrence_instances.len();
|
||||
}
|
||||
|
||||
/// Check if this motif contains a specific cluster.
|
||||
#[must_use]
|
||||
pub fn contains_cluster(&self, cluster_id: &ClusterId) -> bool {
|
||||
self.sequence.contains(cluster_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// A specific occurrence of a motif in a recording.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MotifOccurrence {
|
||||
/// The recording where this occurrence was found.
|
||||
pub recording_id: RecordingId,
|
||||
|
||||
/// The segment IDs that make up this occurrence.
|
||||
pub segment_ids: Vec<SegmentId>,
|
||||
|
||||
/// Start time within the recording (milliseconds).
|
||||
pub start_time_ms: u64,
|
||||
|
||||
/// End time within the recording (milliseconds).
|
||||
pub end_time_ms: u64,
|
||||
|
||||
/// Similarity score to the motif template.
|
||||
pub similarity: f32,
|
||||
}
|
||||
|
||||
impl MotifOccurrence {
|
||||
/// Create a new motif occurrence.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
recording_id: RecordingId,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
start_time_ms: u64,
|
||||
end_time_ms: u64,
|
||||
similarity: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
recording_id,
|
||||
segment_ids,
|
||||
start_time_ms,
|
||||
end_time_ms,
|
||||
similarity,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the duration of this occurrence in milliseconds.
|
||||
#[must_use]
|
||||
pub fn duration_ms(&self) -> u64 {
|
||||
self.end_time_ms.saturating_sub(self.start_time_ms)
|
||||
}
|
||||
}
|
||||
|
||||
/// Analysis of a vocalization sequence from a recording.
|
||||
///
|
||||
/// Contains transition information, entropy metrics, and stereotypy scores
|
||||
/// for understanding sequential patterns in bird vocalizations.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SequenceAnalysis {
|
||||
/// The recording this analysis pertains to.
|
||||
pub recording_id: RecordingId,
|
||||
|
||||
/// Transitions between clusters with weights (probabilities).
|
||||
/// Format: (source_cluster, target_cluster, probability)
|
||||
pub transitions: Vec<(ClusterId, ClusterId, f32)>,
|
||||
|
||||
/// Shannon entropy of the transition distribution.
|
||||
/// Higher values indicate more unpredictable sequences.
|
||||
pub entropy: f32,
|
||||
|
||||
/// Stereotypy score (0.0 to 1.0).
|
||||
/// Higher values indicate more repetitive/stereotyped sequences.
|
||||
pub stereotypy_score: f32,
|
||||
|
||||
/// The sequence of cluster IDs in order.
|
||||
pub cluster_sequence: Vec<ClusterId>,
|
||||
|
||||
/// The segment IDs corresponding to the cluster sequence.
|
||||
pub segment_ids: Vec<SegmentId>,
|
||||
|
||||
/// Timestamp when this analysis was performed.
|
||||
pub analyzed_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl SequenceAnalysis {
|
||||
/// Create a new sequence analysis.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
recording_id: RecordingId,
|
||||
transitions: Vec<(ClusterId, ClusterId, f32)>,
|
||||
entropy: f32,
|
||||
stereotypy_score: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
recording_id,
|
||||
transitions,
|
||||
entropy,
|
||||
stereotypy_score,
|
||||
cluster_sequence: Vec::new(),
|
||||
segment_ids: Vec::new(),
|
||||
analyzed_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the number of unique transitions.
|
||||
#[must_use]
|
||||
pub fn unique_transition_count(&self) -> usize {
|
||||
self.transitions.len()
|
||||
}
|
||||
|
||||
/// Get all clusters involved in the sequence.
|
||||
#[must_use]
|
||||
pub fn unique_clusters(&self) -> Vec<ClusterId> {
|
||||
let mut clusters: Vec<ClusterId> = self.cluster_sequence.clone();
|
||||
clusters.sort_by_key(|c| c.as_uuid());
|
||||
clusters.dedup();
|
||||
clusters
|
||||
}
|
||||
|
||||
/// Set the cluster sequence and corresponding segment IDs.
|
||||
pub fn set_sequence(&mut self, clusters: Vec<ClusterId>, segments: Vec<SegmentId>) {
|
||||
self.cluster_sequence = clusters;
|
||||
self.segment_ids = segments;
|
||||
}
|
||||
}
|
||||
|
||||
/// Type of anomaly detected in the analysis.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AnomalyType {
|
||||
/// Rare vocalization (low occurrence count).
|
||||
Rare,
|
||||
/// Novel vocalization (doesn't fit any cluster well).
|
||||
Novel,
|
||||
/// Artifact (likely noise or recording issue).
|
||||
Artifact,
|
||||
/// Outlier within a cluster.
|
||||
Outlier,
|
||||
/// Unknown anomaly type.
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for AnomalyType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
AnomalyType::Rare => write!(f, "Rare"),
|
||||
AnomalyType::Novel => write!(f, "Novel"),
|
||||
AnomalyType::Artifact => write!(f, "Artifact"),
|
||||
AnomalyType::Outlier => write!(f, "Outlier"),
|
||||
AnomalyType::Unknown => write!(f, "Unknown"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An anomalous embedding that doesn't fit well into any cluster.
|
||||
///
|
||||
/// Anomalies can represent rare vocalizations, novel sounds, or artifacts.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Anomaly {
|
||||
/// The embedding that is anomalous.
|
||||
pub embedding_id: EmbeddingId,
|
||||
|
||||
/// Anomaly score (higher = more anomalous).
|
||||
pub anomaly_score: f32,
|
||||
|
||||
/// The nearest cluster to this anomaly.
|
||||
pub nearest_cluster: ClusterId,
|
||||
|
||||
/// Distance from the anomaly to the nearest cluster's centroid.
|
||||
pub distance_to_centroid: f32,
|
||||
|
||||
/// Type of anomaly detected.
|
||||
pub anomaly_type: AnomalyType,
|
||||
|
||||
/// Local outlier factor (if computed).
|
||||
pub local_outlier_factor: Option<f32>,
|
||||
|
||||
/// Timestamp when this anomaly was detected.
|
||||
pub detected_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Anomaly {
|
||||
/// Create a new anomaly.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
embedding_id: EmbeddingId,
|
||||
anomaly_score: f32,
|
||||
nearest_cluster: ClusterId,
|
||||
distance_to_centroid: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
embedding_id,
|
||||
anomaly_score,
|
||||
nearest_cluster,
|
||||
distance_to_centroid,
|
||||
anomaly_type: AnomalyType::Unknown,
|
||||
local_outlier_factor: None,
|
||||
detected_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the anomaly type.
|
||||
pub fn set_type(&mut self, anomaly_type: AnomalyType) {
|
||||
self.anomaly_type = anomaly_type;
|
||||
}
|
||||
|
||||
/// Set the local outlier factor.
|
||||
pub fn set_lof(&mut self, lof: f32) {
|
||||
self.local_outlier_factor = Some(lof);
|
||||
}
|
||||
|
||||
/// Check if this is a severe anomaly (score > threshold).
|
||||
#[must_use]
|
||||
pub fn is_severe(&self, threshold: f32) -> bool {
|
||||
self.anomaly_score > threshold
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Two freshly generated IDs must differ, and only the sentinel noise ID
    // reports itself as noise.
    #[test]
    fn test_cluster_id_creation() {
        let id1 = ClusterId::new();
        let id2 = ClusterId::new();
        assert_ne!(id1, id2);

        let noise = ClusterId::noise();
        assert!(noise.is_noise());
        assert!(!id1.is_noise());
    }

    // Adding/removing a member adjusts the count and membership query.
    #[test]
    fn test_cluster_member_operations() {
        // 1536-dim zero centroid; dimension chosen to match the embedding
        // size used elsewhere in the project — TODO confirm.
        let mut cluster = Cluster::new(
            EmbeddingId::new(),
            vec![EmbeddingId::new()],
            vec![0.0; 1536],
            0.1,
        );

        let new_member = EmbeddingId::new();
        cluster.add_member(new_member);
        assert_eq!(cluster.member_count(), 2);
        assert!(cluster.contains(&new_member));

        cluster.remove_member(&new_member);
        assert_eq!(cluster.member_count(), 1);
        assert!(!cluster.contains(&new_member));
    }

    // A motif's length is the length of its cluster pattern, independent of
    // occurrence count.
    #[test]
    fn test_motif_length() {
        let motif = Motif::new(
            vec![ClusterId::new(), ClusterId::new(), ClusterId::new()],
            5,
            1500.0,
            0.85,
        );
        assert_eq!(motif.length(), 3);
        assert_eq!(motif.occurrences, 5);
    }

    // unique_clusters deduplicates an alternating two-cluster sequence
    // down to exactly two entries.
    #[test]
    fn test_sequence_analysis_unique_clusters() {
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();

        let mut analysis = SequenceAnalysis::new(
            RecordingId::new(),
            vec![],
            1.5,
            0.3,
        );
        analysis.set_sequence(
            vec![c1, c2, c1, c2, c1],
            vec![SegmentId::new(); 5],
        );

        let unique = analysis.unique_clusters();
        assert_eq!(unique.len(), 2);
    }

    // is_severe is a strict > comparison against the threshold, and
    // set_type overwrites the default Unknown classification.
    #[test]
    fn test_anomaly_severity() {
        let mut anomaly = Anomaly::new(
            EmbeddingId::new(),
            0.8,
            ClusterId::new(),
            2.5,
        );

        assert!(anomaly.is_severe(0.5));
        assert!(!anomaly.is_severe(0.9));

        anomaly.set_type(AnomalyType::Novel);
        assert_eq!(anomaly.anomaly_type, AnomalyType::Novel);
    }
}
|
||||
@@ -0,0 +1,522 @@
|
||||
//! Domain events for the Analysis bounded context.
|
||||
//!
|
||||
//! Domain events represent significant occurrences within the Analysis domain
|
||||
//! that other parts of the system may need to react to.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::entities::{AnomalyType, ClusterId, EmbeddingId, RecordingId};
|
||||
use super::value_objects::ClusteringMethod;
|
||||
|
||||
/// Base trait for analysis domain events.
///
/// Every concrete event in this module carries its own `event_id` and
/// `occurred_at` fields and implements this trait by returning them, so the
/// trait gives consumers a uniform way to log, deduplicate, and order events
/// without knowing the concrete type. The trait is object-safe, so events
/// can be handled via `dyn AnalysisEvent`.
pub trait AnalysisEvent: Send + Sync {
    /// Get the unique event ID.
    fn event_id(&self) -> Uuid;

    /// Get the timestamp when the event occurred.
    fn occurred_at(&self) -> DateTime<Utc>;

    /// Get the event type name.
    ///
    /// By convention this matches the struct name (e.g. "ClustersDiscovered").
    fn event_type(&self) -> &'static str;
}
|
||||
|
||||
/// Event emitted when clustering is completed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClustersDiscovered {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// Number of clusters discovered.
|
||||
pub cluster_count: usize,
|
||||
|
||||
/// Number of noise points (not assigned to any cluster).
|
||||
pub noise_count: usize,
|
||||
|
||||
/// Clustering method used.
|
||||
pub method: ClusteringMethod,
|
||||
|
||||
/// Silhouette score (if computed).
|
||||
pub silhouette_score: Option<f32>,
|
||||
|
||||
/// Total number of embeddings processed.
|
||||
pub total_embeddings: usize,
|
||||
}
|
||||
|
||||
impl ClustersDiscovered {
|
||||
/// Create a new ClustersDiscovered event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
cluster_count: usize,
|
||||
noise_count: usize,
|
||||
method: ClusteringMethod,
|
||||
total_embeddings: usize,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
cluster_count,
|
||||
noise_count,
|
||||
method,
|
||||
silhouette_score: None,
|
||||
total_embeddings,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add silhouette score to the event.
|
||||
#[must_use]
|
||||
pub fn with_silhouette_score(mut self, score: f32) -> Self {
|
||||
self.silhouette_score = Some(score);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for ClustersDiscovered {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"ClustersDiscovered"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when an embedding is assigned to a cluster.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusterAssigned {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The embedding that was assigned.
|
||||
pub embedding_id: EmbeddingId,
|
||||
|
||||
/// The cluster it was assigned to.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// Confidence/probability of the assignment.
|
||||
pub confidence: f32,
|
||||
|
||||
/// Distance to the cluster centroid.
|
||||
pub distance_to_centroid: f32,
|
||||
}
|
||||
|
||||
impl ClusterAssigned {
|
||||
/// Create a new ClusterAssigned event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
embedding_id: EmbeddingId,
|
||||
cluster_id: ClusterId,
|
||||
confidence: f32,
|
||||
distance_to_centroid: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
embedding_id,
|
||||
cluster_id,
|
||||
confidence,
|
||||
distance_to_centroid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for ClusterAssigned {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"ClusterAssigned"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when a motif pattern is detected.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MotifDetected {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The motif ID.
|
||||
pub motif_id: String,
|
||||
|
||||
/// The cluster sequence defining the motif.
|
||||
pub pattern: Vec<ClusterId>,
|
||||
|
||||
/// Number of occurrences found.
|
||||
pub occurrences: usize,
|
||||
|
||||
/// Confidence score for this motif.
|
||||
pub confidence: f32,
|
||||
|
||||
/// Average duration in milliseconds.
|
||||
pub avg_duration_ms: f64,
|
||||
}
|
||||
|
||||
impl MotifDetected {
|
||||
/// Create a new MotifDetected event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
motif_id: String,
|
||||
pattern: Vec<ClusterId>,
|
||||
occurrences: usize,
|
||||
confidence: f32,
|
||||
avg_duration_ms: f64,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
motif_id,
|
||||
pattern,
|
||||
occurrences,
|
||||
confidence,
|
||||
avg_duration_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for MotifDetected {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"MotifDetected"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when a sequence is analyzed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SequenceAnalyzed {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The recording that was analyzed.
|
||||
pub recording_id: RecordingId,
|
||||
|
||||
/// Shannon entropy of the sequence.
|
||||
pub entropy: f32,
|
||||
|
||||
/// Stereotypy score.
|
||||
pub stereotypy_score: f32,
|
||||
|
||||
/// Number of unique clusters in the sequence.
|
||||
pub unique_clusters: usize,
|
||||
|
||||
/// Number of unique transitions.
|
||||
pub unique_transitions: usize,
|
||||
|
||||
/// Total sequence length.
|
||||
pub sequence_length: usize,
|
||||
}
|
||||
|
||||
impl SequenceAnalyzed {
|
||||
/// Create a new SequenceAnalyzed event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
recording_id: RecordingId,
|
||||
entropy: f32,
|
||||
stereotypy_score: f32,
|
||||
unique_clusters: usize,
|
||||
unique_transitions: usize,
|
||||
sequence_length: usize,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
recording_id,
|
||||
entropy,
|
||||
stereotypy_score,
|
||||
unique_clusters,
|
||||
unique_transitions,
|
||||
sequence_length,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for SequenceAnalyzed {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"SequenceAnalyzed"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when an anomaly is detected.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AnomalyDetected {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The embedding identified as anomalous.
|
||||
pub embedding_id: EmbeddingId,
|
||||
|
||||
/// Anomaly score.
|
||||
pub anomaly_score: f32,
|
||||
|
||||
/// Type of anomaly.
|
||||
pub anomaly_type: AnomalyType,
|
||||
|
||||
/// The nearest cluster.
|
||||
pub nearest_cluster: ClusterId,
|
||||
|
||||
/// Distance to the nearest cluster centroid.
|
||||
pub distance_to_centroid: f32,
|
||||
}
|
||||
|
||||
impl AnomalyDetected {
|
||||
/// Create a new AnomalyDetected event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
embedding_id: EmbeddingId,
|
||||
anomaly_score: f32,
|
||||
anomaly_type: AnomalyType,
|
||||
nearest_cluster: ClusterId,
|
||||
distance_to_centroid: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
embedding_id,
|
||||
anomaly_score,
|
||||
anomaly_type,
|
||||
nearest_cluster,
|
||||
distance_to_centroid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for AnomalyDetected {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"AnomalyDetected"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when cluster prototypes are updated.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PrototypesComputed {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The cluster for which prototypes were computed.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// Number of prototypes computed.
|
||||
pub prototype_count: usize,
|
||||
|
||||
/// Best exemplar score.
|
||||
pub best_exemplar_score: f32,
|
||||
}
|
||||
|
||||
impl PrototypesComputed {
|
||||
/// Create a new PrototypesComputed event.
|
||||
#[must_use]
|
||||
pub fn new(cluster_id: ClusterId, prototype_count: usize, best_exemplar_score: f32) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
cluster_id,
|
||||
prototype_count,
|
||||
best_exemplar_score,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for PrototypesComputed {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"PrototypesComputed"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when a cluster label is updated.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusterLabeled {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The cluster that was labeled.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// The new label (None if label was removed).
|
||||
pub label: Option<String>,
|
||||
|
||||
/// Previous label (None if no previous label).
|
||||
pub previous_label: Option<String>,
|
||||
}
|
||||
|
||||
impl ClusterLabeled {
|
||||
/// Create a new ClusterLabeled event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
cluster_id: ClusterId,
|
||||
label: Option<String>,
|
||||
previous_label: Option<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
cluster_id,
|
||||
label,
|
||||
previous_label,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for ClusterLabeled {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"ClusterLabeled"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event publisher trait for analysis events.
///
/// NOTE(review): `publish` is generic over the event type, which makes this
/// trait NOT object-safe — it cannot be used as `Box<dyn
/// AnalysisEventPublisher>`. If dynamic dispatch over publishers is needed
/// elsewhere, the method would have to take a non-generic parameter (e.g. a
/// serialized payload) instead.
#[async_trait::async_trait]
pub trait AnalysisEventPublisher: Send + Sync {
    /// Publish an analysis event.
    ///
    /// The `Serialize` bound suggests implementations serialize the event
    /// for transport — confirm against the infrastructure implementations.
    async fn publish<E: AnalysisEvent + Serialize + 'static>(
        &self,
        event: E,
    ) -> Result<(), EventPublishError>;
}

/// Error type for event publishing.
#[derive(Debug, thiserror::Error)]
pub enum EventPublishError {
    /// Serialization failed.
    #[error("Failed to serialize event: {0}")]
    Serialization(String),

    /// Transport error.
    #[error("Failed to publish event: {0}")]
    Transport(String),

    /// Channel closed.
    #[error("Event channel closed")]
    ChannelClosed,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Constructor fields round-trip, the builder attaches the optional
    // silhouette score, and event_type matches the struct name.
    #[test]
    fn test_clusters_discovered_event() {
        let event = ClustersDiscovered::new(
            10,
            5,
            ClusteringMethod::HDBSCAN,
            100,
        )
        .with_silhouette_score(0.75);

        assert_eq!(event.cluster_count, 10);
        assert_eq!(event.noise_count, 5);
        assert_eq!(event.silhouette_score, Some(0.75));
        assert_eq!(event.event_type(), "ClustersDiscovered");
    }

    // Assignment events carry the confidence through unchanged.
    #[test]
    fn test_cluster_assigned_event() {
        let event = ClusterAssigned::new(
            EmbeddingId::new(),
            ClusterId::new(),
            0.95,
            0.1,
        );

        assert_eq!(event.confidence, 0.95);
        assert_eq!(event.event_type(), "ClusterAssigned");
    }

    // The motif pattern vector is stored as given (no dedup/reorder).
    #[test]
    fn test_motif_detected_event() {
        let pattern = vec![ClusterId::new(), ClusterId::new()];
        let event = MotifDetected::new(
            "motif-1".to_string(),
            pattern.clone(),
            10,
            0.85,
            1500.0,
        );

        assert_eq!(event.pattern.len(), 2);
        assert_eq!(event.occurrences, 10);
        assert_eq!(event.event_type(), "MotifDetected");
    }

    // The anomaly type supplied at construction is preserved.
    #[test]
    fn test_anomaly_detected_event() {
        let event = AnomalyDetected::new(
            EmbeddingId::new(),
            0.9,
            AnomalyType::Novel,
            ClusterId::new(),
            2.5,
        );

        assert_eq!(event.anomaly_type, AnomalyType::Novel);
        assert_eq!(event.event_type(), "AnomalyDetected");
    }
}
|
||||
@@ -0,0 +1,14 @@
|
||||
//! Domain layer for the Analysis bounded context.
//!
//! Contains core domain entities, value objects, repository traits, and domain events.

pub mod entities;
pub mod events;
pub mod repository;
pub mod value_objects;

// Re-export commonly used types.
//
// NOTE(review): the glob from `repository` also re-exports its
// `pub type Result<T>` alias, which can shadow or ambiguate `Result` at
// `use domain::*` sites — consider re-exporting that alias by name
// (e.g. `repository::Result as RepositoryResult`) if it causes friction.
pub use entities::*;
pub use events::*;
pub use repository::*;
pub use value_objects::*;
||||
@@ -0,0 +1,290 @@
|
||||
//! Repository traits for the Analysis bounded context.
|
||||
//!
|
||||
//! These traits define the persistence interfaces for domain entities.
|
||||
//! Implementations are provided in the infrastructure layer.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use thiserror::Error;
|
||||
|
||||
use super::entities::{
|
||||
Anomaly, Cluster, ClusterId, EmbeddingId, Motif, Prototype, RecordingId, SequenceAnalysis,
|
||||
};
|
||||
|
||||
/// Errors that can occur during repository operations.
///
/// Each variant wraps a human-readable message produced by the
/// infrastructure-layer implementation; the variants classify the failure
/// so callers can decide between retry, surface-to-user, or abort.
#[derive(Debug, Error)]
pub enum RepositoryError {
    /// Entity not found.
    #[error("Entity not found: {0}")]
    NotFound(String),

    /// Duplicate entity.
    #[error("Duplicate entity: {0}")]
    Duplicate(String),

    /// Database connection error.
    #[error("Connection error: {0}")]
    ConnectionError(String),

    /// Query execution error.
    #[error("Query error: {0}")]
    QueryError(String),

    /// Serialization/deserialization error.
    #[error("Serialization error: {0}")]
    SerializationError(String),

    /// Invalid data error.
    #[error("Invalid data: {0}")]
    InvalidData(String),

    /// Concurrency conflict.
    #[error("Concurrency conflict: {0}")]
    ConcurrencyError(String),

    /// Internal error.
    #[error("Internal error: {0}")]
    Internal(String),
}

/// Result type for repository operations.
///
/// Shadows `std::result::Result` within this module on purpose; use a
/// qualified path where the std type is needed.
pub type Result<T> = std::result::Result<T, RepositoryError>;
|
||||
|
||||
/// Repository for cluster persistence.
///
/// Implementations live in the infrastructure layer. Note that the `find_*`
/// methods return `Result<Option<_>>`, so a missing entity is presumably
/// reported as `Ok(None)` rather than `RepositoryError::NotFound` — confirm
/// against the concrete implementations.
#[async_trait]
pub trait ClusterRepository: Send + Sync {
    /// Save a cluster to the repository.
    async fn save_cluster(&self, cluster: &Cluster) -> Result<()>;

    /// Save multiple clusters in a batch.
    async fn save_clusters(&self, clusters: &[Cluster]) -> Result<()>;

    /// Find a cluster by its ID.
    async fn find_cluster(&self, id: &ClusterId) -> Result<Option<Cluster>>;

    /// List all clusters.
    async fn list_clusters(&self) -> Result<Vec<Cluster>>;

    /// List clusters with pagination.
    ///
    /// `offset` is the number of clusters to skip, `limit` the maximum
    /// number to return.
    async fn list_clusters_paginated(
        &self,
        offset: usize,
        limit: usize,
    ) -> Result<Vec<Cluster>>;

    /// Assign an embedding to a cluster.
    async fn assign_to_cluster(
        &self,
        embedding_id: &EmbeddingId,
        cluster_id: &ClusterId,
    ) -> Result<()>;

    /// Remove an embedding from its cluster.
    async fn remove_from_cluster(&self, embedding_id: &EmbeddingId) -> Result<()>;

    /// Find the cluster containing a specific embedding.
    async fn find_cluster_by_embedding(
        &self,
        embedding_id: &EmbeddingId,
    ) -> Result<Option<Cluster>>;

    /// Delete a cluster.
    async fn delete_cluster(&self, id: &ClusterId) -> Result<()>;

    /// Delete all clusters.
    async fn delete_all_clusters(&self) -> Result<()>;

    /// Get cluster count.
    async fn cluster_count(&self) -> Result<usize>;

    /// Find clusters by label pattern.
    ///
    /// The pattern syntax (substring, glob, SQL LIKE, …) is implementation
    /// defined — confirm against the infrastructure layer.
    async fn find_clusters_by_label(&self, label_pattern: &str) -> Result<Vec<Cluster>>;

    /// Update cluster label.
    ///
    /// Passing `None` clears the label.
    async fn update_cluster_label(
        &self,
        id: &ClusterId,
        label: Option<String>,
    ) -> Result<()>;
}
|
||||
|
||||
/// Repository for prototype persistence.
///
/// Prototypes are representative exemplars of a cluster; this trait keys all
/// queries by the owning cluster.
#[async_trait]
pub trait PrototypeRepository: Send + Sync {
    /// Save a prototype.
    async fn save_prototype(&self, prototype: &Prototype) -> Result<()>;

    /// Save multiple prototypes in a batch.
    async fn save_prototypes(&self, prototypes: &[Prototype]) -> Result<()>;

    /// Find prototypes for a cluster.
    async fn find_prototypes_by_cluster(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Vec<Prototype>>;

    /// Find the best prototype for a cluster.
    ///
    /// "Best" is presumably the highest exemplar score — confirm against
    /// the implementations.
    async fn find_best_prototype(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Option<Prototype>>;

    /// Delete prototypes for a cluster.
    async fn delete_prototypes_by_cluster(&self, cluster_id: &ClusterId) -> Result<()>;

    /// Delete all prototypes.
    async fn delete_all_prototypes(&self) -> Result<()>;
}
|
||||
|
||||
/// Repository for motif persistence.
///
/// Motifs are identified by a string ID (see [`MotifDetected::motif_id`] in
/// the events module) rather than a typed ID.
#[async_trait]
pub trait MotifRepository: Send + Sync {
    /// Save a motif.
    async fn save_motif(&self, motif: &Motif) -> Result<()>;

    /// Save multiple motifs in a batch.
    async fn save_motifs(&self, motifs: &[Motif]) -> Result<()>;

    /// Find a motif by its ID.
    async fn find_motif(&self, id: &str) -> Result<Option<Motif>>;

    /// Find motifs containing a specific cluster.
    async fn find_motifs_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Motif>>;

    /// List all motifs.
    async fn list_motifs(&self) -> Result<Vec<Motif>>;

    /// List motifs with minimum confidence.
    async fn find_motifs_by_confidence(&self, min_confidence: f32) -> Result<Vec<Motif>>;

    /// List motifs with minimum occurrences.
    async fn find_motifs_by_occurrences(&self, min_occurrences: usize) -> Result<Vec<Motif>>;

    /// Delete a motif.
    async fn delete_motif(&self, id: &str) -> Result<()>;

    /// Delete all motifs.
    async fn delete_all_motifs(&self) -> Result<()>;

    /// Get motif count.
    async fn motif_count(&self) -> Result<usize>;

    /// Find motifs by sequence pattern (exact match).
    async fn find_motifs_by_sequence(&self, sequence: &[ClusterId]) -> Result<Vec<Motif>>;

    /// Find motifs by sequence pattern (subsequence match).
    ///
    /// Whether "subsequence" means contiguous or gapped is implementation
    /// defined — confirm against the infrastructure layer.
    async fn find_motifs_containing_subsequence(
        &self,
        subsequence: &[ClusterId],
    ) -> Result<Vec<Motif>>;
}
|
||||
|
||||
/// Repository for sequence analysis persistence.
///
/// Sequence analyses are keyed by the recording they were computed for
/// (at most one analysis per recording, judging by the `Option` return of
/// [`Self::find_sequence_by_recording`] — confirm).
#[async_trait]
pub trait SequenceRepository: Send + Sync {
    /// Save a sequence analysis.
    async fn save_sequence_analysis(&self, analysis: &SequenceAnalysis) -> Result<()>;

    /// Find sequence analysis for a recording.
    async fn find_sequence_by_recording(
        &self,
        recording_id: &RecordingId,
    ) -> Result<Option<SequenceAnalysis>>;

    /// List all sequence analyses.
    async fn list_sequence_analyses(&self) -> Result<Vec<SequenceAnalysis>>;

    /// Delete sequence analysis for a recording.
    async fn delete_sequence_by_recording(&self, recording_id: &RecordingId) -> Result<()>;

    /// Delete all sequence analyses.
    async fn delete_all_sequences(&self) -> Result<()>;

    /// Find sequences with entropy above threshold.
    async fn find_sequences_by_entropy(&self, min_entropy: f32) -> Result<Vec<SequenceAnalysis>>;

    /// Find sequences with stereotypy above threshold.
    async fn find_sequences_by_stereotypy(
        &self,
        min_stereotypy: f32,
    ) -> Result<Vec<SequenceAnalysis>>;
}
|
||||
|
||||
/// Repository for anomaly persistence.
///
/// Anomalies are keyed by the embedding they were detected on (one anomaly
/// record per embedding, judging by the `Option` return of
/// [`Self::find_anomaly`] — confirm).
#[async_trait]
pub trait AnomalyRepository: Send + Sync {
    /// Save an anomaly.
    async fn save_anomaly(&self, anomaly: &Anomaly) -> Result<()>;

    /// Save multiple anomalies in a batch.
    async fn save_anomalies(&self, anomalies: &[Anomaly]) -> Result<()>;

    /// Find an anomaly by embedding ID.
    async fn find_anomaly(&self, embedding_id: &EmbeddingId) -> Result<Option<Anomaly>>;

    /// List all anomalies.
    async fn list_anomalies(&self) -> Result<Vec<Anomaly>>;

    /// Find anomalies with score above threshold.
    async fn find_anomalies_by_score(&self, min_score: f32) -> Result<Vec<Anomaly>>;

    /// Find anomalies near a specific cluster.
    async fn find_anomalies_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Anomaly>>;

    /// Delete an anomaly.
    async fn delete_anomaly(&self, embedding_id: &EmbeddingId) -> Result<()>;

    /// Delete all anomalies.
    async fn delete_all_anomalies(&self) -> Result<()>;

    /// Get anomaly count.
    async fn anomaly_count(&self) -> Result<usize>;
}
|
||||
|
||||
/// Combined repository for all analysis entities.
///
/// This trait combines all individual repositories for convenience
/// when a single interface to all analysis data is needed.
#[async_trait]
pub trait AnalysisRepository:
    ClusterRepository + PrototypeRepository + MotifRepository + SequenceRepository + AnomalyRepository
{
    /// Clear all analysis data.
    ///
    /// Deletes clusters, prototypes, motifs, sequences, then anomalies, in
    /// that order, short-circuiting on the first error — so a mid-way
    /// failure leaves the store partially cleared. This default method is
    /// not transactional on its own; run it through a [`UnitOfWork`] if
    /// atomicity is required.
    async fn clear_all(&self) -> Result<()> {
        self.delete_all_clusters().await?;
        self.delete_all_prototypes().await?;
        self.delete_all_motifs().await?;
        self.delete_all_sequences().await?;
        self.delete_all_anomalies().await?;
        Ok(())
    }
}
|
||||
|
||||
/// Unit of work for transactional operations.
///
/// Typical flow: `begin()` to obtain a transactional repository, perform
/// repository operations on it, then `commit()` or `rollback()`. Whether an
/// uncommitted transaction rolls back on drop is implementation defined —
/// confirm against the infrastructure layer.
#[async_trait]
pub trait UnitOfWork: Send + Sync {
    /// Type of repository returned by this unit of work.
    type Repository: AnalysisRepository;

    /// Begin a new transaction and return a repository.
    async fn begin(&self) -> Result<Self::Repository>;

    /// Commit the current transaction.
    async fn commit(&self) -> Result<()>;

    /// Rollback the current transaction.
    async fn rollback(&self) -> Result<()>;
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The `#[error]` messages embed the wrapped string payload.
    #[test]
    fn test_repository_error_display() {
        let not_found = RepositoryError::NotFound("cluster-123".to_string());
        assert!(not_found.to_string().contains("cluster-123"));

        let query = RepositoryError::QueryError("syntax error".to_string());
        assert!(query.to_string().contains("syntax error"));
    }
}
|
||||
@@ -0,0 +1,616 @@
|
||||
//! Value objects for the Analysis bounded context.
|
||||
//!
|
||||
//! Value objects are immutable objects that represent concepts without identity.
|
||||
//! They are defined by their attributes rather than a unique identifier.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::entities::ClusterId;
|
||||
|
||||
/// Method used for clustering embeddings.
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub enum ClusteringMethod {
|
||||
/// HDBSCAN (Hierarchical Density-Based Spatial Clustering).
|
||||
/// Good for discovering clusters of varying densities and shapes.
|
||||
HDBSCAN,
|
||||
|
||||
/// K-Means clustering with fixed number of clusters.
|
||||
KMeans {
|
||||
/// Number of clusters to create.
|
||||
k: usize,
|
||||
},
|
||||
|
||||
/// Spectral clustering using eigenvalues of similarity matrix.
|
||||
Spectral {
|
||||
/// Number of clusters to create.
|
||||
n_clusters: usize,
|
||||
},
|
||||
|
||||
/// Agglomerative hierarchical clustering.
|
||||
Agglomerative {
|
||||
/// Number of clusters to create.
|
||||
n_clusters: usize,
|
||||
/// Linkage criterion (ward, complete, average, single).
|
||||
linkage: LinkageMethod,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for ClusteringMethod {
|
||||
fn default() -> Self {
|
||||
Self::HDBSCAN
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ClusteringMethod {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ClusteringMethod::HDBSCAN => write!(f, "HDBSCAN"),
|
||||
ClusteringMethod::KMeans { k } => write!(f, "K-Means (k={})", k),
|
||||
ClusteringMethod::Spectral { n_clusters } => {
|
||||
write!(f, "Spectral (n={})", n_clusters)
|
||||
}
|
||||
ClusteringMethod::Agglomerative { n_clusters, linkage } => {
|
||||
write!(f, "Agglomerative (n={}, {:?})", n_clusters, linkage)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Linkage method for agglomerative clustering.
///
/// Determines how the distance between two clusters is derived from the
/// pairwise distances of their members when merging.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkageMethod {
    /// Ward's minimum variance method.
    Ward,
    /// Complete linkage (maximum distance).
    Complete,
    /// Average linkage (mean distance).
    Average,
    /// Single linkage (minimum distance).
    Single,
}

impl Default for LinkageMethod {
    /// Ward is the default linkage criterion.
    fn default() -> Self {
        Self::Ward
    }
}
|
||||
|
||||
/// Distance metric for clustering.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum DistanceMetric {
|
||||
/// Euclidean distance (L2 norm).
|
||||
Euclidean,
|
||||
/// Cosine distance (1 - cosine similarity).
|
||||
Cosine,
|
||||
/// Manhattan distance (L1 norm).
|
||||
Manhattan,
|
||||
/// Poincare distance (hyperbolic space).
|
||||
Poincare,
|
||||
}
|
||||
|
||||
impl Default for DistanceMetric {
|
||||
fn default() -> Self {
|
||||
Self::Cosine
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DistanceMetric {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DistanceMetric::Euclidean => write!(f, "Euclidean"),
|
||||
DistanceMetric::Cosine => write!(f, "Cosine"),
|
||||
DistanceMetric::Manhattan => write!(f, "Manhattan"),
|
||||
DistanceMetric::Poincare => write!(f, "Poincare"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parameters for clustering algorithms.
///
/// A single parameter bag shared by all methods; fields that do not apply
/// to a given algorithm (e.g. `min_samples` for K-means) are ignored by it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusteringParameters {
    /// Minimum number of points to form a cluster (HDBSCAN).
    pub min_cluster_size: usize,

    /// Minimum number of samples in neighborhood (HDBSCAN).
    pub min_samples: usize,

    /// Epsilon for DBSCAN-like algorithms (optional distance threshold).
    pub epsilon: Option<f32>,

    /// Distance metric to use.
    pub metric: DistanceMetric,

    /// Maximum number of clusters (optional limit).
    pub max_clusters: Option<usize>,

    /// Whether to allow single-point clusters.
    pub allow_single_cluster: bool,
}
|
||||
|
||||
impl Default for ClusteringParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_cluster_size: 5,
|
||||
min_samples: 3,
|
||||
epsilon: None,
|
||||
metric: DistanceMetric::Cosine,
|
||||
max_clusters: None,
|
||||
allow_single_cluster: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusteringParameters {
|
||||
/// Create parameters for HDBSCAN.
|
||||
#[must_use]
|
||||
pub fn hdbscan(min_cluster_size: usize, min_samples: usize) -> Self {
|
||||
Self {
|
||||
min_cluster_size,
|
||||
min_samples,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create parameters for K-means.
|
||||
#[must_use]
|
||||
pub fn kmeans() -> Self {
|
||||
Self {
|
||||
min_cluster_size: 1,
|
||||
min_samples: 1,
|
||||
allow_single_cluster: true,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the distance metric.
|
||||
#[must_use]
|
||||
pub fn with_metric(mut self, metric: DistanceMetric) -> Self {
|
||||
self.metric = metric;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the epsilon threshold.
|
||||
#[must_use]
|
||||
pub fn with_epsilon(mut self, epsilon: f32) -> Self {
|
||||
self.epsilon = Some(epsilon);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for clustering operations.
///
/// Pairs a [`ClusteringMethod`] with its [`ClusteringParameters`] and
/// post-processing options (prototypes, quality metrics, seeding).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusteringConfig {
    /// The clustering method to use.
    pub method: ClusteringMethod,

    /// Parameters for the clustering algorithm.
    pub parameters: ClusteringParameters,

    /// Whether to compute cluster prototypes.
    pub compute_prototypes: bool,

    /// Number of prototypes to compute per cluster.
    pub prototypes_per_cluster: usize,

    /// Whether to compute silhouette scores.
    pub compute_silhouette: bool,

    /// Random seed for reproducibility; `None` means nondeterministic runs.
    pub random_seed: Option<u64>,
}
|
||||
|
||||
impl Default for ClusteringConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
method: ClusteringMethod::HDBSCAN,
|
||||
parameters: ClusteringParameters::default(),
|
||||
compute_prototypes: true,
|
||||
prototypes_per_cluster: 3,
|
||||
compute_silhouette: true,
|
||||
random_seed: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusteringConfig {
|
||||
/// Create a HDBSCAN configuration.
|
||||
#[must_use]
|
||||
pub fn hdbscan(min_cluster_size: usize, min_samples: usize) -> Self {
|
||||
Self {
|
||||
method: ClusteringMethod::HDBSCAN,
|
||||
parameters: ClusteringParameters::hdbscan(min_cluster_size, min_samples),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a K-means configuration.
|
||||
#[must_use]
|
||||
pub fn kmeans(k: usize) -> Self {
|
||||
Self {
|
||||
method: ClusteringMethod::KMeans { k },
|
||||
parameters: ClusteringParameters::kmeans(),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set a random seed for reproducibility.
|
||||
#[must_use]
|
||||
pub fn with_seed(mut self, seed: u64) -> Self {
|
||||
self.random_seed = Some(seed);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for motif detection.
///
/// A motif is a recurring sub-sequence of cluster labels; these knobs
/// bound its length, support, and how strictly occurrences must align.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MotifConfig {
    /// Minimum length of motifs to detect.
    pub min_length: usize,

    /// Maximum length of motifs to detect.
    pub max_length: usize,

    /// Minimum number of occurrences for a motif.
    pub min_occurrences: usize,

    /// Minimum confidence threshold for motifs.
    pub min_confidence: f32,

    /// Whether to allow overlapping occurrences.
    pub allow_overlap: bool,

    /// Maximum gap (in clusters) between motif elements; 0 means contiguous.
    pub max_gap: usize,
}
|
||||
|
||||
impl Default for MotifConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_length: 2,
|
||||
max_length: 10,
|
||||
min_occurrences: 3,
|
||||
min_confidence: 0.5,
|
||||
allow_overlap: false,
|
||||
max_gap: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MotifConfig {
|
||||
/// Create a strict motif configuration (no gaps, no overlap).
|
||||
#[must_use]
|
||||
pub fn strict() -> Self {
|
||||
Self {
|
||||
min_length: 3,
|
||||
max_length: 8,
|
||||
min_occurrences: 5,
|
||||
min_confidence: 0.7,
|
||||
allow_overlap: false,
|
||||
max_gap: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a relaxed motif configuration (allows gaps).
|
||||
#[must_use]
|
||||
pub fn relaxed() -> Self {
|
||||
Self {
|
||||
min_length: 2,
|
||||
max_length: 15,
|
||||
min_occurrences: 2,
|
||||
min_confidence: 0.3,
|
||||
allow_overlap: true,
|
||||
max_gap: 2,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the length range.
|
||||
#[must_use]
|
||||
pub fn with_length_range(mut self, min: usize, max: usize) -> Self {
|
||||
self.min_length = min;
|
||||
self.max_length = max;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics computed from sequence analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceMetrics {
    /// Shannon entropy of the sequence.
    pub entropy: f32,

    /// Normalized entropy (entropy / max_entropy), in [0, 1].
    pub normalized_entropy: f32,

    /// Stereotypy score (1 - normalized_entropy); 1.0 means fully repetitive.
    pub stereotypy: f32,

    /// Number of unique clusters in the sequence.
    pub unique_clusters: usize,

    /// Number of unique transitions in the sequence.
    pub unique_transitions: usize,

    /// Total number of transitions.
    pub total_transitions: usize,

    /// Most common transition as (from, to, probability), if any.
    pub dominant_transition: Option<(ClusterId, ClusterId, f32)>,

    /// Repetition rate (self-transitions / total transitions).
    pub repetition_rate: f32,
}
|
||||
|
||||
impl Default for SequenceMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
entropy: 0.0,
|
||||
normalized_entropy: 0.0,
|
||||
stereotypy: 1.0,
|
||||
unique_clusters: 0,
|
||||
unique_transitions: 0,
|
||||
total_transitions: 0,
|
||||
dominant_transition: None,
|
||||
repetition_rate: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Transition matrix for Markov chain analysis.
///
/// Represents the probabilities of transitioning from one cluster to another.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransitionMatrix {
    /// Ordered list of cluster IDs (defines row/column indices).
    pub cluster_ids: Vec<ClusterId>,

    /// Transition probabilities (row = source, column = target).
    /// Values are probabilities (0.0 to 1.0, rows sum to 1.0).
    pub probabilities: Vec<Vec<f32>>,

    /// Raw observation counts (row = source, column = target).
    pub observations: Vec<Vec<u32>>,

    /// Mapping from ClusterId to matrix index.
    /// NOTE: skipped during (de)serialization, so it is EMPTY after
    /// deserializing — call `rebuild_index_map` before using lookups.
    #[serde(skip)]
    index_map: HashMap<ClusterId, usize>,
}
|
||||
|
||||
impl TransitionMatrix {
|
||||
/// Create a new transition matrix for the given clusters.
|
||||
#[must_use]
|
||||
pub fn new(cluster_ids: Vec<ClusterId>) -> Self {
|
||||
let n = cluster_ids.len();
|
||||
let index_map: HashMap<ClusterId, usize> = cluster_ids
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, id)| (*id, i))
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
cluster_ids,
|
||||
probabilities: vec![vec![0.0; n]; n],
|
||||
observations: vec![vec![0; n]; n],
|
||||
index_map,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the number of clusters (states) in the matrix.
|
||||
#[must_use]
|
||||
pub fn size(&self) -> usize {
|
||||
self.cluster_ids.len()
|
||||
}
|
||||
|
||||
/// Get the index for a cluster ID.
|
||||
#[must_use]
|
||||
pub fn index_of(&self, cluster_id: &ClusterId) -> Option<usize> {
|
||||
self.index_map.get(cluster_id).copied()
|
||||
}
|
||||
|
||||
/// Record an observed transition.
|
||||
pub fn record_transition(&mut self, from: &ClusterId, to: &ClusterId) {
|
||||
if let (Some(i), Some(j)) = (self.index_of(from), self.index_of(to)) {
|
||||
self.observations[i][j] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute probabilities from observation counts.
|
||||
pub fn compute_probabilities(&mut self) {
|
||||
for i in 0..self.size() {
|
||||
let row_sum: u32 = self.observations[i].iter().sum();
|
||||
if row_sum > 0 {
|
||||
for j in 0..self.size() {
|
||||
self.probabilities[i][j] = self.observations[i][j] as f32 / row_sum as f32;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the transition probability from one cluster to another.
|
||||
#[must_use]
|
||||
pub fn probability(&self, from: &ClusterId, to: &ClusterId) -> Option<f32> {
|
||||
match (self.index_of(from), self.index_of(to)) {
|
||||
(Some(i), Some(j)) => Some(self.probabilities[i][j]),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the observation count for a transition.
|
||||
#[must_use]
|
||||
pub fn observation_count(&self, from: &ClusterId, to: &ClusterId) -> Option<u32> {
|
||||
match (self.index_of(from), self.index_of(to)) {
|
||||
(Some(i), Some(j)) => Some(self.observations[i][j]),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get all non-zero transitions as (from, to, probability) tuples.
|
||||
#[must_use]
|
||||
pub fn non_zero_transitions(&self) -> Vec<(ClusterId, ClusterId, f32)> {
|
||||
let mut transitions = Vec::new();
|
||||
for (i, from) in self.cluster_ids.iter().enumerate() {
|
||||
for (j, to) in self.cluster_ids.iter().enumerate() {
|
||||
let prob = self.probabilities[i][j];
|
||||
if prob > 0.0 {
|
||||
transitions.push((*from, *to, prob));
|
||||
}
|
||||
}
|
||||
}
|
||||
transitions
|
||||
}
|
||||
|
||||
/// Get the stationary distribution (eigenvector of eigenvalue 1).
|
||||
/// Returns None if the matrix is not ergodic.
|
||||
#[must_use]
|
||||
pub fn stationary_distribution(&self) -> Option<Vec<f32>> {
|
||||
// Power iteration method for finding stationary distribution
|
||||
let n = self.size();
|
||||
if n == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut dist = vec![1.0 / n as f32; n];
|
||||
let max_iterations = 1000;
|
||||
let tolerance = 1e-8;
|
||||
|
||||
for _ in 0..max_iterations {
|
||||
let mut new_dist = vec![0.0; n];
|
||||
|
||||
// Matrix-vector multiplication: new_dist = dist * P^T
|
||||
for j in 0..n {
|
||||
for i in 0..n {
|
||||
new_dist[j] += dist[i] * self.probabilities[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// Check convergence
|
||||
let diff: f32 = dist
|
||||
.iter()
|
||||
.zip(new_dist.iter())
|
||||
.map(|(a, b)| (a - b).abs())
|
||||
.sum();
|
||||
|
||||
dist = new_dist;
|
||||
|
||||
if diff < tolerance {
|
||||
return Some(dist);
|
||||
}
|
||||
}
|
||||
|
||||
Some(dist)
|
||||
}
|
||||
|
||||
/// Rebuild the index map (needed after deserialization).
|
||||
pub fn rebuild_index_map(&mut self) {
|
||||
self.index_map = self
|
||||
.cluster_ids
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, id)| (*id, i))
|
||||
.collect();
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of a clustering operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusteringResult {
    /// The clusters discovered.
    pub clusters: Vec<super::entities::Cluster>,

    /// Embeddings classified as noise (HDBSCAN).
    pub noise: Vec<super::entities::EmbeddingId>,

    /// Silhouette score (if computed).
    pub silhouette_score: Option<f32>,

    /// V-measure score (if ground truth available).
    pub v_measure: Option<f32>,

    /// Prototypes for each cluster.
    pub prototypes: Vec<super::entities::Prototype>,

    /// Parameters used for clustering.
    pub parameters: ClusteringParameters,

    /// Method used for clustering.
    pub method: ClusteringMethod,
}
|
||||
|
||||
impl ClusteringResult {
|
||||
/// Get the number of clusters (excluding noise).
|
||||
#[must_use]
|
||||
pub fn cluster_count(&self) -> usize {
|
||||
self.clusters.len()
|
||||
}
|
||||
|
||||
/// Get the noise rate (proportion of points in noise).
|
||||
#[must_use]
|
||||
pub fn noise_rate(&self) -> f32 {
|
||||
let total = self
|
||||
.clusters
|
||||
.iter()
|
||||
.map(|c| c.member_count())
|
||||
.sum::<usize>()
|
||||
+ self.noise.len();
|
||||
if total == 0 {
|
||||
0.0
|
||||
} else {
|
||||
self.noise.len() as f32 / total as f32
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructor presets should propagate method and parameters.
    #[test]
    fn test_clustering_config_creation() {
        let config = ClusteringConfig::hdbscan(10, 5);
        assert!(matches!(config.method, ClusteringMethod::HDBSCAN));
        assert_eq!(config.parameters.min_cluster_size, 10);
        assert_eq!(config.parameters.min_samples, 5);
    }

    /// Recorded counts should normalize to per-row probabilities.
    #[test]
    fn test_transition_matrix() {
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let c3 = ClusterId::new();

        let mut matrix = TransitionMatrix::new(vec![c1, c2, c3]);

        // Record some transitions
        matrix.record_transition(&c1, &c2);
        matrix.record_transition(&c1, &c2);
        matrix.record_transition(&c1, &c3);
        matrix.record_transition(&c2, &c1);

        matrix.compute_probabilities();

        // c1 -> c2 should be 2/3
        assert!((matrix.probability(&c1, &c2).unwrap() - 2.0 / 3.0).abs() < 0.001);
        // c1 -> c3 should be 1/3
        assert!((matrix.probability(&c1, &c3).unwrap() - 1.0 / 3.0).abs() < 0.001);
        // c2 -> c1 should be 1.0
        assert!((matrix.probability(&c2, &c1).unwrap() - 1.0).abs() < 0.001);
    }

    /// Presets should reflect their documented strictness.
    #[test]
    fn test_motif_config() {
        let config = MotifConfig::strict();
        assert_eq!(config.min_length, 3);
        assert_eq!(config.min_occurrences, 5);
        assert!(!config.allow_overlap);

        let relaxed = MotifConfig::relaxed();
        assert!(relaxed.allow_overlap);
        assert_eq!(relaxed.max_gap, 2);
    }

    /// Display output should match the variant name.
    #[test]
    fn test_distance_metric_display() {
        assert_eq!(format!("{}", DistanceMetric::Cosine), "Cosine");
        assert_eq!(format!("{}", DistanceMetric::Euclidean), "Euclidean");
    }
}
|
||||
@@ -0,0 +1,404 @@
|
||||
//! HDBSCAN clustering implementation.
|
||||
//!
|
||||
//! Hierarchical Density-Based Spatial Clustering of Applications with Noise.
|
||||
//! This implementation uses core distance and mutual reachability distance
|
||||
//! to build a minimum spanning tree and extract clusters.
|
||||
|
||||
use ndarray::{Array2, ArrayView1};
|
||||
use petgraph::graph::{NodeIndex, UnGraph};
|
||||
use petgraph::algo::min_spanning_tree;
|
||||
use petgraph::data::FromElements;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use crate::application::services::AnalysisError;
|
||||
use crate::domain::value_objects::DistanceMetric;
|
||||
|
||||
/// HDBSCAN clustering algorithm.
///
/// Configured once, then applied to data via `fit`. Holds only the
/// algorithm's hyper-parameters — no state is retained between fits.
pub struct HdbscanClusterer {
    /// Minimum cluster size; smaller connected components become noise.
    min_cluster_size: usize,
    /// Minimum samples for core point determination (k for core distance).
    min_samples: usize,
    /// Distance metric to use for pairwise distances.
    metric: DistanceMetric,
}
|
||||
|
||||
impl HdbscanClusterer {
|
||||
/// Create a new HDBSCAN clusterer.
|
||||
#[must_use]
|
||||
pub fn new(min_cluster_size: usize, min_samples: usize, metric: DistanceMetric) -> Self {
|
||||
Self {
|
||||
min_cluster_size,
|
||||
min_samples,
|
||||
metric,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fit HDBSCAN to the data and return cluster labels.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `data` - 2D array where rows are samples and columns are features
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Vector of cluster labels (-1 for noise).
|
||||
#[instrument(skip(self, data), fields(n_samples = data.nrows(), n_features = data.ncols()))]
|
||||
pub fn fit(&self, data: &Array2<f32>) -> Result<Vec<i32>, AnalysisError> {
|
||||
let n = data.nrows();
|
||||
if n < self.min_cluster_size {
|
||||
return Err(AnalysisError::InsufficientData(format!(
|
||||
"Need at least {} samples, got {}",
|
||||
self.min_cluster_size, n
|
||||
)));
|
||||
}
|
||||
|
||||
debug!(
|
||||
n_samples = n,
|
||||
min_cluster_size = self.min_cluster_size,
|
||||
min_samples = self.min_samples,
|
||||
"Starting HDBSCAN fit"
|
||||
);
|
||||
|
||||
// Step 1: Compute pairwise distances
|
||||
let distances = self.compute_pairwise_distances(data);
|
||||
|
||||
// Step 2: Compute core distances
|
||||
let core_distances = self.compute_core_distances(&distances);
|
||||
|
||||
// Step 3: Compute mutual reachability distances
|
||||
let mrd = self.compute_mutual_reachability(&distances, &core_distances);
|
||||
|
||||
// Step 4: Build minimum spanning tree
|
||||
let mst = self.build_mst(&mrd);
|
||||
|
||||
// Step 5: Build cluster hierarchy
|
||||
let labels = self.extract_clusters(&mst, n);
|
||||
|
||||
debug!(
|
||||
n_clusters = labels.iter().filter(|&&l| l >= 0).collect::<HashSet<_>>().len(),
|
||||
n_noise = labels.iter().filter(|&&l| l < 0).count(),
|
||||
"HDBSCAN fit completed"
|
||||
);
|
||||
|
||||
Ok(labels)
|
||||
}
|
||||
|
||||
/// Compute pairwise distance matrix.
|
||||
fn compute_pairwise_distances(&self, data: &Array2<f32>) -> Array2<f32> {
|
||||
let n = data.nrows();
|
||||
let mut distances = Array2::<f32>::zeros((n, n));
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
let dist = self.distance(data.row(i), data.row(j));
|
||||
distances[[i, j]] = dist;
|
||||
distances[[j, i]] = dist;
|
||||
}
|
||||
}
|
||||
|
||||
distances
|
||||
}
|
||||
|
||||
/// Compute core distance for each point (k-th nearest neighbor distance).
|
||||
fn compute_core_distances(&self, distances: &Array2<f32>) -> Vec<f32> {
|
||||
let n = distances.nrows();
|
||||
let k = self.min_samples.min(n - 1);
|
||||
|
||||
let mut core_distances = Vec::with_capacity(n);
|
||||
|
||||
for i in 0..n {
|
||||
let mut row_distances: Vec<f32> = distances.row(i).to_vec();
|
||||
row_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
// k-th nearest neighbor (index k because index 0 is self with distance 0)
|
||||
let core_dist = row_distances.get(k).copied().unwrap_or(f32::MAX);
|
||||
core_distances.push(core_dist);
|
||||
}
|
||||
|
||||
core_distances
|
||||
}
|
||||
|
||||
/// Compute mutual reachability distance matrix.
|
||||
fn compute_mutual_reachability(
|
||||
&self,
|
||||
distances: &Array2<f32>,
|
||||
core_distances: &[f32],
|
||||
) -> Array2<f32> {
|
||||
let n = distances.nrows();
|
||||
let mut mrd = Array2::<f32>::zeros((n, n));
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
let d = distances[[i, j]];
|
||||
let mr = core_distances[i].max(core_distances[j]).max(d);
|
||||
mrd[[i, j]] = mr;
|
||||
mrd[[j, i]] = mr;
|
||||
}
|
||||
}
|
||||
|
||||
mrd
|
||||
}
|
||||
|
||||
/// Build minimum spanning tree from mutual reachability distances.
|
||||
fn build_mst(&self, mrd: &Array2<f32>) -> Vec<(usize, usize, f32)> {
|
||||
let n = mrd.nrows();
|
||||
|
||||
// Build graph with all edges
|
||||
let mut graph = UnGraph::<usize, f32>::new_undirected();
|
||||
|
||||
// Add nodes
|
||||
let nodes: Vec<NodeIndex> = (0..n).map(|i| graph.add_node(i)).collect();
|
||||
|
||||
// Add edges (only upper triangle to avoid duplicates)
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
let weight = mrd[[i, j]];
|
||||
if weight < f32::MAX {
|
||||
graph.add_edge(nodes[i], nodes[j], weight);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute MST using Prim's algorithm via petgraph
|
||||
let mst_graph = UnGraph::<usize, f32>::from_elements(min_spanning_tree(&graph));
|
||||
|
||||
// Extract edges from MST
|
||||
let mut edges: Vec<(usize, usize, f32)> = mst_graph
|
||||
.edge_indices()
|
||||
.filter_map(|e| {
|
||||
let (a, b) = mst_graph.edge_endpoints(e)?;
|
||||
let weight = *mst_graph.edge_weight(e)?;
|
||||
let a_val = *mst_graph.node_weight(a)?;
|
||||
let b_val = *mst_graph.node_weight(b)?;
|
||||
Some((a_val, b_val, weight))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by weight descending for cluster extraction
|
||||
edges.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
edges
|
||||
}
|
||||
|
||||
/// Extract flat clusters from MST using HDBSCAN* algorithm.
|
||||
fn extract_clusters(&self, mst: &[(usize, usize, f32)], n: usize) -> Vec<i32> {
|
||||
// Use simplified cluster extraction based on edge cutting
|
||||
// This is a simplified version - full HDBSCAN uses condensed tree
|
||||
|
||||
let mut labels = vec![-1i32; n];
|
||||
let mut current_cluster = 0i32;
|
||||
|
||||
// Build adjacency from MST
|
||||
let mut adj: HashMap<usize, Vec<(usize, f32)>> = HashMap::new();
|
||||
for &(a, b, w) in mst {
|
||||
adj.entry(a).or_default().push((b, w));
|
||||
adj.entry(b).or_default().push((a, w));
|
||||
}
|
||||
|
||||
// Find connected components, removing edges above threshold
|
||||
// Use adaptive threshold based on edge weight distribution
|
||||
let threshold = self.compute_threshold(mst);
|
||||
|
||||
let mut visited = vec![false; n];
|
||||
|
||||
for start in 0..n {
|
||||
if visited[start] {
|
||||
continue;
|
||||
}
|
||||
|
||||
// BFS to find connected component
|
||||
let mut component = Vec::new();
|
||||
let mut queue = vec![start];
|
||||
|
||||
while let Some(node) = queue.pop() {
|
||||
if visited[node] {
|
||||
continue;
|
||||
}
|
||||
visited[node] = true;
|
||||
component.push(node);
|
||||
|
||||
if let Some(neighbors) = adj.get(&node) {
|
||||
for &(neighbor, weight) in neighbors {
|
||||
if !visited[neighbor] && weight < threshold {
|
||||
queue.push(neighbor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Only assign cluster label if component is large enough
|
||||
if component.len() >= self.min_cluster_size {
|
||||
for &node in &component {
|
||||
labels[node] = current_cluster;
|
||||
}
|
||||
current_cluster += 1;
|
||||
}
|
||||
}
|
||||
|
||||
labels
|
||||
}
|
||||
|
||||
/// Compute adaptive threshold for edge cutting.
|
||||
fn compute_threshold(&self, mst: &[(usize, usize, f32)]) -> f32 {
|
||||
if mst.is_empty() {
|
||||
return f32::MAX;
|
||||
}
|
||||
|
||||
let weights: Vec<f32> = mst.iter().map(|&(_, _, w)| w).collect();
|
||||
let n = weights.len();
|
||||
|
||||
// Use median + IQR method for threshold
|
||||
let mut sorted = weights.clone();
|
||||
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
let _median = sorted[n / 2];
|
||||
let q1 = sorted[n / 4];
|
||||
let q3 = sorted[3 * n / 4];
|
||||
let iqr = q3 - q1;
|
||||
|
||||
// Threshold at Q3 + 1.5 * IQR (outlier boundary)
|
||||
q3 + 1.5 * iqr
|
||||
}
|
||||
|
||||
/// Compute distance between two vectors.
|
||||
fn distance(&self, a: ArrayView1<f32>, b: ArrayView1<f32>) -> f32 {
|
||||
match self.metric {
|
||||
DistanceMetric::Euclidean => {
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
DistanceMetric::Cosine => {
|
||||
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
|
||||
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm_a == 0.0 || norm_b == 0.0 {
|
||||
1.0
|
||||
} else {
|
||||
1.0 - (dot / (norm_a * norm_b))
|
||||
}
|
||||
}
|
||||
DistanceMetric::Manhattan => a.iter().zip(b.iter()).map(|(x, y)| (x - y).abs()).sum(),
|
||||
DistanceMetric::Poincare => {
|
||||
// Simplified - would need proper hyperbolic distance
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Single linkage tree node for cluster hierarchy.
///
/// NOTE(review): not referenced anywhere in the visible portion of this
/// module — presumably reserved for a full condensed-tree implementation;
/// confirm before removing.
#[derive(Debug, Clone)]
struct SingleLinkageNode {
    // Index of the left child node, if any.
    left: Option<usize>,
    // Index of the right child node, if any.
    right: Option<usize>,
    // Merge distance at which this node was formed.
    distance: f32,
    // Number of points under this node.
    size: usize,
}
|
||||
|
||||
/// HDBSCAN condensed tree for cluster extraction.
///
/// NOTE(review): not used by the simplified `extract_clusters` in the
/// visible portion of this module — presumably scaffolding for the full
/// HDBSCAN* stability-based extraction.
#[derive(Debug)]
pub struct CondensedTree {
    // Flat arena of condensed-tree nodes.
    nodes: Vec<CondensedNode>,
}
|
||||
|
||||
/// A node of the condensed cluster tree.
#[derive(Debug, Clone)]
struct CondensedNode {
    // Arena index of the parent node, if any.
    parent: Option<usize>,
    // Arena indices of child nodes.
    children: Vec<usize>,
    // Density level (lambda = 1/distance) at which this cluster appears.
    lambda_birth: f32,
    // Density level at which this cluster dissolves.
    lambda_death: f32,
    // Stability score used for cluster selection.
    stability: f32,
    // Sample indices belonging to this node.
    points: HashSet<usize>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// Build three well-separated 2D clusters (10 points each) with
    /// deterministic jitter so the test is reproducible.
    fn create_clustered_data() -> Array2<f32> {
        let mut data = Array2::<f32>::zeros((30, 2));

        // Cluster 1: around (0, 0)
        for i in 0..10 {
            data[[i, 0]] = rand_offset(0.0, i);
            data[[i, 1]] = rand_offset(0.0, i + 1);
        }

        // Cluster 2: around (5, 5)
        for i in 10..20 {
            data[[i, 0]] = rand_offset(5.0, i);
            data[[i, 1]] = rand_offset(5.0, i + 1);
        }

        // Cluster 3: around (10, 0)
        for i in 20..30 {
            data[[i, 0]] = rand_offset(10.0, i);
            data[[i, 1]] = rand_offset(0.0, i + 1);
        }

        data
    }

    /// Deterministic "random" offset in roughly [-0.25, 0.25) derived from
    /// the seed, so repeated runs produce identical data.
    fn rand_offset(center: f32, seed: usize) -> f32 {
        let variation = ((seed as f32 * 1.618) % 1.0 - 0.5) * 0.5;
        center + variation
    }

    /// Fit on well-separated data should label every point and discover
    /// at least one cluster.
    #[test]
    fn test_hdbscan_basic() {
        let clusterer = HdbscanClusterer::new(3, 2, DistanceMetric::Euclidean);
        let data = create_clustered_data();

        let labels = clusterer.fit(&data).unwrap();
        assert_eq!(labels.len(), 30);

        // Should have at least one cluster
        let n_clusters = labels.iter().filter(|&&l| l >= 0).collect::<HashSet<_>>().len();
        assert!(n_clusters >= 1);
    }

    /// Fewer samples than min_cluster_size must be rejected.
    #[test]
    fn test_hdbscan_insufficient_data() {
        let clusterer = HdbscanClusterer::new(10, 5, DistanceMetric::Euclidean);
        let data = Array2::<f32>::zeros((5, 2));

        let result = clusterer.fit(&data);
        assert!(result.is_err());
    }

    /// 3-4-5 triangle: Euclidean distance should be exactly 5.
    #[test]
    fn test_distance_euclidean() {
        let clusterer = HdbscanClusterer::new(5, 3, DistanceMetric::Euclidean);
        let a = Array1::from_vec(vec![0.0, 0.0]);
        let b = Array1::from_vec(vec![3.0, 4.0]);

        let dist = clusterer.distance(a.view(), b.view());
        assert!((dist - 5.0).abs() < 0.001);
    }

    /// Cosine distance: identical vectors -> 0, orthogonal vectors -> 1.
    #[test]
    fn test_distance_cosine() {
        let clusterer = HdbscanClusterer::new(5, 3, DistanceMetric::Cosine);
        let a = Array1::from_vec(vec![1.0, 0.0]);
        let b = Array1::from_vec(vec![1.0, 0.0]);

        let dist = clusterer.distance(a.view(), b.view());
        assert!(dist.abs() < 0.001); // Same vector = 0 distance

        let c = Array1::from_vec(vec![0.0, 1.0]);
        let dist2 = clusterer.distance(a.view(), c.view());
        assert!((dist2 - 1.0).abs() < 0.001); // Orthogonal = 1 distance
    }
}
|
||||
@@ -0,0 +1,384 @@
|
||||
//! K-Means clustering implementation.
|
||||
//!
|
||||
//! Standard K-Means algorithm with k-means++ initialization for
|
||||
//! partitioning embeddings into k clusters.
|
||||
|
||||
use ndarray::{Array2, ArrayView1};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use crate::application::services::AnalysisError;
|
||||
|
||||
/// K-Means clustering algorithm.
///
/// Configured once, then applied to data via `fit`; holds only
/// hyper-parameters, no fitted state.
pub struct KMeansClusterer {
    /// Number of clusters.
    k: usize,
    /// Maximum Lloyd iterations before giving up on convergence.
    max_iterations: usize,
    /// Relative inertia-change threshold for declaring convergence.
    tolerance: f32,
    /// Random seed for reproducible initialization; `None` uses a fixed
    /// internal fallback (see `kmeans_plus_plus_init`).
    seed: Option<u64>,
}
|
||||
|
||||
impl KMeansClusterer {
|
||||
/// Create a new K-Means clusterer.
|
||||
#[must_use]
|
||||
pub fn new(k: usize, seed: Option<u64>) -> Self {
|
||||
Self {
|
||||
k,
|
||||
max_iterations: 300,
|
||||
tolerance: 1e-4,
|
||||
seed,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set maximum iterations.
|
||||
#[must_use]
|
||||
pub fn with_max_iterations(mut self, max_iterations: usize) -> Self {
|
||||
self.max_iterations = max_iterations;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set convergence tolerance.
|
||||
#[must_use]
|
||||
pub fn with_tolerance(mut self, tolerance: f32) -> Self {
|
||||
self.tolerance = tolerance;
|
||||
self
|
||||
}
|
||||
|
||||
/// Fit K-Means to the data and return cluster labels and centroids.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `data` - 2D array where rows are samples and columns are features
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Tuple of (cluster labels, centroid matrix)
|
||||
#[instrument(skip(self, data), fields(n_samples = data.nrows(), n_features = data.ncols(), k = self.k))]
pub fn fit(&self, data: &Array2<f32>) -> Result<(Vec<usize>, Array2<f32>), AnalysisError> {
    // Lloyd's algorithm with k-means++ seeding.
    // Returns (labels, centroids): one cluster index per input row, and a
    // (k, d) matrix of centroid coordinates consistent with those labels.
    let n = data.nrows();
    let d = data.ncols();

    // Cannot place k distinct centroids on fewer than k samples.
    if n < self.k {
        return Err(AnalysisError::InsufficientData(format!(
            "Need at least {} samples for k={}, got {}",
            self.k, self.k, n
        )));
    }

    debug!(
        n_samples = n,
        n_features = d,
        k = self.k,
        "Starting K-Means fit"
    );

    // Initialize centroids using k-means++ algorithm
    let mut centroids = self.kmeans_plus_plus_init(data);

    let mut labels = vec![0usize; n];
    let mut prev_inertia = f32::MAX;

    for iteration in 0..self.max_iterations {
        // Assignment step: assign each point to nearest centroid
        for i in 0..n {
            let point = data.row(i);
            let mut min_dist = f32::MAX;
            let mut best_cluster = 0;

            for (j, centroid) in centroids.outer_iter().enumerate() {
                let dist = self.euclidean_distance(point, centroid);
                // Strict '<' keeps the lowest-index centroid on ties.
                if dist < min_dist {
                    min_dist = dist;
                    best_cluster = j;
                }
            }

            labels[i] = best_cluster;
        }

        // Update step: compute new centroids
        let mut new_centroids = Array2::<f32>::zeros((self.k, d));
        let mut counts = vec![0usize; self.k];

        // Accumulate per-cluster sums, then divide by member counts.
        for (i, &label) in labels.iter().enumerate() {
            for j in 0..d {
                new_centroids[[label, j]] += data[[i, j]];
            }
            counts[label] += 1;
        }

        for j in 0..self.k {
            if counts[j] > 0 {
                for l in 0..d {
                    new_centroids[[j, l]] /= counts[j] as f32;
                }
            } else {
                // Handle empty cluster by keeping old centroid
                for l in 0..d {
                    new_centroids[[j, l]] = centroids[[j, l]];
                }
            }
        }

        // Compute inertia (sum of squared distances to centroids).
        // NOTE: this deliberately uses the *pre-update* `centroids`, the
        // same ones the labels above were assigned against, so inertia
        // is always consistent with `labels`.
        let inertia: f32 = labels
            .iter()
            .enumerate()
            .map(|(i, &label)| {
                self.euclidean_distance(data.row(i), centroids.row(label)).powi(2)
            })
            .sum();

        // Check convergence: relative inertia change, with the
        // denominator floored at 1.0 to avoid amplifying tiny inertias.
        let inertia_change = (prev_inertia - inertia).abs() / prev_inertia.max(1.0);

        debug!(
            iteration = iteration,
            inertia = inertia,
            change = inertia_change,
            "K-Means iteration"
        );

        if inertia_change < self.tolerance {
            debug!(
                iterations = iteration + 1,
                final_inertia = inertia,
                "K-Means converged"
            );
            // Breaking here discards `new_centroids`: the returned
            // centroids are the ones the final labels were computed
            // against, which keeps (labels, centroids) mutually
            // consistent.
            break;
        }

        centroids = new_centroids;
        prev_inertia = inertia;
    }

    Ok((labels, centroids))
}
|
||||
|
||||
/// Initialize centroids using k-means++ algorithm.
|
||||
fn kmeans_plus_plus_init(&self, data: &Array2<f32>) -> Array2<f32> {
|
||||
let n = data.nrows();
|
||||
let d = data.ncols();
|
||||
let mut centroids = Array2::<f32>::zeros((self.k, d));
|
||||
|
||||
// Use seed for deterministic initialization if provided
|
||||
let seed = self.seed.unwrap_or(42);
|
||||
let mut rng_state = seed;
|
||||
|
||||
// Helper function for pseudo-random number generation
|
||||
let mut next_random = || {
|
||||
rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
|
||||
((rng_state >> 33) as f32) / (u32::MAX as f32)
|
||||
};
|
||||
|
||||
// Choose first centroid randomly
|
||||
let first_idx = (next_random() * n as f32) as usize % n;
|
||||
for j in 0..d {
|
||||
centroids[[0, j]] = data[[first_idx, j]];
|
||||
}
|
||||
|
||||
// Choose remaining centroids with probability proportional to D^2
|
||||
for i in 1..self.k {
|
||||
// Compute distances to nearest existing centroid
|
||||
let mut distances = Vec::with_capacity(n);
|
||||
let mut total_dist = 0.0f32;
|
||||
|
||||
for point_idx in 0..n {
|
||||
let point = data.row(point_idx);
|
||||
let mut min_dist = f32::MAX;
|
||||
|
||||
for j in 0..i {
|
||||
let dist = self.euclidean_distance(point, centroids.row(j));
|
||||
min_dist = min_dist.min(dist);
|
||||
}
|
||||
|
||||
let dist_sq = min_dist * min_dist;
|
||||
distances.push(dist_sq);
|
||||
total_dist += dist_sq;
|
||||
}
|
||||
|
||||
// Sample proportionally to D^2
|
||||
let target = next_random() * total_dist;
|
||||
let mut cumsum = 0.0f32;
|
||||
let mut chosen_idx = 0;
|
||||
|
||||
for (idx, &dist) in distances.iter().enumerate() {
|
||||
cumsum += dist;
|
||||
if cumsum >= target {
|
||||
chosen_idx = idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for j in 0..d {
|
||||
centroids[[i, j]] = data[[chosen_idx, j]];
|
||||
}
|
||||
}
|
||||
|
||||
centroids
|
||||
}
|
||||
|
||||
/// Compute Euclidean distance between two vectors.
|
||||
fn euclidean_distance(&self, a: ArrayView1<f32>, b: ArrayView1<f32>) -> f32 {
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Predict cluster labels for new data given fitted centroids.
|
||||
pub fn predict(&self, data: &Array2<f32>, centroids: &Array2<f32>) -> Vec<usize> {
|
||||
let n = data.nrows();
|
||||
let mut labels = vec![0usize; n];
|
||||
|
||||
for i in 0..n {
|
||||
let point = data.row(i);
|
||||
let mut min_dist = f32::MAX;
|
||||
let mut best_cluster = 0;
|
||||
|
||||
for (j, centroid) in centroids.outer_iter().enumerate() {
|
||||
let dist = self.euclidean_distance(point, centroid);
|
||||
if dist < min_dist {
|
||||
min_dist = dist;
|
||||
best_cluster = j;
|
||||
}
|
||||
}
|
||||
|
||||
labels[i] = best_cluster;
|
||||
}
|
||||
|
||||
labels
|
||||
}
|
||||
|
||||
/// Compute inertia (within-cluster sum of squares).
|
||||
pub fn compute_inertia(
|
||||
&self,
|
||||
data: &Array2<f32>,
|
||||
labels: &[usize],
|
||||
centroids: &Array2<f32>,
|
||||
) -> f32 {
|
||||
labels
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, &label)| {
|
||||
self.euclidean_distance(data.row(i), centroids.row(label)).powi(2)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// Fixture: 12 points forming three clearly separated 2-D blobs of
    /// four points each, around (0, 0), (5, 5) and (10, 0).
    fn create_test_data() -> Array2<f32> {
        // Create simple separable clusters
        let mut data = Array2::<f32>::zeros((12, 2));

        // Cluster 0: points near (0, 0)
        data[[0, 0]] = 0.0;
        data[[0, 1]] = 0.0;
        data[[1, 0]] = 0.1;
        data[[1, 1]] = 0.1;
        data[[2, 0]] = -0.1;
        data[[2, 1]] = 0.1;
        data[[3, 0]] = 0.0;
        data[[3, 1]] = -0.1;

        // Cluster 1: points near (5, 5)
        data[[4, 0]] = 5.0;
        data[[4, 1]] = 5.0;
        data[[5, 0]] = 5.1;
        data[[5, 1]] = 5.1;
        data[[6, 0]] = 4.9;
        data[[6, 1]] = 5.0;
        data[[7, 0]] = 5.0;
        data[[7, 1]] = 4.9;

        // Cluster 2: points near (10, 0)
        data[[8, 0]] = 10.0;
        data[[8, 1]] = 0.0;
        data[[9, 0]] = 10.1;
        data[[9, 1]] = 0.1;
        data[[10, 0]] = 9.9;
        data[[10, 1]] = 0.0;
        data[[11, 0]] = 10.0;
        data[[11, 1]] = -0.1;

        data
    }

    /// Happy path: three well-separated blobs come back as three
    /// internally-consistent label groups.
    #[test]
    fn test_kmeans_basic() {
        let clusterer = KMeansClusterer::new(3, Some(42));
        let data = create_test_data();

        let (labels, centroids) = clusterer.fit(&data).unwrap();

        assert_eq!(labels.len(), 12);
        assert_eq!(centroids.nrows(), 3);

        // Check that points in same original cluster have same label
        // (with high probability given clear separation)
        assert_eq!(labels[0], labels[1]);
        assert_eq!(labels[0], labels[2]);
        assert_eq!(labels[0], labels[3]);

        assert_eq!(labels[4], labels[5]);
        assert_eq!(labels[4], labels[6]);
        assert_eq!(labels[4], labels[7]);

        assert_eq!(labels[8], labels[9]);
        assert_eq!(labels[8], labels[10]);
        assert_eq!(labels[8], labels[11]);
    }

    /// fit() must refuse to run when there are fewer samples than k.
    #[test]
    fn test_kmeans_insufficient_data() {
        let clusterer = KMeansClusterer::new(10, None);
        let data = Array2::<f32>::zeros((5, 2));

        let result = clusterer.fit(&data);
        assert!(result.is_err());
    }

    /// predict() assigns held-out points to the centroid of the blob
    /// they were drawn near.
    #[test]
    fn test_kmeans_predict() {
        let clusterer = KMeansClusterer::new(2, Some(42));

        let train_data = Array2::from_shape_vec(
            (4, 2),
            vec![0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1],
        )
        .unwrap();

        let (_, centroids) = clusterer.fit(&train_data).unwrap();

        let test_data = Array2::from_shape_vec(
            (2, 2),
            vec![0.05, 0.05, 4.95, 4.95],
        )
        .unwrap();

        let predictions = clusterer.predict(&test_data, &centroids);
        assert_eq!(predictions.len(), 2);

        // First point should be in same cluster as (0,0) points
        // Second point should be in same cluster as (5,5) points
        assert_ne!(predictions[0], predictions[1]);
    }

    /// 3-4-5 triangle: distance from the origin to (3, 4) is exactly 5.
    #[test]
    fn test_euclidean_distance() {
        let clusterer = KMeansClusterer::new(2, None);
        let a = Array1::from_vec(vec![0.0, 0.0]);
        let b = Array1::from_vec(vec![3.0, 4.0]);

        let dist = clusterer.euclidean_distance(a.view(), b.view());
        assert!((dist - 5.0).abs() < 0.001);
    }
}
|
||||
@@ -0,0 +1,524 @@
|
||||
//! Markov chain analysis for vocalization sequences.
|
||||
//!
|
||||
//! Provides transition matrix computation, entropy calculation,
|
||||
//! and sequence analysis for understanding vocalization patterns.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use crate::domain::entities::ClusterId;
|
||||
use crate::domain::value_objects::{SequenceMetrics, TransitionMatrix};
|
||||
|
||||
/// Markov chain analyzer for vocalization sequences.
///
/// Builds first-order transition matrices from sequences of cluster IDs
/// and derives entropy, stereotypy, periodicity and likelihood
/// statistics from them.
pub struct MarkovAnalyzer {
    /// Smoothing factor for probability estimation (Laplace smoothing).
    /// Zero disables smoothing entirely.
    smoothing: f32,
}
|
||||
|
||||
impl MarkovAnalyzer {
|
||||
/// Create a new Markov analyzer.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self { smoothing: 0.0 }
|
||||
}
|
||||
|
||||
/// Create with Laplace smoothing.
|
||||
#[must_use]
|
||||
pub fn with_smoothing(smoothing: f32) -> Self {
|
||||
Self { smoothing }
|
||||
}
|
||||
|
||||
/// Build a transition matrix from a sequence of cluster IDs.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `sequence` - Ordered sequence of cluster IDs
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A TransitionMatrix representing transition probabilities.
|
||||
#[instrument(skip(self, sequence), fields(seq_len = sequence.len()))]
|
||||
pub fn build_transition_matrix(&self, sequence: &[ClusterId]) -> TransitionMatrix {
|
||||
// Collect all unique clusters
|
||||
let unique_clusters: Vec<ClusterId> = sequence
|
||||
.iter()
|
||||
.copied()
|
||||
.collect::<HashSet<_>>()
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let mut matrix = TransitionMatrix::new(unique_clusters);
|
||||
|
||||
// Count transitions
|
||||
for window in sequence.windows(2) {
|
||||
matrix.record_transition(&window[0], &window[1]);
|
||||
}
|
||||
|
||||
// Apply smoothing if configured
|
||||
if self.smoothing > 0.0 {
|
||||
self.apply_smoothing(&mut matrix);
|
||||
}
|
||||
|
||||
// Compute probabilities
|
||||
matrix.compute_probabilities();
|
||||
|
||||
debug!(
|
||||
n_states = matrix.size(),
|
||||
n_transitions = matrix.non_zero_transitions().len(),
|
||||
"Built transition matrix"
|
||||
);
|
||||
|
||||
matrix
|
||||
}
|
||||
|
||||
/// Build transition matrix from multiple sequences.
|
||||
#[instrument(skip(self, sequences))]
|
||||
pub fn build_from_sequences(&self, sequences: &[Vec<ClusterId>]) -> TransitionMatrix {
|
||||
// Collect all unique clusters from all sequences
|
||||
let unique_clusters: Vec<ClusterId> = sequences
|
||||
.iter()
|
||||
.flatten()
|
||||
.copied()
|
||||
.collect::<HashSet<_>>()
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let mut matrix = TransitionMatrix::new(unique_clusters);
|
||||
|
||||
// Count transitions from all sequences
|
||||
for sequence in sequences {
|
||||
for window in sequence.windows(2) {
|
||||
matrix.record_transition(&window[0], &window[1]);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply smoothing and compute probabilities
|
||||
if self.smoothing > 0.0 {
|
||||
self.apply_smoothing(&mut matrix);
|
||||
}
|
||||
matrix.compute_probabilities();
|
||||
|
||||
matrix
|
||||
}
|
||||
|
||||
/// Compute Shannon entropy of transition probabilities.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `transitions` - Slice of (source, target, probability) tuples
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Entropy value in nats (natural log base).
|
||||
#[must_use]
|
||||
pub fn compute_entropy(&self, transitions: &[(ClusterId, ClusterId, f32)]) -> f32 {
|
||||
let mut entropy = 0.0f32;
|
||||
|
||||
for &(_, _, prob) in transitions {
|
||||
if prob > 0.0 {
|
||||
entropy -= prob * prob.ln();
|
||||
}
|
||||
}
|
||||
|
||||
entropy
|
||||
}
|
||||
|
||||
/// Compute entropy rate of a Markov chain.
///
/// The entropy rate is the average entropy per step, weighted
/// by the stationary distribution.
///
/// Returns 0.0 when the matrix has no computable stationary
/// distribution.
#[must_use]
pub fn compute_entropy_rate(&self, matrix: &TransitionMatrix) -> f32 {
    // Without a stationary distribution there is nothing to weight the
    // per-row entropies with.
    let stationary = match matrix.stationary_distribution() {
        Some(dist) => dist,
        None => return 0.0,
    };

    let n = matrix.size();
    let mut entropy_rate = 0.0f32;

    for (i, &pi) in stationary.iter().enumerate() {
        // States with zero (or negative, defensively) stationary mass
        // contribute nothing.
        if pi <= 0.0 {
            continue;
        }

        // Compute entropy of row i: Shannon entropy (in nats) of the
        // outgoing transition distribution of state i.
        let mut row_entropy = 0.0f32;
        for j in 0..n {
            let prob = matrix.probabilities[i][j];
            if prob > 0.0 {
                row_entropy -= prob * prob.ln();
            }
        }

        // Weight each row's entropy by how often the chain visits it.
        entropy_rate += pi * row_entropy;
    }

    entropy_rate
}
|
||||
|
||||
/// Compute sequence metrics from a cluster sequence.
|
||||
#[instrument(skip(self, sequence))]
|
||||
pub fn compute_metrics(&self, sequence: &[ClusterId]) -> SequenceMetrics {
|
||||
if sequence.len() < 2 {
|
||||
return SequenceMetrics::default();
|
||||
}
|
||||
|
||||
let matrix = self.build_transition_matrix(sequence);
|
||||
let transitions = matrix.non_zero_transitions();
|
||||
|
||||
// Count unique elements
|
||||
let unique_clusters: HashSet<_> = sequence.iter().collect();
|
||||
let total_transitions = sequence.len() - 1;
|
||||
|
||||
// Count self-transitions
|
||||
let self_transitions = sequence
|
||||
.windows(2)
|
||||
.filter(|w| w[0] == w[1])
|
||||
.count();
|
||||
|
||||
// Compute entropy
|
||||
let entropy = self.compute_entropy(&transitions);
|
||||
|
||||
// Normalize entropy
|
||||
let max_entropy = (unique_clusters.len() as f32).ln().max(1.0);
|
||||
let normalized_entropy = if max_entropy > 0.0 {
|
||||
entropy / max_entropy
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Find dominant transition
|
||||
let dominant_transition = transitions
|
||||
.iter()
|
||||
.max_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.map(|&(from, to, prob)| (from, to, prob));
|
||||
|
||||
SequenceMetrics {
|
||||
entropy,
|
||||
normalized_entropy,
|
||||
stereotypy: 1.0 - normalized_entropy,
|
||||
unique_clusters: unique_clusters.len(),
|
||||
unique_transitions: transitions.len(),
|
||||
total_transitions,
|
||||
dominant_transition,
|
||||
repetition_rate: self_transitions as f32 / total_transitions as f32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute stereotypy score (measure of sequence repetitiveness).
|
||||
///
|
||||
/// Higher values indicate more stereotyped/predictable sequences.
|
||||
#[must_use]
|
||||
pub fn compute_stereotypy(&self, matrix: &TransitionMatrix) -> f32 {
|
||||
let entropy_rate = self.compute_entropy_rate(matrix);
|
||||
let max_entropy = (matrix.size() as f32).ln();
|
||||
|
||||
if max_entropy > 0.0 {
|
||||
1.0 - (entropy_rate / max_entropy)
|
||||
} else {
|
||||
1.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect periodic patterns in a sequence.
|
||||
///
|
||||
/// Returns a vector of (period_length, confidence) tuples for detected patterns.
|
||||
#[instrument(skip(self, sequence))]
|
||||
pub fn detect_periodicity(&self, sequence: &[ClusterId]) -> Vec<(usize, f32)> {
|
||||
let n = sequence.len();
|
||||
if n < 4 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut periods = Vec::new();
|
||||
let max_period = n / 2;
|
||||
|
||||
for period in 2..=max_period {
|
||||
let matches = self.count_periodic_matches(sequence, period);
|
||||
let max_matches = n / period;
|
||||
let confidence = matches as f32 / max_matches as f32;
|
||||
|
||||
if confidence > 0.5 {
|
||||
periods.push((period, confidence));
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by confidence descending
|
||||
periods.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
periods
|
||||
}
|
||||
|
||||
/// Count matches for a given period length.
|
||||
fn count_periodic_matches(&self, sequence: &[ClusterId], period: usize) -> usize {
|
||||
let n = sequence.len();
|
||||
let mut matches = 0;
|
||||
|
||||
for i in period..n {
|
||||
if sequence[i] == sequence[i - period] {
|
||||
matches += 1;
|
||||
}
|
||||
}
|
||||
|
||||
matches
|
||||
}
|
||||
|
||||
/// Apply Laplace smoothing to observation counts.
|
||||
fn apply_smoothing(&self, matrix: &mut TransitionMatrix) {
|
||||
let n = matrix.size();
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
matrix.observations[i][j] += self.smoothing as u32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute log-likelihood of a sequence given a transition matrix.
|
||||
#[must_use]
|
||||
pub fn log_likelihood(&self, sequence: &[ClusterId], matrix: &TransitionMatrix) -> f32 {
|
||||
if sequence.len() < 2 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut log_prob = 0.0f32;
|
||||
|
||||
for window in sequence.windows(2) {
|
||||
if let Some(prob) = matrix.probability(&window[0], &window[1]) {
|
||||
if prob > 0.0 {
|
||||
log_prob += prob.ln();
|
||||
} else {
|
||||
// Unseen transition - return negative infinity
|
||||
return f32::NEG_INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log_prob
|
||||
}
|
||||
|
||||
/// Find the most likely next cluster given current state.
|
||||
#[must_use]
|
||||
pub fn predict_next(
|
||||
&self,
|
||||
current: &ClusterId,
|
||||
matrix: &TransitionMatrix,
|
||||
) -> Option<(ClusterId, f32)> {
|
||||
let idx = matrix.index_of(current)?;
|
||||
|
||||
let mut best_cluster = None;
|
||||
let mut best_prob = 0.0f32;
|
||||
|
||||
for (j, &target_id) in matrix.cluster_ids.iter().enumerate() {
|
||||
let prob = matrix.probabilities[idx][j];
|
||||
if prob > best_prob {
|
||||
best_prob = prob;
|
||||
best_cluster = Some(target_id);
|
||||
}
|
||||
}
|
||||
|
||||
best_cluster.map(|c| (c, best_prob))
|
||||
}
|
||||
|
||||
/// Generate a sequence from the Markov chain.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `matrix` - The transition matrix
|
||||
/// * `start` - Starting cluster
|
||||
/// * `length` - Desired sequence length
|
||||
/// * `seed` - Random seed for reproducibility
|
||||
pub fn generate_sequence(
|
||||
&self,
|
||||
matrix: &TransitionMatrix,
|
||||
start: ClusterId,
|
||||
length: usize,
|
||||
seed: u64,
|
||||
) -> Vec<ClusterId> {
|
||||
let mut sequence = Vec::with_capacity(length);
|
||||
sequence.push(start);
|
||||
|
||||
let mut rng_state = seed;
|
||||
let mut next_random = || {
|
||||
rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
|
||||
((rng_state >> 33) as f32) / (u32::MAX as f32)
|
||||
};
|
||||
|
||||
let mut current = start;
|
||||
|
||||
for _ in 1..length {
|
||||
let idx = match matrix.index_of(¤t) {
|
||||
Some(i) => i,
|
||||
None => break,
|
||||
};
|
||||
|
||||
// Sample from transition probabilities
|
||||
let r = next_random();
|
||||
let mut cumsum = 0.0f32;
|
||||
let mut next_cluster = current;
|
||||
|
||||
for (j, &cluster_id) in matrix.cluster_ids.iter().enumerate() {
|
||||
cumsum += matrix.probabilities[idx][j];
|
||||
if r < cumsum {
|
||||
next_cluster = cluster_id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
sequence.push(next_cluster);
|
||||
current = next_cluster;
|
||||
}
|
||||
|
||||
sequence
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MarkovAnalyzer {
    /// Equivalent to [`MarkovAnalyzer::new`]: smoothing disabled.
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: the strictly periodic sequence c1 c2 c3 repeated three
    /// times, built from fixed UUIDs so cluster IDs are deterministic.
    fn create_test_sequence() -> Vec<ClusterId> {
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));
        let c3 = ClusterId::from_uuid(uuid::Uuid::from_u128(3));

        // Pattern: c1 -> c2 -> c3 -> c1 -> c2 -> c3 (periodic)
        vec![c1, c2, c3, c1, c2, c3, c1, c2, c3]
    }

    /// The fixture has exactly three states and observable transitions.
    #[test]
    fn test_build_transition_matrix() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();

        let matrix = analyzer.build_transition_matrix(&sequence);

        assert_eq!(matrix.size(), 3);
        assert!(!matrix.non_zero_transitions().is_empty());
    }

    /// Uniform transitions must carry more entropy than deterministic ones.
    #[test]
    fn test_entropy_computation() {
        let analyzer = MarkovAnalyzer::new();

        // Uniform distribution should have higher entropy
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();

        let uniform_transitions = vec![
            (c1, c1, 0.25),
            (c1, c2, 0.25),
            (c2, c1, 0.25),
            (c2, c2, 0.25),
        ];

        let entropy = analyzer.compute_entropy(&uniform_transitions);
        assert!(entropy > 0.0);

        // Deterministic distribution should have lower entropy
        let deterministic = vec![
            (c1, c2, 1.0),
            (c2, c1, 1.0),
        ];

        let det_entropy = analyzer.compute_entropy(&deterministic);
        assert!(det_entropy < entropy);
    }

    /// Sanity bounds on the derived metrics for the periodic fixture.
    #[test]
    fn test_compute_metrics() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();

        let metrics = analyzer.compute_metrics(&sequence);

        assert_eq!(metrics.unique_clusters, 3);
        // Deterministic sequence has zero entropy (each state has one successor)
        assert!(metrics.entropy >= 0.0);
        assert!(metrics.stereotypy >= 0.0 && metrics.stereotypy <= 1.0);
        assert!(metrics.total_transitions == sequence.len() - 1);
    }

    /// A strict c1-c2 alternation must register period 2.
    #[test]
    fn test_periodicity_detection() {
        let analyzer = MarkovAnalyzer::new();

        // Create highly periodic sequence
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));

        let periodic_sequence = vec![c1, c2, c1, c2, c1, c2, c1, c2, c1, c2];
        let periods = analyzer.detect_periodicity(&periodic_sequence);

        // Should detect period 2 (may not be first due to confidence calculation)
        assert!(!periods.is_empty());
        // Check that period 2 is in the detected periods
        let has_period_2 = periods.iter().any(|(p, _)| *p == 2);
        assert!(has_period_2, "Period 2 should be detected, found periods: {:?}", periods);
    }

    /// In the periodic fixture, c1 is always followed by c2.
    #[test]
    fn test_predict_next() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);

        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));

        // Given the pattern c1 -> c2 -> c3 -> ..., after c1 should come c2
        if let Some((next, prob)) = analyzer.predict_next(&c1, &matrix) {
            assert_eq!(next, c2);
            assert!(prob > 0.0);
        }
    }

    /// Generated sequences have the requested length and start state.
    #[test]
    fn test_sequence_generation() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);

        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let generated = analyzer.generate_sequence(&matrix, c1, 10, 42);

        assert_eq!(generated.len(), 10);
        assert_eq!(generated[0], c1);
    }

    /// Laplace smoothing must leave no zero-probability cell.
    #[test]
    fn test_smoothing() {
        let analyzer = MarkovAnalyzer::with_smoothing(1.0);

        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let sequence = vec![c1, c2, c1, c2];

        let matrix = analyzer.build_transition_matrix(&sequence);

        // With smoothing, all transitions should have non-zero probability
        for i in 0..matrix.size() {
            for j in 0..matrix.size() {
                assert!(matrix.probabilities[i][j] > 0.0);
            }
        }
    }

    /// The training sequence must be finitely likely under its own model.
    #[test]
    fn test_log_likelihood() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);

        // Log-likelihood of the training sequence should be reasonably high
        let ll = analyzer.log_likelihood(&sequence, &matrix);
        assert!(ll.is_finite());
        assert!(ll <= 0.0); // Log probabilities are non-positive
    }
}
|
||||
@@ -0,0 +1,681 @@
|
||||
//! In-memory repository implementation for testing and development.
|
||||
//!
|
||||
//! Provides thread-safe in-memory storage for all analysis entities.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use crate::domain::entities::{
|
||||
Anomaly, Cluster, ClusterId, EmbeddingId, Motif, Prototype, RecordingId, SequenceAnalysis,
|
||||
};
|
||||
use crate::domain::repository::{
|
||||
AnomalyRepository, ClusterRepository, MotifRepository, PrototypeRepository,
|
||||
RepositoryError, Result, SequenceRepository,
|
||||
};
|
||||
|
||||
/// In-memory implementation of the analysis repositories.
///
/// Useful for testing and development. Not suitable for production use
/// as data is lost on restart.
///
/// Each collection is guarded by its own `RwLock`, so operations on
/// different collections never contend with each other.
pub struct InMemoryAnalysisRepository {
    /// Clusters keyed by cluster id.
    clusters: RwLock<HashMap<ClusterId, Cluster>>,
    /// Prototypes grouped by their owning cluster.
    prototypes: RwLock<HashMap<ClusterId, Vec<Prototype>>>,
    /// Motifs keyed by a string key.
    motifs: RwLock<HashMap<String, Motif>>,
    /// One sequence analysis per recording.
    sequences: RwLock<HashMap<RecordingId, SequenceAnalysis>>,
    /// Anomalies keyed by the embedding they refer to.
    anomalies: RwLock<HashMap<EmbeddingId, Anomaly>>,
    /// Mapping from embedding ID to cluster ID
    embedding_assignments: RwLock<HashMap<EmbeddingId, ClusterId>>,
}
|
||||
|
||||
impl InMemoryAnalysisRepository {
|
||||
/// Create a new empty in-memory repository.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
clusters: RwLock::new(HashMap::new()),
|
||||
prototypes: RwLock::new(HashMap::new()),
|
||||
motifs: RwLock::new(HashMap::new()),
|
||||
sequences: RwLock::new(HashMap::new()),
|
||||
anomalies: RwLock::new(HashMap::new()),
|
||||
embedding_assignments: RwLock::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get statistics about stored data.
|
||||
#[must_use]
|
||||
pub fn stats(&self) -> RepositoryStats {
|
||||
let clusters = self.clusters.read().unwrap();
|
||||
let prototypes = self.prototypes.read().unwrap();
|
||||
let motifs = self.motifs.read().unwrap();
|
||||
let sequences = self.sequences.read().unwrap();
|
||||
let anomalies = self.anomalies.read().unwrap();
|
||||
|
||||
RepositoryStats {
|
||||
cluster_count: clusters.len(),
|
||||
prototype_count: prototypes.values().map(|v| v.len()).sum(),
|
||||
motif_count: motifs.len(),
|
||||
sequence_count: sequences.len(),
|
||||
anomaly_count: anomalies.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for InMemoryAnalysisRepository {
    /// Equivalent to [`InMemoryAnalysisRepository::new`]: an empty store.
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// Statistics about repository contents.
///
/// A point-in-time snapshot of item counts, as produced by
/// `InMemoryAnalysisRepository::stats`.
#[derive(Debug, Clone)]
pub struct RepositoryStats {
    /// Number of clusters stored.
    pub cluster_count: usize,
    /// Total number of prototypes, summed across all clusters.
    pub prototype_count: usize,
    /// Number of motifs stored.
    pub motif_count: usize,
    /// Number of sequence analyses stored.
    pub sequence_count: usize,
    /// Number of anomalies stored.
    pub anomaly_count: usize,
}
|
||||
|
||||
#[async_trait]
|
||||
impl ClusterRepository for InMemoryAnalysisRepository {
|
||||
async fn save_cluster(&self, cluster: &Cluster) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
clusters.insert(cluster.id, cluster.clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_clusters(&self, clusters_to_save: &[Cluster]) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
for cluster in clusters_to_save {
|
||||
clusters.insert(cluster.id, cluster.clone());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_cluster(&self, id: &ClusterId) -> Result<Option<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.get(id).cloned())
|
||||
}
|
||||
|
||||
async fn list_clusters(&self) -> Result<Vec<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.values().cloned().collect())
|
||||
}
|
||||
|
||||
async fn list_clusters_paginated(
|
||||
&self,
|
||||
offset: usize,
|
||||
limit: usize,
|
||||
) -> Result<Vec<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.values().skip(offset).take(limit).cloned().collect())
|
||||
}
|
||||
|
||||
async fn assign_to_cluster(
|
||||
&self,
|
||||
embedding_id: &EmbeddingId,
|
||||
cluster_id: &ClusterId,
|
||||
) -> Result<()> {
|
||||
let mut assignments = self.embedding_assignments.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.insert(*embedding_id, *cluster_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_from_cluster(&self, embedding_id: &EmbeddingId) -> Result<()> {
|
||||
let mut assignments = self.embedding_assignments.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.remove(embedding_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_cluster_by_embedding(
|
||||
&self,
|
||||
embedding_id: &EmbeddingId,
|
||||
) -> Result<Option<Cluster>> {
|
||||
// Extract the cluster_id and drop the guard before await
|
||||
let cluster_id = {
|
||||
let assignments = self.embedding_assignments.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.get(embedding_id).cloned()
|
||||
};
|
||||
|
||||
if let Some(cluster_id) = cluster_id {
|
||||
self.find_cluster(&cluster_id).await
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_cluster(&self, id: &ClusterId) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
clusters.remove(id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_clusters(&self) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
clusters.clear();
|
||||
|
||||
let mut assignments = self.embedding_assignments.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn cluster_count(&self) -> Result<usize> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.len())
|
||||
}
|
||||
|
||||
async fn find_clusters_by_label(&self, label_pattern: &str) -> Result<Vec<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(clusters
|
||||
.values()
|
||||
.filter(|c| {
|
||||
c.label
|
||||
.as_ref()
|
||||
.map_or(false, |l| l.contains(label_pattern))
|
||||
})
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn update_cluster_label(
|
||||
&self,
|
||||
id: &ClusterId,
|
||||
label: Option<String>,
|
||||
) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
if let Some(cluster) = clusters.get_mut(id) {
|
||||
cluster.label = label;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(RepositoryError::NotFound(format!("Cluster {}", id)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PrototypeRepository for InMemoryAnalysisRepository {
|
||||
async fn save_prototype(&self, prototype: &Prototype) -> Result<()> {
|
||||
let mut prototypes = self.prototypes.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
prototypes
|
||||
.entry(prototype.cluster_id)
|
||||
.or_default()
|
||||
.push(prototype.clone());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_prototypes(&self, prototypes_to_save: &[Prototype]) -> Result<()> {
|
||||
let mut prototypes = self.prototypes.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
for prototype in prototypes_to_save {
|
||||
prototypes
|
||||
.entry(prototype.cluster_id)
|
||||
.or_default()
|
||||
.push(prototype.clone());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_prototypes_by_cluster(
|
||||
&self,
|
||||
cluster_id: &ClusterId,
|
||||
) -> Result<Vec<Prototype>> {
|
||||
let prototypes = self.prototypes.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(prototypes.get(cluster_id).cloned().unwrap_or_default())
|
||||
}
|
||||
|
||||
async fn find_best_prototype(
|
||||
&self,
|
||||
cluster_id: &ClusterId,
|
||||
) -> Result<Option<Prototype>> {
|
||||
let prototypes = self.prototypes.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(prototypes.get(cluster_id).and_then(|protos| {
|
||||
protos
|
||||
.iter()
|
||||
.max_by(|a, b| {
|
||||
a.exemplar_score
|
||||
.partial_cmp(&b.exemplar_score)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
})
|
||||
.cloned()
|
||||
}))
|
||||
}
|
||||
|
||||
async fn delete_prototypes_by_cluster(&self, cluster_id: &ClusterId) -> Result<()> {
|
||||
let mut prototypes = self.prototypes.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
prototypes.remove(cluster_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_prototypes(&self) -> Result<()> {
|
||||
let mut prototypes = self.prototypes.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
prototypes.clear();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl MotifRepository for InMemoryAnalysisRepository {
|
||||
async fn save_motif(&self, motif: &Motif) -> Result<()> {
|
||||
let mut motifs = self.motifs.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
motifs.insert(motif.id.clone(), motif.clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_motifs(&self, motifs_to_save: &[Motif]) -> Result<()> {
|
||||
let mut motifs = self.motifs.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
for motif in motifs_to_save {
|
||||
motifs.insert(motif.id.clone(), motif.clone());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_motif(&self, id: &str) -> Result<Option<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(motifs.get(id).cloned())
|
||||
}
|
||||
|
||||
async fn find_motifs_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(motifs
|
||||
.values()
|
||||
.filter(|m| m.contains_cluster(cluster_id))
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn list_motifs(&self) -> Result<Vec<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(motifs.values().cloned().collect())
|
||||
}
|
||||
|
||||
async fn find_motifs_by_confidence(&self, min_confidence: f32) -> Result<Vec<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(motifs
|
||||
.values()
|
||||
.filter(|m| m.confidence >= min_confidence)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn find_motifs_by_occurrences(&self, min_occurrences: usize) -> Result<Vec<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(motifs
|
||||
.values()
|
||||
.filter(|m| m.occurrences >= min_occurrences)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn delete_motif(&self, id: &str) -> Result<()> {
|
||||
let mut motifs = self.motifs.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
motifs.remove(id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_motifs(&self) -> Result<()> {
|
||||
let mut motifs = self.motifs.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
motifs.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn motif_count(&self) -> Result<usize> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(motifs.len())
|
||||
}
|
||||
|
||||
async fn find_motifs_by_sequence(&self, sequence: &[ClusterId]) -> Result<Vec<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(motifs
|
||||
.values()
|
||||
.filter(|m| m.sequence == sequence)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn find_motifs_containing_subsequence(
|
||||
&self,
|
||||
subsequence: &[ClusterId],
|
||||
) -> Result<Vec<Motif>> {
|
||||
let motifs = self.motifs.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(motifs
|
||||
.values()
|
||||
.filter(|m| {
|
||||
m.sequence
|
||||
.windows(subsequence.len())
|
||||
.any(|w| w == subsequence)
|
||||
})
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl SequenceRepository for InMemoryAnalysisRepository {
|
||||
async fn save_sequence_analysis(&self, analysis: &SequenceAnalysis) -> Result<()> {
|
||||
let mut sequences = self.sequences.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
sequences.insert(analysis.recording_id, analysis.clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_sequence_by_recording(
|
||||
&self,
|
||||
recording_id: &RecordingId,
|
||||
) -> Result<Option<SequenceAnalysis>> {
|
||||
let sequences = self.sequences.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(sequences.get(recording_id).cloned())
|
||||
}
|
||||
|
||||
async fn list_sequence_analyses(&self) -> Result<Vec<SequenceAnalysis>> {
|
||||
let sequences = self.sequences.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(sequences.values().cloned().collect())
|
||||
}
|
||||
|
||||
async fn delete_sequence_by_recording(&self, recording_id: &RecordingId) -> Result<()> {
|
||||
let mut sequences = self.sequences.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
sequences.remove(recording_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_sequences(&self) -> Result<()> {
|
||||
let mut sequences = self.sequences.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
sequences.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_sequences_by_entropy(&self, min_entropy: f32) -> Result<Vec<SequenceAnalysis>> {
|
||||
let sequences = self.sequences.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(sequences
|
||||
.values()
|
||||
.filter(|s| s.entropy >= min_entropy)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn find_sequences_by_stereotypy(
|
||||
&self,
|
||||
min_stereotypy: f32,
|
||||
) -> Result<Vec<SequenceAnalysis>> {
|
||||
let sequences = self.sequences.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(sequences
|
||||
.values()
|
||||
.filter(|s| s.stereotypy_score >= min_stereotypy)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl AnomalyRepository for InMemoryAnalysisRepository {
|
||||
async fn save_anomaly(&self, anomaly: &Anomaly) -> Result<()> {
|
||||
let mut anomalies = self.anomalies.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
anomalies.insert(anomaly.embedding_id, anomaly.clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_anomalies(&self, anomalies_to_save: &[Anomaly]) -> Result<()> {
|
||||
let mut anomalies = self.anomalies.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
for anomaly in anomalies_to_save {
|
||||
anomalies.insert(anomaly.embedding_id, anomaly.clone());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_anomaly(&self, embedding_id: &EmbeddingId) -> Result<Option<Anomaly>> {
|
||||
let anomalies = self.anomalies.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(anomalies.get(embedding_id).cloned())
|
||||
}
|
||||
|
||||
async fn list_anomalies(&self) -> Result<Vec<Anomaly>> {
|
||||
let anomalies = self.anomalies.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(anomalies.values().cloned().collect())
|
||||
}
|
||||
|
||||
async fn find_anomalies_by_score(&self, min_score: f32) -> Result<Vec<Anomaly>> {
|
||||
let anomalies = self.anomalies.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(anomalies
|
||||
.values()
|
||||
.filter(|a| a.anomaly_score >= min_score)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn find_anomalies_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Anomaly>> {
|
||||
let anomalies = self.anomalies.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(anomalies
|
||||
.values()
|
||||
.filter(|a| a.nearest_cluster == *cluster_id)
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn delete_anomaly(&self, embedding_id: &EmbeddingId) -> Result<()> {
|
||||
let mut anomalies = self.anomalies.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
anomalies.remove(embedding_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_anomalies(&self) -> Result<()> {
|
||||
let mut anomalies = self.anomalies.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
anomalies.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn anomaly_count(&self) -> Result<usize> {
|
||||
let anomalies = self.anomalies.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(anomalies.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Save → find → list → delete round-trip for clusters.
    #[tokio::test]
    async fn test_cluster_crud() {
        let repo = InMemoryAnalysisRepository::new();
        let cluster = Cluster::new(
            EmbeddingId::new(),
            vec![EmbeddingId::new()],
            vec![0.0; 10],
            0.1,
        );

        repo.save_cluster(&cluster).await.unwrap();
        assert!(repo.find_cluster(&cluster.id).await.unwrap().is_some());
        assert_eq!(repo.list_clusters().await.unwrap().len(), 1);

        repo.delete_cluster(&cluster.id).await.unwrap();
        assert!(repo.find_cluster(&cluster.id).await.unwrap().is_none());
    }

    /// Saving a motif makes it findable by id and counted.
    #[tokio::test]
    async fn test_motif_crud() {
        let repo = InMemoryAnalysisRepository::new();
        let motif = Motif::new(
            vec![ClusterId::new(), ClusterId::new()],
            5,
            1500.0,
            0.8,
        );

        repo.save_motif(&motif).await.unwrap();
        assert!(repo.find_motif(&motif.id).await.unwrap().is_some());
        assert_eq!(repo.motif_count().await.unwrap(), 1);
    }

    /// A saved sequence analysis is retrievable by its recording id.
    #[tokio::test]
    async fn test_sequence_crud() {
        let repo = InMemoryAnalysisRepository::new();
        let recording_id = RecordingId::new();
        let analysis = SequenceAnalysis::new(recording_id, vec![], 1.5, 0.5);

        repo.save_sequence_analysis(&analysis).await.unwrap();
        assert!(repo
            .find_sequence_by_recording(&recording_id)
            .await
            .unwrap()
            .is_some());
    }

    /// Score filtering keeps only anomalies at or above the threshold.
    #[tokio::test]
    async fn test_anomaly_filtering() {
        let repo = InMemoryAnalysisRepository::new();
        let high = Anomaly::new(EmbeddingId::new(), 0.9, ClusterId::new(), 2.0);
        let low = Anomaly::new(EmbeddingId::new(), 0.3, ClusterId::new(), 0.5);

        repo.save_anomalies(&[high, low]).await.unwrap();

        // Only the 0.9-score anomaly survives the 0.5 cutoff.
        assert_eq!(repo.find_anomalies_by_score(0.5).await.unwrap().len(), 1);
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
//! Infrastructure layer for the Analysis bounded context.
|
||||
//!
|
||||
//! Contains concrete implementations of clustering algorithms,
|
||||
//! Markov chain analysis, and other infrastructure components.
|
||||
|
||||
pub mod hdbscan;
|
||||
pub mod kmeans;
|
||||
pub mod markov;
|
||||
pub mod memory_repository;
|
||||
|
||||
// Re-export main types
|
||||
pub use hdbscan::HdbscanClusterer;
|
||||
pub use kmeans::KMeansClusterer;
|
||||
pub use markov::MarkovAnalyzer;
|
||||
pub use memory_repository::InMemoryAnalysisRepository;
|
||||
@@ -0,0 +1,78 @@
|
||||
//! # sevensense-analysis
|
||||
//!
|
||||
//! Analysis bounded context for 7sense bioacoustic analysis platform.
|
||||
//!
|
||||
//! This crate provides clustering, motif detection, sequence analysis, and anomaly
|
||||
//! detection capabilities for bioacoustic embeddings.
|
||||
//!
|
||||
//! ## Features
|
||||
//!
|
||||
//! - **Clustering**: HDBSCAN and K-means clustering for grouping similar vocalizations
|
||||
//! - **Prototype Extraction**: Identify representative embeddings (exemplars) for each cluster
|
||||
//! - **Motif Detection**: Discover recurring patterns in vocalization sequences
|
||||
//! - **Sequence Analysis**: Markov chain analysis, transition matrices, entropy computation
|
||||
//! - **Anomaly Detection**: Identify unusual or novel vocalizations
|
||||
//!
|
||||
//! ## Architecture
|
||||
//!
|
||||
//! This crate follows Domain-Driven Design (DDD) with hexagonal architecture:
|
||||
//!
|
||||
//! - `domain/` - Core domain entities, value objects, and repository traits
|
||||
//! - `application/` - Application services orchestrating domain operations
|
||||
//! - `infrastructure/` - Concrete implementations (HDBSCAN, Markov chains, etc.)
|
||||
//!
|
||||
//! ## Example
|
||||
//!
|
||||
//! ```rust,ignore
|
||||
//! use sevensense_analysis::{
|
||||
//! application::ClusteringService,
|
||||
//! domain::{ClusteringConfig, ClusteringMethod},
|
||||
//! };
|
||||
//!
|
||||
//! let service = ClusteringService::new(ClusteringConfig::default());
|
||||
//! let embeddings = vec![/* ... */];
|
||||
//! let clusters = service.run_hdbscan(&embeddings).await?;
|
||||
//! ```
|
||||
|
||||
#![warn(missing_docs)]
|
||||
#![warn(clippy::all)]
|
||||
#![allow(clippy::module_name_repetitions)]
|
||||
|
||||
pub mod domain;
|
||||
pub mod application;
|
||||
pub mod infrastructure;
|
||||
pub mod metrics;
|
||||
|
||||
// Re-export primary types for convenience
|
||||
pub use domain::entities::{
|
||||
Anomaly, AnomalyType, Cluster, ClusterId, EmbeddingId, Motif, MotifOccurrence, Prototype,
|
||||
RecordingId, SegmentId, SequenceAnalysis,
|
||||
};
|
||||
pub use domain::repository::{ClusterRepository, MotifRepository, SequenceRepository};
|
||||
pub use domain::events::{
|
||||
AnalysisEvent, ClusterAssigned, ClustersDiscovered, MotifDetected, SequenceAnalyzed,
|
||||
};
|
||||
pub use domain::value_objects::{
|
||||
ClusteringConfig, ClusteringMethod, ClusteringParameters, MotifConfig, SequenceMetrics,
|
||||
TransitionMatrix,
|
||||
};
|
||||
|
||||
pub use application::services::{
|
||||
AnomalyDetectionService, ClusteringService, MotifDetectionService, SequenceAnalysisService,
|
||||
};
|
||||
|
||||
pub use metrics::{
|
||||
ClusteringMetrics, SequenceEntropy, SilhouetteScore, VMeasure,
|
||||
};
|
||||
|
||||
/// Crate version information
|
||||
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
/// Prelude module for convenient imports
|
||||
pub mod prelude {
|
||||
pub use crate::domain::entities::*;
|
||||
pub use crate::domain::repository::*;
|
||||
pub use crate::domain::value_objects::*;
|
||||
pub use crate::application::services::*;
|
||||
pub use crate::metrics::*;
|
||||
}
|
||||
1131
examples/vibecast-7sense/crates/sevensense-analysis/src/metrics.rs
Normal file
1131
examples/vibecast-7sense/crates/sevensense-analysis/src/metrics.rs
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user