Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
//! Application layer for the Analysis bounded context.
//!
//! Contains application services that orchestrate domain operations
//! and coordinate with infrastructure components.
pub mod services;
// Re-export service types
pub use services::{
AnomalyDetectionService, ClusteringService, MotifDetectionService, SequenceAnalysisService,
};

View File

@@ -0,0 +1,692 @@
//! Domain entities for the Analysis bounded context.
//!
//! This module contains the core domain entities representing clusters,
//! prototypes, motifs, sequences, and anomalies in bioacoustic analysis.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use uuid::Uuid;
/// Unique identifier for a cluster.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ClusterId(Uuid);

impl ClusterId {
    /// Create a fresh, randomly generated (v4) cluster ID.
    #[must_use]
    pub fn new() -> Self {
        Self::from_uuid(Uuid::new_v4())
    }

    /// Wrap an existing UUID as a cluster ID.
    #[must_use]
    pub fn from_uuid(uuid: Uuid) -> Self {
        Self(uuid)
    }

    /// Return the wrapped UUID value.
    #[must_use]
    pub fn as_uuid(&self) -> Uuid {
        self.0
    }

    /// The sentinel "noise" cluster ID (the nil UUID), used for
    /// HDBSCAN noise points that belong to no real cluster.
    #[must_use]
    pub fn noise() -> Self {
        Self::from_uuid(Uuid::nil())
    }

    /// Whether this ID is the noise sentinel.
    #[must_use]
    pub fn is_noise(&self) -> bool {
        self.0.is_nil()
    }
}

impl Default for ClusterId {
    fn default() -> Self {
        ClusterId::new()
    }
}

impl std::fmt::Display for ClusterId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl From<Uuid> for ClusterId {
    fn from(uuid: Uuid) -> Self {
        ClusterId::from_uuid(uuid)
    }
}
/// Unique identifier for an embedding (from sevensense-embedding context).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EmbeddingId(Uuid);

impl EmbeddingId {
    /// Create a fresh, randomly generated (v4) embedding ID.
    #[must_use]
    pub fn new() -> Self {
        Self::from_uuid(Uuid::new_v4())
    }

    /// Wrap an existing UUID as an embedding ID.
    #[must_use]
    pub fn from_uuid(uuid: Uuid) -> Self {
        Self(uuid)
    }

    /// Return the wrapped UUID value.
    #[must_use]
    pub fn as_uuid(&self) -> Uuid {
        self.0
    }
}

impl Default for EmbeddingId {
    fn default() -> Self {
        EmbeddingId::new()
    }
}

impl std::fmt::Display for EmbeddingId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl From<Uuid> for EmbeddingId {
    fn from(uuid: Uuid) -> Self {
        EmbeddingId::from_uuid(uuid)
    }
}
/// Unique identifier for a recording (from sevensense-audio context).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct RecordingId(Uuid);

impl RecordingId {
    /// Create a fresh, randomly generated (v4) recording ID.
    #[must_use]
    pub fn new() -> Self {
        Self::from_uuid(Uuid::new_v4())
    }

    /// Wrap an existing UUID as a recording ID.
    #[must_use]
    pub fn from_uuid(uuid: Uuid) -> Self {
        Self(uuid)
    }

    /// Return the wrapped UUID value.
    #[must_use]
    pub fn as_uuid(&self) -> Uuid {
        self.0
    }
}

impl Default for RecordingId {
    fn default() -> Self {
        RecordingId::new()
    }
}

impl std::fmt::Display for RecordingId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl From<Uuid> for RecordingId {
    fn from(uuid: Uuid) -> Self {
        RecordingId::from_uuid(uuid)
    }
}
/// Unique identifier for a segment (from sevensense-audio context).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(Uuid);

impl SegmentId {
    /// Create a fresh, randomly generated (v4) segment ID.
    #[must_use]
    pub fn new() -> Self {
        Self::from_uuid(Uuid::new_v4())
    }

    /// Wrap an existing UUID as a segment ID.
    #[must_use]
    pub fn from_uuid(uuid: Uuid) -> Self {
        Self(uuid)
    }

    /// Return the wrapped UUID value.
    #[must_use]
    pub fn as_uuid(&self) -> Uuid {
        self.0
    }
}

impl Default for SegmentId {
    fn default() -> Self {
        SegmentId::new()
    }
}

impl std::fmt::Display for SegmentId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl From<Uuid> for SegmentId {
    fn from(uuid: Uuid) -> Self {
        SegmentId::from_uuid(uuid)
    }
}
/// A cluster of acoustically similar call segments.
///
/// Clusters group embeddings that represent similar vocalizations,
/// enabling pattern discovery and call type identification.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Cluster {
    /// Unique identifier for this cluster.
    pub id: ClusterId,
    /// The prototype (representative) embedding ID for this cluster.
    pub prototype_id: EmbeddingId,
    /// IDs of all embeddings belonging to this cluster.
    pub member_ids: Vec<EmbeddingId>,
    /// Centroid vector (mean of all member embeddings).
    pub centroid: Vec<f32>,
    /// Variance within the cluster (measure of spread).
    pub variance: f32,
    /// Optional human-readable label for the cluster.
    pub label: Option<String>,
    /// Timestamp when the cluster was created.
    pub created_at: DateTime<Utc>,
    /// Timestamp when the cluster was last updated.
    pub updated_at: DateTime<Utc>,
}

impl Cluster {
    /// Build a new, unlabeled cluster with a random ID; `created_at` and
    /// `updated_at` are both stamped with the current time.
    #[must_use]
    pub fn new(
        prototype_id: EmbeddingId,
        member_ids: Vec<EmbeddingId>,
        centroid: Vec<f32>,
        variance: f32,
    ) -> Self {
        let stamp = Utc::now();
        Self {
            id: ClusterId::new(),
            label: None,
            created_at: stamp,
            updated_at: stamp,
            prototype_id,
            member_ids,
            centroid,
            variance,
        }
    }

    /// Number of member embeddings in this cluster.
    #[must_use]
    pub fn member_count(&self) -> usize {
        self.member_ids.len()
    }

    /// Whether the given embedding belongs to this cluster.
    #[must_use]
    pub fn contains(&self, embedding_id: &EmbeddingId) -> bool {
        self.member_ids.iter().any(|id| id == embedding_id)
    }

    /// Add a member; a no-op for IDs already present. Bumps `updated_at`
    /// only when the membership actually changed.
    pub fn add_member(&mut self, embedding_id: EmbeddingId) {
        if self.contains(&embedding_id) {
            return;
        }
        self.member_ids.push(embedding_id);
        self.updated_at = Utc::now();
    }

    /// Remove the first matching member, returning `true` if one was
    /// removed (and `updated_at` bumped), `false` if it was not a member.
    pub fn remove_member(&mut self, embedding_id: &EmbeddingId) -> bool {
        match self.member_ids.iter().position(|id| id == embedding_id) {
            Some(index) => {
                self.member_ids.remove(index);
                self.updated_at = Utc::now();
                true
            }
            None => false,
        }
    }

    /// Replace the centroid and variance, bumping `updated_at`.
    pub fn update_centroid(&mut self, centroid: Vec<f32>, variance: f32) {
        self.centroid = centroid;
        self.variance = variance;
        self.updated_at = Utc::now();
    }

    /// Attach a human-readable label, bumping `updated_at`.
    pub fn set_label(&mut self, label: impl Into<String>) {
        self.label = Some(label.into());
        self.updated_at = Utc::now();
    }
}
/// A prototype (exemplar) embedding that best represents a cluster.
///
/// Prototypes are actual call segments that serve as the most representative
/// examples of their cluster, useful for visualization and interpretation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Prototype {
    /// The embedding ID of this prototype.
    pub id: EmbeddingId,
    /// The cluster this prototype represents.
    pub cluster_id: ClusterId,
    /// Score indicating how well this exemplar represents the cluster.
    /// Higher scores indicate better representation.
    pub exemplar_score: f32,
    /// Optional path to the spectrogram image for visualization.
    pub spectrogram_path: Option<PathBuf>,
    /// Timestamp when this prototype was identified.
    pub created_at: DateTime<Utc>,
}

impl Prototype {
    /// Build a prototype stamped with the current time; the spectrogram
    /// path starts out unset.
    #[must_use]
    pub fn new(id: EmbeddingId, cluster_id: ClusterId, exemplar_score: f32) -> Self {
        Self {
            created_at: Utc::now(),
            spectrogram_path: None,
            id,
            cluster_id,
            exemplar_score,
        }
    }

    /// Attach the path of the rendered spectrogram image.
    pub fn set_spectrogram_path(&mut self, path: impl Into<PathBuf>) {
        self.spectrogram_path = Some(path.into());
    }
}
/// A motif (recurring pattern) in vocalization sequences.
///
/// Motifs represent frequently occurring sequences of cluster assignments,
/// indicating repeated vocal phrases or behavioral patterns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Motif {
    /// Unique identifier for this motif.
    pub id: String,
    /// The sequence of cluster IDs that define this motif.
    pub sequence: Vec<ClusterId>,
    /// Number of times this motif occurs in the analyzed data.
    pub occurrences: usize,
    /// Average duration of this motif in milliseconds.
    pub avg_duration_ms: f64,
    /// Confidence score for this motif (0.0 to 1.0).
    pub confidence: f32,
    /// All occurrences of this motif.
    pub occurrence_instances: Vec<MotifOccurrence>,
    /// Timestamp when this motif was discovered.
    pub discovered_at: DateTime<Utc>,
}

impl Motif {
    /// Build a motif with a fresh UUID-string ID, the current timestamp,
    /// and no stored occurrence instances yet.
    #[must_use]
    pub fn new(
        sequence: Vec<ClusterId>,
        occurrences: usize,
        avg_duration_ms: f64,
        confidence: f32,
    ) -> Self {
        Self {
            id: Uuid::new_v4().to_string(),
            discovered_at: Utc::now(),
            occurrence_instances: Vec::new(),
            sequence,
            occurrences,
            avg_duration_ms,
            confidence,
        }
    }

    /// Length of the motif, i.e. the number of clusters in its sequence.
    #[must_use]
    pub fn length(&self) -> usize {
        self.sequence.len()
    }

    /// Record a concrete occurrence instance.
    ///
    /// NOTE: this resets `occurrences` to the number of stored instances,
    /// so any count supplied to [`Motif::new`] is discarded once the first
    /// instance is added.
    pub fn add_occurrence(&mut self, occurrence: MotifOccurrence) {
        self.occurrence_instances.push(occurrence);
        self.occurrences = self.occurrence_instances.len();
    }

    /// Whether `cluster_id` appears anywhere in the defining sequence.
    #[must_use]
    pub fn contains_cluster(&self, cluster_id: &ClusterId) -> bool {
        self.sequence.iter().any(|c| c == cluster_id)
    }
}
/// A specific occurrence of a motif in a recording.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MotifOccurrence {
    /// The recording where this occurrence was found.
    pub recording_id: RecordingId,
    /// The segment IDs that make up this occurrence.
    pub segment_ids: Vec<SegmentId>,
    /// Start time within the recording (milliseconds).
    pub start_time_ms: u64,
    /// End time within the recording (milliseconds).
    pub end_time_ms: u64,
    /// Similarity score to the motif template.
    pub similarity: f32,
}

impl MotifOccurrence {
    /// Build an occurrence record from its constituent parts.
    #[must_use]
    pub fn new(
        recording_id: RecordingId,
        segment_ids: Vec<SegmentId>,
        start_time_ms: u64,
        end_time_ms: u64,
        similarity: f32,
    ) -> Self {
        Self {
            similarity,
            end_time_ms,
            start_time_ms,
            segment_ids,
            recording_id,
        }
    }

    /// Duration in milliseconds; saturates to 0 if `end_time_ms` is
    /// (incorrectly) earlier than `start_time_ms`.
    #[must_use]
    pub fn duration_ms(&self) -> u64 {
        self.end_time_ms.saturating_sub(self.start_time_ms)
    }
}
/// Analysis of a vocalization sequence from a recording.
///
/// Contains transition information, entropy metrics, and stereotypy scores
/// for understanding sequential patterns in bird vocalizations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceAnalysis {
/// The recording this analysis pertains to.
pub recording_id: RecordingId,
/// Transitions between clusters with weights (probabilities).
/// Format: (source_cluster, target_cluster, probability)
pub transitions: Vec<(ClusterId, ClusterId, f32)>,
/// Shannon entropy of the transition distribution.
/// Higher values indicate more unpredictable sequences.
pub entropy: f32,
/// Stereotypy score (0.0 to 1.0).
/// Higher values indicate more repetitive/stereotyped sequences.
pub stereotypy_score: f32,
/// The sequence of cluster IDs in order.
pub cluster_sequence: Vec<ClusterId>,
/// The segment IDs corresponding to the cluster sequence.
pub segment_ids: Vec<SegmentId>,
/// Timestamp when this analysis was performed.
pub analyzed_at: DateTime<Utc>,
}
impl SequenceAnalysis {
/// Create a new sequence analysis.
#[must_use]
pub fn new(
recording_id: RecordingId,
transitions: Vec<(ClusterId, ClusterId, f32)>,
entropy: f32,
stereotypy_score: f32,
) -> Self {
Self {
recording_id,
transitions,
entropy,
stereotypy_score,
cluster_sequence: Vec::new(),
segment_ids: Vec::new(),
analyzed_at: Utc::now(),
}
}
/// Get the number of unique transitions.
#[must_use]
pub fn unique_transition_count(&self) -> usize {
self.transitions.len()
}
/// Get all clusters involved in the sequence.
#[must_use]
pub fn unique_clusters(&self) -> Vec<ClusterId> {
let mut clusters: Vec<ClusterId> = self.cluster_sequence.clone();
clusters.sort_by_key(|c| c.as_uuid());
clusters.dedup();
clusters
}
/// Set the cluster sequence and corresponding segment IDs.
pub fn set_sequence(&mut self, clusters: Vec<ClusterId>, segments: Vec<SegmentId>) {
self.cluster_sequence = clusters;
self.segment_ids = segments;
}
}
/// Type of anomaly detected in the analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AnomalyType {
    /// Rare vocalization (low occurrence count).
    Rare,
    /// Novel vocalization (doesn't fit any cluster well).
    Novel,
    /// Artifact (likely noise or recording issue).
    Artifact,
    /// Outlier within a cluster.
    Outlier,
    /// Unknown anomaly type.
    Unknown,
}

impl std::fmt::Display for AnomalyType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Map each variant to its display name, then emit it in one write.
        let name = match self {
            Self::Rare => "Rare",
            Self::Novel => "Novel",
            Self::Artifact => "Artifact",
            Self::Outlier => "Outlier",
            Self::Unknown => "Unknown",
        };
        f.write_str(name)
    }
}
/// An anomalous embedding that doesn't fit well into any cluster.
///
/// Anomalies can represent rare vocalizations, novel sounds, or artifacts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Anomaly {
    /// The embedding that is anomalous.
    pub embedding_id: EmbeddingId,
    /// Anomaly score (higher = more anomalous).
    pub anomaly_score: f32,
    /// The nearest cluster to this anomaly.
    pub nearest_cluster: ClusterId,
    /// Distance from the anomaly to the nearest cluster's centroid.
    pub distance_to_centroid: f32,
    /// Type of anomaly detected.
    pub anomaly_type: AnomalyType,
    /// Local outlier factor (if computed).
    pub local_outlier_factor: Option<f32>,
    /// Timestamp when this anomaly was detected.
    pub detected_at: DateTime<Utc>,
}

impl Anomaly {
    /// Build an anomaly stamped with the current time. The type defaults
    /// to [`AnomalyType::Unknown`] and the LOF starts unset.
    #[must_use]
    pub fn new(
        embedding_id: EmbeddingId,
        anomaly_score: f32,
        nearest_cluster: ClusterId,
        distance_to_centroid: f32,
    ) -> Self {
        Self {
            detected_at: Utc::now(),
            local_outlier_factor: None,
            anomaly_type: AnomalyType::Unknown,
            embedding_id,
            anomaly_score,
            nearest_cluster,
            distance_to_centroid,
        }
    }

    /// Classify this anomaly.
    pub fn set_type(&mut self, anomaly_type: AnomalyType) {
        self.anomaly_type = anomaly_type;
    }

    /// Record the computed local outlier factor.
    pub fn set_lof(&mut self, lof: f32) {
        self.local_outlier_factor = Some(lof);
    }

    /// Whether the score strictly exceeds `threshold`.
    #[must_use]
    pub fn is_severe(&self, threshold: f32) -> bool {
        self.anomaly_score > threshold
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cluster_id_creation() {
        // Two freshly generated IDs must differ; neither is the sentinel.
        let first = ClusterId::new();
        let second = ClusterId::new();
        assert_ne!(first, second);
        assert!(!first.is_noise());
        // The nil-UUID sentinel reports itself as noise.
        assert!(ClusterId::noise().is_noise());
    }

    #[test]
    fn test_cluster_member_operations() {
        let mut cluster = Cluster::new(
            EmbeddingId::new(),
            vec![EmbeddingId::new()],
            vec![0.0; 1536],
            0.1,
        );

        // Adding a new ID grows membership and makes `contains` true.
        let extra = EmbeddingId::new();
        cluster.add_member(extra);
        assert_eq!(cluster.member_count(), 2);
        assert!(cluster.contains(&extra));

        // Removing it restores the original single-member state.
        cluster.remove_member(&extra);
        assert_eq!(cluster.member_count(), 1);
        assert!(!cluster.contains(&extra));
    }

    #[test]
    fn test_motif_length() {
        let pattern = vec![ClusterId::new(), ClusterId::new(), ClusterId::new()];
        let motif = Motif::new(pattern, 5, 1500.0, 0.85);
        // Length tracks the defining sequence, not the occurrence count.
        assert_eq!(motif.length(), 3);
        assert_eq!(motif.occurrences, 5);
    }

    #[test]
    fn test_sequence_analysis_unique_clusters() {
        let alpha = ClusterId::new();
        let beta = ClusterId::new();
        let mut analysis = SequenceAnalysis::new(RecordingId::new(), vec![], 1.5, 0.3);
        // A five-step sequence alternating between two clusters...
        analysis.set_sequence(vec![alpha, beta, alpha, beta, alpha], vec![SegmentId::new(); 5]);
        // ...deduplicates down to exactly two unique clusters.
        assert_eq!(analysis.unique_clusters().len(), 2);
    }

    #[test]
    fn test_anomaly_severity() {
        let mut anomaly = Anomaly::new(EmbeddingId::new(), 0.8, ClusterId::new(), 2.5);
        // Severity is a strict comparison against the given threshold.
        assert!(anomaly.is_severe(0.5));
        assert!(!anomaly.is_severe(0.9));
        anomaly.set_type(AnomalyType::Novel);
        assert_eq!(anomaly.anomaly_type, AnomalyType::Novel);
    }
}

View File

@@ -0,0 +1,522 @@
//! Domain events for the Analysis bounded context.
//!
//! Domain events represent significant occurrences within the Analysis domain
//! that other parts of the system may need to react to.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use super::entities::{AnomalyType, ClusterId, EmbeddingId, RecordingId};
use super::value_objects::ClusteringMethod;
/// Base trait for analysis domain events.
///
/// Implemented by every event struct in this module; it gives consumers a
/// uniform way to read an event's identity, time, and type name without
/// knowing the concrete event type.
pub trait AnalysisEvent: Send + Sync {
    /// Get the unique event ID.
    fn event_id(&self) -> Uuid;
    /// Get the timestamp when the event occurred.
    fn occurred_at(&self) -> DateTime<Utc>;
    /// Get the event type name (a static string such as `"ClustersDiscovered"`).
    fn event_type(&self) -> &'static str;
}
/// Event emitted when clustering is completed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClustersDiscovered {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// Number of clusters discovered.
    pub cluster_count: usize,
    /// Number of noise points (not assigned to any cluster).
    pub noise_count: usize,
    /// Clustering method used.
    pub method: ClusteringMethod,
    /// Silhouette score (if computed).
    pub silhouette_score: Option<f32>,
    /// Total number of embeddings processed.
    pub total_embeddings: usize,
}

impl ClustersDiscovered {
    /// Build the event with a fresh ID and the current timestamp. The
    /// silhouette score starts unset; attach one with
    /// [`Self::with_silhouette_score`].
    #[must_use]
    pub fn new(
        cluster_count: usize,
        noise_count: usize,
        method: ClusteringMethod,
        total_embeddings: usize,
    ) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            silhouette_score: None,
            method,
            cluster_count,
            noise_count,
            total_embeddings,
        }
    }

    /// Builder-style setter attaching the silhouette score.
    #[must_use]
    pub fn with_silhouette_score(mut self, score: f32) -> Self {
        self.silhouette_score = Some(score);
        self
    }
}

impl AnalysisEvent for ClustersDiscovered {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "ClustersDiscovered"
    }
}
/// Event emitted when an embedding is assigned to a cluster.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusterAssigned {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// The embedding that was assigned.
    pub embedding_id: EmbeddingId,
    /// The cluster it was assigned to.
    pub cluster_id: ClusterId,
    /// Confidence/probability of the assignment.
    pub confidence: f32,
    /// Distance to the cluster centroid.
    pub distance_to_centroid: f32,
}

impl ClusterAssigned {
    /// Build the event with a fresh ID and the current timestamp.
    #[must_use]
    pub fn new(
        embedding_id: EmbeddingId,
        cluster_id: ClusterId,
        confidence: f32,
        distance_to_centroid: f32,
    ) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            distance_to_centroid,
            confidence,
            cluster_id,
            embedding_id,
        }
    }
}

impl AnalysisEvent for ClusterAssigned {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "ClusterAssigned"
    }
}
/// Event emitted when a motif pattern is detected.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MotifDetected {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// The motif ID.
    pub motif_id: String,
    /// The cluster sequence defining the motif.
    pub pattern: Vec<ClusterId>,
    /// Number of occurrences found.
    pub occurrences: usize,
    /// Confidence score for this motif.
    pub confidence: f32,
    /// Average duration in milliseconds.
    pub avg_duration_ms: f64,
}

impl MotifDetected {
    /// Build the event with a fresh ID and the current timestamp.
    #[must_use]
    pub fn new(
        motif_id: String,
        pattern: Vec<ClusterId>,
        occurrences: usize,
        confidence: f32,
        avg_duration_ms: f64,
    ) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            avg_duration_ms,
            confidence,
            occurrences,
            pattern,
            motif_id,
        }
    }
}

impl AnalysisEvent for MotifDetected {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "MotifDetected"
    }
}
/// Event emitted when a sequence is analyzed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceAnalyzed {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// The recording that was analyzed.
    pub recording_id: RecordingId,
    /// Shannon entropy of the sequence.
    pub entropy: f32,
    /// Stereotypy score.
    pub stereotypy_score: f32,
    /// Number of unique clusters in the sequence.
    pub unique_clusters: usize,
    /// Number of unique transitions.
    pub unique_transitions: usize,
    /// Total sequence length.
    pub sequence_length: usize,
}

impl SequenceAnalyzed {
    /// Build the event with a fresh ID and the current timestamp.
    #[must_use]
    pub fn new(
        recording_id: RecordingId,
        entropy: f32,
        stereotypy_score: f32,
        unique_clusters: usize,
        unique_transitions: usize,
        sequence_length: usize,
    ) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            sequence_length,
            unique_transitions,
            unique_clusters,
            stereotypy_score,
            entropy,
            recording_id,
        }
    }
}

impl AnalysisEvent for SequenceAnalyzed {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "SequenceAnalyzed"
    }
}
/// Event emitted when an anomaly is detected.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnomalyDetected {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// The embedding identified as anomalous.
    pub embedding_id: EmbeddingId,
    /// Anomaly score.
    pub anomaly_score: f32,
    /// Type of anomaly.
    pub anomaly_type: AnomalyType,
    /// The nearest cluster.
    pub nearest_cluster: ClusterId,
    /// Distance to the nearest cluster centroid.
    pub distance_to_centroid: f32,
}

impl AnomalyDetected {
    /// Build the event with a fresh ID and the current timestamp.
    #[must_use]
    pub fn new(
        embedding_id: EmbeddingId,
        anomaly_score: f32,
        anomaly_type: AnomalyType,
        nearest_cluster: ClusterId,
        distance_to_centroid: f32,
    ) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            distance_to_centroid,
            nearest_cluster,
            anomaly_type,
            anomaly_score,
            embedding_id,
        }
    }
}

impl AnalysisEvent for AnomalyDetected {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "AnomalyDetected"
    }
}
/// Event emitted when cluster prototypes are updated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrototypesComputed {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// The cluster for which prototypes were computed.
    pub cluster_id: ClusterId,
    /// Number of prototypes computed.
    pub prototype_count: usize,
    /// Best exemplar score.
    pub best_exemplar_score: f32,
}

impl PrototypesComputed {
    /// Build the event with a fresh ID and the current timestamp.
    #[must_use]
    pub fn new(cluster_id: ClusterId, prototype_count: usize, best_exemplar_score: f32) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            best_exemplar_score,
            prototype_count,
            cluster_id,
        }
    }
}

impl AnalysisEvent for PrototypesComputed {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "PrototypesComputed"
    }
}
/// Event emitted when a cluster label is updated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusterLabeled {
    /// Unique event ID.
    pub event_id: Uuid,
    /// When the event occurred.
    pub occurred_at: DateTime<Utc>,
    /// The cluster that was labeled.
    pub cluster_id: ClusterId,
    /// The new label (None if label was removed).
    pub label: Option<String>,
    /// Previous label (None if no previous label).
    pub previous_label: Option<String>,
}

impl ClusterLabeled {
    /// Build the event with a fresh ID and the current timestamp.
    #[must_use]
    pub fn new(
        cluster_id: ClusterId,
        label: Option<String>,
        previous_label: Option<String>,
    ) -> Self {
        Self {
            event_id: Uuid::new_v4(),
            occurred_at: Utc::now(),
            previous_label,
            label,
            cluster_id,
        }
    }
}

impl AnalysisEvent for ClusterLabeled {
    fn event_id(&self) -> Uuid {
        self.event_id
    }

    fn occurred_at(&self) -> DateTime<Utc> {
        self.occurred_at
    }

    fn event_type(&self) -> &'static str {
        "ClusterLabeled"
    }
}
/// Event publisher trait for analysis events.
///
/// NOTE(review): the generic `publish` method makes this trait
/// non-object-safe, so it cannot be used as `dyn AnalysisEventPublisher`;
/// implementors can only be used through generics. Confirm this is the
/// intended design (a `&dyn AnalysisEvent` parameter would restore object
/// safety).
#[async_trait::async_trait]
pub trait AnalysisEventPublisher: Send + Sync {
    /// Publish an analysis event.
    ///
    /// # Errors
    /// Returns an [`EventPublishError`] when serialization or transport fails.
    async fn publish<E: AnalysisEvent + Serialize + 'static>(
        &self,
        event: E,
    ) -> Result<(), EventPublishError>;
}
/// Error type for event publishing.
///
/// Returned by [`AnalysisEventPublisher::publish`]; the `Display`
/// representations come from the `thiserror` `#[error]` attributes below.
#[derive(Debug, thiserror::Error)]
pub enum EventPublishError {
    /// Serialization failed (payload carries the underlying message).
    #[error("Failed to serialize event: {0}")]
    Serialization(String),
    /// Transport error (payload carries the underlying message).
    #[error("Failed to publish event: {0}")]
    Transport(String),
    /// Channel closed — the receiving side of the event pipeline is gone.
    #[error("Event channel closed")]
    ChannelClosed,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clusters_discovered_event() {
        // Construct via the builder-style silhouette setter.
        let event = ClustersDiscovered::new(10, 5, ClusteringMethod::HDBSCAN, 100)
            .with_silhouette_score(0.75);
        assert_eq!(event.cluster_count, 10);
        assert_eq!(event.noise_count, 5);
        assert_eq!(event.silhouette_score, Some(0.75));
        assert_eq!(event.event_type(), "ClustersDiscovered");
    }

    #[test]
    fn test_cluster_assigned_event() {
        let event = ClusterAssigned::new(EmbeddingId::new(), ClusterId::new(), 0.95, 0.1);
        assert_eq!(event.confidence, 0.95);
        assert_eq!(event.event_type(), "ClusterAssigned");
    }

    #[test]
    fn test_motif_detected_event() {
        let pattern = vec![ClusterId::new(), ClusterId::new()];
        let event =
            MotifDetected::new("motif-1".to_string(), pattern.clone(), 10, 0.85, 1500.0);
        assert_eq!(event.pattern.len(), 2);
        assert_eq!(event.occurrences, 10);
        assert_eq!(event.event_type(), "MotifDetected");
    }

    #[test]
    fn test_anomaly_detected_event() {
        let event = AnomalyDetected::new(
            EmbeddingId::new(),
            0.9,
            AnomalyType::Novel,
            ClusterId::new(),
            2.5,
        );
        assert_eq!(event.anomaly_type, AnomalyType::Novel);
        assert_eq!(event.event_type(), "AnomalyDetected");
    }
}

View File

@@ -0,0 +1,14 @@
//! Domain layer for the Analysis bounded context.
//!
//! Contains core domain entities, value objects, repository traits, and domain events.
pub mod entities;
pub mod events;
pub mod repository;
pub mod value_objects;
// Re-export commonly used types
pub use entities::*;
pub use events::*;
pub use repository::*;
pub use value_objects::*;

View File

@@ -0,0 +1,290 @@
//! Repository traits for the Analysis bounded context.
//!
//! These traits define the persistence interfaces for domain entities.
//! Implementations are provided in the infrastructure layer.
use async_trait::async_trait;
use thiserror::Error;
use super::entities::{
Anomaly, Cluster, ClusterId, EmbeddingId, Motif, Prototype, RecordingId, SequenceAnalysis,
};
/// Errors that can occur during repository operations.
///
/// Shared by every repository trait in this module; the `Display`
/// representations come from the `thiserror` `#[error]` attributes.
#[derive(Debug, Error)]
pub enum RepositoryError {
    /// Entity not found.
    #[error("Entity not found: {0}")]
    NotFound(String),
    /// Duplicate entity.
    #[error("Duplicate entity: {0}")]
    Duplicate(String),
    /// Database connection error.
    #[error("Connection error: {0}")]
    ConnectionError(String),
    /// Query execution error.
    #[error("Query error: {0}")]
    QueryError(String),
    /// Serialization/deserialization error.
    #[error("Serialization error: {0}")]
    SerializationError(String),
    /// Invalid data error.
    #[error("Invalid data: {0}")]
    InvalidData(String),
    /// Concurrency conflict.
    #[error("Concurrency conflict: {0}")]
    ConcurrencyError(String),
    /// Internal error.
    #[error("Internal error: {0}")]
    Internal(String),
}
/// Result type for repository operations.
///
/// NOTE(review): this alias is re-exported via `pub use repository::*` in
/// the domain module, where it shadows `std::result::Result` for glob
/// importers — confirm that is intentional.
pub type Result<T> = std::result::Result<T, RepositoryError>;
/// Repository for cluster persistence.
///
/// Implementations live in the infrastructure layer; every method reports
/// failures through the module-local [`Result`] / [`RepositoryError`].
#[async_trait]
pub trait ClusterRepository: Send + Sync {
    /// Save a cluster to the repository.
    async fn save_cluster(&self, cluster: &Cluster) -> Result<()>;
    /// Save multiple clusters in a batch.
    async fn save_clusters(&self, clusters: &[Cluster]) -> Result<()>;
    /// Find a cluster by its ID.
    async fn find_cluster(&self, id: &ClusterId) -> Result<Option<Cluster>>;
    /// List all clusters.
    async fn list_clusters(&self) -> Result<Vec<Cluster>>;
    /// List clusters with pagination: skip `offset` items, return at most `limit`.
    async fn list_clusters_paginated(
        &self,
        offset: usize,
        limit: usize,
    ) -> Result<Vec<Cluster>>;
    /// Assign an embedding to a cluster.
    async fn assign_to_cluster(
        &self,
        embedding_id: &EmbeddingId,
        cluster_id: &ClusterId,
    ) -> Result<()>;
    /// Remove an embedding from its cluster.
    async fn remove_from_cluster(&self, embedding_id: &EmbeddingId) -> Result<()>;
    /// Find the cluster containing a specific embedding.
    async fn find_cluster_by_embedding(
        &self,
        embedding_id: &EmbeddingId,
    ) -> Result<Option<Cluster>>;
    /// Delete a cluster.
    async fn delete_cluster(&self, id: &ClusterId) -> Result<()>;
    /// Delete all clusters.
    async fn delete_all_clusters(&self) -> Result<()>;
    /// Get cluster count.
    async fn cluster_count(&self) -> Result<usize>;
    /// Find clusters by label pattern.
    /// NOTE(review): pattern semantics (exact / substring / SQL LIKE) are
    /// not specified here — confirm against the infrastructure implementation.
    async fn find_clusters_by_label(&self, label_pattern: &str) -> Result<Vec<Cluster>>;
    /// Update cluster label (presumably `None` clears it — verify the impl).
    async fn update_cluster_label(
        &self,
        id: &ClusterId,
        label: Option<String>,
    ) -> Result<()>;
}
/// Repository for prototype persistence.
///
/// Prototypes are keyed by their owning cluster; all methods report
/// failures through the module-local [`Result`] / [`RepositoryError`].
#[async_trait]
pub trait PrototypeRepository: Send + Sync {
    /// Save a prototype.
    async fn save_prototype(&self, prototype: &Prototype) -> Result<()>;
    /// Save multiple prototypes in a batch.
    async fn save_prototypes(&self, prototypes: &[Prototype]) -> Result<()>;
    /// Find all prototypes for a cluster.
    async fn find_prototypes_by_cluster(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Vec<Prototype>>;
    /// Find the best prototype for a cluster; `Ok(None)` when the cluster
    /// has no prototypes.
    async fn find_best_prototype(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Option<Prototype>>;
    /// Delete all prototypes for a cluster.
    async fn delete_prototypes_by_cluster(&self, cluster_id: &ClusterId) -> Result<()>;
    /// Delete all prototypes.
    async fn delete_all_prototypes(&self) -> Result<()>;
}
/// Repository for motif persistence.
///
/// Motifs are keyed by their string ID; all methods report failures
/// through the module-local [`Result`] / [`RepositoryError`].
#[async_trait]
pub trait MotifRepository: Send + Sync {
    /// Save a motif.
    async fn save_motif(&self, motif: &Motif) -> Result<()>;
    /// Save multiple motifs in a batch.
    async fn save_motifs(&self, motifs: &[Motif]) -> Result<()>;
    /// Find a motif by its ID.
    async fn find_motif(&self, id: &str) -> Result<Option<Motif>>;
    /// Find motifs whose defining sequence contains a specific cluster.
    async fn find_motifs_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Motif>>;
    /// List all motifs.
    async fn list_motifs(&self) -> Result<Vec<Motif>>;
    /// List motifs with at least `min_confidence`.
    async fn find_motifs_by_confidence(&self, min_confidence: f32) -> Result<Vec<Motif>>;
    /// List motifs with at least `min_occurrences`.
    async fn find_motifs_by_occurrences(&self, min_occurrences: usize) -> Result<Vec<Motif>>;
    /// Delete a motif.
    async fn delete_motif(&self, id: &str) -> Result<()>;
    /// Delete all motifs.
    async fn delete_all_motifs(&self) -> Result<()>;
    /// Get motif count.
    async fn motif_count(&self) -> Result<usize>;
    /// Find motifs by sequence pattern (exact match).
    async fn find_motifs_by_sequence(&self, sequence: &[ClusterId]) -> Result<Vec<Motif>>;
    /// Find motifs by sequence pattern (subsequence match).
    async fn find_motifs_containing_subsequence(
        &self,
        subsequence: &[ClusterId],
    ) -> Result<Vec<Motif>>;
}
/// Repository for sequence analysis persistence.
///
/// Analyses are keyed by recording (one per recording, judging by the
/// `Option` return of the lookup); failures are reported through the
/// module-local [`Result`] / [`RepositoryError`].
#[async_trait]
pub trait SequenceRepository: Send + Sync {
    /// Save a sequence analysis.
    async fn save_sequence_analysis(&self, analysis: &SequenceAnalysis) -> Result<()>;
    /// Find the sequence analysis for a recording.
    async fn find_sequence_by_recording(
        &self,
        recording_id: &RecordingId,
    ) -> Result<Option<SequenceAnalysis>>;
    /// List all sequence analyses.
    async fn list_sequence_analyses(&self) -> Result<Vec<SequenceAnalysis>>;
    /// Delete the sequence analysis for a recording.
    async fn delete_sequence_by_recording(&self, recording_id: &RecordingId) -> Result<()>;
    /// Delete all sequence analyses.
    async fn delete_all_sequences(&self) -> Result<()>;
    /// Find sequences with entropy at or above `min_entropy`.
    async fn find_sequences_by_entropy(&self, min_entropy: f32) -> Result<Vec<SequenceAnalysis>>;
    /// Find sequences with stereotypy at or above `min_stereotypy`.
    async fn find_sequences_by_stereotypy(
        &self,
        min_stereotypy: f32,
    ) -> Result<Vec<SequenceAnalysis>>;
}
/// Repository for anomaly persistence.
///
/// Anomalies are keyed by the `EmbeddingId` of the flagged embedding
/// (NOTE(review): saving twice for the same ID is assumed to overwrite —
/// confirm against implementations).
#[async_trait]
pub trait AnomalyRepository: Send + Sync {
    /// Save an anomaly.
    async fn save_anomaly(&self, anomaly: &Anomaly) -> Result<()>;
    /// Save multiple anomalies in a batch.
    async fn save_anomalies(&self, anomalies: &[Anomaly]) -> Result<()>;
    /// Find an anomaly by embedding ID; `Ok(None)` when absent.
    async fn find_anomaly(&self, embedding_id: &EmbeddingId) -> Result<Option<Anomaly>>;
    /// List all anomalies.
    async fn list_anomalies(&self) -> Result<Vec<Anomaly>>;
    /// Find anomalies with score above threshold.
    async fn find_anomalies_by_score(&self, min_score: f32) -> Result<Vec<Anomaly>>;
    /// Find anomalies near a specific cluster.
    async fn find_anomalies_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Anomaly>>;
    /// Delete an anomaly.
    async fn delete_anomaly(&self, embedding_id: &EmbeddingId) -> Result<()>;
    /// Delete all anomalies.
    async fn delete_all_anomalies(&self) -> Result<()>;
    /// Get anomaly count.
    async fn anomaly_count(&self) -> Result<usize>;
}
/// Combined repository for all analysis entities.
///
/// This trait combines all individual repositories for convenience
/// when a single interface to all analysis data is needed.
#[async_trait]
pub trait AnalysisRepository:
    ClusterRepository + PrototypeRepository + MotifRepository + SequenceRepository + AnomalyRepository
{
    /// Clear all analysis data.
    ///
    /// The default implementation issues the five `delete_all_*` calls in
    /// sequence and aborts at the first failure. NOTE(review): this is not
    /// atomic — an error part-way through leaves the store partially
    /// cleared; run inside a `UnitOfWork` transaction if that matters.
    async fn clear_all(&self) -> Result<()> {
        self.delete_all_clusters().await?;
        self.delete_all_prototypes().await?;
        self.delete_all_motifs().await?;
        self.delete_all_sequences().await?;
        self.delete_all_anomalies().await?;
        Ok(())
    }
}
/// Unit of work for transactional operations.
///
/// Typical flow: call [`UnitOfWork::begin`] to obtain a transactional
/// repository, perform operations on it, then [`UnitOfWork::commit`] or
/// [`UnitOfWork::rollback`].
#[async_trait]
pub trait UnitOfWork: Send + Sync {
    /// Type of repository returned by this unit of work.
    type Repository: AnalysisRepository;
    /// Begin a new transaction and return a repository.
    async fn begin(&self) -> Result<Self::Repository>;
    /// Commit the current transaction.
    async fn commit(&self) -> Result<()>;
    /// Rollback the current transaction.
    async fn rollback(&self) -> Result<()>;
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Error variants should surface their payload in the rendered message.
    #[test]
    fn test_repository_error_display() {
        let not_found = RepositoryError::NotFound("cluster-123".to_string());
        assert!(not_found.to_string().contains("cluster-123"));
        let query = RepositoryError::QueryError("syntax error".to_string());
        assert!(query.to_string().contains("syntax error"));
    }
}

View File

@@ -0,0 +1,616 @@
//! Value objects for the Analysis bounded context.
//!
//! Value objects are immutable objects that represent concepts without identity.
//! They are defined by their attributes rather than a unique identifier.
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use super::entities::ClusterId;
/// Method used for clustering embeddings.
///
/// Serialized with serde, so variant and field names are part of any
/// persisted configuration format — rename with care.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ClusteringMethod {
    /// HDBSCAN (Hierarchical Density-Based Spatial Clustering).
    /// Good for discovering clusters of varying densities and shapes.
    /// Needs no preset cluster count; unassigned points become noise.
    HDBSCAN,
    /// K-Means clustering with fixed number of clusters.
    KMeans {
        /// Number of clusters to create.
        k: usize,
    },
    /// Spectral clustering using eigenvalues of similarity matrix.
    Spectral {
        /// Number of clusters to create.
        n_clusters: usize,
    },
    /// Agglomerative hierarchical clustering.
    Agglomerative {
        /// Number of clusters to create.
        n_clusters: usize,
        /// Linkage criterion (ward, complete, average, single).
        linkage: LinkageMethod,
    },
}
impl Default for ClusteringMethod {
    /// HDBSCAN is the default: it requires no preset cluster count.
    fn default() -> Self {
        Self::HDBSCAN
    }
}
impl std::fmt::Display for ClusteringMethod {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ClusteringMethod::HDBSCAN => write!(f, "HDBSCAN"),
ClusteringMethod::KMeans { k } => write!(f, "K-Means (k={})", k),
ClusteringMethod::Spectral { n_clusters } => {
write!(f, "Spectral (n={})", n_clusters)
}
ClusteringMethod::Agglomerative { n_clusters, linkage } => {
write!(f, "Agglomerative (n={}, {:?})", n_clusters, linkage)
}
}
}
}
/// Linkage method for agglomerative clustering.
///
/// Determines how the distance between two clusters is derived from the
/// distances between their member points.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkageMethod {
    /// Ward's minimum variance method.
    Ward,
    /// Complete linkage (maximum distance).
    Complete,
    /// Average linkage (mean distance).
    Average,
    /// Single linkage (minimum distance).
    Single,
}
impl Default for LinkageMethod {
    /// Ward linkage is the default.
    fn default() -> Self {
        Self::Ward
    }
}
/// Distance metric for clustering.
///
/// Interpreted by the individual clusterers; see their `distance`
/// implementations for the exact formulas used.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DistanceMetric {
    /// Euclidean distance (L2 norm).
    Euclidean,
    /// Cosine distance (1 - cosine similarity).
    Cosine,
    /// Manhattan distance (L1 norm).
    Manhattan,
    /// Poincare distance (hyperbolic space).
    Poincare,
}
impl Default for DistanceMetric {
    /// Cosine distance is the default (a common choice for embeddings).
    fn default() -> Self {
        Self::Cosine
    }
}
impl std::fmt::Display for DistanceMetric {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DistanceMetric::Euclidean => write!(f, "Euclidean"),
DistanceMetric::Cosine => write!(f, "Cosine"),
DistanceMetric::Manhattan => write!(f, "Manhattan"),
DistanceMetric::Poincare => write!(f, "Poincare"),
}
}
}
/// Parameters for clustering algorithms.
///
/// A single parameter bag shared by all algorithms; fields irrelevant to
/// the selected [`ClusteringMethod`] are expected to be ignored by it
/// (e.g. `min_cluster_size`/`min_samples` are HDBSCAN-oriented).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusteringParameters {
    /// Minimum number of points to form a cluster (HDBSCAN).
    pub min_cluster_size: usize,
    /// Minimum number of samples in neighborhood (HDBSCAN).
    pub min_samples: usize,
    /// Epsilon for DBSCAN-like algorithms (optional distance threshold).
    pub epsilon: Option<f32>,
    /// Distance metric to use.
    pub metric: DistanceMetric,
    /// Maximum number of clusters (optional limit).
    pub max_clusters: Option<usize>,
    /// Whether to allow single-point clusters.
    pub allow_single_cluster: bool,
}
impl Default for ClusteringParameters {
    /// Conservative HDBSCAN-style defaults: cosine metric, small density
    /// minimums, no epsilon cutoff and no cluster-count cap.
    fn default() -> Self {
        Self {
            min_cluster_size: 5,
            min_samples: 3,
            epsilon: None,
            metric: DistanceMetric::Cosine,
            max_clusters: None,
            allow_single_cluster: false,
        }
    }
}
impl ClusteringParameters {
    /// Create parameters for HDBSCAN.
    ///
    /// Only the two density knobs change; everything else keeps its default.
    #[must_use]
    pub fn hdbscan(min_cluster_size: usize, min_samples: usize) -> Self {
        let mut params = Self::default();
        params.min_cluster_size = min_cluster_size;
        params.min_samples = min_samples;
        params
    }
    /// Create parameters for K-means.
    ///
    /// K-means has no density requirements, so both minimums drop to 1 and
    /// single-point clusters are permitted.
    #[must_use]
    pub fn kmeans() -> Self {
        let mut params = Self::default();
        params.min_cluster_size = 1;
        params.min_samples = 1;
        params.allow_single_cluster = true;
        params
    }
    /// Set the distance metric.
    #[must_use]
    pub fn with_metric(self, metric: DistanceMetric) -> Self {
        Self { metric, ..self }
    }
    /// Set the epsilon threshold.
    #[must_use]
    pub fn with_epsilon(self, epsilon: f32) -> Self {
        Self {
            epsilon: Some(epsilon),
            ..self
        }
    }
}
/// Configuration for clustering operations.
///
/// Combines the algorithm choice with its parameters and the optional
/// post-processing steps (prototype extraction, silhouette scoring).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusteringConfig {
    /// The clustering method to use.
    pub method: ClusteringMethod,
    /// Parameters for the clustering algorithm.
    pub parameters: ClusteringParameters,
    /// Whether to compute cluster prototypes.
    pub compute_prototypes: bool,
    /// Number of prototypes to compute per cluster.
    pub prototypes_per_cluster: usize,
    /// Whether to compute silhouette scores.
    pub compute_silhouette: bool,
    /// Random seed for reproducibility.
    pub random_seed: Option<u64>,
}
impl Default for ClusteringConfig {
    /// Defaults to HDBSCAN with default parameters, three prototypes per
    /// cluster, silhouette scoring on, and no fixed random seed.
    fn default() -> Self {
        Self {
            method: ClusteringMethod::HDBSCAN,
            parameters: ClusteringParameters::default(),
            compute_prototypes: true,
            prototypes_per_cluster: 3,
            compute_silhouette: true,
            random_seed: None,
        }
    }
}
impl ClusteringConfig {
    /// Create a HDBSCAN configuration with the given density parameters.
    #[must_use]
    pub fn hdbscan(min_cluster_size: usize, min_samples: usize) -> Self {
        let mut config = Self::default();
        config.method = ClusteringMethod::HDBSCAN;
        config.parameters = ClusteringParameters::hdbscan(min_cluster_size, min_samples);
        config
    }
    /// Create a K-means configuration with `k` clusters.
    #[must_use]
    pub fn kmeans(k: usize) -> Self {
        let mut config = Self::default();
        config.method = ClusteringMethod::KMeans { k };
        config.parameters = ClusteringParameters::kmeans();
        config
    }
    /// Set a random seed for reproducibility.
    #[must_use]
    pub fn with_seed(self, seed: u64) -> Self {
        Self {
            random_seed: Some(seed),
            ..self
        }
    }
}
/// Configuration for motif detection.
///
/// Lengths and gaps are measured in clusters (sequence elements).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MotifConfig {
    /// Minimum length of motifs to detect.
    pub min_length: usize,
    /// Maximum length of motifs to detect.
    pub max_length: usize,
    /// Minimum number of occurrences for a motif.
    pub min_occurrences: usize,
    /// Minimum confidence threshold for motifs.
    pub min_confidence: f32,
    /// Whether to allow overlapping occurrences.
    pub allow_overlap: bool,
    /// Maximum gap (in clusters) between motif elements.
    pub max_gap: usize,
}
impl Default for MotifConfig {
    /// Middle-of-the-road defaults between [`MotifConfig::strict`] and
    /// [`MotifConfig::relaxed`]: contiguous, non-overlapping motifs of
    /// length 2-10 seen at least 3 times.
    fn default() -> Self {
        Self {
            min_length: 2,
            max_length: 10,
            min_occurrences: 3,
            min_confidence: 0.5,
            allow_overlap: false,
            max_gap: 0,
        }
    }
}
impl MotifConfig {
    /// Create a strict motif configuration (no gaps, no overlap).
    ///
    /// Higher occurrence/confidence bars and a narrower length range than
    /// the defaults; suitable when false positives are costly.
    #[must_use]
    pub fn strict() -> Self {
        Self {
            min_length: 3,
            max_length: 8,
            min_occurrences: 5,
            min_confidence: 0.7,
            ..Self::default()
        }
    }
    /// Create a relaxed motif configuration (allows gaps).
    ///
    /// Lower thresholds, longer motifs, overlapping occurrences and gaps of
    /// up to two clusters; suitable for exploratory mining.
    #[must_use]
    pub fn relaxed() -> Self {
        let mut config = Self::default();
        config.max_length = 15;
        config.min_occurrences = 2;
        config.min_confidence = 0.3;
        config.allow_overlap = true;
        config.max_gap = 2;
        config
    }
    /// Set the length range.
    #[must_use]
    pub fn with_length_range(self, min: usize, max: usize) -> Self {
        Self {
            min_length: min,
            max_length: max,
            ..self
        }
    }
}
/// Metrics computed from sequence analysis.
///
/// Entropy-derived fields are related: `stereotypy` is defined as
/// `1 - normalized_entropy`, so a perfectly repetitive sequence scores 1.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceMetrics {
    /// Shannon entropy of the sequence.
    pub entropy: f32,
    /// Normalized entropy (entropy / max_entropy).
    pub normalized_entropy: f32,
    /// Stereotypy score (1 - normalized_entropy).
    pub stereotypy: f32,
    /// Number of unique clusters in the sequence.
    pub unique_clusters: usize,
    /// Number of unique transitions in the sequence.
    pub unique_transitions: usize,
    /// Total number of transitions.
    pub total_transitions: usize,
    /// Most common transition and its probability.
    pub dominant_transition: Option<(ClusterId, ClusterId, f32)>,
    /// Repetition rate (self-transitions / total).
    pub repetition_rate: f32,
}
impl Default for SequenceMetrics {
    /// Zero metrics for an empty sequence. Stereotypy defaults to 1.0,
    /// consistent with `stereotypy = 1 - normalized_entropy` at zero entropy.
    fn default() -> Self {
        Self {
            entropy: 0.0,
            normalized_entropy: 0.0,
            stereotypy: 1.0,
            unique_clusters: 0,
            unique_transitions: 0,
            total_transitions: 0,
            dominant_transition: None,
            repetition_rate: 0.0,
        }
    }
}
/// Transition matrix for Markov chain analysis.
///
/// Represents the probabilities of transitioning from one cluster to another.
///
/// Invariant: after `compute_probabilities`, each row with at least one
/// observation sums to 1.0; rows without observations stay all-zero.
/// `index_map` is not serialized — call `rebuild_index_map` after
/// deserializing before using any by-ID lookups.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransitionMatrix {
    /// Ordered list of cluster IDs (defines row/column indices).
    pub cluster_ids: Vec<ClusterId>,
    /// Transition probabilities (row = source, column = target).
    /// Values are probabilities (0.0 to 1.0, rows sum to 1.0).
    pub probabilities: Vec<Vec<f32>>,
    /// Raw observation counts (row = source, column = target).
    pub observations: Vec<Vec<u32>>,
    /// Mapping from ClusterId to matrix index.
    #[serde(skip)]
    index_map: HashMap<ClusterId, usize>,
}
impl TransitionMatrix {
/// Create a new transition matrix for the given clusters.
#[must_use]
pub fn new(cluster_ids: Vec<ClusterId>) -> Self {
let n = cluster_ids.len();
let index_map: HashMap<ClusterId, usize> = cluster_ids
.iter()
.enumerate()
.map(|(i, id)| (*id, i))
.collect();
Self {
cluster_ids,
probabilities: vec![vec![0.0; n]; n],
observations: vec![vec![0; n]; n],
index_map,
}
}
/// Get the number of clusters (states) in the matrix.
#[must_use]
pub fn size(&self) -> usize {
self.cluster_ids.len()
}
/// Get the index for a cluster ID.
#[must_use]
pub fn index_of(&self, cluster_id: &ClusterId) -> Option<usize> {
self.index_map.get(cluster_id).copied()
}
/// Record an observed transition.
pub fn record_transition(&mut self, from: &ClusterId, to: &ClusterId) {
if let (Some(i), Some(j)) = (self.index_of(from), self.index_of(to)) {
self.observations[i][j] += 1;
}
}
/// Compute probabilities from observation counts.
pub fn compute_probabilities(&mut self) {
for i in 0..self.size() {
let row_sum: u32 = self.observations[i].iter().sum();
if row_sum > 0 {
for j in 0..self.size() {
self.probabilities[i][j] = self.observations[i][j] as f32 / row_sum as f32;
}
}
}
}
/// Get the transition probability from one cluster to another.
#[must_use]
pub fn probability(&self, from: &ClusterId, to: &ClusterId) -> Option<f32> {
match (self.index_of(from), self.index_of(to)) {
(Some(i), Some(j)) => Some(self.probabilities[i][j]),
_ => None,
}
}
/// Get the observation count for a transition.
#[must_use]
pub fn observation_count(&self, from: &ClusterId, to: &ClusterId) -> Option<u32> {
match (self.index_of(from), self.index_of(to)) {
(Some(i), Some(j)) => Some(self.observations[i][j]),
_ => None,
}
}
/// Get all non-zero transitions as (from, to, probability) tuples.
#[must_use]
pub fn non_zero_transitions(&self) -> Vec<(ClusterId, ClusterId, f32)> {
let mut transitions = Vec::new();
for (i, from) in self.cluster_ids.iter().enumerate() {
for (j, to) in self.cluster_ids.iter().enumerate() {
let prob = self.probabilities[i][j];
if prob > 0.0 {
transitions.push((*from, *to, prob));
}
}
}
transitions
}
/// Get the stationary distribution (eigenvector of eigenvalue 1).
/// Returns None if the matrix is not ergodic.
#[must_use]
pub fn stationary_distribution(&self) -> Option<Vec<f32>> {
// Power iteration method for finding stationary distribution
let n = self.size();
if n == 0 {
return None;
}
let mut dist = vec![1.0 / n as f32; n];
let max_iterations = 1000;
let tolerance = 1e-8;
for _ in 0..max_iterations {
let mut new_dist = vec![0.0; n];
// Matrix-vector multiplication: new_dist = dist * P^T
for j in 0..n {
for i in 0..n {
new_dist[j] += dist[i] * self.probabilities[i][j];
}
}
// Check convergence
let diff: f32 = dist
.iter()
.zip(new_dist.iter())
.map(|(a, b)| (a - b).abs())
.sum();
dist = new_dist;
if diff < tolerance {
return Some(dist);
}
}
Some(dist)
}
/// Rebuild the index map (needed after deserialization).
pub fn rebuild_index_map(&mut self) {
self.index_map = self
.cluster_ids
.iter()
.enumerate()
.map(|(i, id)| (*id, i))
.collect();
}
}
/// Result of a clustering operation.
///
/// Bundles the discovered clusters with optional quality scores and the
/// configuration that produced them, so results are self-describing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusteringResult {
    /// The clusters discovered.
    pub clusters: Vec<super::entities::Cluster>,
    /// Embeddings classified as noise (HDBSCAN).
    pub noise: Vec<super::entities::EmbeddingId>,
    /// Silhouette score (if computed).
    pub silhouette_score: Option<f32>,
    /// V-measure score (if ground truth available).
    pub v_measure: Option<f32>,
    /// Prototypes for each cluster.
    pub prototypes: Vec<super::entities::Prototype>,
    /// Parameters used for clustering.
    pub parameters: ClusteringParameters,
    /// Method used for clustering.
    pub method: ClusteringMethod,
}
impl ClusteringResult {
    /// Get the number of clusters (excluding noise).
    #[must_use]
    pub fn cluster_count(&self) -> usize {
        self.clusters.len()
    }
    /// Get the noise rate (proportion of points in noise).
    ///
    /// Returns 0.0 when the result contains no points at all.
    #[must_use]
    pub fn noise_rate(&self) -> f32 {
        let clustered: usize = self.clusters.iter().map(|c| c.member_count()).sum();
        let total = clustered + self.noise.len();
        if total == 0 {
            return 0.0;
        }
        self.noise.len() as f32 / total as f32
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_clustering_config_creation() {
        let config = ClusteringConfig::hdbscan(10, 5);
        assert!(matches!(config.method, ClusteringMethod::HDBSCAN));
        let params = &config.parameters;
        assert_eq!((params.min_cluster_size, params.min_samples), (10, 5));
    }
    #[test]
    fn test_transition_matrix() {
        let (a, b, c) = (ClusterId::new(), ClusterId::new(), ClusterId::new());
        let mut matrix = TransitionMatrix::new(vec![a, b, c]);
        // Observe a->b twice, a->c once, b->a once.
        for _ in 0..2 {
            matrix.record_transition(&a, &b);
        }
        matrix.record_transition(&a, &c);
        matrix.record_transition(&b, &a);
        matrix.compute_probabilities();
        let close = |x: f32, y: f32| (x - y).abs() < 0.001;
        // Row `a` splits 2/3 vs 1/3; row `b` is deterministic.
        assert!(close(matrix.probability(&a, &b).unwrap(), 2.0 / 3.0));
        assert!(close(matrix.probability(&a, &c).unwrap(), 1.0 / 3.0));
        assert!(close(matrix.probability(&b, &a).unwrap(), 1.0));
    }
    #[test]
    fn test_motif_config() {
        let strict = MotifConfig::strict();
        assert_eq!((strict.min_length, strict.min_occurrences), (3, 5));
        assert!(!strict.allow_overlap);
        let relaxed = MotifConfig::relaxed();
        assert!(relaxed.allow_overlap);
        assert_eq!(relaxed.max_gap, 2);
    }
    #[test]
    fn test_distance_metric_display() {
        assert_eq!(DistanceMetric::Cosine.to_string(), "Cosine");
        assert_eq!(DistanceMetric::Euclidean.to_string(), "Euclidean");
    }
}

View File

@@ -0,0 +1,404 @@
//! HDBSCAN clustering implementation.
//!
//! Hierarchical Density-Based Spatial Clustering of Applications with Noise.
//! This implementation uses core distance and mutual reachability distance
//! to build a minimum spanning tree and extract clusters.
use ndarray::{Array2, ArrayView1};
use petgraph::graph::{NodeIndex, UnGraph};
use petgraph::algo::min_spanning_tree;
use petgraph::data::FromElements;
use std::collections::{HashMap, HashSet};
use tracing::{debug, instrument};
use crate::application::services::AnalysisError;
use crate::domain::value_objects::DistanceMetric;
/// HDBSCAN clustering algorithm.
///
/// Built from pairwise distances -> core distances -> mutual reachability
/// -> minimum spanning tree, with a simplified threshold-cut extraction
/// (see `fit`). Memory use is O(n^2) in the number of samples.
pub struct HdbscanClusterer {
    /// Minimum cluster size.
    min_cluster_size: usize,
    /// Minimum samples for core point determination.
    min_samples: usize,
    /// Distance metric to use.
    metric: DistanceMetric,
}
impl HdbscanClusterer {
    /// Create a new HDBSCAN clusterer.
    ///
    /// * `min_cluster_size` - smallest connected component kept as a
    ///   cluster; smaller components are labelled noise.
    /// * `min_samples` - neighborhood size used for the core distance
    ///   (the k in "distance to the k-th nearest neighbor").
    /// * `metric` - distance metric applied to the raw feature vectors.
    #[must_use]
    pub fn new(min_cluster_size: usize, min_samples: usize, metric: DistanceMetric) -> Self {
        Self {
            min_cluster_size,
            min_samples,
            metric,
        }
    }
    /// Fit HDBSCAN to the data and return cluster labels.
    ///
    /// # Arguments
    ///
    /// * `data` - 2D array where rows are samples and columns are features
    ///
    /// # Returns
    ///
    /// Vector of cluster labels (-1 for noise).
    ///
    /// # Errors
    ///
    /// Returns `AnalysisError::InsufficientData` when there are fewer
    /// samples than `min_cluster_size`.
    ///
    /// Note: full n x n distance and mutual-reachability matrices are
    /// materialized, so memory is O(n^2) in the number of samples.
    #[instrument(skip(self, data), fields(n_samples = data.nrows(), n_features = data.ncols()))]
    pub fn fit(&self, data: &Array2<f32>) -> Result<Vec<i32>, AnalysisError> {
        let n = data.nrows();
        if n < self.min_cluster_size {
            return Err(AnalysisError::InsufficientData(format!(
                "Need at least {} samples, got {}",
                self.min_cluster_size, n
            )));
        }
        debug!(
            n_samples = n,
            min_cluster_size = self.min_cluster_size,
            min_samples = self.min_samples,
            "Starting HDBSCAN fit"
        );
        // Step 1: Compute pairwise distances
        let distances = self.compute_pairwise_distances(data);
        // Step 2: Compute core distances
        let core_distances = self.compute_core_distances(&distances);
        // Step 3: Compute mutual reachability distances
        let mrd = self.compute_mutual_reachability(&distances, &core_distances);
        // Step 4: Build minimum spanning tree
        let mst = self.build_mst(&mrd);
        // Step 5: Extract flat clusters from the MST (simplified threshold
        // cut rather than the full condensed-tree extraction).
        let labels = self.extract_clusters(&mst, n);
        debug!(
            n_clusters = labels.iter().filter(|&&l| l >= 0).collect::<HashSet<_>>().len(),
            n_noise = labels.iter().filter(|&&l| l < 0).count(),
            "HDBSCAN fit completed"
        );
        Ok(labels)
    }
    /// Compute the symmetric pairwise distance matrix (zero diagonal).
    fn compute_pairwise_distances(&self, data: &Array2<f32>) -> Array2<f32> {
        let n = data.nrows();
        let mut distances = Array2::<f32>::zeros((n, n));
        for i in 0..n {
            // Only the upper triangle is computed; the value is mirrored.
            for j in (i + 1)..n {
                let dist = self.distance(data.row(i), data.row(j));
                distances[[i, j]] = dist;
                distances[[j, i]] = dist;
            }
        }
        distances
    }
    /// Compute core distance for each point (k-th nearest neighbor distance).
    fn compute_core_distances(&self, distances: &Array2<f32>) -> Vec<f32> {
        let n = distances.nrows();
        // Clamp k so it never exceeds the number of other points.
        // NOTE(review): assumes n >= 1; `fit` guarantees this whenever
        // min_cluster_size >= 1.
        let k = self.min_samples.min(n - 1);
        let mut core_distances = Vec::with_capacity(n);
        for i in 0..n {
            let mut row_distances: Vec<f32> = distances.row(i).to_vec();
            row_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
            // k-th nearest neighbor (index k because index 0 is self with distance 0)
            let core_dist = row_distances.get(k).copied().unwrap_or(f32::MAX);
            core_distances.push(core_dist);
        }
        core_distances
    }
    /// Compute mutual reachability distance matrix:
    /// mrd(a, b) = max(core(a), core(b), d(a, b)).
    fn compute_mutual_reachability(
        &self,
        distances: &Array2<f32>,
        core_distances: &[f32],
    ) -> Array2<f32> {
        let n = distances.nrows();
        let mut mrd = Array2::<f32>::zeros((n, n));
        for i in 0..n {
            for j in (i + 1)..n {
                let d = distances[[i, j]];
                let mr = core_distances[i].max(core_distances[j]).max(d);
                mrd[[i, j]] = mr;
                mrd[[j, i]] = mr;
            }
        }
        mrd
    }
    /// Build minimum spanning tree from mutual reachability distances.
    ///
    /// Returns the MST as `(node_a, node_b, weight)` edges.
    fn build_mst(&self, mrd: &Array2<f32>) -> Vec<(usize, usize, f32)> {
        let n = mrd.nrows();
        // Build graph with all edges
        let mut graph = UnGraph::<usize, f32>::new_undirected();
        // Add nodes
        let nodes: Vec<NodeIndex> = (0..n).map(|i| graph.add_node(i)).collect();
        // Add edges (only upper triangle to avoid duplicates)
        for i in 0..n {
            for j in (i + 1)..n {
                let weight = mrd[[i, j]];
                if weight < f32::MAX {
                    graph.add_edge(nodes[i], nodes[j], weight);
                }
            }
        }
        // Delegate MST construction to petgraph's `min_spanning_tree`.
        let mst_graph = UnGraph::<usize, f32>::from_elements(min_spanning_tree(&graph));
        // Extract edges from MST
        let mut edges: Vec<(usize, usize, f32)> = mst_graph
            .edge_indices()
            .filter_map(|e| {
                let (a, b) = mst_graph.edge_endpoints(e)?;
                let weight = *mst_graph.edge_weight(e)?;
                let a_val = *mst_graph.node_weight(a)?;
                let b_val = *mst_graph.node_weight(b)?;
                Some((a_val, b_val, weight))
            })
            .collect();
        // Sort by weight descending. NOTE(review): the threshold-cut
        // extraction below does not depend on edge order; presumably kept
        // for a future condensed-tree implementation.
        edges.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
        edges
    }
    /// Extract flat clusters from MST using HDBSCAN* algorithm.
    ///
    /// Simplified variant: edges heavier than an adaptive threshold are cut
    /// and each remaining connected component of at least `min_cluster_size`
    /// points becomes a cluster; everything else stays noise (-1).
    fn extract_clusters(&self, mst: &[(usize, usize, f32)], n: usize) -> Vec<i32> {
        // Use simplified cluster extraction based on edge cutting
        // This is a simplified version - full HDBSCAN uses condensed tree
        let mut labels = vec![-1i32; n];
        let mut current_cluster = 0i32;
        // Build adjacency from MST
        let mut adj: HashMap<usize, Vec<(usize, f32)>> = HashMap::new();
        for &(a, b, w) in mst {
            adj.entry(a).or_default().push((b, w));
            adj.entry(b).or_default().push((a, w));
        }
        // Find connected components, removing edges above threshold
        // Use adaptive threshold based on edge weight distribution
        let threshold = self.compute_threshold(mst);
        let mut visited = vec![false; n];
        for start in 0..n {
            if visited[start] {
                continue;
            }
            // Depth-first traversal (the Vec is used as a stack, so this is
            // DFS, not BFS) to collect the connected component. Either order
            // yields the same component.
            let mut component = Vec::new();
            let mut queue = vec![start];
            while let Some(node) = queue.pop() {
                if visited[node] {
                    continue;
                }
                visited[node] = true;
                component.push(node);
                if let Some(neighbors) = adj.get(&node) {
                    for &(neighbor, weight) in neighbors {
                        // Edges at or above the threshold are treated as cut.
                        if !visited[neighbor] && weight < threshold {
                            queue.push(neighbor);
                        }
                    }
                }
            }
            // Only assign cluster label if component is large enough
            if component.len() >= self.min_cluster_size {
                for &node in &component {
                    labels[node] = current_cluster;
                }
                current_cluster += 1;
            }
        }
        labels
    }
    /// Compute adaptive threshold for edge cutting.
    ///
    /// Uses the Tukey outlier boundary Q3 + 1.5 * IQR over the MST edge
    /// weights: heavier edges are treated as inter-cluster links.
    fn compute_threshold(&self, mst: &[(usize, usize, f32)]) -> f32 {
        if mst.is_empty() {
            // No edges: never cut anything.
            return f32::MAX;
        }
        let weights: Vec<f32> = mst.iter().map(|&(_, _, w)| w).collect();
        let n = weights.len();
        // Use median + IQR method for threshold
        let mut sorted = weights.clone();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let _median = sorted[n / 2];
        let q1 = sorted[n / 4];
        let q3 = sorted[3 * n / 4];
        let iqr = q3 - q1;
        // Threshold at Q3 + 1.5 * IQR (outlier boundary)
        q3 + 1.5 * iqr
    }
    /// Compute distance between two vectors under the configured metric.
    fn distance(&self, a: ArrayView1<f32>, b: ArrayView1<f32>) -> f32 {
        match self.metric {
            DistanceMetric::Euclidean => {
                a.iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).powi(2))
                    .sum::<f32>()
                    .sqrt()
            }
            DistanceMetric::Cosine => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
                let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
                if norm_a == 0.0 || norm_b == 0.0 {
                    // A zero vector has no direction: return distance 1
                    // (as if orthogonal) instead of dividing by zero.
                    1.0
                } else {
                    1.0 - (dot / (norm_a * norm_b))
                }
            }
            DistanceMetric::Manhattan => a.iter().zip(b.iter()).map(|(x, y)| (x - y).abs()).sum(),
            DistanceMetric::Poincare => {
                // Simplified - would need proper hyperbolic distance
                // NOTE(review): currently falls back to plain Euclidean;
                // implement the true Poincare-ball metric before relying
                // on this variant.
                a.iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).powi(2))
                    .sum::<f32>()
                    .sqrt()
            }
        }
    }
}
/// Single linkage tree node for cluster hierarchy.
///
/// NOTE(review): not referenced by the current simplified extraction path;
/// appears reserved for a full condensed-tree implementation — confirm
/// before removing.
#[derive(Debug, Clone)]
struct SingleLinkageNode {
    /// Presumably the index of the left child node — semantics unverified.
    left: Option<usize>,
    /// Presumably the index of the right child node — semantics unverified.
    right: Option<usize>,
    /// Merge distance associated with this node.
    distance: f32,
    /// Number of points under this node.
    size: usize,
}
/// HDBSCAN condensed tree for cluster extraction.
///
/// NOTE(review): currently unused — `HdbscanClusterer::fit` performs a
/// simplified threshold cut instead. Retained as scaffolding for a full
/// HDBSCAN* condensed-tree extraction; confirm before removing.
#[derive(Debug)]
pub struct CondensedTree {
    /// Flat storage of condensed-tree nodes.
    nodes: Vec<CondensedNode>,
}
/// One node of the condensed tree (unused; see `CondensedTree`).
#[derive(Debug, Clone)]
struct CondensedNode {
    /// Index of the parent node in `CondensedTree::nodes`, if any.
    parent: Option<usize>,
    /// Indices of child nodes.
    children: Vec<usize>,
    /// Density level at which the cluster appears (in HDBSCAN lambda is
    /// typically 1/distance — unverified here).
    lambda_birth: f32,
    /// Density level at which the cluster dissolves.
    lambda_death: f32,
    /// Cluster stability score used for extraction.
    stability: f32,
    /// Points belonging to this node.
    points: HashSet<usize>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;
    /// Build 30 points in three well-separated groups of 10, centered at
    /// (0,0), (5,5) and (10,0), with deterministic per-point jitter.
    fn create_clustered_data() -> Array2<f32> {
        let centers = [(0.0f32, 0.0f32), (5.0, 5.0), (10.0, 0.0)];
        let mut data = Array2::<f32>::zeros((30, 2));
        for (group, &(cx, cy)) in centers.iter().enumerate() {
            for offset in 0..10 {
                let i = group * 10 + offset;
                data[[i, 0]] = rand_offset(cx, i);
                data[[i, 1]] = rand_offset(cy, i + 1);
            }
        }
        data
    }
    /// Deterministic "random" offset derived from the seed, in [-0.25, 0.25).
    fn rand_offset(center: f32, seed: usize) -> f32 {
        let variation = ((seed as f32 * 1.618) % 1.0 - 0.5) * 0.5;
        center + variation
    }
    #[test]
    fn test_hdbscan_basic() {
        let labels = HdbscanClusterer::new(3, 2, DistanceMetric::Euclidean)
            .fit(&create_clustered_data())
            .unwrap();
        assert_eq!(labels.len(), 30);
        // At least one non-noise cluster must be found.
        let clusters: HashSet<_> = labels.iter().filter(|&&l| l >= 0).collect();
        assert!(!clusters.is_empty());
    }
    #[test]
    fn test_hdbscan_insufficient_data() {
        let too_small = Array2::<f32>::zeros((5, 2));
        let result = HdbscanClusterer::new(10, 5, DistanceMetric::Euclidean).fit(&too_small);
        assert!(result.is_err());
    }
    #[test]
    fn test_distance_euclidean() {
        let clusterer = HdbscanClusterer::new(5, 3, DistanceMetric::Euclidean);
        let origin = Array1::from_vec(vec![0.0, 0.0]);
        let point = Array1::from_vec(vec![3.0, 4.0]);
        // Classic 3-4-5 triangle.
        assert!((clusterer.distance(origin.view(), point.view()) - 5.0).abs() < 0.001);
    }
    #[test]
    fn test_distance_cosine() {
        let clusterer = HdbscanClusterer::new(5, 3, DistanceMetric::Cosine);
        let x_axis = Array1::from_vec(vec![1.0, 0.0]);
        let y_axis = Array1::from_vec(vec![0.0, 1.0]);
        // Identical direction => distance ~0.
        assert!(clusterer.distance(x_axis.view(), x_axis.view()).abs() < 0.001);
        // Orthogonal vectors => distance ~1.
        assert!((clusterer.distance(x_axis.view(), y_axis.view()) - 1.0).abs() < 0.001);
    }
}

View File

@@ -0,0 +1,384 @@
//! K-Means clustering implementation.
//!
//! Standard K-Means algorithm with k-means++ initialization for
//! partitioning embeddings into k clusters.
use ndarray::{Array2, ArrayView1};
use tracing::{debug, instrument};
use crate::application::services::AnalysisError;
/// K-Means clustering algorithm.
///
/// Uses k-means++ seeding followed by the standard assignment/update
/// iteration; construct via `new` plus the `with_*` builder methods.
pub struct KMeansClusterer {
    /// Number of clusters.
    k: usize,
    /// Maximum iterations.
    max_iterations: usize,
    /// Convergence tolerance (relative change in inertia between iterations).
    tolerance: f32,
    /// Random seed for reproducibility.
    seed: Option<u64>,
}
impl KMeansClusterer {
    /// Create a new K-Means clusterer.
    ///
    /// `seed` makes the k-means++ initialization deterministic; when `None`,
    /// a fixed fallback seed is used, so results are reproducible either way.
    #[must_use]
    pub fn new(k: usize, seed: Option<u64>) -> Self {
        Self {
            k,
            max_iterations: 300,
            tolerance: 1e-4,
            seed,
        }
    }
    /// Set maximum iterations.
    #[must_use]
    pub fn with_max_iterations(mut self, max_iterations: usize) -> Self {
        self.max_iterations = max_iterations;
        self
    }
    /// Set convergence tolerance (relative change in inertia).
    #[must_use]
    pub fn with_tolerance(mut self, tolerance: f32) -> Self {
        self.tolerance = tolerance;
        self
    }
    /// Fit K-Means to the data and return cluster labels and centroids.
    ///
    /// # Arguments
    ///
    /// * `data` - 2D array where rows are samples and columns are features
    ///
    /// # Returns
    ///
    /// Tuple of (cluster labels, centroid matrix)
    ///
    /// # Errors
    ///
    /// Returns `AnalysisError::InsufficientData` when there are fewer
    /// samples than `k`.
    #[instrument(skip(self, data), fields(n_samples = data.nrows(), n_features = data.ncols(), k = self.k))]
    pub fn fit(&self, data: &Array2<f32>) -> Result<(Vec<usize>, Array2<f32>), AnalysisError> {
        let n = data.nrows();
        let d = data.ncols();
        if n < self.k {
            return Err(AnalysisError::InsufficientData(format!(
                "Need at least {} samples for k={}, got {}",
                self.k, self.k, n
            )));
        }
        debug!(
            n_samples = n,
            n_features = d,
            k = self.k,
            "Starting K-Means fit"
        );
        // Initialize centroids using k-means++ seeding.
        let mut centroids = self.kmeans_plus_plus_init(data);
        let mut labels = vec![0usize; n];
        let mut prev_inertia = f32::MAX;
        for iteration in 0..self.max_iterations {
            // Assignment step: nearest-centroid labels (shared with `predict`).
            labels = self.predict(data, &centroids);
            // Update step: each new centroid is the mean of its members.
            let mut new_centroids = Array2::<f32>::zeros((self.k, d));
            let mut counts = vec![0usize; self.k];
            for (i, &label) in labels.iter().enumerate() {
                for j in 0..d {
                    new_centroids[[label, j]] += data[[i, j]];
                }
                counts[label] += 1;
            }
            for j in 0..self.k {
                if counts[j] > 0 {
                    for l in 0..d {
                        new_centroids[[j, l]] /= counts[j] as f32;
                    }
                } else {
                    // Handle empty cluster by keeping old centroid so it
                    // does not collapse to the origin.
                    for l in 0..d {
                        new_centroids[[j, l]] = centroids[[j, l]];
                    }
                }
            }
            // Inertia against the centroids the labels were assigned to
            // (shared with the public `compute_inertia`).
            let inertia = self.compute_inertia(data, &labels, &centroids);
            // Relative change in inertia; the `max(1.0)` guards the first
            // iteration where prev_inertia is f32::MAX.
            let inertia_change = (prev_inertia - inertia).abs() / prev_inertia.max(1.0);
            debug!(
                iteration = iteration,
                inertia = inertia,
                change = inertia_change,
                "K-Means iteration"
            );
            if inertia_change < self.tolerance {
                debug!(
                    iterations = iteration + 1,
                    final_inertia = inertia,
                    "K-Means converged"
                );
                break;
            }
            centroids = new_centroids;
            prev_inertia = inertia;
        }
        Ok((labels, centroids))
    }
    /// Initialize centroids using the k-means++ seeding strategy: the first
    /// centroid is drawn uniformly; each subsequent one with probability
    /// proportional to its squared distance D^2 from the nearest centroid
    /// chosen so far.
    fn kmeans_plus_plus_init(&self, data: &Array2<f32>) -> Array2<f32> {
        let n = data.nrows();
        let d = data.ncols();
        let mut centroids = Array2::<f32>::zeros((self.k, d));
        // Deterministic LCG so seeded runs are reproducible.
        let seed = self.seed.unwrap_or(42);
        let mut rng_state = seed;
        let mut next_random = || {
            rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
            // Use the full high 32 bits scaled into [0, 1). (Shifting by 33
            // only ever yields values below 0.5, which biased both the first
            // pick and the roulette sampling toward low indices.)
            ((rng_state >> 32) as u32) as f32 / (u32::MAX as f32 + 1.0)
        };
        // Choose first centroid uniformly at random.
        let first_idx = (next_random() * n as f32) as usize % n;
        for j in 0..d {
            centroids[[0, j]] = data[[first_idx, j]];
        }
        // Choose remaining centroids with probability proportional to D^2.
        for i in 1..self.k {
            // Squared distance to the nearest existing centroid, per point.
            let mut distances = Vec::with_capacity(n);
            let mut total_dist = 0.0f32;
            for point_idx in 0..n {
                let point = data.row(point_idx);
                let mut min_dist = f32::MAX;
                for j in 0..i {
                    let dist = self.euclidean_distance(point, centroids.row(j));
                    min_dist = min_dist.min(dist);
                }
                let dist_sq = min_dist * min_dist;
                distances.push(dist_sq);
                total_dist += dist_sq;
            }
            // Roulette-wheel sample proportional to D^2. Fall back to the
            // last point (not index 0) if floating-point rounding keeps the
            // cumulative sum below the target.
            let target = next_random() * total_dist;
            let mut cumsum = 0.0f32;
            let mut chosen_idx = n - 1;
            for (idx, &dist) in distances.iter().enumerate() {
                cumsum += dist;
                if cumsum >= target {
                    chosen_idx = idx;
                    break;
                }
            }
            for j in 0..d {
                centroids[[i, j]] = data[[chosen_idx, j]];
            }
        }
        centroids
    }
    /// Compute Euclidean distance between two vectors.
    fn euclidean_distance(&self, a: ArrayView1<f32>, b: ArrayView1<f32>) -> f32 {
        a.iter()
            .zip(b.iter())
            .map(|(x, y)| (x - y).powi(2))
            .sum::<f32>()
            .sqrt()
    }
    /// Predict cluster labels for new data given fitted centroids.
    ///
    /// Each sample is assigned to its nearest centroid (Euclidean).
    pub fn predict(&self, data: &Array2<f32>, centroids: &Array2<f32>) -> Vec<usize> {
        let n = data.nrows();
        let mut labels = vec![0usize; n];
        for i in 0..n {
            let point = data.row(i);
            let mut min_dist = f32::MAX;
            let mut best_cluster = 0;
            for (j, centroid) in centroids.outer_iter().enumerate() {
                let dist = self.euclidean_distance(point, centroid);
                if dist < min_dist {
                    min_dist = dist;
                    best_cluster = j;
                }
            }
            labels[i] = best_cluster;
        }
        labels
    }
    /// Compute inertia (within-cluster sum of squared distances to the
    /// assigned centroids).
    pub fn compute_inertia(
        &self,
        data: &Array2<f32>,
        labels: &[usize],
        centroids: &Array2<f32>,
    ) -> f32 {
        labels
            .iter()
            .enumerate()
            .map(|(i, &label)| {
                self.euclidean_distance(data.row(i), centroids.row(label)).powi(2)
            })
            .sum()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// Twelve 2-D points forming three well-separated groups centred
    /// near (0, 0), (5, 5), and (10, 0).
    fn create_test_data() -> Array2<f32> {
        let points = vec![
            // Group 0: near (0, 0)
            0.0, 0.0, 0.1, 0.1, -0.1, 0.1, 0.0, -0.1,
            // Group 1: near (5, 5)
            5.0, 5.0, 5.1, 5.1, 4.9, 5.0, 5.0, 4.9,
            // Group 2: near (10, 0)
            10.0, 0.0, 10.1, 0.1, 9.9, 0.0, 10.0, -0.1,
        ];
        Array2::from_shape_vec((12, 2), points).expect("12x2 data")
    }

    #[test]
    fn test_kmeans_basic() {
        let clusterer = KMeansClusterer::new(3, Some(42));
        let data = create_test_data();
        let (labels, centroids) = clusterer.fit(&data).unwrap();
        assert_eq!(labels.len(), 12);
        assert_eq!(centroids.nrows(), 3);
        // Points from the same original group must share a label
        // (the groups are clearly separated).
        for group in [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]] {
            for &member in &group[1..] {
                assert_eq!(labels[group[0]], labels[member]);
            }
        }
    }

    #[test]
    fn test_kmeans_insufficient_data() {
        // Requesting more clusters than points must fail.
        let clusterer = KMeansClusterer::new(10, None);
        let data = Array2::<f32>::zeros((5, 2));
        assert!(clusterer.fit(&data).is_err());
    }

    #[test]
    fn test_kmeans_predict() {
        let clusterer = KMeansClusterer::new(2, Some(42));
        let train_data =
            Array2::from_shape_vec((4, 2), vec![0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1])
                .unwrap();
        let (_, centroids) = clusterer.fit(&train_data).unwrap();
        let test_data =
            Array2::from_shape_vec((2, 2), vec![0.05, 0.05, 4.95, 4.95]).unwrap();
        let predictions = clusterer.predict(&test_data, &centroids);
        assert_eq!(predictions.len(), 2);
        // The probes sit next to different training groups, so they
        // must land in different clusters.
        assert_ne!(predictions[0], predictions[1]);
    }

    #[test]
    fn test_euclidean_distance() {
        // 3-4-5 right triangle.
        let clusterer = KMeansClusterer::new(2, None);
        let a = Array1::from_vec(vec![0.0, 0.0]);
        let b = Array1::from_vec(vec![3.0, 4.0]);
        let dist = clusterer.euclidean_distance(a.view(), b.view());
        assert!((dist - 5.0).abs() < 0.001);
    }
}

View File

@@ -0,0 +1,524 @@
//! Markov chain analysis for vocalization sequences.
//!
//! Provides transition matrix computation, entropy calculation,
//! and sequence analysis for understanding vocalization patterns.
use std::collections::HashSet;
use tracing::{debug, instrument};
use crate::domain::entities::ClusterId;
use crate::domain::value_objects::{SequenceMetrics, TransitionMatrix};
/// Markov chain analyzer for vocalization sequences.
///
/// Builds first-order transition matrices from cluster-ID sequences and
/// derives entropy, stereotypy, periodicity, and likelihood statistics
/// from them. Construct via [`MarkovAnalyzer::new`] (no smoothing) or
/// [`MarkovAnalyzer::with_smoothing`].
pub struct MarkovAnalyzer {
    /// Smoothing factor for probability estimation (Laplace smoothing).
    /// A value of 0.0 (the default) disables smoothing entirely.
    smoothing: f32,
}
impl MarkovAnalyzer {
/// Create a new Markov analyzer.
#[must_use]
pub fn new() -> Self {
Self { smoothing: 0.0 }
}
/// Create with Laplace smoothing.
#[must_use]
pub fn with_smoothing(smoothing: f32) -> Self {
Self { smoothing }
}
/// Build a transition matrix from a sequence of cluster IDs.
///
/// # Arguments
///
/// * `sequence` - Ordered sequence of cluster IDs
///
/// # Returns
///
/// A TransitionMatrix representing transition probabilities.
#[instrument(skip(self, sequence), fields(seq_len = sequence.len()))]
pub fn build_transition_matrix(&self, sequence: &[ClusterId]) -> TransitionMatrix {
// Collect all unique clusters
let unique_clusters: Vec<ClusterId> = sequence
.iter()
.copied()
.collect::<HashSet<_>>()
.into_iter()
.collect();
let mut matrix = TransitionMatrix::new(unique_clusters);
// Count transitions
for window in sequence.windows(2) {
matrix.record_transition(&window[0], &window[1]);
}
// Apply smoothing if configured
if self.smoothing > 0.0 {
self.apply_smoothing(&mut matrix);
}
// Compute probabilities
matrix.compute_probabilities();
debug!(
n_states = matrix.size(),
n_transitions = matrix.non_zero_transitions().len(),
"Built transition matrix"
);
matrix
}
/// Build transition matrix from multiple sequences.
#[instrument(skip(self, sequences))]
pub fn build_from_sequences(&self, sequences: &[Vec<ClusterId>]) -> TransitionMatrix {
// Collect all unique clusters from all sequences
let unique_clusters: Vec<ClusterId> = sequences
.iter()
.flatten()
.copied()
.collect::<HashSet<_>>()
.into_iter()
.collect();
let mut matrix = TransitionMatrix::new(unique_clusters);
// Count transitions from all sequences
for sequence in sequences {
for window in sequence.windows(2) {
matrix.record_transition(&window[0], &window[1]);
}
}
// Apply smoothing and compute probabilities
if self.smoothing > 0.0 {
self.apply_smoothing(&mut matrix);
}
matrix.compute_probabilities();
matrix
}
/// Compute Shannon entropy of transition probabilities.
///
/// # Arguments
///
/// * `transitions` - Slice of (source, target, probability) tuples
///
/// # Returns
///
/// Entropy value in nats (natural log base).
#[must_use]
pub fn compute_entropy(&self, transitions: &[(ClusterId, ClusterId, f32)]) -> f32 {
let mut entropy = 0.0f32;
for &(_, _, prob) in transitions {
if prob > 0.0 {
entropy -= prob * prob.ln();
}
}
entropy
}
/// Compute entropy rate of a Markov chain.
///
/// The entropy rate is the average entropy per step, weighted
/// by the stationary distribution.
#[must_use]
pub fn compute_entropy_rate(&self, matrix: &TransitionMatrix) -> f32 {
let stationary = match matrix.stationary_distribution() {
Some(dist) => dist,
None => return 0.0,
};
let n = matrix.size();
let mut entropy_rate = 0.0f32;
for (i, &pi) in stationary.iter().enumerate() {
if pi <= 0.0 {
continue;
}
// Compute entropy of row i
let mut row_entropy = 0.0f32;
for j in 0..n {
let prob = matrix.probabilities[i][j];
if prob > 0.0 {
row_entropy -= prob * prob.ln();
}
}
entropy_rate += pi * row_entropy;
}
entropy_rate
}
/// Compute sequence metrics from a cluster sequence.
#[instrument(skip(self, sequence))]
pub fn compute_metrics(&self, sequence: &[ClusterId]) -> SequenceMetrics {
if sequence.len() < 2 {
return SequenceMetrics::default();
}
let matrix = self.build_transition_matrix(sequence);
let transitions = matrix.non_zero_transitions();
// Count unique elements
let unique_clusters: HashSet<_> = sequence.iter().collect();
let total_transitions = sequence.len() - 1;
// Count self-transitions
let self_transitions = sequence
.windows(2)
.filter(|w| w[0] == w[1])
.count();
// Compute entropy
let entropy = self.compute_entropy(&transitions);
// Normalize entropy
let max_entropy = (unique_clusters.len() as f32).ln().max(1.0);
let normalized_entropy = if max_entropy > 0.0 {
entropy / max_entropy
} else {
0.0
};
// Find dominant transition
let dominant_transition = transitions
.iter()
.max_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
.map(|&(from, to, prob)| (from, to, prob));
SequenceMetrics {
entropy,
normalized_entropy,
stereotypy: 1.0 - normalized_entropy,
unique_clusters: unique_clusters.len(),
unique_transitions: transitions.len(),
total_transitions,
dominant_transition,
repetition_rate: self_transitions as f32 / total_transitions as f32,
}
}
/// Compute stereotypy score (measure of sequence repetitiveness).
///
/// Higher values indicate more stereotyped/predictable sequences.
#[must_use]
pub fn compute_stereotypy(&self, matrix: &TransitionMatrix) -> f32 {
let entropy_rate = self.compute_entropy_rate(matrix);
let max_entropy = (matrix.size() as f32).ln();
if max_entropy > 0.0 {
1.0 - (entropy_rate / max_entropy)
} else {
1.0
}
}
/// Detect periodic patterns in a sequence.
///
/// Returns a vector of (period_length, confidence) tuples for detected patterns.
#[instrument(skip(self, sequence))]
pub fn detect_periodicity(&self, sequence: &[ClusterId]) -> Vec<(usize, f32)> {
let n = sequence.len();
if n < 4 {
return Vec::new();
}
let mut periods = Vec::new();
let max_period = n / 2;
for period in 2..=max_period {
let matches = self.count_periodic_matches(sequence, period);
let max_matches = n / period;
let confidence = matches as f32 / max_matches as f32;
if confidence > 0.5 {
periods.push((period, confidence));
}
}
// Sort by confidence descending
periods.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
periods
}
/// Count matches for a given period length.
fn count_periodic_matches(&self, sequence: &[ClusterId], period: usize) -> usize {
let n = sequence.len();
let mut matches = 0;
for i in period..n {
if sequence[i] == sequence[i - period] {
matches += 1;
}
}
matches
}
/// Apply Laplace smoothing to observation counts.
fn apply_smoothing(&self, matrix: &mut TransitionMatrix) {
let n = matrix.size();
for i in 0..n {
for j in 0..n {
matrix.observations[i][j] += self.smoothing as u32;
}
}
}
/// Compute log-likelihood of a sequence given a transition matrix.
#[must_use]
pub fn log_likelihood(&self, sequence: &[ClusterId], matrix: &TransitionMatrix) -> f32 {
if sequence.len() < 2 {
return 0.0;
}
let mut log_prob = 0.0f32;
for window in sequence.windows(2) {
if let Some(prob) = matrix.probability(&window[0], &window[1]) {
if prob > 0.0 {
log_prob += prob.ln();
} else {
// Unseen transition - return negative infinity
return f32::NEG_INFINITY;
}
}
}
log_prob
}
/// Find the most likely next cluster given current state.
#[must_use]
pub fn predict_next(
&self,
current: &ClusterId,
matrix: &TransitionMatrix,
) -> Option<(ClusterId, f32)> {
let idx = matrix.index_of(current)?;
let mut best_cluster = None;
let mut best_prob = 0.0f32;
for (j, &target_id) in matrix.cluster_ids.iter().enumerate() {
let prob = matrix.probabilities[idx][j];
if prob > best_prob {
best_prob = prob;
best_cluster = Some(target_id);
}
}
best_cluster.map(|c| (c, best_prob))
}
/// Generate a sequence from the Markov chain.
///
/// # Arguments
///
/// * `matrix` - The transition matrix
/// * `start` - Starting cluster
/// * `length` - Desired sequence length
/// * `seed` - Random seed for reproducibility
pub fn generate_sequence(
&self,
matrix: &TransitionMatrix,
start: ClusterId,
length: usize,
seed: u64,
) -> Vec<ClusterId> {
let mut sequence = Vec::with_capacity(length);
sequence.push(start);
let mut rng_state = seed;
let mut next_random = || {
rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
((rng_state >> 33) as f32) / (u32::MAX as f32)
};
let mut current = start;
for _ in 1..length {
let idx = match matrix.index_of(&current) {
Some(i) => i,
None => break,
};
// Sample from transition probabilities
let r = next_random();
let mut cumsum = 0.0f32;
let mut next_cluster = current;
for (j, &cluster_id) in matrix.cluster_ids.iter().enumerate() {
cumsum += matrix.probabilities[idx][j];
if r < cumsum {
next_cluster = cluster_id;
break;
}
}
sequence.push(next_cluster);
current = next_cluster;
}
sequence
}
}
impl Default for MarkovAnalyzer {
    /// Equivalent to [`MarkovAnalyzer::new`]: an analyzer with no smoothing.
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Build a deterministic, periodic test sequence: (c1, c2, c3) x 3.
    /// Fixed UUIDs keep cluster identities stable across runs.
    fn create_test_sequence() -> Vec<ClusterId> {
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));
        let c3 = ClusterId::from_uuid(uuid::Uuid::from_u128(3));
        // Pattern: c1 -> c2 -> c3 -> c1 -> c2 -> c3 (periodic)
        vec![c1, c2, c3, c1, c2, c3, c1, c2, c3]
    }
    #[test]
    fn test_build_transition_matrix() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);
        // Three distinct states with at least one observed transition.
        assert_eq!(matrix.size(), 3);
        assert!(!matrix.non_zero_transitions().is_empty());
    }
    #[test]
    fn test_entropy_computation() {
        let analyzer = MarkovAnalyzer::new();
        // Uniform distribution should have higher entropy
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let uniform_transitions = vec![
            (c1, c1, 0.25),
            (c1, c2, 0.25),
            (c2, c1, 0.25),
            (c2, c2, 0.25),
        ];
        let entropy = analyzer.compute_entropy(&uniform_transitions);
        assert!(entropy > 0.0);
        // Deterministic distribution should have lower entropy
        let deterministic = vec![
            (c1, c2, 1.0),
            (c2, c1, 1.0),
        ];
        let det_entropy = analyzer.compute_entropy(&deterministic);
        assert!(det_entropy < entropy);
    }
    #[test]
    fn test_compute_metrics() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let metrics = analyzer.compute_metrics(&sequence);
        assert_eq!(metrics.unique_clusters, 3);
        // Deterministic sequence has zero entropy (each state has one successor)
        assert!(metrics.entropy >= 0.0);
        assert!(metrics.stereotypy >= 0.0 && metrics.stereotypy <= 1.0);
        assert!(metrics.total_transitions == sequence.len() - 1);
    }
    #[test]
    fn test_periodicity_detection() {
        let analyzer = MarkovAnalyzer::new();
        // Create highly periodic sequence
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));
        let periodic_sequence = vec![c1, c2, c1, c2, c1, c2, c1, c2, c1, c2];
        let periods = analyzer.detect_periodicity(&periodic_sequence);
        // Should detect period 2 (may not be first due to confidence calculation)
        assert!(!periods.is_empty());
        // Check that period 2 is in the detected periods
        let has_period_2 = periods.iter().any(|(p, _)| *p == 2);
        assert!(has_period_2, "Period 2 should be detected, found periods: {:?}", periods);
    }
    #[test]
    fn test_predict_next() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));
        // Given the pattern c1 -> c2 -> c3 -> ..., after c1 should come c2
        if let Some((next, prob)) = analyzer.predict_next(&c1, &matrix) {
            assert_eq!(next, c2);
            assert!(prob > 0.0);
        }
    }
    #[test]
    fn test_sequence_generation() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        // Generation is deterministic for a fixed seed and must start at c1.
        let generated = analyzer.generate_sequence(&matrix, c1, 10, 42);
        assert_eq!(generated.len(), 10);
        assert_eq!(generated[0], c1);
    }
    #[test]
    fn test_smoothing() {
        let analyzer = MarkovAnalyzer::with_smoothing(1.0);
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let sequence = vec![c1, c2, c1, c2];
        let matrix = analyzer.build_transition_matrix(&sequence);
        // With smoothing, all transitions should have non-zero probability
        for i in 0..matrix.size() {
            for j in 0..matrix.size() {
                assert!(matrix.probabilities[i][j] > 0.0);
            }
        }
    }
    #[test]
    fn test_log_likelihood() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);
        // Log-likelihood of the training sequence should be reasonably high
        let ll = analyzer.log_likelihood(&sequence, &matrix);
        assert!(ll.is_finite());
        assert!(ll <= 0.0); // Log probabilities are non-positive
    }
}

View File

@@ -0,0 +1,681 @@
//! In-memory repository implementation for testing and development.
//!
//! Provides thread-safe in-memory storage for all analysis entities.
use async_trait::async_trait;
use std::collections::HashMap;
use std::sync::RwLock;
use crate::domain::entities::{
Anomaly, Cluster, ClusterId, EmbeddingId, Motif, Prototype, RecordingId, SequenceAnalysis,
};
use crate::domain::repository::{
AnomalyRepository, ClusterRepository, MotifRepository, PrototypeRepository,
RepositoryError, Result, SequenceRepository,
};
/// In-memory implementation of the analysis repositories.
///
/// Useful for testing and development. Not suitable for production use
/// as data is lost on restart.
///
/// Each map is guarded by its own `RwLock`, so different entity types
/// can be accessed concurrently without contending on a single lock.
pub struct InMemoryAnalysisRepository {
    /// Clusters keyed by their ID.
    clusters: RwLock<HashMap<ClusterId, Cluster>>,
    /// Prototypes grouped by owning cluster.
    prototypes: RwLock<HashMap<ClusterId, Vec<Prototype>>>,
    /// Motifs keyed by their string ID.
    motifs: RwLock<HashMap<String, Motif>>,
    /// Sequence analyses keyed by the recording they describe.
    sequences: RwLock<HashMap<RecordingId, SequenceAnalysis>>,
    /// Anomalies keyed by the embedding they flag.
    anomalies: RwLock<HashMap<EmbeddingId, Anomaly>>,
    /// Mapping from embedding ID to cluster ID
    embedding_assignments: RwLock<HashMap<EmbeddingId, ClusterId>>,
}
impl InMemoryAnalysisRepository {
    /// Create a new empty in-memory repository.
    #[must_use]
    pub fn new() -> Self {
        Self {
            clusters: RwLock::new(HashMap::new()),
            prototypes: RwLock::new(HashMap::new()),
            motifs: RwLock::new(HashMap::new()),
            sequences: RwLock::new(HashMap::new()),
            anomalies: RwLock::new(HashMap::new()),
            embedding_assignments: RwLock::new(HashMap::new()),
        }
    }

    /// Get statistics about stored data.
    ///
    /// Unlike the repository trait methods, this never fails: if a lock
    /// was poisoned by a panicking writer, the guard is recovered with
    /// `PoisonError::into_inner` and the read-only counts are still
    /// reported. (The previous implementation `unwrap()`ed the guards
    /// and would itself panic on poisoning.)
    #[must_use]
    pub fn stats(&self) -> RepositoryStats {
        let clusters = self.clusters.read().unwrap_or_else(|e| e.into_inner());
        let prototypes = self.prototypes.read().unwrap_or_else(|e| e.into_inner());
        let motifs = self.motifs.read().unwrap_or_else(|e| e.into_inner());
        let sequences = self.sequences.read().unwrap_or_else(|e| e.into_inner());
        let anomalies = self.anomalies.read().unwrap_or_else(|e| e.into_inner());
        RepositoryStats {
            cluster_count: clusters.len(),
            // Prototypes are stored per cluster; sum across all clusters.
            prototype_count: prototypes.values().map(|v| v.len()).sum(),
            motif_count: motifs.len(),
            sequence_count: sequences.len(),
            anomaly_count: anomalies.len(),
        }
    }
}
impl Default for InMemoryAnalysisRepository {
    /// Equivalent to [`InMemoryAnalysisRepository::new`]: an empty repository.
    fn default() -> Self {
        Self::new()
    }
}
/// Statistics about repository contents.
///
/// A plain snapshot of entity counts. All fields are simple counters,
/// so the type is cheap to copy, compare, and default-construct.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct RepositoryStats {
    /// Number of clusters stored.
    pub cluster_count: usize,
    /// Total number of prototypes across all clusters.
    pub prototype_count: usize,
    /// Number of motifs stored.
    pub motif_count: usize,
    /// Number of sequence analyses stored.
    pub sequence_count: usize,
    /// Number of anomalies stored.
    pub anomaly_count: usize,
}
#[async_trait]
impl ClusterRepository for InMemoryAnalysisRepository {
async fn save_cluster(&self, cluster: &Cluster) -> Result<()> {
let mut clusters = self.clusters.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
clusters.insert(cluster.id, cluster.clone());
Ok(())
}
async fn save_clusters(&self, clusters_to_save: &[Cluster]) -> Result<()> {
let mut clusters = self.clusters.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
for cluster in clusters_to_save {
clusters.insert(cluster.id, cluster.clone());
}
Ok(())
}
async fn find_cluster(&self, id: &ClusterId) -> Result<Option<Cluster>> {
let clusters = self.clusters.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(clusters.get(id).cloned())
}
async fn list_clusters(&self) -> Result<Vec<Cluster>> {
let clusters = self.clusters.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(clusters.values().cloned().collect())
}
async fn list_clusters_paginated(
&self,
offset: usize,
limit: usize,
) -> Result<Vec<Cluster>> {
let clusters = self.clusters.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(clusters.values().skip(offset).take(limit).cloned().collect())
}
async fn assign_to_cluster(
&self,
embedding_id: &EmbeddingId,
cluster_id: &ClusterId,
) -> Result<()> {
let mut assignments = self.embedding_assignments.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
assignments.insert(*embedding_id, *cluster_id);
Ok(())
}
async fn remove_from_cluster(&self, embedding_id: &EmbeddingId) -> Result<()> {
let mut assignments = self.embedding_assignments.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
assignments.remove(embedding_id);
Ok(())
}
async fn find_cluster_by_embedding(
&self,
embedding_id: &EmbeddingId,
) -> Result<Option<Cluster>> {
// Extract the cluster_id and drop the guard before await
let cluster_id = {
let assignments = self.embedding_assignments.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
assignments.get(embedding_id).cloned()
};
if let Some(cluster_id) = cluster_id {
self.find_cluster(&cluster_id).await
} else {
Ok(None)
}
}
async fn delete_cluster(&self, id: &ClusterId) -> Result<()> {
let mut clusters = self.clusters.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
clusters.remove(id);
Ok(())
}
async fn delete_all_clusters(&self) -> Result<()> {
let mut clusters = self.clusters.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
clusters.clear();
let mut assignments = self.embedding_assignments.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
assignments.clear();
Ok(())
}
async fn cluster_count(&self) -> Result<usize> {
let clusters = self.clusters.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(clusters.len())
}
async fn find_clusters_by_label(&self, label_pattern: &str) -> Result<Vec<Cluster>> {
let clusters = self.clusters.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(clusters
.values()
.filter(|c| {
c.label
.as_ref()
.map_or(false, |l| l.contains(label_pattern))
})
.cloned()
.collect())
}
async fn update_cluster_label(
&self,
id: &ClusterId,
label: Option<String>,
) -> Result<()> {
let mut clusters = self.clusters.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
if let Some(cluster) = clusters.get_mut(id) {
cluster.label = label;
Ok(())
} else {
Err(RepositoryError::NotFound(format!("Cluster {}", id)))
}
}
}
#[async_trait]
impl PrototypeRepository for InMemoryAnalysisRepository {
async fn save_prototype(&self, prototype: &Prototype) -> Result<()> {
let mut prototypes = self.prototypes.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
prototypes
.entry(prototype.cluster_id)
.or_default()
.push(prototype.clone());
Ok(())
}
async fn save_prototypes(&self, prototypes_to_save: &[Prototype]) -> Result<()> {
let mut prototypes = self.prototypes.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
for prototype in prototypes_to_save {
prototypes
.entry(prototype.cluster_id)
.or_default()
.push(prototype.clone());
}
Ok(())
}
async fn find_prototypes_by_cluster(
&self,
cluster_id: &ClusterId,
) -> Result<Vec<Prototype>> {
let prototypes = self.prototypes.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(prototypes.get(cluster_id).cloned().unwrap_or_default())
}
async fn find_best_prototype(
&self,
cluster_id: &ClusterId,
) -> Result<Option<Prototype>> {
let prototypes = self.prototypes.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(prototypes.get(cluster_id).and_then(|protos| {
protos
.iter()
.max_by(|a, b| {
a.exemplar_score
.partial_cmp(&b.exemplar_score)
.unwrap_or(std::cmp::Ordering::Equal)
})
.cloned()
}))
}
async fn delete_prototypes_by_cluster(&self, cluster_id: &ClusterId) -> Result<()> {
let mut prototypes = self.prototypes.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
prototypes.remove(cluster_id);
Ok(())
}
async fn delete_all_prototypes(&self) -> Result<()> {
let mut prototypes = self.prototypes.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
prototypes.clear();
Ok(())
}
}
#[async_trait]
impl MotifRepository for InMemoryAnalysisRepository {
async fn save_motif(&self, motif: &Motif) -> Result<()> {
let mut motifs = self.motifs.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
motifs.insert(motif.id.clone(), motif.clone());
Ok(())
}
async fn save_motifs(&self, motifs_to_save: &[Motif]) -> Result<()> {
let mut motifs = self.motifs.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
for motif in motifs_to_save {
motifs.insert(motif.id.clone(), motif.clone());
}
Ok(())
}
async fn find_motif(&self, id: &str) -> Result<Option<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs.get(id).cloned())
}
async fn find_motifs_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs
.values()
.filter(|m| m.contains_cluster(cluster_id))
.cloned()
.collect())
}
async fn list_motifs(&self) -> Result<Vec<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs.values().cloned().collect())
}
async fn find_motifs_by_confidence(&self, min_confidence: f32) -> Result<Vec<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs
.values()
.filter(|m| m.confidence >= min_confidence)
.cloned()
.collect())
}
async fn find_motifs_by_occurrences(&self, min_occurrences: usize) -> Result<Vec<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs
.values()
.filter(|m| m.occurrences >= min_occurrences)
.cloned()
.collect())
}
async fn delete_motif(&self, id: &str) -> Result<()> {
let mut motifs = self.motifs.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
motifs.remove(id);
Ok(())
}
async fn delete_all_motifs(&self) -> Result<()> {
let mut motifs = self.motifs.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
motifs.clear();
Ok(())
}
async fn motif_count(&self) -> Result<usize> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs.len())
}
async fn find_motifs_by_sequence(&self, sequence: &[ClusterId]) -> Result<Vec<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs
.values()
.filter(|m| m.sequence == sequence)
.cloned()
.collect())
}
async fn find_motifs_containing_subsequence(
&self,
subsequence: &[ClusterId],
) -> Result<Vec<Motif>> {
let motifs = self.motifs.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(motifs
.values()
.filter(|m| {
m.sequence
.windows(subsequence.len())
.any(|w| w == subsequence)
})
.cloned()
.collect())
}
}
#[async_trait]
impl SequenceRepository for InMemoryAnalysisRepository {
async fn save_sequence_analysis(&self, analysis: &SequenceAnalysis) -> Result<()> {
let mut sequences = self.sequences.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
sequences.insert(analysis.recording_id, analysis.clone());
Ok(())
}
async fn find_sequence_by_recording(
&self,
recording_id: &RecordingId,
) -> Result<Option<SequenceAnalysis>> {
let sequences = self.sequences.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(sequences.get(recording_id).cloned())
}
async fn list_sequence_analyses(&self) -> Result<Vec<SequenceAnalysis>> {
let sequences = self.sequences.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(sequences.values().cloned().collect())
}
async fn delete_sequence_by_recording(&self, recording_id: &RecordingId) -> Result<()> {
let mut sequences = self.sequences.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
sequences.remove(recording_id);
Ok(())
}
async fn delete_all_sequences(&self) -> Result<()> {
let mut sequences = self.sequences.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
sequences.clear();
Ok(())
}
async fn find_sequences_by_entropy(&self, min_entropy: f32) -> Result<Vec<SequenceAnalysis>> {
let sequences = self.sequences.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(sequences
.values()
.filter(|s| s.entropy >= min_entropy)
.cloned()
.collect())
}
async fn find_sequences_by_stereotypy(
&self,
min_stereotypy: f32,
) -> Result<Vec<SequenceAnalysis>> {
let sequences = self.sequences.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(sequences
.values()
.filter(|s| s.stereotypy_score >= min_stereotypy)
.cloned()
.collect())
}
}
#[async_trait]
impl AnomalyRepository for InMemoryAnalysisRepository {
async fn save_anomaly(&self, anomaly: &Anomaly) -> Result<()> {
let mut anomalies = self.anomalies.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
anomalies.insert(anomaly.embedding_id, anomaly.clone());
Ok(())
}
async fn save_anomalies(&self, anomalies_to_save: &[Anomaly]) -> Result<()> {
let mut anomalies = self.anomalies.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
for anomaly in anomalies_to_save {
anomalies.insert(anomaly.embedding_id, anomaly.clone());
}
Ok(())
}
async fn find_anomaly(&self, embedding_id: &EmbeddingId) -> Result<Option<Anomaly>> {
let anomalies = self.anomalies.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(anomalies.get(embedding_id).cloned())
}
async fn list_anomalies(&self) -> Result<Vec<Anomaly>> {
let anomalies = self.anomalies.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(anomalies.values().cloned().collect())
}
async fn find_anomalies_by_score(&self, min_score: f32) -> Result<Vec<Anomaly>> {
let anomalies = self.anomalies.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(anomalies
.values()
.filter(|a| a.anomaly_score >= min_score)
.cloned()
.collect())
}
async fn find_anomalies_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Anomaly>> {
let anomalies = self.anomalies.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(anomalies
.values()
.filter(|a| a.nearest_cluster == *cluster_id)
.cloned()
.collect())
}
async fn delete_anomaly(&self, embedding_id: &EmbeddingId) -> Result<()> {
let mut anomalies = self.anomalies.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
anomalies.remove(embedding_id);
Ok(())
}
async fn delete_all_anomalies(&self) -> Result<()> {
let mut anomalies = self.anomalies.write().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
anomalies.clear();
Ok(())
}
async fn anomaly_count(&self) -> Result<usize> {
let anomalies = self.anomalies.read().map_err(|e| {
RepositoryError::Internal(format!("Lock error: {}", e))
})?;
Ok(anomalies.len())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Full save/find/list/delete round-trip for clusters.
    #[tokio::test]
    async fn test_cluster_crud() {
        let repo = InMemoryAnalysisRepository::new();
        let cluster =
            Cluster::new(EmbeddingId::new(), vec![EmbeddingId::new()], vec![0.0; 10], 0.1);

        repo.save_cluster(&cluster).await.unwrap();
        assert!(repo.find_cluster(&cluster.id).await.unwrap().is_some());
        assert_eq!(repo.list_clusters().await.unwrap().len(), 1);

        repo.delete_cluster(&cluster.id).await.unwrap();
        assert!(repo.find_cluster(&cluster.id).await.unwrap().is_none());
    }

    // A saved motif is retrievable by ID and reflected in the count.
    #[tokio::test]
    async fn test_motif_crud() {
        let repo = InMemoryAnalysisRepository::new();
        let motif = Motif::new(vec![ClusterId::new(), ClusterId::new()], 5, 1500.0, 0.8);

        repo.save_motif(&motif).await.unwrap();
        assert!(repo.find_motif(&motif.id).await.unwrap().is_some());
        assert_eq!(repo.motif_count().await.unwrap(), 1);
    }

    // A sequence analysis can be looked up by its recording ID.
    #[tokio::test]
    async fn test_sequence_crud() {
        let repo = InMemoryAnalysisRepository::new();
        let recording_id = RecordingId::new();
        let analysis = SequenceAnalysis::new(recording_id, vec![], 1.5, 0.5);

        repo.save_sequence_analysis(&analysis).await.unwrap();
        assert!(repo
            .find_sequence_by_recording(&recording_id)
            .await
            .unwrap()
            .is_some());
    }

    // Only the anomaly scoring above the 0.5 threshold comes back.
    #[tokio::test]
    async fn test_anomaly_filtering() {
        let repo = InMemoryAnalysisRepository::new();
        let high = Anomaly::new(EmbeddingId::new(), 0.9, ClusterId::new(), 2.0);
        let low = Anomaly::new(EmbeddingId::new(), 0.3, ClusterId::new(), 0.5);

        repo.save_anomalies(&[high, low]).await.unwrap();
        assert_eq!(repo.find_anomalies_by_score(0.5).await.unwrap().len(), 1);
    }
}

View File

@@ -0,0 +1,15 @@
//! Infrastructure layer for the Analysis bounded context.
//!
//! Contains concrete implementations of clustering algorithms,
//! Markov chain analysis, and other infrastructure components.
// Density-based clustering backend (HDBSCAN).
pub mod hdbscan;
// Centroid-based clustering backend (K-means).
pub mod kmeans;
// Markov-chain sequence analysis backend.
pub mod markov;
// In-memory repository implementation of the domain repository traits.
pub mod memory_repository;
// Re-export main types so callers can use `infrastructure::HdbscanClusterer`
// etc. without naming the submodule.
pub use hdbscan::HdbscanClusterer;
pub use kmeans::KMeansClusterer;
pub use markov::MarkovAnalyzer;
pub use memory_repository::InMemoryAnalysisRepository;

View File

@@ -0,0 +1,78 @@
//! # sevensense-analysis
//!
//! Analysis bounded context for the 7sense bioacoustic analysis platform.
//!
//! Provides clustering, motif detection, sequence analysis, and anomaly
//! detection capabilities for bioacoustic embeddings.
//!
//! ## Features
//!
//! - **Clustering**: HDBSCAN and K-means clustering for grouping similar vocalizations
//! - **Prototype Extraction**: Identify representative embeddings (exemplars) for each cluster
//! - **Motif Detection**: Discover recurring patterns in vocalization sequences
//! - **Sequence Analysis**: Markov chain analysis, transition matrices, entropy computation
//! - **Anomaly Detection**: Identify unusual or novel vocalizations
//!
//! ## Architecture
//!
//! The crate follows Domain-Driven Design (DDD) with a hexagonal architecture:
//!
//! - `domain/` - Core domain entities, value objects, and repository traits
//! - `application/` - Application services orchestrating domain operations
//! - `infrastructure/` - Concrete implementations (HDBSCAN, Markov chains, etc.)
//!
//! ## Example
//!
//! ```rust,ignore
//! use sevensense_analysis::{
//!     application::ClusteringService,
//!     domain::{ClusteringConfig, ClusteringMethod},
//! };
//!
//! let service = ClusteringService::new(ClusteringConfig::default());
//! let embeddings = vec![/* ... */];
//! let clusters = service.run_hdbscan(&embeddings).await?;
//! ```
#![warn(missing_docs)]
#![warn(clippy::all)]
#![allow(clippy::module_name_repetitions)]
pub mod application;
pub mod domain;
pub mod infrastructure;
pub mod metrics;
// Re-export the most commonly used types at the crate root for convenience.
pub use application::services::{
    AnomalyDetectionService, ClusteringService, MotifDetectionService, SequenceAnalysisService,
};
pub use domain::entities::{
    Anomaly, AnomalyType, Cluster, ClusterId, EmbeddingId, Motif, MotifOccurrence, Prototype,
    RecordingId, SegmentId, SequenceAnalysis,
};
pub use domain::events::{
    AnalysisEvent, ClusterAssigned, ClustersDiscovered, MotifDetected, SequenceAnalyzed,
};
pub use domain::repository::{ClusterRepository, MotifRepository, SequenceRepository};
pub use domain::value_objects::{
    ClusteringConfig, ClusteringMethod, ClusteringParameters, MotifConfig, SequenceMetrics,
    TransitionMatrix,
};
pub use metrics::{ClusteringMetrics, SequenceEntropy, SilhouetteScore, VMeasure};
/// Crate version information, taken from Cargo metadata at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Prelude module for convenient glob imports.
pub mod prelude {
    pub use crate::application::services::*;
    pub use crate::domain::entities::*;
    pub use crate::domain::repository::*;
    pub use crate::domain::value_objects::*;
    pub use crate::metrics::*;
}

File diff suppressed because it is too large Load Diff