Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
11
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/application/mod.rs
vendored
Normal file
11
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/application/mod.rs
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
//! Application layer for the Analysis bounded context.
//!
//! Contains application services that orchestrate domain operations
//! and coordinate with infrastructure components.

// The concrete service implementations live in the `services` submodule.
pub mod services;

// Re-export service types so callers can use
// `crate::application::ClusteringService` etc. without naming the submodule.
pub use services::{
    AnomalyDetectionService, ClusteringService, MotifDetectionService, SequenceAnalysisService,
};
|
||||
1230
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/application/services.rs
vendored
Normal file
1230
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/application/services.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
692
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/entities.rs
vendored
Normal file
692
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/entities.rs
vendored
Normal file
@@ -0,0 +1,692 @@
|
||||
//! Domain entities for the Analysis bounded context.
|
||||
//!
|
||||
//! This module contains the core domain entities representing clusters,
|
||||
//! prototypes, motifs, sequences, and anomalies in bioacoustic analysis.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Unique identifier for a cluster.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct ClusterId(Uuid);
|
||||
|
||||
impl ClusterId {
|
||||
/// Create a new random cluster ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create a cluster ID from a UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Noise cluster ID (used for HDBSCAN noise points).
|
||||
#[must_use]
|
||||
pub fn noise() -> Self {
|
||||
Self(Uuid::nil())
|
||||
}
|
||||
|
||||
/// Check if this is the noise cluster.
|
||||
#[must_use]
|
||||
pub fn is_noise(&self) -> bool {
|
||||
self.0.is_nil()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ClusterId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ClusterId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for ClusterId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for an embedding (from sevensense-embedding context).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct EmbeddingId(Uuid);
|
||||
|
||||
impl EmbeddingId {
|
||||
/// Create a new random embedding ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create from UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for EmbeddingId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EmbeddingId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for EmbeddingId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for a recording (from sevensense-audio context).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct RecordingId(Uuid);
|
||||
|
||||
impl RecordingId {
|
||||
/// Create a new random recording ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create from UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RecordingId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RecordingId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for RecordingId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for a segment (from sevensense-audio context).
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct SegmentId(Uuid);
|
||||
|
||||
impl SegmentId {
|
||||
/// Create a new random segment ID.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self(Uuid::new_v4())
|
||||
}
|
||||
|
||||
/// Create from UUID.
|
||||
#[must_use]
|
||||
pub fn from_uuid(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
|
||||
/// Get the underlying UUID.
|
||||
#[must_use]
|
||||
pub fn as_uuid(&self) -> Uuid {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SegmentId {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for SegmentId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Uuid> for SegmentId {
|
||||
fn from(uuid: Uuid) -> Self {
|
||||
Self(uuid)
|
||||
}
|
||||
}
|
||||
|
||||
/// A cluster of acoustically similar call segments.
|
||||
///
|
||||
/// Clusters group embeddings that represent similar vocalizations,
|
||||
/// enabling pattern discovery and call type identification.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Cluster {
|
||||
/// Unique identifier for this cluster.
|
||||
pub id: ClusterId,
|
||||
|
||||
/// The prototype (representative) embedding ID for this cluster.
|
||||
pub prototype_id: EmbeddingId,
|
||||
|
||||
/// IDs of all embeddings belonging to this cluster.
|
||||
pub member_ids: Vec<EmbeddingId>,
|
||||
|
||||
/// Centroid vector (mean of all member embeddings).
|
||||
pub centroid: Vec<f32>,
|
||||
|
||||
/// Variance within the cluster (measure of spread).
|
||||
pub variance: f32,
|
||||
|
||||
/// Optional human-readable label for the cluster.
|
||||
pub label: Option<String>,
|
||||
|
||||
/// Timestamp when the cluster was created.
|
||||
pub created_at: DateTime<Utc>,
|
||||
|
||||
/// Timestamp when the cluster was last updated.
|
||||
pub updated_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Cluster {
|
||||
/// Create a new cluster with the given parameters.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
prototype_id: EmbeddingId,
|
||||
member_ids: Vec<EmbeddingId>,
|
||||
centroid: Vec<f32>,
|
||||
variance: f32,
|
||||
) -> Self {
|
||||
let now = Utc::now();
|
||||
Self {
|
||||
id: ClusterId::new(),
|
||||
prototype_id,
|
||||
member_ids,
|
||||
centroid,
|
||||
variance,
|
||||
label: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the number of members in this cluster.
|
||||
#[must_use]
|
||||
pub fn member_count(&self) -> usize {
|
||||
self.member_ids.len()
|
||||
}
|
||||
|
||||
/// Check if an embedding is a member of this cluster.
|
||||
#[must_use]
|
||||
pub fn contains(&self, embedding_id: &EmbeddingId) -> bool {
|
||||
self.member_ids.contains(embedding_id)
|
||||
}
|
||||
|
||||
/// Add a member to the cluster.
|
||||
pub fn add_member(&mut self, embedding_id: EmbeddingId) {
|
||||
if !self.member_ids.contains(&embedding_id) {
|
||||
self.member_ids.push(embedding_id);
|
||||
self.updated_at = Utc::now();
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove a member from the cluster.
|
||||
pub fn remove_member(&mut self, embedding_id: &EmbeddingId) -> bool {
|
||||
if let Some(pos) = self.member_ids.iter().position(|id| id == embedding_id) {
|
||||
self.member_ids.remove(pos);
|
||||
self.updated_at = Utc::now();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the centroid vector.
|
||||
pub fn update_centroid(&mut self, centroid: Vec<f32>, variance: f32) {
|
||||
self.centroid = centroid;
|
||||
self.variance = variance;
|
||||
self.updated_at = Utc::now();
|
||||
}
|
||||
|
||||
/// Set a human-readable label for this cluster.
|
||||
pub fn set_label(&mut self, label: impl Into<String>) {
|
||||
self.label = Some(label.into());
|
||||
self.updated_at = Utc::now();
|
||||
}
|
||||
}
|
||||
|
||||
/// A prototype (exemplar) embedding that best represents a cluster.
|
||||
///
|
||||
/// Prototypes are actual call segments that serve as the most representative
|
||||
/// examples of their cluster, useful for visualization and interpretation.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Prototype {
|
||||
/// The embedding ID of this prototype.
|
||||
pub id: EmbeddingId,
|
||||
|
||||
/// The cluster this prototype represents.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// Score indicating how well this exemplar represents the cluster.
|
||||
/// Higher scores indicate better representation.
|
||||
pub exemplar_score: f32,
|
||||
|
||||
/// Optional path to the spectrogram image for visualization.
|
||||
pub spectrogram_path: Option<PathBuf>,
|
||||
|
||||
/// Timestamp when this prototype was identified.
|
||||
pub created_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Prototype {
|
||||
/// Create a new prototype.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
id: EmbeddingId,
|
||||
cluster_id: ClusterId,
|
||||
exemplar_score: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
cluster_id,
|
||||
exemplar_score,
|
||||
spectrogram_path: None,
|
||||
created_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the spectrogram path for this prototype.
|
||||
pub fn set_spectrogram_path(&mut self, path: impl Into<PathBuf>) {
|
||||
self.spectrogram_path = Some(path.into());
|
||||
}
|
||||
}
|
||||
|
||||
/// A motif (recurring pattern) in vocalization sequences.
|
||||
///
|
||||
/// Motifs represent frequently occurring sequences of cluster assignments,
|
||||
/// indicating repeated vocal phrases or behavioral patterns.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Motif {
|
||||
/// Unique identifier for this motif.
|
||||
pub id: String,
|
||||
|
||||
/// The sequence of cluster IDs that define this motif.
|
||||
pub sequence: Vec<ClusterId>,
|
||||
|
||||
/// Number of times this motif occurs in the analyzed data.
|
||||
pub occurrences: usize,
|
||||
|
||||
/// Average duration of this motif in milliseconds.
|
||||
pub avg_duration_ms: f64,
|
||||
|
||||
/// Confidence score for this motif (0.0 to 1.0).
|
||||
pub confidence: f32,
|
||||
|
||||
/// All occurrences of this motif.
|
||||
pub occurrence_instances: Vec<MotifOccurrence>,
|
||||
|
||||
/// Timestamp when this motif was discovered.
|
||||
pub discovered_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Motif {
|
||||
/// Create a new motif.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
sequence: Vec<ClusterId>,
|
||||
occurrences: usize,
|
||||
avg_duration_ms: f64,
|
||||
confidence: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
sequence,
|
||||
occurrences,
|
||||
avg_duration_ms,
|
||||
confidence,
|
||||
occurrence_instances: Vec::new(),
|
||||
discovered_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the length of this motif (number of clusters).
|
||||
#[must_use]
|
||||
pub fn length(&self) -> usize {
|
||||
self.sequence.len()
|
||||
}
|
||||
|
||||
/// Add an occurrence instance to this motif.
|
||||
pub fn add_occurrence(&mut self, occurrence: MotifOccurrence) {
|
||||
self.occurrence_instances.push(occurrence);
|
||||
self.occurrences = self.occurrence_instances.len();
|
||||
}
|
||||
|
||||
/// Check if this motif contains a specific cluster.
|
||||
#[must_use]
|
||||
pub fn contains_cluster(&self, cluster_id: &ClusterId) -> bool {
|
||||
self.sequence.contains(cluster_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// A specific occurrence of a motif in a recording.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MotifOccurrence {
|
||||
/// The recording where this occurrence was found.
|
||||
pub recording_id: RecordingId,
|
||||
|
||||
/// The segment IDs that make up this occurrence.
|
||||
pub segment_ids: Vec<SegmentId>,
|
||||
|
||||
/// Start time within the recording (milliseconds).
|
||||
pub start_time_ms: u64,
|
||||
|
||||
/// End time within the recording (milliseconds).
|
||||
pub end_time_ms: u64,
|
||||
|
||||
/// Similarity score to the motif template.
|
||||
pub similarity: f32,
|
||||
}
|
||||
|
||||
impl MotifOccurrence {
|
||||
/// Create a new motif occurrence.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
recording_id: RecordingId,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
start_time_ms: u64,
|
||||
end_time_ms: u64,
|
||||
similarity: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
recording_id,
|
||||
segment_ids,
|
||||
start_time_ms,
|
||||
end_time_ms,
|
||||
similarity,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the duration of this occurrence in milliseconds.
|
||||
#[must_use]
|
||||
pub fn duration_ms(&self) -> u64 {
|
||||
self.end_time_ms.saturating_sub(self.start_time_ms)
|
||||
}
|
||||
}
|
||||
|
||||
/// Analysis of a vocalization sequence from a recording.
|
||||
///
|
||||
/// Contains transition information, entropy metrics, and stereotypy scores
|
||||
/// for understanding sequential patterns in bird vocalizations.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SequenceAnalysis {
|
||||
/// The recording this analysis pertains to.
|
||||
pub recording_id: RecordingId,
|
||||
|
||||
/// Transitions between clusters with weights (probabilities).
|
||||
/// Format: (source_cluster, target_cluster, probability)
|
||||
pub transitions: Vec<(ClusterId, ClusterId, f32)>,
|
||||
|
||||
/// Shannon entropy of the transition distribution.
|
||||
/// Higher values indicate more unpredictable sequences.
|
||||
pub entropy: f32,
|
||||
|
||||
/// Stereotypy score (0.0 to 1.0).
|
||||
/// Higher values indicate more repetitive/stereotyped sequences.
|
||||
pub stereotypy_score: f32,
|
||||
|
||||
/// The sequence of cluster IDs in order.
|
||||
pub cluster_sequence: Vec<ClusterId>,
|
||||
|
||||
/// The segment IDs corresponding to the cluster sequence.
|
||||
pub segment_ids: Vec<SegmentId>,
|
||||
|
||||
/// Timestamp when this analysis was performed.
|
||||
pub analyzed_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl SequenceAnalysis {
|
||||
/// Create a new sequence analysis.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
recording_id: RecordingId,
|
||||
transitions: Vec<(ClusterId, ClusterId, f32)>,
|
||||
entropy: f32,
|
||||
stereotypy_score: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
recording_id,
|
||||
transitions,
|
||||
entropy,
|
||||
stereotypy_score,
|
||||
cluster_sequence: Vec::new(),
|
||||
segment_ids: Vec::new(),
|
||||
analyzed_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the number of unique transitions.
|
||||
#[must_use]
|
||||
pub fn unique_transition_count(&self) -> usize {
|
||||
self.transitions.len()
|
||||
}
|
||||
|
||||
/// Get all clusters involved in the sequence.
|
||||
#[must_use]
|
||||
pub fn unique_clusters(&self) -> Vec<ClusterId> {
|
||||
let mut clusters: Vec<ClusterId> = self.cluster_sequence.clone();
|
||||
clusters.sort_by_key(|c| c.as_uuid());
|
||||
clusters.dedup();
|
||||
clusters
|
||||
}
|
||||
|
||||
/// Set the cluster sequence and corresponding segment IDs.
|
||||
pub fn set_sequence(&mut self, clusters: Vec<ClusterId>, segments: Vec<SegmentId>) {
|
||||
self.cluster_sequence = clusters;
|
||||
self.segment_ids = segments;
|
||||
}
|
||||
}
|
||||
|
||||
/// Type of anomaly detected in the analysis.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AnomalyType {
|
||||
/// Rare vocalization (low occurrence count).
|
||||
Rare,
|
||||
/// Novel vocalization (doesn't fit any cluster well).
|
||||
Novel,
|
||||
/// Artifact (likely noise or recording issue).
|
||||
Artifact,
|
||||
/// Outlier within a cluster.
|
||||
Outlier,
|
||||
/// Unknown anomaly type.
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for AnomalyType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
AnomalyType::Rare => write!(f, "Rare"),
|
||||
AnomalyType::Novel => write!(f, "Novel"),
|
||||
AnomalyType::Artifact => write!(f, "Artifact"),
|
||||
AnomalyType::Outlier => write!(f, "Outlier"),
|
||||
AnomalyType::Unknown => write!(f, "Unknown"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An anomalous embedding that doesn't fit well into any cluster.
|
||||
///
|
||||
/// Anomalies can represent rare vocalizations, novel sounds, or artifacts.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Anomaly {
|
||||
/// The embedding that is anomalous.
|
||||
pub embedding_id: EmbeddingId,
|
||||
|
||||
/// Anomaly score (higher = more anomalous).
|
||||
pub anomaly_score: f32,
|
||||
|
||||
/// The nearest cluster to this anomaly.
|
||||
pub nearest_cluster: ClusterId,
|
||||
|
||||
/// Distance from the anomaly to the nearest cluster's centroid.
|
||||
pub distance_to_centroid: f32,
|
||||
|
||||
/// Type of anomaly detected.
|
||||
pub anomaly_type: AnomalyType,
|
||||
|
||||
/// Local outlier factor (if computed).
|
||||
pub local_outlier_factor: Option<f32>,
|
||||
|
||||
/// Timestamp when this anomaly was detected.
|
||||
pub detected_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl Anomaly {
|
||||
/// Create a new anomaly.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
embedding_id: EmbeddingId,
|
||||
anomaly_score: f32,
|
||||
nearest_cluster: ClusterId,
|
||||
distance_to_centroid: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
embedding_id,
|
||||
anomaly_score,
|
||||
nearest_cluster,
|
||||
distance_to_centroid,
|
||||
anomaly_type: AnomalyType::Unknown,
|
||||
local_outlier_factor: None,
|
||||
detected_at: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the anomaly type.
|
||||
pub fn set_type(&mut self, anomaly_type: AnomalyType) {
|
||||
self.anomaly_type = anomaly_type;
|
||||
}
|
||||
|
||||
/// Set the local outlier factor.
|
||||
pub fn set_lof(&mut self, lof: f32) {
|
||||
self.local_outlier_factor = Some(lof);
|
||||
}
|
||||
|
||||
/// Check if this is a severe anomaly (score > threshold).
|
||||
#[must_use]
|
||||
pub fn is_severe(&self, threshold: f32) -> bool {
|
||||
self.anomaly_score > threshold
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // IDs must be unique per `new()` call, and the nil-UUID noise ID must
    // be the only one reporting `is_noise() == true`.
    #[test]
    fn test_cluster_id_creation() {
        let id1 = ClusterId::new();
        let id2 = ClusterId::new();
        assert_ne!(id1, id2);

        let noise = ClusterId::noise();
        assert!(noise.is_noise());
        assert!(!id1.is_noise());
    }

    // Adding then removing a member should round-trip the membership set.
    #[test]
    fn test_cluster_member_operations() {
        let mut cluster = Cluster::new(
            EmbeddingId::new(),
            vec![EmbeddingId::new()],
            vec![0.0; 1536],
            0.1,
        );

        let new_member = EmbeddingId::new();
        cluster.add_member(new_member);
        assert_eq!(cluster.member_count(), 2);
        assert!(cluster.contains(&new_member));

        cluster.remove_member(&new_member);
        assert_eq!(cluster.member_count(), 1);
        assert!(!cluster.contains(&new_member));
    }

    // `length()` reflects the pattern length; `occurrences` keeps the
    // constructor-supplied count until instances are recorded.
    #[test]
    fn test_motif_length() {
        let motif = Motif::new(
            vec![ClusterId::new(), ClusterId::new(), ClusterId::new()],
            5,
            1500.0,
            0.85,
        );
        assert_eq!(motif.length(), 3);
        assert_eq!(motif.occurrences, 5);
    }

    // A sequence alternating between two clusters has exactly two uniques.
    #[test]
    fn test_sequence_analysis_unique_clusters() {
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();

        let mut analysis = SequenceAnalysis::new(
            RecordingId::new(),
            vec![],
            1.5,
            0.3,
        );
        analysis.set_sequence(
            vec![c1, c2, c1, c2, c1],
            vec![SegmentId::new(); 5],
        );

        let unique = analysis.unique_clusters();
        assert_eq!(unique.len(), 2);
    }

    // Severity is a strict greater-than comparison against the threshold,
    // and the type defaults to Unknown until explicitly set.
    #[test]
    fn test_anomaly_severity() {
        let mut anomaly = Anomaly::new(
            EmbeddingId::new(),
            0.8,
            ClusterId::new(),
            2.5,
        );

        assert!(anomaly.is_severe(0.5));
        assert!(!anomaly.is_severe(0.9));

        anomaly.set_type(AnomalyType::Novel);
        assert_eq!(anomaly.anomaly_type, AnomalyType::Novel);
    }
}
|
||||
522
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/events.rs
vendored
Normal file
522
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/events.rs
vendored
Normal file
@@ -0,0 +1,522 @@
|
||||
//! Domain events for the Analysis bounded context.
|
||||
//!
|
||||
//! Domain events represent significant occurrences within the Analysis domain
|
||||
//! that other parts of the system may need to react to.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::entities::{AnomalyType, ClusterId, EmbeddingId, RecordingId};
|
||||
use super::value_objects::ClusteringMethod;
|
||||
|
||||
/// Base trait for analysis domain events.
///
/// Every concrete event carries a unique ID, an occurrence timestamp, and a
/// static type name used for routing/serialization. `Send + Sync` so events
/// can cross thread boundaries.
pub trait AnalysisEvent: Send + Sync {
    /// Get the unique event ID.
    fn event_id(&self) -> Uuid;

    /// Get the timestamp when the event occurred.
    fn occurred_at(&self) -> DateTime<Utc>;

    /// Get the event type name (a stable string discriminator,
    /// e.g. "ClustersDiscovered").
    fn event_type(&self) -> &'static str;
}
|
||||
|
||||
/// Event emitted when clustering is completed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClustersDiscovered {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// Number of clusters discovered.
|
||||
pub cluster_count: usize,
|
||||
|
||||
/// Number of noise points (not assigned to any cluster).
|
||||
pub noise_count: usize,
|
||||
|
||||
/// Clustering method used.
|
||||
pub method: ClusteringMethod,
|
||||
|
||||
/// Silhouette score (if computed).
|
||||
pub silhouette_score: Option<f32>,
|
||||
|
||||
/// Total number of embeddings processed.
|
||||
pub total_embeddings: usize,
|
||||
}
|
||||
|
||||
impl ClustersDiscovered {
|
||||
/// Create a new ClustersDiscovered event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
cluster_count: usize,
|
||||
noise_count: usize,
|
||||
method: ClusteringMethod,
|
||||
total_embeddings: usize,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
cluster_count,
|
||||
noise_count,
|
||||
method,
|
||||
silhouette_score: None,
|
||||
total_embeddings,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add silhouette score to the event.
|
||||
#[must_use]
|
||||
pub fn with_silhouette_score(mut self, score: f32) -> Self {
|
||||
self.silhouette_score = Some(score);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for ClustersDiscovered {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"ClustersDiscovered"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when an embedding is assigned to a cluster.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusterAssigned {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The embedding that was assigned.
|
||||
pub embedding_id: EmbeddingId,
|
||||
|
||||
/// The cluster it was assigned to.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// Confidence/probability of the assignment.
|
||||
pub confidence: f32,
|
||||
|
||||
/// Distance to the cluster centroid.
|
||||
pub distance_to_centroid: f32,
|
||||
}
|
||||
|
||||
impl ClusterAssigned {
|
||||
/// Create a new ClusterAssigned event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
embedding_id: EmbeddingId,
|
||||
cluster_id: ClusterId,
|
||||
confidence: f32,
|
||||
distance_to_centroid: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
embedding_id,
|
||||
cluster_id,
|
||||
confidence,
|
||||
distance_to_centroid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for ClusterAssigned {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"ClusterAssigned"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when a motif pattern is detected.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MotifDetected {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The motif ID.
|
||||
pub motif_id: String,
|
||||
|
||||
/// The cluster sequence defining the motif.
|
||||
pub pattern: Vec<ClusterId>,
|
||||
|
||||
/// Number of occurrences found.
|
||||
pub occurrences: usize,
|
||||
|
||||
/// Confidence score for this motif.
|
||||
pub confidence: f32,
|
||||
|
||||
/// Average duration in milliseconds.
|
||||
pub avg_duration_ms: f64,
|
||||
}
|
||||
|
||||
impl MotifDetected {
|
||||
/// Create a new MotifDetected event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
motif_id: String,
|
||||
pattern: Vec<ClusterId>,
|
||||
occurrences: usize,
|
||||
confidence: f32,
|
||||
avg_duration_ms: f64,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
motif_id,
|
||||
pattern,
|
||||
occurrences,
|
||||
confidence,
|
||||
avg_duration_ms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for MotifDetected {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"MotifDetected"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when a sequence is analyzed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SequenceAnalyzed {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The recording that was analyzed.
|
||||
pub recording_id: RecordingId,
|
||||
|
||||
/// Shannon entropy of the sequence.
|
||||
pub entropy: f32,
|
||||
|
||||
/// Stereotypy score.
|
||||
pub stereotypy_score: f32,
|
||||
|
||||
/// Number of unique clusters in the sequence.
|
||||
pub unique_clusters: usize,
|
||||
|
||||
/// Number of unique transitions.
|
||||
pub unique_transitions: usize,
|
||||
|
||||
/// Total sequence length.
|
||||
pub sequence_length: usize,
|
||||
}
|
||||
|
||||
impl SequenceAnalyzed {
|
||||
/// Create a new SequenceAnalyzed event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
recording_id: RecordingId,
|
||||
entropy: f32,
|
||||
stereotypy_score: f32,
|
||||
unique_clusters: usize,
|
||||
unique_transitions: usize,
|
||||
sequence_length: usize,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
recording_id,
|
||||
entropy,
|
||||
stereotypy_score,
|
||||
unique_clusters,
|
||||
unique_transitions,
|
||||
sequence_length,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for SequenceAnalyzed {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"SequenceAnalyzed"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when an anomaly is detected.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AnomalyDetected {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The embedding identified as anomalous.
|
||||
pub embedding_id: EmbeddingId,
|
||||
|
||||
/// Anomaly score.
|
||||
pub anomaly_score: f32,
|
||||
|
||||
/// Type of anomaly.
|
||||
pub anomaly_type: AnomalyType,
|
||||
|
||||
/// The nearest cluster.
|
||||
pub nearest_cluster: ClusterId,
|
||||
|
||||
/// Distance to the nearest cluster centroid.
|
||||
pub distance_to_centroid: f32,
|
||||
}
|
||||
|
||||
impl AnomalyDetected {
|
||||
/// Create a new AnomalyDetected event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
embedding_id: EmbeddingId,
|
||||
anomaly_score: f32,
|
||||
anomaly_type: AnomalyType,
|
||||
nearest_cluster: ClusterId,
|
||||
distance_to_centroid: f32,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
embedding_id,
|
||||
anomaly_score,
|
||||
anomaly_type,
|
||||
nearest_cluster,
|
||||
distance_to_centroid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for AnomalyDetected {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"AnomalyDetected"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when cluster prototypes are updated.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PrototypesComputed {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The cluster for which prototypes were computed.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// Number of prototypes computed.
|
||||
pub prototype_count: usize,
|
||||
|
||||
/// Best exemplar score.
|
||||
pub best_exemplar_score: f32,
|
||||
}
|
||||
|
||||
impl PrototypesComputed {
|
||||
/// Create a new PrototypesComputed event.
|
||||
#[must_use]
|
||||
pub fn new(cluster_id: ClusterId, prototype_count: usize, best_exemplar_score: f32) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
cluster_id,
|
||||
prototype_count,
|
||||
best_exemplar_score,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for PrototypesComputed {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"PrototypesComputed"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event emitted when a cluster label is updated.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusterLabeled {
|
||||
/// Unique event ID.
|
||||
pub event_id: Uuid,
|
||||
|
||||
/// When the event occurred.
|
||||
pub occurred_at: DateTime<Utc>,
|
||||
|
||||
/// The cluster that was labeled.
|
||||
pub cluster_id: ClusterId,
|
||||
|
||||
/// The new label (None if label was removed).
|
||||
pub label: Option<String>,
|
||||
|
||||
/// Previous label (None if no previous label).
|
||||
pub previous_label: Option<String>,
|
||||
}
|
||||
|
||||
impl ClusterLabeled {
|
||||
/// Create a new ClusterLabeled event.
|
||||
#[must_use]
|
||||
pub fn new(
|
||||
cluster_id: ClusterId,
|
||||
label: Option<String>,
|
||||
previous_label: Option<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
event_id: Uuid::new_v4(),
|
||||
occurred_at: Utc::now(),
|
||||
cluster_id,
|
||||
label,
|
||||
previous_label,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AnalysisEvent for ClusterLabeled {
|
||||
fn event_id(&self) -> Uuid {
|
||||
self.event_id
|
||||
}
|
||||
|
||||
fn occurred_at(&self) -> DateTime<Utc> {
|
||||
self.occurred_at
|
||||
}
|
||||
|
||||
fn event_type(&self) -> &'static str {
|
||||
"ClusterLabeled"
|
||||
}
|
||||
}
|
||||
|
||||
/// Event publisher trait for analysis events.
///
/// NOTE(review): `publish` takes a generic type parameter, which makes this
/// trait non-object-safe — it cannot be used as `dyn AnalysisEventPublisher`.
/// Implementations must be referenced through generics instead.
#[async_trait::async_trait]
pub trait AnalysisEventPublisher: Send + Sync {
    /// Publish an analysis event.
    ///
    /// # Errors
    ///
    /// Returns an [`EventPublishError`] if the event cannot be serialized or
    /// delivered to the underlying transport.
    async fn publish<E: AnalysisEvent + Serialize + 'static>(
        &self,
        event: E,
    ) -> Result<(), EventPublishError>;
}
|
||||
|
||||
/// Error type for event publishing.
///
/// Variants distinguish between local failures (serialization) and
/// delivery failures (transport, closed channel) so callers can decide
/// whether a retry is worthwhile.
#[derive(Debug, thiserror::Error)]
pub enum EventPublishError {
    /// Serialization failed. Retrying with the same event will fail again.
    #[error("Failed to serialize event: {0}")]
    Serialization(String),

    /// Transport error. May be transient (e.g. broker unavailable).
    #[error("Failed to publish event: {0}")]
    Transport(String),

    /// Channel closed. The consumer side has shut down.
    #[error("Event channel closed")]
    ChannelClosed,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder-style `with_silhouette_score` should attach the optional score
    /// without disturbing the constructor-set counts.
    #[test]
    fn test_clusters_discovered_event() {
        let event = ClustersDiscovered::new(
            10,
            5,
            ClusteringMethod::HDBSCAN,
            100,
        )
        .with_silhouette_score(0.75);

        assert_eq!(event.cluster_count, 10);
        assert_eq!(event.noise_count, 5);
        assert_eq!(event.silhouette_score, Some(0.75));
        assert_eq!(event.event_type(), "ClustersDiscovered");
    }

    /// Constructor should store the confidence verbatim and report the
    /// correct event-type tag.
    #[test]
    fn test_cluster_assigned_event() {
        let event = ClusterAssigned::new(
            EmbeddingId::new(),
            ClusterId::new(),
            0.95,
            0.1,
        );

        assert_eq!(event.confidence, 0.95);
        assert_eq!(event.event_type(), "ClusterAssigned");
    }

    /// Pattern and occurrence count should round-trip through the constructor.
    #[test]
    fn test_motif_detected_event() {
        let pattern = vec![ClusterId::new(), ClusterId::new()];
        let event = MotifDetected::new(
            "motif-1".to_string(),
            pattern.clone(),
            10,
            0.85,
            1500.0,
        );

        assert_eq!(event.pattern.len(), 2);
        assert_eq!(event.occurrences, 10);
        assert_eq!(event.event_type(), "MotifDetected");
    }

    /// Anomaly type should be preserved and the type tag reported correctly.
    #[test]
    fn test_anomaly_detected_event() {
        let event = AnomalyDetected::new(
            EmbeddingId::new(),
            0.9,
            AnomalyType::Novel,
            ClusterId::new(),
            2.5,
        );

        assert_eq!(event.anomaly_type, AnomalyType::Novel);
        assert_eq!(event.event_type(), "AnomalyDetected");
    }
}
|
||||
14
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/mod.rs
vendored
Normal file
14
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/mod.rs
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
//! Domain layer for the Analysis bounded context.
//!
//! Contains core domain entities, value objects, repository traits, and domain events.

pub mod entities;
pub mod events;
pub mod repository;
pub mod value_objects;

// Re-export commonly used types.
//
// NOTE(review): these are glob re-exports, so every public item of the four
// submodules is flattened into `domain::*`; name collisions between
// submodules would become compile errors here.
pub use entities::*;
pub use events::*;
pub use repository::*;
pub use value_objects::*;
|
||||
290
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/repository.rs
vendored
Normal file
290
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/repository.rs
vendored
Normal file
@@ -0,0 +1,290 @@
|
||||
//! Repository traits for the Analysis bounded context.
|
||||
//!
|
||||
//! These traits define the persistence interfaces for domain entities.
|
||||
//! Implementations are provided in the infrastructure layer.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use thiserror::Error;
|
||||
|
||||
use super::entities::{
|
||||
Anomaly, Cluster, ClusterId, EmbeddingId, Motif, Prototype, RecordingId, SequenceAnalysis,
|
||||
};
|
||||
|
||||
/// Errors that can occur during repository operations.
#[derive(Debug, Error)]
pub enum RepositoryError {
    /// Entity not found.
    #[error("Entity not found: {0}")]
    NotFound(String),

    /// Duplicate entity.
    #[error("Duplicate entity: {0}")]
    Duplicate(String),

    /// Database connection error.
    #[error("Connection error: {0}")]
    ConnectionError(String),

    /// Query execution error.
    #[error("Query error: {0}")]
    QueryError(String),

    /// Serialization/deserialization error.
    #[error("Serialization error: {0}")]
    SerializationError(String),

    /// Invalid data error.
    #[error("Invalid data: {0}")]
    InvalidData(String),

    /// Concurrency conflict.
    #[error("Concurrency conflict: {0}")]
    ConcurrencyError(String),

    /// Internal error.
    #[error("Internal error: {0}")]
    Internal(String),
}

/// Result type for repository operations.
///
/// NOTE(review): this alias shadows `std::result::Result` in modules that
/// `use` it; code needing the two-parameter form must spell out
/// `std::result::Result<T, E>`.
pub type Result<T> = std::result::Result<T, RepositoryError>;
|
||||
|
||||
/// Repository for cluster persistence.
///
/// Implementations live in the infrastructure layer; all methods return
/// [`RepositoryError`] on failure via the module-level [`Result`] alias.
#[async_trait]
pub trait ClusterRepository: Send + Sync {
    /// Save a cluster to the repository.
    async fn save_cluster(&self, cluster: &Cluster) -> Result<()>;

    /// Save multiple clusters in a batch.
    ///
    /// NOTE(review): atomicity of the batch is implementation-defined.
    async fn save_clusters(&self, clusters: &[Cluster]) -> Result<()>;

    /// Find a cluster by its ID. Returns `Ok(None)` when absent.
    async fn find_cluster(&self, id: &ClusterId) -> Result<Option<Cluster>>;

    /// List all clusters.
    async fn list_clusters(&self) -> Result<Vec<Cluster>>;

    /// List clusters with pagination.
    ///
    /// `offset` is the number of clusters to skip; `limit` caps the number
    /// returned. Ordering is implementation-defined.
    async fn list_clusters_paginated(
        &self,
        offset: usize,
        limit: usize,
    ) -> Result<Vec<Cluster>>;

    /// Assign an embedding to a cluster.
    async fn assign_to_cluster(
        &self,
        embedding_id: &EmbeddingId,
        cluster_id: &ClusterId,
    ) -> Result<()>;

    /// Remove an embedding from its cluster.
    async fn remove_from_cluster(&self, embedding_id: &EmbeddingId) -> Result<()>;

    /// Find the cluster containing a specific embedding.
    async fn find_cluster_by_embedding(
        &self,
        embedding_id: &EmbeddingId,
    ) -> Result<Option<Cluster>>;

    /// Delete a cluster.
    async fn delete_cluster(&self, id: &ClusterId) -> Result<()>;

    /// Delete all clusters.
    async fn delete_all_clusters(&self) -> Result<()>;

    /// Get cluster count.
    async fn cluster_count(&self) -> Result<usize>;

    /// Find clusters by label pattern.
    ///
    /// NOTE(review): pattern syntax (substring vs SQL `LIKE` vs regex) is
    /// implementation-defined — confirm against the concrete repository.
    async fn find_clusters_by_label(&self, label_pattern: &str) -> Result<Vec<Cluster>>;

    /// Update cluster label. Passing `None` clears the label.
    async fn update_cluster_label(
        &self,
        id: &ClusterId,
        label: Option<String>,
    ) -> Result<()>;
}
|
||||
|
||||
/// Repository for prototype persistence.
#[async_trait]
pub trait PrototypeRepository: Send + Sync {
    /// Save a prototype.
    async fn save_prototype(&self, prototype: &Prototype) -> Result<()>;

    /// Save multiple prototypes in a batch.
    async fn save_prototypes(&self, prototypes: &[Prototype]) -> Result<()>;

    /// Find prototypes for a cluster.
    async fn find_prototypes_by_cluster(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Vec<Prototype>>;

    /// Find the best prototype for a cluster.
    ///
    /// NOTE(review): the ranking criterion ("best") is implementation-defined;
    /// presumably the highest exemplar score — confirm with implementations.
    async fn find_best_prototype(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Option<Prototype>>;

    /// Delete prototypes for a cluster.
    async fn delete_prototypes_by_cluster(&self, cluster_id: &ClusterId) -> Result<()>;

    /// Delete all prototypes.
    async fn delete_all_prototypes(&self) -> Result<()>;
}
|
||||
|
||||
/// Repository for motif persistence.
///
/// Motifs are keyed by a string ID rather than a typed newtype ID.
#[async_trait]
pub trait MotifRepository: Send + Sync {
    /// Save a motif.
    async fn save_motif(&self, motif: &Motif) -> Result<()>;

    /// Save multiple motifs in a batch.
    async fn save_motifs(&self, motifs: &[Motif]) -> Result<()>;

    /// Find a motif by its ID. Returns `Ok(None)` when absent.
    async fn find_motif(&self, id: &str) -> Result<Option<Motif>>;

    /// Find motifs containing a specific cluster.
    async fn find_motifs_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Motif>>;

    /// List all motifs.
    async fn list_motifs(&self) -> Result<Vec<Motif>>;

    /// List motifs with minimum confidence.
    async fn find_motifs_by_confidence(&self, min_confidence: f32) -> Result<Vec<Motif>>;

    /// List motifs with minimum occurrences.
    async fn find_motifs_by_occurrences(&self, min_occurrences: usize) -> Result<Vec<Motif>>;

    /// Delete a motif.
    async fn delete_motif(&self, id: &str) -> Result<()>;

    /// Delete all motifs.
    async fn delete_all_motifs(&self) -> Result<()>;

    /// Get motif count.
    async fn motif_count(&self) -> Result<usize>;

    /// Find motifs by sequence pattern (exact match).
    async fn find_motifs_by_sequence(&self, sequence: &[ClusterId]) -> Result<Vec<Motif>>;

    /// Find motifs by sequence pattern (subsequence match).
    ///
    /// NOTE(review): whether "subsequence" means contiguous (substring) or
    /// gapped is implementation-defined — confirm with implementations.
    async fn find_motifs_containing_subsequence(
        &self,
        subsequence: &[ClusterId],
    ) -> Result<Vec<Motif>>;
}
|
||||
|
||||
/// Repository for sequence analysis persistence.
///
/// Sequence analyses are keyed by the recording they were derived from.
#[async_trait]
pub trait SequenceRepository: Send + Sync {
    /// Save a sequence analysis.
    async fn save_sequence_analysis(&self, analysis: &SequenceAnalysis) -> Result<()>;

    /// Find sequence analysis for a recording. Returns `Ok(None)` when absent.
    async fn find_sequence_by_recording(
        &self,
        recording_id: &RecordingId,
    ) -> Result<Option<SequenceAnalysis>>;

    /// List all sequence analyses.
    async fn list_sequence_analyses(&self) -> Result<Vec<SequenceAnalysis>>;

    /// Delete sequence analysis for a recording.
    async fn delete_sequence_by_recording(&self, recording_id: &RecordingId) -> Result<()>;

    /// Delete all sequence analyses.
    async fn delete_all_sequences(&self) -> Result<()>;

    /// Find sequences with entropy above threshold.
    async fn find_sequences_by_entropy(&self, min_entropy: f32) -> Result<Vec<SequenceAnalysis>>;

    /// Find sequences with stereotypy above threshold.
    async fn find_sequences_by_stereotypy(
        &self,
        min_stereotypy: f32,
    ) -> Result<Vec<SequenceAnalysis>>;
}
|
||||
|
||||
/// Repository for anomaly persistence.
///
/// Anomalies are keyed by the embedding they were detected on.
#[async_trait]
pub trait AnomalyRepository: Send + Sync {
    /// Save an anomaly.
    async fn save_anomaly(&self, anomaly: &Anomaly) -> Result<()>;

    /// Save multiple anomalies in a batch.
    async fn save_anomalies(&self, anomalies: &[Anomaly]) -> Result<()>;

    /// Find an anomaly by embedding ID. Returns `Ok(None)` when absent.
    async fn find_anomaly(&self, embedding_id: &EmbeddingId) -> Result<Option<Anomaly>>;

    /// List all anomalies.
    async fn list_anomalies(&self) -> Result<Vec<Anomaly>>;

    /// Find anomalies with score above threshold.
    async fn find_anomalies_by_score(&self, min_score: f32) -> Result<Vec<Anomaly>>;

    /// Find anomalies near a specific cluster.
    async fn find_anomalies_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Anomaly>>;

    /// Delete an anomaly.
    async fn delete_anomaly(&self, embedding_id: &EmbeddingId) -> Result<()>;

    /// Delete all anomalies.
    async fn delete_all_anomalies(&self) -> Result<()>;

    /// Get anomaly count.
    async fn anomaly_count(&self) -> Result<usize>;
}
|
||||
|
||||
/// Combined repository for all analysis entities.
///
/// This trait combines all individual repositories for convenience
/// when a single interface to all analysis data is needed.
#[async_trait]
pub trait AnalysisRepository:
    ClusterRepository + PrototypeRepository + MotifRepository + SequenceRepository + AnomalyRepository
{
    /// Clear all analysis data.
    ///
    /// Deletes clusters, prototypes, motifs, sequences, and anomalies in that
    /// fixed order. NOTE(review): the deletes are sequential and the first
    /// failing `?` aborts the rest, so a failure can leave the store
    /// partially cleared; wrap in a [`UnitOfWork`] transaction if atomicity
    /// is required.
    async fn clear_all(&self) -> Result<()> {
        self.delete_all_clusters().await?;
        self.delete_all_prototypes().await?;
        self.delete_all_motifs().await?;
        self.delete_all_sequences().await?;
        self.delete_all_anomalies().await?;
        Ok(())
    }
}
|
||||
|
||||
/// Unit of work for transactional operations.
///
/// NOTE(review): `commit`/`rollback` take `&self` and are separate from the
/// `Repository` returned by `begin`, so implementations must track the
/// "current transaction" internally; the association between a `begin` call
/// and a later `commit`/`rollback` is implementation-defined.
#[async_trait]
pub trait UnitOfWork: Send + Sync {
    /// Type of repository returned by this unit of work.
    type Repository: AnalysisRepository;

    /// Begin a new transaction and return a repository scoped to it.
    async fn begin(&self) -> Result<Self::Repository>;

    /// Commit the current transaction.
    async fn commit(&self) -> Result<()>;

    /// Rollback the current transaction.
    async fn rollback(&self) -> Result<()>;
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The thiserror-derived `Display` output should surface the detail
    /// string embedded in each variant.
    #[test]
    fn test_repository_error_display() {
        let not_found = RepositoryError::NotFound("cluster-123".to_string());
        assert!(not_found.to_string().contains("cluster-123"));

        let query = RepositoryError::QueryError("syntax error".to_string());
        assert!(query.to_string().contains("syntax error"));
    }
}
|
||||
616
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/value_objects.rs
vendored
Normal file
616
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/domain/value_objects.rs
vendored
Normal file
@@ -0,0 +1,616 @@
|
||||
//! Value objects for the Analysis bounded context.
|
||||
//!
|
||||
//! Value objects are immutable objects that represent concepts without identity.
|
||||
//! They are defined by their attributes rather than a unique identifier.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::entities::ClusterId;
|
||||
|
||||
/// Method used for clustering embeddings.
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub enum ClusteringMethod {
|
||||
/// HDBSCAN (Hierarchical Density-Based Spatial Clustering).
|
||||
/// Good for discovering clusters of varying densities and shapes.
|
||||
HDBSCAN,
|
||||
|
||||
/// K-Means clustering with fixed number of clusters.
|
||||
KMeans {
|
||||
/// Number of clusters to create.
|
||||
k: usize,
|
||||
},
|
||||
|
||||
/// Spectral clustering using eigenvalues of similarity matrix.
|
||||
Spectral {
|
||||
/// Number of clusters to create.
|
||||
n_clusters: usize,
|
||||
},
|
||||
|
||||
/// Agglomerative hierarchical clustering.
|
||||
Agglomerative {
|
||||
/// Number of clusters to create.
|
||||
n_clusters: usize,
|
||||
/// Linkage criterion (ward, complete, average, single).
|
||||
linkage: LinkageMethod,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for ClusteringMethod {
|
||||
fn default() -> Self {
|
||||
Self::HDBSCAN
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ClusteringMethod {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ClusteringMethod::HDBSCAN => write!(f, "HDBSCAN"),
|
||||
ClusteringMethod::KMeans { k } => write!(f, "K-Means (k={})", k),
|
||||
ClusteringMethod::Spectral { n_clusters } => {
|
||||
write!(f, "Spectral (n={})", n_clusters)
|
||||
}
|
||||
ClusteringMethod::Agglomerative { n_clusters, linkage } => {
|
||||
write!(f, "Agglomerative (n={}, {:?})", n_clusters, linkage)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Linkage method for agglomerative clustering.
///
/// Determines how the distance between two clusters is computed when
/// deciding which pair to merge next.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkageMethod {
    /// Ward's minimum variance method.
    Ward,
    /// Complete linkage (maximum distance).
    Complete,
    /// Average linkage (mean distance).
    Average,
    /// Single linkage (minimum distance).
    Single,
}

impl Default for LinkageMethod {
    /// Ward linkage is the default.
    fn default() -> Self {
        Self::Ward
    }
}
|
||||
|
||||
/// Distance metric for clustering.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum DistanceMetric {
|
||||
/// Euclidean distance (L2 norm).
|
||||
Euclidean,
|
||||
/// Cosine distance (1 - cosine similarity).
|
||||
Cosine,
|
||||
/// Manhattan distance (L1 norm).
|
||||
Manhattan,
|
||||
/// Poincare distance (hyperbolic space).
|
||||
Poincare,
|
||||
}
|
||||
|
||||
impl Default for DistanceMetric {
|
||||
fn default() -> Self {
|
||||
Self::Cosine
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DistanceMetric {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DistanceMetric::Euclidean => write!(f, "Euclidean"),
|
||||
DistanceMetric::Cosine => write!(f, "Cosine"),
|
||||
DistanceMetric::Manhattan => write!(f, "Manhattan"),
|
||||
DistanceMetric::Poincare => write!(f, "Poincare"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parameters for clustering algorithms.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusteringParameters {
|
||||
/// Minimum number of points to form a cluster (HDBSCAN).
|
||||
pub min_cluster_size: usize,
|
||||
|
||||
/// Minimum number of samples in neighborhood (HDBSCAN).
|
||||
pub min_samples: usize,
|
||||
|
||||
/// Epsilon for DBSCAN-like algorithms (optional distance threshold).
|
||||
pub epsilon: Option<f32>,
|
||||
|
||||
/// Distance metric to use.
|
||||
pub metric: DistanceMetric,
|
||||
|
||||
/// Maximum number of clusters (optional limit).
|
||||
pub max_clusters: Option<usize>,
|
||||
|
||||
/// Whether to allow single-point clusters.
|
||||
pub allow_single_cluster: bool,
|
||||
}
|
||||
|
||||
impl Default for ClusteringParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_cluster_size: 5,
|
||||
min_samples: 3,
|
||||
epsilon: None,
|
||||
metric: DistanceMetric::Cosine,
|
||||
max_clusters: None,
|
||||
allow_single_cluster: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusteringParameters {
|
||||
/// Create parameters for HDBSCAN.
|
||||
#[must_use]
|
||||
pub fn hdbscan(min_cluster_size: usize, min_samples: usize) -> Self {
|
||||
Self {
|
||||
min_cluster_size,
|
||||
min_samples,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create parameters for K-means.
|
||||
#[must_use]
|
||||
pub fn kmeans() -> Self {
|
||||
Self {
|
||||
min_cluster_size: 1,
|
||||
min_samples: 1,
|
||||
allow_single_cluster: true,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the distance metric.
|
||||
#[must_use]
|
||||
pub fn with_metric(mut self, metric: DistanceMetric) -> Self {
|
||||
self.metric = metric;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the epsilon threshold.
|
||||
#[must_use]
|
||||
pub fn with_epsilon(mut self, epsilon: f32) -> Self {
|
||||
self.epsilon = Some(epsilon);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for clustering operations.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusteringConfig {
|
||||
/// The clustering method to use.
|
||||
pub method: ClusteringMethod,
|
||||
|
||||
/// Parameters for the clustering algorithm.
|
||||
pub parameters: ClusteringParameters,
|
||||
|
||||
/// Whether to compute cluster prototypes.
|
||||
pub compute_prototypes: bool,
|
||||
|
||||
/// Number of prototypes to compute per cluster.
|
||||
pub prototypes_per_cluster: usize,
|
||||
|
||||
/// Whether to compute silhouette scores.
|
||||
pub compute_silhouette: bool,
|
||||
|
||||
/// Random seed for reproducibility.
|
||||
pub random_seed: Option<u64>,
|
||||
}
|
||||
|
||||
impl Default for ClusteringConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
method: ClusteringMethod::HDBSCAN,
|
||||
parameters: ClusteringParameters::default(),
|
||||
compute_prototypes: true,
|
||||
prototypes_per_cluster: 3,
|
||||
compute_silhouette: true,
|
||||
random_seed: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ClusteringConfig {
|
||||
/// Create a HDBSCAN configuration.
|
||||
#[must_use]
|
||||
pub fn hdbscan(min_cluster_size: usize, min_samples: usize) -> Self {
|
||||
Self {
|
||||
method: ClusteringMethod::HDBSCAN,
|
||||
parameters: ClusteringParameters::hdbscan(min_cluster_size, min_samples),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a K-means configuration.
|
||||
#[must_use]
|
||||
pub fn kmeans(k: usize) -> Self {
|
||||
Self {
|
||||
method: ClusteringMethod::KMeans { k },
|
||||
parameters: ClusteringParameters::kmeans(),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set a random seed for reproducibility.
|
||||
#[must_use]
|
||||
pub fn with_seed(mut self, seed: u64) -> Self {
|
||||
self.random_seed = Some(seed);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for motif detection.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MotifConfig {
|
||||
/// Minimum length of motifs to detect.
|
||||
pub min_length: usize,
|
||||
|
||||
/// Maximum length of motifs to detect.
|
||||
pub max_length: usize,
|
||||
|
||||
/// Minimum number of occurrences for a motif.
|
||||
pub min_occurrences: usize,
|
||||
|
||||
/// Minimum confidence threshold for motifs.
|
||||
pub min_confidence: f32,
|
||||
|
||||
/// Whether to allow overlapping occurrences.
|
||||
pub allow_overlap: bool,
|
||||
|
||||
/// Maximum gap (in clusters) between motif elements.
|
||||
pub max_gap: usize,
|
||||
}
|
||||
|
||||
impl Default for MotifConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
min_length: 2,
|
||||
max_length: 10,
|
||||
min_occurrences: 3,
|
||||
min_confidence: 0.5,
|
||||
allow_overlap: false,
|
||||
max_gap: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MotifConfig {
|
||||
/// Create a strict motif configuration (no gaps, no overlap).
|
||||
#[must_use]
|
||||
pub fn strict() -> Self {
|
||||
Self {
|
||||
min_length: 3,
|
||||
max_length: 8,
|
||||
min_occurrences: 5,
|
||||
min_confidence: 0.7,
|
||||
allow_overlap: false,
|
||||
max_gap: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a relaxed motif configuration (allows gaps).
|
||||
#[must_use]
|
||||
pub fn relaxed() -> Self {
|
||||
Self {
|
||||
min_length: 2,
|
||||
max_length: 15,
|
||||
min_occurrences: 2,
|
||||
min_confidence: 0.3,
|
||||
allow_overlap: true,
|
||||
max_gap: 2,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the length range.
|
||||
#[must_use]
|
||||
pub fn with_length_range(mut self, min: usize, max: usize) -> Self {
|
||||
self.min_length = min;
|
||||
self.max_length = max;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics computed from sequence analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceMetrics {
    /// Shannon entropy of the sequence.
    pub entropy: f32,

    /// Normalized entropy (entropy / max_entropy).
    pub normalized_entropy: f32,

    /// Stereotypy score (1 - normalized_entropy).
    pub stereotypy: f32,

    /// Number of unique clusters in the sequence.
    pub unique_clusters: usize,

    /// Number of unique transitions in the sequence.
    pub unique_transitions: usize,

    /// Total number of transitions.
    pub total_transitions: usize,

    /// Most common transition and its probability.
    pub dominant_transition: Option<(ClusterId, ClusterId, f32)>,

    /// Repetition rate (self-transitions / total).
    pub repetition_rate: f32,
}

impl Default for SequenceMetrics {
    /// Metrics for an empty sequence: zero entropy and, consistently with
    /// `stereotypy = 1 - normalized_entropy`, full stereotypy (1.0).
    fn default() -> Self {
        Self {
            entropy: 0.0,
            normalized_entropy: 0.0,
            stereotypy: 1.0,
            unique_clusters: 0,
            unique_transitions: 0,
            total_transitions: 0,
            dominant_transition: None,
            repetition_rate: 0.0,
        }
    }
}
|
||||
|
||||
/// Transition matrix for Markov chain analysis.
///
/// Represents the probabilities of transitioning from one cluster to another.
/// Typical usage: `new` → repeated `record_transition` → `compute_probabilities`
/// → queries (`probability`, `non_zero_transitions`, …).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransitionMatrix {
    /// Ordered list of cluster IDs (defines row/column indices).
    pub cluster_ids: Vec<ClusterId>,

    /// Transition probabilities (row = source, column = target).
    /// Values are probabilities (0.0 to 1.0, rows sum to 1.0).
    pub probabilities: Vec<Vec<f32>>,

    /// Raw observation counts (row = source, column = target).
    pub observations: Vec<Vec<u32>>,

    /// Mapping from ClusterId to matrix index.
    ///
    /// `#[serde(skip)]` means this map deserializes as EMPTY: after
    /// deserialization, `index_of` returns `None` for every cluster (so
    /// `record_transition` is a silent no-op and `probability` returns
    /// `None`) until [`TransitionMatrix::rebuild_index_map`] is called.
    #[serde(skip)]
    index_map: HashMap<ClusterId, usize>,
}

impl TransitionMatrix {
    /// Create a new, all-zero transition matrix for the given clusters.
    /// Row/column order follows `cluster_ids`.
    #[must_use]
    pub fn new(cluster_ids: Vec<ClusterId>) -> Self {
        let n = cluster_ids.len();
        let index_map: HashMap<ClusterId, usize> = cluster_ids
            .iter()
            .enumerate()
            .map(|(i, id)| (*id, i))
            .collect();

        Self {
            cluster_ids,
            probabilities: vec![vec![0.0; n]; n],
            observations: vec![vec![0; n]; n],
            index_map,
        }
    }

    /// Get the number of clusters (states) in the matrix.
    #[must_use]
    pub fn size(&self) -> usize {
        self.cluster_ids.len()
    }

    /// Get the row/column index for a cluster ID, or `None` if the cluster
    /// is unknown (or the index map has not been rebuilt after deserialization).
    #[must_use]
    pub fn index_of(&self, cluster_id: &ClusterId) -> Option<usize> {
        self.index_map.get(cluster_id).copied()
    }

    /// Record an observed transition.
    ///
    /// Transitions involving an unknown cluster are silently ignored.
    /// Probabilities are NOT updated; call `compute_probabilities` afterwards.
    pub fn record_transition(&mut self, from: &ClusterId, to: &ClusterId) {
        if let (Some(i), Some(j)) = (self.index_of(from), self.index_of(to)) {
            self.observations[i][j] += 1;
        }
    }

    /// Compute probabilities from observation counts.
    ///
    /// Rows with no observations are left as all zeros (they do NOT sum to 1).
    pub fn compute_probabilities(&mut self) {
        for i in 0..self.size() {
            let row_sum: u32 = self.observations[i].iter().sum();
            if row_sum > 0 {
                for j in 0..self.size() {
                    self.probabilities[i][j] = self.observations[i][j] as f32 / row_sum as f32;
                }
            }
        }
    }

    /// Get the transition probability from one cluster to another, or `None`
    /// if either cluster is unknown.
    #[must_use]
    pub fn probability(&self, from: &ClusterId, to: &ClusterId) -> Option<f32> {
        match (self.index_of(from), self.index_of(to)) {
            (Some(i), Some(j)) => Some(self.probabilities[i][j]),
            _ => None,
        }
    }

    /// Get the observation count for a transition, or `None` if either
    /// cluster is unknown.
    #[must_use]
    pub fn observation_count(&self, from: &ClusterId, to: &ClusterId) -> Option<u32> {
        match (self.index_of(from), self.index_of(to)) {
            (Some(i), Some(j)) => Some(self.observations[i][j]),
            _ => None,
        }
    }

    /// Get all non-zero transitions as (from, to, probability) tuples,
    /// in row-major order.
    #[must_use]
    pub fn non_zero_transitions(&self) -> Vec<(ClusterId, ClusterId, f32)> {
        let mut transitions = Vec::new();
        for (i, from) in self.cluster_ids.iter().enumerate() {
            for (j, to) in self.cluster_ids.iter().enumerate() {
                let prob = self.probabilities[i][j];
                if prob > 0.0 {
                    transitions.push((*from, *to, prob));
                }
            }
        }
        transitions
    }

    /// Approximate the stationary distribution by power iteration.
    ///
    /// Returns `None` only for an empty matrix. NOTE(review): despite what
    /// an earlier comment claimed, this does NOT detect non-ergodic chains —
    /// after `max_iterations` without convergence the last iterate is
    /// returned anyway, and for a matrix with all-zero rows (probabilities
    /// never computed) the result decays toward zero rather than a valid
    /// distribution. Callers needing a guaranteed stationary distribution
    /// must check ergodicity themselves.
    #[must_use]
    pub fn stationary_distribution(&self) -> Option<Vec<f32>> {
        // Power iteration method for finding stationary distribution
        let n = self.size();
        if n == 0 {
            return None;
        }

        // Start from the uniform distribution.
        let mut dist = vec![1.0 / n as f32; n];
        let max_iterations = 1000;
        let tolerance = 1e-8;

        for _ in 0..max_iterations {
            let mut new_dist = vec![0.0; n];

            // Matrix-vector multiplication: new_dist = dist * P^T
            for j in 0..n {
                for i in 0..n {
                    new_dist[j] += dist[i] * self.probabilities[i][j];
                }
            }

            // Check convergence via L1 distance between successive iterates.
            let diff: f32 = dist
                .iter()
                .zip(new_dist.iter())
                .map(|(a, b)| (a - b).abs())
                .sum();

            dist = new_dist;

            if diff < tolerance {
                return Some(dist);
            }
        }

        // Did not converge within the iteration budget; return best effort.
        Some(dist)
    }

    /// Rebuild the index map (needed after deserialization, because the map
    /// is `#[serde(skip)]` and comes back empty).
    pub fn rebuild_index_map(&mut self) {
        self.index_map = self
            .cluster_ids
            .iter()
            .enumerate()
            .map(|(i, id)| (*id, i))
            .collect();
    }
}
|
||||
|
||||
/// Result of a clustering operation.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ClusteringResult {
|
||||
/// The clusters discovered.
|
||||
pub clusters: Vec<super::entities::Cluster>,
|
||||
|
||||
/// Embeddings classified as noise (HDBSCAN).
|
||||
pub noise: Vec<super::entities::EmbeddingId>,
|
||||
|
||||
/// Silhouette score (if computed).
|
||||
pub silhouette_score: Option<f32>,
|
||||
|
||||
/// V-measure score (if ground truth available).
|
||||
pub v_measure: Option<f32>,
|
||||
|
||||
/// Prototypes for each cluster.
|
||||
pub prototypes: Vec<super::entities::Prototype>,
|
||||
|
||||
/// Parameters used for clustering.
|
||||
pub parameters: ClusteringParameters,
|
||||
|
||||
/// Method used for clustering.
|
||||
pub method: ClusteringMethod,
|
||||
}
|
||||
|
||||
impl ClusteringResult {
|
||||
/// Get the number of clusters (excluding noise).
|
||||
#[must_use]
|
||||
pub fn cluster_count(&self) -> usize {
|
||||
self.clusters.len()
|
||||
}
|
||||
|
||||
/// Get the noise rate (proportion of points in noise).
|
||||
#[must_use]
|
||||
pub fn noise_rate(&self) -> f32 {
|
||||
let total = self
|
||||
.clusters
|
||||
.iter()
|
||||
.map(|c| c.member_count())
|
||||
.sum::<usize>()
|
||||
+ self.noise.len();
|
||||
if total == 0 {
|
||||
0.0
|
||||
} else {
|
||||
self.noise.len() as f32 / total as f32
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `hdbscan` constructor should select the HDBSCAN method and thread the
    /// density minimums into the parameters.
    #[test]
    fn test_clustering_config_creation() {
        let config = ClusteringConfig::hdbscan(10, 5);
        assert!(matches!(config.method, ClusteringMethod::HDBSCAN));
        assert_eq!(config.parameters.min_cluster_size, 10);
        assert_eq!(config.parameters.min_samples, 5);
    }

    /// Recorded counts should normalize into row-stochastic probabilities.
    #[test]
    fn test_transition_matrix() {
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let c3 = ClusterId::new();

        let mut matrix = TransitionMatrix::new(vec![c1, c2, c3]);

        // Record some transitions
        matrix.record_transition(&c1, &c2);
        matrix.record_transition(&c1, &c2);
        matrix.record_transition(&c1, &c3);
        matrix.record_transition(&c2, &c1);

        matrix.compute_probabilities();

        // c1 -> c2 should be 2/3
        assert!((matrix.probability(&c1, &c2).unwrap() - 2.0 / 3.0).abs() < 0.001);
        // c1 -> c3 should be 1/3
        assert!((matrix.probability(&c1, &c3).unwrap() - 1.0 / 3.0).abs() < 0.001);
        // c2 -> c1 should be 1.0
        assert!((matrix.probability(&c2, &c1).unwrap() - 1.0).abs() < 0.001);
    }

    /// Preset configurations should match their documented strict/relaxed values.
    #[test]
    fn test_motif_config() {
        let config = MotifConfig::strict();
        assert_eq!(config.min_length, 3);
        assert_eq!(config.min_occurrences, 5);
        assert!(!config.allow_overlap);

        let relaxed = MotifConfig::relaxed();
        assert!(relaxed.allow_overlap);
        assert_eq!(relaxed.max_gap, 2);
    }

    /// Display output should use the metrics' canonical names.
    #[test]
    fn test_distance_metric_display() {
        assert_eq!(format!("{}", DistanceMetric::Cosine), "Cosine");
        assert_eq!(format!("{}", DistanceMetric::Euclidean), "Euclidean");
    }
}
|
||||
404
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/hdbscan.rs
vendored
Normal file
404
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/hdbscan.rs
vendored
Normal file
@@ -0,0 +1,404 @@
|
||||
//! HDBSCAN clustering implementation.
|
||||
//!
|
||||
//! Hierarchical Density-Based Spatial Clustering of Applications with Noise.
|
||||
//! This implementation uses core distance and mutual reachability distance
|
||||
//! to build a minimum spanning tree and extract clusters.
|
||||
|
||||
use ndarray::{Array2, ArrayView1};
|
||||
use petgraph::graph::{NodeIndex, UnGraph};
|
||||
use petgraph::algo::min_spanning_tree;
|
||||
use petgraph::data::FromElements;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use crate::application::services::AnalysisError;
|
||||
use crate::domain::value_objects::DistanceMetric;
|
||||
|
||||
/// HDBSCAN clustering algorithm.
///
/// Configuration for a density-based clustering run; the actual work is
/// performed by [`HdbscanClusterer::fit`].
pub struct HdbscanClusterer {
    /// Minimum cluster size. Connected components smaller than this are
    /// labelled as noise.
    min_cluster_size: usize,
    /// Minimum samples for core point determination (the `k` used for the
    /// k-th-nearest-neighbor core distance).
    min_samples: usize,
    /// Distance metric to use between raw feature vectors.
    metric: DistanceMetric,
}
|
||||
|
||||
impl HdbscanClusterer {
|
||||
/// Create a new HDBSCAN clusterer.
|
||||
#[must_use]
|
||||
pub fn new(min_cluster_size: usize, min_samples: usize, metric: DistanceMetric) -> Self {
|
||||
Self {
|
||||
min_cluster_size,
|
||||
min_samples,
|
||||
metric,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fit HDBSCAN to the data and return cluster labels.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `data` - 2D array where rows are samples and columns are features
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Vector of cluster labels (-1 for noise).
|
||||
#[instrument(skip(self, data), fields(n_samples = data.nrows(), n_features = data.ncols()))]
|
||||
pub fn fit(&self, data: &Array2<f32>) -> Result<Vec<i32>, AnalysisError> {
|
||||
let n = data.nrows();
|
||||
if n < self.min_cluster_size {
|
||||
return Err(AnalysisError::InsufficientData(format!(
|
||||
"Need at least {} samples, got {}",
|
||||
self.min_cluster_size, n
|
||||
)));
|
||||
}
|
||||
|
||||
debug!(
|
||||
n_samples = n,
|
||||
min_cluster_size = self.min_cluster_size,
|
||||
min_samples = self.min_samples,
|
||||
"Starting HDBSCAN fit"
|
||||
);
|
||||
|
||||
// Step 1: Compute pairwise distances
|
||||
let distances = self.compute_pairwise_distances(data);
|
||||
|
||||
// Step 2: Compute core distances
|
||||
let core_distances = self.compute_core_distances(&distances);
|
||||
|
||||
// Step 3: Compute mutual reachability distances
|
||||
let mrd = self.compute_mutual_reachability(&distances, &core_distances);
|
||||
|
||||
// Step 4: Build minimum spanning tree
|
||||
let mst = self.build_mst(&mrd);
|
||||
|
||||
// Step 5: Build cluster hierarchy
|
||||
let labels = self.extract_clusters(&mst, n);
|
||||
|
||||
debug!(
|
||||
n_clusters = labels.iter().filter(|&&l| l >= 0).collect::<HashSet<_>>().len(),
|
||||
n_noise = labels.iter().filter(|&&l| l < 0).count(),
|
||||
"HDBSCAN fit completed"
|
||||
);
|
||||
|
||||
Ok(labels)
|
||||
}
|
||||
|
||||
/// Compute pairwise distance matrix.
|
||||
fn compute_pairwise_distances(&self, data: &Array2<f32>) -> Array2<f32> {
|
||||
let n = data.nrows();
|
||||
let mut distances = Array2::<f32>::zeros((n, n));
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
let dist = self.distance(data.row(i), data.row(j));
|
||||
distances[[i, j]] = dist;
|
||||
distances[[j, i]] = dist;
|
||||
}
|
||||
}
|
||||
|
||||
distances
|
||||
}
|
||||
|
||||
/// Compute core distance for each point (k-th nearest neighbor distance).
|
||||
fn compute_core_distances(&self, distances: &Array2<f32>) -> Vec<f32> {
|
||||
let n = distances.nrows();
|
||||
let k = self.min_samples.min(n - 1);
|
||||
|
||||
let mut core_distances = Vec::with_capacity(n);
|
||||
|
||||
for i in 0..n {
|
||||
let mut row_distances: Vec<f32> = distances.row(i).to_vec();
|
||||
row_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
// k-th nearest neighbor (index k because index 0 is self with distance 0)
|
||||
let core_dist = row_distances.get(k).copied().unwrap_or(f32::MAX);
|
||||
core_distances.push(core_dist);
|
||||
}
|
||||
|
||||
core_distances
|
||||
}
|
||||
|
||||
/// Compute mutual reachability distance matrix.
|
||||
fn compute_mutual_reachability(
|
||||
&self,
|
||||
distances: &Array2<f32>,
|
||||
core_distances: &[f32],
|
||||
) -> Array2<f32> {
|
||||
let n = distances.nrows();
|
||||
let mut mrd = Array2::<f32>::zeros((n, n));
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
let d = distances[[i, j]];
|
||||
let mr = core_distances[i].max(core_distances[j]).max(d);
|
||||
mrd[[i, j]] = mr;
|
||||
mrd[[j, i]] = mr;
|
||||
}
|
||||
}
|
||||
|
||||
mrd
|
||||
}
|
||||
|
||||
/// Build minimum spanning tree from mutual reachability distances.
|
||||
fn build_mst(&self, mrd: &Array2<f32>) -> Vec<(usize, usize, f32)> {
|
||||
let n = mrd.nrows();
|
||||
|
||||
// Build graph with all edges
|
||||
let mut graph = UnGraph::<usize, f32>::new_undirected();
|
||||
|
||||
// Add nodes
|
||||
let nodes: Vec<NodeIndex> = (0..n).map(|i| graph.add_node(i)).collect();
|
||||
|
||||
// Add edges (only upper triangle to avoid duplicates)
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
let weight = mrd[[i, j]];
|
||||
if weight < f32::MAX {
|
||||
graph.add_edge(nodes[i], nodes[j], weight);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute MST using Prim's algorithm via petgraph
|
||||
let mst_graph = UnGraph::<usize, f32>::from_elements(min_spanning_tree(&graph));
|
||||
|
||||
// Extract edges from MST
|
||||
let mut edges: Vec<(usize, usize, f32)> = mst_graph
|
||||
.edge_indices()
|
||||
.filter_map(|e| {
|
||||
let (a, b) = mst_graph.edge_endpoints(e)?;
|
||||
let weight = *mst_graph.edge_weight(e)?;
|
||||
let a_val = *mst_graph.node_weight(a)?;
|
||||
let b_val = *mst_graph.node_weight(b)?;
|
||||
Some((a_val, b_val, weight))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by weight descending for cluster extraction
|
||||
edges.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
edges
|
||||
}
|
||||
|
||||
/// Extract flat clusters from MST using HDBSCAN* algorithm.
|
||||
fn extract_clusters(&self, mst: &[(usize, usize, f32)], n: usize) -> Vec<i32> {
|
||||
// Use simplified cluster extraction based on edge cutting
|
||||
// This is a simplified version - full HDBSCAN uses condensed tree
|
||||
|
||||
let mut labels = vec![-1i32; n];
|
||||
let mut current_cluster = 0i32;
|
||||
|
||||
// Build adjacency from MST
|
||||
let mut adj: HashMap<usize, Vec<(usize, f32)>> = HashMap::new();
|
||||
for &(a, b, w) in mst {
|
||||
adj.entry(a).or_default().push((b, w));
|
||||
adj.entry(b).or_default().push((a, w));
|
||||
}
|
||||
|
||||
// Find connected components, removing edges above threshold
|
||||
// Use adaptive threshold based on edge weight distribution
|
||||
let threshold = self.compute_threshold(mst);
|
||||
|
||||
let mut visited = vec![false; n];
|
||||
|
||||
for start in 0..n {
|
||||
if visited[start] {
|
||||
continue;
|
||||
}
|
||||
|
||||
// BFS to find connected component
|
||||
let mut component = Vec::new();
|
||||
let mut queue = vec![start];
|
||||
|
||||
while let Some(node) = queue.pop() {
|
||||
if visited[node] {
|
||||
continue;
|
||||
}
|
||||
visited[node] = true;
|
||||
component.push(node);
|
||||
|
||||
if let Some(neighbors) = adj.get(&node) {
|
||||
for &(neighbor, weight) in neighbors {
|
||||
if !visited[neighbor] && weight < threshold {
|
||||
queue.push(neighbor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Only assign cluster label if component is large enough
|
||||
if component.len() >= self.min_cluster_size {
|
||||
for &node in &component {
|
||||
labels[node] = current_cluster;
|
||||
}
|
||||
current_cluster += 1;
|
||||
}
|
||||
}
|
||||
|
||||
labels
|
||||
}
|
||||
|
||||
/// Compute adaptive threshold for edge cutting.
|
||||
fn compute_threshold(&self, mst: &[(usize, usize, f32)]) -> f32 {
|
||||
if mst.is_empty() {
|
||||
return f32::MAX;
|
||||
}
|
||||
|
||||
let weights: Vec<f32> = mst.iter().map(|&(_, _, w)| w).collect();
|
||||
let n = weights.len();
|
||||
|
||||
// Use median + IQR method for threshold
|
||||
let mut sorted = weights.clone();
|
||||
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
let _median = sorted[n / 2];
|
||||
let q1 = sorted[n / 4];
|
||||
let q3 = sorted[3 * n / 4];
|
||||
let iqr = q3 - q1;
|
||||
|
||||
// Threshold at Q3 + 1.5 * IQR (outlier boundary)
|
||||
q3 + 1.5 * iqr
|
||||
}
|
||||
|
||||
/// Compute distance between two vectors.
|
||||
fn distance(&self, a: ArrayView1<f32>, b: ArrayView1<f32>) -> f32 {
|
||||
match self.metric {
|
||||
DistanceMetric::Euclidean => {
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
DistanceMetric::Cosine => {
|
||||
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
|
||||
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm_a == 0.0 || norm_b == 0.0 {
|
||||
1.0
|
||||
} else {
|
||||
1.0 - (dot / (norm_a * norm_b))
|
||||
}
|
||||
}
|
||||
DistanceMetric::Manhattan => a.iter().zip(b.iter()).map(|(x, y)| (x - y).abs()).sum(),
|
||||
DistanceMetric::Poincare => {
|
||||
// Simplified - would need proper hyperbolic distance
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Single linkage tree node for cluster hierarchy.
///
/// NOTE(review): not referenced anywhere in this file's visible code —
/// appears to be scaffolding for a full condensed-tree extraction; confirm
/// before removing.
#[derive(Debug, Clone)]
struct SingleLinkageNode {
    // Index of the left child merge, if any — presumably into a node arena; verify.
    left: Option<usize>,
    // Index of the right child merge, if any.
    right: Option<usize>,
    // Merge distance at which this node was formed.
    distance: f32,
    // Number of points contained under this node.
    size: usize,
}
|
||||
|
||||
/// HDBSCAN condensed tree for cluster extraction.
///
/// NOTE(review): declared but not used by the simplified `extract_clusters`
/// in this file; retained for a future full HDBSCAN* implementation.
#[derive(Debug)]
pub struct CondensedTree {
    // Flat storage of tree nodes; parent/child links are indices into this vec.
    nodes: Vec<CondensedNode>,
}
|
||||
|
||||
#[derive(Debug, Clone)]
struct CondensedNode {
    // Index of the parent node, None for the root.
    parent: Option<usize>,
    // Indices of child nodes.
    children: Vec<usize>,
    // Density level (lambda = 1/distance) at which this cluster appears.
    lambda_birth: f32,
    // Density level at which this cluster splits or vanishes.
    lambda_death: f32,
    // Cluster stability score used for flat-cluster selection.
    stability: f32,
    // Point indices belonging to this node.
    points: HashSet<usize>,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// Build a 30x2 dataset with three well-separated groups of 10 points,
    /// each jittered deterministically via `rand_offset`.
    fn create_clustered_data() -> Array2<f32> {
        // Create 3 clear clusters with deterministic variation
        let mut data = Array2::<f32>::zeros((30, 2));

        // Cluster 1: around (0, 0)
        for i in 0..10 {
            data[[i, 0]] = rand_offset(0.0, i);
            data[[i, 1]] = rand_offset(0.0, i + 1);
        }

        // Cluster 2: around (5, 5)
        for i in 10..20 {
            data[[i, 0]] = rand_offset(5.0, i);
            data[[i, 1]] = rand_offset(5.0, i + 1);
        }

        // Cluster 3: around (10, 0)
        for i in 20..30 {
            data[[i, 0]] = rand_offset(10.0, i);
            data[[i, 1]] = rand_offset(0.0, i + 1);
        }

        data
    }

    /// Deterministic "random" jitter in roughly [-0.25, 0.25) around `center`,
    /// derived from the seed via the golden ratio.
    fn rand_offset(center: f32, seed: usize) -> f32 {
        // Deterministic "random" offset using seed for variation
        let variation = ((seed as f32 * 1.618) % 1.0 - 0.5) * 0.5;
        center + variation
    }

    /// A fit over clearly separated data should label every point and find
    /// at least one cluster.
    #[test]
    fn test_hdbscan_basic() {
        let clusterer = HdbscanClusterer::new(3, 2, DistanceMetric::Euclidean);
        let data = create_clustered_data();

        let labels = clusterer.fit(&data).unwrap();
        assert_eq!(labels.len(), 30);

        // Should have at least one cluster
        let n_clusters = labels.iter().filter(|&&l| l >= 0).collect::<HashSet<_>>().len();
        assert!(n_clusters >= 1);
    }

    /// Fewer samples than `min_cluster_size` must be rejected with an error.
    #[test]
    fn test_hdbscan_insufficient_data() {
        let clusterer = HdbscanClusterer::new(10, 5, DistanceMetric::Euclidean);
        let data = Array2::<f32>::zeros((5, 2));

        let result = clusterer.fit(&data);
        assert!(result.is_err());
    }

    /// Euclidean distance of a 3-4-5 triangle should be 5.
    #[test]
    fn test_distance_euclidean() {
        let clusterer = HdbscanClusterer::new(5, 3, DistanceMetric::Euclidean);
        let a = Array1::from_vec(vec![0.0, 0.0]);
        let b = Array1::from_vec(vec![3.0, 4.0]);

        let dist = clusterer.distance(a.view(), b.view());
        assert!((dist - 5.0).abs() < 0.001);
    }

    /// Cosine distance: identical vectors -> 0, orthogonal vectors -> 1.
    #[test]
    fn test_distance_cosine() {
        let clusterer = HdbscanClusterer::new(5, 3, DistanceMetric::Cosine);
        let a = Array1::from_vec(vec![1.0, 0.0]);
        let b = Array1::from_vec(vec![1.0, 0.0]);

        let dist = clusterer.distance(a.view(), b.view());
        assert!(dist.abs() < 0.001); // Same vector = 0 distance

        let c = Array1::from_vec(vec![0.0, 1.0]);
        let dist2 = clusterer.distance(a.view(), c.view());
        assert!((dist2 - 1.0).abs() < 0.001); // Orthogonal = 1 distance
    }
}
|
||||
384
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/kmeans.rs
vendored
Normal file
384
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/kmeans.rs
vendored
Normal file
@@ -0,0 +1,384 @@
|
||||
//! K-Means clustering implementation.
|
||||
//!
|
||||
//! Standard K-Means algorithm with k-means++ initialization for
|
||||
//! partitioning embeddings into k clusters.
|
||||
|
||||
use ndarray::{Array2, ArrayView1};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use crate::application::services::AnalysisError;
|
||||
|
||||
/// K-Means clustering algorithm.
///
/// Standard Lloyd iteration with k-means++ initialization; construct via
/// [`KMeansClusterer::new`] and optionally tune with the `with_*` builders.
pub struct KMeansClusterer {
    /// Number of clusters.
    k: usize,
    /// Maximum iterations before giving up on convergence (default 300).
    max_iterations: usize,
    /// Convergence tolerance on the relative change in inertia (default 1e-4).
    tolerance: f32,
    /// Random seed for reproducibility; falls back to a fixed seed when None.
    seed: Option<u64>,
}
|
||||
|
||||
impl KMeansClusterer {
|
||||
/// Create a new K-Means clusterer.
|
||||
#[must_use]
|
||||
pub fn new(k: usize, seed: Option<u64>) -> Self {
|
||||
Self {
|
||||
k,
|
||||
max_iterations: 300,
|
||||
tolerance: 1e-4,
|
||||
seed,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set maximum iterations.
|
||||
#[must_use]
|
||||
pub fn with_max_iterations(mut self, max_iterations: usize) -> Self {
|
||||
self.max_iterations = max_iterations;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set convergence tolerance.
|
||||
#[must_use]
|
||||
pub fn with_tolerance(mut self, tolerance: f32) -> Self {
|
||||
self.tolerance = tolerance;
|
||||
self
|
||||
}
|
||||
|
||||
/// Fit K-Means to the data and return cluster labels and centroids.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `data` - 2D array where rows are samples and columns are features
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Tuple of (cluster labels, centroid matrix)
|
||||
#[instrument(skip(self, data), fields(n_samples = data.nrows(), n_features = data.ncols(), k = self.k))]
|
||||
pub fn fit(&self, data: &Array2<f32>) -> Result<(Vec<usize>, Array2<f32>), AnalysisError> {
|
||||
let n = data.nrows();
|
||||
let d = data.ncols();
|
||||
|
||||
if n < self.k {
|
||||
return Err(AnalysisError::InsufficientData(format!(
|
||||
"Need at least {} samples for k={}, got {}",
|
||||
self.k, self.k, n
|
||||
)));
|
||||
}
|
||||
|
||||
debug!(
|
||||
n_samples = n,
|
||||
n_features = d,
|
||||
k = self.k,
|
||||
"Starting K-Means fit"
|
||||
);
|
||||
|
||||
// Initialize centroids using k-means++ algorithm
|
||||
let mut centroids = self.kmeans_plus_plus_init(data);
|
||||
|
||||
let mut labels = vec![0usize; n];
|
||||
let mut prev_inertia = f32::MAX;
|
||||
|
||||
for iteration in 0..self.max_iterations {
|
||||
// Assignment step: assign each point to nearest centroid
|
||||
for i in 0..n {
|
||||
let point = data.row(i);
|
||||
let mut min_dist = f32::MAX;
|
||||
let mut best_cluster = 0;
|
||||
|
||||
for (j, centroid) in centroids.outer_iter().enumerate() {
|
||||
let dist = self.euclidean_distance(point, centroid);
|
||||
if dist < min_dist {
|
||||
min_dist = dist;
|
||||
best_cluster = j;
|
||||
}
|
||||
}
|
||||
|
||||
labels[i] = best_cluster;
|
||||
}
|
||||
|
||||
// Update step: compute new centroids
|
||||
let mut new_centroids = Array2::<f32>::zeros((self.k, d));
|
||||
let mut counts = vec![0usize; self.k];
|
||||
|
||||
for (i, &label) in labels.iter().enumerate() {
|
||||
for j in 0..d {
|
||||
new_centroids[[label, j]] += data[[i, j]];
|
||||
}
|
||||
counts[label] += 1;
|
||||
}
|
||||
|
||||
for j in 0..self.k {
|
||||
if counts[j] > 0 {
|
||||
for l in 0..d {
|
||||
new_centroids[[j, l]] /= counts[j] as f32;
|
||||
}
|
||||
} else {
|
||||
// Handle empty cluster by keeping old centroid
|
||||
for l in 0..d {
|
||||
new_centroids[[j, l]] = centroids[[j, l]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute inertia (sum of squared distances to centroids)
|
||||
let inertia: f32 = labels
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, &label)| {
|
||||
self.euclidean_distance(data.row(i), centroids.row(label)).powi(2)
|
||||
})
|
||||
.sum();
|
||||
|
||||
// Check convergence
|
||||
let inertia_change = (prev_inertia - inertia).abs() / prev_inertia.max(1.0);
|
||||
|
||||
debug!(
|
||||
iteration = iteration,
|
||||
inertia = inertia,
|
||||
change = inertia_change,
|
||||
"K-Means iteration"
|
||||
);
|
||||
|
||||
if inertia_change < self.tolerance {
|
||||
debug!(
|
||||
iterations = iteration + 1,
|
||||
final_inertia = inertia,
|
||||
"K-Means converged"
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
centroids = new_centroids;
|
||||
prev_inertia = inertia;
|
||||
}
|
||||
|
||||
Ok((labels, centroids))
|
||||
}
|
||||
|
||||
/// Initialize centroids using k-means++ algorithm.
|
||||
fn kmeans_plus_plus_init(&self, data: &Array2<f32>) -> Array2<f32> {
|
||||
let n = data.nrows();
|
||||
let d = data.ncols();
|
||||
let mut centroids = Array2::<f32>::zeros((self.k, d));
|
||||
|
||||
// Use seed for deterministic initialization if provided
|
||||
let seed = self.seed.unwrap_or(42);
|
||||
let mut rng_state = seed;
|
||||
|
||||
// Helper function for pseudo-random number generation
|
||||
let mut next_random = || {
|
||||
rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
|
||||
((rng_state >> 33) as f32) / (u32::MAX as f32)
|
||||
};
|
||||
|
||||
// Choose first centroid randomly
|
||||
let first_idx = (next_random() * n as f32) as usize % n;
|
||||
for j in 0..d {
|
||||
centroids[[0, j]] = data[[first_idx, j]];
|
||||
}
|
||||
|
||||
// Choose remaining centroids with probability proportional to D^2
|
||||
for i in 1..self.k {
|
||||
// Compute distances to nearest existing centroid
|
||||
let mut distances = Vec::with_capacity(n);
|
||||
let mut total_dist = 0.0f32;
|
||||
|
||||
for point_idx in 0..n {
|
||||
let point = data.row(point_idx);
|
||||
let mut min_dist = f32::MAX;
|
||||
|
||||
for j in 0..i {
|
||||
let dist = self.euclidean_distance(point, centroids.row(j));
|
||||
min_dist = min_dist.min(dist);
|
||||
}
|
||||
|
||||
let dist_sq = min_dist * min_dist;
|
||||
distances.push(dist_sq);
|
||||
total_dist += dist_sq;
|
||||
}
|
||||
|
||||
// Sample proportionally to D^2
|
||||
let target = next_random() * total_dist;
|
||||
let mut cumsum = 0.0f32;
|
||||
let mut chosen_idx = 0;
|
||||
|
||||
for (idx, &dist) in distances.iter().enumerate() {
|
||||
cumsum += dist;
|
||||
if cumsum >= target {
|
||||
chosen_idx = idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for j in 0..d {
|
||||
centroids[[i, j]] = data[[chosen_idx, j]];
|
||||
}
|
||||
}
|
||||
|
||||
centroids
|
||||
}
|
||||
|
||||
/// Compute Euclidean distance between two vectors.
|
||||
fn euclidean_distance(&self, a: ArrayView1<f32>, b: ArrayView1<f32>) -> f32 {
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Predict cluster labels for new data given fitted centroids.
|
||||
pub fn predict(&self, data: &Array2<f32>, centroids: &Array2<f32>) -> Vec<usize> {
|
||||
let n = data.nrows();
|
||||
let mut labels = vec![0usize; n];
|
||||
|
||||
for i in 0..n {
|
||||
let point = data.row(i);
|
||||
let mut min_dist = f32::MAX;
|
||||
let mut best_cluster = 0;
|
||||
|
||||
for (j, centroid) in centroids.outer_iter().enumerate() {
|
||||
let dist = self.euclidean_distance(point, centroid);
|
||||
if dist < min_dist {
|
||||
min_dist = dist;
|
||||
best_cluster = j;
|
||||
}
|
||||
}
|
||||
|
||||
labels[i] = best_cluster;
|
||||
}
|
||||
|
||||
labels
|
||||
}
|
||||
|
||||
/// Compute inertia (within-cluster sum of squares).
|
||||
pub fn compute_inertia(
|
||||
&self,
|
||||
data: &Array2<f32>,
|
||||
labels: &[usize],
|
||||
centroids: &Array2<f32>,
|
||||
) -> f32 {
|
||||
labels
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, &label)| {
|
||||
self.euclidean_distance(data.row(i), centroids.row(label)).powi(2)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::Array1;

    /// Build a 12x2 dataset of three tight groups of four points each,
    /// centered at (0,0), (5,5), and (10,0).
    fn create_test_data() -> Array2<f32> {
        // Create simple separable clusters
        let mut data = Array2::<f32>::zeros((12, 2));

        // Cluster 0: points near (0, 0)
        data[[0, 0]] = 0.0;
        data[[0, 1]] = 0.0;
        data[[1, 0]] = 0.1;
        data[[1, 1]] = 0.1;
        data[[2, 0]] = -0.1;
        data[[2, 1]] = 0.1;
        data[[3, 0]] = 0.0;
        data[[3, 1]] = -0.1;

        // Cluster 1: points near (5, 5)
        data[[4, 0]] = 5.0;
        data[[4, 1]] = 5.0;
        data[[5, 0]] = 5.1;
        data[[5, 1]] = 5.1;
        data[[6, 0]] = 4.9;
        data[[6, 1]] = 5.0;
        data[[7, 0]] = 5.0;
        data[[7, 1]] = 4.9;

        // Cluster 2: points near (10, 0)
        data[[8, 0]] = 10.0;
        data[[8, 1]] = 0.0;
        data[[9, 0]] = 10.1;
        data[[9, 1]] = 0.1;
        data[[10, 0]] = 9.9;
        data[[10, 1]] = 0.0;
        data[[11, 0]] = 10.0;
        data[[11, 1]] = -0.1;

        data
    }

    /// Fitting k=3 on three clearly separated groups should keep each
    /// group's points together under one label.
    #[test]
    fn test_kmeans_basic() {
        let clusterer = KMeansClusterer::new(3, Some(42));
        let data = create_test_data();

        let (labels, centroids) = clusterer.fit(&data).unwrap();

        assert_eq!(labels.len(), 12);
        assert_eq!(centroids.nrows(), 3);

        // Check that points in same original cluster have same label
        // (with high probability given clear separation)
        assert_eq!(labels[0], labels[1]);
        assert_eq!(labels[0], labels[2]);
        assert_eq!(labels[0], labels[3]);

        assert_eq!(labels[4], labels[5]);
        assert_eq!(labels[4], labels[6]);
        assert_eq!(labels[4], labels[7]);

        assert_eq!(labels[8], labels[9]);
        assert_eq!(labels[8], labels[10]);
        assert_eq!(labels[8], labels[11]);
    }

    /// Fewer samples than k must be rejected with an error.
    #[test]
    fn test_kmeans_insufficient_data() {
        let clusterer = KMeansClusterer::new(10, None);
        let data = Array2::<f32>::zeros((5, 2));

        let result = clusterer.fit(&data);
        assert!(result.is_err());
    }

    /// `predict` should route unseen points to the centroid of the group
    /// they lie closest to.
    #[test]
    fn test_kmeans_predict() {
        let clusterer = KMeansClusterer::new(2, Some(42));

        let train_data = Array2::from_shape_vec(
            (4, 2),
            vec![0.0, 0.0, 0.1, 0.1, 5.0, 5.0, 5.1, 5.1],
        )
        .unwrap();

        let (_, centroids) = clusterer.fit(&train_data).unwrap();

        let test_data = Array2::from_shape_vec(
            (2, 2),
            vec![0.05, 0.05, 4.95, 4.95],
        )
        .unwrap();

        let predictions = clusterer.predict(&test_data, &centroids);
        assert_eq!(predictions.len(), 2);

        // First point should be in same cluster as (0,0) points
        // Second point should be in same cluster as (5,5) points
        assert_ne!(predictions[0], predictions[1]);
    }

    /// Euclidean distance of a 3-4-5 triangle should be 5.
    #[test]
    fn test_euclidean_distance() {
        let clusterer = KMeansClusterer::new(2, None);
        let a = Array1::from_vec(vec![0.0, 0.0]);
        let b = Array1::from_vec(vec![3.0, 4.0]);

        let dist = clusterer.euclidean_distance(a.view(), b.view());
        assert!((dist - 5.0).abs() < 0.001);
    }
}
|
||||
524
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/markov.rs
vendored
Normal file
524
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/markov.rs
vendored
Normal file
@@ -0,0 +1,524 @@
|
||||
//! Markov chain analysis for vocalization sequences.
|
||||
//!
|
||||
//! Provides transition matrix computation, entropy calculation,
|
||||
//! and sequence analysis for understanding vocalization patterns.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use crate::domain::entities::ClusterId;
|
||||
use crate::domain::value_objects::{SequenceMetrics, TransitionMatrix};
|
||||
|
||||
/// Markov chain analyzer for vocalization sequences.
///
/// Builds transition matrices from cluster-ID sequences and derives
/// entropy/stereotypy metrics from them.
pub struct MarkovAnalyzer {
    /// Smoothing factor for probability estimation (Laplace smoothing).
    /// Zero (the `new()` default) disables smoothing.
    smoothing: f32,
}
|
||||
|
||||
impl MarkovAnalyzer {
|
||||
/// Create a new Markov analyzer.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self { smoothing: 0.0 }
|
||||
}
|
||||
|
||||
/// Create with Laplace smoothing.
|
||||
#[must_use]
|
||||
pub fn with_smoothing(smoothing: f32) -> Self {
|
||||
Self { smoothing }
|
||||
}
|
||||
|
||||
/// Build a transition matrix from a sequence of cluster IDs.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `sequence` - Ordered sequence of cluster IDs
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A TransitionMatrix representing transition probabilities.
|
||||
#[instrument(skip(self, sequence), fields(seq_len = sequence.len()))]
|
||||
pub fn build_transition_matrix(&self, sequence: &[ClusterId]) -> TransitionMatrix {
|
||||
// Collect all unique clusters
|
||||
let unique_clusters: Vec<ClusterId> = sequence
|
||||
.iter()
|
||||
.copied()
|
||||
.collect::<HashSet<_>>()
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let mut matrix = TransitionMatrix::new(unique_clusters);
|
||||
|
||||
// Count transitions
|
||||
for window in sequence.windows(2) {
|
||||
matrix.record_transition(&window[0], &window[1]);
|
||||
}
|
||||
|
||||
// Apply smoothing if configured
|
||||
if self.smoothing > 0.0 {
|
||||
self.apply_smoothing(&mut matrix);
|
||||
}
|
||||
|
||||
// Compute probabilities
|
||||
matrix.compute_probabilities();
|
||||
|
||||
debug!(
|
||||
n_states = matrix.size(),
|
||||
n_transitions = matrix.non_zero_transitions().len(),
|
||||
"Built transition matrix"
|
||||
);
|
||||
|
||||
matrix
|
||||
}
|
||||
|
||||
/// Build transition matrix from multiple sequences.
|
||||
#[instrument(skip(self, sequences))]
|
||||
pub fn build_from_sequences(&self, sequences: &[Vec<ClusterId>]) -> TransitionMatrix {
|
||||
// Collect all unique clusters from all sequences
|
||||
let unique_clusters: Vec<ClusterId> = sequences
|
||||
.iter()
|
||||
.flatten()
|
||||
.copied()
|
||||
.collect::<HashSet<_>>()
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let mut matrix = TransitionMatrix::new(unique_clusters);
|
||||
|
||||
// Count transitions from all sequences
|
||||
for sequence in sequences {
|
||||
for window in sequence.windows(2) {
|
||||
matrix.record_transition(&window[0], &window[1]);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply smoothing and compute probabilities
|
||||
if self.smoothing > 0.0 {
|
||||
self.apply_smoothing(&mut matrix);
|
||||
}
|
||||
matrix.compute_probabilities();
|
||||
|
||||
matrix
|
||||
}
|
||||
|
||||
/// Compute Shannon entropy of transition probabilities.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `transitions` - Slice of (source, target, probability) tuples
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Entropy value in nats (natural log base).
|
||||
#[must_use]
|
||||
pub fn compute_entropy(&self, transitions: &[(ClusterId, ClusterId, f32)]) -> f32 {
|
||||
let mut entropy = 0.0f32;
|
||||
|
||||
for &(_, _, prob) in transitions {
|
||||
if prob > 0.0 {
|
||||
entropy -= prob * prob.ln();
|
||||
}
|
||||
}
|
||||
|
||||
entropy
|
||||
}
|
||||
|
||||
/// Compute entropy rate of a Markov chain.
|
||||
///
|
||||
/// The entropy rate is the average entropy per step, weighted
|
||||
/// by the stationary distribution.
|
||||
#[must_use]
|
||||
pub fn compute_entropy_rate(&self, matrix: &TransitionMatrix) -> f32 {
|
||||
let stationary = match matrix.stationary_distribution() {
|
||||
Some(dist) => dist,
|
||||
None => return 0.0,
|
||||
};
|
||||
|
||||
let n = matrix.size();
|
||||
let mut entropy_rate = 0.0f32;
|
||||
|
||||
for (i, &pi) in stationary.iter().enumerate() {
|
||||
if pi <= 0.0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Compute entropy of row i
|
||||
let mut row_entropy = 0.0f32;
|
||||
for j in 0..n {
|
||||
let prob = matrix.probabilities[i][j];
|
||||
if prob > 0.0 {
|
||||
row_entropy -= prob * prob.ln();
|
||||
}
|
||||
}
|
||||
|
||||
entropy_rate += pi * row_entropy;
|
||||
}
|
||||
|
||||
entropy_rate
|
||||
}
|
||||
|
||||
/// Compute sequence metrics from a cluster sequence.
|
||||
#[instrument(skip(self, sequence))]
|
||||
pub fn compute_metrics(&self, sequence: &[ClusterId]) -> SequenceMetrics {
|
||||
if sequence.len() < 2 {
|
||||
return SequenceMetrics::default();
|
||||
}
|
||||
|
||||
let matrix = self.build_transition_matrix(sequence);
|
||||
let transitions = matrix.non_zero_transitions();
|
||||
|
||||
// Count unique elements
|
||||
let unique_clusters: HashSet<_> = sequence.iter().collect();
|
||||
let total_transitions = sequence.len() - 1;
|
||||
|
||||
// Count self-transitions
|
||||
let self_transitions = sequence
|
||||
.windows(2)
|
||||
.filter(|w| w[0] == w[1])
|
||||
.count();
|
||||
|
||||
// Compute entropy
|
||||
let entropy = self.compute_entropy(&transitions);
|
||||
|
||||
// Normalize entropy
|
||||
let max_entropy = (unique_clusters.len() as f32).ln().max(1.0);
|
||||
let normalized_entropy = if max_entropy > 0.0 {
|
||||
entropy / max_entropy
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Find dominant transition
|
||||
let dominant_transition = transitions
|
||||
.iter()
|
||||
.max_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.map(|&(from, to, prob)| (from, to, prob));
|
||||
|
||||
SequenceMetrics {
|
||||
entropy,
|
||||
normalized_entropy,
|
||||
stereotypy: 1.0 - normalized_entropy,
|
||||
unique_clusters: unique_clusters.len(),
|
||||
unique_transitions: transitions.len(),
|
||||
total_transitions,
|
||||
dominant_transition,
|
||||
repetition_rate: self_transitions as f32 / total_transitions as f32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute stereotypy score (measure of sequence repetitiveness).
|
||||
///
|
||||
/// Higher values indicate more stereotyped/predictable sequences.
|
||||
#[must_use]
|
||||
pub fn compute_stereotypy(&self, matrix: &TransitionMatrix) -> f32 {
|
||||
let entropy_rate = self.compute_entropy_rate(matrix);
|
||||
let max_entropy = (matrix.size() as f32).ln();
|
||||
|
||||
if max_entropy > 0.0 {
|
||||
1.0 - (entropy_rate / max_entropy)
|
||||
} else {
|
||||
1.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect periodic patterns in a sequence.
|
||||
///
|
||||
/// Returns a vector of (period_length, confidence) tuples for detected patterns.
|
||||
#[instrument(skip(self, sequence))]
|
||||
pub fn detect_periodicity(&self, sequence: &[ClusterId]) -> Vec<(usize, f32)> {
|
||||
let n = sequence.len();
|
||||
if n < 4 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut periods = Vec::new();
|
||||
let max_period = n / 2;
|
||||
|
||||
for period in 2..=max_period {
|
||||
let matches = self.count_periodic_matches(sequence, period);
|
||||
let max_matches = n / period;
|
||||
let confidence = matches as f32 / max_matches as f32;
|
||||
|
||||
if confidence > 0.5 {
|
||||
periods.push((period, confidence));
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by confidence descending
|
||||
periods.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
periods
|
||||
}
|
||||
|
||||
/// Count matches for a given period length.
|
||||
fn count_periodic_matches(&self, sequence: &[ClusterId], period: usize) -> usize {
|
||||
let n = sequence.len();
|
||||
let mut matches = 0;
|
||||
|
||||
for i in period..n {
|
||||
if sequence[i] == sequence[i - period] {
|
||||
matches += 1;
|
||||
}
|
||||
}
|
||||
|
||||
matches
|
||||
}
|
||||
|
||||
    /// Apply Laplace (additive) smoothing to observation counts.
    ///
    /// Adds `self.smoothing` pseudo-counts to every cell of the observation
    /// matrix so that no transition has a zero count.
    ///
    /// NOTE(review): `self.smoothing` appears to be a float but `observations`
    /// holds integer counts, so the `as u32` cast truncates — a smoothing of
    /// 0.5 would add nothing at all. Confirm fractional smoothing values are
    /// not expected (`with_smoothing(1.0)` in the tests is unaffected).
    fn apply_smoothing(&self, matrix: &mut TransitionMatrix) {
        let n = matrix.size();
        for i in 0..n {
            for j in 0..n {
                matrix.observations[i][j] += self.smoothing as u32;
            }
        }
    }
|
||||
|
||||
/// Compute log-likelihood of a sequence given a transition matrix.
|
||||
#[must_use]
|
||||
pub fn log_likelihood(&self, sequence: &[ClusterId], matrix: &TransitionMatrix) -> f32 {
|
||||
if sequence.len() < 2 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut log_prob = 0.0f32;
|
||||
|
||||
for window in sequence.windows(2) {
|
||||
if let Some(prob) = matrix.probability(&window[0], &window[1]) {
|
||||
if prob > 0.0 {
|
||||
log_prob += prob.ln();
|
||||
} else {
|
||||
// Unseen transition - return negative infinity
|
||||
return f32::NEG_INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log_prob
|
||||
}
|
||||
|
||||
/// Find the most likely next cluster given current state.
|
||||
#[must_use]
|
||||
pub fn predict_next(
|
||||
&self,
|
||||
current: &ClusterId,
|
||||
matrix: &TransitionMatrix,
|
||||
) -> Option<(ClusterId, f32)> {
|
||||
let idx = matrix.index_of(current)?;
|
||||
|
||||
let mut best_cluster = None;
|
||||
let mut best_prob = 0.0f32;
|
||||
|
||||
for (j, &target_id) in matrix.cluster_ids.iter().enumerate() {
|
||||
let prob = matrix.probabilities[idx][j];
|
||||
if prob > best_prob {
|
||||
best_prob = prob;
|
||||
best_cluster = Some(target_id);
|
||||
}
|
||||
}
|
||||
|
||||
best_cluster.map(|c| (c, best_prob))
|
||||
}
|
||||
|
||||
/// Generate a sequence from the Markov chain.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `matrix` - The transition matrix
|
||||
/// * `start` - Starting cluster
|
||||
/// * `length` - Desired sequence length
|
||||
/// * `seed` - Random seed for reproducibility
|
||||
pub fn generate_sequence(
|
||||
&self,
|
||||
matrix: &TransitionMatrix,
|
||||
start: ClusterId,
|
||||
length: usize,
|
||||
seed: u64,
|
||||
) -> Vec<ClusterId> {
|
||||
let mut sequence = Vec::with_capacity(length);
|
||||
sequence.push(start);
|
||||
|
||||
let mut rng_state = seed;
|
||||
let mut next_random = || {
|
||||
rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
|
||||
((rng_state >> 33) as f32) / (u32::MAX as f32)
|
||||
};
|
||||
|
||||
let mut current = start;
|
||||
|
||||
for _ in 1..length {
|
||||
let idx = match matrix.index_of(¤t) {
|
||||
Some(i) => i,
|
||||
None => break,
|
||||
};
|
||||
|
||||
// Sample from transition probabilities
|
||||
let r = next_random();
|
||||
let mut cumsum = 0.0f32;
|
||||
let mut next_cluster = current;
|
||||
|
||||
for (j, &cluster_id) in matrix.cluster_ids.iter().enumerate() {
|
||||
cumsum += matrix.probabilities[idx][j];
|
||||
if r < cumsum {
|
||||
next_cluster = cluster_id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
sequence.push(next_cluster);
|
||||
current = next_cluster;
|
||||
}
|
||||
|
||||
sequence
|
||||
}
|
||||
}
|
||||
|
||||
/// Default analyzer configuration; delegates to [`MarkovAnalyzer::new`].
impl Default for MarkovAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a deterministic, perfectly periodic test sequence over three
    /// clusters with fixed (reproducible) UUIDs.
    fn create_test_sequence() -> Vec<ClusterId> {
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));
        let c3 = ClusterId::from_uuid(uuid::Uuid::from_u128(3));

        // Pattern: c1 -> c2 -> c3 -> c1 -> c2 -> c3 (periodic)
        vec![c1, c2, c3, c1, c2, c3, c1, c2, c3]
    }

    #[test]
    fn test_build_transition_matrix() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();

        let matrix = analyzer.build_transition_matrix(&sequence);

        // Three distinct clusters appear in the sequence.
        assert_eq!(matrix.size(), 3);
        assert!(!matrix.non_zero_transitions().is_empty());
    }

    #[test]
    fn test_entropy_computation() {
        let analyzer = MarkovAnalyzer::new();

        // Uniform distribution should have higher entropy
        let c1 = ClusterId::new();
        let c2 = ClusterId::new();

        let uniform_transitions = vec![
            (c1, c1, 0.25),
            (c1, c2, 0.25),
            (c2, c1, 0.25),
            (c2, c2, 0.25),
        ];

        let entropy = analyzer.compute_entropy(&uniform_transitions);
        assert!(entropy > 0.0);

        // Deterministic distribution should have lower entropy
        let deterministic = vec![
            (c1, c2, 1.0),
            (c2, c1, 1.0),
        ];

        let det_entropy = analyzer.compute_entropy(&deterministic);
        assert!(det_entropy < entropy);
    }

    #[test]
    fn test_compute_metrics() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();

        let metrics = analyzer.compute_metrics(&sequence);

        assert_eq!(metrics.unique_clusters, 3);
        // Deterministic sequence has zero entropy (each state has one successor)
        assert!(metrics.entropy >= 0.0);
        assert!(metrics.stereotypy >= 0.0 && metrics.stereotypy <= 1.0);
        assert!(metrics.total_transitions == sequence.len() - 1);
    }

    #[test]
    fn test_periodicity_detection() {
        let analyzer = MarkovAnalyzer::new();

        // Create highly periodic sequence
        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));

        let periodic_sequence = vec![c1, c2, c1, c2, c1, c2, c1, c2, c1, c2];
        let periods = analyzer.detect_periodicity(&periodic_sequence);

        // Should detect period 2 (may not be first due to confidence calculation)
        assert!(!periods.is_empty());
        // Check that period 2 is in the detected periods
        let has_period_2 = periods.iter().any(|(p, _)| *p == 2);
        assert!(has_period_2, "Period 2 should be detected, found periods: {:?}", periods);
    }

    #[test]
    fn test_predict_next() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);

        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let c2 = ClusterId::from_uuid(uuid::Uuid::from_u128(2));

        // Given the pattern c1 -> c2 -> c3 -> ..., after c1 should come c2
        if let Some((next, prob)) = analyzer.predict_next(&c1, &matrix) {
            assert_eq!(next, c2);
            assert!(prob > 0.0);
        }
    }

    #[test]
    fn test_sequence_generation() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);

        let c1 = ClusterId::from_uuid(uuid::Uuid::from_u128(1));
        let generated = analyzer.generate_sequence(&matrix, c1, 10, 42);

        // Exactly the requested length, anchored at the start cluster.
        assert_eq!(generated.len(), 10);
        assert_eq!(generated[0], c1);
    }

    #[test]
    fn test_smoothing() {
        let analyzer = MarkovAnalyzer::with_smoothing(1.0);

        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let sequence = vec![c1, c2, c1, c2];

        let matrix = analyzer.build_transition_matrix(&sequence);

        // With smoothing, all transitions should have non-zero probability
        for i in 0..matrix.size() {
            for j in 0..matrix.size() {
                assert!(matrix.probabilities[i][j] > 0.0);
            }
        }
    }

    #[test]
    fn test_log_likelihood() {
        let analyzer = MarkovAnalyzer::new();
        let sequence = create_test_sequence();
        let matrix = analyzer.build_transition_matrix(&sequence);

        // Log-likelihood of the training sequence should be reasonably high
        let ll = analyzer.log_likelihood(&sequence, &matrix);
        assert!(ll.is_finite());
        assert!(ll <= 0.0); // Log probabilities are non-positive
    }
}
|
||||
@@ -0,0 +1,681 @@
|
||||
//! In-memory repository implementation for testing and development.
|
||||
//!
|
||||
//! Provides thread-safe in-memory storage for all analysis entities.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use crate::domain::entities::{
|
||||
Anomaly, Cluster, ClusterId, EmbeddingId, Motif, Prototype, RecordingId, SequenceAnalysis,
|
||||
};
|
||||
use crate::domain::repository::{
|
||||
AnomalyRepository, ClusterRepository, MotifRepository, PrototypeRepository,
|
||||
RepositoryError, Result, SequenceRepository,
|
||||
};
|
||||
|
||||
/// In-memory implementation of the analysis repositories.
///
/// Useful for testing and development. Not suitable for production use
/// as data is lost on restart.
///
/// Each entity kind lives behind its own `RwLock`, so operations on
/// different kinds do not contend with each other.
pub struct InMemoryAnalysisRepository {
    // Clusters keyed by their ID.
    clusters: RwLock<HashMap<ClusterId, Cluster>>,
    // Prototypes grouped by owning cluster.
    prototypes: RwLock<HashMap<ClusterId, Vec<Prototype>>>,
    // Motifs keyed by their string ID.
    motifs: RwLock<HashMap<String, Motif>>,
    // One sequence analysis per recording.
    sequences: RwLock<HashMap<RecordingId, SequenceAnalysis>>,
    // Anomalies keyed by the embedding they were detected on.
    anomalies: RwLock<HashMap<EmbeddingId, Anomaly>>,
    /// Mapping from embedding ID to cluster ID
    embedding_assignments: RwLock<HashMap<EmbeddingId, ClusterId>>,
}
|
||||
|
||||
impl InMemoryAnalysisRepository {
|
||||
/// Create a new empty in-memory repository.
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
clusters: RwLock::new(HashMap::new()),
|
||||
prototypes: RwLock::new(HashMap::new()),
|
||||
motifs: RwLock::new(HashMap::new()),
|
||||
sequences: RwLock::new(HashMap::new()),
|
||||
anomalies: RwLock::new(HashMap::new()),
|
||||
embedding_assignments: RwLock::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get statistics about stored data.
|
||||
#[must_use]
|
||||
pub fn stats(&self) -> RepositoryStats {
|
||||
let clusters = self.clusters.read().unwrap();
|
||||
let prototypes = self.prototypes.read().unwrap();
|
||||
let motifs = self.motifs.read().unwrap();
|
||||
let sequences = self.sequences.read().unwrap();
|
||||
let anomalies = self.anomalies.read().unwrap();
|
||||
|
||||
RepositoryStats {
|
||||
cluster_count: clusters.len(),
|
||||
prototype_count: prototypes.values().map(|v| v.len()).sum(),
|
||||
motif_count: motifs.len(),
|
||||
sequence_count: sequences.len(),
|
||||
anomaly_count: anomalies.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Default repository is empty; delegates to [`InMemoryAnalysisRepository::new`].
impl Default for InMemoryAnalysisRepository {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
/// Statistics about repository contents.
///
/// All counts are plain sizes; `Default` yields an all-zero snapshot, and
/// equality derives make the type convenient to assert on in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RepositoryStats {
    /// Number of clusters stored.
    pub cluster_count: usize,
    /// Total number of prototypes.
    pub prototype_count: usize,
    /// Number of motifs stored.
    pub motif_count: usize,
    /// Number of sequence analyses stored.
    pub sequence_count: usize,
    /// Number of anomalies stored.
    pub anomaly_count: usize,
}
|
||||
|
||||
#[async_trait]
|
||||
impl ClusterRepository for InMemoryAnalysisRepository {
|
||||
async fn save_cluster(&self, cluster: &Cluster) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
clusters.insert(cluster.id, cluster.clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn save_clusters(&self, clusters_to_save: &[Cluster]) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
for cluster in clusters_to_save {
|
||||
clusters.insert(cluster.id, cluster.clone());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_cluster(&self, id: &ClusterId) -> Result<Option<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.get(id).cloned())
|
||||
}
|
||||
|
||||
async fn list_clusters(&self) -> Result<Vec<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.values().cloned().collect())
|
||||
}
|
||||
|
||||
async fn list_clusters_paginated(
|
||||
&self,
|
||||
offset: usize,
|
||||
limit: usize,
|
||||
) -> Result<Vec<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.values().skip(offset).take(limit).cloned().collect())
|
||||
}
|
||||
|
||||
async fn assign_to_cluster(
|
||||
&self,
|
||||
embedding_id: &EmbeddingId,
|
||||
cluster_id: &ClusterId,
|
||||
) -> Result<()> {
|
||||
let mut assignments = self.embedding_assignments.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.insert(*embedding_id, *cluster_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_from_cluster(&self, embedding_id: &EmbeddingId) -> Result<()> {
|
||||
let mut assignments = self.embedding_assignments.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.remove(embedding_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_cluster_by_embedding(
|
||||
&self,
|
||||
embedding_id: &EmbeddingId,
|
||||
) -> Result<Option<Cluster>> {
|
||||
// Extract the cluster_id and drop the guard before await
|
||||
let cluster_id = {
|
||||
let assignments = self.embedding_assignments.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.get(embedding_id).cloned()
|
||||
};
|
||||
|
||||
if let Some(cluster_id) = cluster_id {
|
||||
self.find_cluster(&cluster_id).await
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_cluster(&self, id: &ClusterId) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
clusters.remove(id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_clusters(&self) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
clusters.clear();
|
||||
|
||||
let mut assignments = self.embedding_assignments.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
assignments.clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn cluster_count(&self) -> Result<usize> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
Ok(clusters.len())
|
||||
}
|
||||
|
||||
async fn find_clusters_by_label(&self, label_pattern: &str) -> Result<Vec<Cluster>> {
|
||||
let clusters = self.clusters.read().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
Ok(clusters
|
||||
.values()
|
||||
.filter(|c| {
|
||||
c.label
|
||||
.as_ref()
|
||||
.map_or(false, |l| l.contains(label_pattern))
|
||||
})
|
||||
.cloned()
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn update_cluster_label(
|
||||
&self,
|
||||
id: &ClusterId,
|
||||
label: Option<String>,
|
||||
) -> Result<()> {
|
||||
let mut clusters = self.clusters.write().map_err(|e| {
|
||||
RepositoryError::Internal(format!("Lock error: {}", e))
|
||||
})?;
|
||||
|
||||
if let Some(cluster) = clusters.get_mut(id) {
|
||||
cluster.label = label;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(RepositoryError::NotFound(format!("Cluster {}", id)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl PrototypeRepository for InMemoryAnalysisRepository {
    // NOTE(review): `save_prototype(s)` appends unconditionally, so re-saving
    // the same prototype duplicates it within the cluster's vector — confirm
    // callers only save each prototype once.

    async fn save_prototype(&self, prototype: &Prototype) -> Result<()> {
        let mut prototypes = self.prototypes.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        prototypes
            .entry(prototype.cluster_id)
            .or_default()
            .push(prototype.clone());

        Ok(())
    }

    async fn save_prototypes(&self, prototypes_to_save: &[Prototype]) -> Result<()> {
        let mut prototypes = self.prototypes.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        for prototype in prototypes_to_save {
            prototypes
                .entry(prototype.cluster_id)
                .or_default()
                .push(prototype.clone());
        }

        Ok(())
    }

    async fn find_prototypes_by_cluster(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Vec<Prototype>> {
        let prototypes = self.prototypes.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        // A cluster with no prototypes yields an empty vec, not an error.
        Ok(prototypes.get(cluster_id).cloned().unwrap_or_default())
    }

    async fn find_best_prototype(
        &self,
        cluster_id: &ClusterId,
    ) -> Result<Option<Prototype>> {
        let prototypes = self.prototypes.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        // "Best" = highest exemplar score; NaN scores compare as equal.
        Ok(prototypes.get(cluster_id).and_then(|protos| {
            protos
                .iter()
                .max_by(|a, b| {
                    a.exemplar_score
                        .partial_cmp(&b.exemplar_score)
                        .unwrap_or(std::cmp::Ordering::Equal)
                })
                .cloned()
        }))
    }

    async fn delete_prototypes_by_cluster(&self, cluster_id: &ClusterId) -> Result<()> {
        let mut prototypes = self.prototypes.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        prototypes.remove(cluster_id);
        Ok(())
    }

    async fn delete_all_prototypes(&self) -> Result<()> {
        let mut prototypes = self.prototypes.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        prototypes.clear();
        Ok(())
    }
}
|
||||
|
||||
#[async_trait]
impl MotifRepository for InMemoryAnalysisRepository {
    // All query methods below do a linear scan over the motif map —
    // acceptable for this in-memory dev/test implementation.

    async fn save_motif(&self, motif: &Motif) -> Result<()> {
        let mut motifs = self.motifs.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        motifs.insert(motif.id.clone(), motif.clone());
        Ok(())
    }

    async fn save_motifs(&self, motifs_to_save: &[Motif]) -> Result<()> {
        let mut motifs = self.motifs.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        for motif in motifs_to_save {
            motifs.insert(motif.id.clone(), motif.clone());
        }
        Ok(())
    }

    async fn find_motif(&self, id: &str) -> Result<Option<Motif>> {
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(motifs.get(id).cloned())
    }

    async fn find_motifs_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Motif>> {
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(motifs
            .values()
            .filter(|m| m.contains_cluster(cluster_id))
            .cloned()
            .collect())
    }

    async fn list_motifs(&self) -> Result<Vec<Motif>> {
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(motifs.values().cloned().collect())
    }

    async fn find_motifs_by_confidence(&self, min_confidence: f32) -> Result<Vec<Motif>> {
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(motifs
            .values()
            .filter(|m| m.confidence >= min_confidence)
            .cloned()
            .collect())
    }

    async fn find_motifs_by_occurrences(&self, min_occurrences: usize) -> Result<Vec<Motif>> {
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(motifs
            .values()
            .filter(|m| m.occurrences >= min_occurrences)
            .cloned()
            .collect())
    }

    async fn delete_motif(&self, id: &str) -> Result<()> {
        let mut motifs = self.motifs.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        motifs.remove(id);
        Ok(())
    }

    async fn delete_all_motifs(&self) -> Result<()> {
        let mut motifs = self.motifs.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        motifs.clear();
        Ok(())
    }

    async fn motif_count(&self) -> Result<usize> {
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(motifs.len())
    }

    async fn find_motifs_by_sequence(&self, sequence: &[ClusterId]) -> Result<Vec<Motif>> {
        // Exact whole-sequence equality (not a subsequence match).
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(motifs
            .values()
            .filter(|m| m.sequence == sequence)
            .cloned()
            .collect())
    }

    async fn find_motifs_containing_subsequence(
        &self,
        subsequence: &[ClusterId],
    ) -> Result<Vec<Motif>> {
        // Contiguous-subsequence match via sliding windows of the same length.
        let motifs = self.motifs.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(motifs
            .values()
            .filter(|m| {
                m.sequence
                    .windows(subsequence.len())
                    .any(|w| w == subsequence)
            })
            .cloned()
            .collect())
    }
}
|
||||
|
||||
#[async_trait]
impl SequenceRepository for InMemoryAnalysisRepository {
    // One analysis per recording: saving again replaces the previous one.

    async fn save_sequence_analysis(&self, analysis: &SequenceAnalysis) -> Result<()> {
        let mut sequences = self.sequences.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        sequences.insert(analysis.recording_id, analysis.clone());
        Ok(())
    }

    async fn find_sequence_by_recording(
        &self,
        recording_id: &RecordingId,
    ) -> Result<Option<SequenceAnalysis>> {
        let sequences = self.sequences.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(sequences.get(recording_id).cloned())
    }

    async fn list_sequence_analyses(&self) -> Result<Vec<SequenceAnalysis>> {
        let sequences = self.sequences.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(sequences.values().cloned().collect())
    }

    async fn delete_sequence_by_recording(&self, recording_id: &RecordingId) -> Result<()> {
        let mut sequences = self.sequences.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        sequences.remove(recording_id);
        Ok(())
    }

    async fn delete_all_sequences(&self) -> Result<()> {
        let mut sequences = self.sequences.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        sequences.clear();
        Ok(())
    }

    async fn find_sequences_by_entropy(&self, min_entropy: f32) -> Result<Vec<SequenceAnalysis>> {
        let sequences = self.sequences.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(sequences
            .values()
            .filter(|s| s.entropy >= min_entropy)
            .cloned()
            .collect())
    }

    async fn find_sequences_by_stereotypy(
        &self,
        min_stereotypy: f32,
    ) -> Result<Vec<SequenceAnalysis>> {
        let sequences = self.sequences.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(sequences
            .values()
            .filter(|s| s.stereotypy_score >= min_stereotypy)
            .cloned()
            .collect())
    }
}
|
||||
|
||||
#[async_trait]
impl AnomalyRepository for InMemoryAnalysisRepository {
    // One anomaly per embedding: re-saving for the same embedding replaces it.

    async fn save_anomaly(&self, anomaly: &Anomaly) -> Result<()> {
        let mut anomalies = self.anomalies.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        anomalies.insert(anomaly.embedding_id, anomaly.clone());
        Ok(())
    }

    async fn save_anomalies(&self, anomalies_to_save: &[Anomaly]) -> Result<()> {
        let mut anomalies = self.anomalies.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        for anomaly in anomalies_to_save {
            anomalies.insert(anomaly.embedding_id, anomaly.clone());
        }
        Ok(())
    }

    async fn find_anomaly(&self, embedding_id: &EmbeddingId) -> Result<Option<Anomaly>> {
        let anomalies = self.anomalies.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(anomalies.get(embedding_id).cloned())
    }

    async fn list_anomalies(&self) -> Result<Vec<Anomaly>> {
        let anomalies = self.anomalies.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(anomalies.values().cloned().collect())
    }

    async fn find_anomalies_by_score(&self, min_score: f32) -> Result<Vec<Anomaly>> {
        // Inclusive threshold: scores equal to min_score are returned.
        let anomalies = self.anomalies.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(anomalies
            .values()
            .filter(|a| a.anomaly_score >= min_score)
            .cloned()
            .collect())
    }

    async fn find_anomalies_by_cluster(&self, cluster_id: &ClusterId) -> Result<Vec<Anomaly>> {
        let anomalies = self.anomalies.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;

        Ok(anomalies
            .values()
            .filter(|a| a.nearest_cluster == *cluster_id)
            .cloned()
            .collect())
    }

    async fn delete_anomaly(&self, embedding_id: &EmbeddingId) -> Result<()> {
        let mut anomalies = self.anomalies.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        anomalies.remove(embedding_id);
        Ok(())
    }

    async fn delete_all_anomalies(&self) -> Result<()> {
        let mut anomalies = self.anomalies.write().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        anomalies.clear();
        Ok(())
    }

    async fn anomaly_count(&self) -> Result<usize> {
        let anomalies = self.anomalies.read().map_err(|e| {
            RepositoryError::Internal(format!("Lock error: {}", e))
        })?;
        Ok(anomalies.len())
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    // Full create/read/list/delete round trip for clusters.
    #[tokio::test]
    async fn test_cluster_crud() {
        let repo = InMemoryAnalysisRepository::new();

        let cluster = Cluster::new(
            EmbeddingId::new(),
            vec![EmbeddingId::new()],
            vec![0.0; 10],
            0.1,
        );

        // Save
        repo.save_cluster(&cluster).await.unwrap();

        // Find
        let found = repo.find_cluster(&cluster.id).await.unwrap();
        assert!(found.is_some());

        // List
        let all = repo.list_clusters().await.unwrap();
        assert_eq!(all.len(), 1);

        // Delete
        repo.delete_cluster(&cluster.id).await.unwrap();
        let found = repo.find_cluster(&cluster.id).await.unwrap();
        assert!(found.is_none());
    }

    // Motif save/find plus the count accessor.
    #[tokio::test]
    async fn test_motif_crud() {
        let repo = InMemoryAnalysisRepository::new();

        let motif = Motif::new(
            vec![ClusterId::new(), ClusterId::new()],
            5,
            1500.0,
            0.8,
        );

        repo.save_motif(&motif).await.unwrap();

        let found = repo.find_motif(&motif.id).await.unwrap();
        assert!(found.is_some());

        let count = repo.motif_count().await.unwrap();
        assert_eq!(count, 1);
    }

    // Sequence analyses are looked up by their recording ID.
    #[tokio::test]
    async fn test_sequence_crud() {
        let repo = InMemoryAnalysisRepository::new();

        let recording_id = RecordingId::new();
        let analysis = SequenceAnalysis::new(
            recording_id,
            vec![],
            1.5,
            0.5,
        );

        repo.save_sequence_analysis(&analysis).await.unwrap();

        let found = repo.find_sequence_by_recording(&recording_id).await.unwrap();
        assert!(found.is_some());
    }

    // Score filtering is inclusive, so only the 0.9-score anomaly passes 0.5.
    #[tokio::test]
    async fn test_anomaly_filtering() {
        let repo = InMemoryAnalysisRepository::new();

        let anomaly1 = Anomaly::new(
            EmbeddingId::new(),
            0.9,
            ClusterId::new(),
            2.0,
        );

        let anomaly2 = Anomaly::new(
            EmbeddingId::new(),
            0.3,
            ClusterId::new(),
            0.5,
        );

        repo.save_anomalies(&[anomaly1, anomaly2]).await.unwrap();

        let high_score = repo.find_anomalies_by_score(0.5).await.unwrap();
        assert_eq!(high_score.len(), 1);
    }
}
|
||||
15
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/mod.rs
vendored
Normal file
15
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/infrastructure/mod.rs
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
//! Infrastructure layer for the Analysis bounded context.
|
||||
//!
|
||||
//! Contains concrete implementations of clustering algorithms,
|
||||
//! Markov chain analysis, and other infrastructure components.
|
||||
|
||||
pub mod hdbscan;
|
||||
pub mod kmeans;
|
||||
pub mod markov;
|
||||
pub mod memory_repository;
|
||||
|
||||
// Re-export main types
|
||||
pub use hdbscan::HdbscanClusterer;
|
||||
pub use kmeans::KMeansClusterer;
|
||||
pub use markov::MarkovAnalyzer;
|
||||
pub use memory_repository::InMemoryAnalysisRepository;
|
||||
78
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/lib.rs
vendored
Normal file
78
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/lib.rs
vendored
Normal file
@@ -0,0 +1,78 @@
|
||||
//! # sevensense-analysis
|
||||
//!
|
||||
//! Analysis bounded context for 7sense bioacoustic analysis platform.
|
||||
//!
|
||||
//! This crate provides clustering, motif detection, sequence analysis, and anomaly
|
||||
//! detection capabilities for bioacoustic embeddings.
|
||||
//!
|
||||
//! ## Features
|
||||
//!
|
||||
//! - **Clustering**: HDBSCAN and K-means clustering for grouping similar vocalizations
|
||||
//! - **Prototype Extraction**: Identify representative embeddings (exemplars) for each cluster
|
||||
//! - **Motif Detection**: Discover recurring patterns in vocalization sequences
|
||||
//! - **Sequence Analysis**: Markov chain analysis, transition matrices, entropy computation
|
||||
//! - **Anomaly Detection**: Identify unusual or novel vocalizations
|
||||
//!
|
||||
//! ## Architecture
|
||||
//!
|
||||
//! This crate follows Domain-Driven Design (DDD) with hexagonal architecture:
|
||||
//!
|
||||
//! - `domain/` - Core domain entities, value objects, and repository traits
|
||||
//! - `application/` - Application services orchestrating domain operations
|
||||
//! - `infrastructure/` - Concrete implementations (HDBSCAN, Markov chains, etc.)
|
||||
//!
|
||||
//! ## Example
|
||||
//!
|
||||
//! ```rust,ignore
|
||||
//! use sevensense_analysis::{
|
||||
//! application::ClusteringService,
|
||||
//! domain::{ClusteringConfig, ClusteringMethod},
|
||||
//! };
|
||||
//!
|
||||
//! let service = ClusteringService::new(ClusteringConfig::default());
|
||||
//! let embeddings = vec![/* ... */];
|
||||
//! let clusters = service.run_hdbscan(&embeddings).await?;
|
||||
//! ```
|
||||
|
||||
#![warn(missing_docs)]
|
||||
#![warn(clippy::all)]
|
||||
#![allow(clippy::module_name_repetitions)]
|
||||
|
||||
pub mod domain;
|
||||
pub mod application;
|
||||
pub mod infrastructure;
|
||||
pub mod metrics;
|
||||
|
||||
// Re-export primary types for convenience
|
||||
pub use domain::entities::{
|
||||
Anomaly, AnomalyType, Cluster, ClusterId, EmbeddingId, Motif, MotifOccurrence, Prototype,
|
||||
RecordingId, SegmentId, SequenceAnalysis,
|
||||
};
|
||||
pub use domain::repository::{ClusterRepository, MotifRepository, SequenceRepository};
|
||||
pub use domain::events::{
|
||||
AnalysisEvent, ClusterAssigned, ClustersDiscovered, MotifDetected, SequenceAnalyzed,
|
||||
};
|
||||
pub use domain::value_objects::{
|
||||
ClusteringConfig, ClusteringMethod, ClusteringParameters, MotifConfig, SequenceMetrics,
|
||||
TransitionMatrix,
|
||||
};
|
||||
|
||||
pub use application::services::{
|
||||
AnomalyDetectionService, ClusteringService, MotifDetectionService, SequenceAnalysisService,
|
||||
};
|
||||
|
||||
pub use metrics::{
|
||||
ClusteringMetrics, SequenceEntropy, SilhouetteScore, VMeasure,
|
||||
};
|
||||
|
||||
/// Crate version information
|
||||
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
/// Prelude module for convenient imports
|
||||
pub mod prelude {
|
||||
pub use crate::domain::entities::*;
|
||||
pub use crate::domain::repository::*;
|
||||
pub use crate::domain::value_objects::*;
|
||||
pub use crate::application::services::*;
|
||||
pub use crate::metrics::*;
|
||||
}
|
||||
1131
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/metrics.rs
vendored
Normal file
1131
vendor/ruvector/examples/vibecast-7sense/crates/sevensense-analysis/src/metrics.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user