Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
//! Application layer for the Interpretation bounded context.
//!
//! Contains services for orchestrating interpretation workflows.

/// Interpretation application services (notably `InterpretationService`).
pub mod services;

View File

@@ -0,0 +1,769 @@
//! Application services for RAB interpretation.
//!
//! The `InterpretationService` orchestrates the building of evidence packs
//! and generation of interpretations with cited claims.
use std::sync::Arc;
use tracing::{debug, info, instrument, warn};
use crate::domain::entities::{
Claim, ClusterContext, EmbeddingId, EvidencePack, EvidenceRef, EvidenceRefType,
Interpretation, NeighborEvidence, RecordingMetadata, SequenceContext, SegmentId,
};
use crate::domain::repository::{ClusterRepository, EvidencePackRepository};
use crate::infrastructure::claim_generator::ClaimGenerator;
use crate::infrastructure::evidence_builder::EvidenceBuilder;
use crate::templates::InterpretationTemplates;
use crate::{Error, Result};
/// Configuration for the interpretation service.
///
/// Confidence fields are fractions; defaults treat them as 0.0–1.0 values.
#[derive(Debug, Clone)]
pub struct InterpretationConfig {
    /// Maximum number of neighbors to include in evidence packs
    pub max_neighbors: usize,
    /// Whether to include spectrogram URLs in evidence
    pub include_spectrograms: bool,
    /// Minimum confidence threshold for claims
    pub min_claim_confidence: f32,
    /// Maximum number of claims per interpretation
    pub max_claims: usize,
    /// Whether to include sequence context
    pub include_sequence_context: bool,
    /// Number of preceding/following segments to include
    pub sequence_context_window: usize,
    /// Minimum overall confidence to accept an interpretation
    // NOTE(review): this field is not consulted anywhere in the visible
    // `InterpretationService` impl — confirm whether enforcement was intended.
    pub min_interpretation_confidence: f32,
}
impl Default for InterpretationConfig {
fn default() -> Self {
Self {
max_neighbors: 10,
include_spectrograms: true,
min_claim_confidence: 0.5,
max_claims: 10,
include_sequence_context: true,
sequence_context_window: 3,
min_interpretation_confidence: 0.3,
}
}
}
impl InterpretationConfig {
/// Create a new configuration builder
pub fn builder() -> InterpretationConfigBuilder {
InterpretationConfigBuilder::default()
}
}
/// Builder for InterpretationConfig
#[derive(Debug, Default)]
pub struct InterpretationConfigBuilder {
    // Accumulates overrides on top of `InterpretationConfig::default()`.
    config: InterpretationConfig,
}
impl InterpretationConfigBuilder {
    /// Set the maximum number of neighbors to include in evidence packs.
    pub fn max_neighbors(mut self, n: usize) -> Self {
        self.config.max_neighbors = n;
        self
    }
    /// Enable or disable spectrogram URLs in evidence.
    pub fn include_spectrograms(mut self, include: bool) -> Self {
        self.config.include_spectrograms = include;
        self
    }
    /// Set the minimum confidence threshold for claims.
    pub fn min_claim_confidence(mut self, confidence: f32) -> Self {
        self.config.min_claim_confidence = confidence;
        self
    }
    /// Set the maximum number of claims per interpretation.
    pub fn max_claims(mut self, n: usize) -> Self {
        self.config.max_claims = n;
        self
    }
    /// Enable or disable temporal sequence context.
    pub fn include_sequence_context(mut self, include: bool) -> Self {
        self.config.include_sequence_context = include;
        self
    }
    /// Set the number of preceding/following segments to include.
    pub fn sequence_context_window(mut self, window: usize) -> Self {
        self.config.sequence_context_window = window;
        self
    }
    /// Set the minimum overall confidence for an interpretation.
    pub fn min_interpretation_confidence(mut self, confidence: f32) -> Self {
        self.config.min_interpretation_confidence = confidence;
        self
    }
    /// Consume the builder and return the finished configuration.
    pub fn build(self) -> InterpretationConfig {
        self.config
    }
}
/// Neighbor data from vector search (simplified interface).
///
/// This represents the data returned from the vector space service.
#[derive(Debug, Clone)]
pub struct Neighbor {
    /// The embedding ID of the neighbor
    pub embedding_id: EmbeddingId,
    /// Distance from the query (lower = more similar)
    pub distance: f32,
    /// Optional metadata about the neighbor
    // NOTE(review): schema of this JSON is not defined here — set by the
    // vector space service implementation.
    pub metadata: Option<serde_json::Value>,
}
impl Neighbor {
/// Create a new neighbor
pub fn new(embedding_id: EmbeddingId, distance: f32) -> Self {
Self {
embedding_id,
distance,
metadata: None,
}
}
/// Add metadata
pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
self.metadata = Some(metadata);
self
}
}
/// Trait for vector space operations needed by the interpretation service.
///
/// This abstracts the vector search operations from sevensense-vector.
#[async_trait::async_trait]
pub trait VectorSpaceService: Send + Sync {
    /// Find k nearest neighbors for an embedding
    async fn find_neighbors(&self, embedding_id: &EmbeddingId, k: usize) -> Result<Vec<Neighbor>>;
    /// Get the embedding vector for an ID
    ///
    /// Returns `Ok(None)` when the ID is unknown.
    async fn get_embedding(&self, embedding_id: &EmbeddingId) -> Result<Option<Vec<f32>>>;
    /// Calculate similarity between two embeddings
    async fn calculate_similarity(
        &self,
        embedding_id_a: &EmbeddingId,
        embedding_id_b: &EmbeddingId,
    ) -> Result<f32>;
}
/// Trait for sequence operations needed by the interpretation service.
///
/// This abstracts sequence analysis operations from sevensense-analysis.
#[async_trait::async_trait]
pub trait SequenceService: Send + Sync {
    /// Get segments preceding the given segment in time
    async fn get_preceding_segments(
        &self,
        segment_id: &SegmentId,
        count: usize,
    ) -> Result<Vec<SegmentId>>;
    /// Get segments following the given segment in time
    async fn get_following_segments(
        &self,
        segment_id: &SegmentId,
        count: usize,
    ) -> Result<Vec<SegmentId>>;
    /// Detect motif patterns in a sequence
    ///
    /// Returns `Ok(None)` when no motif is found.
    async fn detect_motif(&self, segment_ids: &[SegmentId]) -> Result<Option<String>>;
}
/// Trait for metadata lookup operations.
#[async_trait::async_trait]
pub trait MetadataService: Send + Sync {
    /// Get recording metadata for an embedding
    async fn get_recording_metadata(
        &self,
        embedding_id: &EmbeddingId,
    ) -> Result<RecordingMetadata>;
    /// Get spectrogram URL for an embedding
    ///
    /// Returns `Ok(None)` when no spectrogram is available.
    async fn get_spectrogram_url(&self, embedding_id: &EmbeddingId) -> Result<Option<String>>;
    /// Get segment ID for an embedding (if it represents a segment)
    async fn get_segment_id(&self, embedding_id: &EmbeddingId) -> Result<Option<SegmentId>>;
}
/// Service for building evidence packs and generating interpretations.
///
/// This is the main entry point for the interpretation bounded context.
pub struct InterpretationService {
    // Required collaborators, injected at construction.
    vector_service: Arc<dyn VectorSpaceService>,
    cluster_repo: Arc<dyn ClusterRepository>,
    metadata_service: Arc<dyn MetadataService>,
    // Optional: temporal context (set via `with_sequence_service`).
    sequence_service: Option<Arc<dyn SequenceService>>,
    // Optional: persistence of built packs (set via `with_repository`).
    evidence_pack_repo: Option<Arc<dyn EvidencePackRepository>>,
    // NOTE(review): `evidence_builder` is never referenced by any method in
    // the visible impl — confirm whether it is used elsewhere or dead state.
    evidence_builder: EvidenceBuilder,
    claim_generator: ClaimGenerator,
    config: InterpretationConfig,
}
impl InterpretationService {
    /// Create a new interpretation service.
    ///
    /// The `EvidenceBuilder` and `ClaimGenerator` are derived from the same
    /// `config`; optional collaborators are attached via
    /// `with_sequence_service` and `with_repository`.
    pub fn new(
        vector_service: Arc<dyn VectorSpaceService>,
        cluster_repo: Arc<dyn ClusterRepository>,
        metadata_service: Arc<dyn MetadataService>,
        config: InterpretationConfig,
    ) -> Self {
        let evidence_builder = EvidenceBuilder::new(&config);
        let claim_generator = ClaimGenerator::new(&config);
        Self {
            vector_service,
            cluster_repo,
            metadata_service,
            sequence_service: None,
            evidence_pack_repo: None,
            evidence_builder,
            claim_generator,
            config,
        }
    }
    /// Add sequence service for temporal context
    pub fn with_sequence_service(mut self, service: Arc<dyn SequenceService>) -> Self {
        self.sequence_service = Some(service);
        self
    }
    /// Add evidence pack repository for persistence
    pub fn with_repository(mut self, repo: Arc<dyn EvidencePackRepository>) -> Self {
        self.evidence_pack_repo = Some(repo);
        self
    }
    /// Build an evidence pack for a query embedding.
    ///
    /// This gathers all relevant evidence (neighbors, cluster context, sequence context)
    /// and generates an interpretation with cited claims.
    ///
    /// # Errors
    /// Fails if neighbor search, cluster lookup, claim generation, or (when a
    /// repository is configured) persistence fails. Metadata and sequence
    /// lookups inside the helpers are best-effort and do not fail the build.
    #[instrument(skip(self), fields(query_id = %query_id))]
    pub async fn build_evidence_pack(&self, query_id: &EmbeddingId) -> Result<EvidencePack> {
        info!("Building evidence pack for query: {}", query_id);
        // Step 1: Find neighbors
        let neighbors = self.vector_service
            .find_neighbors(query_id, self.config.max_neighbors)
            .await
            .map_err(|e| Error::VectorServiceError(e.to_string()))?;
        debug!("Found {} neighbors", neighbors.len());
        // Step 2: Collect neighbor evidence
        let neighbor_evidence = self
            .collect_neighbor_evidence(&neighbors)
            .await?;
        // Step 3: Build cluster context
        let cluster_context = self.build_cluster_context(query_id).await?;
        debug!(
            "Cluster context: assigned={}, confidence={}",
            cluster_context.has_cluster(),
            cluster_context.confidence
        );
        // Step 4: Build sequence context (if enabled and available)
        let sequence_context = if self.config.include_sequence_context {
            self.build_sequence_context(query_id).await?
        } else {
            None
        };
        // Step 5: Generate interpretation
        let interpretation = self
            .generate_interpretation_internal(
                query_id,
                &neighbor_evidence,
                &cluster_context,
                &sequence_context,
            )
            .await?;
        // Step 6: Create evidence pack
        let evidence_pack = EvidencePack::new(
            query_id.clone(),
            neighbor_evidence,
            cluster_context,
            sequence_context,
            interpretation,
        );
        info!(
            "Built evidence pack {} with {} neighbors, confidence={}",
            evidence_pack.id,
            evidence_pack.neighbors.len(),
            evidence_pack.overall_confidence()
        );
        // Step 7: Persist if repository is available
        if let Some(repo) = &self.evidence_pack_repo {
            repo.save(&evidence_pack).await?;
            debug!("Persisted evidence pack {}", evidence_pack.id);
        }
        Ok(evidence_pack)
    }
    /// Generate an interpretation for an existing evidence pack.
    ///
    /// Useful for regenerating interpretations with different parameters.
    #[instrument(skip(self, evidence))]
    pub async fn generate_interpretation(
        &self,
        evidence: &EvidencePack,
    ) -> Result<Interpretation> {
        self.generate_interpretation_internal(
            &evidence.query_embedding_id,
            &evidence.neighbors,
            &evidence.cluster_context,
            &evidence.sequence_context,
        )
        .await
    }
    /// Validate claims against evidence.
    ///
    /// Returns each claim paired with a boolean indicating if it's well-supported.
    #[instrument(skip(self, claims))]
    pub async fn validate_claims(&self, claims: &[Claim]) -> Result<Vec<(Claim, bool)>> {
        let mut results = Vec::with_capacity(claims.len());
        for claim in claims {
            let is_valid = self.validate_single_claim(claim).await?;
            results.push((claim.clone(), is_valid));
        }
        let valid_count = results.iter().filter(|(_, v)| *v).count();
        info!(
            "Validated {} claims: {} valid, {} invalid",
            claims.len(),
            valid_count,
            claims.len() - valid_count
        );
        Ok(results)
    }
    /// Collect neighbor evidence with metadata.
    ///
    /// Metadata and cluster lookups are best-effort: a failed metadata lookup
    /// falls back to placeholder metadata ("unknown"), and a failed cluster
    /// lookup falls back to an empty cluster context, rather than erroring.
    async fn collect_neighbor_evidence(
        &self,
        neighbors: &[Neighbor],
    ) -> Result<Vec<NeighborEvidence>> {
        let mut evidence = Vec::with_capacity(neighbors.len());
        for neighbor in neighbors {
            let metadata = self
                .metadata_service
                .get_recording_metadata(&neighbor.embedding_id)
                .await
                .unwrap_or_else(|_| RecordingMetadata::new("unknown"));
            let mut neighbor_ev = NeighborEvidence::new(
                neighbor.embedding_id.clone(),
                neighbor.distance,
                metadata,
            );
            // Add cluster info
            let cluster_ctx = self
                .cluster_repo
                .get_cluster_context(&neighbor.embedding_id)
                .await
                .unwrap_or_else(|_| ClusterContext::empty());
            if let Some(cluster_id) = cluster_ctx.assigned_cluster {
                neighbor_ev = neighbor_ev.with_cluster(cluster_id);
            }
            // Add spectrogram URL if enabled
            if self.config.include_spectrograms {
                if let Ok(Some(url)) = self
                    .metadata_service
                    .get_spectrogram_url(&neighbor.embedding_id)
                    .await
                {
                    neighbor_ev = neighbor_ev.with_spectrogram(url);
                }
            }
            evidence.push(neighbor_ev);
        }
        Ok(evidence)
    }
    /// Build cluster context for an embedding.
    async fn build_cluster_context(&self, embedding_id: &EmbeddingId) -> Result<ClusterContext> {
        self.cluster_repo
            .get_cluster_context(embedding_id)
            .await
            .map_err(|e| Error::ClusterServiceError(e.to_string()))
    }
    /// Build sequence context if sequence service is available.
    ///
    /// Returns `Ok(None)` when no sequence service is configured, the
    /// embedding has no segment, or no temporal neighbors exist at all.
    async fn build_sequence_context(
        &self,
        embedding_id: &EmbeddingId,
    ) -> Result<Option<SequenceContext>> {
        let sequence_service = match &self.sequence_service {
            Some(s) => s,
            None => return Ok(None),
        };
        // Get segment ID for this embedding
        let segment_id = match self.metadata_service.get_segment_id(embedding_id).await? {
            Some(id) => id,
            None => return Ok(None),
        };
        let window = self.config.sequence_context_window;
        // Get preceding segments (best-effort: failures yield an empty list)
        let preceding = sequence_service
            .get_preceding_segments(&segment_id, window)
            .await
            .unwrap_or_default();
        // Get following segments (same best-effort policy)
        let following = sequence_service
            .get_following_segments(&segment_id, window)
            .await
            .unwrap_or_default();
        if preceding.is_empty() && following.is_empty() {
            return Ok(None);
        }
        // Try to detect motif over preceding + query + following; errors discarded
        let mut all_segments = preceding.clone();
        all_segments.push(segment_id);
        all_segments.extend(following.clone());
        let motif = sequence_service.detect_motif(&all_segments).await.ok().flatten();
        let context = SequenceContext::new(preceding, following);
        let context = if let Some(m) = motif {
            context.with_motif(m)
        } else {
            context
        };
        Ok(Some(context))
    }
    /// Internal implementation of interpretation generation.
    ///
    /// Claims are filtered by `min_claim_confidence`, capped at `max_claims`,
    /// and the overall confidence is the mean of the surviving claims.
    //
    // NOTE(review): `config.min_interpretation_confidence` is never consulted
    // here (or anywhere in this impl) — confirm whether low-confidence
    // interpretations were meant to be rejected.
    async fn generate_interpretation_internal(
        &self,
        query_id: &EmbeddingId,
        neighbors: &[NeighborEvidence],
        cluster_context: &ClusterContext,
        sequence_context: &Option<SequenceContext>,
    ) -> Result<Interpretation> {
        // Generate structural description
        let structural_description = self
            .generate_structural_description(neighbors, cluster_context, sequence_context);
        // Generate claims with evidence citations
        let claims = self
            .claim_generator
            .generate_claims(query_id, neighbors, cluster_context, sequence_context)
            .await?;
        // Filter claims by confidence threshold
        let claims: Vec<Claim> = claims
            .into_iter()
            .filter(|c| c.confidence >= self.config.min_claim_confidence)
            .take(self.config.max_claims)
            .collect();
        // Calculate overall confidence
        let confidence = if claims.is_empty() {
            0.0
        } else {
            let sum: f32 = claims.iter().map(|c| c.confidence).sum();
            sum / claims.len() as f32
        };
        Ok(Interpretation::new(structural_description, claims, confidence))
    }
    /// Generate a structural description of the acoustic signal.
    ///
    /// Joins template sentences for whichever evidence kinds are present, and
    /// falls back to a fixed message when there is no evidence at all.
    fn generate_structural_description(
        &self,
        neighbors: &[NeighborEvidence],
        cluster_context: &ClusterContext,
        sequence_context: &Option<SequenceContext>,
    ) -> String {
        let templates = InterpretationTemplates::new();
        let mut parts = Vec::new();
        // Describe based on neighbors
        if !neighbors.is_empty() {
            let avg_distance: f32 = neighbors.iter().map(|n| n.distance).sum::<f32>()
                / neighbors.len() as f32;
            // Distance is capped at 1.0 when converted to a similarity score.
            let similarity = 1.0 - avg_distance.min(1.0);
            parts.push(templates.neighbor_description(neighbors.len(), similarity));
            // Add taxon info if available
            let taxa: Vec<&str> = neighbors
                .iter()
                .filter_map(|n| n.recording_metadata.taxon.as_deref())
                .collect();
            if !taxa.is_empty() {
                parts.push(templates.taxon_description(&taxa));
            }
        }
        // Describe cluster context
        if cluster_context.has_cluster() {
            let label = cluster_context
                .cluster_label
                .as_deref()
                .unwrap_or("unlabeled");
            parts.push(templates.cluster_description(
                label,
                cluster_context.confidence,
                cluster_context.exemplar_similarity,
            ));
        }
        // Describe sequence context
        if let Some(seq) = sequence_context {
            if seq.has_temporal_context() {
                parts.push(templates.sequence_description(
                    seq.sequence_length(),
                    seq.detected_motif.as_deref(),
                ));
            }
        }
        if parts.is_empty() {
            "Insufficient evidence for structural description.".to_string()
        } else {
            parts.join(" ")
        }
    }
    /// Validate a single claim against its evidence.
    ///
    /// A claim passes when it has at least one evidence reference, meets the
    /// claim-confidence threshold, and every referenced piece of evidence
    /// resolves: neighbor embeddings must exist, clusters must have labels;
    /// sequence/taxon references are derived and assumed valid.
    async fn validate_single_claim(&self, claim: &Claim) -> Result<bool> {
        // A claim is valid if it has evidence AND confidence is above threshold
        if claim.evidence_refs.is_empty() {
            warn!("Claim has no evidence references: {}", claim.statement);
            return Ok(false);
        }
        if claim.confidence < self.config.min_claim_confidence {
            debug!(
                "Claim confidence {} below threshold {}: {}",
                claim.confidence, self.config.min_claim_confidence, claim.statement
            );
            return Ok(false);
        }
        // Verify each evidence reference exists
        for evidence_ref in &claim.evidence_refs {
            let exists = match evidence_ref.ref_type {
                EvidenceRefType::Neighbor => {
                    let emb_id = EmbeddingId::new(&evidence_ref.ref_id);
                    self.vector_service
                        .get_embedding(&emb_id)
                        .await
                        .map(|e| e.is_some())
                        .unwrap_or(false)
                }
                EvidenceRefType::Cluster => {
                    let cluster_id = crate::domain::entities::ClusterId::new(&evidence_ref.ref_id);
                    self.cluster_repo
                        .get_cluster_label(&cluster_id)
                        .await
                        .is_ok()
                }
                EvidenceRefType::Sequence | EvidenceRefType::Taxon => {
                    // These are derived evidence, assume valid if present
                    true
                }
            };
            if !exists {
                warn!(
                    "Evidence reference not found: {} ({})",
                    evidence_ref.ref_id, evidence_ref.ref_type
                );
                return Ok(false);
            }
        }
        Ok(true)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::repository::InMemoryClusterRepository;
    use std::collections::HashMap;
    use std::sync::RwLock;
    // Mock implementations for testing
    //
    // In-memory `VectorSpaceService` backed by two maps:
    // query id -> registered neighbors, and embedding id -> vector.
    struct MockVectorService {
        neighbors: RwLock<HashMap<String, Vec<Neighbor>>>,
        embeddings: RwLock<HashMap<String, Vec<f32>>>,
    }
    impl MockVectorService {
        fn new() -> Self {
            Self {
                neighbors: RwLock::new(HashMap::new()),
                embeddings: RwLock::new(HashMap::new()),
            }
        }
        // Register a neighbor to be returned for `query_id`.
        fn add_neighbor(&self, query_id: &str, neighbor: Neighbor) {
            let mut neighbors = self.neighbors.write().unwrap();
            neighbors
                .entry(query_id.to_string())
                .or_default()
                .push(neighbor);
        }
        // Register an embedding vector under `id`.
        fn add_embedding(&self, id: &str, embedding: Vec<f32>) {
            let mut embeddings = self.embeddings.write().unwrap();
            embeddings.insert(id.to_string(), embedding);
        }
    }
    #[async_trait::async_trait]
    impl VectorSpaceService for MockVectorService {
        async fn find_neighbors(&self, embedding_id: &EmbeddingId, k: usize) -> Result<Vec<Neighbor>> {
            let neighbors = self.neighbors.read().unwrap();
            let result = neighbors
                .get(embedding_id.as_str())
                .map(|n| n.iter().take(k).cloned().collect())
                .unwrap_or_default();
            Ok(result)
        }
        async fn get_embedding(&self, embedding_id: &EmbeddingId) -> Result<Option<Vec<f32>>> {
            let embeddings = self.embeddings.read().unwrap();
            Ok(embeddings.get(embedding_id.as_str()).cloned())
        }
        async fn calculate_similarity(
            &self,
            _embedding_id_a: &EmbeddingId,
            _embedding_id_b: &EmbeddingId,
        ) -> Result<f32> {
            // Fixed similarity; sufficient for these tests.
            Ok(0.85)
        }
    }
    // Stateless metadata mock: derives metadata/URLs from the embedding id and
    // reports no segment, so sequence context stays `None` in these tests.
    struct MockMetadataService;
    #[async_trait::async_trait]
    impl MetadataService for MockMetadataService {
        async fn get_recording_metadata(
            &self,
            embedding_id: &EmbeddingId,
        ) -> Result<RecordingMetadata> {
            Ok(RecordingMetadata::new(format!("recording-{}", embedding_id)))
        }
        async fn get_spectrogram_url(&self, embedding_id: &EmbeddingId) -> Result<Option<String>> {
            Ok(Some(format!(
                "https://spectrograms.example.com/{}",
                embedding_id
            )))
        }
        async fn get_segment_id(&self, _embedding_id: &EmbeddingId) -> Result<Option<SegmentId>> {
            Ok(None)
        }
    }
    // End-to-end: both registered neighbors should surface in the built pack.
    #[tokio::test]
    async fn test_interpretation_service_build_evidence_pack() {
        let vector_service = Arc::new(MockVectorService::new());
        let cluster_repo = Arc::new(InMemoryClusterRepository::new());
        let metadata_service = Arc::new(MockMetadataService);
        // Add some test data
        vector_service.add_neighbor(
            "query-1",
            Neighbor::new(EmbeddingId::new("neighbor-1"), 0.1),
        );
        vector_service.add_neighbor(
            "query-1",
            Neighbor::new(EmbeddingId::new("neighbor-2"), 0.2),
        );
        vector_service.add_embedding("neighbor-1", vec![0.1, 0.2, 0.3]);
        vector_service.add_embedding("neighbor-2", vec![0.2, 0.3, 0.4]);
        let config = InterpretationConfig::default();
        let service = InterpretationService::new(
            vector_service,
            cluster_repo,
            metadata_service,
            config,
        );
        let query_id = EmbeddingId::new("query-1");
        let result = service.build_evidence_pack(&query_id).await;
        assert!(result.is_ok());
        let pack = result.unwrap();
        assert_eq!(pack.query_embedding_id, query_id);
        assert_eq!(pack.neighbors.len(), 2);
    }
    // A claim with resolvable evidence passes; one with no evidence fails.
    #[tokio::test]
    async fn test_validate_claims() {
        let vector_service = Arc::new(MockVectorService::new());
        let cluster_repo = Arc::new(InMemoryClusterRepository::new());
        let metadata_service = Arc::new(MockMetadataService);
        vector_service.add_embedding("evidence-1", vec![0.1, 0.2, 0.3]);
        let config = InterpretationConfig::default();
        let service = InterpretationService::new(
            vector_service,
            cluster_repo,
            metadata_service,
            config,
        );
        let valid_claim = Claim::new("Valid claim with evidence", 0.9)
            .with_evidence(vec![EvidenceRef::neighbor(
                &EmbeddingId::new("evidence-1"),
                "Supporting evidence",
            )]);
        let invalid_claim = Claim::new("Invalid claim without evidence", 0.9);
        let results = service
            .validate_claims(&[valid_claim, invalid_claim])
            .await
            .unwrap();
        assert_eq!(results.len(), 2);
        assert!(results[0].1); // Valid claim
        assert!(!results[1].1); // Invalid claim (no evidence)
    }
}

View File

@@ -0,0 +1,691 @@
//! Core domain entities for the Interpretation bounded context.
//!
//! These entities represent RAB (Retrieval-Augmented Bioacoustics) evidence packs
//! and their associated interpretations with cited claims.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// Type alias for timestamps (always UTC).
pub type Timestamp = DateTime<Utc>;
/// Unique identifier for embeddings
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct EmbeddingId(pub String);

impl EmbeddingId {
    /// Wrap any string-like value as an embedding ID.
    pub fn new(id: impl Into<String>) -> Self {
        EmbeddingId(id.into())
    }

    /// Mint a fresh, random (UUID v4) embedding ID.
    pub fn generate() -> Self {
        EmbeddingId(Uuid::new_v4().to_string())
    }

    /// Borrow the underlying string.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}

impl From<String> for EmbeddingId {
    fn from(s: String) -> Self {
        EmbeddingId(s)
    }
}

impl From<&str> for EmbeddingId {
    fn from(s: &str) -> Self {
        EmbeddingId(s.to_owned())
    }
}

impl std::fmt::Display for EmbeddingId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}
/// Unique identifier for clusters
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ClusterId(pub String);

impl ClusterId {
    /// Wrap any string-like value as a cluster ID.
    pub fn new(id: impl Into<String>) -> Self {
        ClusterId(id.into())
    }

    /// Borrow the underlying string.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}

impl std::fmt::Display for ClusterId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}
/// Unique identifier for audio segments
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(pub String);
impl SegmentId {
    /// Wrap any string-like value as a segment ID.
    pub fn new(id: impl Into<String>) -> Self {
        Self(id.into())
    }
    /// Borrow the underlying string.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}
/// Evidence pack containing all evidence for a bioacoustic query.
///
/// An evidence pack is the core artifact of RAB interpretation, bundling
/// together neighbor evidence, cluster context, sequence context, and
/// the generated interpretation with cited claims.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidencePack {
    /// Unique identifier for this evidence pack (a UUID v4 string; see `new`)
    pub id: String,
    /// The query embedding that initiated this evidence pack
    pub query_embedding_id: EmbeddingId,
    /// Evidence from nearest neighbor search
    pub neighbors: Vec<NeighborEvidence>,
    /// Context from cluster analysis
    pub cluster_context: ClusterContext,
    /// Optional temporal sequence context
    pub sequence_context: Option<SequenceContext>,
    /// Generated interpretation with claims
    pub interpretation: Interpretation,
    /// When this evidence pack was created
    pub created_at: Timestamp,
}
impl EvidencePack {
/// Create a new evidence pack with a generated ID
pub fn new(
query_embedding_id: EmbeddingId,
neighbors: Vec<NeighborEvidence>,
cluster_context: ClusterContext,
sequence_context: Option<SequenceContext>,
interpretation: Interpretation,
) -> Self {
Self {
id: Uuid::new_v4().to_string(),
query_embedding_id,
neighbors,
cluster_context,
sequence_context,
interpretation,
created_at: Utc::now(),
}
}
/// Get the total confidence score for this evidence pack
pub fn overall_confidence(&self) -> f32 {
let neighbor_confidence = if self.neighbors.is_empty() {
0.0
} else {
// Higher confidence if neighbors are close (low distance)
let avg_distance: f32 = self.neighbors.iter().map(|n| n.distance).sum::<f32>()
/ self.neighbors.len() as f32;
(1.0 - avg_distance.min(1.0)).max(0.0)
};
let cluster_confidence = self.cluster_context.confidence;
let interpretation_confidence = self.interpretation.confidence;
// Weighted average
(neighbor_confidence * 0.3 + cluster_confidence * 0.3 + interpretation_confidence * 0.4)
}
/// Get the number of distinct evidence sources
pub fn evidence_source_count(&self) -> usize {
let mut count = 0;
if !self.neighbors.is_empty() {
count += 1;
}
if self.cluster_context.assigned_cluster.is_some() {
count += 1;
}
if self.sequence_context.is_some() {
count += 1;
}
count
}
}
/// Evidence from a single neighbor in vector space.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NeighborEvidence {
    /// The embedding ID of this neighbor
    pub embedding_id: EmbeddingId,
    /// Distance from the query embedding (lower = more similar)
    pub distance: f32,
    /// Cluster assignment if available
    pub cluster_id: Option<ClusterId>,
    /// Metadata about the source recording
    pub recording_metadata: RecordingMetadata,
    /// Optional URL to a spectrogram visualization
    pub spectrogram_url: Option<String>,
}
impl NeighborEvidence {
/// Create new neighbor evidence
pub fn new(
embedding_id: EmbeddingId,
distance: f32,
recording_metadata: RecordingMetadata,
) -> Self {
Self {
embedding_id,
distance,
cluster_id: None,
recording_metadata,
spectrogram_url: None,
}
}
/// Add cluster information
pub fn with_cluster(mut self, cluster_id: ClusterId) -> Self {
self.cluster_id = Some(cluster_id);
self
}
/// Add spectrogram URL
pub fn with_spectrogram(mut self, url: String) -> Self {
self.spectrogram_url = Some(url);
self
}
/// Convert distance to similarity score (0.0 to 1.0)
pub fn similarity(&self) -> f32 {
(1.0 - self.distance).max(0.0).min(1.0)
}
}
/// Metadata about a source recording
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordingMetadata {
    /// Recording identifier
    pub recording_id: String,
    /// Species or taxon if known
    pub taxon: Option<String>,
    /// Geographic location
    pub location: Option<GeoLocation>,
    /// Recording timestamp
    pub recorded_at: Option<Timestamp>,
    /// Duration in seconds
    pub duration_seconds: Option<f32>,
    /// Sample rate in Hz
    pub sample_rate: Option<u32>,
    /// Additional tags or labels
    pub tags: Vec<String>,
}
impl RecordingMetadata {
/// Create minimal recording metadata
pub fn new(recording_id: impl Into<String>) -> Self {
Self {
recording_id: recording_id.into(),
taxon: None,
location: None,
recorded_at: None,
duration_seconds: None,
sample_rate: None,
tags: Vec::new(),
}
}
/// Add taxon information
pub fn with_taxon(mut self, taxon: impl Into<String>) -> Self {
self.taxon = Some(taxon.into());
self
}
/// Add location information
pub fn with_location(mut self, location: GeoLocation) -> Self {
self.location = Some(location);
self
}
}
/// Geographic location
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeoLocation {
    // Coordinates — presumably decimal degrees (WGS84); confirm with producers.
    pub latitude: f64,
    pub longitude: f64,
    /// Elevation above sea level, in meters, when known
    pub elevation_meters: Option<f32>,
    /// Free-text place name, when known
    pub locality: Option<String>,
}
impl GeoLocation {
    /// Location from coordinates only; elevation and locality unset.
    pub fn new(latitude: f64, longitude: f64) -> Self {
        Self {
            latitude,
            longitude,
            elevation_meters: None,
            locality: None,
        }
    }
}
/// Context from cluster analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusterContext {
    /// The cluster this embedding was assigned to (`None` = unassigned)
    pub assigned_cluster: Option<ClusterId>,
    /// Human-readable label for the cluster
    pub cluster_label: Option<String>,
    /// Confidence in the cluster assignment (0.0 to 1.0)
    pub confidence: f32,
    /// Similarity to the cluster exemplar (0.0 to 1.0)
    pub exemplar_similarity: f32,
}
impl ClusterContext {
/// Create a new cluster context
pub fn new(
assigned_cluster: Option<ClusterId>,
confidence: f32,
exemplar_similarity: f32,
) -> Self {
Self {
assigned_cluster,
cluster_label: None,
confidence,
exemplar_similarity,
}
}
/// Create an empty cluster context (no cluster assigned)
pub fn empty() -> Self {
Self {
assigned_cluster: None,
cluster_label: None,
confidence: 0.0,
exemplar_similarity: 0.0,
}
}
/// Add a cluster label
pub fn with_label(mut self, label: impl Into<String>) -> Self {
self.cluster_label = Some(label.into());
self
}
/// Check if a cluster was assigned
pub fn has_cluster(&self) -> bool {
self.assigned_cluster.is_some()
}
}
/// Temporal sequence context for understanding vocalization patterns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceContext {
    /// Segments that precede the query in time
    pub preceding_segments: Vec<SegmentId>,
    /// Segments that follow the query in time
    pub following_segments: Vec<SegmentId>,
    /// Detected acoustic motif pattern
    pub detected_motif: Option<String>,
}
impl SequenceContext {
/// Create a new sequence context
pub fn new(
preceding_segments: Vec<SegmentId>,
following_segments: Vec<SegmentId>,
) -> Self {
Self {
preceding_segments,
following_segments,
detected_motif: None,
}
}
/// Create an empty sequence context
pub fn empty() -> Self {
Self {
preceding_segments: Vec::new(),
following_segments: Vec::new(),
detected_motif: None,
}
}
/// Add a detected motif
pub fn with_motif(mut self, motif: impl Into<String>) -> Self {
self.detected_motif = Some(motif.into());
self
}
/// Check if sequence context has any temporal information
pub fn has_temporal_context(&self) -> bool {
!self.preceding_segments.is_empty() || !self.following_segments.is_empty()
}
/// Get total sequence length
pub fn sequence_length(&self) -> usize {
self.preceding_segments.len() + 1 + self.following_segments.len()
}
}
/// Generated interpretation of the evidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Interpretation {
    /// Structural description of the acoustic signal
    pub structural_description: String,
    /// Claims made about the signal with evidence citations
    pub claims: Vec<Claim>,
    /// Overall confidence in the interpretation (0.0 to 1.0)
    pub confidence: f32,
}
impl Interpretation {
/// Create a new interpretation
pub fn new(structural_description: String, claims: Vec<Claim>, confidence: f32) -> Self {
Self {
structural_description,
claims,
confidence: confidence.clamp(0.0, 1.0),
}
}
/// Create an empty interpretation with no claims
pub fn empty() -> Self {
Self {
structural_description: String::new(),
claims: Vec::new(),
confidence: 0.0,
}
}
/// Add a claim to the interpretation
pub fn add_claim(&mut self, claim: Claim) {
self.claims.push(claim);
self.recalculate_confidence();
}
/// Recalculate overall confidence based on claims
fn recalculate_confidence(&mut self) {
if self.claims.is_empty() {
return;
}
let total_confidence: f32 = self.claims.iter().map(|c| c.confidence).sum();
self.confidence = total_confidence / self.claims.len() as f32;
}
/// Get claims above a confidence threshold
pub fn high_confidence_claims(&self, threshold: f32) -> Vec<&Claim> {
self.claims
.iter()
.filter(|c| c.confidence >= threshold)
.collect()
}
/// Get the number of evidence-backed claims
pub fn evidenced_claim_count(&self) -> usize {
self.claims
.iter()
.filter(|c| !c.evidence_refs.is_empty())
.count()
}
}
/// A claim made about the acoustic signal with evidence citations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Claim {
    /// The statement being made
    pub statement: String,
    /// References to evidence supporting this claim
    pub evidence_refs: Vec<EvidenceRef>,
    /// Confidence in this claim (0.0 to 1.0)
    pub confidence: f32,
}
impl Claim {
/// Create a new claim
pub fn new(statement: impl Into<String>, confidence: f32) -> Self {
Self {
statement: statement.into(),
evidence_refs: Vec::new(),
confidence: confidence.clamp(0.0, 1.0),
}
}
/// Add an evidence reference
pub fn add_evidence(&mut self, evidence_ref: EvidenceRef) {
self.evidence_refs.push(evidence_ref);
}
/// Create a claim with evidence references
pub fn with_evidence(mut self, evidence_refs: Vec<EvidenceRef>) -> Self {
self.evidence_refs = evidence_refs;
self
}
/// Check if this claim has supporting evidence
pub fn has_evidence(&self) -> bool {
!self.evidence_refs.is_empty()
}
/// Get evidence references of a specific type
pub fn evidence_of_type(&self, ref_type: EvidenceRefType) -> Vec<&EvidenceRef> {
self.evidence_refs
.iter()
.filter(|e| e.ref_type == ref_type)
.collect()
}
}
/// Reference to a piece of evidence supporting a claim.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvidenceRef {
    /// Type of evidence being referenced
    pub ref_type: EvidenceRefType,
    /// Identifier for the evidence (embedding ID, cluster ID, segment ID, or taxon name)
    pub ref_id: String,
    /// Human-readable description of the evidence
    pub description: String,
}
impl EvidenceRef {
    /// Construct an evidence reference of an arbitrary type.
    pub fn new(ref_type: EvidenceRefType, ref_id: impl Into<String>, description: impl Into<String>) -> Self {
        let ref_id = ref_id.into();
        let description = description.into();
        Self { ref_type, ref_id, description }
    }

    /// Cite a nearest-neighbor embedding as evidence.
    pub fn neighbor(embedding_id: &EmbeddingId, description: impl Into<String>) -> Self {
        Self::new(EvidenceRefType::Neighbor, embedding_id.as_str(), description)
    }

    /// Cite a cluster assignment as evidence.
    pub fn cluster(cluster_id: &ClusterId, description: impl Into<String>) -> Self {
        Self::new(EvidenceRefType::Cluster, cluster_id.as_str(), description)
    }

    /// Cite a temporal-sequence segment as evidence.
    pub fn sequence(segment_id: &SegmentId, description: impl Into<String>) -> Self {
        Self::new(EvidenceRefType::Sequence, segment_id.as_str(), description)
    }

    /// Cite a taxonomic identification as evidence.
    pub fn taxon(taxon_name: impl Into<String>, description: impl Into<String>) -> Self {
        Self::new(EvidenceRefType::Taxon, taxon_name, description)
    }
}
/// Types of evidence that can be referenced in claims.
///
/// Indicates how `EvidenceRef::ref_id` should be interpreted; the `Display`
/// impl renders the lowercase variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum EvidenceRefType {
    /// Evidence from nearest neighbor search (ref_id is an embedding ID)
    Neighbor,
    /// Evidence from cluster assignment (ref_id is a cluster ID)
    Cluster,
    /// Evidence from temporal sequence analysis (ref_id is a segment ID)
    Sequence,
    /// Evidence from taxonomic classification (ref_id is a taxon name)
    Taxon,
}
impl std::fmt::Display for EvidenceRefType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
EvidenceRefType::Neighbor => write!(f, "neighbor"),
EvidenceRefType::Cluster => write!(f, "cluster"),
EvidenceRefType::Sequence => write!(f, "sequence"),
EvidenceRefType::Taxon => write!(f, "taxon"),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_embedding_id() {
        let id = EmbeddingId::new("test-123");
        assert_eq!(id.as_str(), "test-123");
        assert_eq!(id.to_string(), "test-123");
        // Generated ids must never be empty.
        let generated = EmbeddingId::generate();
        assert!(!generated.as_str().is_empty());
    }

    #[test]
    fn test_neighbor_evidence_similarity() {
        let metadata = RecordingMetadata::new("rec-1");
        // Distance 0.2 maps to similarity 0.8.
        let evidence = NeighborEvidence::new(
            EmbeddingId::new("emb-1"),
            0.2,
            metadata,
        );
        assert_eq!(evidence.similarity(), 0.8);
        // Distances beyond 1.0 floor the similarity at 0.0 rather than going negative.
        let far_evidence = NeighborEvidence::new(
            EmbeddingId::new("emb-2"),
            1.5,
            RecordingMetadata::new("rec-2"),
        );
        assert_eq!(far_evidence.similarity(), 0.0);
    }

    #[test]
    fn test_cluster_context() {
        let context = ClusterContext::new(
            Some(ClusterId::new("cluster-1")),
            0.85,
            0.92,
        ).with_label("Song Type A");
        assert!(context.has_cluster());
        assert_eq!(context.cluster_label, Some("Song Type A".to_string()));
    }

    #[test]
    fn test_sequence_context() {
        let context = SequenceContext::new(
            vec![SegmentId::new("seg-1"), SegmentId::new("seg-2")],
            vec![SegmentId::new("seg-4")],
        ).with_motif("ABAB");
        assert!(context.has_temporal_context());
        // 2 preceding + 1 following yields length 4 — presumably
        // sequence_length() counts the query segment itself as well.
        assert_eq!(context.sequence_length(), 4);
        assert_eq!(context.detected_motif, Some("ABAB".to_string()));
    }

    #[test]
    fn test_claim_with_evidence() {
        let mut claim = Claim::new("This is a dawn chorus vocalization", 0.9);
        claim.add_evidence(EvidenceRef::neighbor(
            &EmbeddingId::new("emb-1"),
            "Similar to known dawn chorus recording",
        ));
        claim.add_evidence(EvidenceRef::cluster(
            &ClusterId::new("cluster-5"),
            "Assigned to dawn chorus cluster",
        ));
        assert!(claim.has_evidence());
        assert_eq!(claim.evidence_refs.len(), 2);
        // Filtering by type returns only the neighbor citation.
        assert_eq!(claim.evidence_of_type(EvidenceRefType::Neighbor).len(), 1);
    }

    #[test]
    fn test_interpretation_confidence() {
        let mut interp = Interpretation::new(
            "Complex harmonic structure with frequency modulation".to_string(),
            Vec::new(),
            0.0,
        );
        interp.add_claim(Claim::new("Claim 1", 0.8));
        interp.add_claim(Claim::new("Claim 2", 0.6));
        // Interpretation confidence is recomputed as the mean of its claims.
        assert_eq!(interp.confidence, 0.7);
    }

    #[test]
    fn test_evidence_pack_overall_confidence() {
        let pack = EvidencePack::new(
            EmbeddingId::new("query-1"),
            vec![
                NeighborEvidence::new(
                    EmbeddingId::new("n-1"),
                    0.1,
                    RecordingMetadata::new("r-1"),
                ),
                NeighborEvidence::new(
                    EmbeddingId::new("n-2"),
                    0.2,
                    RecordingMetadata::new("r-2"),
                ),
            ],
            ClusterContext::new(Some(ClusterId::new("c-1")), 0.9, 0.85),
            None,
            Interpretation::new("Test".to_string(), vec![Claim::new("Test", 0.8)], 0.8),
        );
        // Only range-checked: the exact aggregate formula lives in the entity.
        let confidence = pack.overall_confidence();
        assert!(confidence > 0.0 && confidence <= 1.0);
        // Two neighbor evidence entries were supplied.
        assert_eq!(pack.evidence_source_count(), 2);
    }
}

View File

@@ -0,0 +1,6 @@
//! Domain layer for the Interpretation bounded context.
//!
//! Contains core entities and repository traits.
pub mod entities;
pub mod repository;

View File

@@ -0,0 +1,305 @@
//! Repository traits for the Interpretation bounded context.
//!
//! These traits define the persistence interfaces for evidence packs
//! and related entities.
use async_trait::async_trait;
use crate::{Error, Result};
use super::entities::{EvidencePack, EmbeddingId, ClusterId, ClusterContext};
/// Repository for persisting and retrieving evidence packs.
///
/// Implementations of this trait handle the storage and retrieval
/// of evidence packs, which are the primary artifacts of RAB interpretation.
///
/// # Errors
///
/// Every method returns `Err` when the underlying store fails (e.g. lock
/// poisoning in the in-memory implementation, or I/O in persistent ones).
#[async_trait]
pub trait EvidencePackRepository: Send + Sync {
    /// Save an evidence pack to the repository.
    ///
    /// If an evidence pack with the same ID already exists, it will be updated.
    async fn save(&self, pack: &EvidencePack) -> Result<()>;
    /// Find an evidence pack by its unique identifier.
    ///
    /// Returns `Ok(None)` when no pack has the given ID.
    async fn find_by_id(&self, id: &str) -> Result<Option<EvidencePack>>;
    /// Find all evidence packs for a given query embedding.
    ///
    /// Returns evidence packs in reverse chronological order (newest first).
    async fn find_by_query(&self, embedding_id: &EmbeddingId) -> Result<Vec<EvidencePack>>;
    /// Delete an evidence pack by ID.
    ///
    /// Returns `true` if a pack was removed, `false` if none had the ID.
    async fn delete(&self, id: &str) -> Result<bool>;
    /// Find evidence packs created within a time range.
    ///
    /// Bounds are inclusive in the in-memory implementation.
    async fn find_by_time_range(
        &self,
        start: chrono::DateTime<chrono::Utc>,
        end: chrono::DateTime<chrono::Utc>,
    ) -> Result<Vec<EvidencePack>>;
    /// Count total evidence packs in the repository.
    async fn count(&self) -> Result<usize>;
}
/// Repository for cluster information used in interpretation.
///
/// This trait provides read access to cluster data needed for
/// building evidence packs and generating interpretations.
#[async_trait]
pub trait ClusterRepository: Send + Sync {
    /// Get cluster context for an embedding.
    ///
    /// The in-memory implementation returns an empty context (no assigned
    /// cluster) rather than an error when the embedding is unassigned.
    async fn get_cluster_context(&self, embedding_id: &EmbeddingId) -> Result<ClusterContext>;
    /// Get the label for a cluster, if one has been set.
    async fn get_cluster_label(&self, cluster_id: &ClusterId) -> Result<Option<String>>;
    /// Get the exemplar embedding for a cluster, if one has been set.
    async fn get_cluster_exemplar(&self, cluster_id: &ClusterId) -> Result<Option<EmbeddingId>>;
    /// Get all embeddings in a cluster.
    async fn get_cluster_members(&self, cluster_id: &ClusterId) -> Result<Vec<EmbeddingId>>;
    /// Get statistics about a cluster.
    async fn get_cluster_stats(&self, cluster_id: &ClusterId) -> Result<ClusterStats>;
}
/// Statistics about a cluster.
///
/// `Default` yields the stats of an empty cluster: zero members and all
/// scores at 0.0. The hand-written `impl Default` (which set every field to
/// its zero value anyway) has been replaced by `#[derive(Default)]`.
#[derive(Debug, Clone, Default)]
pub struct ClusterStats {
    /// Number of embeddings in the cluster
    pub member_count: usize,
    /// Average distance from cluster center
    pub avg_distance: f32,
    /// Maximum distance from cluster center
    pub max_distance: f32,
    /// Cluster coherence score (0.0 to 1.0)
    pub coherence: f32,
}
/// In-memory implementation of EvidencePackRepository for testing.
///
/// Not intended for production persistence; contents are lost on drop.
#[derive(Debug, Default)]
pub struct InMemoryEvidencePackRepository {
    // Packs keyed by their unique `id`; RwLock allows concurrent readers.
    packs: std::sync::RwLock<std::collections::HashMap<String, EvidencePack>>,
}
impl InMemoryEvidencePackRepository {
    /// Create an empty in-memory repository.
    pub fn new() -> Self {
        Self {
            packs: std::sync::RwLock::new(std::collections::HashMap::new()),
        }
    }
}
#[async_trait]
impl EvidencePackRepository for InMemoryEvidencePackRepository {
async fn save(&self, pack: &EvidencePack) -> Result<()> {
let mut packs = self.packs.write().map_err(|e| Error::internal(e.to_string()))?;
packs.insert(pack.id.clone(), pack.clone());
Ok(())
}
async fn find_by_id(&self, id: &str) -> Result<Option<EvidencePack>> {
let packs = self.packs.read().map_err(|e| Error::internal(e.to_string()))?;
Ok(packs.get(id).cloned())
}
async fn find_by_query(&self, embedding_id: &EmbeddingId) -> Result<Vec<EvidencePack>> {
let packs = self.packs.read().map_err(|e| Error::internal(e.to_string()))?;
let mut results: Vec<EvidencePack> = packs
.values()
.filter(|p| p.query_embedding_id == *embedding_id)
.cloned()
.collect();
results.sort_by(|a, b| b.created_at.cmp(&a.created_at));
Ok(results)
}
async fn delete(&self, id: &str) -> Result<bool> {
let mut packs = self.packs.write().map_err(|e| Error::internal(e.to_string()))?;
Ok(packs.remove(id).is_some())
}
async fn find_by_time_range(
&self,
start: chrono::DateTime<chrono::Utc>,
end: chrono::DateTime<chrono::Utc>,
) -> Result<Vec<EvidencePack>> {
let packs = self.packs.read().map_err(|e| Error::internal(e.to_string()))?;
let mut results: Vec<EvidencePack> = packs
.values()
.filter(|p| p.created_at >= start && p.created_at <= end)
.cloned()
.collect();
results.sort_by(|a, b| b.created_at.cmp(&a.created_at));
Ok(results)
}
async fn count(&self) -> Result<usize> {
let packs = self.packs.read().map_err(|e| Error::internal(e.to_string()))?;
Ok(packs.len())
}
}
/// In-memory implementation of ClusterRepository for testing.
#[derive(Debug, Default)]
pub struct InMemoryClusterRepository {
    // Cluster data keyed by cluster ID.
    clusters: std::sync::RwLock<std::collections::HashMap<ClusterId, ClusterData>>,
    // Which cluster each embedding is currently assigned to.
    assignments: std::sync::RwLock<std::collections::HashMap<EmbeddingId, ClusterId>>,
}
/// Internal record for a single cluster held by `InMemoryClusterRepository`.
#[derive(Debug, Clone)]
struct ClusterData {
    /// Optional human-readable label.
    label: Option<String>,
    /// Optional exemplar (representative) embedding.
    exemplar: Option<EmbeddingId>,
    /// Embeddings currently assigned to this cluster.
    members: Vec<EmbeddingId>,
    /// Aggregate statistics for the cluster.
    stats: ClusterStats,
}
impl InMemoryClusterRepository {
    /// Create a new in-memory cluster repository.
    pub fn new() -> Self {
        Self::default()
    }

    /// Add a cluster to the repository.
    ///
    /// Re-adding an existing cluster ID replaces its data (label, exemplar,
    /// members, stats) with a fresh, empty entry.
    pub fn add_cluster(
        &self,
        cluster_id: ClusterId,
        label: Option<String>,
        exemplar: Option<EmbeddingId>,
    ) -> Result<()> {
        let mut clusters = self.clusters.write().map_err(|e| Error::internal(e.to_string()))?;
        clusters.insert(
            cluster_id,
            ClusterData {
                label,
                exemplar,
                members: Vec::new(),
                stats: ClusterStats::default(),
            },
        );
        Ok(())
    }

    /// Assign an embedding to a cluster.
    ///
    /// If the embedding was previously assigned to a different cluster it is
    /// removed from that cluster's member list, so membership stays
    /// consistent with the assignment map. Reassigning to the same cluster
    /// is a no-op (previously this pushed a duplicate member, and a stale
    /// membership was left behind in the old cluster on reassignment).
    pub fn assign_to_cluster(
        &self,
        embedding_id: EmbeddingId,
        cluster_id: ClusterId,
    ) -> Result<()> {
        // Record the assignment first, remembering any previous cluster.
        let previous = {
            let mut assignments = self.assignments.write().map_err(|e| Error::internal(e.to_string()))?;
            assignments.insert(embedding_id.clone(), cluster_id.clone())
        };
        // Already a member of the target cluster: nothing else to update.
        if previous.as_ref() == Some(&cluster_id) {
            return Ok(());
        }
        let mut clusters = self.clusters.write().map_err(|e| Error::internal(e.to_string()))?;
        // Drop the embedding from its old cluster's member list.
        if let Some(old_id) = previous {
            if let Some(old_cluster) = clusters.get_mut(&old_id) {
                old_cluster.members.retain(|m| m != &embedding_id);
            }
        }
        if let Some(cluster) = clusters.get_mut(&cluster_id) {
            cluster.members.push(embedding_id);
        }
        Ok(())
    }
}
#[async_trait]
impl ClusterRepository for InMemoryClusterRepository {
async fn get_cluster_context(&self, embedding_id: &EmbeddingId) -> Result<ClusterContext> {
let assignments = self.assignments.read().map_err(|e| Error::internal(e.to_string()))?;
let cluster_id = assignments.get(embedding_id).cloned();
if let Some(cid) = &cluster_id {
let clusters = self.clusters.read().map_err(|e| Error::internal(e.to_string()))?;
if let Some(cluster) = clusters.get(cid) {
return Ok(ClusterContext {
assigned_cluster: Some(cid.clone()),
cluster_label: cluster.label.clone(),
confidence: 0.85,
exemplar_similarity: 0.90,
});
}
}
Ok(ClusterContext::empty())
}
async fn get_cluster_label(&self, cluster_id: &ClusterId) -> Result<Option<String>> {
let clusters = self.clusters.read().map_err(|e| Error::internal(e.to_string()))?;
Ok(clusters.get(cluster_id).and_then(|c| c.label.clone()))
}
async fn get_cluster_exemplar(&self, cluster_id: &ClusterId) -> Result<Option<EmbeddingId>> {
let clusters = self.clusters.read().map_err(|e| Error::internal(e.to_string()))?;
Ok(clusters.get(cluster_id).and_then(|c| c.exemplar.clone()))
}
async fn get_cluster_members(&self, cluster_id: &ClusterId) -> Result<Vec<EmbeddingId>> {
let clusters = self.clusters.read().map_err(|e| Error::internal(e.to_string()))?;
Ok(clusters.get(cluster_id).map(|c| c.members.clone()).unwrap_or_default())
}
async fn get_cluster_stats(&self, cluster_id: &ClusterId) -> Result<ClusterStats> {
let clusters = self.clusters.read().map_err(|e| Error::internal(e.to_string()))?;
Ok(clusters.get(cluster_id).map(|c| c.stats.clone()).unwrap_or_default())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_in_memory_evidence_pack_repo() {
        use crate::domain::entities::*;
        let repo = InMemoryEvidencePackRepository::new();
        let pack = EvidencePack::new(
            EmbeddingId::new("query-1"),
            Vec::new(),
            ClusterContext::empty(),
            None,
            Interpretation::empty(),
        );
        // Round-trip: save, count, find by id, find by query, delete.
        repo.save(&pack).await.unwrap();
        assert_eq!(repo.count().await.unwrap(), 1);
        let found = repo.find_by_id(&pack.id).await.unwrap();
        assert!(found.is_some());
        let by_query = repo.find_by_query(&EmbeddingId::new("query-1")).await.unwrap();
        assert_eq!(by_query.len(), 1);
        repo.delete(&pack.id).await.unwrap();
        assert_eq!(repo.count().await.unwrap(), 0);
    }

    #[tokio::test]
    async fn test_in_memory_cluster_repo() {
        let repo = InMemoryClusterRepository::new();
        let cluster_id = ClusterId::new("cluster-1");
        repo.add_cluster(
            cluster_id.clone(),
            Some("Song Type A".to_string()),
            Some(EmbeddingId::new("exemplar-1")),
        ).unwrap();
        let embedding_id = EmbeddingId::new("emb-1");
        repo.assign_to_cluster(embedding_id.clone(), cluster_id.clone()).unwrap();
        // The assignment is visible through the context, label, and member views.
        let context = repo.get_cluster_context(&embedding_id).await.unwrap();
        assert!(context.has_cluster());
        assert_eq!(context.cluster_label, Some("Song Type A".to_string()));
        let label = repo.get_cluster_label(&cluster_id).await.unwrap();
        assert_eq!(label, Some("Song Type A".to_string()));
        let members = repo.get_cluster_members(&cluster_id).await.unwrap();
        assert_eq!(members.len(), 1);
    }
}

View File

@@ -0,0 +1,754 @@
//! Claim generator for RAB interpretations.
//!
//! This module generates claims with evidence citations based on
//! neighbor evidence, cluster context, and sequence context.
use tracing::{debug, instrument};
use crate::application::services::InterpretationConfig;
use crate::domain::entities::{
Claim, ClusterContext, ClusterId, EmbeddingId, EvidenceRef, EvidenceRefType,
NeighborEvidence, SequenceContext,
};
use crate::infrastructure::evidence_builder::EvidenceContext;
use crate::templates::InterpretationTemplates;
use crate::Result;
/// Generator for evidence-backed claims.
///
/// The `ClaimGenerator` creates claims based on available evidence
/// and ensures each claim has proper citations.
#[derive(Debug, Clone)]
pub struct ClaimGenerator {
    /// Minimum confidence threshold; lower-scoring claims are filtered out
    /// of `generate_claims` results
    min_confidence: f32,
    /// Maximum claims returned per generation pass
    max_claims: usize,
    /// Templates for claim text
    templates: InterpretationTemplates,
}
impl ClaimGenerator {
    /// Create a new claim generator from configuration.
    pub fn new(config: &InterpretationConfig) -> Self {
        Self {
            min_confidence: config.min_claim_confidence,
            max_claims: config.max_claims,
            templates: InterpretationTemplates::new(),
        }
    }

    /// Create a claim generator with custom parameters.
    pub fn with_params(min_confidence: f32, max_claims: usize) -> Self {
        Self {
            min_confidence,
            max_claims,
            templates: InterpretationTemplates::new(),
        }
    }

    /// Generate claims from collected evidence.
    ///
    /// Claims are generated based on:
    /// - Neighbor similarity patterns
    /// - Cluster assignments
    /// - Taxonomic information
    /// - Temporal sequence patterns
    ///
    /// Results are filtered to `min_confidence`, sorted by confidence
    /// (highest first), and truncated to `max_claims`.
    #[instrument(skip(self, neighbors, cluster_context, sequence_context))]
    pub async fn generate_claims(
        &self,
        query_id: &EmbeddingId,
        neighbors: &[NeighborEvidence],
        cluster_context: &ClusterContext,
        sequence_context: &Option<SequenceContext>,
    ) -> Result<Vec<Claim>> {
        let context = EvidenceContext::from_evidence(neighbors, cluster_context, sequence_context);
        let mut claims = Vec::new();
        // Generate similarity-based claims
        let similarity_claims = self.generate_similarity_claims(neighbors, &context);
        claims.extend(similarity_claims);
        // Generate cluster-based claims
        if cluster_context.has_cluster() {
            let cluster_claims = self.generate_cluster_claims(cluster_context, &context);
            claims.extend(cluster_claims);
        }
        // Generate taxonomy-based claims
        if !context.unique_taxa.is_empty() {
            let taxon_claims = self.generate_taxon_claims(neighbors, &context);
            claims.extend(taxon_claims);
        }
        // Generate sequence-based claims
        if let Some(seq) = sequence_context {
            if seq.has_temporal_context() {
                let sequence_claims = self.generate_sequence_claims(seq, &context);
                claims.extend(sequence_claims);
            }
        }
        // Filter by confidence and limit
        let mut claims: Vec<Claim> = claims
            .into_iter()
            .filter(|c| c.confidence >= self.min_confidence)
            .collect();
        // Sort by confidence (highest first)
        // NOTE(review): partial_cmp().unwrap() panics if any confidence is NaN;
        // Claim::new clamps finite values into [0, 1] but NaN survives clamp.
        claims.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
        // Limit number of claims
        claims.truncate(self.max_claims);
        debug!(
            "Generated {} claims for query {}",
            claims.len(),
            query_id
        );
        Ok(claims)
    }

    /// Generate claims based on neighbor similarity.
    ///
    /// Produces at most two claims: one characterizing overall similarity
    /// (high / moderate / low tiers), and — when the first neighbor lies
    /// within distance 0.2 — one about that strong closest match.
    fn generate_similarity_claims(
        &self,
        neighbors: &[NeighborEvidence],
        context: &EvidenceContext,
    ) -> Vec<Claim> {
        let mut claims = Vec::new();
        if neighbors.is_empty() {
            return claims;
        }
        // Claim about overall similarity
        let similarity = context.scores.avg_similarity;
        if similarity >= 0.7 {
            let statement = self.templates.high_similarity_claim(
                neighbors.len(),
                similarity,
            );
            // Heuristic down-weighting of the raw similarity — TODO confirm calibration.
            let confidence = similarity * 0.9;
            // Cite up to three neighbors as supporting evidence.
            let evidence: Vec<EvidenceRef> = neighbors
                .iter()
                .take(3)
                .map(|n| {
                    EvidenceRef::neighbor(
                        &n.embedding_id,
                        format!(
                            "Similarity: {:.1}% (distance: {:.3})",
                            n.similarity() * 100.0,
                            n.distance
                        ),
                    )
                })
                .collect();
            claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        } else if similarity >= 0.5 {
            let statement = self.templates.moderate_similarity_claim(neighbors.len());
            let confidence = similarity * 0.8;
            let evidence: Vec<EvidenceRef> = neighbors
                .iter()
                .take(2)
                .map(|n| {
                    EvidenceRef::neighbor(
                        &n.embedding_id,
                        format!("Distance: {:.3}", n.distance),
                    )
                })
                .collect();
            claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        } else if neighbors.len() >= 3 {
            // Low similarity is only asserted when enough neighbors exist
            // for the aggregate to be meaningful.
            let statement = self.templates.low_similarity_claim();
            let confidence = 0.4;
            let evidence = vec![EvidenceRef::new(
                EvidenceRefType::Neighbor,
                "aggregate",
                format!(
                    "Average distance: {:.3} across {} neighbors",
                    context.scores.avg_distance,
                    neighbors.len()
                ),
            )];
            claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        }
        // Claim about closest match
        // (assumes `neighbors` arrives sorted by ascending distance — TODO confirm upstream)
        if let Some(closest) = neighbors.first() {
            if closest.distance < 0.2 {
                let taxon_info = closest
                    .recording_metadata
                    .taxon
                    .as_deref()
                    .map(|t| format!(" ({})", t))
                    .unwrap_or_default();
                let statement = format!(
                    "Strong acoustic match found with recording {}{}",
                    closest.recording_metadata.recording_id,
                    taxon_info
                );
                let confidence = (1.0 - closest.distance) * 0.95;
                let evidence = vec![
                    EvidenceRef::neighbor(
                        &closest.embedding_id,
                        format!(
                            "Closest neighbor with distance {:.3} ({:.1}% similarity)",
                            closest.distance,
                            closest.similarity() * 100.0
                        ),
                    ),
                ];
                claims.push(Claim::new(statement, confidence).with_evidence(evidence));
            }
        }
        claims
    }

    /// Generate claims based on cluster assignment.
    fn generate_cluster_claims(
        &self,
        cluster_context: &ClusterContext,
        context: &EvidenceContext,
    ) -> Vec<Claim> {
        let mut claims = Vec::new();
        let cluster_id = match &cluster_context.assigned_cluster {
            Some(id) => id,
            None => return claims,
        };
        let label = cluster_context
            .cluster_label
            .as_deref()
            .unwrap_or("unlabeled cluster");
        // Main cluster assignment claim
        let statement = self.templates.cluster_assignment_claim(
            label,
            cluster_context.confidence,
            cluster_context.exemplar_similarity,
        );
        // Product: both assignment confidence and exemplar similarity must
        // be high for the combined claim confidence to score well.
        let confidence = cluster_context.confidence * cluster_context.exemplar_similarity;
        let evidence = vec![
            EvidenceRef::cluster(
                cluster_id,
                format!(
                    "Assigned to cluster '{}' with {:.1}% confidence, {:.1}% exemplar similarity",
                    label,
                    cluster_context.confidence * 100.0,
                    cluster_context.exemplar_similarity * 100.0
                ),
            ),
        ];
        claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        // Claim about cluster coherence with neighbors
        if context.scores.cluster_coherence > 0.5 {
            let statement = format!(
                "Acoustic features are consistent with {} - {:.0}% of similar recordings belong to the same cluster",
                label,
                context.scores.cluster_coherence * 100.0
            );
            let confidence = context.scores.cluster_coherence * 0.85;
            let evidence = vec![
                EvidenceRef::cluster(
                    cluster_id,
                    format!(
                        "{:.0}% cluster coherence among neighbors",
                        context.scores.cluster_coherence * 100.0
                    ),
                ),
            ];
            claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        }
        claims
    }

    /// Generate claims based on taxonomic information.
    ///
    /// Emits either a dominant-taxon claim (one taxon covers >= 60% of
    /// neighbors) or a mixed-taxa claim when several taxa are present.
    fn generate_taxon_claims(
        &self,
        neighbors: &[NeighborEvidence],
        context: &EvidenceContext,
    ) -> Vec<Claim> {
        let mut claims = Vec::new();
        if context.unique_taxa.is_empty() {
            return claims;
        }
        // Count taxa occurrences, keeping the neighbors as examples to cite.
        let mut taxon_counts: std::collections::HashMap<&str, (usize, Vec<&NeighborEvidence>)> =
            std::collections::HashMap::new();
        for neighbor in neighbors {
            if let Some(taxon) = &neighbor.recording_metadata.taxon {
                let entry = taxon_counts.entry(taxon.as_str()).or_insert((0, Vec::new()));
                entry.0 += 1;
                entry.1.push(neighbor);
            }
        }
        // Find dominant taxon
        if let Some((taxon, (count, examples))) = taxon_counts
            .iter()
            .max_by_key(|(_, (count, _))| count)
        {
            // Proportion is relative to ALL neighbors, including untagged ones.
            let proportion = *count as f32 / neighbors.len() as f32;
            if proportion >= 0.6 {
                let statement = self.templates.dominant_taxon_claim(taxon, proportion);
                let confidence = proportion * context.scores.avg_similarity;
                let evidence: Vec<EvidenceRef> = examples
                    .iter()
                    .take(3)
                    .map(|n| {
                        EvidenceRef::taxon(
                            *taxon,
                            format!(
                                "Recording {} identified as {} (similarity: {:.1}%)",
                                n.recording_metadata.recording_id,
                                taxon,
                                n.similarity() * 100.0
                            ),
                        )
                    })
                    .collect();
                claims.push(Claim::new(statement, confidence).with_evidence(evidence));
            } else if context.unique_taxa.len() > 1 {
                // Multiple taxa present
                let taxa_list = context.unique_taxa.join(", ");
                let statement = format!(
                    "Acoustic features show similarity to multiple taxa: {}. Further analysis recommended.",
                    taxa_list
                );
                let confidence = 0.5;
                let evidence: Vec<EvidenceRef> = context
                    .unique_taxa
                    .iter()
                    .take(3)
                    .map(|t| {
                        let count = taxon_counts.get(t.as_str()).map(|(c, _)| *c).unwrap_or(0);
                        EvidenceRef::taxon(
                            t,
                            format!("{} neighbors identified as {}", count, t),
                        )
                    })
                    .collect();
                claims.push(Claim::new(statement, confidence).with_evidence(evidence));
            }
        }
        claims
    }

    /// Generate claims based on sequence context.
    ///
    /// NOTE(review): the `context` parameter is currently unused here.
    fn generate_sequence_claims(
        &self,
        sequence_context: &SequenceContext,
        context: &EvidenceContext,
    ) -> Vec<Claim> {
        let mut claims = Vec::new();
        // Claim about temporal context
        let preceding = sequence_context.preceding_segments.len();
        let following = sequence_context.following_segments.len();
        if preceding > 0 || following > 0 {
            let statement = self.templates.sequence_context_claim(preceding, following);
            let confidence = 0.7;
            let mut evidence = Vec::new();
            // Cite every surrounding segment with its offset from the query.
            for (i, seg) in sequence_context.preceding_segments.iter().enumerate() {
                evidence.push(EvidenceRef::sequence(
                    seg,
                    format!("Preceding segment {} at position -{}", seg.0, preceding - i),
                ));
            }
            for (i, seg) in sequence_context.following_segments.iter().enumerate() {
                evidence.push(EvidenceRef::sequence(
                    seg,
                    format!("Following segment {} at position +{}", seg.0, i + 1),
                ));
            }
            claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        }
        // Claim about detected motif
        if let Some(motif) = &sequence_context.detected_motif {
            let statement = self.templates.motif_claim(motif);
            let confidence = 0.75;
            let evidence = vec![EvidenceRef::new(
                EvidenceRefType::Sequence,
                "motif-detection",
                format!(
                    "Motif pattern '{}' detected across {} segments",
                    motif,
                    sequence_context.sequence_length()
                ),
            )];
            claims.push(Claim::new(statement, confidence).with_evidence(evidence));
        }
        claims
    }

    /// Generate a claim from manual input with evidence validation.
    ///
    /// # Errors
    ///
    /// Returns `Error::ClaimValidationFailed` when `evidence_refs` is empty.
    ///
    /// A confidence below `min_confidence` is logged but still accepted.
    pub fn create_manual_claim(
        &self,
        statement: &str,
        confidence: f32,
        evidence_refs: Vec<EvidenceRef>,
    ) -> Result<Claim> {
        if evidence_refs.is_empty() {
            return Err(crate::Error::ClaimValidationFailed(
                "Claims must have at least one evidence reference".to_string(),
            ));
        }
        let confidence = confidence.clamp(0.0, 1.0);
        if confidence < self.min_confidence {
            debug!(
                "Claim confidence {} below threshold {}: {}",
                confidence, self.min_confidence, statement
            );
        }
        Ok(Claim::new(statement, confidence).with_evidence(evidence_refs))
    }

    /// Merge multiple claims about the same topic.
    ///
    /// The merged claim takes the statement of the highest-confidence input,
    /// the average confidence of all inputs, and the union of their evidence.
    pub fn merge_claims(&self, claims: &[Claim]) -> Option<Claim> {
        if claims.is_empty() {
            return None;
        }
        if claims.len() == 1 {
            return Some(claims[0].clone());
        }
        // Combine evidence from all claims
        let mut all_evidence: Vec<EvidenceRef> = Vec::new();
        let mut total_confidence = 0.0;
        for claim in claims {
            all_evidence.extend(claim.evidence_refs.clone());
            total_confidence += claim.confidence;
        }
        // Deduplicate evidence by ref_id
        // NOTE(review): dedup keys on ref_id alone, so two refs of different
        // types sharing an id would collapse into one — confirm ids are
        // unique across evidence types.
        let mut seen_ids = std::collections::HashSet::new();
        all_evidence.retain(|e| seen_ids.insert(e.ref_id.clone()));
        let avg_confidence = total_confidence / claims.len() as f32;
        // Use the statement from the highest-confidence claim
        let best_claim = claims
            .iter()
            .max_by(|a, b| a.confidence.partial_cmp(&b.confidence).unwrap())
            .unwrap();
        Some(
            Claim::new(&best_claim.statement, avg_confidence)
                .with_evidence(all_evidence),
        )
    }
}
/// Builder for creating claims with proper evidence citations.
///
/// Prefer `build_validated` when the claim must carry at least one citation.
#[derive(Debug)]
pub struct ClaimBuilder {
    /// Statement text of the claim under construction.
    statement: String,
    /// Confidence; defaults to 0.5 until `confidence` is called.
    confidence: f32,
    /// Accumulated evidence references.
    evidence: Vec<EvidenceRef>,
}
impl ClaimBuilder {
/// Start building a new claim.
pub fn new(statement: impl Into<String>) -> Self {
Self {
statement: statement.into(),
confidence: 0.5,
evidence: Vec::new(),
}
}
/// Set the confidence level.
pub fn confidence(mut self, confidence: f32) -> Self {
self.confidence = confidence.clamp(0.0, 1.0);
self
}
/// Add a neighbor evidence reference.
pub fn cite_neighbor(
mut self,
embedding_id: &EmbeddingId,
description: impl Into<String>,
) -> Self {
self.evidence.push(EvidenceRef::neighbor(embedding_id, description));
self
}
/// Add a cluster evidence reference.
pub fn cite_cluster(
mut self,
cluster_id: &ClusterId,
description: impl Into<String>,
) -> Self {
self.evidence.push(EvidenceRef::cluster(cluster_id, description));
self
}
/// Add a taxon evidence reference.
pub fn cite_taxon(
mut self,
taxon: impl Into<String>,
description: impl Into<String>,
) -> Self {
self.evidence.push(EvidenceRef::taxon(taxon, description));
self
}
/// Add a sequence evidence reference.
pub fn cite_sequence(
mut self,
segment_id: &crate::domain::entities::SegmentId,
description: impl Into<String>,
) -> Self {
self.evidence.push(EvidenceRef::sequence(segment_id, description));
self
}
/// Build the claim.
pub fn build(self) -> Claim {
Claim::new(self.statement, self.confidence).with_evidence(self.evidence)
}
/// Build the claim, requiring at least one evidence reference.
pub fn build_validated(self) -> Result<Claim> {
if self.evidence.is_empty() {
return Err(crate::Error::ClaimValidationFailed(
"Claims must have at least one evidence reference".to_string(),
));
}
Ok(self.build())
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::entities::RecordingMetadata;

    /// Three neighbors at increasing distance: two tagged "Species A" in
    /// cluster c1, one tagged "Species B" in cluster c2.
    fn create_test_neighbors() -> Vec<NeighborEvidence> {
        vec![
            NeighborEvidence::new(
                EmbeddingId::new("n1"),
                0.1,
                RecordingMetadata::new("r1").with_taxon("Species A"),
            ).with_cluster(ClusterId::new("c1")),
            NeighborEvidence::new(
                EmbeddingId::new("n2"),
                0.15,
                RecordingMetadata::new("r2").with_taxon("Species A"),
            ).with_cluster(ClusterId::new("c1")),
            NeighborEvidence::new(
                EmbeddingId::new("n3"),
                0.2,
                RecordingMetadata::new("r3").with_taxon("Species B"),
            ).with_cluster(ClusterId::new("c2")),
        ]
    }

    #[tokio::test]
    async fn test_generate_claims_with_neighbors() {
        let generator = ClaimGenerator::with_params(0.3, 10);
        let neighbors = create_test_neighbors();
        let cluster_context = ClusterContext::empty();
        let query_id = EmbeddingId::new("query-1");
        let claims = generator
            .generate_claims(&query_id, &neighbors, &cluster_context, &None)
            .await
            .unwrap();
        assert!(!claims.is_empty());
        // All claims should have evidence
        for claim in &claims {
            assert!(claim.has_evidence(), "Claim should have evidence: {}", claim.statement);
        }
    }

    #[tokio::test]
    async fn test_generate_claims_with_cluster() {
        let generator = ClaimGenerator::with_params(0.3, 10);
        let neighbors = create_test_neighbors();
        let cluster_context = ClusterContext::new(
            Some(ClusterId::new("c1")),
            0.9,
            0.85,
        ).with_label("Song Type A");
        let query_id = EmbeddingId::new("query-1");
        let claims = generator
            .generate_claims(&query_id, &neighbors, &cluster_context, &None)
            .await
            .unwrap();
        // Should have cluster-related claims
        let cluster_claims: Vec<_> = claims
            .iter()
            .filter(|c| c.statement.contains("cluster") || c.statement.contains("Song Type A"))
            .collect();
        assert!(!cluster_claims.is_empty());
        // Cluster claims should cite the cluster
        for claim in cluster_claims {
            let cluster_refs = claim.evidence_of_type(EvidenceRefType::Cluster);
            assert!(!cluster_refs.is_empty());
        }
    }

    #[tokio::test]
    async fn test_generate_claims_with_sequence() {
        let generator = ClaimGenerator::with_params(0.3, 10);
        let neighbors = create_test_neighbors();
        let cluster_context = ClusterContext::empty();
        // Two preceding segments, one following, plus a detected motif.
        let sequence_context = Some(SequenceContext::new(
            vec![
                crate::domain::entities::SegmentId::new("seg-1"),
                crate::domain::entities::SegmentId::new("seg-2"),
            ],
            vec![
                crate::domain::entities::SegmentId::new("seg-4"),
            ],
        ).with_motif("ABAB"));
        let query_id = EmbeddingId::new("query-1");
        let claims = generator
            .generate_claims(&query_id, &neighbors, &cluster_context, &sequence_context)
            .await
            .unwrap();
        // Should have sequence-related claims
        let sequence_claims: Vec<_> = claims
            .iter()
            .filter(|c| {
                c.statement.contains("sequence")
                    || c.statement.contains("temporal")
                    || c.statement.contains("motif")
                    || c.statement.contains("ABAB")
            })
            .collect();
        assert!(!sequence_claims.is_empty());
    }

    #[test]
    fn test_claim_builder() {
        let claim = ClaimBuilder::new("Test claim statement")
            .confidence(0.85)
            .cite_neighbor(
                &EmbeddingId::new("n1"),
                "Supporting neighbor evidence",
            )
            .cite_cluster(
                &ClusterId::new("c1"),
                "Cluster assignment evidence",
            )
            .build();
        assert_eq!(claim.statement, "Test claim statement");
        assert_eq!(claim.confidence, 0.85);
        assert_eq!(claim.evidence_refs.len(), 2);
    }

    #[test]
    fn test_claim_builder_validated() {
        // Should fail without evidence
        let result = ClaimBuilder::new("Unsupported claim")
            .confidence(0.9)
            .build_validated();
        assert!(result.is_err());
        // Should succeed with evidence
        let result = ClaimBuilder::new("Supported claim")
            .confidence(0.9)
            .cite_taxon("Species A", "Taxon evidence")
            .build_validated();
        assert!(result.is_ok());
    }

    #[test]
    fn test_merge_claims() {
        let generator = ClaimGenerator::with_params(0.3, 10);
        let claim1 = ClaimBuilder::new("Similar acoustic features observed")
            .confidence(0.8)
            .cite_neighbor(&EmbeddingId::new("n1"), "Evidence 1")
            .build();
        let claim2 = ClaimBuilder::new("Similar acoustic features observed")
            .confidence(0.7)
            .cite_neighbor(&EmbeddingId::new("n2"), "Evidence 2")
            .build();
        let merged = generator.merge_claims(&[claim1, claim2]);
        assert!(merged.is_some());
        let merged = merged.unwrap();
        // Distinct ref_ids are both kept; confidence is the mean (0.75).
        assert_eq!(merged.evidence_refs.len(), 2);
        assert_eq!(merged.confidence, 0.75);
    }

    #[test]
    fn test_create_manual_claim() {
        let generator = ClaimGenerator::with_params(0.5, 10);
        // Should fail without evidence
        let result = generator.create_manual_claim(
            "Test claim",
            0.8,
            Vec::new(),
        );
        assert!(result.is_err());
        // Should succeed with evidence
        let result = generator.create_manual_claim(
            "Test claim",
            0.8,
            vec![EvidenceRef::taxon("Species A", "Manual evidence")],
        );
        assert!(result.is_ok());
    }
}

View File

@@ -0,0 +1,565 @@
//! Evidence builder for constructing RAB evidence packs.
//!
//! This module provides utilities for collecting and organizing evidence
//! from various sources (neighbors, clusters, sequences) into structured
//! evidence packs.
use tracing::{debug, instrument};
use crate::application::services::InterpretationConfig;
use crate::domain::entities::{
ClusterContext, ClusterId, EmbeddingId, NeighborEvidence, RecordingMetadata,
SegmentId, SequenceContext,
};
use crate::Result;
/// Builder for constructing evidence from various sources.
///
/// The `EvidenceBuilder` provides a structured way to collect and organize
/// evidence for RAB interpretations. The limits configured here (neighbor
/// cap, distance window, sequence window) are applied when evidence is
/// collected, not at construction time.
#[derive(Debug, Clone)]
pub struct EvidenceBuilder {
    /// Maximum number of neighbors to include in an evidence pack
    max_neighbors: usize,
    /// Whether to propagate spectrogram URLs onto neighbor evidence
    include_spectrograms: bool,
    /// Whether to build sequence context at all (None is returned otherwise)
    include_sequences: bool,
    /// Maximum preceding/following segments kept in sequence context
    sequence_window: usize,
    /// Minimum distance threshold for neighbor inclusion (constructors set 0.0)
    min_distance_threshold: f32,
    /// Maximum distance threshold for neighbor inclusion (constructors set 1.0)
    max_distance_threshold: f32,
}
impl EvidenceBuilder {
    /// Build an `EvidenceBuilder` whose limits mirror the supplied
    /// interpretation configuration; the distance window starts at the full
    /// `[0.0, 1.0]` range.
    pub fn new(config: &InterpretationConfig) -> Self {
        Self {
            max_neighbors: config.max_neighbors,
            include_spectrograms: config.include_spectrograms,
            include_sequences: config.include_sequence_context,
            sequence_window: config.sequence_context_window,
            min_distance_threshold: 0.0,
            max_distance_threshold: 1.0,
        }
    }

    /// Builder preset: 10 neighbors, spectrograms and sequences enabled,
    /// sequence window of 3, full `[0.0, 1.0]` distance range.
    pub fn default_builder() -> Self {
        Self {
            max_neighbors: 10,
            include_spectrograms: true,
            include_sequences: true,
            sequence_window: 3,
            min_distance_threshold: 0.0,
            max_distance_threshold: 1.0,
        }
    }

    /// Override the neighbor cap (builder style).
    pub fn with_max_neighbors(mut self, n: usize) -> Self {
        self.max_neighbors = n;
        self
    }

    /// Toggle propagation of spectrogram URLs (builder style).
    pub fn with_spectrograms(mut self, include: bool) -> Self {
        self.include_spectrograms = include;
        self
    }

    /// Restrict the accepted neighbor distance window to `[min, max]`.
    pub fn with_distance_threshold(mut self, min: f32, max: f32) -> Self {
        self.min_distance_threshold = min;
        self.max_distance_threshold = max;
        self
    }

    /// Current neighbor cap.
    pub fn max_neighbors(&self) -> usize {
        self.max_neighbors
    }

    /// Whether spectrogram URLs will be copied onto evidence.
    pub fn spectrograms_enabled(&self) -> bool {
        self.include_spectrograms
    }

    /// Convert raw neighbor hits into structured `NeighborEvidence`.
    ///
    /// Neighbors outside the configured distance window are dropped, and at
    /// most `max_neighbors` of the remaining entries (in input order) are
    /// converted.
    #[instrument(skip(self, neighbors))]
    pub async fn collect_neighbor_evidence(
        &self,
        neighbors: &[RawNeighbor],
    ) -> Result<Vec<NeighborEvidence>> {
        let window = self.min_distance_threshold..=self.max_distance_threshold;
        let selected: Vec<&RawNeighbor> = neighbors
            .iter()
            .filter(|n| window.contains(&n.distance))
            .take(self.max_neighbors)
            .collect();
        debug!(
            "Collecting evidence from {} neighbors (filtered from {})",
            selected.len(),
            neighbors.len()
        );
        Ok(selected
            .into_iter()
            .map(|n| self.build_neighbor_evidence(n))
            .collect())
    }

    /// Turn one raw neighbor into evidence, synthesizing placeholder
    /// metadata from the embedding id when none was supplied.
    fn build_neighbor_evidence(&self, raw: &RawNeighbor) -> NeighborEvidence {
        let metadata = match raw.metadata.clone() {
            Some(m) => m,
            None => RecordingMetadata::new(&raw.embedding_id.0),
        };
        let mut evidence =
            NeighborEvidence::new(raw.embedding_id.clone(), raw.distance, metadata);
        if let Some(cluster) = &raw.cluster_id {
            evidence = evidence.with_cluster(cluster.clone());
        }
        // Spectrogram URLs are only carried over when the builder allows them.
        if self.include_spectrograms {
            if let Some(url) = &raw.spectrogram_url {
                evidence = evidence.with_spectrogram(url.clone());
            }
        }
        evidence
    }

    /// Assemble a `ClusterContext` from raw cluster-assignment data.
    #[instrument(skip(self))]
    pub async fn build_cluster_context(
        &self,
        cluster_id: Option<ClusterId>,
        label: Option<String>,
        confidence: f32,
        exemplar_similarity: f32,
    ) -> Result<ClusterContext> {
        let context = ClusterContext {
            assigned_cluster: cluster_id,
            cluster_label: label,
            confidence,
            exemplar_similarity,
        };
        debug!(
            "Built cluster context: assigned={}, confidence={}",
            context.has_cluster(),
            context.confidence
        );
        Ok(context)
    }

    /// Assemble a `SequenceContext` from temporal data.
    ///
    /// Returns `Ok(None)` when sequence context is disabled or when both
    /// segment lists are empty; otherwise each side is truncated to the
    /// configured window.
    #[instrument(skip(self))]
    pub async fn build_sequence_context(
        &self,
        preceding: Vec<SegmentId>,
        following: Vec<SegmentId>,
        motif: Option<String>,
    ) -> Result<Option<SequenceContext>> {
        if !self.include_sequences || (preceding.is_empty() && following.is_empty()) {
            return Ok(None);
        }
        let mut preceding = preceding;
        preceding.truncate(self.sequence_window);
        let mut following = following;
        following.truncate(self.sequence_window);
        let context = SequenceContext {
            preceding_segments: preceding,
            following_segments: following,
            detected_motif: motif,
        };
        debug!(
            "Built sequence context: {} preceding, {} following, motif={}",
            context.preceding_segments.len(),
            context.following_segments.len(),
            context.detected_motif.as_deref().unwrap_or("none")
        );
        Ok(Some(context))
    }

    /// Aggregate neighbor evidence into summary scores.
    ///
    /// Returns all-zero defaults for an empty slice. Coherence scores are the
    /// share of neighbors belonging to the most common cluster / taxon.
    pub fn aggregate_evidence_scores(&self, neighbors: &[NeighborEvidence]) -> EvidenceScores {
        if neighbors.is_empty() {
            return EvidenceScores::default();
        }
        let count = neighbors.len();

        // Single pass over distances for sum / min / max.
        let (mut total, mut lo, mut hi) = (0.0_f32, f32::INFINITY, f32::NEG_INFINITY);
        for n in neighbors {
            total += n.distance;
            lo = lo.min(n.distance);
            hi = hi.max(n.distance);
        }
        let avg_distance = total / count as f32;
        // Similarity is the complement of distance, bounded to [0, 1].
        let avg_similarity = (1.0 - avg_distance).max(0.0).min(1.0);

        // Cluster coherence: fraction of all neighbors in the dominant cluster.
        let any_clustered = neighbors.iter().any(|n| n.cluster_id.is_some());
        let cluster_coherence = if any_clustered {
            let mut counts = std::collections::HashMap::new();
            for n in neighbors {
                if let Some(cid) = &n.cluster_id {
                    *counts.entry(cid.0.clone()).or_insert(0) += 1;
                }
            }
            let dominant = counts.values().cloned().max().unwrap_or(0);
            dominant as f32 / count as f32
        } else {
            0.0
        };

        // Taxon coherence: fraction of *labelled* neighbors sharing the
        // dominant taxon (denominator is labelled neighbors, not all).
        let taxa: Vec<&str> = neighbors
            .iter()
            .filter_map(|n| n.recording_metadata.taxon.as_deref())
            .collect();
        let taxon_coherence = if taxa.is_empty() {
            0.0
        } else {
            let mut counts = std::collections::HashMap::new();
            for taxon in &taxa {
                *counts.entry(*taxon).or_insert(0) += 1;
            }
            let dominant = counts.values().cloned().max().unwrap_or(0);
            dominant as f32 / taxa.len() as f32
        };

        EvidenceScores {
            neighbor_count: count,
            avg_distance,
            min_distance: lo,
            max_distance: hi,
            avg_similarity,
            cluster_coherence,
            taxon_coherence,
        }
    }
}
/// Raw neighbor data before processing.
///
/// Typically produced by a similarity search and consumed by
/// `EvidenceBuilder::collect_neighbor_evidence`, which filters by distance
/// and converts each surviving entry into a `NeighborEvidence`.
#[derive(Debug, Clone)]
pub struct RawNeighbor {
    /// Embedding ID of the neighbor
    pub embedding_id: EmbeddingId,
    /// Distance from query (checked against the builder's distance window;
    /// similarity elsewhere is computed as `1.0 - distance`)
    pub distance: f32,
    /// Optional cluster assignment
    pub cluster_id: Option<ClusterId>,
    /// Optional recording metadata; when absent, placeholder metadata is
    /// derived from the embedding id during evidence building
    pub metadata: Option<RecordingMetadata>,
    /// Optional spectrogram URL (only propagated when spectrograms are enabled)
    pub spectrogram_url: Option<String>,
}
impl RawNeighbor {
    /// Construct a raw neighbor carrying only the required identity and
    /// distance; all optional attributes start out unset.
    pub fn new(embedding_id: EmbeddingId, distance: f32) -> Self {
        Self {
            embedding_id,
            distance,
            cluster_id: None,
            metadata: None,
            spectrogram_url: None,
        }
    }

    /// Attach a cluster assignment (builder style).
    pub fn with_cluster(self, cluster_id: ClusterId) -> Self {
        Self {
            cluster_id: Some(cluster_id),
            ..self
        }
    }

    /// Attach recording metadata (builder style).
    pub fn with_metadata(self, metadata: RecordingMetadata) -> Self {
        Self {
            metadata: Some(metadata),
            ..self
        }
    }

    /// Attach a spectrogram URL (builder style).
    pub fn with_spectrogram(self, url: String) -> Self {
        Self {
            spectrogram_url: Some(url),
            ..self
        }
    }
}
/// Aggregated scores from evidence analysis.
///
/// Produced by `EvidenceBuilder::aggregate_evidence_scores`; similarity and
/// coherence fields are ratios in `[0, 1]`.
#[derive(Debug, Clone, Default)]
pub struct EvidenceScores {
    /// How many neighbors contributed to these scores
    pub neighbor_count: usize,
    /// Mean distance across all neighbors
    pub avg_distance: f32,
    /// Smallest distance observed (closest neighbor)
    pub min_distance: f32,
    /// Largest distance observed (farthest neighbor)
    pub max_distance: f32,
    /// Mean similarity, computed as `1.0 - avg_distance` clamped to [0, 1]
    pub avg_similarity: f32,
    /// Share of neighbors belonging to the dominant cluster (0-1)
    pub cluster_coherence: f32,
    /// Share of taxon-labelled neighbors sharing the dominant taxon (0-1)
    pub taxon_coherence: f32,
}

impl EvidenceScores {
    /// Weighted blend of similarity (40%), cluster coherence (30%) and taxon
    /// coherence (30%); always zero when no neighbors contributed.
    pub fn overall_strength(&self) -> f32 {
        match self.neighbor_count {
            0 => 0.0,
            _ => {
                self.avg_similarity * 0.4
                    + self.cluster_coherence * 0.3
                    + self.taxon_coherence * 0.3
            }
        }
    }

    /// True when there are at least 3 neighbors and the blended strength
    /// reaches 0.6 — suitable for high-confidence claims.
    pub fn is_strong(&self) -> bool {
        self.neighbor_count >= 3 && self.overall_strength() >= 0.6
    }

    /// True when evidence is sparse (fewer than 2 neighbors) or weak
    /// (strength below 0.3); claims should be phrased cautiously.
    pub fn is_weak(&self) -> bool {
        self.neighbor_count < 2 || self.overall_strength() < 0.3
    }
}
/// Evidence aggregation context for building interpretations.
///
/// A digest of collected evidence: aggregate scores plus the distinct taxa
/// and cluster labels seen, and any temporal information available.
#[derive(Debug)]
pub struct EvidenceContext {
    /// Aggregate scores computed from the neighbor evidence
    pub scores: EvidenceScores,
    /// Distinct taxa appearing in neighbor metadata (deduplicated via a set,
    /// so ordering is unspecified)
    pub unique_taxa: Vec<String>,
    /// Distinct cluster labels (currently only the assigned cluster's label)
    pub unique_clusters: Vec<String>,
    /// Whether temporal sequence information is present
    pub has_sequence: bool,
    /// Detected motif, when the sequence context carries one
    pub motif: Option<String>,
}

impl EvidenceContext {
    /// Derive an aggregation context from already-collected evidence pieces.
    pub fn from_evidence(
        neighbors: &[NeighborEvidence],
        cluster_context: &ClusterContext,
        sequence_context: &Option<SequenceContext>,
    ) -> Self {
        let scores = EvidenceBuilder::default_builder().aggregate_evidence_scores(neighbors);

        // Deduplicate taxa through a set; iteration order is unspecified.
        let taxon_set: std::collections::HashSet<String> = neighbors
            .iter()
            .filter_map(|n| n.recording_metadata.taxon.clone())
            .collect();
        let unique_taxa: Vec<String> = taxon_set.into_iter().collect();

        // Only the assigned cluster's label is recorded (0 or 1 entries).
        let unique_clusters: Vec<String> =
            cluster_context.cluster_label.iter().cloned().collect();

        let (has_sequence, motif) = match sequence_context {
            Some(seq) => (seq.has_temporal_context(), seq.detected_motif.clone()),
            None => (false, None),
        };

        Self {
            scores,
            unique_taxa,
            unique_clusters,
            has_sequence,
            motif,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// All in-range neighbors survive collection and keep their metadata.
    #[tokio::test]
    async fn test_evidence_builder_collect_neighbors() {
        let collector = EvidenceBuilder::default_builder()
            .with_max_neighbors(5)
            .with_spectrograms(true);
        let raw = vec![
            RawNeighbor::new(EmbeddingId::new("n1"), 0.1)
                .with_metadata(RecordingMetadata::new("r1").with_taxon("Species A")),
            RawNeighbor::new(EmbeddingId::new("n2"), 0.2)
                .with_metadata(RecordingMetadata::new("r2").with_taxon("Species A")),
            RawNeighbor::new(EmbeddingId::new("n3"), 0.3).with_cluster(ClusterId::new("c1")),
        ];
        let collected = collector.collect_neighbor_evidence(&raw).await.unwrap();
        assert_eq!(collected.len(), 3);
        assert_eq!(collected[0].embedding_id.as_str(), "n1");
        assert_eq!(
            collected[0].recording_metadata.taxon,
            Some("Species A".to_string())
        );
        assert!(collected[2].cluster_id.is_some());
    }

    /// Neighbors outside the configured distance window are dropped.
    #[tokio::test]
    async fn test_evidence_builder_distance_filtering() {
        let collector = EvidenceBuilder::default_builder().with_distance_threshold(0.0, 0.5);
        let raw = vec![
            RawNeighbor::new(EmbeddingId::new("close"), 0.2),
            RawNeighbor::new(EmbeddingId::new("far"), 0.8),
        ];
        let kept = collector.collect_neighbor_evidence(&raw).await.unwrap();
        assert_eq!(kept.len(), 1);
        assert_eq!(kept[0].embedding_id.as_str(), "close");
    }

    /// Aggregate scores reflect distance statistics and coherence.
    #[test]
    fn test_evidence_scores_calculation() {
        let evidence = vec![
            NeighborEvidence::new(
                EmbeddingId::new("n1"),
                0.1,
                RecordingMetadata::new("r1").with_taxon("Species A"),
            )
            .with_cluster(ClusterId::new("c1")),
            NeighborEvidence::new(
                EmbeddingId::new("n2"),
                0.2,
                RecordingMetadata::new("r2").with_taxon("Species A"),
            )
            .with_cluster(ClusterId::new("c1")),
            NeighborEvidence::new(
                EmbeddingId::new("n3"),
                0.3,
                RecordingMetadata::new("r3").with_taxon("Species B"),
            )
            .with_cluster(ClusterId::new("c2")),
        ];
        let scores = EvidenceBuilder::default_builder().aggregate_evidence_scores(&evidence);
        assert_eq!(scores.neighbor_count, 3);
        assert!((scores.avg_distance - 0.2).abs() < 0.001);
        assert!((scores.min_distance - 0.1).abs() < 0.001);
        assert!((scores.max_distance - 0.3).abs() < 0.001);
        assert!(scores.cluster_coherence > 0.0);
        assert!(scores.taxon_coherence > 0.0);
    }

    /// The digest context picks up taxa, labels, sequence flag and motif.
    #[test]
    fn test_evidence_context_from_evidence() {
        let evidence = vec![
            NeighborEvidence::new(
                EmbeddingId::new("n1"),
                0.1,
                RecordingMetadata::new("r1").with_taxon("Species A"),
            ),
            NeighborEvidence::new(
                EmbeddingId::new("n2"),
                0.2,
                RecordingMetadata::new("r2").with_taxon("Species B"),
            ),
        ];
        let cluster = ClusterContext::new(Some(ClusterId::new("c1")), 0.9, 0.85)
            .with_label("Song Type A");
        let sequence = Some(
            SequenceContext::new(vec![SegmentId::new("s1")], vec![SegmentId::new("s3")])
                .with_motif("ABAB"),
        );
        let context = EvidenceContext::from_evidence(&evidence, &cluster, &sequence);
        assert_eq!(context.unique_taxa.len(), 2);
        assert_eq!(context.unique_clusters.len(), 1);
        assert!(context.has_sequence);
        assert_eq!(context.motif, Some("ABAB".to_string()));
    }

    /// Sequence building truncates to the window and keeps the motif.
    #[tokio::test]
    async fn test_build_sequence_context() {
        let result = EvidenceBuilder::default_builder()
            .build_sequence_context(
                vec![SegmentId::new("s1"), SegmentId::new("s2")],
                vec![SegmentId::new("s4")],
                Some("AABB".to_string()),
            )
            .await
            .unwrap();
        let ctx = result.expect("sequence context should be produced");
        assert_eq!(ctx.preceding_segments.len(), 2);
        assert_eq!(ctx.following_segments.len(), 1);
        assert_eq!(ctx.detected_motif, Some("AABB".to_string()));
    }
}

View File

@@ -0,0 +1,6 @@
//! Infrastructure layer for the Interpretation bounded context.
//!
//! Contains implementations for evidence building and claim generation.
pub mod evidence_builder;
pub mod claim_generator;

View File

@@ -0,0 +1,33 @@
//! # sevensense-interpretation
//!
//! LLM-powered interpretation for the 7sense bioacoustics platform.
//!
//! This crate provides:
//! - Natural language report generation
//! - Conservation insights
//! - Anomaly explanation
//! - Multi-language support
//!
//! ## Architecture
//!
//! ```text
//! sevensense-interpretation
//! ├── reports/ # Report generation
//! ├── insights/ # Conservation insights
//! ├── prompts/ # Prompt templates
//! └── providers/ # LLM provider integrations
//! ```
#![warn(missing_docs)]
#![warn(clippy::all)]
#![warn(clippy::pedantic)]
#![allow(clippy::module_name_repetitions)]
// TODO: Implement interpretation modules
// - reports: Structured report generation
// - insights: Ecological pattern detection
// - prompts: Template management
// - providers: Claude, GPT-4, local models
/// Crate version string, populated from `Cargo.toml`'s `version` field at
/// compile time via the `CARGO_PKG_VERSION` environment variable.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

View File

@@ -0,0 +1,654 @@
//! Interpretation text templates.
//!
//! This module provides templates for generating human-readable
//! interpretations and claims.
use std::collections::HashMap;
/// Templates for generating interpretation text.
///
/// All generated wording lives here so it can be adjusted in one place;
/// selected templates (currently `low_similarity`) can be overridden per
/// instance via [`InterpretationTemplates::with_template`].
#[derive(Debug, Clone)]
pub struct InterpretationTemplates {
    /// Custom template overrides, keyed by template name
    custom_templates: HashMap<String, String>,
}

impl Default for InterpretationTemplates {
    fn default() -> Self {
        Self::new()
    }
}

impl InterpretationTemplates {
    /// Create a template set with no overrides installed.
    pub fn new() -> Self {
        Self {
            custom_templates: HashMap::new(),
        }
    }

    /// Register a custom override for the template identified by `key`.
    pub fn with_template(mut self, key: &str, template: &str) -> Self {
        self.custom_templates
            .insert(key.to_owned(), template.to_owned());
        self
    }

    /// Look up an override for `key`, falling back to `default`.
    fn get_template(&self, key: &str, default: &str) -> String {
        match self.custom_templates.get(key) {
            Some(custom) => custom.clone(),
            None => default.to_string(),
        }
    }

    // === Structural Description Templates ===

    /// Describe how similar the signal is to its reference neighbors.
    pub fn neighbor_description(&self, count: usize, similarity: f32) -> String {
        let level = match similarity {
            s if s >= 0.8 => "high",
            s if s >= 0.6 => "moderate",
            s if s >= 0.4 => "low",
            _ => "minimal",
        };
        format!(
            "Acoustic signal shows {} similarity ({:.1}%) to {} reference recordings in the database.",
            level,
            similarity * 100.0,
            count
        )
    }

    /// Describe the taxa found among reference recordings (empty input
    /// yields an empty string; duplicates are removed preserving order).
    pub fn taxon_description(&self, taxa: &[&str]) -> String {
        if taxa.is_empty() {
            return String::new();
        }
        let mut seen = std::collections::HashSet::new();
        let unique: Vec<&str> = taxa.iter().copied().filter(|t| seen.insert(*t)).collect();
        match unique.as_slice() {
            [only] => format!(
                "Reference recordings are primarily associated with {}.",
                only
            ),
            many => format!(
                "Reference recordings span multiple taxa: {}.",
                many.join(", ")
            ),
        }
    }

    /// Describe a cluster assignment with a verbal confidence level.
    pub fn cluster_description(
        &self,
        label: &str,
        confidence: f32,
        exemplar_similarity: f32,
    ) -> String {
        let level = match confidence {
            c if c >= 0.9 => "very high",
            c if c >= 0.7 => "high",
            c if c >= 0.5 => "moderate",
            _ => "low",
        };
        format!(
            "Cluster analysis places this vocalization in '{}' with {} confidence ({:.1}%) and {:.1}% similarity to the cluster exemplar.",
            label,
            level,
            confidence * 100.0,
            exemplar_similarity * 100.0
        )
    }

    /// Describe the segment's place in a temporal sequence, mentioning a
    /// motif only when one was detected.
    pub fn sequence_description(&self, sequence_length: usize, motif: Option<&str>) -> String {
        let base = format!(
            "Temporal analysis reveals this segment is part of a {} vocalization sequence.",
            sequence_length
        );
        match motif {
            Some(m) => format!("{} A recurring motif pattern '{}' has been detected.", base, m),
            None => base,
        }
    }

    // === Claim Templates ===

    /// Claim wording for strong neighbor similarity.
    pub fn high_similarity_claim(&self, count: usize, similarity: f32) -> String {
        format!(
            "Strong acoustic similarity ({:.1}%) to {} database recordings suggests a reliable identification.",
            similarity * 100.0,
            count
        )
    }

    /// Claim wording for moderate neighbor similarity.
    pub fn moderate_similarity_claim(&self, count: usize) -> String {
        format!(
            "Moderate acoustic similarity to {} reference recordings found. Additional context recommended for confident identification.",
            count
        )
    }

    /// Claim wording for weak similarity; honors the "low_similarity"
    /// template override.
    pub fn low_similarity_claim(&self) -> String {
        self.get_template(
            "low_similarity",
            "Limited similarity to existing reference recordings. This may represent an unusual vocalization variant or a novel recording."
        )
    }

    /// Claim wording for a cluster assignment.
    pub fn cluster_assignment_claim(
        &self,
        label: &str,
        confidence: f32,
        exemplar_similarity: f32,
    ) -> String {
        format!(
            "This vocalization is classified as '{}' based on acoustic clustering (confidence: {:.1}%, exemplar similarity: {:.1}%).",
            label,
            confidence * 100.0,
            exemplar_similarity * 100.0
        )
    }

    /// Claim wording for a dominant taxon among the neighbors.
    pub fn dominant_taxon_claim(&self, taxon: &str, proportion: f32) -> String {
        format!(
            "Acoustic features strongly suggest {} ({:.0}% of similar recordings in the database belong to this taxon).",
            taxon,
            proportion * 100.0
        )
    }

    /// Claim wording for available temporal sequence context.
    pub fn sequence_context_claim(&self, preceding: usize, following: usize) -> String {
        format!(
            "This vocalization appears within a temporal sequence with {} preceding and {} following segments, providing additional context for interpretation.",
            preceding,
            following
        )
    }

    /// Claim wording for a detected motif.
    pub fn motif_claim(&self, motif: &str) -> String {
        format!(
            "A repeating acoustic motif '{}' has been detected in the vocalization sequence, suggesting a structured call pattern.",
            motif
        )
    }

    // === Evidence Description Templates ===

    /// One-line description of a neighbor, converting distance to a
    /// similarity percentage floored at zero.
    pub fn neighbor_evidence_description(
        &self,
        recording_id: &str,
        distance: f32,
        taxon: Option<&str>,
    ) -> String {
        let similarity = ((1.0 - distance) * 100.0).max(0.0);
        match taxon {
            Some(t) => format!(
                "Recording {} ({}) with {:.1}% acoustic similarity",
                recording_id, t, similarity
            ),
            None => format!(
                "Recording {} with {:.1}% acoustic similarity",
                recording_id, similarity
            ),
        }
    }

    /// One-line description of a cluster assignment.
    pub fn cluster_evidence_description(&self, label: &str, confidence: f32) -> String {
        format!(
            "Assigned to cluster '{}' with {:.1}% confidence",
            label,
            confidence * 100.0
        )
    }

    /// One-line description of a sequence segment's position relative to
    /// the target (negative = before, positive = after, zero = target).
    pub fn sequence_evidence_description(&self, segment_id: &str, position: i32) -> String {
        let position_desc = match position.cmp(&0) {
            std::cmp::Ordering::Less => format!("position {} before target", -position),
            std::cmp::Ordering::Greater => format!("position {} after target", position),
            std::cmp::Ordering::Equal => "target position".to_string(),
        };
        format!("Segment {} at {}", segment_id, position_desc)
    }

    // === Summary Templates ===

    /// Compose an overall summary sentence from the main evidence signals.
    pub fn generate_summary(
        &self,
        neighbor_count: usize,
        avg_similarity: f32,
        cluster_label: Option<&str>,
        dominant_taxon: Option<&str>,
        confidence: f32,
    ) -> String {
        // Similarity clause is always present; cluster and taxon clauses are
        // appended only when available.
        let similarity_phrase = if avg_similarity >= 0.8 {
            format!(
                "highly similar ({:.1}%) to {} reference recordings",
                avg_similarity * 100.0,
                neighbor_count
            )
        } else if avg_similarity >= 0.5 {
            format!(
                "moderately similar ({:.1}%) to {} reference recordings",
                avg_similarity * 100.0,
                neighbor_count
            )
        } else {
            format!(
                "shows limited similarity ({:.1}%) to {} reference recordings",
                avg_similarity * 100.0,
                neighbor_count
            )
        };
        let mut parts = vec![similarity_phrase];
        if let Some(label) = cluster_label {
            parts.push(format!("classified in cluster '{}'", label));
        }
        if let Some(taxon) = dominant_taxon {
            parts.push(format!("likely associated with {}", taxon));
        }
        let qualifier = match confidence {
            c if c >= 0.8 => "High confidence interpretation",
            c if c >= 0.5 => "Moderate confidence interpretation",
            _ => "Low confidence interpretation",
        };
        format!(
            "{}. This vocalization is {}. Overall confidence: {:.1}%.",
            qualifier,
            parts.join(", "),
            confidence * 100.0
        )
    }

    /// Map a numeric confidence onto a fixed explanatory sentence.
    pub fn confidence_explanation(&self, confidence: f32) -> String {
        let text = if confidence >= 0.9 {
            "Very high confidence: Strong evidence from multiple sources supports this interpretation."
        } else if confidence >= 0.7 {
            "High confidence: Good evidence supports this interpretation with minor uncertainty."
        } else if confidence >= 0.5 {
            "Moderate confidence: Evidence partially supports this interpretation. Additional verification recommended."
        } else if confidence >= 0.3 {
            "Low confidence: Limited evidence available. Interpretation should be considered tentative."
        } else {
            "Very low confidence: Insufficient evidence for reliable interpretation. Expert review recommended."
        };
        text.to_string()
    }
}
/// Formatter for evidence pack output.
///
/// Renders an `EvidencePack` as a markdown-style report, a one-line compact
/// summary, or a JSON value, sourcing all wording from
/// `InterpretationTemplates`.
#[derive(Debug)]
pub struct EvidencePackFormatter {
    /// Templates used for all generated wording
    templates: InterpretationTemplates,
    /// When true, `format_report` appends a per-neighbor evidence section
    include_details: bool,
    /// Cap on how many neighbors are listed in the detailed evidence section
    max_evidence_items: usize,
}
impl Default for EvidencePackFormatter {
    /// Equivalent to `EvidencePackFormatter::new()`.
    fn default() -> Self {
        Self::new()
    }
}
impl EvidencePackFormatter {
    /// Create a formatter with details enabled and at most 5 evidence items.
    pub fn new() -> Self {
        Self {
            templates: InterpretationTemplates::new(),
            include_details: true,
            max_evidence_items: 5,
        }
    }
    /// Set whether to include detailed evidence.
    pub fn with_details(mut self, include: bool) -> Self {
        self.include_details = include;
        self
    }
    /// Set maximum evidence items to show.
    pub fn with_max_evidence(mut self, max: usize) -> Self {
        self.max_evidence_items = max;
        self
    }
    /// Format an evidence pack as a structured report.
    ///
    /// Sections in order: header, summary, structural analysis, claims
    /// (omitted when empty), per-neighbor evidence details (only when
    /// `include_details` is set and neighbors exist), and a confidence
    /// assessment.
    pub fn format_report(&self, pack: &crate::domain::entities::EvidencePack) -> String {
        let mut sections = Vec::new();
        // Header
        sections.push(format!(
            "# Evidence Pack Report\n\nID: {}\nQuery: {}\nCreated: {}",
            pack.id,
            pack.query_embedding_id,
            pack.created_at.format("%Y-%m-%d %H:%M:%S UTC")
        ));
        // Summary
        // NOTE(review): `overall_confidence()` is passed as the
        // `avg_similarity` argument of `generate_summary`, and the *first*
        // neighbor's taxon stands in for the dominant taxon — confirm both
        // proxies are intended.
        sections.push(format!(
            "\n## Summary\n\n{}",
            self.templates.generate_summary(
                pack.neighbors.len(),
                pack.overall_confidence(),
                pack.cluster_context.cluster_label.as_deref(),
                pack.neighbors.first().and_then(|n| n.recording_metadata.taxon.as_deref()),
                pack.interpretation.confidence,
            )
        ));
        // Structural description
        sections.push(format!(
            "\n## Structural Analysis\n\n{}",
            pack.interpretation.structural_description
        ));
        // Claims: one bullet per claim with confidence and citation count.
        if !pack.interpretation.claims.is_empty() {
            let claims_text: Vec<String> = pack
                .interpretation
                .claims
                .iter()
                .map(|c| {
                    let evidence_count = c.evidence_refs.len();
                    // Pluralize "reference" based on the citation count.
                    format!(
                        "- {} (confidence: {:.1}%, {} evidence reference{})",
                        c.statement,
                        c.confidence * 100.0,
                        evidence_count,
                        if evidence_count == 1 { "" } else { "s" }
                    )
                })
                .collect();
            sections.push(format!(
                "\n## Claims\n\n{}",
                claims_text.join("\n")
            ));
        }
        // Detailed evidence (if enabled), capped at max_evidence_items.
        if self.include_details && !pack.neighbors.is_empty() {
            let evidence_text: Vec<String> = pack
                .neighbors
                .iter()
                .take(self.max_evidence_items)
                .map(|n| {
                    self.templates.neighbor_evidence_description(
                        &n.recording_metadata.recording_id,
                        n.distance,
                        n.recording_metadata.taxon.as_deref(),
                    )
                })
                .collect();
            // Truncation notice when more neighbors exist than are shown.
            let more_text = if pack.neighbors.len() > self.max_evidence_items {
                format!(
                    "\n... and {} more neighbors",
                    pack.neighbors.len() - self.max_evidence_items
                )
            } else {
                String::new()
            };
            sections.push(format!(
                "\n## Evidence Details\n\n### Neighbors\n{}\n{}",
                evidence_text.join("\n"),
                more_text
            ));
        }
        // Confidence explanation keyed off the interpretation's confidence.
        sections.push(format!(
            "\n## Confidence Assessment\n\n{}",
            self.templates.confidence_explanation(pack.interpretation.confidence)
        ));
        sections.join("\n")
    }
    /// Format a compact single-line summary.
    ///
    /// Falls back to "unknown" when the first neighbor carries no taxon and
    /// to "unassigned" when no cluster label is present.
    pub fn format_compact(&self, pack: &crate::domain::entities::EvidencePack) -> String {
        let taxon = pack
            .neighbors
            .first()
            .and_then(|n| n.recording_metadata.taxon.as_deref())
            .unwrap_or("unknown");
        let cluster = pack
            .cluster_context
            .cluster_label
            .as_deref()
            .unwrap_or("unassigned");
        format!(
            "[{}] {} neighbors, cluster='{}', taxon='{}', confidence={:.1}%",
            pack.id,
            pack.neighbors.len(),
            cluster,
            taxon,
            pack.overall_confidence() * 100.0
        )
    }
    /// Format as JSON-compatible structure.
    ///
    /// Claims are reduced to statement/confidence/evidence_count; the full
    /// evidence references themselves are not serialized here.
    pub fn format_json(&self, pack: &crate::domain::entities::EvidencePack) -> serde_json::Value {
        serde_json::json!({
            "id": pack.id,
            "query_embedding_id": pack.query_embedding_id.0,
            "created_at": pack.created_at.to_rfc3339(),
            "summary": {
                "neighbor_count": pack.neighbors.len(),
                "overall_confidence": pack.overall_confidence(),
                "cluster_assigned": pack.cluster_context.has_cluster(),
                "has_sequence_context": pack.sequence_context.is_some(),
            },
            "interpretation": {
                "structural_description": pack.interpretation.structural_description,
                "claim_count": pack.interpretation.claims.len(),
                "confidence": pack.interpretation.confidence,
            },
            "claims": pack.interpretation.claims.iter().map(|c| {
                serde_json::json!({
                    "statement": c.statement,
                    "confidence": c.confidence,
                    "evidence_count": c.evidence_refs.len(),
                })
            }).collect::<Vec<_>>(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_neighbor_description() {
        let templates = InterpretationTemplates::new();
        let high = templates.neighbor_description(5, 0.85);
        assert!(high.contains("high similarity"));
        assert!(high.contains("85.0%"));
        assert!(high.contains("5 reference"));
        let low = templates.neighbor_description(3, 0.45);
        assert!(low.contains("low similarity"));
    }

    #[test]
    fn test_taxon_description() {
        let templates = InterpretationTemplates::new();
        let single = templates.taxon_description(&["Species A", "Species A", "Species A"]);
        assert!(single.contains("Species A"));
        assert!(!single.contains("multiple taxa"));
        let multi = templates.taxon_description(&["Species A", "Species B"]);
        assert!(multi.contains("multiple taxa"));
        assert!(multi.contains("Species A"));
        assert!(multi.contains("Species B"));
    }

    #[test]
    fn test_cluster_description() {
        let desc = InterpretationTemplates::new().cluster_description("Song Type A", 0.9, 0.85);
        assert!(desc.contains("Song Type A"));
        assert!(desc.contains("very high confidence"));
        assert!(desc.contains("90.0%"));
        assert!(desc.contains("85.0%"));
    }

    #[test]
    fn test_sequence_description() {
        let templates = InterpretationTemplates::new();
        let plain = templates.sequence_description(5, None);
        assert!(plain.contains("5 vocalization sequence"));
        assert!(!plain.contains("motif"));
        let with_motif = templates.sequence_description(5, Some("ABAB"));
        assert!(with_motif.contains("motif pattern 'ABAB'"));
    }

    #[test]
    fn test_generate_summary() {
        let summary = InterpretationTemplates::new().generate_summary(
            10,
            0.85,
            Some("Dawn Chorus"),
            Some("Turdus merula"),
            0.9,
        );
        for expected in [
            "High confidence",
            "highly similar",
            "Dawn Chorus",
            "Turdus merula",
            "90.0%",
        ] {
            assert!(summary.contains(expected));
        }
    }

    #[test]
    fn test_confidence_explanation() {
        let templates = InterpretationTemplates::new();
        assert!(templates
            .confidence_explanation(0.95)
            .contains("Very high confidence"));
        assert!(templates
            .confidence_explanation(0.55)
            .contains("Moderate confidence"));
        assert!(templates
            .confidence_explanation(0.2)
            .contains("Very low confidence"));
    }

    #[test]
    fn test_custom_template_override() {
        let templates = InterpretationTemplates::new()
            .with_template("low_similarity", "Custom low similarity message");
        assert_eq!(
            templates.low_similarity_claim(),
            "Custom low similarity message"
        );
    }

    #[test]
    fn test_evidence_pack_formatter() {
        use crate::domain::entities::*;
        let neighbor = NeighborEvidence::new(
            EmbeddingId::new("n1"),
            0.1,
            RecordingMetadata::new("r1").with_taxon("Species A"),
        );
        let cluster = ClusterContext::new(Some(ClusterId::new("c1")), 0.9, 0.85)
            .with_label("Song Type A");
        let interpretation = Interpretation::new(
            "Test structural description".to_string(),
            vec![Claim::new("Test claim", 0.9)],
            0.85,
        );
        let pack = EvidencePack::new(
            EmbeddingId::new("query-1"),
            vec![neighbor],
            cluster,
            None,
            interpretation,
        );
        let formatter = EvidencePackFormatter::new();
        // Full report carries the header, query id and structural text.
        let report = formatter.format_report(&pack);
        assert!(report.contains("Evidence Pack Report"));
        assert!(report.contains("query-1"));
        assert!(report.contains("Test structural description"));
        // Compact format mentions neighbor count and cluster label.
        let compact = formatter.format_compact(&pack);
        assert!(compact.contains("1 neighbors"));
        assert!(compact.contains("Song Type A"));
        // JSON format exposes the id and summary counts.
        let json = formatter.format_json(&pack);
        assert_eq!(json["query_embedding_id"], "query-1");
        assert_eq!(json["summary"]["neighbor_count"], 1);
    }
}