61 KiB
ADR-006: Data Architecture and Vector Storage
- Status: Accepted
- Date: 2026-01-15
- Deciders: 7sense Architecture Team
- Context: Bioacoustic data pipeline for RuVector integration
Context
7sense transforms bioacoustic signals (birdsong, wildlife vocalizations) into navigable geometric spaces using the RuVector platform. The system processes audio recordings through Perch 2.0 to generate 1536-dimensional embeddings, which are then indexed using HNSW for fast similarity search and organized via Graph Neural Networks (GNN) for pattern discovery.
This ADR defines the complete data architecture including:
- Entity schemas and relationships
- Vector storage tiering strategy
- Temporal data handling
- Metadata enrichment
- Data lifecycle management
- Backup and recovery procedures
Decision
1. Schema Design
1.1 Core Entities
┌─────────────────────────────────────────────────────────────────────────────┐
│ ENTITY RELATIONSHIP DIAGRAM │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌───────────────┐ ┌──────────────┐ │
│ │ Recording │──1:N──│ CallSegment │──1:1──│ Embedding │ │
│ └──────────────┘ └───────────────┘ └──────────────┘ │
│ │ │ │ │
│ │ │ │ │
│ │ ┌──────┴──────┐ │ │
│ │ │ │ │ │
│ ▼ ▼ ▼ ▼ │
│ ┌──────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────────┐ │
│ │ Sensor │ │ Cluster │ │ Prototype│ │ Taxon │ │
│ └──────────────┘ └──────────┘ └──────────┘ └──────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
1.2 Node Definitions
Recording - Source audio file metadata
/**
 * Source audio file metadata: where the file is stored, how it was captured
 * (sample rate, channels, bit depth), when and where it was recorded, and its
 * processing state. References the capturing Sensor through sensor_id.
 *
 * NOTE(review): UUID, float, HabitatType and Status are not declared in this
 * section; float is not a TypeScript built-in and presumably aliases number — confirm.
 */
interface Recording {
id: UUID; // Primary identifier
sensor_id: UUID; // Reference to sensor
file_path: string; // Storage location
file_hash: string; // SHA-256 for deduplication
duration_ms: number; // Total duration
sample_rate: number; // Expected: 32000 Hz
channels: number; // Expected: 1 (mono)
bit_depth: number; // Audio bit depth
start_ts: ISO8601; // Recording start timestamp
end_ts: ISO8601; // Recording end timestamp
lat: float; // GPS latitude (WGS84)
lon: float; // GPS longitude (WGS84)
altitude_m: float; // Elevation in meters
habitat: HabitatType; // Enum: forest, wetland, urban, etc.
weather: WeatherConditions; // Nested weather data
quality_score: float; // 0.0-1.0 automated quality assessment
processing_status: Status; // pending, processing, complete, failed
created_at: ISO8601; // Row creation timestamp
updated_at: ISO8601; // Last modification timestamp
}
/**
 * Weather snapshot nested in Recording.weather. The unit of each measurement
 * is encoded in its field-name suffix (_c, _pct, _ms, _deg, _mm, _hpa).
 */
interface WeatherConditions {
temperature_c: float;
humidity_pct: float;
wind_speed_ms: float; // NOTE(review): "_ms" presumably means metres per second here, not milliseconds — confirm
wind_direction_deg: float;
precipitation_mm: float;
cloud_cover_pct: float;
pressure_hpa: float;
source: string; // weather API source
}
CallSegment - Individual vocalization segment
/**
 * One vocalization segment cut from a parent Recording, together with the
 * acoustic features extracted for it. Offsets t0_ms/t1_ms are relative to the
 * start of the parent recording.
 */
interface CallSegment {
id: UUID;
recording_id: UUID; // Parent recording
segment_index: number; // Order within recording
t0_ms: number; // Start offset in milliseconds
t1_ms: number; // End offset in milliseconds
duration_ms: number; // Computed: t1_ms - t0_ms
snr_db: float; // Signal-to-noise ratio
energy: float; // RMS energy level
peak_freq_hz: float; // Dominant frequency
bandwidth_hz: float; // Frequency range
// NOTE(review): "Wiener entropy" is commonly another name for spectral flatness,
// which is also stored in spectral_flatness below — confirm the two are distinct measures.
entropy: float; // Spectral entropy (Wiener)
pitch_contour: float[]; // Sampled pitch values
rhythm_intervals: float[]; // Inter-onset intervals
spectral_centroid: float; // Spectral center of mass
spectral_flatness: float; // Tonality measure
zero_crossing_rate: float; // Temporal texture
clipping_detected: boolean; // Audio quality flag
overlap_score: float; // Overlap with other calls
segmentation_method: string; // whisper_seg, tweety_net, energy_threshold
segmentation_confidence: float;
created_at: ISO8601;
}
Embedding - Vector representation from Perch 2.0
/**
 * Vector representation of one CallSegment produced by the embedding model,
 * plus the bookkeeping (tier, access statistics, checksum) used for tiered storage.
 *
 * NOTE(review): vector, vector_quantized and vector_compressed are all declared
 * as required, although storage_tier suggests only one representation is
 * resident at a time — confirm whether the others should be optional.
 */
interface Embedding {
id: UUID;
segment_id: UUID; // Parent segment
model_name: string; // "perch_2.0"
model_version: string; // Specific model version
dimensions: number; // 1536 for Perch 2.0
vector: Float32Array; // Full-precision embedding
vector_quantized: Int8Array; // Quantized for warm tier
vector_compressed: Uint8Array; // Compressed for cold tier
storage_tier: StorageTier; // hot, warm, cold
norm: float; // L2 norm for validation
generation_time_ms: number; // Inference latency
checksum: string; // Integrity verification
created_at: ISO8601;
last_accessed: ISO8601; // For tiering decisions
access_count: number; // Usage tracking
}
/** Storage tier an embedding currently lives in; the string values are what is persisted in storage_tier. */
enum StorageTier {
HOT = 'hot', // Full float32, in-memory HNSW
WARM = 'warm', // int8 quantized, SSD-backed
COLD = 'cold' // 32x compressed, archival
}
Prototype - Cluster centroid/exemplar
/**
 * Centroid and representative exemplars of one Cluster, with quality metrics
 * for that cluster.
 */
interface Prototype {
id: UUID;
cluster_id: UUID; // Parent cluster
centroid_vector: Float32Array; // Averaged embedding
exemplar_ids: UUID[]; // Representative segment IDs
exemplar_count: number; // Number of exemplars
intra_cluster_variance: float;
silhouette_score: float; // Cluster quality metric
created_at: ISO8601;
updated_at: ISO8601;
}
Cluster - Grouping of similar calls
/**
 * A grouping of acoustically similar call segments. Clusters can be nested:
 * parent_cluster_id and level describe the position in the hierarchy.
 */
interface Cluster {
id: UUID;
name: string; // Human-readable label
description: string; // Auto-generated or manual
method: ClusterMethod; // hdbscan, kmeans, spectral, agglomerative
params: Record<string, any>; // Algorithm parameters
member_count: number; // Number of assigned segments
coherence_score: float; // Internal validity
stability_score: float; // Bootstrap stability
// NOTE(review): a top-level cluster has no parent, yet this field is required
// and non-nullable — confirm how roots are represented.
parent_cluster_id: UUID; // Hierarchical clustering
level: number; // Hierarchy depth
created_at: ISO8601;
updated_at: ISO8601;
}
/** Clustering algorithm that produced a Cluster; the string values are the persisted form. */
enum ClusterMethod {
HDBSCAN = 'hdbscan',
KMEANS = 'kmeans',
SPECTRAL = 'spectral',
AGGLOMERATIVE = 'agglomerative'
}
Taxon - Species/taxonomic reference
/**
 * Species / taxonomic reference record that call segments can be identified as.
 * Carries the external iNaturalist identifier alongside the internal UUID.
 */
interface Taxon {
id: UUID;
inat_id: number; // iNaturalist taxon ID
scientific_name: string; // Binomial nomenclature
common_name: string; // English common name
family: string; // Taxonomic family
order: string; // Taxonomic order
class: string; // Taxonomic class
conservation_status: string; // IUCN status
frequency_range_hz: [number, number]; // Typical vocalization range, presumably [low, high] — confirm ordering
habitat_types: HabitatType[];
created_at: ISO8601;
}
Sensor - Recording device metadata
/**
 * Recording device metadata: hardware identity, microphone characteristics,
 * deployment location/period and calibration state.
 */
interface Sensor {
id: UUID;
name: string; // Device identifier
model: string; // Hardware model
manufacturer: string;
serial_number: string;
microphone_type: string; // omni, cardioid, etc.
sensitivity_dbv: float; // Microphone sensitivity
frequency_response: [number, number]; // Hz range
deployment_lat: float;
deployment_lon: float;
deployment_altitude_m: float;
deployment_habitat: HabitatType;
deployment_start: ISO8601;
// NOTE(review): a sensor that is still deployed has no end date, yet this
// field is required — confirm how an open-ended deployment is represented.
deployment_end: ISO8601;
calibration_date: ISO8601;
calibration_factor: float;
status: SensorStatus;
created_at: ISO8601;
updated_at: ISO8601;
}
/** Operational state of a Sensor; the string values are the persisted form. */
enum SensorStatus {
ACTIVE = 'active',
INACTIVE = 'inactive',
MAINTENANCE = 'maintenance',
RETIRED = 'retired'
}
1.3 Edge Definitions (Graph Relationships)
// Relationship (edge) types of the property graph, written as Cypher patterns.
// Properties listed inside {...} are stored on the relationship itself.
// Structural relationships
(:Recording)-[:HAS_SEGMENT {order: int}]->(:CallSegment)
(:CallSegment)-[:HAS_EMBEDDING]->(:Embedding)
(:Recording)-[:FROM_SENSOR]->(:Sensor)
// Temporal sequence (syntax graph)
(:CallSegment)-[:NEXT {
dt_ms: int, // Time delta between segments
same_speaker: boolean, // Likely same individual
transition_prob: float // Learned transition probability
}]->(:CallSegment)
// Acoustic similarity (HNSW neighbors)
// NOTE(review): distance is documented as cosine, while the warm-tier HNSW
// index in section 1.4 is declared with an L2 operator class — confirm which
// metric edges with tier = 'warm' carry.
(:CallSegment)-[:SIMILAR {
distance: float, // Cosine distance
rank: int, // Neighbor rank (1-k)
tier: string // hot, warm, cold
}]->(:CallSegment)
// Cluster assignments
(:Cluster)-[:HAS_PROTOTYPE]->(:Prototype)
(:CallSegment)-[:ASSIGNED_TO {
confidence: float, // Assignment confidence
distance_to_centroid: float
}]->(:Cluster)
(:Cluster)-[:CHILD_OF]->(:Cluster) // Hierarchical
// Taxonomic links
(:CallSegment)-[:IDENTIFIED_AS {
confidence: float,
method: string, // model, manual, consensus
verified: boolean
}]->(:Taxon)
// Co-occurrence (same time window, nearby sensors)
// NOTE(review): the pattern is directed; confirm whether each co-occurring
// pair is stored once or in both directions.
(:CallSegment)-[:CO_OCCURS {
time_overlap_ms: int,
spatial_distance_m: float
}]->(:CallSegment)
1.4 Index Definitions
Primary Indexes
-- UUID lookups
-- NOTE(review): if id is already declared PRIMARY KEY on these tables, the
-- primary key creates its own unique index and these would be duplicates.
-- NOTE(review): section 4.3 partitions recordings by start_ts; PostgreSQL
-- rejects a unique index on a partitioned table that omits the partition
-- key, so idx_recording_id would need to be (id, start_ts) — confirm.
CREATE UNIQUE INDEX idx_recording_id ON recordings(id);
CREATE UNIQUE INDEX idx_segment_id ON call_segments(id);
CREATE UNIQUE INDEX idx_embedding_id ON embeddings(id);
CREATE UNIQUE INDEX idx_cluster_id ON clusters(id);
CREATE UNIQUE INDEX idx_taxon_id ON taxa(id);
CREATE UNIQUE INDEX idx_sensor_id ON sensors(id);
-- Foreign key relationships (speed up parent -> child joins)
CREATE INDEX idx_segment_recording ON call_segments(recording_id);
CREATE INDEX idx_embedding_segment ON embeddings(segment_id);
CREATE INDEX idx_recording_sensor ON recordings(sensor_id);
Temporal Indexes
-- Time-based queries
-- B-tree on the start timestamp for ordered scans and "since T" filters.
CREATE INDEX idx_recording_start ON recordings(start_ts);
-- GiST expression index over the [start_ts, end_ts) range, for overlap queries.
-- A query must use the same tstzrange(start_ts, end_ts) expression to hit this index.
CREATE INDEX idx_recording_timerange ON recordings USING GIST (
tstzrange(start_ts, end_ts)
);
-- Composite index: segments of one recording in time order.
CREATE INDEX idx_segment_time ON call_segments(recording_id, t0_ms);
Spatial Indexes
-- Geographic queries (PostGIS)
-- Expression indexes over a point built from the lon/lat columns in SRID 4326
-- (WGS84). Note the argument order: ST_MakePoint takes (x = lon, y = lat).
-- Queries must repeat the same expression to use these indexes.
CREATE INDEX idx_recording_location ON recordings USING GIST (
ST_SetSRID(ST_MakePoint(lon, lat), 4326)
);
CREATE INDEX idx_sensor_location ON sensors USING GIST (
ST_SetSRID(ST_MakePoint(deployment_lon, deployment_lat), 4326)
);
HNSW Vector Index
-- Hot tier: full-precision HNSW over the float32 `vector` column (cosine distance).
-- pgvector accepts only `m` and `ef_construction` as HNSW index storage
-- parameters. The query-time search width is the hnsw.ef_search setting,
-- applied per session or transaction, not an index option.
CREATE INDEX idx_embedding_hnsw_hot ON embeddings
USING hnsw (vector vector_cosine_ops)
WITH (
m = 16, -- Connections per layer
ef_construction = 200 -- Build-time search width
)
WHERE storage_tier = 'hot';
-- Query-time search width (use SET LOCAL inside a transaction to scope it to one query)
SET hnsw.ef_search = 100;
-- Warm tier: Quantized HNSW
-- NOTE(review): this index uses an L2 operator class while the hot tier uses
-- cosine, so distances are not comparable across tiers; also confirm that the
-- column type chosen for vector_quantized is one pgvector can index.
CREATE INDEX idx_embedding_hnsw_warm ON embeddings
USING hnsw (vector_quantized vector_l2_ops)
WITH (m = 12, ef_construction = 100)
WHERE storage_tier = 'warm';
2. Vector Storage Strategy
2.1 Tiered Storage Architecture
┌─────────────────────────────────────────────────────────────────────────────┐
│ VECTOR STORAGE TIERS │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌────────────────────────────────────────────────────────────────────────┐ │
│ │ HOT TIER - Full Precision (float32) │ │
│ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │
│ │ Storage: In-memory + NVMe SSD │ │
│ │ Format: 1536 x float32 = 6,144 bytes/vector │ │
│ │ Index: HNSW (M=16, ef=200) │ │
│ │ Latency: <1ms query, <100us retrieval │ │
│ │ Capacity: ~1M vectors / 6GB RAM │ │
│ │ Use: Active queries, recent recordings, frequent access │ │
│ └────────────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ (access_count < threshold, age > 7 days) │
│ ┌────────────────────────────────────────────────────────────────────────┐ │
│ │ WARM TIER - Quantized (int8) │ │
│ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │
│ │ Storage: SSD-backed, memory-mapped │ │
│ │ Format: 1536 x int8 = 1,536 bytes/vector (4x compression) │ │
│ │ Index: HNSW (M=12, ef=100) │ │
│ │ Latency: <10ms query │ │
│ │ Capacity: ~10M vectors / 15GB SSD │ │
│ │ Use: Historical data, periodic access, batch processing │ │
│ └────────────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ (age > 90 days, access_count < 10) │
│ ┌────────────────────────────────────────────────────────────────────────┐ │
│ │ COLD TIER - Compressed (Product Quantization) │ │
│ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │
│ │ Storage: Object storage (S3/GCS) + local cache │ │
│ │ Format: PQ-encoded = ~192 bytes/vector (32x compression) │ │
│ │ Index: IVF-PQ (coarse quantizer + product quantization) │ │
│ │ Latency: <100ms query (cache hit), <1s (cache miss) │ │
│ │ Capacity: ~100M vectors / 20GB storage │ │
│ │ Use: Archival, compliance, rare access, bulk analytics │ │
│ └────────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
2.2 Quantization Implementation
Scalar Quantization (Hot to Warm)
/**
 * Affine int8 scalar quantizer used when moving embeddings from the hot tier
 * (float32) to the warm tier (int8).
 *
 * Declared as a class because it carries method implementations; an
 * interface cannot. Object literals with the same members remain assignable
 * to this type.
 */
class ScalarQuantizer {
  /**
   * @param scale Width of one quantization step; must be non-zero.
   * @param zero_point Value that maps to quantized 0 (centre of the calibrated range).
   */
  constructor(
    public scale: number,
    public zero_point: number,
  ) {}

  /** Maps each component to the nearest step and clamps it into the int8 range [-128, 127]. */
  quantize(vector: Float32Array): Int8Array {
    const quantized = new Int8Array(vector.length);
    for (let i = 0; i < vector.length; i++) {
      const scaled = (vector[i] - this.zero_point) / this.scale;
      quantized[i] = Math.max(-128, Math.min(127, Math.round(scaled)));
    }
    return quantized;
  }

  /** Approximate inverse of quantize(); the error is at most scale / 2 for values inside the calibrated range. */
  dequantize(quantized: Int8Array): Float32Array {
    const vector = new Float32Array(quantized.length);
    for (let i = 0; i < quantized.length; i++) {
      vector[i] = quantized[i] * this.scale + this.zero_point;
    }
    return vector;
  }
}

/**
 * Calibration: compute scale/zero_point from a representative sample.
 *
 * @param samples Vectors whose global min/max define the quantization range.
 * @returns A quantizer spanning [min, max] with 255 steps, centred on the range midpoint.
 * @throws RangeError If there are no values or any value is NaN/infinite,
 *   since no usable range can be derived.
 */
function calibrateQuantizer(samples: Float32Array[]): ScalarQuantizer {
  let min = Infinity;
  let max = -Infinity;
  for (const sample of samples) {
    for (const val of sample) {
      min = Math.min(min, val);
      max = Math.max(max, val);
    }
  }
  // Empty input leaves min/max infinite; a NaN component turns them into NaN.
  if (!Number.isFinite(min) || !Number.isFinite(max)) {
    throw new RangeError('calibrateQuantizer requires at least one finite sample value');
  }
  const range = max - min;
  // Constant data has zero range. Any non-zero scale round-trips it exactly,
  // so fall back to 1 instead of producing a scale of 0 (division by zero).
  const scale = range > 0 ? range / 255 : 1;
  return new ScalarQuantizer(scale, min + range / 2);
}
Product Quantization (Warm to Cold)
/**
 * Product quantizer used when moving embeddings from the warm to the cold tier.
 *
 * A vector is split into num_subvectors contiguous slices of subvector_dim
 * components. Each slice is replaced by the index (one byte) of its nearest
 * centroid in that slice's codebook.
 *
 * Declared as a class because it carries method implementations; an
 * interface cannot.
 */
class ProductQuantizer {
  num_subvectors: number; // 192 (divide 1536 into 192 x 8)
  subvector_dim: number; // 8 dimensions per subvector
  num_centroids: number; // 256 (1 byte per subvector)
  codebooks: Float32Array[][]; // 192 codebooks, each with 256 centroids

  /**
   * @param codebooks One codebook per subvector; every codebook must hold the
   *   same number of centroids (at most 256) of identical dimension.
   * @throws RangeError If the codebooks are empty, ragged, or too large for a one-byte code.
   */
  constructor(codebooks: Float32Array[][]) {
    const firstBook = codebooks[0];
    const firstCentroid = firstBook?.[0];
    if (firstBook === undefined || firstCentroid === undefined || firstCentroid.length === 0) {
      throw new RangeError('ProductQuantizer requires at least one non-empty codebook');
    }
    if (firstBook.length > 256) {
      throw new RangeError('A codebook may hold at most 256 centroids (codes are one byte)');
    }
    for (const book of codebooks) {
      const ragged =
        book.length !== firstBook.length ||
        book.some((centroid) => centroid.length !== firstCentroid.length);
      if (ragged) {
        throw new RangeError('All codebooks must have the same shape');
      }
    }
    this.codebooks = codebooks;
    this.num_subvectors = codebooks.length;
    this.subvector_dim = firstCentroid.length;
    this.num_centroids = firstBook.length;
  }

  /** Encodes a full vector into one centroid index per subvector. */
  encode(vector: Float32Array): Uint8Array {
    this.assertLength(vector.length, this.num_subvectors * this.subvector_dim, 'vector');
    const codes = new Uint8Array(this.num_subvectors);
    for (let i = 0; i < this.num_subvectors; i++) {
      // subarray() is a view; no per-slice copy is needed for a read-only scan.
      const subvec = vector.subarray(i * this.subvector_dim, (i + 1) * this.subvector_dim);
      codes[i] = this.findNearestCentroid(subvec, this.codebooks[i]);
    }
    return codes;
  }

  /** Reconstructs the approximate vector by concatenating the selected centroids. */
  decode(codes: Uint8Array): Float32Array {
    this.assertLength(codes.length, this.num_subvectors, 'codes');
    // Size follows the codebooks rather than a hard-coded 1536.
    const vector = new Float32Array(this.num_subvectors * this.subvector_dim);
    for (let i = 0; i < this.num_subvectors; i++) {
      const centroid = this.codebooks[i][codes[i]];
      vector.set(centroid, i * this.subvector_dim);
    }
    return vector;
  }

  /**
   * Euclidean distance between an uncompressed query and an encoded vector,
   * i.e. the distance from query to decode(codes), without materialising the
   * decoded vector.
   *
   * Squared per-subvector distances are summed and the root is taken once.
   * Summing per-subvector norms would overestimate the distance and is not
   * the Euclidean metric.
   */
  asymmetricDistance(query: Float32Array, codes: Uint8Array): number {
    this.assertLength(query.length, this.num_subvectors * this.subvector_dim, 'query');
    this.assertLength(codes.length, this.num_subvectors, 'codes');
    let distSq = 0;
    for (let i = 0; i < this.num_subvectors; i++) {
      const subquery = query.subarray(i * this.subvector_dim, (i + 1) * this.subvector_dim);
      const centroid = this.codebooks[i][codes[i]];
      distSq += ProductQuantizer.squaredDistance(subquery, centroid);
    }
    return Math.sqrt(distSq);
  }

  /** Index of the centroid closest (in squared Euclidean distance) to subvec; ties resolve to the lowest index. */
  private findNearestCentroid(subvec: Float32Array, codebook: Float32Array[]): number {
    let best = 0;
    let bestDist = Infinity;
    for (let c = 0; c < codebook.length; c++) {
      const dist = ProductQuantizer.squaredDistance(subvec, codebook[c]);
      if (dist < bestDist) {
        bestDist = dist;
        best = c;
      }
    }
    return best;
  }

  /** Squared Euclidean distance between two equally sized slices. */
  private static squaredDistance(a: Float32Array, b: Float32Array): number {
    let sum = 0;
    for (let i = 0; i < a.length; i++) {
      const diff = a[i] - b[i];
      sum += diff * diff;
    }
    return sum;
  }

  /** Rejects inputs whose length does not match the codebook layout. */
  private assertLength(actual: number, expected: number, what: string): void {
    if (actual !== expected) {
      throw new RangeError(`${what} has length ${actual}, expected ${expected}`);
    }
  }
}
2.3 Tiering Policy
/**
 * Thresholds that drive demotion (hot -> warm -> cold) and promotion
 * (cold -> warm -> hot) of embeddings between storage tiers.
 *
 * NOTE(review): every member is a numeric literal type, so this interface
 * pins the values (e.g. age_days must be exactly 7) rather than describing a
 * configurable shape — confirm that is intended.
 */
interface TieringPolicy {
hot_to_warm: {
age_days: 7, // Move after 7 days
access_threshold: 100, // Unless accessed >100 times
batch_size: 10000 // Process in batches
},
warm_to_cold: {
age_days: 90, // Move after 90 days
// NOTE(review): the "last 30 days" window has no corresponding field here.
access_threshold: 10, // Unless accessed >10 times in last 30 days
batch_size: 50000
},
promotion: {
cold_to_warm: {
access_count: 5, // Promote after 5 accesses
time_window_hours: 24 // ...within this many hours
},
warm_to_hot: {
access_count: 20, // Promote after 20 accesses
time_window_hours: 1 // ...within this many hours
}
}
}
/**
 * Background tiering job: demotes stale hot embeddings to the warm tier and
 * promotes frequently accessed cold embeddings to the warm tier.
 *
 * NOTE(review): policy.warm_to_cold and policy.promotion.warm_to_hot are not
 * applied by this job — confirm whether they are handled elsewhere.
 *
 * @param policy Thresholds controlling demotion and promotion.
 * @returns Counts of demoted and promoted embeddings.
 */
async function runTieringJob(policy: TieringPolicy): Promise<TieringStats> {
  const stats = { demoted: 0, promoted: 0 };
  const { hot_to_warm: hotToWarm, promotion } = policy;

  // Demote hot -> warm. Policy values are bound as query parameters (the same
  // $n style used by other queries in this document), not spliced into the SQL.
  const hotCandidates = await db.query(`
    SELECT id FROM embeddings
    WHERE storage_tier = 'hot'
      AND created_at < NOW() - make_interval(days => $1::int)
      AND access_count < $2
    ORDER BY last_accessed ASC
    LIMIT $3
  `, [hotToWarm.age_days, hotToWarm.access_threshold, hotToWarm.batch_size]);

  for (const batch of chunk(hotCandidates, 1000)) {
    await demoteToWarm(batch);
    stats.demoted += batch.length;
  }

  // Promote cold -> warm based on access patterns. The look-back window comes
  // from the policy instead of a hard-coded 24 hours.
  const coldAccessLog = await getRecentAccesses(
    'cold',
    promotion.cold_to_warm.time_window_hours,
  );
  const promotionCandidates = coldAccessLog
    .filter(e => e.count >= promotion.cold_to_warm.access_count);

  for (const candidate of promotionCandidates) {
    await promoteToWarm(candidate.id);
    stats.promoted++;
  }

  return stats;
}
2.4 Hyperbolic Embedding Option
For hierarchical species relationships, optionally store embeddings in Poincaré ball space, a hyperbolic geometry in which tree-like hierarchies such as taxonomies embed with low distortion:
/** Pairs an original Euclidean embedding with its image in the Poincaré ball of the given curvature. */
interface HyperbolicEmbedding {
euclidean_vector: Float32Array; // Original 1536-D
poincare_vector: Float32Array; // Mapped to Poincare ball
curvature: float; // Ball curvature (typically -1)
}
/**
 * Map a Euclidean vector into the Poincaré ball by rescaling it with
 * tanh(sqrt(c) * |x| / 2) / (sqrt(c) * |x|), where c = |curvature|.
 *
 * A zero vector (or zero curvature) makes that ratio 0/0; its analytic limit
 * 1/2 is used so the result never contains NaN.
 *
 * NOTE(review): for inputs with a large norm, tanh saturates to 1 and the
 * result lands numerically on the ball boundary — confirm whether inputs are
 * expected to be pre-scaled.
 *
 * @param euclidean Vector to map; not modified.
 * @param curvature Ball curvature; only its magnitude is used.
 * @returns A new vector of the same length.
 */
function exponentialMap(
  euclidean: Float32Array,
  curvature: number = -1
): Float32Array {
  let normSq = 0;
  for (const x of euclidean) {
    normSq += x * x;
  }
  const scaledNorm = Math.sqrt(Math.abs(curvature)) * Math.sqrt(normSq);
  // tanh(t / 2) / t -> 1/2 as t -> 0, so the map stays continuous at the origin.
  const factor = scaledNorm === 0 ? 0.5 : Math.tanh(scaledNorm / 2) / scaledNorm;
  return euclidean.map(x => x * factor);
}
/**
 * Poincaré distance for hyperbolic similarity:
 *
 *   d(u, v) = acosh(1 + 2c|u - v|^2 / ((1 - c|u|^2)(1 - c|v|^2))) / sqrt(c)
 *
 * with c = |curvature|. The factor c in the numerator matters whenever the
 * curvature is not -1.
 *
 * @param u First point; must lie inside the ball (c|u|^2 < 1).
 * @param v Second point; same length as u.
 * @param curvature Ball curvature; only its magnitude is used.
 * @returns The geodesic distance; Infinity if either point lies on or outside the ball boundary.
 * @throws RangeError If u and v differ in length.
 */
function poincareDistance(
  u: Float32Array,
  v: Float32Array,
  curvature: number = -1
): number {
  if (u.length !== v.length) {
    throw new RangeError(`Dimension mismatch: ${u.length} vs ${v.length}`);
  }
  const c = Math.abs(curvature);
  let uNormSq = 0;
  let vNormSq = 0;
  let diffNormSq = 0;
  for (let i = 0; i < u.length; i++) {
    const diff = u[i] - v[i];
    uNormSq += u[i] * u[i];
    vNormSq += v[i] * v[i];
    diffNormSq += diff * diff;
  }
  if (c === 0) {
    // Zero-curvature limit of the formula above.
    return 2 * Math.sqrt(diffNormSq);
  }
  const denominator = (1 - c * uNormSq) * (1 - c * vNormSq);
  if (1 - c * uNormSq <= 0 || 1 - c * vNormSq <= 0) {
    return Infinity;
  }
  // Rounding can push the argument marginally below 1, where acosh is NaN.
  const arg = Math.max(1, 1 + (2 * c * diffNormSq) / denominator);
  return Math.acosh(arg) / Math.sqrt(c);
}
3. Graph Relationships for Cypher Queries
3.1 Common Query Patterns
Find Similar Calls
// Top-k similar calls to a given segment
// Parameters: $segment_id (source segment), $threshold (maximum distance), $k (row limit).
// Reads precomputed SIMILAR edges; no vector search is run at query time.
MATCH (source:CallSegment {id: $segment_id})-[sim:SIMILAR]->(target:CallSegment)
WHERE sim.distance < $threshold
RETURN target, sim.distance, sim.rank
ORDER BY sim.distance ASC
LIMIT $k
Temporal Sequence Analysis
// Find call sequences (motifs) of 1 to 5 NEXT hops that start in a given recording
// Parameter: $recording_id. Returns one row per path, so shorter paths that
// are prefixes of longer ones are also returned; longest paths come first.
MATCH path = (start:CallSegment)-[:NEXT*1..5]->(end:CallSegment)
WHERE start.recording_id = $recording_id
RETURN [node IN nodes(path) | node.id] AS sequence,
[rel IN relationships(path) | rel.dt_ms] AS intervals,
length(path) AS motif_length
ORDER BY motif_length DESC
Cluster Exploration
// Get cluster members with their prototypes
// Parameter: $cluster_id. Only assignments with confidence > 0.8 are counted;
// exemplars holds the first 10 collected members (collection order is not specified).
MATCH (c:Cluster {id: $cluster_id})-[:HAS_PROTOTYPE]->(p:Prototype)
MATCH (seg:CallSegment)-[a:ASSIGNED_TO]->(c)
WHERE a.confidence > 0.8
RETURN c, p, collect(seg)[..10] AS exemplars, count(seg) AS total_members
Species Distribution
// Calls by species in a geographic region
// Parameters: $center_lat, $center_lon (circle centre), $radius_m (radius in metres).
// Location is taken from the recording, not from individual segments.
MATCH (r:Recording)-[:HAS_SEGMENT]->(seg:CallSegment)
-[:IDENTIFIED_AS]->(t:Taxon)
WHERE point.distance(
point({latitude: r.lat, longitude: r.lon}),
point({latitude: $center_lat, longitude: $center_lon})
) < $radius_m
RETURN t.scientific_name, t.common_name, count(seg) AS call_count
ORDER BY call_count DESC
Co-occurrence Networks
// Species co-occurring in same time windows
// Follows CO_OCCURS in its stored direction only.
// NOTE(review): if CO_OCCURS edges are stored in both directions, every
// species pair appears twice (A,B and B,A) — confirm.
MATCH (seg1:CallSegment)-[:IDENTIFIED_AS]->(t1:Taxon),
(seg1)-[:CO_OCCURS]->(seg2:CallSegment)-[:IDENTIFIED_AS]->(t2:Taxon)
WHERE t1.id <> t2.id
RETURN t1.common_name, t2.common_name, count(*) AS co_occurrence_count
ORDER BY co_occurrence_count DESC
LIMIT 20
Transition Matrix
// Markov transition probabilities between call types
// Each row of the matrix is normalised by the number of transitions that
// leave the source cluster, so the probabilities for one from_cluster sum to 1.
// Clusters are grouped by node (not by name), so two clusters that share a
// name are not merged.
MATCH (c1:Cluster)<-[:ASSIGNED_TO]-(:CallSegment)
-[:NEXT]->(:CallSegment)-[:ASSIGNED_TO]->(c2:Cluster)
WITH c1, c2, count(*) AS transitions
WITH c1,
collect({to_cluster: c2.name, transitions: transitions}) AS outgoing,
sum(transitions) AS from_total
UNWIND outgoing AS out
RETURN c1.name AS from_cluster, out.to_cluster AS to_cluster,
toFloat(out.transitions) / from_total AS transition_prob
ORDER BY from_cluster, transition_prob DESC
3.2 GNN Training Edges
// Create training edges for GNN
// Acoustic similarity edges (from HNSW)
// NOTE(review): as written, the subquery compares each segment's embedding
// with every other embedding (a full pairwise scan) rather than querying an
// HNSW index — confirm this is meant only as a small-scale illustration.
// For each segment, keeps up to 10 other segments with cosine similarity > 0.8
// and stores distance = 1 - similarity on the SIMILAR edge.
// NOTE(review): r.rank is not set here although SIMILAR edges declare a rank property.
MATCH (seg:CallSegment)-[:HAS_EMBEDDING]->(emb:Embedding)
WITH seg, emb
CALL {
WITH seg, emb
MATCH (other:CallSegment)-[:HAS_EMBEDDING]->(other_emb:Embedding)
WHERE other.id <> seg.id
WITH seg, other,
gds.similarity.cosine(emb.vector, other_emb.vector) AS sim
WHERE sim > 0.8
RETURN other, sim
ORDER BY sim DESC
LIMIT 10
}
MERGE (seg)-[r:SIMILAR]->(other)
SET r.distance = 1 - sim, r.tier = 'hot'
4. Temporal Data Handling
4.1 Timestamp Standards
// All timestamps in ISO 8601 with timezone
// This is a plain string alias: the format is a convention and is not
// enforced by the type system.
type ISO8601 = string; // e.g., "2026-01-15T08:30:00.000Z"
/**
 * Temporal context for a segment: absolute recording bounds, the segment's
 * offset within the recording, and features derived from the local time.
 */
interface TemporalMetadata {
// Recording level
recording_start_ts: ISO8601; // When recording began
recording_end_ts: ISO8601; // When recording ended
recording_timezone: string; // IANA timezone (e.g., "America/Los_Angeles")
// Segment level (relative to recording)
segment_offset_ms: number; // Milliseconds from recording start
segment_absolute_ts: ISO8601; // Computed absolute timestamp
// Derived temporal features
time_of_day: TimeOfDay; // dawn, morning, midday, afternoon, dusk, night
day_of_week: number; // 0-6 (NOTE(review): which weekday is 0 is not stated — confirm)
day_of_year: number; // 1-366
lunar_phase: LunarPhase; // new, waxing, full, waning
sunrise_offset_min: number; // Minutes from local sunrise
sunset_offset_min: number; // Minutes from local sunset
}
/**
 * Coarse time-of-day bucket relative to local sunrise/sunset.
 *
 * NOTE(review): the documented windows overlap — MORNING runs to noon while
 * MIDDAY starts two hours before noon, and AFTERNOON starts at "midday" —
 * confirm which bucket takes precedence in the overlap.
 */
enum TimeOfDay {
DAWN = 'dawn', // -30min to +30min of sunrise
MORNING = 'morning', // sunrise+30min to noon
MIDDAY = 'midday', // noon +/- 2 hours
AFTERNOON = 'afternoon', // midday to sunset-30min
DUSK = 'dusk', // -30min to +30min of sunset
NIGHT = 'night' // sunset+30min to sunrise-30min
}
4.2 Sequence Ordering
/**
 * Builds the temporal NEXT graph for a recording and searches it for repeated
 * call sequences.
 *
 * Declared as a class because it carries method implementations; both
 * methods are async because they await database and helper calls.
 *
 * NOTE(review): getClusterSequence and findRepeatedSubstrings are called on
 * `this` but are not defined in this section.
 */
class SequenceManager {
  /** Consecutive segments farther apart than this are not linked. */
  static readonly MAX_GAP_MS = 5000;
  /** Gaps shorter than this are taken as a hint that the same individual is calling. */
  static readonly SAME_SPEAKER_GAP_MS = 500;

  /**
   * Build sequence graph from recording: links each segment to the next one
   * (ordered by start offset) when the silent gap between them is in
   * [0, MAX_GAP_MS). Overlapping segments (negative gap) are not linked.
   */
  async buildSequenceGraph(recordingId: UUID): Promise<SequenceEdge[]> {
    const segments = await db.query(`
      SELECT id, t0_ms, t1_ms, recording_id
      FROM call_segments
      WHERE recording_id = $1
      ORDER BY t0_ms ASC
    `, [recordingId]);

    const edges: SequenceEdge[] = [];
    for (let i = 0; i < segments.length - 1; i++) {
      const current = segments[i];
      const next = segments[i + 1];
      const gap_ms = next.t0_ms - current.t1_ms;
      // Only link if gap is reasonable (< 5 seconds)
      if (gap_ms < SequenceManager.MAX_GAP_MS && gap_ms >= 0) {
        edges.push({
          source_id: current.id,
          target_id: next.id,
          dt_ms: gap_ms,
          same_speaker: gap_ms < SequenceManager.SAME_SPEAKER_GAP_MS, // Heuristic for same individual
          sequence_index: i
        });
      }
    }
    return edges;
  }

  /** Detect repeated sequences (motifs) of at least minLength cluster assignments. */
  async findMotifs(recordingId: UUID, minLength: number = 3): Promise<Motif[]> {
    const sequence = await this.getClusterSequence(recordingId);
    const motifs = this.findRepeatedSubstrings(sequence, minLength);
    return motifs;
  }
}
/** One directed NEXT link between two consecutive segments of the same recording. */
interface SequenceEdge {
source_id: UUID; // Earlier segment
target_id: UUID; // Segment that follows it
dt_ms: number; // Time gap
same_speaker: boolean; // Likely same individual
sequence_index: number; // Position in recording
}
/** A repeated sequence of cluster assignments found within a recording. */
interface Motif {
pattern: string[]; // Cluster IDs in order
occurrences: number; // How many times it appears
positions: number[][]; // Start positions for each occurrence
entropy: number; // Pattern entropy
}
4.3 Time-Series Partitioning
-- Partition recordings by month for efficient time-range queries.
-- PostgreSQL requires every PRIMARY KEY / UNIQUE constraint on a partitioned
-- table to include all partition key columns, so the key is (id, start_ts)
-- rather than id alone.
-- NOTE(review): foreign keys that reference recordings must then target
-- (id, start_ts) as well, or be enforced by the application — confirm.
CREATE TABLE recordings (
id UUID NOT NULL,
start_ts TIMESTAMPTZ NOT NULL,
-- ... other columns
PRIMARY KEY (id, start_ts)
) PARTITION BY RANGE (start_ts);
-- Create partitions (upper bound is exclusive)
CREATE TABLE recordings_2026_01 PARTITION OF recordings
FOR VALUES FROM ('2026-01-01') TO ('2026-02-01');
CREATE TABLE recordings_2026_02 PARTITION OF recordings
FOR VALUES FROM ('2026-02-01') TO ('2026-03-01');
-- Automatic partition creation (pg_partman or similar)
-- NOTE(review): the create_parent argument list differs between pg_partman
-- major versions — confirm against the installed version.
SELECT create_parent('public.recordings', 'start_ts', 'native', 'monthly');
5. Metadata Enrichment
5.1 Enrichment Pipeline
┌─────────────────────────────────────────────────────────────────────────────┐
│ METADATA ENRICHMENT PIPELINE │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Raw Audio │────▶│ Segmented │────▶│ Embedded │ │
│ │ │ │ Calls │ │ Vectors │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Recording │ │ Acoustic │ │ Similarity │ │
│ │ Metadata │ │ Features │ │ Neighbors │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │ │ │ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ ENRICHMENT SERVICES │ │
│ ├─────────────────────────────────────────────────────────────────────┤ │
│ │ • Weather API → temperature, humidity, wind, precipitation │ │
│ │ • Geocoding → habitat type, elevation, land cover │ │
│ │ • Astronomy → sunrise/sunset, lunar phase, day length │ │
│ │ • Taxonomy → species ID, conservation status, range maps │ │
│ │ • Soundscape → background noise level, anthropogenic detection │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ ENRICHED RECORD │ │
│ │ Recording + Weather + Habitat + Temporal + Taxonomic + Quality │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
5.2 Enrichment Sources
/**
 * Configuration of the metadata enrichment services, one section per service.
 *
 * NOTE(review): numeric and tuple members are literal types, so they pin the
 * documented defaults rather than describing a configurable shape — confirm
 * that is intended.
 */
interface EnrichmentConfig {
weather: {
provider: 'openweathermap' | 'visualcrossing' | 'meteoblue';
api_key: string;
cache_ttl_hours: 24;
historical_lookback_days: 7;
};
geocoding: {
provider: 'nominatim' | 'google' | 'mapbox';
cache_ttl_days: 30;
enrichments: ['habitat', 'elevation', 'land_cover', 'protected_area'];
};
astronomy: {
// Computed locally, no API needed
compute: ['sunrise', 'sunset', 'civil_twilight', 'lunar_phase', 'day_length'];
};
taxonomy: {
sources: ['inat', 'ebird', 'xeno-canto'];
auto_id_confidence_threshold: 0.8;
human_review_threshold: 0.5;
};
soundscape: {
background_noise_window_ms: 100;
anthropogenic_detection: boolean;
frequency_bands: [[0, 2000], [2000, 8000], [8000, 16000]]; // [low, high] pairs, presumably in Hz — confirm
};
}
/**
 * Enrichment job: loads a recording and augments it with weather, habitat,
 * astronomy and soundscape data fetched concurrently.
 *
 * The four lookups run in parallel via Promise.all, so a failure in any one
 * of them rejects the whole enrichment.
 *
 * @param recordingId Identifier of the recording to enrich.
 * @returns The recording with weather, habitat, astronomy and soundscape attached.
 */
async function enrichRecording(recordingId: UUID): Promise<EnrichedRecording> {
  const recording = await db.getRecording(recordingId);
  const { lat, lon, start_ts: startTs, file_path: filePath } = recording;

  const [weather, habitat, astronomy, soundscape] = await Promise.all([
    weatherService.getHistorical(lat, lon, startTs),
    geocodingService.getHabitat(lat, lon),
    astronomyService.getSolarData(lat, lon, startTs),
    soundscapeAnalyzer.analyze(filePath)
  ]);

  return { ...recording, weather, habitat, astronomy, soundscape };
}
5.3 Species Identification
/** Result of identifying the species for one segment: ranked candidates plus how they were produced. */
interface SpeciesIdentification {
segment_id: UUID; // Segment that was identified
predictions: TaxonPrediction[]; // Candidate taxa
method: 'perch_classifier' | 'birdnet' | 'manual' | 'consensus'; // Source of the identification
confidence_aggregation: 'max' | 'mean' | 'ensemble'; // How per-source confidences were combined
}
/** One candidate taxon for a segment. */
interface TaxonPrediction {
taxon_id: UUID; // Reference to the Taxon record
scientific_name: string;
confidence: float;
rank: number; // Position among the candidates (NOTE(review): presumably 1 = best — confirm)
}
/**
 * Multi-model ensemble for species ID: combines classifier predictions with
 * nearest-neighbour label transfer, weighted 0.6 / 0.4, and returns the top 5.
 *
 * The two predictors only depend on the embedding, so they run concurrently.
 * Arguments are passed positionally; `name=value` inside a call is an
 * assignment to an undeclared variable in TypeScript, not a keyword argument.
 * The returned method / confidence_aggregation use members of the unions
 * declared on SpeciesIdentification.
 *
 * @param segmentId Segment whose embedding is classified.
 * @returns Up to five combined predictions for the segment.
 */
async function identifySpecies(segmentId: UUID): Promise<SpeciesIdentification> {
  const embedding = await db.getEmbedding(segmentId);

  // Get predictions from multiple sources
  const [perchPreds, nnPreds] = await Promise.all([
    perchClassifier.predict(embedding.vector),
    nearestNeighborTaxon(embedding.vector, 10),
  ]);

  // Ensemble combination
  const combined = ensemblePredictions([perchPreds, nnPreds], [0.6, 0.4]);

  return {
    segment_id: segmentId,
    predictions: combined.slice(0, 5),
    method: 'consensus',
    confidence_aggregation: 'ensemble'
  };
}
6. Data Lifecycle
6.1 Lifecycle Stages
┌─────────────────────────────────────────────────────────────────────────────┐
│ DATA LIFECYCLE │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ STAGE 1: INGESTION │
│ ━━━━━━━━━━━━━━━━━━ │
│ • Audio upload (S3/GCS/local) │
│ • Format validation (32kHz, mono, WAV/FLAC) │
│ • Deduplication (SHA-256 hash check) │
│ • Recording metadata extraction │
│ • Queue for processing │
│ │
│ STAGE 2: PROCESSING │
│ ━━━━━━━━━━━━━━━━━━━ │
│ • Audio segmentation (WhisperSeg/TweetyNet) │
│ • Acoustic feature extraction │
│ • Perch 2.0 embedding generation │
│ • Quality scoring │
│ • Initial species predictions │
│ │
│ STAGE 3: INDEXING │
│ ━━━━━━━━━━━━━━━━━━ │
│ • HNSW index insertion (hot tier) │
│ • Neighbor edge creation │
│ • Sequence graph construction │
│ • Cluster assignment │
│ • Metadata enrichment │
│ │
│ STAGE 4: ACTIVE USE │
│ ━━━━━━━━━━━━━━━━━━━ │
│ • Query serving │
│ • GNN refinement (continuous learning) │
│ • Access tracking │
│ • Cache warming │
│ │
│ STAGE 5: TIERING │
│ ━━━━━━━━━━━━━━━━━━ │
│ • Hot → Warm (7 days, quantization) │
│ • Warm → Cold (90 days, compression) │
│ • Promotion on access │
│ │
│ STAGE 6: ARCHIVAL │
│ ━━━━━━━━━━━━━━━━━━ │
│ • Cold storage (S3 Glacier/equivalent) │
│ • Metadata preserved in primary DB │
│ • On-demand retrieval (minutes latency) │
│ │
│ STAGE 7: RETENTION/DELETION │
│ ━━━━━━━━━━━━━━━━━━━━━━━━━━ │
│ • Configurable retention policies │
│ • Legal hold support │
│ • Secure deletion (GDPR compliance) │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
6.2 Processing Pipeline
/**
 * Runs a recording through an ordered list of stages.
 *
 * Declared as a class because it is instantiated with
 * `new ProcessingPipeline({ stages })` and carries a method implementation;
 * an interface supports neither.
 *
 * NOTE(review): updateStatus, handleError and summarize are called on `this`
 * but are not defined in this section.
 */
class ProcessingPipeline {
  stages: PipelineStage[];

  /** @param options.stages Stages to run, in execution order. */
  constructor(options: { stages: PipelineStage[] }) {
    this.stages = options.stages;
  }

  /**
   * Executes every stage in order, storing each stage's result in the shared
   * context under the stage name.
   *
   * A failing stage is reported through handleError. If the stage is marked
   * critical the error is rethrown and the run stops; otherwise the run
   * continues with the next stage and that stage's context entry stays unset.
   *
   * @param recordingId Recording to process.
   * @returns The summary produced from the final context.
   */
  async process(recordingId: UUID): Promise<ProcessingResult> {
    const context: ProcessingContext = { recordingId, startTime: Date.now() };
    for (const stage of this.stages) {
      try {
        context[stage.name] = await stage.execute(context);
        await this.updateStatus(recordingId, stage.name, 'complete');
      } catch (error) {
        await this.handleError(recordingId, stage.name, error);
        if (stage.critical) throw error;
      }
    }
    return this.summarize(context);
  }
}
// Default pipeline. Stages run in the order listed; a failure in a stage with
// critical: true aborts processing, while a failure in a non-critical stage is
// reported and processing continues with the next stage.
const pipeline = new ProcessingPipeline({
stages: [
{ name: 'validate', critical: true, execute: validateAudio },
{ name: 'segment', critical: true, execute: segmentAudio },
{ name: 'extract_features', critical: false, execute: extractAcousticFeatures },
{ name: 'embed', critical: true, execute: generateEmbeddings },
{ name: 'index', critical: true, execute: insertToHNSW },
{ name: 'enrich', critical: false, execute: enrichMetadata },
{ name: 'identify', critical: false, execute: identifySpecies },
{ name: 'cluster', critical: false, execute: assignToClusters }
]
});
6.3 Retention Policies
/** A named retention rule: which records it selects and what happens to them. */
interface RetentionPolicy {
name: string; // Policy identifier
conditions: RetentionCondition[]; // Selection criteria (NOTE(review): presumably all must hold — confirm)
action: 'archive' | 'delete' | 'anonymize'; // What to do with matching records
grace_period_days: number; // Delay before the action is applied
}
// Built-in retention policies. None of them uses the 'anonymize' action.
const defaultPolicies: RetentionPolicy[] = [
// Archive recordings older than a year that were accessed fewer than 5 times in the last 180 days.
{
name: 'standard_archival',
conditions: [
{ field: 'age_days', operator: '>', value: 365 },
{ field: 'access_count_last_180_days', operator: '<', value: 5 }
],
action: 'archive',
grace_period_days: 30
},
// Delete low-quality recordings older than 90 days that no one has reviewed manually.
{
name: 'low_quality_deletion',
conditions: [
{ field: 'quality_score', operator: '<', value: 0.3 },
{ field: 'age_days', operator: '>', value: 90 },
{ field: 'manually_reviewed', operator: '=', value: false }
],
action: 'delete',
grace_period_days: 14
},
// Delete immediately (no grace period) when deletion has been requested.
{
name: 'gdpr_deletion',
conditions: [
{ field: 'deletion_requested', operator: '=', value: true }
],
action: 'delete',
grace_period_days: 0
}
];
/**
 * Retention job: applies every policy in defaultPolicies to its matching
 * candidates and reports how many records were archived or deleted.
 *
 * A failure on one candidate is recorded in the report and does not stop the
 * job. An action with no handler here (currently 'anonymize') is recorded as
 * an error rather than skipped silently.
 *
 * @returns Counts of archived and deleted records plus per-candidate errors.
 */
async function enforceRetention(): Promise<RetentionReport> {
  const report: RetentionReport = { archived: 0, deleted: 0, errors: [] };

  for (const policy of defaultPolicies) {
    const candidates = await findRetentionCandidates(policy);

    for (const candidate of candidates) {
      try {
        switch (policy.action) {
          case 'archive':
            await archiveRecording(candidate.id);
            report.archived++;
            break;
          case 'delete':
            await secureDelete(candidate.id);
            report.deleted++;
            break;
          default:
            report.errors.push({
              id: candidate.id,
              error: `Unsupported retention action: ${policy.action}`
            });
        }
      } catch (error) {
        // A thrown value is not guaranteed to be an Error instance.
        const message = error instanceof Error ? error.message : String(error);
        report.errors.push({ id: candidate.id, error: message });
      }
    }
  }

  return report;
}
7. Backup and Recovery Strategy
7.1 Backup Architecture
┌─────────────────────────────────────────────────────────────────────────────┐
│ BACKUP ARCHITECTURE │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ PRIMARY DATABASE (PostgreSQL + pgvector) │ │
│ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │
│ │ • Streaming replication to standby │ │
│ │ • WAL archiving to object storage │ │
│ │ • Point-in-time recovery enabled │ │
│ │ • pg_basebackup daily full backup │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │ │
│ ┌──────────────────┼──────────────────┐ │
│ ▼ ▼ ▼ │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Standby │ │ WAL Archive │ │ Daily Full │ │
│ │ Replica │ │ (S3/GCS) │ │ Backup │ │
│ │ (sync) │ │ (15 min) │ │ (encrypted) │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ VECTOR INDEX (HNSW) │ │
│ │ ━━━━━━━━━━━━━━━━━━━━ │ │
│ │ • Snapshot to object storage daily │ │
│ │ • Incremental updates via change log │ │
│ │ • Rebuild from source embeddings (disaster recovery) │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ AUDIO FILES │ │
│ │ ━━━━━━━━━━━━━━━ │ │
│ │ • Primary: Object storage (S3/GCS) with versioning │ │
│ │ • Cross-region replication for disaster recovery │ │
│ │ • Glacier transition for cold data │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ GRAPH DATABASE (Neo4j/RuVector Graph Layer) │ │
│ │ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ │ │
│ │ • Online backup every 6 hours │ │
│ │ • Transaction log shipping │ │
│ │ • Cluster mode for HA │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
7.2 Backup Schedule
/**
 * Backup cadence and retention policy for every storage layer.
 *
 * All members are literal types, so a value typed as `BackupSchedule` must
 * carry exactly the schedule written here.
 */
interface BackupSchedule {
  /** Primary database backups. */
  database: {
    /** Full backup taken once per day at a fixed time. */
    full_backup: {
      frequency: 'daily';
      time: '02:00 UTC';
      retention_days: 30;
    };
    /** Hourly incremental backup. */
    incremental_backup: {
      frequency: 'hourly';
      retention_days: 7;
    };
    /** Continuous WAL archiving. */
    wal_archiving: {
      frequency: 'continuous';
      /** 15 minutes max */
      archive_timeout_seconds: 900;
      retention_days: 14;
    };
  };

  /** Vector index snapshots plus the change log kept between them. */
  vector_index: {
    snapshot: {
      frequency: 'daily';
      time: '03:00 UTC';
      retention_count: 7;
    };
    change_log: {
      frequency: 'continuous';
      retention_hours: 72;
    };
  };

  /** Object-storage settings for the source audio. */
  audio_files: {
    versioning: true;
    cross_region_replication: true;
    glacier_transition_days: 90;
  };

  /** Graph layer online backups. */
  graph: {
    online_backup: {
      frequency: 'every 6 hours';
      retention_count: 28;
    };
  };
}
7.3 Recovery Procedures
/**
 * Recovery runbooks written as annotated pseudo-code.
 *
 * Each procedure is a strictly ordered sequence of awaited steps; the helper
 * methods called on `this` are not defined in this listing.
 */
interface RecoveryProcedures {
  /**
   * Point-in-time recovery.
   *
   * @param targetTime Timestamp the database should be restored to.
   * @returns Outcome of the restore; `success` mirrors the integrity check and
   *          `duration` is the elapsed wall-clock time in milliseconds.
   *
   * If any step throws, the error propagates and maintenance mode is left
   * enabled, because step 7 is only reached on the success path.
   */
  async recoverToPointInTime(targetTime: ISO8601): Promise<RecoveryResult> {
    // Capture the start before any work so `duration` covers the whole procedure.
    const startTime = Date.now();

    // 1. Stop application writes
    await this.enableMaintenanceMode();

    // 2. Identify nearest full backup
    const baseBackup = await this.findBaseBackup(targetTime);

    // 3. Restore base backup
    await this.restoreBaseBackup(baseBackup);

    // 4. Apply WAL logs up to target time
    await this.applyWALTo(targetTime);

    // 5. Rebuild vector index from restored embeddings
    await this.rebuildVectorIndex();

    // 6. Verify data integrity
    const integrity = await this.verifyIntegrity();

    // 7. Resume operations
    await this.disableMaintenanceMode();

    return {
      success: integrity.valid,
      restoredTo: targetTime,
      recordsRecovered: integrity.recordCount,
      duration: Date.now() - startTime
    };
  }

  /**
   * Full disaster recovery into another region.
   *
   * @param targetRegion Region in which the stack is re-provisioned.
   * @returns Result whose `rto` is the elapsed wall-clock time in milliseconds.
   */
  async fullDisasterRecovery(targetRegion: string): Promise<RecoveryResult> {
    // Capture the start so the reported `rto` measures the whole recovery.
    const startTime = Date.now();

    // 1. Provision new infrastructure in target region
    await this.provisionInfrastructure(targetRegion);

    // 2. Restore latest database backup
    const latestBackup = await this.getLatestBackup();
    await this.restoreBackup(latestBackup);

    // 3. Sync audio files from cross-region replica
    await this.syncAudioFiles(targetRegion);

    // 4. Rebuild all indexes
    await this.rebuildAllIndexes();

    // 5. Update DNS/load balancer
    await this.updateRouting(targetRegion);

    return { success: true, region: targetRegion, rto: Date.now() - startTime };
  }
}
7.4 Recovery Objectives
| Metric | Target | Description |
|---|---|---|
| RPO (Recovery Point Objective) | 15 minutes | Maximum acceptable window of data loss |
| RTO (Recovery Time Objective) | 1 hour | Time to restore service |
| RTO (Full DR) | 4 hours | Cross-region disaster recovery |
| Backup Verification | Daily | Automated restore test |
8. Data Validation Rules
8.1 Input Validation
/**
 * Input validation thresholds for recordings, segments, embeddings and
 * coordinates.
 *
 * This is a runtime constant rather than a type-only declaration because
 * DataValidator reads these values while validating.
 */
const ValidationRules = {
  recording: {
    file_format: ['wav', 'flac', 'mp3'],
    sample_rate: { min: 16000, max: 96000, preferred: 32000 },
    channels: { allowed: [1, 2], preferred: 1 },
    duration: { min_seconds: 1, max_seconds: 3600 },
    file_size: { max_mb: 500 },
    required_fields: ['file_path', 'start_ts', 'lat', 'lon']
  },
  segment: {
    duration: { min_ms: 50, max_ms: 30000 },
    snr: { min_db: -10, warn_below: 5 },
    overlap: { warn_above: 0.5 },
    required_fields: ['recording_id', 't0_ms', 't1_ms']
  },
  embedding: {
    dimensions: 1536,
    norm_range: { min: 0.5, max: 2.0 },
    nan_allowed: false,
    inf_allowed: false,
    required_fields: ['segment_id', 'vector', 'model_name']
  },
  coordinates: {
    lat: { min: -90, max: 90 },
    lon: { min: -180, max: 180 },
    precision: 6 // decimal places
  }
} as const;

/** Type view of the rule set, so uses of `ValidationRules` in type position keep working. */
type ValidationRules = typeof ValidationRules;

// Validation implementation
class DataValidator {
  /**
   * Validates a recording's file format, sample rate, coordinates and
   * required fields.
   *
   * @param recording Recording metadata to check.
   * @returns `valid` is true when no errors were collected; a non-preferred
   *          sample rate only produces a warning.
   */
  validateRecording(recording: Recording): ValidationResult {
    const errors: ValidationError[] = [];
    const warnings: ValidationWarning[] = [];
    const rules = ValidationRules.recording;

    // Format check. Skipped when file_path is absent so the required-field
    // check below reports it instead of `.split` throwing.
    if (typeof recording.file_path === 'string') {
      const ext = recording.file_path.split('.').pop()?.toLowerCase();
      const allowedFormats: readonly string[] = rules.file_format;
      if (ext === undefined || !allowedFormats.includes(ext)) {
        errors.push({ field: 'file_path', message: `Invalid format: ${ext}` });
      }
    }

    // Sample rate check
    if (recording.sample_rate !== rules.sample_rate.preferred) {
      warnings.push({
        field: 'sample_rate',
        message: `Non-preferred sample rate: ${recording.sample_rate}Hz`
      });
    }

    // Coordinate validation. Absent values are left to the required-field
    // check; present values must be finite and inside the configured range.
    const { lat: latRange, lon: lonRange } = ValidationRules.coordinates;
    if (recording.lat != null && this.isOutOfRange(recording.lat, latRange.min, latRange.max)) {
      errors.push({ field: 'lat', message: 'Latitude out of range' });
    }
    if (recording.lon != null && this.isOutOfRange(recording.lon, lonRange.min, lonRange.max)) {
      errors.push({ field: 'lon', message: 'Longitude out of range' });
    }

    // Required fields. A plain truthiness test would reject lat/lon of 0
    // (equator / prime meridian), so only null, undefined and '' count as missing.
    for (const field of rules.required_fields) {
      if (this.isMissing(recording[field])) {
        errors.push({ field, message: 'Required field missing' });
      }
    }

    return { valid: errors.length === 0, errors, warnings };
  }

  /**
   * Validates an embedding's dimensionality, numeric sanity and L2 norm.
   *
   * @param embedding Embedding whose `vector` is checked.
   * @returns `valid` is true when no errors were collected; warnings are
   *          always empty.
   */
  validateEmbedding(embedding: Embedding): ValidationResult {
    const errors: ValidationError[] = [];
    const rules = ValidationRules.embedding;

    // Dimension check
    if (embedding.vector.length !== rules.dimensions) {
      errors.push({
        field: 'vector',
        message: `Expected ${rules.dimensions}D, got ${embedding.vector.length}D`
      });
    }

    // NaN/Inf check: report only the first offending component.
    let allFinite = true;
    for (let i = 0; i < embedding.vector.length; i++) {
      const component = embedding.vector[i];
      if (Number.isNaN(component)) {
        errors.push({ field: 'vector', message: `NaN at index ${i}` });
        allFinite = false;
        break;
      }
      if (!Number.isFinite(component)) {
        errors.push({ field: 'vector', message: `Infinity at index ${i}` });
        allFinite = false;
        break;
      }
    }

    // Norm check. Skipped when a NaN/Infinity was already reported, since the
    // norm of such a vector only repeats that failure.
    if (allFinite) {
      const norm = l2Norm(embedding.vector);
      if (norm < rules.norm_range.min || norm > rules.norm_range.max) {
        errors.push({
          field: 'vector',
          message: `Norm ${norm.toFixed(3)} outside expected range`
        });
      }
    }

    return { valid: errors.length === 0, errors, warnings: [] };
  }

  /** True when a required value is null, undefined or an empty string. */
  private isMissing(value: unknown): boolean {
    return value == null || value === '';
  }

  /** True when `value` is not a finite number within [min, max]. */
  private isOutOfRange(value: number, min: number, max: number): boolean {
    return !Number.isFinite(value) || value < min || value > max;
  }
}
8.2 Consistency Checks
/**
 * Cross-store consistency checks, written as annotated pseudo-code.
 *
 * `db` and `hnsw` are provided elsewhere. `db.query` is treated as resolving
 * to an array of row objects keyed by column name, which is how the other
 * checks in this listing already use it (`.length`, `v.overlaps`).
 */
interface ConsistencyChecker {
  /**
   * Run all consistency checks concurrently and summarise them.
   *
   * `checkDuplicateRecordings` and `checkBrokenReferences` are invoked here
   * but are not defined in this listing.
   *
   * @returns Report with every check result; `overallHealth` is 'healthy'
   *          only when every check passed.
   */
  async runChecks(): Promise<ConsistencyReport> {
    const checks = await Promise.all([
      this.checkOrphanedSegments(),
      this.checkMissingEmbeddings(),
      this.checkDuplicateRecordings(),
      this.checkBrokenReferences(),
      this.checkIndexSync(),
      this.checkTemporalConsistency()
    ]);

    return {
      timestamp: new Date().toISOString(),
      checks,
      overallHealth: checks.every(c => c.passed) ? 'healthy' : 'degraded'
    };
  }

  /** Find segments without parent recordings. */
  async checkOrphanedSegments(): Promise<CheckResult> {
    const orphans = await db.query(`
      SELECT cs.id FROM call_segments cs
      LEFT JOIN recordings r ON cs.recording_id = r.id
      WHERE r.id IS NULL
    `);

    return {
      name: 'orphaned_segments',
      passed: orphans.length === 0,
      count: orphans.length,
      action: 'DELETE orphaned segments or restore recordings'
    };
  }

  /** Find segments older than one hour that still have no embedding. */
  async checkMissingEmbeddings(): Promise<CheckResult> {
    const missing = await db.query(`
      SELECT cs.id FROM call_segments cs
      LEFT JOIN embeddings e ON cs.id = e.segment_id
      WHERE e.id IS NULL AND cs.created_at < NOW() - INTERVAL '1 hour'
    `);

    return {
      name: 'missing_embeddings',
      passed: missing.length === 0,
      count: missing.length,
      action: 'Reprocess segments to generate embeddings'
    };
  }

  /** Verify the HNSW index holds the same number of vectors as the hot tier. */
  async checkIndexSync(): Promise<CheckResult> {
    // Allow small discrepancy during updates
    const maxAllowedDrift = 100;

    // The query yields a single row; extract and coerce the count before
    // doing arithmetic (subtracting from the row set itself gives NaN).
    const rows = await db.query(
      `SELECT COUNT(*) AS count FROM embeddings WHERE storage_tier = 'hot'`
    );
    const dbCount = Number(rows[0]?.count ?? 0);
    const indexCount = await hnsw.getVectorCount();
    const diff = Math.abs(dbCount - indexCount);
    const passed = diff < maxAllowedDrift;

    return {
      name: 'index_sync',
      passed,
      count: diff,
      // Derived from `passed` so verdict and remediation agree at the boundary.
      action: passed ? 'None required' : 'Rebuild HNSW index'
    };
  }

  /** Check that segments within a recording do not overlap their successor. */
  async checkTemporalConsistency(): Promise<CheckResult> {
    const violations = await db.query(`
      SELECT recording_id, COUNT(*) as overlaps
      FROM (
        SELECT recording_id, t0_ms, t1_ms,
               LEAD(t0_ms) OVER (PARTITION BY recording_id ORDER BY t0_ms) as next_t0
        FROM call_segments
      ) sub
      WHERE t1_ms > next_t0
      GROUP BY recording_id
    `);

    return {
      name: 'temporal_consistency',
      passed: violations.length === 0,
      // Coerce each count: some drivers return COUNT(*) as a string, which
      // would turn `+` into concatenation.
      count: violations.reduce((sum, v) => sum + Number(v.overlaps), 0),
      action: 'Re-segment recordings with overlapping segments'
    };
  }
}
8.3 Quality Gates
/**
 * Thresholds that data must meet at each stage of the pipeline.
 *
 * This is a runtime constant rather than a type-only declaration because
 * QualityGateEnforcer reads these values.
 */
const QualityGates = {
  // Gate for accepting new recordings
  recording_acceptance: {
    min_quality_score: 0.3,
    min_snr_db: 0,
    max_clipping_ratio: 0.1,
    min_duration_seconds: 5
  },
  // Gate for including in HNSW hot tier
  hot_tier_eligibility: {
    embedding_norm_range: [0.8, 1.2],
    segmentation_confidence: 0.7,
    no_nan_values: true
  },
  // Gate for species identification
  species_id_confidence: {
    auto_accept_threshold: 0.9,
    human_review_threshold: 0.5,
    reject_below: 0.3
  },
  // Gate for cluster assignment
  cluster_assignment: {
    min_confidence: 0.6,
    max_distance_to_centroid: 0.5
  }
} as const;

/** Type view of the gate set, so uses of `QualityGates` in type position keep working. */
type QualityGates = typeof QualityGates;

// Quality gate enforcement
class QualityGateEnforcer {
  /**
   * Applies the recording-acceptance gate.
   *
   * @param recording Recording to evaluate.
   * @returns `passed` is true when no threshold was violated; `failures`
   *          lists each violated threshold and `action` is 'quarantine' on
   *          any failure, otherwise 'proceed'.
   */
  async enforceRecordingGate(recording: Recording): Promise<GateResult> {
    const gates = QualityGates.recording_acceptance;
    const failures: string[] = [];

    if (recording.quality_score < gates.min_quality_score) {
      failures.push(`Quality score ${recording.quality_score} < ${gates.min_quality_score}`);
    }

    // The gate is expressed in seconds; Recording stores duration in milliseconds.
    const durationSeconds = recording.duration_ms / 1000;
    if (durationSeconds < gates.min_duration_seconds) {
      failures.push(`Duration ${durationSeconds}s < ${gates.min_duration_seconds}s`);
    }

    // Additional checks...

    const passed = failures.length === 0;
    return {
      passed,
      failures,
      action: passed ? 'proceed' : 'quarantine'
    };
  }
}
Consequences
Positive
- Performance: Tiered storage achieves 150x-12,500x search improvement via HNSW with graceful degradation to warm/cold tiers
- Scalability: Architecture supports 100M+ vectors with sub-second queries
- Flexibility: Graph relationships enable complex Cypher queries for motif and sequence analysis
- Data Quality: Comprehensive validation prevents corrupt data from entering the system
- Recoverability: RPO of 15 minutes and RTO of 1 hour meet operational requirements
- Cost Efficiency: 32x compression for cold tier dramatically reduces storage costs
Negative
- Complexity: Three-tier storage adds operational overhead
- Latency Variability: Cold tier queries are 100-1000x slower than hot tier
- Migration Risk: Quantization introduces small accuracy loss (~2-5%)
- Storage Duplication: Multiple tiers may temporarily hold same data during transitions
Mitigations
- Automated tiering policies minimize manual intervention
- Warm tier serves as buffer, ensuring graceful degradation
- Calibrated quantization preserves retrieval quality above 95%
- Background jobs clean up duplicate data after successful tier migration
References
- Perch 2.0: The Bittern Lesson for Bioacoustics
- RuVector: A Database that Autonomously Learns
- HNSW: Hierarchical Navigable Small World Graphs
- Product Quantization for Nearest Neighbor Search
- Poincaré Embeddings for Learning Hierarchical Representations
Document Version: 1.0 | Last Updated: 2026-01-15 | Next Review: 2026-04-15