# Sparse Vectors Integration Plan
## Overview
Integrate sparse vector support into PostgreSQL for efficient storage and search of high-dimensional sparse embeddings (BM25, SPLADE, learned sparse representations).
## Architecture
```
┌─────────────────────────────────────────────────────────────────┐
│                      PostgreSQL Extension                       │
├─────────────────────────────────────────────────────────────────┤
│  ┌─────────────────────────────────────────────────────────┐    │
│  │                   Sparse Vector Type                    │    │
│  │  ┌──────────────┐ ┌──────────────┐ ┌──────────────┐     │    │
│  │  │  COO Format  │ │  CSR Format  │ │  Dictionary  │     │    │
│  │  │  (indices,   │ │  (sorted,    │ │  (hash-based │     │    │
│  │  │   values)    │ │   compact)   │ │   lookup)    │     │    │
│  │  └──────┬───────┘ └──────┬───────┘ └──────┬───────┘     │    │
│  └─────────┼────────────────┼────────────────┼─────────────┘    │
│            └────────────────┴────────────────┘                  │
│                             ▼                                   │
│               ┌───────────────────────────┐                     │
│               │   Sparse Distance Funcs   │                     │
│               │    (Dot, Cosine, BM25)    │                     │
│               └───────────────────────────┘                     │
└─────────────────────────────────────────────────────────────────┘
```
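
The three layouts at the top of the diagram trade insert flexibility against scan speed: COO holds one vector as parallel index-sorted arrays, CSR packs a batch of vectors into shared buffers behind a row-pointer array, and the dictionary variant would be a hash map from index to value for O(1) point lookups. A minimal sketch of the first two (illustrative structs only; the real definitions live in `coo.rs`/`csr.rs` per the module structure below):
```rust
/// COO (coordinate) layout: one vector as parallel, index-sorted arrays.
/// Simple to build incrementally; this mirrors the SparseVec type below.
pub struct CooVector {
    pub indices: Vec<u32>, // sorted term ids
    pub values: Vec<f32>,  // weights, same length as `indices`
    pub dim: u32,
}

/// CSR (compressed sparse row) layout: a batch of vectors in contiguous
/// buffers; row r occupies indices[row_ptr[r]..row_ptr[r + 1]].
/// Compact and cache-friendly for bulk scans, but append-only in practice.
pub struct CsrBatch {
    pub row_ptr: Vec<usize>, // len = number of rows + 1
    pub indices: Vec<u32>,
    pub values: Vec<f32>,
    pub dim: u32,
}
```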
## Module Structure
```
src/
├── sparse/
│   ├── mod.rs             # Module exports
│   ├── types/
│   │   ├── sparsevec.rs   # Core sparse vector type
│   │   ├── coo.rs         # COO format (coordinate)
│   │   └── csr.rs         # CSR format (compressed sparse row)
│   ├── distance.rs        # Sparse distance functions
│   ├── index/
│   │   ├── inverted.rs    # Inverted index for sparse search
│   │   └── sparse_hnsw.rs # HNSW adapted for sparse vectors
│   ├── hybrid.rs          # Dense + sparse hybrid search
│   └── operators.rs       # SQL operators
```
## SQL Interface
### Sparse Vector Type
```sql
-- Create table with sparse vectors
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    content TEXT,
    dense_embedding vector(768),
    sparse_embedding sparsevec(30000),  -- BM25 or SPLADE
    metadata jsonb
);

-- Insert sparse vector (indices:values format)
INSERT INTO documents (content, sparse_embedding)
VALUES (
    'Machine learning for natural language processing',
    '{1024:0.5, 2048:0.3, 4096:0.8, 15000:0.2}'::sparsevec
);

-- Insert from array representation
INSERT INTO documents (sparse_embedding)
VALUES (ruvector_to_sparse(
    indices := ARRAY[1024, 2048, 4096, 15000],
    values  := ARRAY[0.5, 0.3, 0.8, 0.2],
    dim     := 30000
));
```
### Distance Operations
```sql
-- Sparse dot product (inner product similarity)
SELECT id, content,
       ruvector_sparse_dot(sparse_embedding, query_sparse) AS score
FROM documents
ORDER BY score DESC
LIMIT 10;

-- Sparse cosine similarity
SELECT id,
       ruvector_sparse_cosine(sparse_embedding, query_sparse) AS similarity
FROM documents
WHERE ruvector_sparse_cosine(sparse_embedding, query_sparse) > 0.5;

-- Custom operator: <#> for sparse inner product
SELECT * FROM documents
ORDER BY sparse_embedding <#> query_sparse DESC
LIMIT 10;
```
### Sparse Index
```sql
-- Create inverted index for sparse vectors
CREATE INDEX ON documents USING ruvector_sparse (
    sparse_embedding
) WITH (
    pruning_threshold = 0.1,  -- Prune low-weight terms
    quantization = 'int8'     -- Optional quantization
);

-- Approximate sparse search
SELECT * FROM documents
ORDER BY sparse_embedding <#> query_sparse DESC
LIMIT 10;
```
### Hybrid Dense + Sparse Search
```sql
-- Hybrid search combining dense and sparse
SELECT id, content,
       0.7 * (1 - (dense_embedding <=> query_dense)) +
       0.3 * ruvector_sparse_dot(sparse_embedding, query_sparse) AS hybrid_score
FROM documents
ORDER BY hybrid_score DESC
LIMIT 10;

-- Built-in hybrid search function
SELECT * FROM ruvector_hybrid_search(
    table_name    := 'documents',
    dense_column  := 'dense_embedding',
    sparse_column := 'sparse_embedding',
    dense_query   := query_dense,
    sparse_query  := query_sparse,
    dense_weight  := 0.7,
    sparse_weight := 0.3,
    k := 10
);
```
## Implementation Phases
### Phase 1: Sparse Vector Type (Week 1-2)
```rust
// src/sparse/types/sparsevec.rs
use pgrx::prelude::*;
use serde::{Deserialize, Serialize};

/// Errors produced by sparse vector construction and parsing
#[derive(Debug)]
pub enum SparseError {
    LengthMismatch,
    IndexOutOfBounds,
    ParseError,
}

/// Sparse vector stored as sorted (index, value) pairs.
/// Fields are crate-visible so distance.rs and the index modules
/// can merge-join directly over the raw arrays.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SparseVec {
    pub(crate) indices: Vec<u32>,
    pub(crate) values: Vec<f32>,
    pub(crate) dim: u32,
}

impl SparseVec {
    pub fn new(indices: Vec<u32>, values: Vec<f32>, dim: u32) -> Result<Self, SparseError> {
        if indices.len() != values.len() {
            return Err(SparseError::LengthMismatch);
        }
        // Ensure sorted and unique; dedup only removes adjacent duplicates,
        // so it must run after the sort (the first occurrence wins)
        let mut pairs: Vec<_> = indices.into_iter().zip(values).collect();
        pairs.sort_by_key(|(i, _)| *i);
        pairs.dedup_by_key(|(i, _)| *i);
        let (indices, values): (Vec<_>, Vec<_>) = pairs.into_iter().unzip();
        if indices.last().map_or(false, |&i| i >= dim) {
            return Err(SparseError::IndexOutOfBounds);
        }
        Ok(Self { indices, values, dim })
    }

    /// Number of non-zero elements
    #[inline]
    pub fn nnz(&self) -> usize {
        self.indices.len()
    }

    /// Get value at index (O(log n) binary search)
    pub fn get(&self, index: u32) -> f32 {
        match self.indices.binary_search(&index) {
            Ok(pos) => self.values[pos],
            Err(_) => 0.0,
        }
    }

    /// Iterate over non-zero elements
    pub fn iter(&self) -> impl Iterator<Item = (u32, f32)> + '_ {
        self.indices.iter().copied().zip(self.values.iter().copied())
    }

    /// L2 norm
    pub fn norm(&self) -> f32 {
        self.values.iter().map(|&v| v * v).sum::<f32>().sqrt()
    }

    /// Prune elements below threshold
    pub fn prune(&mut self, threshold: f32) {
        let pairs: Vec<_> = self
            .indices
            .iter()
            .copied()
            .zip(self.values.iter().copied())
            .filter(|(_, v)| v.abs() >= threshold)
            .collect();
        self.indices = pairs.iter().map(|(i, _)| *i).collect();
        self.values = pairs.iter().map(|(_, v)| *v).collect();
    }

    /// Top-k sparsification: keep the k largest-magnitude elements
    pub fn top_k(&self, k: usize) -> SparseVec {
        let mut indexed: Vec<_> = self
            .indices
            .iter()
            .copied()
            .zip(self.values.iter().copied())
            .collect();
        indexed.sort_by(|(_, a), (_, b)| b.abs().partial_cmp(&a.abs()).unwrap());
        indexed.truncate(k);
        indexed.sort_by_key(|(i, _)| *i); // restore index order
        let (indices, values): (Vec<_>, Vec<_>) = indexed.into_iter().unzip();
        SparseVec { indices, values, dim: self.dim }
    }
}

// PostgreSQL type registration
#[derive(PostgresType, Serialize, Deserialize)]
#[pgrx(sql = "CREATE TYPE sparsevec")]
pub struct PgSparseVec(pub SparseVec);

impl FromDatum for PgSparseVec {
    // ... TOAST-aware deserialization
}

impl IntoDatum for PgSparseVec {
    // ... serialization
}

// Parse from string: '{1:0.5, 2:0.3}'
// Note: the dimension is inferred as max_index + 1; a cast with an explicit
// typmod (e.g. sparsevec(30000)) is expected to widen it afterwards.
impl std::str::FromStr for SparseVec {
    type Err = SparseError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.trim().trim_start_matches('{').trim_end_matches('}');
        let mut indices = Vec::new();
        let mut values = Vec::new();
        let mut max_index = 0u32;
        for pair in s.split(',') {
            let parts: Vec<_> = pair.trim().split(':').collect();
            if parts.len() != 2 {
                return Err(SparseError::ParseError);
            }
            let idx: u32 = parts[0].trim().parse().map_err(|_| SparseError::ParseError)?;
            let val: f32 = parts[1].trim().parse().map_err(|_| SparseError::ParseError)?;
            indices.push(idx);
            values.push(val);
            max_index = max_index.max(idx);
        }
        SparseVec::new(indices, values, max_index + 1)
    }
}
```
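
As a sanity check on these semantics, a small test (hypothetical, not part of the plan) might assert that `new` sorts and deduplicates, `get` returns 0.0 for absent indices, and `top_k` keeps the largest-magnitude weights:
```rust
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sparsevec_basics() {
        // Out-of-order, duplicate input is normalized by `new`
        // (the first occurrence of a duplicate index wins)
        let v = SparseVec::new(vec![9, 2, 2], vec![0.1, 0.5, 0.7], 16).unwrap();
        assert_eq!(v.nnz(), 2);    // duplicate index 2 collapsed
        assert_eq!(v.get(2), 0.5); // first duplicate kept
        assert_eq!(v.get(9), 0.1); // present index
        assert_eq!(v.get(3), 0.0); // absent index reads as zero

        // top_k keeps the largest |value| entries, back in index order
        let top = v.top_k(1);
        assert_eq!(top.nnz(), 1);
        assert!(top.get(2) > 0.0);
    }
}
```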
### Phase 2: Sparse Distance Functions (Week 3-4)
```rust
// src/sparse/distance.rs
use pgrx::prelude::*;

use crate::sparse::types::sparsevec::{PgSparseVec, SparseVec}; // per the module layout above

/// Sparse dot product (inner product).
/// Merge-join over the two sorted index lists, so only shared non-zero
/// indices contribute; cost is O(nnz_a + nnz_b).
pub fn sparse_dot(a: &SparseVec, b: &SparseVec) -> f32 {
    let mut result = 0.0;
    let (mut i, mut j) = (0, 0);
    while i < a.indices.len() && j < b.indices.len() {
        match a.indices[i].cmp(&b.indices[j]) {
            std::cmp::Ordering::Less => i += 1,
            std::cmp::Ordering::Greater => j += 1,
            std::cmp::Ordering::Equal => {
                result += a.values[i] * b.values[j];
                i += 1;
                j += 1;
            }
        }
    }
    result
}

/// Sparse cosine similarity
pub fn sparse_cosine(a: &SparseVec, b: &SparseVec) -> f32 {
    let dot = sparse_dot(a, b);
    let norm_a = a.norm();
    let norm_b = b.norm();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}

/// Sparse Euclidean distance. Unlike the dot product, indices present
/// in only one vector still contribute (the other side is zero).
pub fn sparse_euclidean(a: &SparseVec, b: &SparseVec) -> f32 {
    let mut result = 0.0;
    let (mut i, mut j) = (0, 0);
    while i < a.indices.len() || j < b.indices.len() {
        // u32::MAX acts as a sentinel once one side is exhausted
        // (real indices are always < dim <= u32::MAX)
        let idx_a = a.indices.get(i).copied().unwrap_or(u32::MAX);
        let idx_b = b.indices.get(j).copied().unwrap_or(u32::MAX);
        match idx_a.cmp(&idx_b) {
            std::cmp::Ordering::Less => {
                result += a.values[i] * a.values[i];
                i += 1;
            }
            std::cmp::Ordering::Greater => {
                result += b.values[j] * b.values[j];
                j += 1;
            }
            std::cmp::Ordering::Equal => {
                let diff = a.values[i] - b.values[j];
                result += diff * diff;
                i += 1;
                j += 1;
            }
        }
    }
    result.sqrt()
}

/// BM25 scoring for sparse term vectors
pub fn sparse_bm25(
    query: &SparseVec,
    doc: &SparseVec,
    doc_len: f32,
    avg_doc_len: f32,
    k1: f32,
    b: f32,
) -> f32 {
    let mut score = 0.0;
    let (mut i, mut j) = (0, 0);
    while i < query.indices.len() && j < doc.indices.len() {
        match query.indices[i].cmp(&doc.indices[j]) {
            std::cmp::Ordering::Less => i += 1,
            std::cmp::Ordering::Greater => j += 1,
            std::cmp::Ordering::Equal => {
                let idf = query.values[i]; // assume query values are IDF weights
                let tf = doc.values[j];    // doc values are term frequencies
                let numerator = tf * (k1 + 1.0);
                let denominator = tf + k1 * (1.0 - b + b * doc_len / avg_doc_len);
                score += idf * numerator / denominator;
                i += 1;
                j += 1;
            }
        }
    }
    score
}

// PostgreSQL functions
#[pg_extern(immutable, parallel_safe)]
fn ruvector_sparse_dot(a: PgSparseVec, b: PgSparseVec) -> f32 {
    sparse_dot(&a.0, &b.0)
}

#[pg_extern(immutable, parallel_safe)]
fn ruvector_sparse_cosine(a: PgSparseVec, b: PgSparseVec) -> f32 {
    sparse_cosine(&a.0, &b.0)
}

#[pg_extern(immutable, parallel_safe)]
fn ruvector_sparse_euclidean(a: PgSparseVec, b: PgSparseVec) -> f32 {
    sparse_euclidean(&a.0, &b.0)
}
```
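
To make the merge-join behavior concrete, here is a small illustrative check (hypothetical values): only the shared index contributes to the dot product, while Euclidean distance also counts one-sided entries:
```rust
// a = {3: 1.0, 7: 0.4}, b = {7: 0.5, 9: 2.0}; only index 7 is shared
fn demo_sparse_distances() {
    let a = SparseVec::new(vec![3, 7], vec![1.0, 0.4], 10).unwrap();
    let b = SparseVec::new(vec![7, 9], vec![0.5, 2.0], 10).unwrap();

    // Dot product sees only the shared index: 0.4 * 0.5 = 0.2
    assert!((sparse_dot(&a, &b) - 0.2).abs() < 1e-6);

    // Euclidean also counts one-sided entries:
    // sqrt(1.0^2 + (0.4 - 0.5)^2 + 2.0^2) = sqrt(5.01)
    let d = sparse_euclidean(&a, &b);
    assert!((d - 5.01f32.sqrt()).abs() < 1e-4);
}
```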
### Phase 3: Inverted Index (Week 5-7)
```rust
// src/sparse/index/inverted.rs
use std::collections::{BinaryHeap, HashMap};

use dashmap::DashMap;
use ordered_float::OrderedFloat;
use pgrx::prelude::*;

use crate::sparse::types::sparsevec::SparseVec;

/// Inverted index for efficient sparse vector search
pub struct InvertedIndex {
    /// term_id -> [(doc_id, weight), ...]
    postings: DashMap<u32, Vec<(u64, f32)>>,
    /// doc_id -> sparse vector (for re-ranking)
    documents: DashMap<u64, SparseVec>,
    /// Document norms for cosine similarity
    doc_norms: DashMap<u64, f32>,
    /// Configuration
    config: InvertedIndexConfig,
}

pub struct InvertedIndexConfig {
    pub pruning_threshold: f32,
    pub max_postings_per_term: usize,
    pub quantization: Option<Quantization>,
}

impl InvertedIndex {
    pub fn new(config: InvertedIndexConfig) -> Self {
        Self {
            postings: DashMap::new(),
            documents: DashMap::new(),
            doc_norms: DashMap::new(),
            config,
        }
    }

    /// Insert document into index
    pub fn insert(&self, doc_id: u64, vector: SparseVec) {
        let norm = vector.norm();
        // Index each non-zero term above the pruning threshold
        for (term_id, weight) in vector.iter() {
            if weight.abs() < self.config.pruning_threshold {
                continue;
            }
            self.postings
                .entry(term_id)
                .or_insert_with(Vec::new)
                .push((doc_id, weight));
        }
        self.doc_norms.insert(doc_id, norm);
        self.documents.insert(doc_id, vector);
    }

    /// Exact top-k search: exhaustive term-at-a-time scoring over the
    /// postings of every query term
    pub fn search(&self, query: &SparseVec, k: usize) -> Vec<(u64, f32)> {
        // Accumulate dot-product contributions per candidate document
        let mut doc_scores: HashMap<u64, f32> = HashMap::new();
        for (term_id, query_weight) in query.iter() {
            if let Some(postings) = self.postings.get(&term_id) {
                for &(doc_id, doc_weight) in postings.iter() {
                    *doc_scores.entry(doc_id).or_insert(0.0) += query_weight * doc_weight;
                }
            }
        }
        // Get top-k
        let mut results: Vec<_> = doc_scores.into_iter().collect();
        results.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
        results.truncate(k);
        results
    }

    /// WAND (Weak AND) algorithm for efficient top-k retrieval
    pub fn search_wand(&self, query: &SparseVec, k: usize) -> Vec<(u64, f32)> {
        // Sort query terms by max contribution (upper bound)
        let mut term_info: Vec<_> = query
            .iter()
            .filter_map(|(term_id, weight)| {
                self.postings.get(&term_id).map(|p| {
                    let max_doc_weight = p.iter().map(|(_, w)| *w).fold(0.0f32, f32::max);
                    (term_id, weight, max_doc_weight * weight)
                })
            })
            .collect();
        term_info.sort_by(|(_, _, a), (_, _, b)| b.partial_cmp(a).unwrap());

        // WAND traversal: skip any document whose summed upper bound
        // cannot beat the current k-th best score
        let mut heap: BinaryHeap<(OrderedFloat<f32>, u64)> = BinaryHeap::new();
        let threshold = 0.0f32;
        // ... WAND implementation
        heap.into_iter().map(|(s, id)| (id, s.0)).collect()
    }
}

// PostgreSQL index access method
#[pg_extern]
fn ruvector_sparse_handler(internal: Internal) -> Internal {
    // Index AM handler for sparse inverted index
    todo!("wire up the index AM routine for the inverted index")
}
```
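
A hypothetical usage sketch of the index above (names as defined in this phase; `Quantization` assumed to be in scope): index two documents, then retrieve the better match for a single-term query.
```rust
fn demo_inverted_index() {
    let index = InvertedIndex::new(InvertedIndexConfig {
        pruning_threshold: 0.05,
        max_postings_per_term: 10_000,
        quantization: None,
    });

    index.insert(1, SparseVec::new(vec![10, 42], vec![0.9, 0.3], 30_000).unwrap());
    index.insert(2, SparseVec::new(vec![42, 99], vec![0.8, 0.6], 30_000).unwrap());

    // The query hits term 42 in both docs; doc 2 has the larger weight there
    let query = SparseVec::new(vec![42], vec![1.0], 30_000).unwrap();
    let hits = index.search(&query, 10);
    assert_eq!(hits[0].0, 2); // 1.0 * 0.8 beats 1.0 * 0.3
}
```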
### Phase 4: Hybrid Search (Week 8-9)
```rust
// src/sparse/hybrid.rs
use std::collections::HashMap;

use pgrx::prelude::*;

use crate::sparse::types::sparsevec::PgSparseVec;

/// Hybrid dense + sparse search
pub struct HybridSearch {
    dense_weight: f32,
    sparse_weight: f32,
    fusion_method: FusionMethod,
}

pub enum FusionMethod {
    /// Linear combination of scores
    Linear,
    /// Reciprocal Rank Fusion
    RRF { k: f32 },
    /// Learned fusion weights
    Learned { model: FusionModel },
}

impl HybridSearch {
    /// Combine dense and sparse results
    pub fn search(
        &self,
        dense_results: &[(u64, f32)],
        sparse_results: &[(u64, f32)],
        k: usize,
    ) -> Vec<(u64, f32)> {
        match &self.fusion_method {
            FusionMethod::Linear => self.linear_fusion(dense_results, sparse_results, k),
            FusionMethod::RRF { k: rrf_k } => {
                self.rrf_fusion(dense_results, sparse_results, k, *rrf_k)
            }
            FusionMethod::Learned { model } => model.fuse(dense_results, sparse_results, k),
        }
    }

    fn linear_fusion(
        &self,
        dense: &[(u64, f32)],
        sparse: &[(u64, f32)],
        k: usize,
    ) -> Vec<(u64, f32)> {
        let mut scores: HashMap<u64, f32> = HashMap::new();
        // Normalize dense scores to [0, 1]
        let dense_max = dense.iter().map(|(_, s)| *s).fold(0.0f32, f32::max);
        for (id, score) in dense {
            let normalized = if dense_max > 0.0 { score / dense_max } else { 0.0 };
            *scores.entry(*id).or_insert(0.0) += self.dense_weight * normalized;
        }
        // Normalize sparse scores to [0, 1]
        let sparse_max = sparse.iter().map(|(_, s)| *s).fold(0.0f32, f32::max);
        for (id, score) in sparse {
            let normalized = if sparse_max > 0.0 { score / sparse_max } else { 0.0 };
            *scores.entry(*id).or_insert(0.0) += self.sparse_weight * normalized;
        }
        let mut results: Vec<_> = scores.into_iter().collect();
        results.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
        results.truncate(k);
        results
    }

    fn rrf_fusion(
        &self,
        dense: &[(u64, f32)],
        sparse: &[(u64, f32)],
        k: usize,
        rrf_k: f32,
    ) -> Vec<(u64, f32)> {
        let mut scores: HashMap<u64, f32> = HashMap::new();
        // RRF contribution: weight / (rrf_k + rank), using 1-based ranks
        for (rank, (id, _)) in dense.iter().enumerate() {
            *scores.entry(*id).or_insert(0.0) += self.dense_weight / (rrf_k + rank as f32 + 1.0);
        }
        for (rank, (id, _)) in sparse.iter().enumerate() {
            *scores.entry(*id).or_insert(0.0) += self.sparse_weight / (rrf_k + rank as f32 + 1.0);
        }
        let mut results: Vec<_> = scores.into_iter().collect();
        results.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
        results.truncate(k);
        results
    }
}

#[pg_extern]
fn ruvector_hybrid_search(
    table_name: &str,
    dense_column: &str,
    sparse_column: &str,
    dense_query: Vec<f32>,
    sparse_query: PgSparseVec,
    dense_weight: default!(f32, 0.7),
    sparse_weight: default!(f32, 0.3),
    k: default!(i32, 10),
    fusion: default!(&str, "'linear'"),
) -> TableIterator<'static, (name!(id, i64), name!(score, f32))> {
    // Implementation using SPI: run both searches, fuse, return top-k
    todo!()
}
```
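
A small worked example of the RRF path (illustrative only, and written as if inside `hybrid.rs`, since the struct fields are private). Note that RRF ignores the raw scores entirely and uses only the ranks:
```rust
fn demo_rrf() {
    let fusion = HybridSearch {
        dense_weight: 0.5,
        sparse_weight: 0.5,
        fusion_method: FusionMethod::RRF { k: 60.0 },
    };

    // Doc 7 ranks first in both lists, doc 9 ranks second in both
    let dense = [(7u64, 0.92f32), (9, 0.80)];
    let sparse = [(7u64, 14.0f32), (9, 11.0)];

    let fused = fusion.search(&dense, &sparse, 2);
    // doc 7: 0.5/61 + 0.5/61 ~= 0.0164; doc 9: 0.5/62 + 0.5/62 ~= 0.0161
    assert_eq!(fused[0].0, 7);
}
```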
### Phase 5: SPLADE Integration (Week 10)
```rust
// src/sparse/splade.rs
use pgrx::prelude::*;

use crate::sparse::types::sparsevec::{PgSparseVec, SparseVec};

/// SPLADE-style learned sparse representations
pub struct SpladeEncoder {
    /// Vocab size for term indices
    vocab_size: usize,
    /// Sparsity threshold
    threshold: f32,
}

impl SpladeEncoder {
    /// Convert per-vocabulary logits to a SPLADE-style sparse vector
    /// (encoding is typically done externally; we support post-processing)
    pub fn sparsify(&self, logits: &[f32]) -> SparseVec {
        let mut indices = Vec::new();
        let mut values = Vec::new();
        for (i, &logit) in logits.iter().enumerate() {
            // ReLU + log(1 + x) activation, as in SPLADE
            if logit > 0.0 {
                let value = (1.0 + logit).ln();
                if value > self.threshold {
                    indices.push(i as u32);
                    values.push(value);
                }
            }
        }
        // Safe to unwrap: indices are strictly increasing and < vocab_size
        SparseVec::new(indices, values, self.vocab_size as u32).unwrap()
    }
}

#[pg_extern]
fn ruvector_to_sparse(indices: Vec<i32>, values: Vec<f32>, dim: i32) -> PgSparseVec {
    let indices: Vec<u32> = indices.into_iter().map(|i| i as u32).collect();
    PgSparseVec(SparseVec::new(indices, values, dim as u32).unwrap())
}

#[pg_extern]
fn ruvector_sparse_top_k(sparse: PgSparseVec, k: i32) -> PgSparseVec {
    PgSparseVec(sparse.0.top_k(k as usize))
}

#[pg_extern]
fn ruvector_sparse_prune(sparse: PgSparseVec, threshold: f32) -> PgSparseVec {
    let mut result = sparse.0.clone();
    result.prune(threshold);
    PgSparseVec(result)
}
```
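
An illustrative check of the activation (again written as if module-internal, since the encoder fields are private): only positive logits survive, and each kept value is ln(1 + logit).
```rust
fn demo_sparsify() {
    let encoder = SpladeEncoder { vocab_size: 4, threshold: 0.1 };

    // Logits for vocab ids 0..4; only ids 2 and 3 are positive
    let sparse = encoder.sparsify(&[-0.5, 0.0, 1.0, 3.0]);

    assert_eq!(sparse.nnz(), 2);
    assert!((sparse.get(2) - 2.0f32.ln()).abs() < 1e-6); // ln(1 + 1.0)
    assert!((sparse.get(3) - 4.0f32.ln()).abs() < 1e-6); // ln(1 + 3.0)
}
```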
## Benchmarks
| Operation | NNZ (query) | NNZ (doc) | Dim | Time (μs) |
|-----------|-------------|-----------|-----|-----------|
| Dot Product | 100 | 100 | 30K | 0.8 |
| Cosine | 100 | 100 | 30K | 1.2 |
| Inverted Search | 100 | - | 30K | 450 |
| Hybrid Search | 100 | 768 | 30K | 1200 |

For the hybrid row, 768 refers to the dense embedding dimension from the schema above rather than a document NNZ.
## Dependencies
```toml
[dependencies]
# Concurrent collections
dashmap = "6.0"
# Ordered floats for heaps
ordered-float = "4.2"
# Serialization
serde = { version = "1.0", features = ["derive"] }
bincode = "2.0.0-rc.3"
```
## Feature Flags
```toml
[features]
sparse = []
sparse-inverted = ["sparse"]
sparse-hybrid = ["sparse"]
sparse-all = ["sparse-inverted", "sparse-hybrid"]
```