Files
wifi-densepose/crates/ruvector-postgres/docs/integration-plans/05-sparse-vectors.md
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

21 KiB

Sparse Vectors Integration Plan

Overview

Integrate sparse vector support into PostgreSQL for efficient storage and search of high-dimensional sparse embeddings (BM25, SPLADE, learned sparse representations).

Architecture

┌─────────────────────────────────────────────────────────────────┐
│                     PostgreSQL Extension                         │
├─────────────────────────────────────────────────────────────────┤
│  ┌─────────────────────────────────────────────────────────┐    │
│  │                  Sparse Vector Type                      │    │
│  │  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐   │    │
│  │  │  COO Format  │  │  CSR Format  │  │  Dictionary  │   │    │
│  │  │  (indices,   │  │  (sorted,    │  │  (hash-based │   │    │
│  │  │   values)    │  │   compact)   │  │   lookup)    │   │    │
│  │  └──────┬───────┘  └──────┬───────┘  └──────┬───────┘   │    │
│  └─────────┼─────────────────┼─────────────────┼───────────┘    │
│            └─────────────────┴─────────────────┘                │
│                              ▼                                   │
│              ┌───────────────────────────┐                       │
│              │   Sparse Distance Funcs   │                       │
│              │   (Dot, Cosine, BM25)     │                       │
│              └───────────────────────────┘                       │
└─────────────────────────────────────────────────────────────────┘

Module Structure

src/
├── sparse/
│   ├── mod.rs              # Module exports
│   ├── types/
│   │   ├── sparsevec.rs    # Core sparse vector type
│   │   ├── coo.rs          # COO format (coordinate)
│   │   └── csr.rs          # CSR format (compressed sparse row)
│   ├── distance.rs         # Sparse distance functions
│   ├── index/
│   │   ├── inverted.rs     # Inverted index for sparse search
│   │   └── sparse_hnsw.rs  # HNSW adapted for sparse vectors
│   ├── hybrid.rs           # Dense + sparse hybrid search
│   └── operators.rs        # SQL operators

SQL Interface

Sparse Vector Type

-- Create table with sparse vectors
-- (dense and sparse embeddings stored side by side for hybrid retrieval)
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    content TEXT,
    dense_embedding vector(768),        -- dense semantic embedding
    sparse_embedding sparsevec(30000),  -- BM25 or SPLADE
    metadata jsonb
);

-- Insert sparse vector (indices:values format)
INSERT INTO documents (content, sparse_embedding)
VALUES (
    'Machine learning for natural language processing',
    '{1024:0.5, 2048:0.3, 4096:0.8, 15000:0.2}'::sparsevec
);

-- Insert from array representation
-- (indices and values are parallel arrays; dim is the vocabulary size)
INSERT INTO documents (sparse_embedding)
VALUES (ruvector_to_sparse(
    indices := ARRAY[1024, 2048, 4096, 15000],
    values := ARRAY[0.5, 0.3, 0.8, 0.2],
    dim := 30000
));

Distance Operations

-- Sparse dot product (inner product similarity)
-- (sequential scan unless a sparse index exists; higher score = more similar)
SELECT id, content,
       ruvector_sparse_dot(sparse_embedding, query_sparse) AS score
FROM documents
ORDER BY score DESC
LIMIT 10;

-- Sparse cosine similarity
-- NOTE(review): the function appears in both SELECT and WHERE; confirm the
-- planner collapses it to a single evaluation per row.
SELECT id,
       ruvector_sparse_cosine(sparse_embedding, query_sparse) AS similarity
FROM documents
WHERE ruvector_sparse_cosine(sparse_embedding, query_sparse) > 0.5;

-- Custom operator: <#> for sparse inner product
-- NOTE(review): pgvector's <#> returns *negative* inner product (sorted ASC);
-- confirm this operator's sign convention, since DESC is used here.
SELECT * FROM documents
ORDER BY sparse_embedding <#> query_sparse DESC
LIMIT 10;

Sparse Index

-- Create inverted index for sparse vectors
CREATE INDEX ON documents USING ruvector_sparse (
    sparse_embedding sparsevec(30000)
) WITH (
    pruning_threshold = 0.1,  -- Prune low-weight terms
    quantization = 'int8'     -- Optional quantization
);

-- Approximate sparse search
-- (the planner can now satisfy the ORDER BY via the index)
-- NOTE(review): the earlier <#> example ordered DESC; these must agree once
-- the operator's sign convention is fixed.
SELECT * FROM documents
ORDER BY sparse_embedding <#> query_sparse
LIMIT 10;
-- Hybrid search combining dense and sparse
-- (weights 0.7/0.3; dense <=> distance converted to similarity via 1 - d)
SELECT id, content,
       0.7 * (1 - (dense_embedding <=> query_dense)) +
       0.3 * ruvector_sparse_dot(sparse_embedding, query_sparse) AS hybrid_score
FROM documents
ORDER BY hybrid_score DESC
LIMIT 10;

-- Built-in hybrid search function
-- (named-argument call; argument names must match the function signature
-- declared by ruvector_hybrid_search)
SELECT * FROM ruvector_hybrid_search(
    table_name := 'documents',
    dense_column := 'dense_embedding',
    sparse_column := 'sparse_embedding',
    dense_query := query_dense,
    sparse_query := query_sparse,
    dense_weight := 0.7,
    sparse_weight := 0.3,
    k := 10
);

Implementation Phases

Phase 1: Sparse Vector Type (Week 1-2)

// src/sparse/types/sparsevec.rs

use pgrx::prelude::*;
use serde::{Serialize, Deserialize};

/// Sparse vector stored as sorted (index, value) pairs
///
/// Invariant (established by `new`): `indices` is strictly increasing,
/// `indices.len() == values.len()`, and every index is < `dim`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SparseVec {
    indices: Vec<u32>, // sorted, unique feature/term ids
    values: Vec<f32>,  // weight for the matching entry in `indices`
    dim: u32,          // logical dimensionality (exclusive upper bound on indices)
}

impl SparseVec {
    /// Build a sparse vector from parallel index/value arrays.
    ///
    /// Entries are canonicalized: sorted by index, duplicate indices
    /// collapsed to the first-supplied value. Errors if the arrays differ
    /// in length or any index is >= `dim`.
    pub fn new(indices: Vec<u32>, values: Vec<f32>, dim: u32) -> Result<Self, SparseError> {
        if indices.len() != values.len() {
            return Err(SparseError::LengthMismatch);
        }

        // `sort_by_key` is stable, so for a duplicated index `dedup_by_key`
        // keeps the first value the caller supplied.
        let mut pairs: Vec<_> = indices.into_iter().zip(values).collect();
        pairs.sort_by_key(|(i, _)| *i);
        pairs.dedup_by_key(|(i, _)| *i);

        let (indices, values): (Vec<_>, Vec<_>) = pairs.into_iter().unzip();

        // Sorted, so checking the last (maximum) index covers all of them.
        if indices.last().map_or(false, |&i| i >= dim) {
            return Err(SparseError::IndexOutOfBounds);
        }

        Ok(Self { indices, values, dim })
    }

    /// Number of non-zero elements
    #[inline]
    pub fn nnz(&self) -> usize {
        self.indices.len()
    }

    /// Get value at index; returns 0.0 for absent entries (O(log n) binary search)
    pub fn get(&self, index: u32) -> f32 {
        match self.indices.binary_search(&index) {
            Ok(pos) => self.values[pos],
            Err(_) => 0.0,
        }
    }

    /// Iterate over non-zero elements as (index, value), in index order
    pub fn iter(&self) -> impl Iterator<Item = (u32, f32)> + '_ {
        self.indices.iter().copied().zip(self.values.iter().copied())
    }

    /// L2 norm of the stored values
    pub fn norm(&self) -> f32 {
        self.values.iter().map(|&v| v * v).sum::<f32>().sqrt()
    }

    /// Remove entries with |value| < threshold, in place.
    ///
    /// Single compacting pass with no temporary allocation (the previous
    /// version built three intermediate Vecs).
    pub fn prune(&mut self, threshold: f32) {
        let mut keep = 0;
        for read in 0..self.indices.len() {
            if self.values[read].abs() >= threshold {
                self.indices[keep] = self.indices[read];
                self.values[keep] = self.values[read];
                keep += 1;
            }
        }
        self.indices.truncate(keep);
        self.values.truncate(keep);
    }

    /// Top-k sparsification: keep only the k largest-magnitude entries.
    /// The result is re-sorted by index so the type invariant holds.
    pub fn top_k(&self, k: usize) -> SparseVec {
        let mut entries: Vec<_> = self.iter().collect();

        // `total_cmp` is a total order over f32, so NaN values cannot panic
        // the sort (the old `partial_cmp(..).unwrap()` pattern would).
        entries.sort_by(|(_, a), (_, b)| b.abs().total_cmp(&a.abs()));
        entries.truncate(k);
        entries.sort_unstable_by_key(|(i, _)| *i);

        let (indices, values): (Vec<_>, Vec<_>) = entries.into_iter().unzip();

        SparseVec { indices, values, dim: self.dim }
    }
}

// PostgreSQL type registration
// NOTE(review): recent pgrx releases spell this attribute `#[pgrx(...)]`,
// not `#[pgx(...)]` — confirm against the pinned pgrx version.
#[derive(PostgresType, Serialize, Deserialize)]
#[pgx(sql = "CREATE TYPE sparsevec")]
pub struct PgSparseVec(SparseVec);

// Datum conversions are intentionally stubbed in this plan; the real
// implementations must detoast (decompress / fetch out-of-line) values
// before deserializing.
impl FromDatum for PgSparseVec {
    // ... TOAST-aware deserialization
}

impl IntoDatum for PgSparseVec {
    // ... serialization
}

// Parse from string: '{1:0.5, 2:0.3}'
// Dimension is inferred as max_index + 1; '{}' parses to an empty vector.
impl std::str::FromStr for SparseVec {
    type Err = SparseError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let body = s.trim().trim_start_matches('{').trim_end_matches('}');
        let mut indices = Vec::new();
        let mut values = Vec::new();
        let mut max_index = 0u32;

        for pair in body.split(',') {
            let pair = pair.trim();
            // Accept '{}' and trailing commas: an empty segment is no entry.
            // (Previously '{}' failed to parse, so the empty sparse vector
            // was unrepresentable in text form.)
            if pair.is_empty() {
                continue;
            }
            let (idx_s, val_s) = pair.split_once(':').ok_or(SparseError::ParseError)?;
            let idx: u32 = idx_s.trim().parse().map_err(|_| SparseError::ParseError)?;
            let val: f32 = val_s.trim().parse().map_err(|_| SparseError::ParseError)?;
            indices.push(idx);
            values.push(val);
            max_index = max_index.max(idx);
        }

        // dim must exceed the largest index; guard the u32 overflow that the
        // old unconditional `max_index + 1` hit at idx == u32::MAX.
        let dim = if indices.is_empty() {
            0
        } else {
            max_index.checked_add(1).ok_or(SparseError::ParseError)?
        };

        SparseVec::new(indices, values, dim)
    }
}

Phase 2: Sparse Distance Functions (Week 3-4)

// src/sparse/distance.rs

use simsimd::SpatialSimilarity;

/// Inner product of two sparse vectors.
///
/// Both index lists are sorted, so a single linear merge touches only the
/// overlapping support: O(nnz(a) + nnz(b)).
pub fn sparse_dot(a: &SparseVec, b: &SparseVec) -> f32 {
    use std::cmp::Ordering;

    let (mut i, mut j) = (0usize, 0usize);
    let mut acc = 0.0f32;

    // Advance the cursor with the smaller index; accumulate on a match.
    while let (Some(&ia), Some(&ib)) = (a.indices.get(i), b.indices.get(j)) {
        match ia.cmp(&ib) {
            Ordering::Less => i += 1,
            Ordering::Greater => j += 1,
            Ordering::Equal => {
                acc += a.values[i] * b.values[j];
                i += 1;
                j += 1;
            }
        }
    }

    acc
}

/// Cosine similarity between two sparse vectors.
///
/// Returns 0.0 when either vector has zero norm (avoids NaN from 0/0).
pub fn sparse_cosine(a: &SparseVec, b: &SparseVec) -> f32 {
    let (norm_a, norm_b) = (a.norm(), b.norm());

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    sparse_dot(a, b) / (norm_a * norm_b)
}

/// Sparse Euclidean distance
pub fn sparse_euclidean(a: &SparseVec, b: &SparseVec) -> f32 {
    let mut result = 0.0;
    let mut i = 0;
    let mut j = 0;

    while i < a.indices.len() || j < b.indices.len() {
        let idx_a = a.indices.get(i).copied().unwrap_or(u32::MAX);
        let idx_b = b.indices.get(j).copied().unwrap_or(u32::MAX);

        match idx_a.cmp(&idx_b) {
            std::cmp::Ordering::Less => {
                result += a.values[i] * a.values[i];
                i += 1;
            }
            std::cmp::Ordering::Greater => {
                result += b.values[j] * b.values[j];
                j += 1;
            }
            std::cmp::Ordering::Equal => {
                let diff = a.values[i] - b.values[j];
                result += diff * diff;
                i += 1;
                j += 1;
            }
        }
    }

    result.sqrt()
}

/// BM25 score of `doc` against `query`.
///
/// Convention: query values are IDF weights, doc values are raw term
/// frequencies. `k1` and `b` are the usual BM25 saturation / length-
/// normalization parameters.
pub fn sparse_bm25(
    query: &SparseVec,
    doc: &SparseVec,
    doc_len: f32,
    avg_doc_len: f32,
    k1: f32,
    b: f32,
) -> f32 {
    use std::cmp::Ordering;

    // The length-normalization term is constant per document; hoist it
    // out of the merge loop.
    let len_norm = k1 * (1.0 - b + b * doc_len / avg_doc_len);

    let (mut qi, mut di) = (0usize, 0usize);
    let mut score = 0.0f32;

    while qi < query.indices.len() && di < doc.indices.len() {
        match query.indices[qi].cmp(&doc.indices[di]) {
            Ordering::Less => qi += 1,
            Ordering::Greater => di += 1,
            Ordering::Equal => {
                let idf = query.values[qi]; // query side carries IDF
                let tf = doc.values[di];    // doc side carries TF

                let numerator = tf * (k1 + 1.0);
                let denominator = tf + len_norm;

                score += idf * numerator / denominator;
                qi += 1;
                di += 1;
            }
        }
    }

    score
}

// PostgreSQL functions
// All three are pure: `immutable` lets the planner fold/cache calls and
// `parallel_safe` allows parallel workers to evaluate them.
#[pg_extern(immutable, parallel_safe)]
fn ruvector_sparse_dot(a: PgSparseVec, b: PgSparseVec) -> f32 {
    sparse_dot(&a.0, &b.0)
}

#[pg_extern(immutable, parallel_safe)]
fn ruvector_sparse_cosine(a: PgSparseVec, b: PgSparseVec) -> f32 {
    sparse_cosine(&a.0, &b.0)
}

#[pg_extern(immutable, parallel_safe)]
fn ruvector_sparse_euclidean(a: PgSparseVec, b: PgSparseVec) -> f32 {
    sparse_euclidean(&a.0, &b.0)
}

Phase 3: Inverted Index (Week 5-7)

// src/sparse/index/inverted.rs

use dashmap::DashMap;
use parking_lot::RwLock;

/// Inverted index for efficient sparse vector search
///
/// Concurrent by construction: each map is a DashMap, so inserts and
/// lookups from multiple backends need no external lock.
pub struct InvertedIndex {
    /// term_id -> [(doc_id, weight), ...]
    postings: DashMap<u32, Vec<(u64, f32)>>,
    /// doc_id -> sparse vector (for re-ranking)
    documents: DashMap<u64, SparseVec>,
    /// Document norms for cosine similarity
    doc_norms: DashMap<u64, f32>,
    /// Configuration
    config: InvertedIndexConfig,
}

/// Tuning knobs for the inverted index.
pub struct InvertedIndexConfig {
    pub pruning_threshold: f32,             // drop terms with |weight| below this at insert time
    pub max_postings_per_term: usize,       // NOTE(review): declared but never enforced in insert()
    pub quantization: Option<Quantization>, // optional weight quantization (e.g. int8)
}

impl InvertedIndex {
    /// Create an empty index with the given configuration.
    pub fn new(config: InvertedIndexConfig) -> Self {
        Self {
            postings: DashMap::new(),
            documents: DashMap::new(),
            doc_norms: DashMap::new(),
            config,
        }
    }

    /// Insert document into index
    ///
    /// Terms with |weight| below `pruning_threshold` are skipped in the
    /// postings lists, but the full vector and its norm (computed before
    /// pruning) are retained for re-ranking.
    /// NOTE(review): `config.max_postings_per_term` is not enforced here,
    /// and re-inserting an existing doc_id appends duplicate postings —
    /// confirm whether upsert semantics are required.
    pub fn insert(&self, doc_id: u64, vector: SparseVec) {
        let norm = vector.norm();

        // Index each non-zero term
        for (term_id, weight) in vector.iter() {
            if weight.abs() < self.config.pruning_threshold {
                continue;
            }

            self.postings
                .entry(term_id)
                .or_insert_with(Vec::new)
                .push((doc_id, weight));
        }

        self.doc_norms.insert(doc_id, norm);
        self.documents.insert(doc_id, vector);
    }

    /// Exact top-k by term-at-a-time scoring: accumulate the dot product
    /// over every document sharing a term with the query, then sort.
    /// (This is exhaustive scoring, not WAND — see `search_wand` for the
    /// pruned variant.)
    pub fn search(&self, query: &SparseVec, k: usize) -> Vec<(u64, f32)> {
        // Collect candidate documents
        let mut doc_scores: HashMap<u64, f32> = HashMap::new();

        for (term_id, query_weight) in query.iter() {
            if let Some(postings) = self.postings.get(&term_id) {
                for &(doc_id, doc_weight) in postings.iter() {
                    *doc_scores.entry(doc_id).or_insert(0.0) += query_weight * doc_weight;
                }
            }
        }

        // Get top-k
        let mut results: Vec<_> = doc_scores.into_iter().collect();
        results.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
        results.truncate(k);

        results
    }

    /// WAND (Weak AND) algorithm for efficient top-k retrieval
    ///
    /// NOTE(review): skeleton only — `threshold` is never raised, `k` is
    /// unused, the heap is never populated, and `BinaryHeap::into_iter`
    /// yields arbitrary order (not score order). The actual document-at-a-
    /// time traversal still needs to be written.
    pub fn search_wand(&self, query: &SparseVec, k: usize) -> Vec<(u64, f32)> {
        // Sort query terms by max contribution (upper bound)
        let mut term_info: Vec<_> = query.iter()
            .filter_map(|(term_id, weight)| {
                self.postings.get(&term_id).map(|p| {
                    let max_doc_weight = p.iter().map(|(_, w)| *w).fold(0.0f32, f32::max);
                    (term_id, weight, max_doc_weight * weight)
                })
            })
            .collect();

        term_info.sort_by(|(_, _, a), (_, _, b)| b.partial_cmp(a).unwrap());

        // WAND traversal
        let mut heap: BinaryHeap<(OrderedFloat<f32>, u64)> = BinaryHeap::new();
        let threshold = 0.0f32;

        // ... WAND implementation

        heap.into_iter().map(|(s, id)| (id, s.0)).collect()
    }
}

// PostgreSQL index access method
// Stub: the real handler must build and return an IndexAmRoutine wiring up
// the build/insert/scan callbacks for the sparse inverted index.
#[pg_extern]
fn ruvector_sparse_handler(internal: Internal) -> Internal {
    // Index AM handler for sparse inverted index
}

Phase 4: Hybrid Search (Week 8-9)

// src/sparse/hybrid.rs

/// Hybrid dense + sparse search
///
/// Holds the per-modality weights and the strategy used to merge the two
/// ranked candidate lists.
pub struct HybridSearch {
    dense_weight: f32,   // weight applied to dense (vector) scores
    sparse_weight: f32,  // weight applied to sparse (lexical) scores
    fusion_method: FusionMethod,
}

/// Strategy for merging dense and sparse result lists.
pub enum FusionMethod {
    /// Linear combination of scores
    Linear,
    /// Reciprocal Rank Fusion
    RRF { k: f32 },
    /// Learned fusion weights
    Learned { model: FusionModel },
}

impl HybridSearch {
    /// Fuse dense and sparse candidate lists into a single top-k ranking,
    /// dispatching on the configured fusion method.
    pub fn search(
        &self,
        dense_results: &[(u64, f32)],
        sparse_results: &[(u64, f32)],
        k: usize,
    ) -> Vec<(u64, f32)> {
        match &self.fusion_method {
            FusionMethod::Linear => self.linear_fusion(dense_results, sparse_results, k),
            FusionMethod::RRF { k: rrf_k } => {
                self.rrf_fusion(dense_results, sparse_results, k, *rrf_k)
            }
            FusionMethod::Learned { model } => model.fuse(dense_results, sparse_results, k),
        }
    }

    /// Weighted sum of max-normalized scores.
    fn linear_fusion(
        &self,
        dense: &[(u64, f32)],
        sparse: &[(u64, f32)],
        k: usize,
    ) -> Vec<(u64, f32)> {
        let mut fused: HashMap<u64, f32> = HashMap::new();

        // Each list is max-normalized to [0, 1] before weighting so the two
        // score scales become comparable.
        for (weight, list) in [(self.dense_weight, dense), (self.sparse_weight, sparse)] {
            let peak = list.iter().map(|&(_, s)| s).fold(0.0f32, f32::max);
            for &(id, score) in list {
                let scaled = if peak > 0.0 { score / peak } else { 0.0 };
                *fused.entry(id).or_insert(0.0) += weight * scaled;
            }
        }

        let mut ranked: Vec<_> = fused.into_iter().collect();
        ranked.sort_by(|(_, x), (_, y)| y.partial_cmp(x).unwrap());
        ranked.truncate(k);
        ranked
    }

    /// Reciprocal Rank Fusion: each hit contributes weight / (rrf_k + rank),
    /// with 1-based ranks, so scores depend only on positions, not raw values.
    fn rrf_fusion(
        &self,
        dense: &[(u64, f32)],
        sparse: &[(u64, f32)],
        k: usize,
        rrf_k: f32,
    ) -> Vec<(u64, f32)> {
        let mut fused: HashMap<u64, f32> = HashMap::new();

        for (weight, list) in [(self.dense_weight, dense), (self.sparse_weight, sparse)] {
            for (rank, &(id, _)) in list.iter().enumerate() {
                *fused.entry(id).or_insert(0.0) += weight / (rrf_k + rank as f32 + 1.0);
            }
        }

        let mut ranked: Vec<_> = fused.into_iter().collect();
        ranked.sort_by(|(_, x), (_, y)| y.partial_cmp(x).unwrap());
        ranked.truncate(k);
        ranked
    }
}

/// SQL-callable hybrid search over one dense and one sparse column.
///
/// Returns (id, score) rows; weights default to 0.7/0.3 and `fusion`
/// defaults to the SQL-quoted string 'linear'.
/// NOTE(review): stub — the body must build and execute the query via SPI,
/// and identifiers (table/column names) must be quoted to avoid injection.
#[pg_extern]
fn ruvector_hybrid_search(
    table_name: &str,
    dense_column: &str,
    sparse_column: &str,
    dense_query: Vec<f32>,
    sparse_query: PgSparseVec,
    dense_weight: default!(f32, 0.7),
    sparse_weight: default!(f32, 0.3),
    k: default!(i32, 10),
    fusion: default!(&str, "'linear'"),
) -> TableIterator<'static, (name!(id, i64), name!(score, f32))> {
    // Implementation using SPI
}

Phase 5: SPLADE Integration (Week 10)

// src/sparse/splade.rs

/// SPLADE-style learned sparse representations
///
/// Holds only post-processing parameters; the model that produces the raw
/// logits runs outside the database.
pub struct SpladeEncoder {
    /// Vocab size for term indices
    vocab_size: usize,
    /// Sparsity threshold
    threshold: f32,
}

impl SpladeEncoder {
    /// Post-process raw model logits into a SPLADE-style sparse vector.
    ///
    /// Applies log(1 + relu(x)) and keeps only activations strictly above
    /// `threshold`. (Encoding is typically done externally; this supports
    /// in-database post-processing.)
    pub fn sparsify(&self, logits: &[f32]) -> SparseVec {
        let (indices, values): (Vec<u32>, Vec<f32>) = logits
            .iter()
            .enumerate()
            .filter_map(|(i, &logit)| {
                // ReLU gate followed by log saturation, as in SPLADE.
                if logit > 0.0 {
                    let activated = (1.0 + logit).ln();
                    (activated > self.threshold).then(|| (i as u32, activated))
                } else {
                    None
                }
            })
            .unzip();

        // Panics only if an index reaches vocab_size, i.e. `logits` is
        // longer than the vocabulary — an upstream contract violation.
        SparseVec::new(indices, values, self.vocab_size as u32).unwrap()
    }
}

#[pg_extern]
fn ruvector_to_sparse(
    indices: Vec<i32>,
    values: Vec<f32>,
    dim: i32,
) -> PgSparseVec {
    let indices: Vec<u32> = indices.into_iter().map(|i| i as u32).collect();
    PgSparseVec(SparseVec::new(indices, values, dim as u32).unwrap())
}

#[pg_extern]
fn ruvector_sparse_top_k(sparse: PgSparseVec, k: i32) -> PgSparseVec {
    PgSparseVec(sparse.0.top_k(k as usize))
}

/// Return a copy of `sparse` with entries below `threshold` (by absolute
/// value) removed; the input datum is left untouched.
#[pg_extern]
fn ruvector_sparse_prune(sparse: PgSparseVec, threshold: f32) -> PgSparseVec {
    let mut result = sparse.0.clone();
    result.prune(threshold);
    PgSparseVec(result)
}

Benchmarks

| Operation       | NNZ (query) | NNZ (doc) | Dim | Time (μs) |
|-----------------|-------------|-----------|-----|-----------|
| Dot Product     | 100         | 100       | 30K | 0.8       |
| Cosine          | 100         | 100       | 30K | 1.2       |
| Inverted Search | 100         | —         | 30K | 450       |
| Hybrid Search   | 100         | 768       | 30K | 1200      |

Dependencies

[dependencies]
# Concurrent collections
dashmap = "6.0"

# Ordered floats for heaps
ordered-float = "4.2"

# Serialization
serde = { version = "1.0", features = ["derive"] }
bincode = "2.0.0-rc.3"

# NOTE(review): the code samples also import `simsimd` (distance.rs) and
# `parking_lot` (index/inverted.rs) — add them here if those modules ship.

Feature Flags

[features]
sparse = []
sparse-inverted = ["sparse"]
sparse-hybrid = ["sparse"]
sparse-all = ["sparse-inverted", "sparse-hybrid"]