Files

ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900

2026-02-28 14:39:40 -05:00

16 KiB

Raw Blame History

RuVector Postgres v2 - Hybrid Search (BM25 + Vector)

Why Hybrid Search Matters

Vector search finds semantically similar content. Keyword search finds exact matches.

Neither is sufficient alone:

Vector-only misses exact keyword matches (product SKUs, error codes, names)
Keyword-only misses semantic similarity ("car" vs "automobile")

Every production RAG system needs both. pgvector doesn't have this. We do.

Design Goals

Single query, both signals — No application-level fusion
Configurable blending — RRF, linear, learned weights
Integrity-aware — The hybrid index participates in the contracted graph
PostgreSQL-native — Leverages tsvector and GIN indexes

Architecture

                     +------------------+
                     |   Hybrid Query   |
                     | "error 500 fix"  |
                     +--------+---------+
                              |
              +---------------+---------------+
              |                               |
     +--------v--------+            +---------v---------+
     |  Vector Branch  |            |  Keyword Branch   |
     |  (HNSW/IVF)     |            |  (GIN/tsvector)   |
     +--------+--------+            +---------+---------+
              |                               |
              |  top-100 by cosine            |  top-100 by BM25
              |                               |
              +---------------+---------------+
                              |
                     +--------v--------+
                     |  Fusion Layer   |
                     |  (RRF / Linear) |
                     +--------+--------+
                              |
                     +--------v--------+
                     |  Final top-k    |
                     +--------+--------+
                              |
                     +--------v--------+
                     | Optional Rerank |
                     +-----------------+

SQL Interface

Basic Hybrid Search

-- Hybrid search using the default fusion method (RRF)
SELECT *
FROM ruvector_hybrid_search(
    'documents',                                           -- collection name
    query_text   => 'database connection timeout error',
    query_vector => $embedding,
    k            => 10
);

-- Result columns: id, content, vector_score, keyword_score, hybrid_score

Configurable Fusion

-- Reciprocal Rank Fusion (RRF): the default, robust choice
SELECT *
FROM ruvector_hybrid_search(
    'documents',
    query_text   => 'postgres replication lag',
    query_vector => $embedding,
    k            => 20,
    fusion       => 'rrf',
    rrf_k        => 60  -- RRF constant (default 60)
);

-- Linear blend weighted by alpha
SELECT *
FROM ruvector_hybrid_search(
    'documents',
    query_text   => 'postgres replication lag',
    query_vector => $embedding,
    k            => 20,
    fusion       => 'linear',
    alpha        => 0.7  -- 0.7 * vector + 0.3 * keyword
);

-- Fusion weights learned from query patterns
SELECT *
FROM ruvector_hybrid_search(
    'documents',
    query_text   => 'postgres replication lag',
    query_vector => $embedding,
    k            => 20,
    fusion       => 'learned'  -- Uses GNN-trained weights
);

Operator Syntax (Advanced)

-- Score rows inline with ruvector_hybrid_score and return the best 10.
-- The text-search configuration is passed explicitly so the query text is
-- parsed with the same 'english' rules used to build documents.fts; relying
-- on default_text_search_config can silently miss stemmed matches.
-- NOTE(review): <=> produces a distance (smaller = closer) while ts_rank_cd
-- produces a rank (larger = better); confirm ruvector_hybrid_score expects a
-- distance as its first argument.
SELECT id, content,
       ruvector_hybrid_score(
           embedding <=> $query_vec,
           ts_rank_cd(fts, plainto_tsquery('english', $query_text)),
           alpha => 0.6
       ) AS score
FROM documents
WHERE fts @@ plainto_tsquery('english', $query_text)  -- Pre-filter: keyword match
   OR embedding <=> $query_vec < 0.5                  -- Or similar vectors
ORDER BY score DESC
LIMIT 10;

Schema Requirements

Collection with Hybrid Support

-- Table holding the text, its embedding, and a generated full-text column
CREATE TABLE documents (
    id          BIGSERIAL,
    content     TEXT NOT NULL,
    embedding   vector(1536) NOT NULL,
    fts         tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED,
    metadata    JSONB DEFAULT CAST('{}' AS jsonb),
    created_at  TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);

-- Index serving the vector branch
CREATE INDEX idx_documents_embedding
    ON documents
    USING ruhnsw (embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 100);

-- Index serving the keyword (full-text) branch
CREATE INDEX idx_documents_fts
    ON documents
    USING gin (fts);

-- Enable hybrid search on the collection
SELECT ruvector_register_hybrid(
    collection    => 'documents',
    vector_column => 'embedding',
    fts_column    => 'fts',
    text_column   => 'content'  -- For BM25 stats
);

Hybrid Registration Table

-- Internal: tracks hybrid-enabled collections.
-- One row per collection: the statistics UPDATE addresses rows by
-- collection_id, so that column is constrained to be unique.
CREATE TABLE ruvector.hybrid_collections (
    id              SERIAL PRIMARY KEY,
    collection_id   INTEGER NOT NULL REFERENCES ruvector.collections(id),
    vector_column   TEXT NOT NULL,
    fts_column      TEXT NOT NULL,
    text_column     TEXT NOT NULL,

    -- BM25 parameters (computed from corpus)
    avg_doc_length  REAL,               -- NULL until statistics are first computed
    doc_count       BIGINT,             -- NULL until statistics are first computed
    k1              REAL DEFAULT 1.2,   -- term-frequency saturation
    b               REAL DEFAULT 0.75,  -- length normalization

    -- Fusion settings
    default_fusion  TEXT DEFAULT 'rrf',
    default_alpha   REAL DEFAULT 0.5,
    learned_weights JSONB,

    -- Stats
    last_stats_update TIMESTAMPTZ,
    created_at      TIMESTAMPTZ DEFAULT NOW(),

    -- Integrity constraints (each CHECK still accepts NULL)
    CONSTRAINT hybrid_collections_collection_id_key
        UNIQUE (collection_id),
    CONSTRAINT hybrid_collections_k1_check
        CHECK (k1 >= 0),
    CONSTRAINT hybrid_collections_b_check
        CHECK (b BETWEEN 0 AND 1),
    CONSTRAINT hybrid_collections_default_fusion_check
        CHECK (default_fusion IN ('rrf', 'linear', 'learned')),
    CONSTRAINT hybrid_collections_default_alpha_check
        CHECK (default_alpha BETWEEN 0 AND 1)
);

BM25 Implementation

Why Not Just ts_rank?

PostgreSQL's ts_rank is not true BM25. It doesn't account for:

Document length normalization relative to the corpus average (ts_rank offers only optional per-document normalization flags)
IDF weighting across corpus
Term frequency saturation

We implement proper BM25 in the engine.

BM25 Scoring

// src/hybrid/bm25.rs

/// BM25 scorer with corpus statistics
pub struct BM25Scorer {
    k1: f32,           // Term frequency saturation (default 1.2)
    b: f32,            // Length normalization (default 0.75)
    avg_doc_len: f32,  // Average document length
    doc_count: u64,    // Total documents
    idf_cache: HashMap<String, f32>,  // Cached IDF values
}

impl BM25Scorer {
    /// Compute IDF for a term that occurs in `doc_freq` documents:
    /// `ln((N - df + 0.5) / (df + 0.5) + 1)`, where `N` is `doc_count`.
    fn idf(&self, doc_freq: u64) -> f32 {
        let n = self.doc_count as f32;
        let df = doc_freq as f32;
        ((n - df + 0.5) / (df + 0.5) + 1.0).ln()
    }

    /// Score a document for a query.
    ///
    /// Sums the BM25 contribution of every query term that both occurs in
    /// `doc` and has an entry in `idf_cache`; other terms contribute nothing.
    /// Returns 0.0 when no term qualifies.
    pub fn score(&self, doc: &Document, query_terms: &[String]) -> f32 {
        let doc_len = doc.term_count as f32;

        // With no usable average (empty corpus, statistics not computed yet)
        // the ratio would be NaN or infinite and poison every score, so fall
        // back to a neutral ratio of 1.0, i.e. no length adjustment.
        let relative_len = if self.avg_doc_len > 0.0 {
            doc_len / self.avg_doc_len
        } else {
            1.0
        };
        let len_norm = 1.0 - self.b + self.b * relative_len;

        query_terms.iter()
            .filter_map(|term| {
                let tf = doc.term_freq(term)? as f32;
                let idf = self.idf_cache.get(term)?;

                // BM25 formula
                let numerator = tf * (self.k1 + 1.0);
                let denominator = tf + self.k1 * len_norm;

                Some(idf * numerator / denominator)
            })
            .sum()
    }
}

Corpus Statistics Update

-- Refresh the BM25 statistics for the 'documents' collection
-- (run periodically or after bulk inserts)
SELECT ruvector_hybrid_update_stats('documents');

-- The statistics are stored in the hybrid_collections table and are
-- computed by a background worker or on demand

// Background worker updates corpus stats
/// Recompute the BM25 corpus statistics and store them on the
/// `ruvector.hybrid_collections` row identified by `collection_id`.
///
/// NOTE(review): both statistics queries read the hard-coded `documents`
/// table and `content` column, so `collection_id` only selects which row is
/// updated — confirm whether the table and column should be resolved from
/// the collection's registration instead.
/// NOTE(review): `LENGTH(content)` counts characters, while
/// `BM25Scorer::score` divides `doc.term_count` by this average — confirm
/// both use the same unit.
pub fn update_bm25_stats(collection_id: i32) -> Result<(), Error> {
    Spi::run(|client| {
        // Get average document length.
        // AVG over an empty table is NULL and AVG of an integer is NUMERIC;
        // COALESCE plus the float8 cast keep the value readable as f64.
        let avg_len: f64 = client.select(
            "SELECT COALESCE(AVG(LENGTH(content)), 0)::float8 FROM documents",
            None, &[]
        )?.first().unwrap().get(1)?;

        // Get document count (COUNT(*) always yields exactly one non-NULL row)
        let doc_count: i64 = client.select(
            "SELECT COUNT(*) FROM documents",
            None, &[]
        )?.first().unwrap().get(1)?;

        // Update term frequencies (using tsvector stats)
        // ... compute IDF cache ...

        // Persist the statistics and stamp the refresh time
        client.update(
            "UPDATE ruvector.hybrid_collections
             SET avg_doc_length = $1, doc_count = $2, last_stats_update = NOW()
             WHERE collection_id = $3",
            None,
            &[avg_len.into(), doc_count.into(), collection_id.into()]
        )
    })
}

Fusion Algorithms

Reciprocal Rank Fusion (RRF)

Default and most robust. Works without score calibration.

// src/hybrid/fusion.rs

/// RRF fusion: score = sum(1 / (k + rank_i))
///
/// Each input slice is treated as a ranking: the position of an entry is its
/// rank and the attached score is ignored. A document present in both
/// rankings receives the sum of both reciprocal-rank terms.
///
/// Returns at most `limit` `(id, fused_score)` pairs, highest score first.
/// Ties are common in RRF, so they are resolved deterministically: tied
/// documents keep first-seen order (vector ranking first, then keyword-only
/// documents) instead of depending on hash-map iteration order.
pub fn rrf_fusion(
    vector_results: &[(DocId, f32)],  // (id, distance)
    keyword_results: &[(DocId, f32)], // (id, bm25_score)
    k: usize,                          // RRF constant (default 60)
    limit: usize,
) -> Vec<(DocId, f32)> {
    // `fused` holds the scores in first-seen order; `slot_of` maps a
    // document id to its index in `fused`.
    let mut slot_of: HashMap<DocId, usize> = HashMap::new();
    let mut fused: Vec<(DocId, f32)> = Vec::new();

    // Vector ranking (lower distance = higher rank), then keyword ranking
    // (higher BM25 = higher rank); both contribute 1 / (k + rank + 1).
    for ranking in [vector_results, keyword_results] {
        for (rank, (doc_id, _)) in ranking.iter().enumerate() {
            let slot = *slot_of.entry(*doc_id).or_insert_with(|| {
                fused.push((*doc_id, 0.0));
                fused.len() - 1
            });
            fused[slot].1 += 1.0 / (k + rank + 1) as f32;
        }
    }

    // Stable sort by fused score, descending; equal scores keep their
    // first-seen order.
    fused.sort_by(|a, b| b.1.total_cmp(&a.1));
    fused.truncate(limit);
    fused
}

Linear Fusion

Simple weighted combination. Requires score normalization.

/// Linear fusion: score = alpha * vec_score + (1 - alpha) * kw_score
///
/// `alpha` is the weight of the vector branch and is clamped to `[0, 1]`, so
/// an out-of-range value can never give a branch a negative weight. A
/// document found by only one branch gets that branch's weighted score alone.
///
/// Returns at most `limit` `(id, combined_score)` pairs, highest score first.
/// A NaN combined score sorts last rather than aborting the sort.
pub fn linear_fusion(
    vector_results: &[(DocId, f32)],
    keyword_results: &[(DocId, f32)],
    alpha: f32,
    limit: usize,
) -> Vec<(DocId, f32)> {
    let alpha = alpha.clamp(0.0, 1.0);

    // Normalize vector scores (convert distance to similarity)
    let vec_scores = normalize_to_similarity(vector_results);

    // Normalize BM25 scores to [0, 1]
    let kw_scores = min_max_normalize(keyword_results);

    // Combine
    let mut combined: HashMap<DocId, f32> = HashMap::new();

    for (doc_id, score) in vec_scores {
        *combined.entry(doc_id).or_default() += alpha * score;
    }

    for (doc_id, score) in kw_scores {
        *combined.entry(doc_id).or_default() += (1.0 - alpha) * score;
    }

    // NOTE(review): a normalizer may yield NaN (for example when all inputs
    // are equal) — confirm. NaN is ranked below every real score here so the
    // comparison is a total order and cannot panic.
    let rank_key = |score: f32| if score.is_nan() { f32::NEG_INFINITY } else { score };

    let mut results: Vec<_> = combined.into_iter().collect();
    results.sort_by(|a, b| rank_key(b.1).total_cmp(&rank_key(a.1)));
    results.truncate(limit);
    results
}

Learned Fusion

Uses query characteristics to select weights dynamically.

/// Learned fusion: derive features from the query, let `model` predict the
/// blend weight, then delegate to `linear_fusion` with that weight.
pub fn learned_fusion(
    query_embedding: &[f32],
    query_terms: &[String],
    vector_results: &[(DocId, f32)],
    keyword_results: &[(DocId, f32)],
    model: &FusionModel,
    limit: usize,
) -> Vec<(DocId, f32)> {
    // Gather the per-query features one at a time
    let embedding_norm = l2_norm(query_embedding);
    let term_count = query_terms.len();
    let avg_term_idf = compute_avg_idf(query_terms);
    let has_exact_match = detect_exact_match_intent(query_terms);
    let query_type = classify_query_type(query_terms);  // navigational, informational, etc.

    let features = QueryFeatures {
        embedding_norm,
        term_count,
        avg_term_idf,
        has_exact_match,
        query_type,
    };

    // The model chooses the alpha used for this particular query
    let predicted_alpha = model.predict_alpha(&features);

    linear_fusion(vector_results, keyword_results, predicted_alpha, limit)
}

Integrity Integration

Hybrid search participates in the integrity control plane.

Contracted Graph Nodes

-- Add one 'hybrid_index' node to the contracted graph for every
-- hybrid-enabled collection; health drops as the BM25 statistics age
INSERT INTO ruvector.contracted_graph (
    collection_id,
    node_type,
    node_id,
    node_name,
    health_score
)
SELECT
    c.id                AS collection_id,
    'hybrid_index'      AS node_type,
    h.id                AS node_id,
    'hybrid_' || c.name AS node_name,
    CASE
        WHEN h.last_stats_update > NOW() - INTERVAL '1 day'  THEN 1.0  -- refreshed within the last day
        WHEN h.last_stats_update > NOW() - INTERVAL '7 days' THEN 0.7  -- refreshed within the last week
        ELSE 0.3                                                       -- stale, or never refreshed (NULL)
    END                 AS health_score
FROM ruvector.hybrid_collections AS h
INNER JOIN ruvector.collections AS c
    ON h.collection_id = c.id;

Integrity-Aware Hybrid Search

/// Run a hybrid search, choosing how much work to do from the integrity
/// gate state reported for `collection_id`.
pub fn hybrid_search_with_integrity(
    collection_id: i32,
    query: &HybridQuery,
) -> Result<Vec<HybridResult>, Error> {
    let gate = check_integrity_gate(collection_id, "hybrid_search");

    match gate.state {
        // Healthy: run both branches and fuse them
        IntegrityState::Normal => execute_full_hybrid(query),

        // Under stress, a vector-heavy query keeps only its vector branch
        IntegrityState::Stress if query.alpha > 0.5 => execute_vector_only(query),

        // Under stress with a keyword-heavy query, or in the critical state,
        // fall back to the keyword branch alone (cheapest)
        IntegrityState::Stress | IntegrityState::Critical => execute_keyword_only(query),
    }
}

Performance Optimization

Pre-filtering Strategy

-- Hybrid search restricted by a filter (faster when the filter is selective)
SELECT *
FROM ruvector_hybrid_search(
    'documents',
    query_text   => 'error handling',
    query_vector => $embedding,
    k            => 10,
    filter       => 'category = ''backend'' AND created_at > NOW() - INTERVAL ''30 days'''
);

// Pick the execution strategy from filter selectivity and corpus size
fn choose_strategy(filter_selectivity: f32, corpus_size: u64) -> HybridStrategy {
    // Very selective filter: apply it first, then run hybrid on the small set
    if filter_selectivity < 0.01 {
        return HybridStrategy::PreFilter;
    }

    // Moderately selective filter on a large corpus: hybrid first, filter after
    let large_corpus = corpus_size > 1_000_000;
    if filter_selectivity < 0.1 && large_corpus {
        return HybridStrategy::PostFilter;
    }

    // Everything else: full hybrid
    HybridStrategy::Full
}

Parallel Execution

/// Execute vector and keyword branches in parallel
pub async fn parallel_hybrid(query: &HybridQuery) -> HybridResults {
    let (vector_results, keyword_results) = tokio::join!(
        execute_vector_branch(&query.embedding, query.prefetch_k),
        execute_keyword_branch(&query.text, query.prefetch_k),
    );

    fuse_results(vector_results, keyword_results, query.fusion, query.k)
}

Caching

/// Cache BM25 scores for repeated terms
pub struct HybridCache {
    /// Cached score for each (term, document) pair, held in an LRU cache
    term_doc_scores: LruCache<(String, DocId), f32>,
    /// Cached IDF value for each term
    idf_cache: HashMap<String, f32>,
    /// NOTE(review): presumably how long cached entries stay valid — confirm
    ttl: Duration,
}

Configuration

GUC Parameters

-- Fusion method used by default
SET ruvector.hybrid_fusion TO 'rrf';  -- 'rrf', 'linear', 'learned'

-- Alpha used by default for linear fusion
SET ruvector.hybrid_alpha TO 0.5;

-- Constant used by RRF
SET ruvector.hybrid_rrf_k TO 60;

-- Number of candidates prefetched from each branch
SET ruvector.hybrid_prefetch_k TO 100;

-- Run the two branches in parallel
SET ruvector.hybrid_parallel TO true;

Per-Collection Settings

-- Override hybrid settings for the 'documents' collection only
SELECT ruvector_hybrid_configure(
    'documents',
    CAST('{
    "default_fusion": "learned",
    "prefetch_k": 200,
    "bm25_k1": 1.5,
    "bm25_b": 0.8,
    "stats_refresh_interval": "1 hour"
}' AS jsonb)
);

Monitoring

-- Hybrid search statistics for the 'documents' collection
SELECT * FROM ruvector_hybrid_stats('documents');

-- Returns (sample output):
-- {
--   "total_searches": 15234,
--   "avg_vector_latency_ms": 4.2,
--   "avg_keyword_latency_ms": 2.1,
--   "avg_fusion_latency_ms": 0.3,
--   "cache_hit_rate": 0.67,
--   "last_stats_update": "2024-01-15T10:30:00Z",
--   "corpus_size": 1250000,
--   "avg_doc_length": 542
-- }

Testing Requirements

Correctness Tests

BM25 scoring matches reference implementation
RRF fusion produces expected rankings
Linear fusion respects alpha parameter
Learned fusion adapts to query type

Performance Tests

Hybrid search < 2x single-branch latency
Parallel execution shows speedup
Cache hit rate > 50% for repeated queries

Integration Tests

Integrity degradation triggers graceful fallback
Stats update doesn't block queries
Large corpus (10M+ docs) scales

Example: RAG Application

-- Complete RAG retrieval with hybrid search
WITH retrieved AS (
    -- Top 5 published passages for the question, fused with RRF.
    -- Only columns that ruvector_hybrid_search is documented to return are
    -- selected (id, content, vector_score, keyword_score, hybrid_score).
    SELECT
        id,
        content,
        hybrid_score
    FROM ruvector_hybrid_search(
        'knowledge_base',
        query_text   => $user_question,
        query_vector => $question_embedding,
        k            => 5,
        fusion       => 'rrf',
        filter       => 'status = ''published'''
    )
)
SELECT
    -- Aggregate best-first so the context and its source ids line up and the
    -- output is deterministic; id breaks ties between equal scores.
    -- Both aggregates are NULL when nothing was retrieved.
    string_agg(content, E'\n\n---\n\n' ORDER BY hybrid_score DESC, id) AS context,
    array_agg(id ORDER BY hybrid_score DESC, id) AS source_ids
FROM retrieved;

-- Pass context to LLM for answer generation

16 KiB Raw Blame History