Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
561
vendor/ruvector/docs/hnsw/HNSW_USAGE_EXAMPLE.md
vendored
Normal file
561
vendor/ruvector/docs/hnsw/HNSW_USAGE_EXAMPLE.md
vendored
Normal file
@@ -0,0 +1,561 @@
|
||||
# HNSW Index - Complete Usage Example
|
||||
|
||||
This guide provides a complete, practical example of using the HNSW index for vector similarity search in PostgreSQL.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
```bash
|
||||
# Install the extension
|
||||
cd /home/user/ruvector/crates/ruvector-postgres
|
||||
cargo pgrx install
|
||||
|
||||
# Or package for deployment
|
||||
cargo pgrx package
|
||||
```
|
||||
|
||||
## Step 1: Create Database and Enable Extension
|
||||
|
||||
```sql
|
||||
-- Create a new database for vector search
|
||||
CREATE DATABASE vector_search;
|
||||
\c vector_search
|
||||
|
||||
-- Enable the RuVector extension
|
||||
CREATE EXTENSION ruvector;
|
||||
|
||||
-- Verify installation
|
||||
SELECT ruvector_version();
|
||||
SELECT ruvector_simd_info();
|
||||
```
|
||||
|
||||
## Step 2: Create Table with Vectors
|
||||
|
||||
```sql
|
||||
-- Create a table for storing document embeddings
|
||||
CREATE TABLE documents (
|
||||
id SERIAL PRIMARY KEY,
|
||||
title TEXT NOT NULL,
|
||||
content TEXT,
|
||||
embedding real[], -- 384-dimensional embeddings
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Add some metadata indexes
|
||||
CREATE INDEX idx_documents_created ON documents(created_at);
|
||||
CREATE INDEX idx_documents_title ON documents USING gin(to_tsvector('english', title));
|
||||
```
|
||||
|
||||
## Step 3: Insert Sample Data
|
||||
|
||||
```sql
|
||||
-- Insert sample documents with random embeddings (in practice, use real embeddings)
|
||||
INSERT INTO documents (title, content, embedding)
|
||||
SELECT
|
||||
'Document ' || i,
|
||||
'This is the content of document ' || i,
|
||||
array_agg(random())::real[]
|
||||
FROM generate_series(1, 10000) AS i
|
||||
CROSS JOIN generate_series(1, 384) AS dim
|
||||
GROUP BY i;
|
||||
|
||||
-- Verify data
|
||||
SELECT COUNT(*), pg_size_pretty(pg_total_relation_size('documents'))
|
||||
FROM documents;
|
||||
```
|
||||
|
||||
## Step 4: Create HNSW Index
|
||||
|
||||
```sql
|
||||
-- Create HNSW index with L2 distance (default parameters)
|
||||
CREATE INDEX idx_documents_embedding_hnsw
|
||||
ON documents USING hnsw (embedding hnsw_l2_ops);
|
||||
|
||||
-- Check index size
|
||||
SELECT
|
||||
indexname,
|
||||
pg_size_pretty(pg_relation_size(indexname::regclass)) AS size
|
||||
FROM pg_indexes
|
||||
WHERE tablename = 'documents';
|
||||
```
|
||||
|
||||
## Step 5: Basic Similarity Search
|
||||
|
||||
```sql
|
||||
-- Find 10 most similar documents to a query vector
|
||||
WITH query AS (
|
||||
-- In practice, this would be an embedding from your model
|
||||
SELECT array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.title,
|
||||
d.embedding <-> query.vec AS distance
|
||||
FROM documents d, query
|
||||
ORDER BY d.embedding <-> query.vec
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
## Step 6: Advanced Queries
|
||||
|
||||
### Filtered Search
|
||||
|
||||
```sql
|
||||
-- Find similar documents created in the last 7 days
|
||||
WITH query AS (
|
||||
SELECT array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.title,
|
||||
d.created_at,
|
||||
d.embedding <-> query.vec AS distance
|
||||
FROM documents d, query
|
||||
WHERE d.created_at > CURRENT_TIMESTAMP - INTERVAL '7 days'
|
||||
ORDER BY d.embedding <-> query.vec
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
### Hybrid Search (Text + Vector)
|
||||
|
||||
```sql
|
||||
-- Combine full-text search with vector similarity
|
||||
WITH query AS (
|
||||
SELECT array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.title,
|
||||
ts_rank(to_tsvector('english', d.title), to_tsquery('document')) AS text_score,
|
||||
d.embedding <-> query.vec AS vector_distance,
|
||||
-- Combined score (weighted)
|
||||
(0.3 * ts_rank(to_tsvector('english', d.title), to_tsquery('document'))) +
|
||||
(0.7 * (1.0 / (1.0 + (d.embedding <-> query.vec)))) AS combined_score
|
||||
FROM documents d, query
|
||||
WHERE to_tsvector('english', d.title) @@ to_tsquery('document')
|
||||
ORDER BY combined_score DESC
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
### Batch Similarity Search
|
||||
|
||||
```sql
|
||||
-- Find similar documents for multiple queries
|
||||
WITH queries AS (
|
||||
SELECT
|
||||
q_id,
|
||||
array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 5) AS q_id
|
||||
CROSS JOIN generate_series(1, 384)
|
||||
GROUP BY q_id
|
||||
),
|
||||
results AS (
|
||||
SELECT
|
||||
q.q_id,
|
||||
d.id AS doc_id,
|
||||
d.title,
|
||||
d.embedding <-> q.vec AS distance,
|
||||
ROW_NUMBER() OVER (PARTITION BY q.q_id ORDER BY d.embedding <-> q.vec) AS rank
|
||||
FROM queries q
|
||||
CROSS JOIN documents d
|
||||
)
|
||||
SELECT *
|
||||
FROM results
|
||||
WHERE rank <= 10
|
||||
ORDER BY q_id, rank;
|
||||
```
|
||||
|
||||
## Step 7: Performance Tuning
|
||||
|
||||
### Adjust ef_search for Better Recall
|
||||
|
||||
```sql
|
||||
-- Show current setting
|
||||
SHOW ruvector.ef_search;
|
||||
|
||||
-- Increase for better recall (slower queries)
|
||||
SET ruvector.ef_search = 100;
|
||||
|
||||
-- Run query
|
||||
WITH query AS (
|
||||
SELECT array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.title,
|
||||
d.embedding <-> query.vec AS distance
|
||||
FROM documents d, query
|
||||
ORDER BY d.embedding <-> query.vec
|
||||
LIMIT 10;
|
||||
|
||||
-- Reset to default
|
||||
RESET ruvector.ef_search;
|
||||
```
|
||||
|
||||
### Analyze Query Performance
|
||||
|
||||
```sql
|
||||
-- Explain query plan
|
||||
EXPLAIN (ANALYZE, BUFFERS)
|
||||
WITH query AS (
|
||||
SELECT array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.embedding <-> query.vec AS distance
|
||||
FROM documents d, query
|
||||
ORDER BY d.embedding <-> query.vec
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
## Step 8: Different Distance Metrics
|
||||
|
||||
### Cosine Distance
|
||||
|
||||
```sql
|
||||
-- Create index with cosine distance
|
||||
CREATE INDEX idx_documents_embedding_cosine
|
||||
ON documents USING hnsw (embedding hnsw_cosine_ops);
|
||||
|
||||
-- Query with cosine distance (normalized vectors work best)
|
||||
WITH query AS (
|
||||
SELECT vector_normalize(array_agg(random())::real[]) AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.title,
|
||||
d.embedding <=> query.vec AS cosine_distance,
|
||||
1.0 - (d.embedding <=> query.vec) AS cosine_similarity
|
||||
FROM documents d, query
|
||||
ORDER BY d.embedding <=> query.vec
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
### Inner Product
|
||||
|
||||
```sql
|
||||
-- Create index with inner product
|
||||
CREATE INDEX idx_documents_embedding_ip
|
||||
ON documents USING hnsw (embedding hnsw_ip_ops);
|
||||
|
||||
-- Query with inner product
|
||||
WITH query AS (
|
||||
SELECT array_agg(random())::real[] AS vec
|
||||
FROM generate_series(1, 384)
|
||||
)
|
||||
SELECT
|
||||
d.id,
|
||||
d.title,
|
||||
d.embedding <#> query.vec AS neg_inner_product,
|
||||
-(d.embedding <#> query.vec) AS inner_product
|
||||
FROM documents d, query
|
||||
ORDER BY d.embedding <#> query.vec
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
## Step 9: Index Maintenance
|
||||
|
||||
### Monitor Index Health
|
||||
|
||||
```sql
|
||||
-- Get memory statistics
|
||||
SELECT ruvector_memory_stats();
|
||||
|
||||
-- Check index bloat
|
||||
SELECT
|
||||
schemaname,
|
||||
tablename,
|
||||
indexname,
|
||||
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
|
||||
pg_size_pretty(pg_relation_size(relid)) AS table_size,
|
||||
ROUND(100.0 * pg_relation_size(indexrelid) /
|
||||
NULLIF(pg_relation_size(relid), 0), 2) AS index_ratio
|
||||
FROM pg_stat_user_indexes
|
||||
WHERE schemaname = 'public'
|
||||
AND tablename = 'documents';
|
||||
```
|
||||
|
||||
### Perform Maintenance
|
||||
|
||||
```sql
|
||||
-- Run index maintenance
|
||||
SELECT ruvector_index_maintenance('idx_documents_embedding_hnsw');
|
||||
|
||||
-- Vacuum after many deletes
|
||||
VACUUM ANALYZE documents;
|
||||
|
||||
-- Rebuild index if heavily degraded
|
||||
REINDEX INDEX idx_documents_embedding_hnsw;
|
||||
```
|
||||
|
||||
## Step 10: Production Best Practices
|
||||
|
||||
### Partitioning for Large Datasets
|
||||
|
||||
```sql
|
||||
-- Create partitioned table for time-series data
|
||||
CREATE TABLE documents_partitioned (
|
||||
id BIGSERIAL,
|
||||
title TEXT NOT NULL,
|
||||
embedding real[],
|
||||
created_at TIMESTAMP NOT NULL
|
||||
) PARTITION BY RANGE (created_at);
|
||||
|
||||
-- Create monthly partitions
|
||||
CREATE TABLE documents_2024_01 PARTITION OF documents_partitioned
|
||||
FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
|
||||
|
||||
CREATE TABLE documents_2024_02 PARTITION OF documents_partitioned
|
||||
FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');
|
||||
|
||||
-- Create HNSW index on each partition
|
||||
CREATE INDEX idx_documents_2024_01_embedding
|
||||
ON documents_2024_01 USING hnsw (embedding hnsw_l2_ops);
|
||||
|
||||
CREATE INDEX idx_documents_2024_02_embedding
|
||||
ON documents_2024_02 USING hnsw (embedding hnsw_l2_ops);
|
||||
```
|
||||
|
||||
### Connection Pooling Setup
|
||||
|
||||
```python
|
||||
# Python example with psycopg2
|
||||
import psycopg2
|
||||
from psycopg2 import pool
|
||||
import numpy as np
|
||||
|
||||
# Create connection pool
|
||||
db_pool = psycopg2.pool.ThreadedConnectionPool(
|
||||
minconn=1,
|
||||
maxconn=20,
|
||||
host="localhost",
|
||||
database="vector_search",
|
||||
user="postgres",
|
||||
password="password"
|
||||
)
|
||||
|
||||
def search_similar(query_vector, k=10):
|
||||
"""Search for k most similar documents"""
|
||||
conn = db_pool.getconn()
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
# Set ef_search for this query
|
||||
cur.execute("SET LOCAL ruvector.ef_search = 100")
|
||||
|
||||
# Execute similarity search
|
||||
cur.execute("""
|
||||
SELECT id, title, embedding <-> %s AS distance
|
||||
FROM documents
|
||||
ORDER BY embedding <-> %s
|
||||
LIMIT %s
|
||||
""", (query_vector.tolist(), query_vector.tolist(), k))
|
||||
|
||||
return cur.fetchall()
|
||||
finally:
|
||||
db_pool.putconn(conn)
|
||||
|
||||
# Example usage
|
||||
query = np.random.randn(384).astype(np.float32)
|
||||
results = search_similar(query, k=10)
|
||||
for doc_id, title, distance in results:
|
||||
print(f"{title}: {distance:.4f}")
|
||||
```
|
||||
|
||||
### Monitoring Queries
|
||||
|
||||
```sql
|
||||
-- Create view for monitoring slow vector queries
|
||||
CREATE OR REPLACE VIEW slow_vector_queries AS
|
||||
SELECT
|
||||
calls,
|
||||
total_exec_time,
|
||||
mean_exec_time,
|
||||
max_exec_time,
|
||||
query
|
||||
FROM pg_stat_statements
|
||||
WHERE query LIKE '%<->%'
|
||||
OR query LIKE '%<=>%'
|
||||
OR query LIKE '%<#>%'
|
||||
ORDER BY mean_exec_time DESC;
|
||||
|
||||
-- Monitor slow queries
|
||||
SELECT * FROM slow_vector_queries LIMIT 10;
|
||||
```
|
||||
|
||||
## Step 11: Application Integration
|
||||
|
||||
### REST API Example (Node.js + Express)
|
||||
|
||||
```javascript
|
||||
const express = require('express');
|
||||
const { Pool } = require('pg');
|
||||
|
||||
const app = express();
|
||||
const pool = new Pool({
|
||||
host: 'localhost',
|
||||
database: 'vector_search',
|
||||
user: 'postgres',
|
||||
password: 'password',
|
||||
max: 20
|
||||
});
|
||||
|
||||
app.use(express.json());
|
||||
|
||||
// Search endpoint
|
||||
app.post('/api/search', async (req, res) => {
|
||||
const { query_vector, k = 10, ef_search = 40 } = req.body;
|
||||
|
||||
try {
|
||||
const client = await pool.connect();
|
||||
|
||||
// Set ef_search for this session.
// NOTE: Postgres `SET` cannot take bind parameters, and `SET LOCAL` is a
// no-op outside an explicit transaction — use set_config() instead.
await client.query('SELECT set_config($1, $2, false)', ['ruvector.ef_search', String(ef_search)]);
|
||||
|
||||
// Execute search
|
||||
const result = await client.query(`
|
||||
SELECT id, title, embedding <-> $1::real[] AS distance
|
||||
FROM documents
|
||||
ORDER BY embedding <-> $1::real[]
|
||||
LIMIT $2
|
||||
`, [query_vector, k]);
|
||||
|
||||
client.release();
|
||||
|
||||
res.json({
|
||||
results: result.rows,
|
||||
count: result.rowCount
|
||||
});
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
res.status(500).json({ error: 'Search failed' });
|
||||
}
|
||||
});
|
||||
|
||||
app.listen(3000, () => {
|
||||
console.log('Vector search API running on port 3000');
|
||||
});
|
||||
```
|
||||
|
||||
## Complete Example: Semantic Document Search
|
||||
|
||||
```sql
|
||||
-- 1. Create schema
|
||||
CREATE TABLE articles (
|
||||
id SERIAL PRIMARY KEY,
|
||||
title TEXT NOT NULL,
|
||||
author TEXT,
|
||||
content TEXT NOT NULL,
|
||||
embedding real[], -- 768-dimensional BERT embeddings
|
||||
tags TEXT[],
|
||||
published_at TIMESTAMP,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- 2. Create indexes
|
||||
CREATE INDEX idx_articles_embedding_hnsw
|
||||
ON articles USING hnsw (embedding hnsw_cosine_ops)
|
||||
WITH (m = 32, ef_construction = 128);
|
||||
|
||||
CREATE INDEX idx_articles_tags ON articles USING gin(tags);
|
||||
CREATE INDEX idx_articles_published ON articles(published_at);
|
||||
|
||||
-- 3. Insert articles (with embeddings from your model)
|
||||
INSERT INTO articles (title, author, content, embedding, tags, published_at)
|
||||
VALUES
|
||||
('Introduction to Vector Databases', 'Alice', 'Content...',
 -- Aggregates are not allowed directly in VALUES; use a scalar subquery
 (SELECT array_agg(random())::real[] FROM generate_series(1, 768)),
 ARRAY['database', 'vectors'], '2024-01-15'),
|
||||
-- ... more articles
|
||||
;
|
||||
|
||||
-- 4. Semantic search with filters
|
||||
WITH query AS (
|
||||
SELECT array_agg(random())::real[] AS vec -- Replace with actual embedding
|
||||
FROM generate_series(1, 768)
|
||||
)
|
||||
SELECT
|
||||
a.id,
|
||||
a.title,
|
||||
a.author,
|
||||
a.published_at,
|
||||
a.tags,
|
||||
    a.embedding <=> query.vec AS cosine_distance  -- smaller = more similar
|
||||
FROM articles a, query
|
||||
WHERE
|
||||
a.published_at >= CURRENT_DATE - INTERVAL '30 days' -- Recent articles
|
||||
AND a.tags && ARRAY['database', 'search'] -- Tag filter
|
||||
ORDER BY a.embedding <=> query.vec
|
||||
LIMIT 20;
|
||||
|
||||
-- 5. Analyze performance
|
||||
EXPLAIN (ANALYZE, BUFFERS, VERBOSE)
|
||||
SELECT id, title, embedding <=> $1 AS score
|
||||
FROM articles
|
||||
WHERE published_at >= CURRENT_DATE - INTERVAL '30 days'
|
||||
ORDER BY embedding <=> $1
|
||||
LIMIT 20;
|
||||
```
|
||||
|
||||
## Troubleshooting Common Issues
|
||||
|
||||
### Issue: Slow Index Build
|
||||
|
||||
```sql
|
||||
-- Solution: Increase memory and adjust parameters
|
||||
SET maintenance_work_mem = '4GB';
|
||||
ALTER TABLE documents SET (autovacuum_enabled = false);
|
||||
|
||||
-- Rebuild with lower ef_construction
|
||||
DROP INDEX idx_documents_embedding_hnsw;
|
||||
CREATE INDEX idx_documents_embedding_hnsw
|
||||
ON documents USING hnsw (embedding hnsw_l2_ops)
|
||||
WITH (m = 16, ef_construction = 64);
|
||||
|
||||
-- Re-enable autovacuum
|
||||
ALTER TABLE documents SET (autovacuum_enabled = true);
|
||||
```
|
||||
|
||||
### Issue: Low Recall
|
||||
|
||||
```sql
|
||||
-- Increase ef_search globally
|
||||
ALTER SYSTEM SET ruvector.ef_search = 100;
|
||||
SELECT pg_reload_conf();
|
||||
|
||||
-- Or rebuild index with better parameters
|
||||
CREATE INDEX idx_documents_embedding_hnsw_v2
|
||||
ON documents USING hnsw (embedding hnsw_l2_ops)
|
||||
WITH (m = 32, ef_construction = 200);
|
||||
```
|
||||
|
||||
### Issue: High Memory Usage
|
||||
|
||||
```sql
|
||||
-- Monitor memory
|
||||
SELECT ruvector_memory_stats();
|
||||
|
||||
-- Reduce index size with lower m
|
||||
CREATE INDEX idx_documents_embedding_small
|
||||
ON documents USING hnsw (embedding hnsw_l2_ops)
|
||||
WITH (m = 8, ef_construction = 32);
|
||||
```
|
||||
|
||||
## Conclusion
|
||||
|
||||
This example demonstrates the complete workflow for using HNSW indexes in production:
|
||||
|
||||
1. Extension installation and setup
|
||||
2. Table creation with vector columns
|
||||
3. HNSW index creation with tuning
|
||||
4. Various query patterns (basic, filtered, hybrid)
|
||||
5. Performance optimization
|
||||
6. Maintenance and monitoring
|
||||
7. Application integration
|
||||
|
||||
For more details, see:
|
||||
- [HNSW Index Documentation](HNSW_INDEX.md)
|
||||
- [Implementation Summary](HNSW_IMPLEMENTATION_SUMMARY.md)
|
||||
Reference in New Issue
Block a user