Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,381 @@
-- Realistic workload benchmark: ruvector vs pgvector.
-- Runs the same set of common vector operations against both extensions
-- using a realistically sized dataset.
\timing on
\set ECHO all

-- Configuration (psql variables, interpolated into the statements below)
\set num_vectors 1000000
\set num_queries 1000
\set dims 1536
\set k 10

-- Table creation and data loading share one transaction.
BEGIN;

-- ============================================================================
-- Setup Test Tables
-- ============================================================================
-- Drop leftovers from a previous run so the script can be re-executed.
DROP TABLE IF EXISTS vectors_ruvector CASCADE;
DROP TABLE IF EXISTS vectors_pgvector CASCADE;
DROP TABLE IF EXISTS queries CASCADE;

-- Corpus stored with the ruvector type.
CREATE TABLE vectors_ruvector (
    id        SERIAL PRIMARY KEY,
    embedding ruvector(:dims),
    metadata  JSONB
);

-- Corpus stored with the pgvector type; same layout as above.
CREATE TABLE vectors_pgvector (
    id        SERIAL PRIMARY KEY,
    embedding vector(:dims),
    metadata  JSONB
);

-- Query vectors; stored as ruvector and cast where a pgvector value is needed.
CREATE TABLE queries (
    id           SERIAL PRIMARY KEY,
    query_vector ruvector(:dims)
);
-- ============================================================================
-- Generate Test Data
-- ============================================================================
\echo 'Generating test data...'

-- Every ARRAY(...) subquery below references the outer row counter "i".
-- That reference is deliberate: an ARRAY subquery that does not depend on the
-- outer row is evaluated only once per statement, which would store the very
-- same "random" vector in every row and make all distance, index and recall
-- measurements meaningless.

-- Insert vectors (ruvector)
INSERT INTO vectors_ruvector (embedding, metadata)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL  -- always true; forces per-row re-evaluation
    )),
    jsonb_build_object('category', i % 100)
FROM generate_series(1, :num_vectors) AS i;

-- Insert vectors (pgvector)
INSERT INTO vectors_pgvector (embedding, metadata)
SELECT
    ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL  -- always true; forces per-row re-evaluation
    )::vector(:dims),
    jsonb_build_object('category', i % 100)
FROM generate_series(1, :num_vectors) AS i;

-- Generate query vectors
INSERT INTO queries (query_vector)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL  -- always true; forces per-row re-evaluation
    ))
FROM generate_series(1, :num_queries) AS i;

COMMIT;
-- ============================================================================
-- Benchmark 1: Sequential Scan (No Index)
-- ============================================================================
\echo ''
\echo '=== Benchmark 1: Sequential Scan (No Index) ==='
\echo ''

-- How the latency queries in this section work:
--   * The innermost query takes a timestamp and then runs one k-NN search per
--     query vector. The search is correlated with the query row so it is
--     executed once for each of the 100 sampled queries.
--   * ARRAY(...) folds the k result rows into a single value; a plain scalar
--     subquery is not allowed to return more than one row.
--   * The middle query reads the clock again once the row (and therefore the
--     search) has been produced, and it consumes the search result through
--     cardinality() so the search cannot be dropped as an unused column.
--   * Elapsed time uses epoch * 1000 (total milliseconds). The "milliseconds"
--     field of an interval only covers the seconds part and wraps every
--     minute.
--   * The inner LIMIT keeps the inner query a separate plan step.
-- NOTE(review): this relies on PostgreSQL evaluating the select list left to
-- right (timestamp first, then the search) -- confirm on the target version.
-- Only the primary-key indexes exist at this point, so each search is an
-- exact scan of the whole table; expect this section to take a while.

-- RuVector scan
\echo 'RuVector sequential scan (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        t.id,
        (extract(epoch FROM (clock_timestamp() - t.start_time)) * 1000)::double precision AS duration,
        cardinality(t.neighbors) AS neighbors_found
    FROM (
        SELECT
            q.id,
            clock_timestamp() AS start_time,
            ARRAY(
                SELECT v.id
                FROM vectors_ruvector v
                ORDER BY v.embedding <-> q.query_vector
                LIMIT :k
            ) AS neighbors
        FROM queries q
        LIMIT 100
    ) t
) times;

-- PGVector scan
\echo 'pgvector sequential scan (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        t.id,
        (extract(epoch FROM (clock_timestamp() - t.start_time)) * 1000)::double precision AS duration,
        cardinality(t.neighbors) AS neighbors_found
    FROM (
        SELECT
            q.id,
            clock_timestamp() AS start_time,
            ARRAY(
                SELECT v.id
                FROM vectors_pgvector v
                ORDER BY v.embedding <-> q.query_vector::vector
                LIMIT :k
            ) AS neighbors
        FROM queries q
        LIMIT 100
    ) t
) times;
-- ============================================================================
-- Benchmark 2: Build Index
-- ============================================================================
\echo ''
\echo '=== Benchmark 2: Index Build Time ==='
\echo ''

-- Both indexes use the same HNSW build parameters (m = 16,
-- ef_construction = 64). The build time is whatever psql timing reports for
-- each CREATE INDEX statement.

-- RuVector HNSW
\echo 'Building ruvector HNSW index...'
\timing on
CREATE INDEX vectors_ruvector_hnsw_idx
    ON vectors_ruvector
    USING hnsw (embedding ruvector_l2_ops)
    WITH (m = 16, ef_construction = 64);

-- PGVector HNSW
\echo 'Building pgvector HNSW index...'
\timing on
CREATE INDEX vectors_pgvector_hnsw_idx
    ON vectors_pgvector
    USING hnsw (embedding vector_l2_ops)
    WITH (m = 16, ef_construction = 64);
-- ============================================================================
-- Benchmark 3: Index Search Performance
-- ============================================================================
\echo ''
\echo '=== Benchmark 3: Index Search (HNSW) ==='
\echo ''

-- Warm up: run a bounded number of k-NN searches of the same shape as the
-- measured queries against each table, so both indexes are touched before
-- timing starts. (A distance filter over the full corpus x queries cross join
-- would compute a billion distances and would not exercise the index at all.)
SELECT COUNT(*)
FROM (
    SELECT id, query_vector
    FROM queries
    ORDER BY id
    LIMIT 100
) q
CROSS JOIN LATERAL (
    SELECT v.id
    FROM vectors_ruvector v
    ORDER BY v.embedding <-> q.query_vector
    LIMIT :k
) n;

SELECT COUNT(*)
FROM (
    SELECT id, query_vector
    FROM queries
    ORDER BY id
    LIMIT 100
) q
CROSS JOIN LATERAL (
    SELECT v.id
    FROM vectors_pgvector v
    ORDER BY v.embedding <-> q.query_vector::vector
    LIMIT :k
) n;

-- How the latency queries below work:
--   * The innermost query takes a timestamp and then runs one k-NN search for
--     the current query row.
--   * ARRAY(...) folds the k result rows into a single value; a plain scalar
--     subquery is not allowed to return more than one row.
--   * The middle query reads the clock again after the row has been produced
--     and consumes the result through cardinality(), so the search cannot be
--     dropped as an unused column.
--   * Elapsed time uses epoch * 1000 (total milliseconds) instead of the
--     "milliseconds" interval field, which wraps every minute.
-- NOTE(review): relies on left-to-right evaluation of the select list
-- (timestamp first, then the search) -- confirm on the target version.

-- RuVector HNSW search
\echo 'RuVector HNSW search (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        t.id,
        (extract(epoch FROM (clock_timestamp() - t.start_time)) * 1000)::double precision AS duration,
        cardinality(t.neighbors) AS neighbors_found
    FROM (
        SELECT
            q.id,
            clock_timestamp() AS start_time,
            ARRAY(
                SELECT v.id
                FROM vectors_ruvector v
                ORDER BY v.embedding <-> q.query_vector
                LIMIT :k
            ) AS neighbors
        FROM queries q
        LIMIT :num_queries
    ) t
) times;

-- PGVector HNSW search
\echo 'pgvector HNSW search (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        t.id,
        (extract(epoch FROM (clock_timestamp() - t.start_time)) * 1000)::double precision AS duration,
        cardinality(t.neighbors) AS neighbors_found
    FROM (
        SELECT
            q.id,
            clock_timestamp() AS start_time,
            ARRAY(
                SELECT v.id
                FROM vectors_pgvector v
                ORDER BY v.embedding <-> q.query_vector::vector
                LIMIT :k
            ) AS neighbors
        FROM queries q
        LIMIT :num_queries
    ) t
) times;
-- ============================================================================
-- Benchmark 4: Distance Function Performance
-- ============================================================================
\echo ''
\echo '=== Benchmark 4: Distance Functions ==='
\echo ''

-- Each statement pairs the rows with id <= 100 against the rows with
-- id <= 1000 of the same table and sums the resulting distances, so the
-- reported time is dominated by the distance computations. For every metric
-- the ruvector function is timed first, then the matching pgvector operator.
-- NOTE(review): the "100k" in the labels assumes ids are contiguous from 1.

-- L2 Distance
\echo 'L2 Distance (100k calculations):'
\timing on
SELECT SUM(ruvector_l2_distance(lhs.embedding, rhs.embedding))
FROM vectors_ruvector AS lhs
CROSS JOIN vectors_ruvector AS rhs
WHERE lhs.id <= 100
  AND rhs.id <= 1000;

\timing on
SELECT SUM(lhs.embedding <-> rhs.embedding)
FROM vectors_pgvector AS lhs
CROSS JOIN vectors_pgvector AS rhs
WHERE lhs.id <= 100
  AND rhs.id <= 1000;

-- Cosine Distance
\echo 'Cosine Distance (100k calculations):'
\timing on
SELECT SUM(ruvector_cosine_distance(lhs.embedding, rhs.embedding))
FROM vectors_ruvector AS lhs
CROSS JOIN vectors_ruvector AS rhs
WHERE lhs.id <= 100
  AND rhs.id <= 1000;

\timing on
SELECT SUM(lhs.embedding <=> rhs.embedding)
FROM vectors_pgvector AS lhs
CROSS JOIN vectors_pgvector AS rhs
WHERE lhs.id <= 100
  AND rhs.id <= 1000;

-- Inner Product
-- NOTE(review): pgvector documents its inner-product operator as returning
-- the negated inner product; only the timings are compared here, not values.
\echo 'Inner Product (100k calculations):'
\timing on
SELECT SUM(ruvector_inner_product(lhs.embedding, rhs.embedding))
FROM vectors_ruvector AS lhs
CROSS JOIN vectors_ruvector AS rhs
WHERE lhs.id <= 100
  AND rhs.id <= 1000;

\timing on
SELECT SUM(lhs.embedding <#> rhs.embedding)
FROM vectors_pgvector AS lhs
CROSS JOIN vectors_pgvector AS rhs
WHERE lhs.id <= 100
  AND rhs.id <= 1000;
-- ============================================================================
-- Benchmark 5: Index Recall Accuracy
-- ============================================================================
\echo ''
\echo '=== Benchmark 5: Index Recall ==='
\echo ''

-- Create ground truth table.
-- The HNSW index already exists at this point, so the planner would normally
-- answer this ORDER BY ... LIMIT through the index as well, and recall would
-- end up comparing the index with itself. Index scans are therefore disabled
-- for the duration of this transaction to obtain exact nearest neighbours.
-- NOTE(review): assumes the ruvector hnsw access method honours
-- enable_indexscan the way ordinary index scans do -- confirm with EXPLAIN.
BEGIN;
SET LOCAL enable_indexscan = off;
DROP TABLE IF EXISTS ground_truth;
CREATE TEMP TABLE ground_truth AS
SELECT
    q.id AS query_id,
    ARRAY_AGG(v.id) AS true_neighbors
FROM queries q
CROSS JOIN LATERAL (
    SELECT vr.id
    FROM vectors_ruvector vr
    ORDER BY vr.embedding <-> q.query_vector
    LIMIT :k
) v
WHERE q.id <= 100
GROUP BY q.id;
COMMIT;

-- Compute recall for ruvector HNSW.
-- Index scans are enabled again here (the setting above was transaction
-- local). Recall is the fraction of the approximate neighbours that also
-- appear in the exact neighbour list, averaged over the sampled queries; the
-- order of the ids inside each array is irrelevant, so no ordering is applied
-- when aggregating them.
WITH hnsw_results AS (
    SELECT
        q.id AS query_id,
        ARRAY_AGG(v.id) AS hnsw_neighbors
    FROM queries q
    CROSS JOIN LATERAL (
        SELECT vr.id
        FROM vectors_ruvector vr
        ORDER BY vr.embedding <-> q.query_vector
        LIMIT :k
    ) v
    WHERE q.id <= 100
    GROUP BY q.id
)
SELECT
    AVG(
        (
            SELECT COUNT(*)
            FROM unnest(h.hnsw_neighbors) AS hn
            WHERE hn = ANY(g.true_neighbors)
        )::float / :k
    ) AS recall
FROM hnsw_results h
INNER JOIN ground_truth g ON h.query_id = g.query_id;
-- ============================================================================
-- Benchmark 6: Memory Usage
-- ============================================================================
\echo ''
\echo '=== Benchmark 6: Memory Usage ==='
\echo ''

-- Table sizes: total (table + indexes + TOAST), table only, and all indexes,
-- one row per extension. These are relation sizes as reported by PostgreSQL.
\echo 'Table sizes:'
SELECT
    'ruvector' AS type,
    pg_size_pretty(pg_total_relation_size('vectors_ruvector')) AS total_size,
    pg_size_pretty(pg_relation_size('vectors_ruvector')) AS table_size,
    pg_size_pretty(pg_indexes_size('vectors_ruvector')) AS index_size
UNION ALL
SELECT
    'pgvector' AS type,
    pg_size_pretty(pg_total_relation_size('vectors_pgvector')) AS total_size,
    pg_size_pretty(pg_relation_size('vectors_pgvector')) AS table_size,
    pg_size_pretty(pg_indexes_size('vectors_pgvector')) AS index_size;

-- Index sizes: one row per index on either benchmark table.
-- The index is resolved through a quoted, schema-qualified name so the lookup
-- does not depend on the search_path or on the index name being lower case.
\echo 'Index sizes:'
SELECT
    indexname,
    pg_size_pretty(
        pg_relation_size(format('%I.%I', schemaname, indexname)::regclass)
    ) AS size
FROM pg_indexes
WHERE tablename IN ('vectors_ruvector', 'vectors_pgvector')
ORDER BY tablename, indexname;
-- ============================================================================
-- Benchmark 7: Quantization Performance
-- ============================================================================
\echo ''
\echo '=== Benchmark 7: Quantization ==='
\echo ''

-- Build a table of scalar-quantized vectors from up to 100000 rows of the
-- ruvector corpus. Rows receive fresh serial ids; the source ids are not
-- carried over.
DROP TABLE IF EXISTS vectors_scalar;

CREATE TABLE vectors_scalar (
    id        SERIAL PRIMARY KEY,
    embedding scalarvec
);

INSERT INTO vectors_scalar (embedding)
SELECT quantize_scalar(src.embedding)
FROM vectors_ruvector AS src
LIMIT 100000;

-- Quantized search: k nearest quantized vectors to the quantized form of
-- query 1, timed by psql.
\echo 'Scalar quantized search:'
\timing on
SELECT vs.id
FROM vectors_scalar AS vs
ORDER BY vs.embedding <-> quantize_scalar((
    SELECT q.query_vector
    FROM queries AS q
    WHERE q.id = 1
))
LIMIT :k;
-- ============================================================================
-- Cleanup
-- ============================================================================
\echo ''
\echo '=== Benchmark Complete ==='
\echo ''

-- Remove every table this script created. The temporary ground_truth table
-- from the recall benchmark is dropped explicitly as well, so it does not
-- linger when the script is run inside a longer-lived session.
DROP TABLE IF EXISTS ground_truth;
DROP TABLE IF EXISTS vectors_ruvector CASCADE;
DROP TABLE IF EXISTS vectors_pgvector CASCADE;
DROP TABLE IF EXISTS queries CASCADE;
DROP TABLE IF EXISTS vectors_scalar CASCADE;

View File

@@ -0,0 +1,123 @@
-- Quick benchmark script for development testing.
-- Deliberately small dataset so that each run finishes fast while iterating.
\timing on
\set ECHO all

-- Configuration (psql variables, interpolated into the statements below)
\set num_vectors 10000
\set num_queries 100
\set dims 768
\set k 10

-- Table creation and data loading share one transaction.
BEGIN;

-- ============================================================================
-- Setup
-- ============================================================================
-- Drop leftovers from a previous run so the script can be re-executed.
DROP TABLE IF EXISTS test_vectors CASCADE;
DROP TABLE IF EXISTS test_queries CASCADE;

-- Corpus to search.
CREATE TABLE test_vectors (
    id        SERIAL PRIMARY KEY,
    embedding ruvector(:dims)
);

-- Query vectors used by the searches below.
CREATE TABLE test_queries (
    id           SERIAL PRIMARY KEY,
    query_vector ruvector(:dims)
);
-- ============================================================================
-- Load Data
-- ============================================================================
\echo 'Loading test data...'

-- Both ARRAY(...) subqueries reference the outer row counter "i" on purpose:
-- an ARRAY subquery that does not depend on the outer row is evaluated only
-- once per statement, which would store one identical "random" vector in
-- every row and make the search and distance results meaningless.
INSERT INTO test_vectors (embedding)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL  -- always true; forces per-row re-evaluation
    ))
FROM generate_series(1, :num_vectors) AS i;

INSERT INTO test_queries (query_vector)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL  -- always true; forces per-row re-evaluation
    ))
FROM generate_series(1, :num_queries) AS i;

COMMIT;
-- ============================================================================
-- Sequential Scan Baseline
-- ============================================================================
\echo ''
\echo 'Sequential scan baseline:'
-- k nearest neighbours of query 1 before any vector index exists; the plan
-- and timings are reported by EXPLAIN ANALYZE.
EXPLAIN ANALYZE
SELECT tv.id
FROM test_vectors AS tv
ORDER BY tv.embedding <-> (
    SELECT tq.query_vector
    FROM test_queries AS tq
    WHERE tq.id = 1
)
LIMIT :k;

-- ============================================================================
-- Build HNSW Index
-- ============================================================================
\echo ''
\echo 'Building HNSW index...'
CREATE INDEX test_vectors_hnsw_idx
    ON test_vectors
    USING hnsw (embedding ruvector_l2_ops)
    WITH (m = 16, ef_construction = 64);

-- ============================================================================
-- Index Search
-- ============================================================================
\echo ''
\echo 'HNSW index search:'
-- Same query as the baseline, repeated now that the HNSW index is available,
-- so the two EXPLAIN ANALYZE outputs can be compared directly.
EXPLAIN ANALYZE
SELECT tv.id
FROM test_vectors AS tv
ORDER BY tv.embedding <-> (
    SELECT tq.query_vector
    FROM test_queries AS tq
    WHERE tq.id = 1
)
LIMIT :k;
-- ============================================================================
-- Distance Functions
-- ============================================================================
\echo ''
\echo 'Distance function performance (1000 calculations):'

-- Each statement pairs the rows with id <= 10 against the rows with
-- id <= 100 and sums the distances; the time reported by psql is the result.

-- L2
\timing on
SELECT SUM(ruvector_l2_distance(lhs.embedding, rhs.embedding))
FROM test_vectors AS lhs
CROSS JOIN test_vectors AS rhs
WHERE lhs.id <= 10
  AND rhs.id <= 100;

-- Cosine
\timing on
SELECT SUM(ruvector_cosine_distance(lhs.embedding, rhs.embedding))
FROM test_vectors AS lhs
CROSS JOIN test_vectors AS rhs
WHERE lhs.id <= 10
  AND rhs.id <= 100;

-- Inner Product
\timing on
SELECT SUM(ruvector_inner_product(lhs.embedding, rhs.embedding))
FROM test_vectors AS lhs
CROSS JOIN test_vectors AS rhs
WHERE lhs.id <= 10
  AND rhs.id <= 100;
-- ============================================================================
-- Cleanup
-- ============================================================================
-- Drop both tables created by this script; the HNSW index goes away together
-- with its table.
DROP TABLE IF EXISTS test_vectors CASCADE;
DROP TABLE IF EXISTS test_queries CASCADE;

\echo ''
\echo 'Quick benchmark complete!'