Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
381
vendor/ruvector/crates/ruvector-postgres/benches/sql/benchmark_workload.sql
vendored
Normal file
381
vendor/ruvector/crates/ruvector-postgres/benches/sql/benchmark_workload.sql
vendored
Normal file
@@ -0,0 +1,381 @@
|
||||
-- Full-size workload benchmark comparing ruvector against pgvector.
-- Runs the common vector operations (load, scan, index build, index search,
-- distance functions, recall, storage, quantization) on a realistic dataset.

-- psql session settings: print the elapsed time of every statement and echo
-- each input line so the output log is self-describing.
\timing on
\set ECHO all

-- Benchmark parameters. These are psql variables and are interpolated
-- textually into the statements below.
--   num_vectors  rows loaded into each vector table
--   num_queries  rows loaded into the query table
--   dims         dimensionality of every vector
--   k            number of nearest neighbours requested per search
\set num_vectors 1000000
\set num_queries 1000
\set dims 1536
\set k 10
|
||||
|
||||
-- Table setup and data generation share one transaction, opened here.
BEGIN;

-- ============================================================================
-- Setup Test Tables
-- ============================================================================

-- Start from a clean slate so the script can be re-run.
DROP TABLE IF EXISTS vectors_ruvector CASCADE;
DROP TABLE IF EXISTS vectors_pgvector CASCADE;
DROP TABLE IF EXISTS queries CASCADE;

-- Data table using the ruvector column type.
CREATE TABLE vectors_ruvector (
    id        SERIAL PRIMARY KEY,
    embedding ruvector(:dims),
    metadata  JSONB
);

-- Data table using the pgvector column type; same shape as above.
CREATE TABLE vectors_pgvector (
    id        SERIAL PRIMARY KEY,
    embedding vector(:dims),
    metadata  JSONB
);

-- Query vectors, stored as ruvector and cast when used against pgvector.
CREATE TABLE queries (
    id           SERIAL PRIMARY KEY,
    query_vector ruvector(:dims)
);
|
||||
|
||||
-- ============================================================================
-- Generate Test Data
-- ============================================================================

\echo 'Generating test data...'

-- Each ARRAY(...) subquery below references the outer generate_series value
-- (the "IS NOT NULL" predicate, which is always true). Without that reference
-- the subquery is uncorrelated, PostgreSQL evaluates it a single time, and
-- every inserted row receives the SAME random vector -- which makes every
-- distance identical and the benchmark meaningless.

-- Insert vectors (ruvector)
INSERT INTO vectors_ruvector (embedding, metadata)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL
    )),
    jsonb_build_object('category', i % 100)
FROM generate_series(1, :num_vectors) AS i;

-- Insert vectors (pgvector)
INSERT INTO vectors_pgvector (embedding, metadata)
SELECT
    ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE i IS NOT NULL
    )::vector(:dims),
    jsonb_build_object('category', i % 100)
FROM generate_series(1, :num_vectors) AS i;

-- Generate query vectors
INSERT INTO queries (query_vector)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE query_no IS NOT NULL
    ))
FROM generate_series(1, :num_queries) AS query_no;

-- Closes the transaction opened before table setup.
COMMIT;
|
||||
|
||||
-- ============================================================================
-- Benchmark 1: Sequential Scan (No Index)
-- ============================================================================

\echo ''
\echo '=== Benchmark 1: Sequential Scan (No Index) ==='
\echo ''

-- Per-query latency of a k-nearest-neighbour search with no vector index.
--
-- How the measurement works:
--   * The innermost derived table emits one row per query vector. For each
--     row it records clock_timestamp() and then runs the search as a
--     correlated scalar subquery, so the search executes once per query.
--   * The search is wrapped in COUNT(*) so the scalar subquery yields exactly
--     one row even though the search itself returns k rows.
--   * The next level reads clock_timestamp() again once the row is produced;
--     the difference is the search latency. This relies on the select list
--     being evaluated left to right, which holds in practice but is not a
--     documented guarantee.
--   * neighbors_found is summed in the final output so the search result is
--     actually consumed (an unused column could be optimised away) and so the
--     run can be sanity-checked: it should equal 100 * k.
--   * Latency uses the epoch field; the milliseconds field only covers the
--     seconds part of an interval and wraps once a search exceeds a minute.

-- RuVector scan
\echo 'RuVector sequential scan (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        neighbors_found,
        extract(epoch FROM (clock_timestamp() - start_time)) * 1000 AS duration
    FROM (
        SELECT
            q.id AS query_id,
            clock_timestamp() AS start_time,
            (
                SELECT COUNT(*)
                FROM (
                    SELECT v.id
                    FROM vectors_ruvector AS v
                    ORDER BY v.embedding <-> q.query_vector
                    LIMIT :k
                ) AS nearest
            ) AS neighbors_found
        FROM queries AS q
        LIMIT 100
    ) AS t
) AS times;

-- PGVector scan
\echo 'pgvector sequential scan (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        neighbors_found,
        extract(epoch FROM (clock_timestamp() - start_time)) * 1000 AS duration
    FROM (
        SELECT
            q.id AS query_id,
            clock_timestamp() AS start_time,
            (
                SELECT COUNT(*)
                FROM (
                    SELECT v.id
                    FROM vectors_pgvector AS v
                    ORDER BY v.embedding <-> q.query_vector::vector
                    LIMIT :k
                ) AS nearest
            ) AS neighbors_found
        FROM queries AS q
        LIMIT 100
    ) AS t
) AS times;
|
||||
|
||||
-- ============================================================================
-- Benchmark 2: Build Index
-- ============================================================================

\echo ''
\echo '=== Benchmark 2: Index Build Time ==='
\echo ''

-- Both indexes use the same HNSW parameters (m = 16, ef_construction = 64)
-- and an L2 operator class. The build time is whatever psql's timing output
-- reports for each CREATE INDEX statement.

-- RuVector HNSW
\echo 'Building ruvector HNSW index...'
\timing on
CREATE INDEX vectors_ruvector_hnsw_idx
    ON vectors_ruvector
    USING hnsw (embedding ruvector_l2_ops)
    WITH (m = 16, ef_construction = 64);

-- PGVector HNSW
\echo 'Building pgvector HNSW index...'
\timing on
CREATE INDEX vectors_pgvector_hnsw_idx
    ON vectors_pgvector
    USING hnsw (embedding vector_l2_ops)
    WITH (m = 16, ef_construction = 64);
|
||||
|
||||
-- ============================================================================
-- Benchmark 3: Index Search Performance
-- ============================================================================

\echo ''
\echo '=== Benchmark 3: Index Search (HNSW) ==='
\echo ''

-- Warm up
-- Run a handful of real k-NN searches against each table so caches are warm
-- before timing starts. (A cross join of every vector with every query under
-- COUNT(*) would compute num_vectors * num_queries distances, and a LIMIT on
-- a single-row aggregate does not bound that work.) Both tables are warmed so
-- neither implementation gets an unfair head start.
SELECT COUNT(*)
FROM (
    SELECT id, query_vector
    FROM queries
    ORDER BY id
    LIMIT 10
) AS q
CROSS JOIN LATERAL (
    SELECT v.id
    FROM vectors_ruvector AS v
    ORDER BY v.embedding <-> q.query_vector
    LIMIT :k
) AS nearest;

SELECT COUNT(*)
FROM (
    SELECT id, query_vector
    FROM queries
    ORDER BY id
    LIMIT 10
) AS q
CROSS JOIN LATERAL (
    SELECT v.id
    FROM vectors_pgvector AS v
    ORDER BY v.embedding <-> q.query_vector::vector
    LIMIT :k
) AS nearest;

-- Per-query latency of a k-nearest-neighbour search.
--   * The innermost derived table emits one row per query vector: it records
--     clock_timestamp() and then runs the search as a correlated scalar
--     subquery, wrapped in COUNT(*) so it yields exactly one row.
--   * The next level reads clock_timestamp() again; the difference is the
--     latency (relies on left-to-right select-list evaluation, which holds in
--     practice but is not a documented guarantee).
--   * neighbors_found is summed in the output so the search result is
--     consumed rather than discarded as an unused column; it should equal
--     num_queries * k.
--   * The epoch field is used because the milliseconds field wraps at one
--     minute.

-- RuVector HNSW search
\echo 'RuVector HNSW search (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        neighbors_found,
        extract(epoch FROM (clock_timestamp() - start_time)) * 1000 AS duration
    FROM (
        SELECT
            q.id AS query_id,
            clock_timestamp() AS start_time,
            (
                SELECT COUNT(*)
                FROM (
                    SELECT v.id
                    FROM vectors_ruvector AS v
                    ORDER BY v.embedding <-> q.query_vector
                    LIMIT :k
                ) AS nearest
            ) AS neighbors_found
        FROM queries AS q
        LIMIT :num_queries
    ) AS t
) AS times;

-- PGVector HNSW search
\echo 'pgvector HNSW search (p50, p99 latency):'
SELECT
    percentile_cont(0.5) WITHIN GROUP (ORDER BY duration) AS p50_ms,
    percentile_cont(0.99) WITHIN GROUP (ORDER BY duration) AS p99_ms,
    AVG(duration) AS avg_ms,
    MIN(duration) AS min_ms,
    MAX(duration) AS max_ms,
    SUM(neighbors_found) AS neighbors_found
FROM (
    SELECT
        neighbors_found,
        extract(epoch FROM (clock_timestamp() - start_time)) * 1000 AS duration
    FROM (
        SELECT
            q.id AS query_id,
            clock_timestamp() AS start_time,
            (
                SELECT COUNT(*)
                FROM (
                    SELECT v.id
                    FROM vectors_pgvector AS v
                    ORDER BY v.embedding <-> q.query_vector::vector
                    LIMIT :k
                ) AS nearest
            ) AS neighbors_found
        FROM queries AS q
        LIMIT :num_queries
    ) AS t
) AS times;
|
||||
|
||||
-- ============================================================================
-- Benchmark 4: Distance Function Performance
-- ============================================================================

\echo ''
\echo '=== Benchmark 4: Distance Functions ==='
\echo ''

-- Every query pairs the rows with id <= 100 against the rows with id <= 1000
-- (100 x 1000 pairs) and sums one distance per pair; the elapsed time comes
-- from psql's timing output. ruvector is exercised through its named
-- functions, pgvector through its operators.

-- L2 Distance
\echo 'L2 Distance (100k calculations):'
\timing on
SELECT SUM(ruvector_l2_distance(lhs.embedding, rhs.embedding))
FROM (SELECT embedding FROM vectors_ruvector WHERE id <= 100) AS lhs
CROSS JOIN (SELECT embedding FROM vectors_ruvector WHERE id <= 1000) AS rhs;

\timing on
SELECT SUM(lhs.embedding <-> rhs.embedding)
FROM (SELECT embedding FROM vectors_pgvector WHERE id <= 100) AS lhs
CROSS JOIN (SELECT embedding FROM vectors_pgvector WHERE id <= 1000) AS rhs;

-- Cosine Distance
\echo 'Cosine Distance (100k calculations):'
\timing on
SELECT SUM(ruvector_cosine_distance(lhs.embedding, rhs.embedding))
FROM (SELECT embedding FROM vectors_ruvector WHERE id <= 100) AS lhs
CROSS JOIN (SELECT embedding FROM vectors_ruvector WHERE id <= 1000) AS rhs;

\timing on
SELECT SUM(lhs.embedding <=> rhs.embedding)
FROM (SELECT embedding FROM vectors_pgvector WHERE id <= 100) AS lhs
CROSS JOIN (SELECT embedding FROM vectors_pgvector WHERE id <= 1000) AS rhs;

-- Inner Product
\echo 'Inner Product (100k calculations):'
\timing on
SELECT SUM(ruvector_inner_product(lhs.embedding, rhs.embedding))
FROM (SELECT embedding FROM vectors_ruvector WHERE id <= 100) AS lhs
CROSS JOIN (SELECT embedding FROM vectors_ruvector WHERE id <= 1000) AS rhs;

\timing on
SELECT SUM(lhs.embedding <#> rhs.embedding)
FROM (SELECT embedding FROM vectors_pgvector WHERE id <= 100) AS lhs
CROSS JOIN (SELECT embedding FROM vectors_pgvector WHERE id <= 1000) AS rhs;
|
||||
|
||||
-- ============================================================================
-- Benchmark 5: Index Recall Accuracy
-- ============================================================================

\echo ''
\echo '=== Benchmark 5: Index Recall ==='
\echo ''

-- Recall = fraction of the exact k nearest neighbours that the HNSW index
-- also returns, averaged over the first 100 query vectors.

-- Create ground truth table
DROP TABLE IF EXISTS ground_truth;

-- The ground truth must come from an exact search. By this point the HNSW
-- index exists, and the query below has the same shape as the indexed search,
-- so index scans are switched off while it runs; otherwise the "truth" could
-- be produced by the very index being evaluated and recall would trivially
-- come out as 1.0.
SET enable_indexscan = off;

CREATE TEMP TABLE ground_truth AS
SELECT
    q.id AS query_id,
    ARRAY_AGG(v.id ORDER BY v.embedding <-> q.query_vector) AS true_neighbors
FROM queries AS q
CROSS JOIN LATERAL (
    SELECT id, embedding
    FROM vectors_ruvector
    ORDER BY embedding <-> q.query_vector
    LIMIT :k
) AS v
WHERE q.id <= 100
GROUP BY q.id;

-- Restore the default so the search below is free to use the HNSW index.
RESET enable_indexscan;

-- Compute recall for ruvector HNSW
-- The lateral subquery projects only id, so the aggregate must not order by
-- the embedding column (it is not in scope here). Neighbour order does not
-- affect recall, which is a set intersection.
WITH hnsw_results AS (
    SELECT
        q.id AS query_id,
        ARRAY_AGG(v.id) AS hnsw_neighbors
    FROM queries AS q
    CROSS JOIN LATERAL (
        SELECT id
        FROM vectors_ruvector
        ORDER BY embedding <-> q.query_vector
        LIMIT :k
    ) AS v
    WHERE q.id <= 100
    GROUP BY q.id
)
SELECT
    AVG(
        (
            SELECT COUNT(*)
            FROM unnest(h.hnsw_neighbors) AS hn
            WHERE hn = ANY(g.true_neighbors)
        )::float / :k
    ) AS recall
FROM hnsw_results AS h
INNER JOIN ground_truth AS g
    ON h.query_id = g.query_id;
|
||||
|
||||
-- ============================================================================
-- Benchmark 6: Memory Usage
-- ============================================================================

\echo ''
\echo '=== Benchmark 6: Memory Usage ==='
\echo ''

-- Table sizes
-- One row per vector table: total size, heap-only size and combined index
-- size, each pretty-printed.
\echo 'Table sizes:'
SELECT
    rel.label AS type,
    pg_size_pretty(pg_total_relation_size(rel.table_oid)) AS total_size,
    pg_size_pretty(pg_relation_size(rel.table_oid)) AS table_size,
    pg_size_pretty(pg_indexes_size(rel.table_oid)) AS index_size
FROM (
    VALUES
        ('ruvector', 'vectors_ruvector'::regclass),
        ('pgvector', 'vectors_pgvector'::regclass)
) AS rel (label, table_oid);

-- Index sizes
-- One row per index on either vector table (primary key and HNSW).
\echo 'Index sizes:'
SELECT
    idx.indexname,
    pg_size_pretty(pg_relation_size(idx.indexname::regclass)) AS size
FROM pg_indexes AS idx
WHERE idx.tablename IN ('vectors_ruvector', 'vectors_pgvector')
ORDER BY idx.tablename, idx.indexname;
|
||||
|
||||
-- ============================================================================
-- Benchmark 7: Quantization Performance
-- ============================================================================

\echo ''
\echo '=== Benchmark 7: Quantization ==='
\echo ''

-- Create quantized tables
-- vectors_scalar holds scalar-quantized copies of up to 100000 rows taken
-- from vectors_ruvector (no ORDER BY, so which rows are taken is unspecified).
DROP TABLE IF EXISTS vectors_scalar;
CREATE TABLE vectors_scalar (
    id        SERIAL PRIMARY KEY,
    embedding scalarvec
);

INSERT INTO vectors_scalar (embedding)
SELECT quantize_scalar(src.embedding)
FROM vectors_ruvector AS src
LIMIT 100000;

-- Quantized search
-- k-NN over the quantized table; the probe is query 1, quantized the same way.
\echo 'Scalar quantized search:'
\timing on
SELECT vs.id
FROM vectors_scalar AS vs
ORDER BY vs.embedding <-> quantize_scalar((SELECT q.query_vector FROM queries AS q WHERE q.id = 1))
LIMIT :k;
|
||||
|
||||
-- ============================================================================
-- Cleanup
-- ============================================================================

\echo ''
\echo '=== Benchmark Complete ==='
\echo ''

-- Remove every permanent table the benchmark created. CASCADE also removes
-- dependent objects such as the indexes built on these tables.
DROP TABLE IF EXISTS vectors_ruvector CASCADE;
DROP TABLE IF EXISTS vectors_pgvector CASCADE;
DROP TABLE IF EXISTS queries CASCADE;
DROP TABLE IF EXISTS vectors_scalar CASCADE;
|
||||
123
vendor/ruvector/crates/ruvector-postgres/benches/sql/quick_benchmark.sql
vendored
Normal file
123
vendor/ruvector/crates/ruvector-postgres/benches/sql/quick_benchmark.sql
vendored
Normal file
@@ -0,0 +1,123 @@
|
||||
-- Quick ruvector benchmark for development: same kinds of measurement as a
-- full workload run, on a much smaller dataset so it finishes quickly.

-- psql session settings: time every statement and echo each input line.
\timing on
\set ECHO all

-- Benchmark parameters (psql variables, interpolated textually below).
--   num_vectors  rows loaded into test_vectors
--   num_queries  rows loaded into test_queries
--   dims         dimensionality of every vector
--   k            neighbours requested per search
\set num_vectors 10000
\set num_queries 100
\set dims 768
\set k 10
|
||||
|
||||
-- Table setup and data loading share one transaction, opened here.
BEGIN;

-- ============================================================================
-- Setup
-- ============================================================================

-- Drop leftovers from a previous run so the script is re-runnable.
DROP TABLE IF EXISTS test_vectors CASCADE;
DROP TABLE IF EXISTS test_queries CASCADE;

-- Vectors to be searched.
CREATE TABLE test_vectors (
    id        SERIAL PRIMARY KEY,
    embedding ruvector(:dims)
);

-- Probe vectors used as search inputs.
CREATE TABLE test_queries (
    id           SERIAL PRIMARY KEY,
    query_vector ruvector(:dims)
);
|
||||
|
||||
-- ============================================================================
-- Load Data
-- ============================================================================

\echo 'Loading test data...'

-- Each ARRAY(...) subquery references the outer generate_series value (the
-- always-true "IS NOT NULL" predicate). Without that reference the subquery
-- is uncorrelated, PostgreSQL evaluates it only once, and every row ends up
-- with the SAME random vector.

INSERT INTO test_vectors (embedding)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE vector_no IS NOT NULL
    ))
FROM generate_series(1, :num_vectors) AS vector_no;

INSERT INTO test_queries (query_vector)
SELECT
    array_to_ruvector(ARRAY(
        SELECT random()::real
        FROM generate_series(1, :dims)
        WHERE query_no IS NOT NULL
    ))
FROM generate_series(1, :num_queries) AS query_no;

-- Closes the transaction opened before table setup.
COMMIT;
|
||||
|
||||
-- ============================================================================
-- Sequential Scan Baseline
-- ============================================================================

\echo ''
\echo 'Sequential scan baseline:'
-- k-NN for query 1 before any vector index exists; EXPLAIN ANALYZE prints the
-- chosen plan together with actual execution time.
EXPLAIN ANALYZE
SELECT tv.id
FROM test_vectors AS tv
ORDER BY tv.embedding <-> (SELECT tq.query_vector FROM test_queries AS tq WHERE tq.id = 1)
LIMIT :k;
|
||||
|
||||
-- ============================================================================
-- Build HNSW Index
-- ============================================================================

\echo ''
\echo 'Building HNSW index...'
-- HNSW index with the L2 operator class; build time is reported by psql's
-- timing output for this statement.
CREATE INDEX test_vectors_hnsw_idx
    ON test_vectors
    USING hnsw (embedding ruvector_l2_ops)
    WITH (m = 16, ef_construction = 64);
|
||||
|
||||
-- ============================================================================
-- Index Search
-- ============================================================================

\echo ''
\echo 'HNSW index search:'
-- Same k-NN query as the sequential-scan baseline, now that the HNSW index
-- exists; compare the two EXPLAIN ANALYZE outputs.
EXPLAIN ANALYZE
SELECT tv.id
FROM test_vectors AS tv
ORDER BY tv.embedding <-> (SELECT tq.query_vector FROM test_queries AS tq WHERE tq.id = 1)
LIMIT :k;
|
||||
|
||||
-- ============================================================================
-- Distance Functions
-- ============================================================================

\echo ''
\echo 'Distance function performance (1000 calculations):'

-- Each query pairs the rows with id <= 10 against the rows with id <= 100
-- (10 x 100 pairs) and sums one distance per pair; elapsed time comes from
-- psql's timing output.

-- L2
\timing on
SELECT SUM(ruvector_l2_distance(lhs.embedding, rhs.embedding))
FROM (SELECT embedding FROM test_vectors WHERE id <= 10) AS lhs
CROSS JOIN (SELECT embedding FROM test_vectors WHERE id <= 100) AS rhs;

-- Cosine
\timing on
SELECT SUM(ruvector_cosine_distance(lhs.embedding, rhs.embedding))
FROM (SELECT embedding FROM test_vectors WHERE id <= 10) AS lhs
CROSS JOIN (SELECT embedding FROM test_vectors WHERE id <= 100) AS rhs;

-- Inner Product
\timing on
SELECT SUM(ruvector_inner_product(lhs.embedding, rhs.embedding))
FROM (SELECT embedding FROM test_vectors WHERE id <= 10) AS lhs
CROSS JOIN (SELECT embedding FROM test_vectors WHERE id <= 100) AS rhs;
|
||||
|
||||
-- ============================================================================
-- Cleanup
-- ============================================================================

-- Remove both benchmark tables; CASCADE also drops the HNSW index.
DROP TABLE IF EXISTS test_vectors CASCADE;
DROP TABLE IF EXISTS test_queries CASCADE;

\echo ''
\echo 'Quick benchmark complete!'
|
||||
Reference in New Issue
Block a user