git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
146 lines
5.1 KiB
PL/PgSQL
146 lines
5.1 KiB
PL/PgSQL
-- RuVector Optimized Benchmark Setup
|
|
-- Performance-optimized schema with indexes and parallel-safe functions
|
|
|
|
-- Enable extension
|
|
-- Install the ruvector extension when it is not already present; IF NOT EXISTS
-- keeps this statement from raising an error when the script is re-run.
-- NOTE(review): presumably this supplies the ruvector type and the ruvector_*
-- functions used later in this script -- confirm against the extension.
CREATE EXTENSION IF NOT EXISTS ruvector;
|
|
|
|
-- ============================================================================
|
|
-- Optimized Vector Table with HNSW Index
|
|
-- ============================================================================
|
|
-- Rebuild benchmark_vectors from scratch so every run starts with an empty
-- table. CASCADE also removes objects that depend on the old table.
DROP TABLE IF EXISTS benchmark_vectors CASCADE;

CREATE TABLE benchmark_vectors (
    id         SERIAL,
    embedding  ruvector,   -- vector value the HNSW index is built on
    category   TEXT,       -- free-form label assigned by the seed insert
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);
|
|
|
|
-- Seed 1000 rows, one per generate_series value. Each row gets a fresh
-- ruvector_random(128) embedding (128-dimensional, per the original note)
-- and a 'category_<n>' label. The cast to int rounds to nearest, so n spans
-- 0..10 inclusive.
INSERT INTO benchmark_vectors (embedding, category)
SELECT
    ruvector_random(128),
    'category_' || CAST(random() * 10 AS int)
FROM generate_series(1, 1000) AS seed_row;
|
|
|
|
-- HNSW index on benchmark_vectors.embedding using the ruvector_cosine_ops
-- operator class, intended for fast similarity search.
-- Build parameters:
--   m = 16                 connections per layer
--   ef_construction = 100  build-time accuracy
CREATE INDEX IF NOT EXISTS idx_vectors_hnsw
    ON benchmark_vectors
    USING hnsw (embedding ruvector_cosine_ops)
    WITH (m = 16, ef_construction = 100);
|
|
|
|
-- ============================================================================
|
|
-- Optimized Full-Text Search with GIN Index
|
|
-- ============================================================================
|
|
-- Rebuild benchmark_documents from scratch. CASCADE also removes objects
-- that depend on the old table.
DROP TABLE IF EXISTS benchmark_documents CASCADE;

CREATE TABLE benchmark_documents (
    id               SERIAL,
    content          TEXT,
    -- Stored generated column: recomputed from content on every write, so
    -- the full-text representation can never drift from the source text.
    content_tsvector TSVECTOR
        GENERATED ALWAYS AS (to_tsvector('english', content)) STORED,
    sparse_embedding TEXT,   -- sparse vector kept in its text form
    created_at       TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);
|
|
|
|
-- Seed 500 documents. The text differs only in the embedded document number;
-- the sparse embedding is derived from an 8-element float4 array in which
-- the even positions are random and the odd positions are zero.
INSERT INTO benchmark_documents (content, sparse_embedding)
SELECT
    'Document ' || doc_no || ' contains words like vector database similarity search embedding neural network',
    ruvector_sparse_from_dense(
        CAST(ARRAY[random(), 0, random(), 0, random(), 0, random(), 0] AS float4[])
    )
FROM generate_series(1, 500) AS doc_no;
|
|
|
|
-- GIN index over the content_tsvector column to serve full-text queries.
CREATE INDEX IF NOT EXISTS idx_documents_fts
    ON benchmark_documents
    USING gin (content_tsvector);
|
|
|
|
-- ============================================================================
|
|
-- Optimized Graph Tables with B-tree Indexes
|
|
-- ============================================================================
|
|
-- Remove both graph tables in one statement. IF EXISTS applies to each name,
-- and CASCADE also drops dependent objects such as the edge-to-node
-- foreign keys.
DROP TABLE IF EXISTS benchmark_edges, benchmark_nodes CASCADE;
|
|
|
|
-- Graph nodes: one row per node with a ruvector feature value and a
-- free-form type label.
CREATE TABLE benchmark_nodes (
    id        SERIAL,
    features  ruvector,
    node_type TEXT,
    PRIMARY KEY (id)
);
|
|
|
|
-- Graph edges: directed source -> target pairs between benchmark_nodes rows,
-- each with a type label and a weight that defaults to 1.0.
CREATE TABLE benchmark_edges (
    id        SERIAL,
    source_id INTEGER,
    target_id INTEGER,
    edge_type TEXT,
    weight    DOUBLE PRECISION DEFAULT 1.0,
    PRIMARY KEY (id),
    -- Both endpoints must refer to existing nodes (NULL endpoints are allowed).
    FOREIGN KEY (source_id) REFERENCES benchmark_nodes (id),
    FOREIGN KEY (target_id) REFERENCES benchmark_nodes (id)
);
|
|
|
|
-- Seed 200 graph nodes. Each gets a ruvector_random(64) feature value and a
-- 'type_<n>' label; the cast to int rounds to nearest, so n spans 0..5.
INSERT INTO benchmark_nodes (features, node_type)
SELECT
    ruvector_random(64),
    'type_' || CAST(random() * 5 AS int)
FROM generate_series(1, 200) AS seed_row;
|
|
|
|
-- Seed 1000 random edges.
--   * Endpoints: random() * 199 + 1 rounded to int, i.e. ids in 1..200.
--     NOTE(review): relies on benchmark_nodes holding ids 1..200 (a fresh
--     SERIAL with 200 inserted rows) -- otherwise the foreign keys reject rows.
--   * edge_type: 'edge_<n>' with n in 0..3 after rounding.
--   * weight: random() value in [0, 1).
INSERT INTO benchmark_edges (source_id, target_id, edge_type, weight)
SELECT
    CAST(random() * 199 + 1 AS int),
    CAST(random() * 199 + 1 AS int),
    'edge_' || CAST(random() * 3 AS int),
    random()
FROM generate_series(1, 1000) AS seed_row;
|
|
|
|
-- B-tree indexes supporting edge lookups by source, by target, and by the
-- (source, target) pair.
CREATE INDEX IF NOT EXISTS idx_edges_source
    ON benchmark_edges USING btree (source_id);

CREATE INDEX IF NOT EXISTS idx_edges_target
    ON benchmark_edges USING btree (target_id);

CREATE INDEX IF NOT EXISTS idx_edges_source_target
    ON benchmark_edges USING btree (source_id, target_id);
|
|
|
|
-- ============================================================================
|
|
-- Optimized Quantization Tables
|
|
-- ============================================================================
|
|
-- Rebuild benchmark_quantized from scratch. CASCADE also removes objects
-- that depend on the old table.
DROP TABLE IF EXISTS benchmark_quantized CASCADE;

CREATE TABLE benchmark_quantized (
    id               SERIAL,
    original         ruvector,      -- unquantized source vector
    binary_quantized BIT VARYING,   -- binary-quantized form of original
    scalar_quantized BYTEA,         -- scalar-quantized form of original
    PRIMARY KEY (id)
);
|
|
|
|
-- Copy 500 vectors from benchmark_vectors and store each one alongside its
-- binary and scalar quantizations.
--
-- ORDER BY v.id makes the LIMIT deterministic: without an ordering the
-- database may return any 500 rows, so repeated runs could quantize
-- different subsets. With it, the 500 lowest ids are always chosen.
-- NOTE(review): the second argument to ruvector_scalar_quantize (8) is
-- presumably the bit width -- confirm against the extension's documentation.
INSERT INTO benchmark_quantized (original, binary_quantized, scalar_quantized)
SELECT
    v.embedding,
    ruvector_binary_quantize(v.embedding),
    ruvector_scalar_quantize(v.embedding, 8)
FROM benchmark_vectors AS v
ORDER BY v.id
LIMIT 500;
|
|
|
|
-- ============================================================================
|
|
-- Parallel-Safe Helper Functions
|
|
-- ============================================================================
|
|
|
|
-- bench_cosine_distance(a, b): two-argument wrapper that calls
-- ruvector_distance with the metric fixed to 'cosine' and returns the result
-- as double precision.
-- Declared IMMUTABLE and PARALLEL SAFE.
-- NOTE(review): those markings are only valid if ruvector_distance itself is
-- immutable and parallel safe -- confirm against the extension.
CREATE OR REPLACE FUNCTION bench_cosine_distance(a ruvector, b ruvector)
RETURNS double precision
LANGUAGE sql
IMMUTABLE
PARALLEL SAFE
AS $body$
    SELECT ruvector_distance(a, b, 'cosine')
$body$;
|
|
|
|
-- bench_hamming_distance(a, b): number of bit positions in which the two bit
-- strings differ. The inputs are XORed (#) and the set bits of the result are
-- counted with bit_count; the bigint count is narrowed to integer.
-- The XOR operator requires both bit strings to have the same length.
-- Declared IMMUTABLE and PARALLEL SAFE.
CREATE OR REPLACE FUNCTION bench_hamming_distance(a BIT VARYING, b BIT VARYING)
RETURNS integer
LANGUAGE sql
IMMUTABLE
PARALLEL SAFE
AS $body$
    SELECT CAST(bit_count(a # b) AS integer)
$body$;
|
|
|
|
-- bench_sparse_dot(a, b): wrapper over ruvector_sparse_distance for two
-- sparse vectors passed in text form.
-- NOTE(review): despite the "dot" in its name, the body requests the 'cosine'
-- metric from ruvector_sparse_distance, so the result is whatever that call
-- returns for 'cosine', not a dot product -- confirm which was intended.
-- Declared IMMUTABLE and PARALLEL SAFE.
CREATE OR REPLACE FUNCTION bench_sparse_dot(a TEXT, b TEXT)
RETURNS double precision
LANGUAGE sql
IMMUTABLE
PARALLEL SAFE
AS $body$
    SELECT ruvector_sparse_distance(a, b, 'cosine')
$body$;
|
|
|
|
-- ============================================================================
|
|
-- Statistics Update
|
|
-- ============================================================================
|
|
-- Refresh planner statistics for every benchmark table now that the data and
-- indexes are in place.
ANALYZE
    benchmark_vectors,
    benchmark_documents,
    benchmark_nodes,
    benchmark_edges,
    benchmark_quantized;
|
|
|
|
-- Emit a one-row, one-column result (status) as a completion marker; it is
-- only reached if every preceding statement ran when the client stops on error.
SELECT 'Optimized benchmark setup complete' AS status;
|