-- git-subtree-dir: vendor/ruvector
-- git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
-- ============================================================================
-- HNSW Index Test Suite
-- ============================================================================
-- End-to-end checks for the HNSW index access method, exercised through the
-- ruvector extension loaded below.
--
-- Usage: psql -d testdb -f hnsw_index_tests.sql

-- Echo every input line so the log shows which statement produced what.
\set ECHO all
-- Abort the script at the first failing statement.
\set ON_ERROR_STOP on

-- Optional: to run inside a dedicated database, enable the two lines below.
-- CREATE DATABASE hnsw_test;
-- \c hnsw_test

-- Install the extension under test unless it is already present.
CREATE EXTENSION IF NOT EXISTS ruvector;
|
|
|
|
-- ============================================================================
-- Test 1: Basic Index Creation
-- ============================================================================
-- Builds a ten-row table of 3D vectors, creates an HNSW index on it with the
-- default options, and lists the table's indexes from the catalog.

\echo '=== Test 1: Basic HNSW Index Creation ==='

-- Start from a clean slate: a run that aborted under ON_ERROR_STOP never
-- reaches the cleanup section, and the leftover table would make the
-- CREATE TABLE below fail on the next run.
DROP TABLE IF EXISTS test_vectors CASCADE;

CREATE TABLE test_vectors (
    id SERIAL PRIMARY KEY,
    embedding real[]
);

-- Insert test data (3D vectors): the eight corners of the unit cube plus two
-- interior points.
INSERT INTO test_vectors (embedding) VALUES
    (ARRAY[0.0, 0.0, 0.0]::real[]),
    (ARRAY[1.0, 0.0, 0.0]::real[]),
    (ARRAY[0.0, 1.0, 0.0]::real[]),
    (ARRAY[0.0, 0.0, 1.0]::real[]),
    (ARRAY[1.0, 1.0, 0.0]::real[]),
    (ARRAY[1.0, 0.0, 1.0]::real[]),
    (ARRAY[0.0, 1.0, 1.0]::real[]),
    (ARRAY[1.0, 1.0, 1.0]::real[]),
    (ARRAY[0.5, 0.5, 0.5]::real[]),
    (ARRAY[0.2, 0.3, 0.1]::real[]);

-- Create HNSW index with default options (L2 distance)
CREATE INDEX test_vectors_hnsw_l2_idx ON test_vectors USING hnsw (embedding hnsw_l2_ops);

-- Verify index was created. The ORDER BY keeps the listing stable between
-- runs (the table has both a primary-key index and the HNSW index).
SELECT indexname, indexdef
FROM pg_indexes
WHERE tablename = 'test_vectors'
ORDER BY indexname;
|
|
|
|
-- ============================================================================
-- Test 2: L2 Distance Queries
-- ============================================================================
-- Runs two nearest-neighbour lookups with the <-> operator against the
-- test_vectors table populated in Test 1.

\echo '=== Test 2: L2 Distance Queries ==='

-- Five rows closest to the origin [0, 0, 0]
SELECT
    tv.id,
    tv.embedding,
    tv.embedding <-> CAST(ARRAY[0.0, 0.0, 0.0] AS real[]) AS distance
FROM test_vectors AS tv
ORDER BY tv.embedding <-> CAST(ARRAY[0.0, 0.0, 0.0] AS real[])
LIMIT 5;

-- Five rows closest to the opposite corner [1, 1, 1]
SELECT
    tv.id,
    tv.embedding,
    tv.embedding <-> CAST(ARRAY[1.0, 1.0, 1.0] AS real[]) AS distance
FROM test_vectors AS tv
ORDER BY tv.embedding <-> CAST(ARRAY[1.0, 1.0, 1.0] AS real[])
LIMIT 5;
|
|
|
|
-- ============================================================================
-- Test 3: Index with Custom Options
-- ============================================================================
-- Builds an HNSW index with explicit m / ef_construction storage parameters
-- over 1000 random 3D vectors and times a top-10 query against it.

\echo '=== Test 3: HNSW Index with Custom Options ==='

-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_vectors_opts CASCADE;

CREATE TABLE test_vectors_opts (
    id SERIAL PRIMARY KEY,
    embedding real[]
);

-- Insert larger dataset
INSERT INTO test_vectors_opts (embedding)
SELECT ARRAY[random(), random(), random()]::real[]
FROM generate_series(1, 1000);

-- Create index with custom parameters
CREATE INDEX test_vectors_opts_hnsw_idx ON test_vectors_opts
USING hnsw (embedding hnsw_l2_ops)
WITH (m = 32, ef_construction = 128);

-- Verify index was created with options. The ORDER BY keeps the listing
-- stable between runs.
SELECT indexname, indexdef
FROM pg_indexes
WHERE tablename = 'test_vectors_opts'
ORDER BY indexname;

-- Query performance test (psql prints the elapsed time while \timing is on)
\timing on
SELECT id, embedding <-> ARRAY[0.5, 0.5, 0.5]::real[] AS distance
FROM test_vectors_opts
ORDER BY embedding <-> ARRAY[0.5, 0.5, 0.5]::real[]
LIMIT 10;
\timing off
|
|
|
|
-- ============================================================================
-- Test 4: Cosine Distance Index
-- ============================================================================
-- Indexes 100 random vectors passed through vector_normalize() with the
-- hnsw_cosine_ops operator class and queries them with the <=> operator.

\echo '=== Test 4: Cosine Distance Index ==='

-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_vectors_cosine CASCADE;

CREATE TABLE test_vectors_cosine (
    id SERIAL PRIMARY KEY,
    embedding real[]
);

-- Insert normalized vectors for cosine similarity
INSERT INTO test_vectors_cosine (embedding)
SELECT vector_normalize(ARRAY[random(), random(), random()]::real[])
FROM generate_series(1, 100);

-- Create HNSW index with cosine distance
CREATE INDEX test_vectors_cosine_idx ON test_vectors_cosine
USING hnsw (embedding hnsw_cosine_ops);

-- Query with cosine distance
SELECT id, embedding <=> ARRAY[1.0, 0.0, 0.0]::real[] AS cosine_dist
FROM test_vectors_cosine
ORDER BY embedding <=> ARRAY[1.0, 0.0, 0.0]::real[]
LIMIT 5;
|
|
|
|
-- ============================================================================
-- Test 5: Inner Product Index
-- ============================================================================
-- Indexes 100 random vectors (components in [0, 10)) with the hnsw_ip_ops
-- operator class and queries them with the <#> operator.

\echo '=== Test 5: Inner Product Index ==='

-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_vectors_ip CASCADE;

CREATE TABLE test_vectors_ip (
    id SERIAL PRIMARY KEY,
    embedding real[]
);

-- Insert test vectors
INSERT INTO test_vectors_ip (embedding)
SELECT ARRAY[random() * 10, random() * 10, random() * 10]::real[]
FROM generate_series(1, 100);

-- Create HNSW index with inner product
CREATE INDEX test_vectors_ip_idx ON test_vectors_ip
USING hnsw (embedding hnsw_ip_ops);

-- Query with inner product (finds vectors with largest inner product).
-- NOTE(review): the neg_ip alias suggests <#> yields the negated inner
-- product, so ascending order puts the largest products first - confirm
-- against the extension's operator definition.
SELECT id, embedding <#> ARRAY[1.0, 1.0, 1.0]::real[] AS neg_ip
FROM test_vectors_ip
ORDER BY embedding <#> ARRAY[1.0, 1.0, 1.0]::real[]
LIMIT 5;
|
|
|
|
-- ============================================================================
-- Test 6: High-Dimensional Vectors
-- ============================================================================
-- Indexes 500 random 128-dimensional vectors and runs a top-5 L2 query with a
-- single random 128-dimensional query vector.

\echo '=== Test 6: High-Dimensional Vectors (128D) ==='

-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_vectors_high_dim CASCADE;

CREATE TABLE test_vectors_high_dim (
    id SERIAL PRIMARY KEY,
    embedding real[]
);

-- Insert 128-dimensional vectors: one output row per vec_no, each aggregating
-- 128 random components. Grouping must name the series column explicitly;
-- "GROUP BY 1" would point at the array_agg() output column, which PostgreSQL
-- rejects because aggregates are not allowed in GROUP BY.
INSERT INTO test_vectors_high_dim (embedding)
SELECT array_agg(random())::real[]
FROM generate_series(1, 500) AS vec(vec_no)
CROSS JOIN generate_series(1, 128) AS dim(dim_no)
GROUP BY vec.vec_no;

-- Create HNSW index
CREATE INDEX test_vectors_high_dim_idx ON test_vectors_high_dim
USING hnsw (embedding hnsw_l2_ops)
WITH (m = 16, ef_construction = 64);

-- Query 128D vectors.
-- Materialise ONE random query vector into the psql variable query_vec via
-- \gset. Interpolating a random() subquery twice would evaluate it twice, so
-- the reported distance and the ORDER BY would use different vectors.
SELECT array_agg(random())::real[] AS query_vec
FROM generate_series(1, 128)
\gset

SELECT id, embedding <-> :'query_vec'::real[] AS distance
FROM test_vectors_high_dim
ORDER BY embedding <-> :'query_vec'::real[]
LIMIT 5;
|
|
|
|
-- ============================================================================
-- Test 7: Index Maintenance
-- ============================================================================
-- Calls the extension's statistics and maintenance entry points, then reports
-- the on-disk size of every index on the test_vectors* tables.

\echo '=== Test 7: Index Maintenance ==='

-- Get memory statistics
SELECT ruvector_memory_stats();

-- Perform index maintenance on the index created in Test 1
SELECT ruvector_index_maintenance('test_vectors_hnsw_l2_idx');

-- Check index size.
-- The regclass lookup is built from the schema-qualified, identifier-quoted
-- name so it resolves even when the schema is not on the search_path or the
-- index name needs quoting. ORDER BY keeps the report stable between runs.
SELECT
    indexname,
    pg_size_pretty(
        pg_relation_size(format('%I.%I', schemaname, indexname)::regclass)
    ) AS index_size
FROM pg_indexes
WHERE tablename LIKE 'test_vectors%'
ORDER BY indexname;
|
|
|
|
-- ============================================================================
-- Test 8: Insert/Delete Operations
-- ============================================================================
-- Mutates the already-indexed test_vectors table (bulk insert, then delete of
-- every even id) and confirms that a nearest-neighbour query still runs.

\echo '=== Test 8: Insert and Delete Operations ==='

-- Add 100 random 3D vectors on top of the existing rows
INSERT INTO test_vectors (embedding)
SELECT CAST(ARRAY[random(), random(), random()] AS real[])
FROM generate_series(1, 100);

-- Row count after the insert
SELECT COUNT(*) FROM test_vectors;

-- Remove the rows whose id is even
DELETE FROM test_vectors
WHERE mod(id, 2) = 0;

-- Row count after the delete
SELECT COUNT(*) FROM test_vectors;

-- The index must still answer queries after the modifications
SELECT
    tv.id,
    tv.embedding <-> CAST(ARRAY[0.5, 0.5, 0.5] AS real[]) AS distance
FROM test_vectors AS tv
ORDER BY tv.embedding <-> CAST(ARRAY[0.5, 0.5, 0.5] AS real[])
LIMIT 5;
|
|
|
|
-- ============================================================================
-- Test 9: Query Plan Analysis
-- ============================================================================
-- Prints the executed plan (with buffer usage) of a top-10 L2 query on the
-- table indexed in Test 3, so the chosen scan type can be inspected.

\echo '=== Test 9: Query Plan Analysis ==='

-- Plan and run the nearest-neighbour query
EXPLAIN (ANALYZE, BUFFERS)
SELECT
    id,
    embedding <-> CAST(ARRAY[0.5, 0.5, 0.5] AS real[]) AS distance
FROM test_vectors_opts
ORDER BY embedding <-> CAST(ARRAY[0.5, 0.5, 0.5] AS real[])
LIMIT 10;
|
|
|
|
-- ============================================================================
-- Test 10: Session Parameter Testing
-- ============================================================================
-- Reads, overrides and resets the ruvector.ef_search session setting around a
-- top-10 L2 query on test_vectors_opts.

\echo '=== Test 10: Session Parameter Testing ==='

-- Display the value currently in effect
SHOW ruvector.ef_search;

-- Raise ef_search (intended to improve recall)
SET ruvector.ef_search TO 100;

-- Same query shape as Test 3, now under the raised setting
SELECT
    tvo.id,
    tvo.embedding <-> CAST(ARRAY[0.5, 0.5, 0.5] AS real[]) AS distance
FROM test_vectors_opts AS tvo
ORDER BY tvo.embedding <-> CAST(ARRAY[0.5, 0.5, 0.5] AS real[])
LIMIT 10;

-- Put the setting back to its default
RESET ruvector.ef_search;
|
|
|
|
-- ============================================================================
-- Test 11: Operator Functionality
-- ============================================================================
-- Evaluates each distance operator directly on two literal vectors, without
-- involving any table or index.

\echo '=== Test 11: Distance Operator Tests ==='

-- <-> (L2 distance)
SELECT CAST(ARRAY[1.0, 2.0, 3.0] AS real[]) <-> CAST(ARRAY[4.0, 5.0, 6.0] AS real[]) AS l2_dist;

-- <=> (cosine distance) on two perpendicular unit vectors
SELECT CAST(ARRAY[1.0, 0.0, 0.0] AS real[]) <=> CAST(ARRAY[0.0, 1.0, 0.0] AS real[]) AS cosine_dist;

-- <#> (inner product)
SELECT CAST(ARRAY[1.0, 2.0, 3.0] AS real[]) <#> CAST(ARRAY[4.0, 5.0, 6.0] AS real[]) AS neg_ip;
|
|
|
|
-- ============================================================================
-- Test 12: Edge Cases
-- ============================================================================
-- Covers a query whose filter matches no rows and an index built over a
-- table that holds exactly one vector.

\echo '=== Test 12: Edge Cases ==='

-- Empty result set
SELECT id, embedding <-> ARRAY[100.0, 100.0, 100.0]::real[] AS distance
FROM test_vectors
WHERE id < 0  -- No results
ORDER BY embedding <-> ARRAY[100.0, 100.0, 100.0]::real[]
LIMIT 5;

-- Single vector table.
-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_single_vector CASCADE;

CREATE TABLE test_single_vector (
    id SERIAL PRIMARY KEY,
    embedding real[]
);

INSERT INTO test_single_vector (embedding) VALUES (ARRAY[1.0, 2.0, 3.0]::real[]);

CREATE INDEX test_single_vector_idx ON test_single_vector
USING hnsw (embedding hnsw_l2_ops);

-- LIMIT exceeds the row count on purpose; explicit column list instead of
-- SELECT * so the output does not change if the table gains columns.
SELECT id, embedding
FROM test_single_vector
ORDER BY embedding <-> ARRAY[0.0, 0.0, 0.0]::real[]
LIMIT 5;
|
|
|
|
-- ============================================================================
-- Test 13: Parameterized Query Regression Tests (Issue #141)
-- ============================================================================
-- These tests verify the fix for HNSW segmentation fault with parameterized
-- queries. See ADR-0027 and GitHub issue #141 for details.

\echo '=== Test 13: Parameterized Query Regression Tests (Issue #141) ==='

-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_ruvector_param CASCADE;

-- Create ruvector table for parameterized query testing
CREATE TABLE test_ruvector_param (
    id SERIAL PRIMARY KEY,
    content TEXT NOT NULL,
    embedding ruvector(8)
);

-- Insert test data with ruvector type
INSERT INTO test_ruvector_param (content, embedding) VALUES
    ('Doc 1', '[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]'::ruvector(8)),
    ('Doc 2', '[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]'::ruvector(8)),
    ('Doc 3', '[0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]'::ruvector(8)),
    ('Doc 4', '[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.1]'::ruvector(8)),
    ('Doc 5', '[0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.1, 0.2]'::ruvector(8));

-- Create HNSW index on ruvector column
CREATE INDEX test_ruvector_param_hnsw_idx ON test_ruvector_param
USING hnsw (embedding ruvector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Test 13a: Literal query (baseline - should work)
\echo '--- Test 13a: Literal Query (baseline) ---'
SELECT
    id,
    content,
    1 - (embedding <=> '[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]'::ruvector(8)) AS similarity
FROM test_ruvector_param
ORDER BY embedding <=> '[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]'::ruvector(8)
LIMIT 3;
|
|
|
|
-- Test 13b: Prepared statement with parameter (was crashing before fix)
\echo '--- Test 13b: Prepared Statement with Parameter ---'

-- The query vector is supplied as $1 at EXECUTE time and cast to ruvector(8)
-- inside the ORDER BY.
PREPARE param_search_test AS
    SELECT doc.id, doc.content
    FROM test_ruvector_param AS doc
    ORDER BY doc.embedding <=> CAST($1 AS ruvector(8))
    LIMIT 3;

-- Two executions with different vectors (the stored values of Doc 1 and Doc 5)
EXECUTE param_search_test('[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]');
EXECUTE param_search_test('[0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.1, 0.2]');

-- Release the prepared statement
DEALLOCATE param_search_test;
|
|
|
|
-- Test 13c: Function with text parameter (simulates driver behavior)
\echo '--- Test 13c: Function with Text Parameter ---'

-- test_hnsw_param_search(query_vec)
--   query_vec: textual vector, cast to ruvector(8) inside the ORDER BY.
--   Returns:   (id, content) of the three test_ruvector_param rows that sort
--              first under the <=> operator.
CREATE OR REPLACE FUNCTION test_hnsw_param_search(query_vec TEXT)
RETURNS TABLE(id INT, content TEXT)
LANGUAGE plpgsql
AS $func$
BEGIN
    RETURN QUERY
        SELECT doc.id, doc.content
        FROM test_ruvector_param AS doc
        ORDER BY doc.embedding <=> CAST(query_vec AS ruvector(8))
        LIMIT 3;
END;
$func$;

-- Call the helper with two different query vectors
SELECT * FROM test_hnsw_param_search('[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]');
SELECT * FROM test_hnsw_param_search('[0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]');

-- The helper is only needed for this sub-test
DROP FUNCTION test_hnsw_param_search;
|
|
|
|
-- Test 13d: Zero vector error handling (should error gracefully, not crash)
\echo '--- Test 13d: Zero Vector Error Handling ---'

-- An error is the expected outcome here, so let the script continue past it.
\set ON_ERROR_STOP off

-- Expected: an ordinary SQL error rather than a backend crash
SELECT doc.id, doc.content
FROM test_ruvector_param AS doc
ORDER BY doc.embedding <=> CAST('[0, 0, 0, 0, 0, 0, 0, 0]' AS ruvector(8))
LIMIT 3;

-- Back to fail-fast for the remaining tests
\set ON_ERROR_STOP on
|
|
|
|
-- Test 13e: Dimension mismatch error handling (should error gracefully)
\echo '--- Test 13e: Dimension Mismatch Error Handling ---'

-- An error is the expected outcome here, so let the script continue past it.
\set ON_ERROR_STOP off

-- A 3-dimensional query vector against the 8-dimensional column; expected to
-- be rejected with a dimension-mismatch error
SELECT doc.id, doc.content
FROM test_ruvector_param AS doc
ORDER BY doc.embedding <=> CAST('[0.1, 0.2, 0.3]' AS ruvector(3))
LIMIT 3;

-- Back to fail-fast for the remaining tests
\set ON_ERROR_STOP on
|
|
|
|
-- Test 13f: 384-dimension vectors (production scale test)
-- Builds a 100-row table of random 384-dimensional ruvector values, indexes
-- it, and runs a prepared, parameterized top-5 search using one of the stored
-- vectors as the parameter.
\echo '--- Test 13f: 384-Dimension Vectors (Production Scale) ---'

-- Remove any leftover from an aborted earlier run so CREATE TABLE succeeds.
DROP TABLE IF EXISTS test_ruvector_384 CASCADE;

CREATE TABLE test_ruvector_384 (
    id SERIAL PRIMARY KEY,
    content TEXT NOT NULL,
    embedding ruvector(384)
);

-- Generate 100 test vectors with 384 dimensions. Each vector is assembled as
-- text ('[v1,v2,...]', components in [-0.5, 0.5) rounded to 4 decimals) and
-- cast to ruvector(384). The FOR loop declares its own integer variable i.
DO $$
DECLARE
    vec_text TEXT;
BEGIN
    FOR i IN 1..100 LOOP
        SELECT '[' || string_agg(((random() - 0.5)::numeric(6,4))::text, ',') || ']'
        INTO vec_text
        FROM generate_series(1, 384);

        INSERT INTO test_ruvector_384 (content, embedding)
        VALUES ('Doc ' || i, vec_text::ruvector(384));
    END LOOP;
END $$;

-- Create HNSW index
CREATE INDEX test_ruvector_384_idx ON test_ruvector_384
USING hnsw (embedding ruvector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Prepare parameterized search on 384-dim vectors
PREPARE param_search_384 AS
SELECT id, content FROM test_ruvector_384
ORDER BY embedding <=> $1::ruvector(384)
LIMIT 5;

-- Report the size of the sample vector's text form
DO $$
DECLARE
    sample_vec TEXT;
BEGIN
    SELECT embedding::text INTO sample_vec FROM test_ruvector_384 WHERE id = 1;
    RAISE NOTICE 'Sample vector extracted, length: %', length(sample_vec);
END $$;

-- Capture the same sample vector in a psql variable and actually run the
-- prepared statement with it. A prepared statement cannot be EXECUTEd from
-- inside a PL/pgSQL DO block, so the value is handed over via \gset.
-- This is the call that would fail before the fix.
SELECT embedding::text AS sample_vec
FROM test_ruvector_384
WHERE id = 1
\gset

EXECUTE param_search_384(:'sample_vec');

DEALLOCATE param_search_384;

\echo '=== Test 13: Parameterized Query Tests Completed ==='
|
|
|
|
-- ============================================================================
-- Cleanup
-- ============================================================================
-- Drops every table the suite created, newest first. IF EXISTS keeps each
-- statement harmless when a table is already gone; CASCADE also removes
-- objects that depend on the table.

\echo '=== Cleanup ==='

-- Tables from Test 13
DROP TABLE IF EXISTS test_ruvector_384 CASCADE;
DROP TABLE IF EXISTS test_ruvector_param CASCADE;

-- Table from Test 12
DROP TABLE IF EXISTS test_single_vector CASCADE;

-- Tables from Tests 1-6
DROP TABLE IF EXISTS test_vectors_high_dim CASCADE;
DROP TABLE IF EXISTS test_vectors_ip CASCADE;
DROP TABLE IF EXISTS test_vectors_cosine CASCADE;
DROP TABLE IF EXISTS test_vectors_opts CASCADE;
DROP TABLE IF EXISTS test_vectors CASCADE;

\echo '=== All tests completed successfully ==='
|