336 lines
8.9 KiB
SQL
336 lines
8.9 KiB
SQL
-- ============================================================================
|
|
-- SparseVec PostgreSQL Type - Usage Examples
|
|
-- ============================================================================
|
|
|
|
-- Basic Usage
|
|
-- ============================================================================
|
|
|
|
-- Create a sparse vector with format {idx:val,idx:val,...}/dimensions
|
|
SELECT '{0:1.5,3:2.5,7:3.5}/10'::sparsevec;
|
|
|
|
-- Create an empty sparse vector
|
|
SELECT '{}/100'::sparsevec;
|
|
|
|
-- Create a dense sparse vector (many non-zeros)
|
|
SELECT '{0:1.0,1:2.0,2:3.0,3:4.0,4:5.0}/5'::sparsevec;
|
|
|
|
-- Introspection
|
|
-- ============================================================================
|
|
|
|
-- Get dimensions
|
|
SELECT sparsevec_dims('{0:1.5,3:2.5,7:3.5}/10'::sparsevec);
|
|
-- Returns: 10
|
|
|
|
-- Get number of non-zero elements
|
|
SELECT sparsevec_nnz('{0:1.5,3:2.5,7:3.5}/10'::sparsevec);
|
|
-- Returns: 3
|
|
|
|
-- Get sparsity ratio
|
|
SELECT sparsevec_sparsity('{0:1.5,3:2.5,7:3.5}/10'::sparsevec);
|
|
-- Returns: 0.3 (30% non-zero)
|
|
|
|
-- Get L2 norm
|
|
SELECT sparsevec_norm('{0:3.0,1:4.0}/5'::sparsevec);
|
|
-- Returns: 5.0
|
|
|
|
-- Get value at specific index
|
|
SELECT sparsevec_get('{0:1.5,3:2.5,7:3.5}/10'::sparsevec, 3);
|
|
-- Returns: 2.5
|
|
|
|
SELECT sparsevec_get('{0:1.5,3:2.5,7:3.5}/10'::sparsevec, 5);
|
|
-- Returns: 0.0 (not present)
|
|
|
|
-- Parse and inspect
|
|
SELECT sparsevec_parse('{0:1.5,3:2.5,7:3.5}/10');
|
|
-- Returns JSON with full details
|
|
|
|
-- Distance Calculations
|
|
-- ============================================================================
|
|
|
|
-- L2 (Euclidean) distance
|
|
SELECT sparsevec_l2_distance(
|
|
'{0:1.0,2:2.0,4:3.0}/5'::sparsevec,
|
|
'{1:1.0,2:1.0,3:2.0}/5'::sparsevec
|
|
);
|
|
|
|
-- Inner product distance (negative dot product)
|
|
SELECT sparsevec_ip_distance(
|
|
'{0:1.0,2:2.0}/5'::sparsevec,
|
|
'{2:1.0,4:3.0}/5'::sparsevec
|
|
);
|
|
-- Returns: -2.0 (only index 2 overlaps: -(2*1))
|
|
|
|
-- Cosine distance
|
|
SELECT sparsevec_cosine_distance(
|
|
'{0:1.0,2:2.0}/5'::sparsevec,
|
|
'{0:2.0,2:4.0}/5'::sparsevec
|
|
);
|
|
-- Returns: ~0.0 (same direction)
|
|
|
|
-- Mixed sparse-dense distances
|
|
SELECT sparsevec_vector_l2_distance(
|
|
'{0:1.0,3:2.0}/5'::sparsevec,
|
|
'[1.0,0.0,0.0,2.0,0.0]'::ruvector
|
|
);
|
|
|
|
SELECT sparsevec_vector_cosine_distance(
|
|
'{0:1.0,3:2.0}/5'::sparsevec,
|
|
'[1.0,0.0,0.0,2.0,0.0]'::ruvector
|
|
);
|
|
|
|
-- Vector Operations
|
|
-- ============================================================================
|
|
|
|
-- Normalize to unit length
|
|
SELECT sparsevec_normalize('{0:3.0,1:4.0}/5'::sparsevec);
|
|
-- Returns: {0:0.6,1:0.8}/5
|
|
|
|
-- Add two sparse vectors
|
|
SELECT sparsevec_add(
|
|
'{0:1.0,2:2.0}/5'::sparsevec,
|
|
'{1:3.0,2:1.0}/5'::sparsevec
|
|
);
|
|
-- Returns: {0:1.0,1:3.0,2:3.0}/5
|
|
|
|
-- Multiply by scalar
|
|
SELECT sparsevec_mul_scalar('{0:1.0,2:2.0}/5'::sparsevec, 2.5);
|
|
-- Returns: {0:2.5,2:5.0}/5
|
|
|
|
-- Conversions
|
|
-- ============================================================================
|
|
|
|
-- Sparse to dense vector
|
|
SELECT sparsevec_to_vector('{0:1.0,3:2.0}/5'::sparsevec);
|
|
-- Returns: [1.0, 0.0, 0.0, 2.0, 0.0]
|
|
|
|
-- Dense to sparse with threshold
|
|
SELECT vector_to_sparsevec('[0.001,0.5,0.002,1.0,0.003]'::ruvector, 0.01);
|
|
-- Returns: {1:0.5,3:1.0}/5 (filters values ≤ 0.01)
|
|
|
|
-- Sparse to array
|
|
SELECT sparsevec_to_array('{0:1.0,3:2.0}/5'::sparsevec);
|
|
|
|
-- Array to sparse
|
|
SELECT array_to_sparsevec(ARRAY[0.001, 0.5, 0.002, 1.0, 0.003]::float4[], 0.01);
|
|
|
|
-- Table Creation and Queries
|
|
-- ============================================================================
|
|
|
|
-- Create table for text embeddings (TF-IDF)
|
|
CREATE TABLE documents (
|
|
id SERIAL PRIMARY KEY,
|
|
title TEXT NOT NULL,
|
|
content TEXT,
|
|
embedding sparsevec(10000) -- 10K vocabulary
|
|
);
|
|
|
|
-- Insert documents with sparse embeddings
|
|
INSERT INTO documents (title, content, embedding) VALUES
|
|
('Document 1', 'machine learning artificial intelligence',
|
|
'{45:0.8,123:0.6,789:0.9,1024:0.7}/10000'),
|
|
('Document 2', 'deep learning neural networks',
|
|
'{45:0.3,234:0.9,789:0.4,2048:0.8}/10000'),
|
|
('Document 3', 'natural language processing',
|
|
'{123:0.7,456:0.9,3072:0.6}/10000');
|
|
|
|
-- Find similar documents using cosine distance
|
|
SELECT
|
|
d.id,
|
|
d.title,
|
|
sparsevec_cosine_distance(d.embedding, query.embedding) AS distance
|
|
FROM
|
|
documents d,
|
|
(SELECT embedding FROM documents WHERE id = 1) AS query
|
|
WHERE
|
|
d.id != 1
|
|
ORDER BY
|
|
distance ASC
|
|
LIMIT 5;
|
|
|
|
-- Find nearest neighbors using L2 distance
|
|
SELECT
|
|
d.id,
|
|
d.title,
|
|
sparsevec_l2_distance(d.embedding,
|
|
'{45:0.8,123:0.6,789:0.9}/10000'::sparsevec) AS distance
|
|
FROM
|
|
documents d
|
|
ORDER BY
|
|
distance ASC
|
|
LIMIT 10;
|
|
|
|
-- Recommender System Example
|
|
-- ============================================================================
|
|
|
|
-- User-item interaction matrix (sparse)
|
|
CREATE TABLE user_profiles (
|
|
user_id INT PRIMARY KEY,
|
|
username TEXT NOT NULL,
|
|
preferences sparsevec(100000) -- 100K items
|
|
);
|
|
|
|
-- Insert user profiles with sparse preference vectors
|
|
INSERT INTO user_profiles (user_id, username, preferences) VALUES
|
|
(1, 'alice', '{123:5.0,456:4.5,789:3.5,1024:4.0}/100000'),
|
|
(2, 'bob', '{123:4.0,234:5.0,789:4.5,2048:3.5}/100000'),
|
|
(3, 'carol', '{456:5.0,890:4.0,2048:4.5,3072:5.0}/100000');
|
|
|
|
-- Collaborative filtering: Find similar users
|
|
SELECT
|
|
u2.user_id,
|
|
u2.username,
|
|
sparsevec_cosine_distance(u1.preferences, u2.preferences) AS similarity
|
|
FROM
|
|
user_profiles u1,
|
|
user_profiles u2
|
|
WHERE
|
|
u1.user_id = 1
|
|
AND u2.user_id != 1
|
|
ORDER BY
|
|
similarity ASC
|
|
LIMIT 10;
|
|
|
|
-- Find items user might like (based on similar users)
|
|
WITH similar_users AS (
|
|
SELECT
|
|
u2.user_id,
|
|
u2.preferences,
|
|
sparsevec_cosine_distance(u1.preferences, u2.preferences) AS similarity
|
|
FROM
|
|
user_profiles u1,
|
|
user_profiles u2
|
|
WHERE
|
|
u1.user_id = 1
|
|
AND u2.user_id != 1
|
|
ORDER BY
|
|
similarity ASC
|
|
LIMIT 5
|
|
)
|
|
SELECT
|
|
user_id,
|
|
similarity
|
|
FROM
|
|
similar_users;
|
|
|
|
-- Graph Embeddings Example
|
|
-- ============================================================================
|
|
|
|
-- Store graph node embeddings
|
|
CREATE TABLE graph_nodes (
|
|
node_id BIGINT PRIMARY KEY,
|
|
node_type TEXT,
|
|
sparse_embedding sparsevec(50000)
|
|
);
|
|
|
|
-- Insert graph nodes with embeddings
|
|
INSERT INTO graph_nodes (node_id, node_type, sparse_embedding) VALUES
|
|
(1, 'person', '{100:0.9,500:0.7,1000:0.8}/50000'),
|
|
(2, 'product', '{200:0.8,600:0.9,1500:0.7}/50000'),
|
|
(3, 'company', '{100:0.5,300:0.8,2000:0.9}/50000');
|
|
|
|
-- Find nearest neighbors in embedding space
|
|
SELECT
|
|
node_id,
|
|
node_type,
|
|
sparsevec_l2_distance(sparse_embedding,
|
|
'{100:0.9,500:0.7,1000:0.8}/50000'::sparsevec) AS distance
|
|
FROM
|
|
graph_nodes
|
|
WHERE
|
|
node_id != 1
|
|
ORDER BY
|
|
distance ASC
|
|
LIMIT 20;
|
|
|
|
-- Statistics and Analytics
|
|
-- ============================================================================
|
|
|
|
-- Analyze sparsity distribution
|
|
SELECT
|
|
percentile_cont(0.5) WITHIN GROUP (ORDER BY sparsevec_sparsity(embedding)) AS median_sparsity,
|
|
AVG(sparsevec_sparsity(embedding)) AS avg_sparsity,
|
|
MIN(sparsevec_nnz(embedding)) AS min_nnz,
|
|
MAX(sparsevec_nnz(embedding)) AS max_nnz
|
|
FROM
|
|
documents;
|
|
|
|
-- Find documents with highest/lowest sparsity
|
|
SELECT
|
|
id,
|
|
title,
|
|
sparsevec_nnz(embedding) AS non_zeros,
|
|
sparsevec_sparsity(embedding) AS sparsity_ratio
|
|
FROM
|
|
documents
|
|
ORDER BY
|
|
sparsity_ratio DESC
|
|
LIMIT 10;
|
|
|
|
-- Performance Comparison
|
|
-- ============================================================================
|
|
|
|
-- Compare storage efficiency
|
|
SELECT
|
|
'Dense' AS type,
|
|
pg_column_size('[' || array_to_string(array_agg(i::text), ',') || ']'::ruvector) AS bytes
|
|
FROM generate_series(1, 10000) AS i
|
|
UNION ALL
|
|
SELECT
|
|
'Sparse (1% non-zero)' AS type,
|
|
pg_column_size('{' || array_to_string(
|
|
array_agg(i || ':1.0'), ',') || '}/10000'::sparsevec) AS bytes
|
|
FROM generate_series(1, 100) AS i;
|
|
|
|
-- Advanced Queries
|
|
-- ============================================================================
|
|
|
|
-- Batch distance calculation
|
|
WITH query_vector AS (
|
|
SELECT '{0:1.0,100:2.0,500:3.0}/10000'::sparsevec AS vec
|
|
)
|
|
SELECT
|
|
d.id,
|
|
d.title,
|
|
sparsevec_cosine_distance(d.embedding, q.vec) AS distance
|
|
FROM
|
|
documents d,
|
|
query_vector q
|
|
ORDER BY
|
|
distance ASC;
|
|
|
|
-- Filter by distance threshold
|
|
SELECT
|
|
d.id,
|
|
d.title
|
|
FROM
|
|
documents d
|
|
WHERE
|
|
sparsevec_cosine_distance(d.embedding,
|
|
'{45:0.8,123:0.6}/10000'::sparsevec) < 0.5
|
|
ORDER BY
|
|
id;
|
|
|
|
-- Aggregate operations
|
|
SELECT
|
|
AVG(sparsevec_norm(embedding)) AS avg_norm,
|
|
STDDEV(sparsevec_norm(embedding)) AS stddev_norm
|
|
FROM
|
|
documents;
|
|
|
|
-- Index Creation (Future Enhancement)
|
|
-- ============================================================================
|
|
|
|
-- These would be available once index support is added:
|
|
-- CREATE INDEX idx_doc_embedding ON documents
|
|
-- USING hnsw (embedding sparsevec_cosine_ops);
|
|
|
|
-- CREATE INDEX idx_user_prefs ON user_profiles
|
|
-- USING ivfflat (preferences sparsevec_l2_ops);
|
|
|
|
-- Cleanup
|
|
-- ============================================================================
|
|
|
|
-- DROP TABLE IF EXISTS documents;
|
|
-- DROP TABLE IF EXISTS user_profiles;
|
|
-- DROP TABLE IF EXISTS graph_nodes;
|