git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
323 lines
10 KiB
SQL
323 lines
10 KiB
SQL
-- =============================================================================
|
|
-- RuVector Self-Learning Module Usage Examples
|
|
-- =============================================================================
|
|
-- This file demonstrates how to use the self-learning and ReasoningBank
|
|
-- features for adaptive query optimization.
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 1. Basic Setup: Enable Learning
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Turn learning on for a table, relying on the default configuration.
SELECT ruvector_enable_learning('my_vectors');

-- Same call, but with a custom configuration supplied as a JSONB document.
SELECT ruvector_enable_learning(
    'my_vectors',
    CAST('{"max_trajectories": 2000, "num_clusters": 15}' AS jsonb)
);
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 2. Recording Query Trajectories
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Trajectories are typically recorded automatically by search functions,
|
|
-- but you can also record them manually for testing or custom workflows.
|
|
|
|
-- Manually log a single query trajectory.
-- Arguments, in order: table name, query vector, result IDs,
-- latency in microseconds, ef_search used, probes used.
SELECT ruvector_record_trajectory(
    'my_vectors',                            -- table name
    ARRAY[0.1, 0.2, 0.3, 0.4],               -- query vector
    CAST(ARRAY[1, 2, 3, 4, 5] AS bigint[]),  -- result IDs
    1500,                                    -- latency (microseconds)
    50,                                      -- ef_search used
    10                                       -- probes used
);
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 3. Providing Relevance Feedback
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- After seeing query results, users can provide feedback about which
|
|
-- results were actually relevant
|
|
|
|
-- Report which of the returned IDs were relevant and which were not
-- for the given query vector.
SELECT ruvector_record_feedback(
    'my_vectors',                      -- table name
    ARRAY[0.1, 0.2, 0.3, 0.4],         -- query vector
    CAST(ARRAY[1, 2, 5] AS bigint[]),  -- relevant IDs
    CAST(ARRAY[3, 4] AS bigint[])      -- irrelevant IDs
);
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 4. Extracting and Managing Patterns
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Cluster the recorded trajectories (k-means) into patterns.
-- Second argument is the number of clusters to produce.
SELECT ruvector_extract_patterns('my_vectors', 10);

-- Fetch the current learning statistics for the table.
SELECT ruvector_learning_stats('my_vectors');
|
|
|
|
-- Example output:
|
|
-- {
|
|
-- "trajectories": {
|
|
-- "total": 150,
|
|
-- "with_feedback": 45,
|
|
-- "avg_latency_us": 1234.5,
|
|
-- "avg_precision": 0.85,
|
|
-- "avg_recall": 0.78
|
|
-- },
|
|
-- "patterns": {
|
|
-- "total": 10,
|
|
-- "total_samples": 150,
|
|
-- "avg_confidence": 0.87,
|
|
-- "total_usage": 523
|
|
-- }
|
|
-- }
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 5. Auto-Tuning Search Parameters
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Auto-tune with the default goal (balanced performance).
SELECT ruvector_auto_tune('my_vectors');

-- Auto-tune with speed as the optimization goal.
SELECT ruvector_auto_tune('my_vectors', 'speed');

-- Auto-tune with accuracy as the optimization goal.
SELECT ruvector_auto_tune('my_vectors', 'accuracy');

-- Auto-tune against a set of sample queries, passed as a two-dimensional
-- array with one row per sample vector.
SELECT ruvector_auto_tune(
    'my_vectors',
    'balanced',
    ARRAY[
        [0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6],
        [0.7, 0.8, 0.9]
    ]
);
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 6. Getting Optimized Search Parameters
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Look up the optimized search parameters for one specific query vector.
SELECT ruvector_get_search_params('my_vectors', ARRAY[0.1, 0.2, 0.3, 0.4]);
|
|
|
|
-- Example output:
|
|
-- {
|
|
-- "ef_search": 52,
|
|
-- "probes": 12,
|
|
-- "confidence": 0.89
|
|
-- }
|
|
|
|
-- Use these parameters in your search:
|
|
-- SET ruvector.ef_search = 52;
|
|
-- SET ruvector.probes = 12;
|
|
-- SELECT * FROM my_vectors ORDER BY embedding <-> '[0.1, 0.2, 0.3, 0.4]' LIMIT 10;
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 7. Pattern Consolidation and Pruning
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Merge near-duplicate patterns to reduce memory usage:
-- patterns whose similarity is >= 0.95 are consolidated.
SELECT ruvector_consolidate_patterns('my_vectors', 0.95);

-- Drop low-quality patterns:
-- anything with usage < 5 or confidence < 0.5 is removed.
SELECT ruvector_prune_patterns('my_vectors', 5, 0.5);  -- (table, min_usage, min_confidence)
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 8. Complete Workflow Example
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Demo table: one row per document, each carrying a 384-dimensional embedding.
CREATE TABLE documents (
    id        BIGSERIAL PRIMARY KEY,
    title     TEXT,
    embedding vector(384)
);

-- Load 1000 sample rows; embeddings come from ruvector_random(384).
INSERT INTO documents (title, embedding)
SELECT
    'Document ' || series.i,
    ruvector_random(384)
FROM generate_series(1, 1000) AS series(i);

-- HNSW index over the embedding column using the cosine operator class.
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops);

-- Switch on learning so later searches can be adaptively optimized.
SELECT ruvector_enable_learning('documents');
|
|
|
|
-- Simulate 50 user searches: time each one, record its trajectory, and
-- submit relevance feedback on every fifth iteration.
DO $$
DECLARE
    probe_vec   vector(384);  -- query vector for the current iteration
    hit_ids     bigint[];     -- ids returned by the search, nearest first
    started_us  bigint;       -- epoch microseconds just before the search
    finished_us bigint;       -- epoch microseconds just after the search
BEGIN
    FOR iter IN 1..50 LOOP
        -- Fresh query vector from ruvector_random
        probe_vec := ruvector_random(384);

        -- Time the top-10 search
        started_us := EXTRACT(EPOCH FROM clock_timestamp()) * 1000000;

        SELECT array_agg(nearest.id)
        INTO hit_ids
        FROM (
            SELECT d.id
            FROM documents AS d
            ORDER BY d.embedding <=> probe_vec
            LIMIT 10
        ) AS nearest;

        finished_us := EXTRACT(EPOCH FROM clock_timestamp()) * 1000000;

        -- Log what was searched, what came back, and how long it took
        PERFORM ruvector_record_trajectory(
            'documents',
            CAST(probe_vec AS float4[]),
            hit_ids,
            CAST(finished_us - started_us AS bigint),
            50,  -- ef_search reported for this search (hard-coded here)
            10   -- probes reported for this search (hard-coded here)
        );

        -- Every fifth query also gets relevance feedback
        IF mod(iter, 5) = 0 THEN
            PERFORM ruvector_record_feedback(
                'documents',
                CAST(probe_vec AS float4[]),
                hit_ids[1:3],   -- first 3 treated as relevant
                hit_ids[8:10]   -- last 3 treated as not relevant
            );
        END IF;
    END LOOP;
END
$$;
|
|
|
|
-- Turn the collected trajectories into 10 patterns.
SELECT ruvector_extract_patterns('documents', 10);

-- Inspect the resulting learning statistics.
SELECT ruvector_learning_stats('documents');

-- Auto-tune with the 'balanced' optimization goal.
SELECT ruvector_auto_tune('documents', 'balanced');
|
|
|
|
-- Get optimized parameters for a new query and unpack them into typed columns.
--
-- Values are pulled out as text with ->> before casting, the same way the
-- monitoring query in this file reads ruvector_learning_stats. Unlike a direct
-- jsonb-to-integer cast, this yields SQL NULL (rather than an error) when a
-- key is missing or holds JSON null.
WITH query AS (
    -- The "new" query vector for this example
    SELECT ruvector_random(384) AS vec
),
params AS (
    -- Learned parameters for that vector
    SELECT
        ruvector_get_search_params('documents', CAST(query.vec AS float4[])) AS p
    FROM query
)
SELECT
    -- Cast via numeric so a value such as 52.0 still converts to an integer,
    -- as the direct jsonb cast did.
    CAST(CAST(params.p ->> 'ef_search' AS numeric) AS int) AS ef_search,
    CAST(CAST(params.p ->> 'probes' AS numeric) AS int)    AS probes,
    CAST(params.p ->> 'confidence' AS float)               AS confidence
FROM params;
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 9. Monitoring and Maintenance
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Routine consolidation (suitable for a cron job); threshold 0.92.
SELECT ruvector_consolidate_patterns(
    'documents',
    0.92
);

-- Monthly pruning of low-quality patterns; arguments 10 and 0.6.
SELECT ruvector_prune_patterns(
    'documents',
    10,
    0.6
);

-- Wipe all learning data for the table when a clean slate is needed.
SELECT ruvector_clear_learning(
    'documents'
);
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 10. Advanced: Integration with Application Code
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- Example: Python application using learned parameters
|
|
|
|
/*
|
|
import psycopg2
from psycopg2 import sql


def search_with_learning(conn, table, query_vector, limit=10):
    """Search `table` using the parameters learned for `query_vector`.

    Args:
        conn: open psycopg2 connection.
        table: table name, optionally schema-qualified ("schema.table").
        query_vector: list of floats to search for.
        limit: maximum number of rows to return.

    Returns:
        (results, params): the fetched rows (id, title, distance) and the
        value returned by ruvector_get_search_params.
    """
    with conn.cursor() as cur:
        # Get optimized parameters
        cur.execute(
            "SELECT ruvector_get_search_params(%s, %s::float4[])",
            (table, query_vector),
        )
        params = cur.fetchone()[0]

        # Apply the parameters as bound values instead of formatting them
        # into the statement. set_config(..., false) is session-scoped,
        # like a plain SET.
        cur.execute(
            "SELECT set_config('ruvector.ef_search', %s, false), "
            "set_config('ruvector.probes', %s, false)",
            (str(params['ef_search']), str(params['probes'])),
        )

        # Identifiers cannot be sent as bound parameters, so the table name
        # is quoted with sql.Identifier rather than interpolated as raw text.
        # Multi-part identifiers need psycopg2 >= 2.8.
        search = sql.SQL(
            """
            SELECT id, title, embedding <=> %s::vector AS distance
            FROM {table}
            ORDER BY embedding <=> %s::vector
            LIMIT %s
            """
        ).format(table=sql.Identifier(*table.split('.')))
        cur.execute(search, (query_vector, query_vector, limit))

        results = cur.fetchall()

    return results, params


# Use it
conn = psycopg2.connect("dbname=mydb")
try:
    results, params = search_with_learning(
        conn,
        'documents',
        [0.1, 0.2, 0.3, ...],  # placeholder: pass a full 384-dimension vector
        limit=10,
    )

    print(f"Search completed with ef_search={params['ef_search']}, "
          f"confidence={params['confidence']:.2f}")
finally:
    # Always release the connection, even if the search raises.
    conn.close()
|
|
*/
|
|
|
|
-- -----------------------------------------------------------------------------
|
|
-- 11. Best Practices
|
|
-- -----------------------------------------------------------------------------
|
|
|
|
-- 1. Collect enough trajectories before extracting patterns (50+ recommended)
|
|
-- 2. Provide relevance feedback when possible for better learning
|
|
-- 3. Consolidate patterns regularly to manage memory
|
|
-- 4. Prune low-quality patterns periodically
|
|
-- 5. Monitor learning statistics to track improvement
|
|
-- 6. Start with balanced optimization, adjust based on needs
|
|
-- 7. Re-extract patterns when query patterns change significantly
|
|
|
|
-- Example monitoring query:
-- Fetch the statistics once, then derive both the pretty-printed document and
-- the recommendation from that single snapshot so the two columns always agree.
WITH learning AS (
    SELECT ruvector_learning_stats('documents') AS stats
)
SELECT
    jsonb_pretty(learning.stats) AS stats,
    CASE
        -- Missing totals are treated as 0 so an empty or partial stats
        -- document is not reported as "learning well".
        WHEN COALESCE((learning.stats -> 'trajectories' ->> 'total')::int, 0) < 50
            THEN 'Collecting data - need more trajectories'
        WHEN COALESCE((learning.stats -> 'patterns' ->> 'total')::int, 0) = 0
            THEN 'Ready to extract patterns'
        WHEN (learning.stats -> 'patterns' ->> 'avg_confidence')::float < 0.7
            THEN 'Low confidence - collect more feedback'
        ELSE 'System is learning well'
    END AS recommendation
FROM learning;
|