37 KiB
37 KiB
RuVector Postgres v2 - SQL Schema Specification
Overview
This document defines the complete SQL schema for RuVector Postgres v2, including types, operators, functions, tables, and views. The schema is designed for 100% pgvector compatibility while providing extended functionality.
1. Data Types
1.1 Core Vector Types (pgvector Compatible)
-- Dense vector type (pgvector compatible)
CREATE TYPE vector;
-- Input/output functions
CREATE FUNCTION vector_in(cstring, oid, integer) RETURNS vector
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_out(vector) RETURNS cstring
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_typmod_in(cstring[]) RETURNS integer
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_typmod_out(integer) RETURNS cstring
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_recv(internal, oid, integer) RETURNS vector
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_send(vector) RETURNS bytea
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE TYPE vector (
INPUT = vector_in,
OUTPUT = vector_out,
TYPMOD_IN = vector_typmod_in,
TYPMOD_OUT = vector_typmod_out,
RECEIVE = vector_recv,
SEND = vector_send,
STORAGE = external,
INTERNALLENGTH = VARIABLE,
ALIGNMENT = double
);
1.2 Extended Vector Types
-- Half-precision vector (float16)
CREATE TYPE halfvec (
INPUT = halfvec_in,
OUTPUT = halfvec_out,
TYPMOD_IN = halfvec_typmod_in,
TYPMOD_OUT = halfvec_typmod_out,
RECEIVE = halfvec_recv,
SEND = halfvec_send,
STORAGE = external,
INTERNALLENGTH = VARIABLE,
ALIGNMENT = double
);
-- Sparse vector (coordinate format)
CREATE TYPE sparsevec (
INPUT = sparsevec_in,
OUTPUT = sparsevec_out,
TYPMOD_IN = sparsevec_typmod_in,
TYPMOD_OUT = sparsevec_typmod_out,
RECEIVE = sparsevec_recv,
SEND = sparsevec_send,
STORAGE = external,
INTERNALLENGTH = VARIABLE,
ALIGNMENT = double
);
-- Binary vector (bit-packed)
CREATE TYPE binaryvec (
INPUT = binaryvec_in,
OUTPUT = binaryvec_out,
TYPMOD_IN = binaryvec_typmod_in,
TYPMOD_OUT = binaryvec_typmod_out,
RECEIVE = binaryvec_recv,
SEND = binaryvec_send,
STORAGE = plain,
INTERNALLENGTH = VARIABLE,
ALIGNMENT = int4
);
2. Operators (pgvector Compatible)
2.1 Distance Operators
-- L2 (Euclidean) distance
CREATE OPERATOR <-> (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_l2_distance,
COMMUTATOR = <->
);
-- Cosine distance (1 - cosine_similarity)
CREATE OPERATOR <=> (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_cosine_distance,
COMMUTATOR = <=>
);
-- Negative inner product (for MAX inner product via MIN)
CREATE OPERATOR <#> (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_negative_inner_product,
COMMUTATOR = <#>
);
-- L1 (Manhattan) distance (RuVector extension)
CREATE OPERATOR <+> (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_l1_distance,
COMMUTATOR = <+>
);
2.2 Operator Implementation Functions
-- Distance calculation functions
CREATE FUNCTION vector_l2_distance(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_cosine_distance(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_negative_inner_product(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_l1_distance(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- Similarity functions (convenience)
CREATE FUNCTION cosine_similarity(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_cosine_similarity'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION inner_product(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_inner_product'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION l2_distance(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_l2_distance'
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
2.3 Vector Arithmetic
-- Addition
CREATE OPERATOR + (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_add,
COMMUTATOR = +
);
-- Subtraction
CREATE OPERATOR - (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_sub
);
-- Scalar multiplication
CREATE OPERATOR * (
LEFTARG = vector,
RIGHTARG = float8,
FUNCTION = vector_mul_scalar
);
CREATE OPERATOR * (
LEFTARG = float8,
RIGHTARG = vector,
FUNCTION = scalar_mul_vector,
COMMUTATOR = *
);
-- Comparison (for DISTINCT, GROUP BY)
CREATE OPERATOR = (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_eq,
COMMUTATOR = =,
NEGATOR = <>,
HASHES,
MERGES
);
CREATE OPERATOR <> (
LEFTARG = vector,
RIGHTARG = vector,
FUNCTION = vector_ne,
COMMUTATOR = <>,
NEGATOR = =
);
3. Operator Classes
3.1 HNSW Operator Classes
-- L2 distance for HNSW
CREATE OPERATOR CLASS vector_l2_ops
DEFAULT FOR TYPE vector USING hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 vector_l2_distance(vector, vector);
-- Cosine distance for HNSW
CREATE OPERATOR CLASS vector_cosine_ops
FOR TYPE vector USING hnsw AS
OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 vector_cosine_distance(vector, vector);
-- Inner product for HNSW
CREATE OPERATOR CLASS vector_ip_ops
FOR TYPE vector USING hnsw AS
OPERATOR 1 <#> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 vector_negative_inner_product(vector, vector);
3.2 IVFFlat Operator Classes
-- L2 distance for IVFFlat
CREATE OPERATOR CLASS vector_l2_ops
DEFAULT FOR TYPE vector USING ivfflat AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 vector_l2_distance(vector, vector);
-- Cosine distance for IVFFlat
CREATE OPERATOR CLASS vector_cosine_ops
FOR TYPE vector USING ivfflat AS
OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 vector_cosine_distance(vector, vector);
-- Inner product for IVFFlat
CREATE OPERATOR CLASS vector_ip_ops
FOR TYPE vector USING ivfflat AS
OPERATOR 1 <#> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 vector_negative_inner_product(vector, vector);
4. Index Access Methods
4.1 HNSW Access Method
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
AS 'MODULE_PATHNAME' LANGUAGE C;
COMMENT ON ACCESS METHOD hnsw IS
'HNSW index for approximate nearest neighbor search';
4.2 IVFFlat Access Method
CREATE ACCESS METHOD ivfflat TYPE INDEX HANDLER ivfflat_handler;
CREATE FUNCTION ivfflat_handler(internal) RETURNS index_am_handler
AS 'MODULE_PATHNAME' LANGUAGE C;
COMMENT ON ACCESS METHOD ivfflat IS
'IVF-Flat index for approximate nearest neighbor search';
5. Core Extension Tables
5.1 Collection Metadata
-- Stores metadata about vector collections
CREATE TABLE ruvector.collections (
id SERIAL PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
table_schema TEXT NOT NULL,
table_name TEXT NOT NULL,
column_name TEXT NOT NULL,
dimensions INTEGER NOT NULL CHECK (dimensions > 0 AND dimensions <= 16000),
distance_metric TEXT NOT NULL DEFAULT 'l2'
CHECK (distance_metric IN ('l2', 'cosine', 'ip', 'l1')),
index_type TEXT CHECK (index_type IN ('hnsw', 'ivfflat', 'flat')),
index_oid OID,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
config JSONB NOT NULL DEFAULT '{}'::jsonb,
UNIQUE(table_schema, table_name, column_name)
);
CREATE INDEX idx_collections_table
ON ruvector.collections(table_schema, table_name);
5.2 Index Statistics
-- Stores per-index statistics
CREATE TABLE ruvector.index_stats (
index_oid OID PRIMARY KEY,
collection_id INTEGER REFERENCES ruvector.collections(id) ON DELETE CASCADE,
vector_count BIGINT NOT NULL DEFAULT 0,
index_size_bytes BIGINT NOT NULL DEFAULT 0,
last_build TIMESTAMPTZ,
last_vacuum TIMESTAMPTZ,
last_sample TIMESTAMPTZ,
build_time_ms BIGINT,
avg_search_ms REAL,
recall_estimate REAL,
stats_json JSONB NOT NULL DEFAULT '{}'::jsonb,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_index_stats_collection
ON ruvector.index_stats(collection_id);
6. Tiered Storage Tables (Phase 2)
6.1 Tier Configuration
-- Tier policy configuration per collection
CREATE TABLE ruvector.tier_policies (
id SERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
tier_name TEXT NOT NULL CHECK (tier_name IN ('hot', 'warm', 'cool', 'cold')),
threshold_hours INTEGER NOT NULL DEFAULT 24,
compression TEXT NOT NULL DEFAULT 'none'
CHECK (compression IN ('none', 'sq8', 'pq16', 'pq32', 'pq64')),
storage_path TEXT, -- NULL = default storage
enabled BOOLEAN NOT NULL DEFAULT true,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(collection_id, tier_name)
);
-- Default tier policies
INSERT INTO ruvector.tier_policies (collection_id, tier_name, threshold_hours, compression)
SELECT id, 'hot', 0, 'none' FROM ruvector.collections
UNION ALL
SELECT id, 'warm', 24, 'sq8' FROM ruvector.collections
UNION ALL
SELECT id, 'cool', 168, 'pq16' FROM ruvector.collections
UNION ALL
SELECT id, 'cold', 720, 'pq32' FROM ruvector.collections;
6.2 Access Tracking
-- Per-vector access counters (partitioned by collection)
CREATE TABLE ruvector.access_counters (
collection_id INTEGER NOT NULL,
vector_tid TID NOT NULL, -- Tuple ID in source table
access_count INTEGER NOT NULL DEFAULT 0,
last_access TIMESTAMPTZ NOT NULL DEFAULT NOW(),
current_tier TEXT NOT NULL DEFAULT 'hot',
PRIMARY KEY (collection_id, vector_tid)
) PARTITION BY LIST (collection_id);
-- Create partitions dynamically via trigger on collections insert
CREATE OR REPLACE FUNCTION ruvector.create_access_partition()
RETURNS TRIGGER AS $$
BEGIN
EXECUTE format(
'CREATE TABLE IF NOT EXISTS ruvector.access_counters_%s
PARTITION OF ruvector.access_counters
FOR VALUES IN (%s)',
NEW.id, NEW.id
);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trg_create_access_partition
AFTER INSERT ON ruvector.collections
FOR EACH ROW EXECUTE FUNCTION ruvector.create_access_partition();
6.3 Tier Statistics
-- Aggregated tier statistics
CREATE TABLE ruvector.tier_stats (
id SERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
tier_name TEXT NOT NULL,
vector_count BIGINT NOT NULL DEFAULT 0,
size_bytes BIGINT NOT NULL DEFAULT 0,
avg_access_rate REAL NOT NULL DEFAULT 0,
last_compaction TIMESTAMPTZ,
compaction_count INTEGER NOT NULL DEFAULT 0,
snapshot_time TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(collection_id, tier_name, snapshot_time)
);
CREATE INDEX idx_tier_stats_collection_time
ON ruvector.tier_stats(collection_id, snapshot_time DESC);
7. Graph Schema (Phase 3)
7.1 Graph Storage
-- Graph metadata
CREATE TABLE ruvector.graphs (
id SERIAL PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
description TEXT,
node_count BIGINT NOT NULL DEFAULT 0,
edge_count BIGINT NOT NULL DEFAULT 0,
hyperedge_count BIGINT NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
config JSONB NOT NULL DEFAULT '{}'::jsonb
);
-- Graph nodes (can reference vectors)
CREATE TABLE ruvector.nodes (
id BIGSERIAL PRIMARY KEY,
graph_id INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
external_id TEXT, -- Application-provided ID
node_type TEXT NOT NULL DEFAULT 'default',
vector_ref TID, -- Reference to vector in user table
collection_id INTEGER REFERENCES ruvector.collections(id),
properties JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(graph_id, external_id)
);
CREATE INDEX idx_nodes_graph_type ON ruvector.nodes(graph_id, node_type);
CREATE INDEX idx_nodes_properties ON ruvector.nodes USING gin(properties);
-- Graph edges
CREATE TABLE ruvector.edges (
id BIGSERIAL PRIMARY KEY,
graph_id INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
source_id BIGINT NOT NULL REFERENCES ruvector.nodes(id) ON DELETE CASCADE,
target_id BIGINT NOT NULL REFERENCES ruvector.nodes(id) ON DELETE CASCADE,
edge_type TEXT NOT NULL DEFAULT 'default',
weight REAL NOT NULL DEFAULT 1.0,
properties JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- No self-loops by default
CHECK (source_id <> target_id)
);
CREATE INDEX idx_edges_source ON ruvector.edges(graph_id, source_id);
CREATE INDEX idx_edges_target ON ruvector.edges(graph_id, target_id);
CREATE INDEX idx_edges_type ON ruvector.edges(graph_id, edge_type);
-- Hyperedges (connect multiple nodes)
CREATE TABLE ruvector.hyperedges (
id BIGSERIAL PRIMARY KEY,
graph_id INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
hyperedge_type TEXT NOT NULL DEFAULT 'default',
node_ids BIGINT[] NOT NULL,
weights REAL[], -- Optional per-node weights
properties JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CHECK (array_length(node_ids, 1) >= 2)
);
CREATE INDEX idx_hyperedges_graph ON ruvector.hyperedges(graph_id);
CREATE INDEX idx_hyperedges_nodes ON ruvector.hyperedges USING gin(node_ids);
7.2 Graph Views (Relational Bridge)
-- Unified node view with vector data
CREATE VIEW ruvector.nodes_view AS
SELECT
n.id,
n.graph_id,
g.name AS graph_name,
n.external_id,
n.node_type,
n.properties,
n.created_at,
c.table_schema,
c.table_name,
c.column_name,
n.vector_ref
FROM ruvector.nodes n
JOIN ruvector.graphs g ON n.graph_id = g.id
LEFT JOIN ruvector.collections c ON n.collection_id = c.id;
-- Edge view with source/target details
CREATE VIEW ruvector.edges_view AS
SELECT
e.id,
e.graph_id,
g.name AS graph_name,
e.source_id,
src.external_id AS source_external_id,
src.node_type AS source_type,
e.target_id,
tgt.external_id AS target_external_id,
tgt.node_type AS target_type,
e.edge_type,
e.weight,
e.properties,
e.created_at
FROM ruvector.edges e
JOIN ruvector.graphs g ON e.graph_id = g.id
JOIN ruvector.nodes src ON e.source_id = src.id
JOIN ruvector.nodes tgt ON e.target_id = tgt.id;
-- Hyperedge view with expanded nodes
CREATE VIEW ruvector.hyperedges_view AS
SELECT
h.id,
h.graph_id,
g.name AS graph_name,
h.hyperedge_type,
h.node_ids,
h.weights,
h.properties,
h.created_at,
array_agg(n.external_id) AS node_external_ids
FROM ruvector.hyperedges h
JOIN ruvector.graphs g ON h.graph_id = g.id
LEFT JOIN LATERAL unnest(h.node_ids) AS node_id ON true
LEFT JOIN ruvector.nodes n ON n.id = node_id
GROUP BY h.id, g.name;
8. Integrity Control Schema (Phase 4)
8.1 Integrity State
-- Current integrity state per collection
CREATE TABLE ruvector.integrity_state (
collection_id INTEGER PRIMARY KEY REFERENCES ruvector.collections(id) ON DELETE CASCADE,
state TEXT NOT NULL DEFAULT 'normal'
CHECK (state IN ('normal', 'stress', 'critical')),
lambda_cut REAL NOT NULL DEFAULT 1.0,
witness_edges JSONB NOT NULL DEFAULT '[]'::jsonb,
last_sample TIMESTAMPTZ NOT NULL DEFAULT NOW(),
sample_count INTEGER NOT NULL DEFAULT 0,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Integrity state transitions history
CREATE TABLE ruvector.integrity_events (
id BIGSERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
event_type TEXT NOT NULL CHECK (event_type IN (
'state_change', 'sample', 'policy_update', 'manual_override', 'recovery'
)),
previous_state TEXT,
new_state TEXT,
lambda_cut REAL,
witness_edges JSONB,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
signature BYTEA, -- Ed25519 signature for audit
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_integrity_events_collection_time
ON ruvector.integrity_events(collection_id, created_at DESC);
CREATE INDEX idx_integrity_events_type
ON ruvector.integrity_events(event_type, created_at DESC);
8.2 Integrity Policies
-- Integrity policies per collection
CREATE TABLE ruvector.integrity_policies (
id SERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
name TEXT NOT NULL,
description TEXT,
-- Thresholds
threshold_high REAL NOT NULL DEFAULT 0.8, -- Above = normal
threshold_low REAL NOT NULL DEFAULT 0.3, -- Below = critical
-- Actions by state
normal_actions JSONB NOT NULL DEFAULT '{
"allow_bulk_insert": true,
"allow_index_rewire": true,
"allow_compression": true,
"allow_replication": true
}'::jsonb,
stress_actions JSONB NOT NULL DEFAULT '{
"allow_bulk_insert": false,
"allow_index_rewire": true,
"allow_compression": true,
"allow_replication": true,
"throttle_inserts_pct": 50
}'::jsonb,
critical_actions JSONB NOT NULL DEFAULT '{
"allow_bulk_insert": false,
"allow_index_rewire": false,
"allow_compression": false,
"allow_replication": true,
"throttle_inserts_pct": 90
}'::jsonb,
-- Sampling configuration
sample_interval_secs INTEGER NOT NULL DEFAULT 60,
sample_size INTEGER NOT NULL DEFAULT 1000,
enabled BOOLEAN NOT NULL DEFAULT true,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(collection_id, name)
);
8.3 Contracted Graph Schema
-- Contracted operational graph for mincut computation
-- CRITICAL: Never compute mincut on full similarity graph!
CREATE TABLE ruvector.contracted_graph (
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
node_type TEXT NOT NULL CHECK (node_type IN (
'partition', 'centroid', 'shard', 'maintenance_dep'
)),
node_id BIGINT NOT NULL,
node_data JSONB NOT NULL DEFAULT '{}'::jsonb,
PRIMARY KEY (collection_id, node_type, node_id)
);
CREATE TABLE ruvector.contracted_edges (
id BIGSERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
source_type TEXT NOT NULL,
source_id BIGINT NOT NULL,
target_type TEXT NOT NULL,
target_id BIGINT NOT NULL,
edge_type TEXT NOT NULL CHECK (edge_type IN (
'partition_link', 'routing_link', 'dependency', 'replication'
)),
capacity REAL NOT NULL DEFAULT 1.0, -- For max-flow computation
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
FOREIGN KEY (collection_id, source_type, source_id)
REFERENCES ruvector.contracted_graph(collection_id, node_type, node_id) ON DELETE CASCADE,
FOREIGN KEY (collection_id, target_type, target_id)
REFERENCES ruvector.contracted_graph(collection_id, node_type, node_id) ON DELETE CASCADE
);
CREATE INDEX idx_contracted_edges_source
ON ruvector.contracted_edges(collection_id, source_type, source_id);
CREATE INDEX idx_contracted_edges_target
ON ruvector.contracted_edges(collection_id, target_type, target_id);
9. GNN & Learning Tables
9.1 Training Data
-- Captured training interactions
CREATE TABLE ruvector.training_interactions (
id BIGSERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
query_vector vector, -- Query that was performed
result_ids TEXT[] NOT NULL, -- TIDs of returned results
selected_id TEXT, -- TID user actually selected (if known)
feedback_score REAL, -- Explicit feedback (-1 to 1)
latency_ms INTEGER,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_training_collection_time
ON ruvector.training_interactions(collection_id, created_at DESC);
-- Partitioned by time for efficient cleanup
-- In production, add monthly partitioning
9.2 GNN Models
-- Trained GNN model registry
CREATE TABLE ruvector.gnn_models (
id SERIAL PRIMARY KEY,
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
model_name TEXT NOT NULL,
model_type TEXT NOT NULL CHECK (model_type IN ('gcn', 'graphsage', 'gat', 'custom')),
version INTEGER NOT NULL DEFAULT 1,
is_active BOOLEAN NOT NULL DEFAULT false,
-- Model binary (serialized weights)
model_data BYTEA,
-- Training metadata
training_samples INTEGER,
training_epochs INTEGER,
validation_loss REAL,
validation_recall REAL,
-- Configuration
config JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(collection_id, model_name, version)
);
CREATE INDEX idx_gnn_models_active
ON ruvector.gnn_models(collection_id, is_active) WHERE is_active;
10. Core Functions
10.1 Collection Management
-- Register a new vector collection
CREATE FUNCTION ruvector_register_collection(
p_name TEXT,
p_table_schema TEXT,
p_table_name TEXT,
p_column_name TEXT,
p_dimensions INTEGER,
p_distance_metric TEXT DEFAULT 'l2'
) RETURNS INTEGER AS $$
DECLARE
v_id INTEGER;
BEGIN
INSERT INTO ruvector.collections
(name, table_schema, table_name, column_name, dimensions, distance_metric)
VALUES
(p_name, p_table_schema, p_table_name, p_column_name, p_dimensions, p_distance_metric)
RETURNING id INTO v_id;
-- Initialize integrity state
INSERT INTO ruvector.integrity_state (collection_id) VALUES (v_id);
-- Create default integrity policy
INSERT INTO ruvector.integrity_policies (collection_id, name, description)
VALUES (v_id, 'default', 'Default integrity policy');
RETURN v_id;
END;
$$ LANGUAGE plpgsql;
-- Unregister a collection
CREATE FUNCTION ruvector_unregister_collection(p_name TEXT) RETURNS BOOLEAN AS $$
BEGIN
DELETE FROM ruvector.collections WHERE name = p_name;
RETURN FOUND;
END;
$$ LANGUAGE plpgsql;
-- List all collections
CREATE FUNCTION ruvector_list_collections()
RETURNS TABLE (
id INTEGER,
name TEXT,
table_ref TEXT,
dimensions INTEGER,
distance_metric TEXT,
index_type TEXT,
vector_count BIGINT
) AS $$
BEGIN
RETURN QUERY
SELECT
c.id,
c.name,
c.table_schema || '.' || c.table_name || '.' || c.column_name,
c.dimensions,
c.distance_metric,
c.index_type,
COALESCE(s.vector_count, 0)
FROM ruvector.collections c
LEFT JOIN ruvector.index_stats s ON c.index_oid = s.index_oid;
END;
$$ LANGUAGE plpgsql;
10.2 Index Management
-- Create HNSW index with custom parameters
CREATE FUNCTION ruvector_index_create(
p_table_name TEXT,
p_column_name TEXT,
p_index_type TEXT DEFAULT 'hnsw',
p_distance TEXT DEFAULT 'l2',
p_m INTEGER DEFAULT 16,
p_ef_construction INTEGER DEFAULT 64
) RETURNS TEXT AS $$
DECLARE
v_index_name TEXT;
v_ops_class TEXT;
BEGIN
v_index_name := format('idx_%s_%s_%s',
replace(p_table_name, '.', '_'),
p_column_name,
p_index_type);
v_ops_class := CASE p_distance
WHEN 'l2' THEN 'vector_l2_ops'
WHEN 'cosine' THEN 'vector_cosine_ops'
WHEN 'ip' THEN 'vector_ip_ops'
ELSE 'vector_l2_ops'
END;
IF p_index_type = 'hnsw' THEN
EXECUTE format(
'CREATE INDEX %I ON %s USING hnsw (%I %s) WITH (m = %s, ef_construction = %s)',
v_index_name, p_table_name, p_column_name, v_ops_class, p_m, p_ef_construction
);
ELSIF p_index_type = 'ivfflat' THEN
EXECUTE format(
'CREATE INDEX %I ON %s USING ivfflat (%I %s) WITH (lists = 100)',
v_index_name, p_table_name, p_column_name, v_ops_class
);
END IF;
RETURN v_index_name;
END;
$$ LANGUAGE plpgsql;
-- Get index statistics
CREATE FUNCTION ruvector_index_stats(p_index_name TEXT)
RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_index_stats' LANGUAGE C;
-- Perform index maintenance
CREATE FUNCTION ruvector_index_maintenance(p_index_name TEXT)
RETURNS TEXT AS 'MODULE_PATHNAME' LANGUAGE C;
10.3 Vector Search
-- Extended search with options
CREATE FUNCTION ruvector_search(
p_collection_name TEXT,
p_query vector,
p_k INTEGER DEFAULT 10,
p_ef_search INTEGER DEFAULT NULL,
p_use_gnn BOOLEAN DEFAULT FALSE,
p_filter JSONB DEFAULT NULL
) RETURNS TABLE (
id TEXT,
distance REAL,
score REAL
) AS 'MODULE_PATHNAME', 'ruvector_search' LANGUAGE C;
-- Batch search
CREATE FUNCTION ruvector_search_batch(
p_collection_name TEXT,
p_queries vector[],
p_k INTEGER DEFAULT 10
) RETURNS TABLE (
query_idx INTEGER,
id TEXT,
distance REAL
) AS 'MODULE_PATHNAME', 'ruvector_search_batch' LANGUAGE C;
10.4 Tiered Storage Functions
-- Configure tier thresholds
CREATE FUNCTION ruvector_set_tiers(
p_collection_name TEXT,
p_hot_hours INTEGER DEFAULT 24,
p_warm_hours INTEGER DEFAULT 168,
p_cool_hours INTEGER DEFAULT 720
) RETURNS BOOLEAN AS $$
DECLARE
v_collection_id INTEGER;
BEGIN
SELECT id INTO v_collection_id FROM ruvector.collections WHERE name = p_collection_name;
IF NOT FOUND THEN
RAISE EXCEPTION 'Collection not found: %', p_collection_name;
END IF;
UPDATE ruvector.tier_policies
SET threshold_hours = p_hot_hours WHERE collection_id = v_collection_id AND tier_name = 'warm';
UPDATE ruvector.tier_policies
SET threshold_hours = p_warm_hours WHERE collection_id = v_collection_id AND tier_name = 'cool';
UPDATE ruvector.tier_policies
SET threshold_hours = p_cool_hours WHERE collection_id = v_collection_id AND tier_name = 'cold';
RETURN TRUE;
END;
$$ LANGUAGE plpgsql;
-- Trigger manual compaction
CREATE FUNCTION ruvector_compact(p_collection_name TEXT)
RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_compact' LANGUAGE C;
-- Get tier report
CREATE FUNCTION ruvector_tier_report(p_collection_name TEXT)
RETURNS TABLE (
tier_name TEXT,
vector_count BIGINT,
size_mb REAL,
compression TEXT,
avg_age_hours REAL
) AS 'MODULE_PATHNAME', 'ruvector_tier_report' LANGUAGE C;
10.5 Graph & Cypher Functions
-- Execute Cypher query
CREATE FUNCTION ruvector_cypher(
p_graph_name TEXT,
p_query TEXT,
p_params JSONB DEFAULT '{}'::jsonb
) RETURNS SETOF JSONB AS 'MODULE_PATHNAME', 'ruvector_cypher' LANGUAGE C;
-- Create graph
CREATE FUNCTION ruvector_graph_create(
p_name TEXT,
p_description TEXT DEFAULT NULL
) RETURNS INTEGER AS $$
DECLARE
v_id INTEGER;
BEGIN
INSERT INTO ruvector.graphs (name, description)
VALUES (p_name, p_description)
RETURNING id INTO v_id;
RETURN v_id;
END;
$$ LANGUAGE plpgsql;
-- Add node
CREATE FUNCTION ruvector_node_add(
p_graph_name TEXT,
p_external_id TEXT,
p_node_type TEXT DEFAULT 'default',
p_properties JSONB DEFAULT '{}'::jsonb
) RETURNS BIGINT AS $$
DECLARE
v_graph_id INTEGER;
v_node_id BIGINT;
BEGIN
SELECT id INTO v_graph_id FROM ruvector.graphs WHERE name = p_graph_name;
IF NOT FOUND THEN
RAISE EXCEPTION 'Graph not found: %', p_graph_name;
END IF;
INSERT INTO ruvector.nodes (graph_id, external_id, node_type, properties)
VALUES (v_graph_id, p_external_id, p_node_type, p_properties)
RETURNING id INTO v_node_id;
UPDATE ruvector.graphs SET node_count = node_count + 1, updated_at = NOW()
WHERE id = v_graph_id;
RETURN v_node_id;
END;
$$ LANGUAGE plpgsql;
-- Add edge
CREATE FUNCTION ruvector_edge_add(
p_graph_name TEXT,
p_source_external_id TEXT,
p_target_external_id TEXT,
p_edge_type TEXT DEFAULT 'default',
p_weight REAL DEFAULT 1.0,
p_properties JSONB DEFAULT '{}'::jsonb
) RETURNS BIGINT AS $$
DECLARE
v_graph_id INTEGER;
v_source_id BIGINT;
v_target_id BIGINT;
v_edge_id BIGINT;
BEGIN
SELECT id INTO v_graph_id FROM ruvector.graphs WHERE name = p_graph_name;
SELECT id INTO v_source_id FROM ruvector.nodes
WHERE graph_id = v_graph_id AND external_id = p_source_external_id;
SELECT id INTO v_target_id FROM ruvector.nodes
WHERE graph_id = v_graph_id AND external_id = p_target_external_id;
INSERT INTO ruvector.edges (graph_id, source_id, target_id, edge_type, weight, properties)
VALUES (v_graph_id, v_source_id, v_target_id, p_edge_type, p_weight, p_properties)
RETURNING id INTO v_edge_id;
UPDATE ruvector.graphs SET edge_count = edge_count + 1, updated_at = NOW()
WHERE id = v_graph_id;
RETURN v_edge_id;
END;
$$ LANGUAGE plpgsql;
10.6 Integrity Control Functions
-- Sample integrity state
CREATE FUNCTION ruvector_integrity_sample(p_collection_name TEXT)
RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_integrity_sample' LANGUAGE C;
-- Set integrity policy
CREATE FUNCTION ruvector_integrity_policy_set(
p_collection_name TEXT,
p_policy_name TEXT,
p_config JSONB
) RETURNS BOOLEAN AS $$
DECLARE
v_collection_id INTEGER;
BEGIN
SELECT id INTO v_collection_id FROM ruvector.collections WHERE name = p_collection_name;
IF NOT FOUND THEN
RAISE EXCEPTION 'Collection not found: %', p_collection_name;
END IF;
-- Update or insert policy
INSERT INTO ruvector.integrity_policies (collection_id, name,
threshold_high, threshold_low,
normal_actions, stress_actions, critical_actions,
sample_interval_secs, sample_size)
VALUES (
v_collection_id,
p_policy_name,
COALESCE((p_config->>'threshold_high')::REAL, 0.8),
COALESCE((p_config->>'threshold_low')::REAL, 0.3),
COALESCE(p_config->'normal_actions', '{}'::jsonb),
COALESCE(p_config->'stress_actions', '{}'::jsonb),
COALESCE(p_config->'critical_actions', '{}'::jsonb),
COALESCE((p_config->>'sample_interval_secs')::INTEGER, 60),
COALESCE((p_config->>'sample_size')::INTEGER, 1000)
)
ON CONFLICT (collection_id, name) DO UPDATE SET
threshold_high = EXCLUDED.threshold_high,
threshold_low = EXCLUDED.threshold_low,
normal_actions = EXCLUDED.normal_actions,
stress_actions = EXCLUDED.stress_actions,
critical_actions = EXCLUDED.critical_actions,
sample_interval_secs = EXCLUDED.sample_interval_secs,
sample_size = EXCLUDED.sample_size,
updated_at = NOW();
RETURN TRUE;
END;
$$ LANGUAGE plpgsql;
-- Check integrity gate
CREATE FUNCTION ruvector_integrity_gate(
p_collection_name TEXT,
p_operation TEXT
) RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_integrity_gate' LANGUAGE C;
-- Get current integrity state
CREATE FUNCTION ruvector_integrity_status(p_collection_name TEXT)
RETURNS JSONB AS $$
DECLARE
v_result JSONB;
BEGIN
SELECT jsonb_build_object(
'collection', p_collection_name,
'state', s.state,
'lambda_cut', s.lambda_cut,
'witness_edges', s.witness_edges,
'last_sample', s.last_sample,
'sample_count', s.sample_count
) INTO v_result
FROM ruvector.collections c
JOIN ruvector.integrity_state s ON c.id = s.collection_id
WHERE c.name = p_collection_name;
RETURN v_result;
END;
$$ LANGUAGE plpgsql;
10.7 GNN Functions
-- Record interaction for training
CREATE FUNCTION ruvector_record_interaction(
p_collection_name TEXT,
p_query vector,
p_result_ids TEXT[],
p_selected_id TEXT DEFAULT NULL,
p_feedback_score REAL DEFAULT NULL
) RETURNS BIGINT AS $$
DECLARE
v_collection_id INTEGER;
v_id BIGINT;
BEGIN
SELECT id INTO v_collection_id FROM ruvector.collections WHERE name = p_collection_name;
INSERT INTO ruvector.training_interactions
(collection_id, query_vector, result_ids, selected_id, feedback_score)
VALUES
(v_collection_id, p_query, p_result_ids, p_selected_id, p_feedback_score)
RETURNING id INTO v_id;
RETURN v_id;
END;
$$ LANGUAGE plpgsql;
-- Trigger GNN training
CREATE FUNCTION ruvector_gnn_train(
p_collection_name TEXT,
p_model_type TEXT DEFAULT 'graphsage',
p_epochs INTEGER DEFAULT 100,
p_config JSONB DEFAULT '{}'::jsonb
) RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_gnn_train' LANGUAGE C;
-- Get active GNN model
CREATE FUNCTION ruvector_gnn_model_active(p_collection_name TEXT)
RETURNS JSONB AS $$
DECLARE
v_result JSONB;
BEGIN
SELECT jsonb_build_object(
'model_name', m.model_name,
'model_type', m.model_type,
'version', m.version,
'training_samples', m.training_samples,
'validation_recall', m.validation_recall,
'created_at', m.created_at
) INTO v_result
FROM ruvector.collections c
JOIN ruvector.gnn_models m ON c.id = m.collection_id
WHERE c.name = p_collection_name AND m.is_active;
RETURN COALESCE(v_result, '{}'::jsonb);
END;
$$ LANGUAGE plpgsql;
11. Utility Functions
-- Extension version
CREATE FUNCTION ruvector_version() RETURNS TEXT
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE PARALLEL SAFE;
-- SIMD capability info
CREATE FUNCTION ruvector_simd_info() RETURNS TEXT
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE PARALLEL SAFE;
-- Memory statistics
CREATE FUNCTION ruvector_memory_stats() RETURNS JSONB
AS 'MODULE_PATHNAME' LANGUAGE C;
-- Vector utilities
CREATE FUNCTION vector_dims(vector) RETURNS INTEGER
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_norm(vector) RETURNS REAL
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
CREATE FUNCTION vector_normalize(vector) RETURNS vector
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- Aggregates
CREATE AGGREGATE avg(vector) (
SFUNC = vector_accum,
STYPE = internal,
FINALFUNC = vector_avg_final,
COMBINEFUNC = vector_combine,
PARALLEL = SAFE
);
CREATE AGGREGATE sum(vector) (
SFUNC = vector_add,
STYPE = vector,
COMBINEFUNC = vector_add,
PARALLEL = SAFE
);
12. Schema Initialization
-- Create extension schema
CREATE SCHEMA IF NOT EXISTS ruvector;
-- Set search path
ALTER EXTENSION ruvector SET SCHEMA ruvector;
-- Grant usage
GRANT USAGE ON SCHEMA ruvector TO PUBLIC;
GRANT SELECT ON ALL TABLES IN SCHEMA ruvector TO PUBLIC;
-- Create roles
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ruvector_admin') THEN
CREATE ROLE ruvector_admin;
END IF;
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ruvector_operator') THEN
CREATE ROLE ruvector_operator;
END IF;
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ruvector_user') THEN
CREATE ROLE ruvector_user;
END IF;
END
$$;
-- Grant permissions
GRANT ALL ON SCHEMA ruvector TO ruvector_admin;
GRANT ALL ON ALL TABLES IN SCHEMA ruvector TO ruvector_admin;
GRANT ALL ON ALL SEQUENCES IN SCHEMA ruvector TO ruvector_admin;
GRANT SELECT, INSERT, UPDATE, DELETE ON ruvector.collections TO ruvector_operator;
GRANT SELECT, INSERT, UPDATE, DELETE ON ruvector.index_stats TO ruvector_operator;
GRANT SELECT, INSERT, UPDATE, DELETE ON ruvector.tier_policies TO ruvector_operator;
GRANT SELECT ON ALL TABLES IN SCHEMA ruvector TO ruvector_user;
Testing Requirements
Type Tests
- Vector input/output roundtrip
- Dimension validation
- Operator correctness
Index Tests
- HNSW build and search
- IVFFlat build and search
- Recall at various ef_search
Function Tests
- All functions callable
- Error handling
- Permission checks
Dependencies
- PostgreSQL 14+ (for operator class features)
- pgrx 0.11+ (Rust bindings)
- serde (JSON serialization)
- ed25519-dalek (signature verification)