git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
1294 lines
37 KiB
Markdown
1294 lines
37 KiB
Markdown
# RuVector Postgres v2 - SQL Schema Specification
|
|
|
|
## Overview
|
|
|
|
This document defines the complete SQL schema for RuVector Postgres v2, including types, operators, functions, tables, and views. The schema is designed for **100% pgvector compatibility** while providing extended functionality.
|
|
|
|
---
|
|
|
|
## 1. Data Types
|
|
|
|
### 1.1 Core Vector Types (pgvector Compatible)
|
|
|
|
```sql
|
|
-- Dense vector type (pgvector compatible)
|
|
CREATE TYPE vector;
|
|
|
|
-- Input/output functions
|
|
CREATE FUNCTION vector_in(cstring, oid, integer) RETURNS vector
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_out(vector) RETURNS cstring
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_typmod_in(cstring[]) RETURNS integer
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_typmod_out(integer) RETURNS cstring
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_recv(internal, oid, integer) RETURNS vector
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_send(vector) RETURNS bytea
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE TYPE vector (
|
|
INPUT = vector_in,
|
|
OUTPUT = vector_out,
|
|
TYPMOD_IN = vector_typmod_in,
|
|
TYPMOD_OUT = vector_typmod_out,
|
|
RECEIVE = vector_recv,
|
|
SEND = vector_send,
|
|
STORAGE = external,
|
|
INTERNALLENGTH = VARIABLE,
|
|
ALIGNMENT = double
|
|
);
|
|
```
|
|
|
|
### 1.2 Extended Vector Types
|
|
|
|
```sql
|
|
-- Half-precision vector (float16)
|
|
CREATE TYPE halfvec (
|
|
INPUT = halfvec_in,
|
|
OUTPUT = halfvec_out,
|
|
TYPMOD_IN = halfvec_typmod_in,
|
|
TYPMOD_OUT = halfvec_typmod_out,
|
|
RECEIVE = halfvec_recv,
|
|
SEND = halfvec_send,
|
|
STORAGE = external,
|
|
INTERNALLENGTH = VARIABLE,
|
|
ALIGNMENT = double
|
|
);
|
|
|
|
-- Sparse vector (coordinate format)
|
|
CREATE TYPE sparsevec (
|
|
INPUT = sparsevec_in,
|
|
OUTPUT = sparsevec_out,
|
|
TYPMOD_IN = sparsevec_typmod_in,
|
|
TYPMOD_OUT = sparsevec_typmod_out,
|
|
RECEIVE = sparsevec_recv,
|
|
SEND = sparsevec_send,
|
|
STORAGE = external,
|
|
INTERNALLENGTH = VARIABLE,
|
|
ALIGNMENT = double
|
|
);
|
|
|
|
-- Binary vector (bit-packed)
|
|
CREATE TYPE binaryvec (
|
|
INPUT = binaryvec_in,
|
|
OUTPUT = binaryvec_out,
|
|
TYPMOD_IN = binaryvec_typmod_in,
|
|
TYPMOD_OUT = binaryvec_typmod_out,
|
|
RECEIVE = binaryvec_recv,
|
|
SEND = binaryvec_send,
|
|
STORAGE = plain,
|
|
INTERNALLENGTH = VARIABLE,
|
|
ALIGNMENT = int4
|
|
);
|
|
```
|
|
|
|
---
|
|
|
|
## 2. Operators (pgvector Compatible)
|
|
|
|
### 2.1 Distance Operators
|
|
|
|
```sql
|
|
-- L2 (Euclidean) distance
|
|
CREATE OPERATOR <-> (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_l2_distance,
|
|
COMMUTATOR = <->
|
|
);
|
|
|
|
-- Cosine distance (1 - cosine_similarity)
|
|
CREATE OPERATOR <=> (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_cosine_distance,
|
|
COMMUTATOR = <=>
|
|
);
|
|
|
|
-- Negative inner product (for MAX inner product via MIN)
|
|
CREATE OPERATOR <#> (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_negative_inner_product,
|
|
COMMUTATOR = <#>
|
|
);
|
|
|
|
-- L1 (Manhattan) distance (RuVector extension)
|
|
CREATE OPERATOR <+> (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_l1_distance,
|
|
COMMUTATOR = <+>
|
|
);
|
|
```
|
|
|
|
### 2.2 Operator Implementation Functions
|
|
|
|
```sql
|
|
-- Distance calculation functions
|
|
CREATE FUNCTION vector_l2_distance(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_cosine_distance(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_negative_inner_product(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_l1_distance(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
-- Similarity functions (convenience)
|
|
CREATE FUNCTION cosine_similarity(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME', 'vector_cosine_similarity'
|
|
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION inner_product(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME', 'vector_inner_product'
|
|
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION l2_distance(vector, vector) RETURNS float8
|
|
AS 'MODULE_PATHNAME', 'vector_l2_distance'
|
|
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
```
|
|
|
|
### 2.3 Vector Arithmetic
|
|
|
|
```sql
|
|
-- Addition
|
|
CREATE OPERATOR + (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_add,
|
|
COMMUTATOR = +
|
|
);
|
|
|
|
-- Subtraction
|
|
CREATE OPERATOR - (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_sub
|
|
);
|
|
|
|
-- Scalar multiplication
|
|
CREATE OPERATOR * (
|
|
LEFTARG = vector,
|
|
RIGHTARG = float8,
|
|
FUNCTION = vector_mul_scalar
|
|
);
|
|
|
|
CREATE OPERATOR * (
|
|
LEFTARG = float8,
|
|
RIGHTARG = vector,
|
|
FUNCTION = scalar_mul_vector,
|
|
COMMUTATOR = *
|
|
);
|
|
|
|
-- Comparison (for DISTINCT, GROUP BY)
|
|
CREATE OPERATOR = (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_eq,
|
|
COMMUTATOR = =,
|
|
NEGATOR = <>,
|
|
HASHES,
|
|
MERGES
|
|
);
|
|
|
|
CREATE OPERATOR <> (
|
|
LEFTARG = vector,
|
|
RIGHTARG = vector,
|
|
FUNCTION = vector_ne,
|
|
COMMUTATOR = <>,
|
|
NEGATOR = =
|
|
);
|
|
```
|
|
|
|
---
|
|
|
|
## 3. Operator Classes
|
|
|
|
### 3.1 HNSW Operator Classes
|
|
|
|
```sql
|
|
-- L2 distance for HNSW
|
|
CREATE OPERATOR CLASS vector_l2_ops
|
|
DEFAULT FOR TYPE vector USING hnsw AS
|
|
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
|
|
FUNCTION 1 vector_l2_distance(vector, vector);
|
|
|
|
-- Cosine distance for HNSW
|
|
CREATE OPERATOR CLASS vector_cosine_ops
|
|
FOR TYPE vector USING hnsw AS
|
|
OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
|
|
FUNCTION 1 vector_cosine_distance(vector, vector);
|
|
|
|
-- Inner product for HNSW
|
|
CREATE OPERATOR CLASS vector_ip_ops
|
|
FOR TYPE vector USING hnsw AS
|
|
OPERATOR 1 <#> (vector, vector) FOR ORDER BY float_ops,
|
|
FUNCTION 1 vector_negative_inner_product(vector, vector);
|
|
```
|
|
|
|
### 3.2 IVFFlat Operator Classes
|
|
|
|
```sql
|
|
-- L2 distance for IVFFlat
|
|
CREATE OPERATOR CLASS vector_l2_ops
|
|
DEFAULT FOR TYPE vector USING ivfflat AS
|
|
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
|
|
FUNCTION 1 vector_l2_distance(vector, vector);
|
|
|
|
-- Cosine distance for IVFFlat
|
|
CREATE OPERATOR CLASS vector_cosine_ops
|
|
FOR TYPE vector USING ivfflat AS
|
|
OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops,
|
|
FUNCTION 1 vector_cosine_distance(vector, vector);
|
|
|
|
-- Inner product for IVFFlat
|
|
CREATE OPERATOR CLASS vector_ip_ops
|
|
FOR TYPE vector USING ivfflat AS
|
|
OPERATOR 1 <#> (vector, vector) FOR ORDER BY float_ops,
|
|
FUNCTION 1 vector_negative_inner_product(vector, vector);
|
|
```
|
|
|
|
---
|
|
|
|
## 4. Index Access Methods
|
|
|
|
### 4.1 HNSW Access Method
|
|
|
|
```sql
|
|
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
|
|
|
|
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
|
|
AS 'MODULE_PATHNAME' LANGUAGE C;
|
|
|
|
COMMENT ON ACCESS METHOD hnsw IS
|
|
'HNSW index for approximate nearest neighbor search';
|
|
```
|
|
|
|
### 4.2 IVFFlat Access Method
|
|
|
|
```sql
|
|
CREATE ACCESS METHOD ivfflat TYPE INDEX HANDLER ivfflat_handler;
|
|
|
|
CREATE FUNCTION ivfflat_handler(internal) RETURNS index_am_handler
|
|
AS 'MODULE_PATHNAME' LANGUAGE C;
|
|
|
|
COMMENT ON ACCESS METHOD ivfflat IS
|
|
'IVF-Flat index for approximate nearest neighbor search';
|
|
```
|
|
|
|
---
|
|
|
|
## 5. Core Extension Tables
|
|
|
|
### 5.1 Collection Metadata
|
|
|
|
```sql
|
|
-- Stores metadata about vector collections
|
|
CREATE TABLE ruvector.collections (
|
|
id SERIAL PRIMARY KEY,
|
|
name TEXT NOT NULL UNIQUE,
|
|
table_schema TEXT NOT NULL,
|
|
table_name TEXT NOT NULL,
|
|
column_name TEXT NOT NULL,
|
|
dimensions INTEGER NOT NULL CHECK (dimensions > 0 AND dimensions <= 16000),
|
|
distance_metric TEXT NOT NULL DEFAULT 'l2'
|
|
CHECK (distance_metric IN ('l2', 'cosine', 'ip', 'l1')),
|
|
index_type TEXT CHECK (index_type IN ('hnsw', 'ivfflat', 'flat')),
|
|
index_oid OID,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
config JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
|
|
UNIQUE(table_schema, table_name, column_name)
|
|
);
|
|
|
|
CREATE INDEX idx_collections_table
|
|
ON ruvector.collections(table_schema, table_name);
|
|
```
|
|
|
|
### 5.2 Index Statistics
|
|
|
|
```sql
|
|
-- Stores per-index statistics
|
|
CREATE TABLE ruvector.index_stats (
|
|
index_oid OID PRIMARY KEY,
|
|
collection_id INTEGER REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
vector_count BIGINT NOT NULL DEFAULT 0,
|
|
index_size_bytes BIGINT NOT NULL DEFAULT 0,
|
|
last_build TIMESTAMPTZ,
|
|
last_vacuum TIMESTAMPTZ,
|
|
last_sample TIMESTAMPTZ,
|
|
build_time_ms BIGINT,
|
|
avg_search_ms REAL,
|
|
recall_estimate REAL,
|
|
stats_json JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_index_stats_collection
|
|
ON ruvector.index_stats(collection_id);
|
|
```
|
|
|
|
---
|
|
|
|
## 6. Tiered Storage Tables (Phase 2)
|
|
|
|
### 6.1 Tier Configuration
|
|
|
|
```sql
|
|
-- Tier policy configuration per collection
|
|
CREATE TABLE ruvector.tier_policies (
|
|
id SERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
tier_name TEXT NOT NULL CHECK (tier_name IN ('hot', 'warm', 'cool', 'cold')),
|
|
threshold_hours INTEGER NOT NULL DEFAULT 24,
|
|
compression TEXT NOT NULL DEFAULT 'none'
|
|
CHECK (compression IN ('none', 'sq8', 'pq16', 'pq32', 'pq64')),
|
|
storage_path TEXT, -- NULL = default storage
|
|
enabled BOOLEAN NOT NULL DEFAULT true,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
UNIQUE(collection_id, tier_name)
|
|
);
|
|
|
|
-- Default tier policies
|
|
INSERT INTO ruvector.tier_policies (collection_id, tier_name, threshold_hours, compression)
|
|
SELECT id, 'hot', 0, 'none' FROM ruvector.collections
|
|
UNION ALL
|
|
SELECT id, 'warm', 24, 'sq8' FROM ruvector.collections
|
|
UNION ALL
|
|
SELECT id, 'cool', 168, 'pq16' FROM ruvector.collections
|
|
UNION ALL
|
|
SELECT id, 'cold', 720, 'pq32' FROM ruvector.collections;
|
|
```
|
|
|
|
### 6.2 Access Tracking
|
|
|
|
```sql
|
|
-- Per-vector access counters (partitioned by collection)
|
|
CREATE TABLE ruvector.access_counters (
|
|
collection_id INTEGER NOT NULL,
|
|
vector_tid TID NOT NULL, -- Tuple ID in source table
|
|
access_count INTEGER NOT NULL DEFAULT 0,
|
|
last_access TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
current_tier TEXT NOT NULL DEFAULT 'hot',
|
|
|
|
PRIMARY KEY (collection_id, vector_tid)
|
|
) PARTITION BY LIST (collection_id);
|
|
|
|
-- Create partitions dynamically via trigger on collections insert
|
|
CREATE OR REPLACE FUNCTION ruvector.create_access_partition()
|
|
RETURNS TRIGGER AS $$
|
|
BEGIN
|
|
EXECUTE format(
|
|
'CREATE TABLE IF NOT EXISTS ruvector.access_counters_%s
|
|
PARTITION OF ruvector.access_counters
|
|
FOR VALUES IN (%s)',
|
|
NEW.id, NEW.id
|
|
);
|
|
RETURN NEW;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
CREATE TRIGGER trg_create_access_partition
|
|
AFTER INSERT ON ruvector.collections
|
|
FOR EACH ROW EXECUTE FUNCTION ruvector.create_access_partition();
|
|
```
|
|
|
|
### 6.3 Tier Statistics
|
|
|
|
```sql
|
|
-- Aggregated tier statistics
|
|
CREATE TABLE ruvector.tier_stats (
|
|
id SERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
tier_name TEXT NOT NULL,
|
|
vector_count BIGINT NOT NULL DEFAULT 0,
|
|
size_bytes BIGINT NOT NULL DEFAULT 0,
|
|
avg_access_rate REAL NOT NULL DEFAULT 0,
|
|
last_compaction TIMESTAMPTZ,
|
|
compaction_count INTEGER NOT NULL DEFAULT 0,
|
|
snapshot_time TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
UNIQUE(collection_id, tier_name, snapshot_time)
|
|
);
|
|
|
|
CREATE INDEX idx_tier_stats_collection_time
|
|
ON ruvector.tier_stats(collection_id, snapshot_time DESC);
|
|
```
|
|
|
|
---
|
|
|
|
## 7. Graph Schema (Phase 3)
|
|
|
|
### 7.1 Graph Storage
|
|
|
|
```sql
|
|
-- Graph metadata
|
|
CREATE TABLE ruvector.graphs (
|
|
id SERIAL PRIMARY KEY,
|
|
name TEXT NOT NULL UNIQUE,
|
|
description TEXT,
|
|
node_count BIGINT NOT NULL DEFAULT 0,
|
|
edge_count BIGINT NOT NULL DEFAULT 0,
|
|
hyperedge_count BIGINT NOT NULL DEFAULT 0,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
config JSONB NOT NULL DEFAULT '{}'::jsonb
|
|
);
|
|
|
|
-- Graph nodes (can reference vectors)
|
|
CREATE TABLE ruvector.nodes (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
graph_id INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
|
|
external_id TEXT, -- Application-provided ID
|
|
node_type TEXT NOT NULL DEFAULT 'default',
|
|
vector_ref TID, -- Reference to vector in user table
|
|
collection_id INTEGER REFERENCES ruvector.collections(id),
|
|
properties JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
UNIQUE(graph_id, external_id)
|
|
);
|
|
|
|
CREATE INDEX idx_nodes_graph_type ON ruvector.nodes(graph_id, node_type);
|
|
CREATE INDEX idx_nodes_properties ON ruvector.nodes USING gin(properties);
|
|
|
|
-- Graph edges
|
|
CREATE TABLE ruvector.edges (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
graph_id INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
|
|
source_id BIGINT NOT NULL REFERENCES ruvector.nodes(id) ON DELETE CASCADE,
|
|
target_id BIGINT NOT NULL REFERENCES ruvector.nodes(id) ON DELETE CASCADE,
|
|
edge_type TEXT NOT NULL DEFAULT 'default',
|
|
weight REAL NOT NULL DEFAULT 1.0,
|
|
properties JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
-- No self-loops by default
|
|
CHECK (source_id <> target_id)
|
|
);
|
|
|
|
CREATE INDEX idx_edges_source ON ruvector.edges(graph_id, source_id);
|
|
CREATE INDEX idx_edges_target ON ruvector.edges(graph_id, target_id);
|
|
CREATE INDEX idx_edges_type ON ruvector.edges(graph_id, edge_type);
|
|
|
|
-- Hyperedges (connect multiple nodes)
|
|
CREATE TABLE ruvector.hyperedges (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
graph_id INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
|
|
hyperedge_type TEXT NOT NULL DEFAULT 'default',
|
|
node_ids BIGINT[] NOT NULL,
|
|
weights REAL[], -- Optional per-node weights
|
|
properties JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
CHECK (array_length(node_ids, 1) >= 2)
|
|
);
|
|
|
|
CREATE INDEX idx_hyperedges_graph ON ruvector.hyperedges(graph_id);
|
|
CREATE INDEX idx_hyperedges_nodes ON ruvector.hyperedges USING gin(node_ids);
|
|
```
|
|
|
|
### 7.2 Graph Views (Relational Bridge)
|
|
|
|
```sql
|
|
-- Unified node view with vector data
|
|
CREATE VIEW ruvector.nodes_view AS
|
|
SELECT
|
|
n.id,
|
|
n.graph_id,
|
|
g.name AS graph_name,
|
|
n.external_id,
|
|
n.node_type,
|
|
n.properties,
|
|
n.created_at,
|
|
c.table_schema,
|
|
c.table_name,
|
|
c.column_name,
|
|
n.vector_ref
|
|
FROM ruvector.nodes n
|
|
JOIN ruvector.graphs g ON n.graph_id = g.id
|
|
LEFT JOIN ruvector.collections c ON n.collection_id = c.id;
|
|
|
|
-- Edge view with source/target details
|
|
CREATE VIEW ruvector.edges_view AS
|
|
SELECT
|
|
e.id,
|
|
e.graph_id,
|
|
g.name AS graph_name,
|
|
e.source_id,
|
|
src.external_id AS source_external_id,
|
|
src.node_type AS source_type,
|
|
e.target_id,
|
|
tgt.external_id AS target_external_id,
|
|
tgt.node_type AS target_type,
|
|
e.edge_type,
|
|
e.weight,
|
|
e.properties,
|
|
e.created_at
|
|
FROM ruvector.edges e
|
|
JOIN ruvector.graphs g ON e.graph_id = g.id
|
|
JOIN ruvector.nodes src ON e.source_id = src.id
|
|
JOIN ruvector.nodes tgt ON e.target_id = tgt.id;
|
|
|
|
-- Hyperedge view with expanded nodes
|
|
CREATE VIEW ruvector.hyperedges_view AS
|
|
SELECT
|
|
h.id,
|
|
h.graph_id,
|
|
g.name AS graph_name,
|
|
h.hyperedge_type,
|
|
h.node_ids,
|
|
h.weights,
|
|
h.properties,
|
|
h.created_at,
|
|
array_agg(n.external_id) AS node_external_ids
|
|
FROM ruvector.hyperedges h
|
|
JOIN ruvector.graphs g ON h.graph_id = g.id
|
|
LEFT JOIN LATERAL unnest(h.node_ids) AS node_id ON true
|
|
LEFT JOIN ruvector.nodes n ON n.id = node_id
|
|
GROUP BY h.id, g.name;
|
|
```
|
|
|
|
---
|
|
|
|
## 8. Integrity Control Schema (Phase 4)
|
|
|
|
### 8.1 Integrity State
|
|
|
|
```sql
|
|
-- Current integrity state per collection
|
|
CREATE TABLE ruvector.integrity_state (
|
|
collection_id INTEGER PRIMARY KEY REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
state TEXT NOT NULL DEFAULT 'normal'
|
|
CHECK (state IN ('normal', 'stress', 'critical')),
|
|
lambda_cut REAL NOT NULL DEFAULT 1.0,
|
|
witness_edges JSONB NOT NULL DEFAULT '[]'::jsonb,
|
|
last_sample TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
sample_count INTEGER NOT NULL DEFAULT 0,
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
-- Integrity state transitions history
|
|
CREATE TABLE ruvector.integrity_events (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
event_type TEXT NOT NULL CHECK (event_type IN (
|
|
'state_change', 'sample', 'policy_update', 'manual_override', 'recovery'
|
|
)),
|
|
previous_state TEXT,
|
|
new_state TEXT,
|
|
lambda_cut REAL,
|
|
witness_edges JSONB,
|
|
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
signature BYTEA, -- Ed25519 signature for audit
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_integrity_events_collection_time
|
|
ON ruvector.integrity_events(collection_id, created_at DESC);
|
|
CREATE INDEX idx_integrity_events_type
|
|
ON ruvector.integrity_events(event_type, created_at DESC);
|
|
```
|
|
|
|
### 8.2 Integrity Policies
|
|
|
|
```sql
|
|
-- Integrity policies per collection
|
|
CREATE TABLE ruvector.integrity_policies (
|
|
id SERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
name TEXT NOT NULL,
|
|
description TEXT,
|
|
|
|
-- Thresholds
|
|
threshold_high REAL NOT NULL DEFAULT 0.8, -- Above = normal
|
|
threshold_low REAL NOT NULL DEFAULT 0.3, -- Below = critical
|
|
|
|
-- Actions by state
|
|
normal_actions JSONB NOT NULL DEFAULT '{
|
|
"allow_bulk_insert": true,
|
|
"allow_index_rewire": true,
|
|
"allow_compression": true,
|
|
"allow_replication": true
|
|
}'::jsonb,
|
|
|
|
stress_actions JSONB NOT NULL DEFAULT '{
|
|
"allow_bulk_insert": false,
|
|
"allow_index_rewire": true,
|
|
"allow_compression": true,
|
|
"allow_replication": true,
|
|
"throttle_inserts_pct": 50
|
|
}'::jsonb,
|
|
|
|
critical_actions JSONB NOT NULL DEFAULT '{
|
|
"allow_bulk_insert": false,
|
|
"allow_index_rewire": false,
|
|
"allow_compression": false,
|
|
"allow_replication": true,
|
|
"throttle_inserts_pct": 90
|
|
}'::jsonb,
|
|
|
|
-- Sampling configuration
|
|
sample_interval_secs INTEGER NOT NULL DEFAULT 60,
|
|
sample_size INTEGER NOT NULL DEFAULT 1000,
|
|
|
|
enabled BOOLEAN NOT NULL DEFAULT true,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
UNIQUE(collection_id, name)
|
|
);
|
|
```
|
|
|
|
### 8.3 Contracted Graph Schema
|
|
|
|
```sql
|
|
-- Contracted operational graph for mincut computation
|
|
-- CRITICAL: Never compute mincut on full similarity graph!
|
|
CREATE TABLE ruvector.contracted_graph (
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
node_type TEXT NOT NULL CHECK (node_type IN (
|
|
'partition', 'centroid', 'shard', 'maintenance_dep'
|
|
)),
|
|
node_id BIGINT NOT NULL,
|
|
node_data JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
|
|
PRIMARY KEY (collection_id, node_type, node_id)
|
|
);
|
|
|
|
CREATE TABLE ruvector.contracted_edges (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
source_type TEXT NOT NULL,
|
|
source_id BIGINT NOT NULL,
|
|
target_type TEXT NOT NULL,
|
|
target_id BIGINT NOT NULL,
|
|
edge_type TEXT NOT NULL CHECK (edge_type IN (
|
|
'partition_link', 'routing_link', 'dependency', 'replication'
|
|
)),
|
|
capacity REAL NOT NULL DEFAULT 1.0, -- For max-flow computation
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
FOREIGN KEY (collection_id, source_type, source_id)
|
|
REFERENCES ruvector.contracted_graph(collection_id, node_type, node_id) ON DELETE CASCADE,
|
|
FOREIGN KEY (collection_id, target_type, target_id)
|
|
REFERENCES ruvector.contracted_graph(collection_id, node_type, node_id) ON DELETE CASCADE
|
|
);
|
|
|
|
CREATE INDEX idx_contracted_edges_source
|
|
ON ruvector.contracted_edges(collection_id, source_type, source_id);
|
|
CREATE INDEX idx_contracted_edges_target
|
|
ON ruvector.contracted_edges(collection_id, target_type, target_id);
|
|
```
|
|
|
|
---
|
|
|
|
## 9. GNN & Learning Tables
|
|
|
|
### 9.1 Training Data
|
|
|
|
```sql
|
|
-- Captured training interactions
|
|
CREATE TABLE ruvector.training_interactions (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
query_vector vector, -- Query that was performed
|
|
result_ids TEXT[] NOT NULL, -- TIDs of returned results
|
|
selected_id TEXT, -- TID user actually selected (if known)
|
|
feedback_score REAL, -- Explicit feedback (-1 to 1)
|
|
latency_ms INTEGER,
|
|
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
CREATE INDEX idx_training_collection_time
|
|
ON ruvector.training_interactions(collection_id, created_at DESC);
|
|
|
|
-- Partitioned by time for efficient cleanup
|
|
-- In production, add monthly partitioning
|
|
```
|
|
|
|
### 9.2 GNN Models
|
|
|
|
```sql
|
|
-- Trained GNN model registry
|
|
CREATE TABLE ruvector.gnn_models (
|
|
id SERIAL PRIMARY KEY,
|
|
collection_id INTEGER NOT NULL REFERENCES ruvector.collections(id) ON DELETE CASCADE,
|
|
model_name TEXT NOT NULL,
|
|
model_type TEXT NOT NULL CHECK (model_type IN ('gcn', 'graphsage', 'gat', 'custom')),
|
|
version INTEGER NOT NULL DEFAULT 1,
|
|
is_active BOOLEAN NOT NULL DEFAULT false,
|
|
|
|
-- Model binary (serialized weights)
|
|
model_data BYTEA,
|
|
|
|
-- Training metadata
|
|
training_samples INTEGER,
|
|
training_epochs INTEGER,
|
|
validation_loss REAL,
|
|
validation_recall REAL,
|
|
|
|
-- Configuration
|
|
config JSONB NOT NULL DEFAULT '{}'::jsonb,
|
|
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
|
|
UNIQUE(collection_id, model_name, version)
|
|
);
|
|
|
|
CREATE INDEX idx_gnn_models_active
|
|
ON ruvector.gnn_models(collection_id, is_active) WHERE is_active;
|
|
```
|
|
|
|
---
|
|
|
|
## 10. Core Functions
|
|
|
|
### 10.1 Collection Management
|
|
|
|
```sql
|
|
-- Register a new vector collection
|
|
CREATE FUNCTION ruvector_register_collection(
|
|
p_name TEXT,
|
|
p_table_schema TEXT,
|
|
p_table_name TEXT,
|
|
p_column_name TEXT,
|
|
p_dimensions INTEGER,
|
|
p_distance_metric TEXT DEFAULT 'l2'
|
|
) RETURNS INTEGER AS $$
|
|
DECLARE
|
|
v_id INTEGER;
|
|
BEGIN
|
|
INSERT INTO ruvector.collections
|
|
(name, table_schema, table_name, column_name, dimensions, distance_metric)
|
|
VALUES
|
|
(p_name, p_table_schema, p_table_name, p_column_name, p_dimensions, p_distance_metric)
|
|
RETURNING id INTO v_id;
|
|
|
|
-- Initialize integrity state
|
|
INSERT INTO ruvector.integrity_state (collection_id) VALUES (v_id);
|
|
|
|
-- Create default integrity policy
|
|
INSERT INTO ruvector.integrity_policies (collection_id, name, description)
|
|
VALUES (v_id, 'default', 'Default integrity policy');
|
|
|
|
RETURN v_id;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Unregister a collection
|
|
CREATE FUNCTION ruvector_unregister_collection(p_name TEXT) RETURNS BOOLEAN AS $$
|
|
BEGIN
|
|
DELETE FROM ruvector.collections WHERE name = p_name;
|
|
RETURN FOUND;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- List all collections
|
|
CREATE FUNCTION ruvector_list_collections()
|
|
RETURNS TABLE (
|
|
id INTEGER,
|
|
name TEXT,
|
|
table_ref TEXT,
|
|
dimensions INTEGER,
|
|
distance_metric TEXT,
|
|
index_type TEXT,
|
|
vector_count BIGINT
|
|
) AS $$
|
|
BEGIN
|
|
RETURN QUERY
|
|
SELECT
|
|
c.id,
|
|
c.name,
|
|
c.table_schema || '.' || c.table_name || '.' || c.column_name,
|
|
c.dimensions,
|
|
c.distance_metric,
|
|
c.index_type,
|
|
COALESCE(s.vector_count, 0)
|
|
FROM ruvector.collections c
|
|
LEFT JOIN ruvector.index_stats s ON c.index_oid = s.index_oid;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
```
|
|
|
|
### 10.2 Index Management
|
|
|
|
```sql
|
|
-- Create HNSW index with custom parameters
|
|
CREATE FUNCTION ruvector_index_create(
|
|
p_table_name TEXT,
|
|
p_column_name TEXT,
|
|
p_index_type TEXT DEFAULT 'hnsw',
|
|
p_distance TEXT DEFAULT 'l2',
|
|
p_m INTEGER DEFAULT 16,
|
|
p_ef_construction INTEGER DEFAULT 64
|
|
) RETURNS TEXT AS $$
|
|
DECLARE
|
|
v_index_name TEXT;
|
|
v_ops_class TEXT;
|
|
BEGIN
|
|
v_index_name := format('idx_%s_%s_%s',
|
|
replace(p_table_name, '.', '_'),
|
|
p_column_name,
|
|
p_index_type);
|
|
|
|
v_ops_class := CASE p_distance
|
|
WHEN 'l2' THEN 'vector_l2_ops'
|
|
WHEN 'cosine' THEN 'vector_cosine_ops'
|
|
WHEN 'ip' THEN 'vector_ip_ops'
|
|
ELSE 'vector_l2_ops'
|
|
END;
|
|
|
|
IF p_index_type = 'hnsw' THEN
|
|
EXECUTE format(
|
|
'CREATE INDEX %I ON %s USING hnsw (%I %s) WITH (m = %s, ef_construction = %s)',
|
|
v_index_name, p_table_name, p_column_name, v_ops_class, p_m, p_ef_construction
|
|
);
|
|
ELSIF p_index_type = 'ivfflat' THEN
|
|
EXECUTE format(
|
|
'CREATE INDEX %I ON %s USING ivfflat (%I %s) WITH (lists = 100)',
|
|
v_index_name, p_table_name, p_column_name, v_ops_class
|
|
);
|
|
END IF;
|
|
|
|
RETURN v_index_name;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Get index statistics
|
|
CREATE FUNCTION ruvector_index_stats(p_index_name TEXT)
|
|
RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_index_stats' LANGUAGE C;
|
|
|
|
-- Perform index maintenance
|
|
CREATE FUNCTION ruvector_index_maintenance(p_index_name TEXT)
|
|
RETURNS TEXT AS 'MODULE_PATHNAME' LANGUAGE C;
|
|
```
|
|
|
|
### 10.3 Vector Search
|
|
|
|
```sql
|
|
-- Extended search with options
|
|
CREATE FUNCTION ruvector_search(
|
|
p_collection_name TEXT,
|
|
p_query vector,
|
|
p_k INTEGER DEFAULT 10,
|
|
p_ef_search INTEGER DEFAULT NULL,
|
|
p_use_gnn BOOLEAN DEFAULT FALSE,
|
|
p_filter JSONB DEFAULT NULL
|
|
) RETURNS TABLE (
|
|
id TEXT,
|
|
distance REAL,
|
|
score REAL
|
|
) AS 'MODULE_PATHNAME', 'ruvector_search' LANGUAGE C;
|
|
|
|
-- Batch search
|
|
CREATE FUNCTION ruvector_search_batch(
|
|
p_collection_name TEXT,
|
|
p_queries vector[],
|
|
p_k INTEGER DEFAULT 10
|
|
) RETURNS TABLE (
|
|
query_idx INTEGER,
|
|
id TEXT,
|
|
distance REAL
|
|
) AS 'MODULE_PATHNAME', 'ruvector_search_batch' LANGUAGE C;
|
|
```
|
|
|
|
### 10.4 Tiered Storage Functions
|
|
|
|
```sql
|
|
-- Configure tier thresholds
|
|
CREATE FUNCTION ruvector_set_tiers(
|
|
p_collection_name TEXT,
|
|
p_hot_hours INTEGER DEFAULT 24,
|
|
p_warm_hours INTEGER DEFAULT 168,
|
|
p_cool_hours INTEGER DEFAULT 720
|
|
) RETURNS BOOLEAN AS $$
|
|
DECLARE
|
|
v_collection_id INTEGER;
|
|
BEGIN
|
|
SELECT id INTO v_collection_id FROM ruvector.collections WHERE name = p_collection_name;
|
|
IF NOT FOUND THEN
|
|
RAISE EXCEPTION 'Collection not found: %', p_collection_name;
|
|
END IF;
|
|
|
|
UPDATE ruvector.tier_policies
|
|
SET threshold_hours = p_hot_hours WHERE collection_id = v_collection_id AND tier_name = 'warm';
|
|
|
|
UPDATE ruvector.tier_policies
|
|
SET threshold_hours = p_warm_hours WHERE collection_id = v_collection_id AND tier_name = 'cool';
|
|
|
|
UPDATE ruvector.tier_policies
|
|
SET threshold_hours = p_cool_hours WHERE collection_id = v_collection_id AND tier_name = 'cold';
|
|
|
|
RETURN TRUE;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Trigger manual compaction
|
|
CREATE FUNCTION ruvector_compact(p_collection_name TEXT)
|
|
RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_compact' LANGUAGE C;
|
|
|
|
-- Get tier report
|
|
CREATE FUNCTION ruvector_tier_report(p_collection_name TEXT)
|
|
RETURNS TABLE (
|
|
tier_name TEXT,
|
|
vector_count BIGINT,
|
|
size_mb REAL,
|
|
compression TEXT,
|
|
avg_age_hours REAL
|
|
) AS 'MODULE_PATHNAME', 'ruvector_tier_report' LANGUAGE C;
|
|
```
|
|
|
|
### 10.5 Graph & Cypher Functions
|
|
|
|
```sql
|
|
-- Execute Cypher query
|
|
CREATE FUNCTION ruvector_cypher(
|
|
p_graph_name TEXT,
|
|
p_query TEXT,
|
|
p_params JSONB DEFAULT '{}'::jsonb
|
|
) RETURNS SETOF JSONB AS 'MODULE_PATHNAME', 'ruvector_cypher' LANGUAGE C;
|
|
|
|
-- Create graph
|
|
CREATE FUNCTION ruvector_graph_create(
|
|
p_name TEXT,
|
|
p_description TEXT DEFAULT NULL
|
|
) RETURNS INTEGER AS $$
|
|
DECLARE
|
|
v_id INTEGER;
|
|
BEGIN
|
|
INSERT INTO ruvector.graphs (name, description)
|
|
VALUES (p_name, p_description)
|
|
RETURNING id INTO v_id;
|
|
|
|
RETURN v_id;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Add node
|
|
CREATE FUNCTION ruvector_node_add(
|
|
p_graph_name TEXT,
|
|
p_external_id TEXT,
|
|
p_node_type TEXT DEFAULT 'default',
|
|
p_properties JSONB DEFAULT '{}'::jsonb
|
|
) RETURNS BIGINT AS $$
|
|
DECLARE
|
|
v_graph_id INTEGER;
|
|
v_node_id BIGINT;
|
|
BEGIN
|
|
SELECT id INTO v_graph_id FROM ruvector.graphs WHERE name = p_graph_name;
|
|
IF NOT FOUND THEN
|
|
RAISE EXCEPTION 'Graph not found: %', p_graph_name;
|
|
END IF;
|
|
|
|
INSERT INTO ruvector.nodes (graph_id, external_id, node_type, properties)
|
|
VALUES (v_graph_id, p_external_id, p_node_type, p_properties)
|
|
RETURNING id INTO v_node_id;
|
|
|
|
UPDATE ruvector.graphs SET node_count = node_count + 1, updated_at = NOW()
|
|
WHERE id = v_graph_id;
|
|
|
|
RETURN v_node_id;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Add edge
|
|
CREATE FUNCTION ruvector_edge_add(
|
|
p_graph_name TEXT,
|
|
p_source_external_id TEXT,
|
|
p_target_external_id TEXT,
|
|
p_edge_type TEXT DEFAULT 'default',
|
|
p_weight REAL DEFAULT 1.0,
|
|
p_properties JSONB DEFAULT '{}'::jsonb
|
|
) RETURNS BIGINT AS $$
|
|
DECLARE
|
|
v_graph_id INTEGER;
|
|
v_source_id BIGINT;
|
|
v_target_id BIGINT;
|
|
v_edge_id BIGINT;
|
|
BEGIN
|
|
SELECT id INTO v_graph_id FROM ruvector.graphs WHERE name = p_graph_name;
|
|
|
|
SELECT id INTO v_source_id FROM ruvector.nodes
|
|
WHERE graph_id = v_graph_id AND external_id = p_source_external_id;
|
|
|
|
SELECT id INTO v_target_id FROM ruvector.nodes
|
|
WHERE graph_id = v_graph_id AND external_id = p_target_external_id;
|
|
|
|
INSERT INTO ruvector.edges (graph_id, source_id, target_id, edge_type, weight, properties)
|
|
VALUES (v_graph_id, v_source_id, v_target_id, p_edge_type, p_weight, p_properties)
|
|
RETURNING id INTO v_edge_id;
|
|
|
|
UPDATE ruvector.graphs SET edge_count = edge_count + 1, updated_at = NOW()
|
|
WHERE id = v_graph_id;
|
|
|
|
RETURN v_edge_id;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
```
|
|
|
|
### 10.6 Integrity Control Functions
|
|
|
|
```sql
|
|
-- Sample integrity state
|
|
CREATE FUNCTION ruvector_integrity_sample(p_collection_name TEXT)
|
|
RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_integrity_sample' LANGUAGE C;
|
|
|
|
-- Set integrity policy
|
|
CREATE FUNCTION ruvector_integrity_policy_set(
|
|
p_collection_name TEXT,
|
|
p_policy_name TEXT,
|
|
p_config JSONB
|
|
) RETURNS BOOLEAN AS $$
|
|
DECLARE
|
|
v_collection_id INTEGER;
|
|
BEGIN
|
|
SELECT id INTO v_collection_id FROM ruvector.collections WHERE name = p_collection_name;
|
|
IF NOT FOUND THEN
|
|
RAISE EXCEPTION 'Collection not found: %', p_collection_name;
|
|
END IF;
|
|
|
|
-- Update or insert policy
|
|
INSERT INTO ruvector.integrity_policies (collection_id, name,
|
|
threshold_high, threshold_low,
|
|
normal_actions, stress_actions, critical_actions,
|
|
sample_interval_secs, sample_size)
|
|
VALUES (
|
|
v_collection_id,
|
|
p_policy_name,
|
|
COALESCE((p_config->>'threshold_high')::REAL, 0.8),
|
|
COALESCE((p_config->>'threshold_low')::REAL, 0.3),
|
|
COALESCE(p_config->'normal_actions', '{}'::jsonb),
|
|
COALESCE(p_config->'stress_actions', '{}'::jsonb),
|
|
COALESCE(p_config->'critical_actions', '{}'::jsonb),
|
|
COALESCE((p_config->>'sample_interval_secs')::INTEGER, 60),
|
|
COALESCE((p_config->>'sample_size')::INTEGER, 1000)
|
|
)
|
|
ON CONFLICT (collection_id, name) DO UPDATE SET
|
|
threshold_high = EXCLUDED.threshold_high,
|
|
threshold_low = EXCLUDED.threshold_low,
|
|
normal_actions = EXCLUDED.normal_actions,
|
|
stress_actions = EXCLUDED.stress_actions,
|
|
critical_actions = EXCLUDED.critical_actions,
|
|
sample_interval_secs = EXCLUDED.sample_interval_secs,
|
|
sample_size = EXCLUDED.sample_size,
|
|
updated_at = NOW();
|
|
|
|
RETURN TRUE;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Check integrity gate
|
|
CREATE FUNCTION ruvector_integrity_gate(
|
|
p_collection_name TEXT,
|
|
p_operation TEXT
|
|
) RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_integrity_gate' LANGUAGE C;
|
|
|
|
-- Get current integrity state
|
|
CREATE FUNCTION ruvector_integrity_status(p_collection_name TEXT)
|
|
RETURNS JSONB AS $$
|
|
DECLARE
|
|
v_result JSONB;
|
|
BEGIN
|
|
SELECT jsonb_build_object(
|
|
'collection', p_collection_name,
|
|
'state', s.state,
|
|
'lambda_cut', s.lambda_cut,
|
|
'witness_edges', s.witness_edges,
|
|
'last_sample', s.last_sample,
|
|
'sample_count', s.sample_count
|
|
) INTO v_result
|
|
FROM ruvector.collections c
|
|
JOIN ruvector.integrity_state s ON c.id = s.collection_id
|
|
WHERE c.name = p_collection_name;
|
|
|
|
RETURN v_result;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
```
|
|
|
|
### 10.7 GNN Functions
|
|
|
|
```sql
|
|
-- Record interaction for training
|
|
CREATE FUNCTION ruvector_record_interaction(
|
|
p_collection_name TEXT,
|
|
p_query vector,
|
|
p_result_ids TEXT[],
|
|
p_selected_id TEXT DEFAULT NULL,
|
|
p_feedback_score REAL DEFAULT NULL
|
|
) RETURNS BIGINT AS $$
|
|
DECLARE
|
|
v_collection_id INTEGER;
|
|
v_id BIGINT;
|
|
BEGIN
|
|
SELECT id INTO v_collection_id FROM ruvector.collections WHERE name = p_collection_name;
|
|
|
|
INSERT INTO ruvector.training_interactions
|
|
(collection_id, query_vector, result_ids, selected_id, feedback_score)
|
|
VALUES
|
|
(v_collection_id, p_query, p_result_ids, p_selected_id, p_feedback_score)
|
|
RETURNING id INTO v_id;
|
|
|
|
RETURN v_id;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- Trigger GNN training
|
|
CREATE FUNCTION ruvector_gnn_train(
|
|
p_collection_name TEXT,
|
|
p_model_type TEXT DEFAULT 'graphsage',
|
|
p_epochs INTEGER DEFAULT 100,
|
|
p_config JSONB DEFAULT '{}'::jsonb
|
|
) RETURNS JSONB AS 'MODULE_PATHNAME', 'ruvector_gnn_train' LANGUAGE C;
|
|
|
|
-- Get active GNN model
|
|
CREATE FUNCTION ruvector_gnn_model_active(p_collection_name TEXT)
|
|
RETURNS JSONB AS $$
|
|
DECLARE
|
|
v_result JSONB;
|
|
BEGIN
|
|
SELECT jsonb_build_object(
|
|
'model_name', m.model_name,
|
|
'model_type', m.model_type,
|
|
'version', m.version,
|
|
'training_samples', m.training_samples,
|
|
'validation_recall', m.validation_recall,
|
|
'created_at', m.created_at
|
|
) INTO v_result
|
|
FROM ruvector.collections c
|
|
JOIN ruvector.gnn_models m ON c.id = m.collection_id
|
|
WHERE c.name = p_collection_name AND m.is_active;
|
|
|
|
RETURN COALESCE(v_result, '{}'::jsonb);
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
```
|
|
|
|
---
|
|
|
|
## 11. Utility Functions
|
|
|
|
```sql
|
|
-- Extension version
|
|
CREATE FUNCTION ruvector_version() RETURNS TEXT
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE PARALLEL SAFE;
|
|
|
|
-- SIMD capability info
|
|
CREATE FUNCTION ruvector_simd_info() RETURNS TEXT
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE PARALLEL SAFE;
|
|
|
|
-- Memory statistics
|
|
CREATE FUNCTION ruvector_memory_stats() RETURNS JSONB
|
|
AS 'MODULE_PATHNAME' LANGUAGE C;
|
|
|
|
-- Vector utilities
|
|
CREATE FUNCTION vector_dims(vector) RETURNS INTEGER
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_norm(vector) RETURNS REAL
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
CREATE FUNCTION vector_normalize(vector) RETURNS vector
|
|
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
|
|
|
-- Aggregates
|
|
CREATE AGGREGATE avg(vector) (
|
|
SFUNC = vector_accum,
|
|
STYPE = internal,
|
|
FINALFUNC = vector_avg_final,
|
|
COMBINEFUNC = vector_combine,
|
|
PARALLEL = SAFE
|
|
);
|
|
|
|
CREATE AGGREGATE sum(vector) (
|
|
SFUNC = vector_add,
|
|
STYPE = vector,
|
|
COMBINEFUNC = vector_add,
|
|
PARALLEL = SAFE
|
|
);
|
|
```
|
|
|
|
---
|
|
|
|
## 12. Schema Initialization
|
|
|
|
```sql
|
|
-- Create extension schema
|
|
CREATE SCHEMA IF NOT EXISTS ruvector;
|
|
|
|
-- Set search path
|
|
ALTER EXTENSION ruvector SET SCHEMA ruvector;
|
|
|
|
-- Grant usage
|
|
GRANT USAGE ON SCHEMA ruvector TO PUBLIC;
|
|
GRANT SELECT ON ALL TABLES IN SCHEMA ruvector TO PUBLIC;
|
|
|
|
-- Create roles
|
|
DO $$
|
|
BEGIN
|
|
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ruvector_admin') THEN
|
|
CREATE ROLE ruvector_admin;
|
|
END IF;
|
|
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ruvector_operator') THEN
|
|
CREATE ROLE ruvector_operator;
|
|
END IF;
|
|
IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ruvector_user') THEN
|
|
CREATE ROLE ruvector_user;
|
|
END IF;
|
|
END
|
|
$$;
|
|
|
|
-- Grant permissions
|
|
GRANT ALL ON SCHEMA ruvector TO ruvector_admin;
|
|
GRANT ALL ON ALL TABLES IN SCHEMA ruvector TO ruvector_admin;
|
|
GRANT ALL ON ALL SEQUENCES IN SCHEMA ruvector TO ruvector_admin;
|
|
|
|
GRANT SELECT, INSERT, UPDATE, DELETE ON ruvector.collections TO ruvector_operator;
|
|
GRANT SELECT, INSERT, UPDATE, DELETE ON ruvector.index_stats TO ruvector_operator;
|
|
GRANT SELECT, INSERT, UPDATE, DELETE ON ruvector.tier_policies TO ruvector_operator;
|
|
GRANT SELECT ON ALL TABLES IN SCHEMA ruvector TO ruvector_user;
|
|
```
|
|
|
|
---
|
|
|
|
## Testing Requirements
|
|
|
|
### Type Tests
|
|
- Vector input/output roundtrip
|
|
- Dimension validation
|
|
- Operator correctness
|
|
|
|
### Index Tests
|
|
- HNSW build and search
|
|
- IVFFlat build and search
|
|
- Recall at various ef_search
|
|
|
|
### Function Tests
|
|
- All functions callable
|
|
- Error handling
|
|
- Permission checks
|
|
|
|
---
|
|
|
|
## Dependencies
|
|
|
|
- PostgreSQL 14+ (for operator class features)
|
|
- pgrx 0.11+ (Rust bindings)
|
|
- serde (JSON serialization)
|
|
- ed25519-dalek (signature verification)
|