git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
43 KiB
43 KiB
RuVector Postgres v2 - Phase 3: Graph Engine & Cypher
Overview
Phase 3 adds property graph capabilities with Cypher query support, enabling users to model relationships between vectors and perform graph traversals alongside vector similarity search.
Objectives
Primary Goals
- Property graph storage in PostgreSQL
- Cypher query execution via ruvector_cypher()
- Relational bridge views for SQL-graph mixing
- Vector-enriched graph queries
Success Criteria
- Full Cypher read query support
- Graph nodes can reference vectors
- SQL joins with graph data
- < 10ms overhead for simple traversals
Graph-SQL Join Keys and Identity System
Minimum Viable Bridge
When exposing graph nodes and edges as the ruvector.nodes and ruvector.edges views, users need clear join keys to mix Cypher output with their relational tables.
+------------------------------------------------------------------+
| GRAPH-SQL IDENTITY SYSTEM |
+------------------------------------------------------------------+
DESIGN PRINCIPLES:
• "SQL first" - users join Cypher output to their tables easily
• Stable identifiers that survive graph mutations
• No need to learn a new identity system
IDENTITY MAPPING:
Graph Node ID (BIGINT):
• Stable, auto-generated within RuVector
• Stored in catalog table: ruvector.node_catalog
• Maps to user-provided external_id
External ID (TEXT):
• User-provided identifier (e.g., "user_123", "doc_abc")
• Unique within node_type
• Used for joins with user tables
Vector Reference (TID or FK):
• Optional link to user table with vector column
• Enables vector operations on graph nodes
+------------------------------------------------------------------+
Node Catalog Table
-- Identity catalog: one row per graph node, mapping the database-generated
-- node_id to the caller-supplied external_id.
CREATE TABLE ruvector.node_catalog (
    -- Surrogate key generated by the database
    node_id         BIGSERIAL PRIMARY KEY,

    -- Owning collection (graph)
    collection_id   INTEGER NOT NULL REFERENCES ruvector.collections(id),

    -- Identifier chosen by the user; this is the join key towards user tables
    external_id     TEXT NOT NULL,

    -- Label of the node, e.g. 'User', 'Document', 'Product'
    node_type       TEXT NOT NULL,

    -- Optional pointer to the user row that carries the vector
    source_table    TEXT,   -- e.g., 'public.documents'
    source_pk       TEXT,   -- e.g., 'id'
    source_pk_value TEXT,   -- e.g., '12345'

    -- Row creation time
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- An external_id is unique per node type within a collection
    CONSTRAINT uq_node_external UNIQUE (collection_id, node_type, external_id)
);

-- Lookup by external_id when the node type is not known
CREATE INDEX idx_node_catalog_external
    ON ruvector.node_catalog (collection_id, external_id);

-- Reverse lookup: from a user-table row to its graph node
CREATE INDEX idx_node_catalog_source
    ON ruvector.node_catalog (source_table, source_pk_value);
Edge Catalog Table
-- Edge catalog: one row per directed relationship between two catalog nodes.
CREATE TABLE ruvector.edge_catalog (
    -- Surrogate key generated by the database
    edge_id        BIGSERIAL PRIMARY KEY,

    -- Owning collection (graph)
    collection_id  INTEGER NOT NULL REFERENCES ruvector.collections(id),

    -- Endpoints; both must exist in node_catalog
    source_node_id BIGINT NOT NULL REFERENCES ruvector.node_catalog(node_id),
    target_node_id BIGINT NOT NULL REFERENCES ruvector.node_catalog(node_id),

    -- Relationship label, e.g. 'FOLLOWS', 'PURCHASED', 'SIMILAR_TO'
    edge_type      TEXT NOT NULL,

    -- Weight consumed by weighted graph algorithms (defaults to 1.0)
    weight         REAL DEFAULT 1.0,

    -- Free-form user properties (defaults to an empty object)
    properties     JSONB DEFAULT '{}'::jsonb,

    -- Row creation time
    created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Outgoing-edge lookup
CREATE INDEX idx_edge_source ON ruvector.edge_catalog (source_node_id);

-- Incoming-edge lookup
CREATE INDEX idx_edge_target ON ruvector.edge_catalog (target_node_id);

-- Edges of one type within a collection
CREATE INDEX idx_edge_type ON ruvector.edge_catalog (collection_id, edge_type);
Relational Views for SQL Access
-- SQL-facing node view: catalog identity plus collection name and properties.
-- Nodes without a row in ruvector._node_properties are still returned, with
-- a NULL properties column (LEFT JOIN).
-- NOTE(review): the storage schema later in this document also creates a
-- table called ruvector.nodes; a view and a table cannot share one name in
-- the same schema — confirm which object owns the name.
CREATE VIEW ruvector.nodes AS
SELECT
    nc.node_id,
    nc.external_id,
    nc.node_type AS label,
    c.name       AS collection,
    nc.source_table,
    nc.source_pk_value,
    n.properties,
    nc.created_at
FROM ruvector.node_catalog AS nc
INNER JOIN ruvector.collections AS c
    ON nc.collection_id = c.id
LEFT JOIN ruvector._node_properties AS n
    ON nc.node_id = n.node_id;
-- SQL-facing edge view: every edge with both endpoints resolved to their
-- external_id and label, so callers can join without touching node_id.
-- NOTE(review): the storage schema later in this document also creates a
-- table called ruvector.edges — confirm which object owns the name.
CREATE VIEW ruvector.edges AS
SELECT
    ec.edge_id,
    ec.edge_type    AS type,
    src.external_id AS source_external_id,
    src.node_type   AS source_label,
    tgt.external_id AS target_external_id,
    tgt.node_type   AS target_label,
    ec.weight,
    ec.properties,
    c.name          AS collection
FROM ruvector.edge_catalog AS ec
INNER JOIN ruvector.node_catalog AS src
    ON ec.source_node_id = src.node_id
INNER JOIN ruvector.node_catalog AS tgt
    ON ec.target_node_id = tgt.node_id
INNER JOIN ruvector.collections AS c
    ON ec.collection_id = c.id;
Joining Cypher Results to SQL Tables
-- Example: join Cypher results with a user table.
--
-- This assumes the RETURNS SETOF JSONB form of ruvector_cypher(): each result
-- row is one JSONB object keyed by the RETURN aliases. The alias therefore has
-- to be extracted from the object; a bare "SELECT *" exposes no friend_id
-- column to join on.
WITH graph_results AS (
    SELECT r.result ->> 'friend_id' AS friend_id
    FROM ruvector_cypher(
        'social',
        'MATCH (u:User)-[:FOLLOWS*1..3]->(friend:User)
         WHERE u.external_id = $user_id
         RETURN friend.external_id AS friend_id
         LIMIT 100',
        jsonb_build_object('user_id', 'user_123')
    ) AS r(result)
)
SELECT
    u.id,
    u.name,
    u.email,
    -- <-> is a distance: smaller means closer, so ascending order is "most similar first"
    u.profile_vector <-> q.query_vector AS similarity
FROM graph_results AS gr
INNER JOIN users AS u
    ON u.user_id = gr.friend_id  -- Join on external_id
CROSS JOIN (SELECT '[1,2,3,...]'::vector AS query_vector) AS q  -- placeholder query vector
ORDER BY similarity
LIMIT 10;
Cypher Return Format
-- Tabular return shape for ruvector_cypher: node, edge and path columns in one
-- wide row type; the inline comments say which columns apply to which kind of
-- RETURN item.
-- NOTE(review): the "SQL Functions" deliverable declares ruvector_cypher with
-- the same (TEXT, TEXT, JSONB) arguments but RETURNS SETOF JSONB. PostgreSQL
-- identifies functions by name and argument types only, so both definitions
-- cannot exist together — confirm which one is intended.
CREATE FUNCTION ruvector_cypher(
    p_collection TEXT,
    p_query      TEXT,
    p_params     JSONB DEFAULT '{}'::jsonb
)
RETURNS TABLE (
    -- Node columns (when RETURN includes nodes)
    node_id     BIGINT,
    external_id TEXT,
    label       TEXT,
    properties  JSONB,

    -- Edge columns (when RETURN includes relationships)
    edge_id     BIGINT,
    edge_type   TEXT,
    source_id   BIGINT,
    target_id   BIGINT,
    weight      REAL,

    -- Path columns (when RETURN includes paths)
    path_length INTEGER,
    path_nodes  BIGINT[],
    path_edges  BIGINT[]
)
LANGUAGE C
AS 'MODULE_PATHNAME', 'ruvector_cypher';
Join Acceleration
-- Materialized copy of the node identity columns, used to accelerate joins
-- between Cypher output and relational tables.
CREATE MATERIALIZED VIEW ruvector.node_external_ids AS
SELECT
    node_id,
    external_id,
    node_type,
    collection_id
FROM ruvector.node_catalog;

-- node_catalog only guarantees uniqueness per
-- (collection_id, node_type, external_id), so two nodes of different types may
-- share an external_id. The unique index must therefore include node_type;
-- without it the index build or a later refresh fails with a duplicate-key
-- error. (collection_id, external_id) stays the leading prefix so lookups by
-- external_id still use this index.
-- REFRESH ... CONCURRENTLY below requires this unique index to exist.
CREATE UNIQUE INDEX idx_node_ext_lookup
    ON ruvector.node_external_ids (collection_id, external_id, node_type);

-- Refresh periodically; CONCURRENTLY keeps the view readable during refresh
CREATE FUNCTION ruvector.refresh_join_cache()
RETURNS void AS $$
    REFRESH MATERIALIZED VIEW CONCURRENTLY ruvector.node_external_ids;
$$ LANGUAGE SQL;
Architecture
Graph Stack
+------------------------------------------------------------------+
| User Queries |
| SQL: SELECT * FROM items ORDER BY embedding <-> $q |
| Cypher: MATCH (a)-[:LIKES]->(b) WHERE a.vector <=> $q < 0.5 |
+------------------------------------------------------------------+
| |
v v
+---------------------------+ +---------------------------+
| SQL Query Path | | Cypher Query Path |
| | | |
| Parse -> Plan -> Execute | | Parse -> Plan -> Execute |
| | | |
| Uses: PostgreSQL Executor | | Uses: RuVector Cypher |
+---------------------------+ +---------------------------+
| |
+-------------+-------------+
|
v
+------------------------------------------------------------------+
| Graph Storage Layer |
| - ruvector.nodes (PostgreSQL table) |
| - ruvector.edges (PostgreSQL table) |
| - ruvector.hyperedges (PostgreSQL table) |
| - Property indexes (GIN on JSONB) |
+------------------------------------------------------------------+
|
v
+------------------------------------------------------------------+
| Vector Integration |
| - node.vector_ref -> user table (TID) |
| - Cypher can use vector operators |
| - GNN training from graph structure |
+------------------------------------------------------------------+
Data Model
+------------------+
| Graph |
| (id, name, ...) |
+--------+---------+
|
| 1:N
v
+------------------+ +------+------+ +------------------+
| Node | | Edge | | Hyperedge |
| - id | | - id | | - id |
| - external_id | | - source_id | | - node_ids[] |
| - node_type | | - target_id | | - weights[] |
| - properties | | - edge_type | | - properties |
| - vector_ref | | - weight | +------------------+
+------------------+ | - properties|
| +-------------+
|
| References
v
+------------------+
| User Table |
| - id |
| - embedding | <-- vector column
| - metadata |
+------------------+
Deliverables
1. Graph Storage Schema
-- Graph registry: one row per named graph, with element counters and
-- a free-form configuration object.
CREATE TABLE ruvector.graphs (
    id              SERIAL PRIMARY KEY,
    name            TEXT NOT NULL UNIQUE,                 -- graphs are addressed by name
    description     TEXT,
    node_count      BIGINT NOT NULL DEFAULT 0,
    edge_count      BIGINT NOT NULL DEFAULT 0,
    hyperedge_count BIGINT NOT NULL DEFAULT 0,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    config          JSONB NOT NULL DEFAULT '{}'::jsonb
);
-- Node storage. Rows are removed automatically when their graph is deleted.
-- NOTE(review): vector_ref is a TID, i.e. a physical row location that changes
-- when the referenced row is updated or its table is rewritten — confirm how
-- the reference is kept current.
CREATE TABLE ruvector.nodes (
    id            BIGSERIAL PRIMARY KEY,
    graph_id      INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
    external_id   TEXT,                                   -- optional user-provided identifier
    node_type     TEXT NOT NULL DEFAULT 'default',        -- node label
    vector_ref    TID,                                    -- optional location of the row holding the vector
    collection_id INTEGER REFERENCES ruvector.collections(id),
    properties    JSONB NOT NULL DEFAULT '{}'::jsonb,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (graph_id, external_id)
);

-- Label scans within one graph
CREATE INDEX idx_nodes_graph_type ON ruvector.nodes (graph_id, node_type);

-- Containment / key lookups on node properties
CREATE INDEX idx_nodes_properties ON ruvector.nodes USING gin (properties);

-- Partial index: only nodes that actually reference a vector
CREATE INDEX idx_nodes_vector_ref ON ruvector.nodes (collection_id, vector_ref)
    WHERE vector_ref IS NOT NULL;
-- Edge storage. Edges disappear with their graph or with either endpoint
-- (ON DELETE CASCADE on all three references).
CREATE TABLE ruvector.edges (
    id         BIGSERIAL PRIMARY KEY,
    graph_id   INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
    source_id  BIGINT NOT NULL REFERENCES ruvector.nodes(id) ON DELETE CASCADE,
    target_id  BIGINT NOT NULL REFERENCES ruvector.nodes(id) ON DELETE CASCADE,
    edge_type  TEXT NOT NULL DEFAULT 'default',
    weight     REAL NOT NULL DEFAULT 1.0,
    properties JSONB NOT NULL DEFAULT '{}'::jsonb,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Self-loops are rejected
    CHECK (source_id <> target_id)
);

-- Outgoing edges of a node within a graph
CREATE INDEX idx_edges_source ON ruvector.edges (graph_id, source_id);

-- Incoming edges of a node within a graph
CREATE INDEX idx_edges_target ON ruvector.edges (graph_id, target_id);

-- Edges of one type within a graph
CREATE INDEX idx_edges_type ON ruvector.edges (graph_id, edge_type);

-- Containment / key lookups on edge properties
CREATE INDEX idx_edges_properties ON ruvector.edges USING gin (properties);
-- Hyperedges (connect multiple nodes). Rows are removed with their graph.
CREATE TABLE ruvector.hyperedges (
    id             BIGSERIAL PRIMARY KEY,
    graph_id       INTEGER NOT NULL REFERENCES ruvector.graphs(id) ON DELETE CASCADE,
    hyperedge_type TEXT NOT NULL DEFAULT 'default',
    node_ids       BIGINT[] NOT NULL,                     -- member nodes
    weights        REAL[],                                -- optional weights
    properties     JSONB NOT NULL DEFAULT '{}'::jsonb,
    created_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- A hyperedge needs at least two members. array_length() returns NULL for
    -- an empty array and a CHECK constraint accepts NULL, so the bare
    -- comparison would let '{}' through; COALESCE turns that case into 0.
    CHECK (COALESCE(array_length(node_ids, 1), 0) >= 2)
);

-- All hyperedges of a graph
CREATE INDEX idx_hyperedges_graph ON ruvector.hyperedges (graph_id);

-- "Which hyperedges contain node X" via array containment
CREATE INDEX idx_hyperedges_nodes ON ruvector.hyperedges USING gin (node_ids);

-- Hyperedges of one type within a graph
CREATE INDEX idx_hyperedges_type ON ruvector.hyperedges (graph_id, hyperedge_type);
2. Cypher Parser
// src/graph/cypher/parser.rs
use nom::{
branch::alt,
bytes::complete::{tag, tag_no_case, take_while1},
character::complete::{alphanumeric1, char, multispace0, multispace1},
combinator::{map, opt, recognize},
multi::{many0, separated_list0, separated_list1},
sequence::{delimited, pair, preceded, separated_pair, tuple},
IResult,
};
/// Cypher AST node types: the kinds of top-level statement.
///
/// NOTE(review): `CreateClause`, `DeleteClause` and `SetClause` are not defined
/// in the visible part of this file, and `parse_cypher` returns a
/// `CypherQuery` directly rather than this enum — confirm where these
/// variants are produced.
#[derive(Debug, Clone)]
pub enum CypherStatement {
Match(MatchClause),
Return(ReturnClause),
Create(CreateClause),
Delete(DeleteClause),
Set(SetClause),
Query(CypherQuery),
}
/// A parsed read query. Every clause is optional at the AST level;
/// `cypher_query` fills each field with `opt(...)`.
#[derive(Debug, Clone)]
pub struct CypherQuery {
/// `[OPTIONAL] MATCH ...` clause, if present.
pub match_clause: Option<MatchClause>,
/// `WHERE ...` predicate, if present.
pub where_clause: Option<WhereClause>,
/// `RETURN ...` clause; the executor rejects queries without one.
pub return_clause: Option<ReturnClause>,
/// `ORDER BY ...` clause, if present.
pub order_by: Option<OrderByClause>,
/// Maximum number of rows to return.
pub limit: Option<usize>,
/// Number of leading rows to drop.
pub skip: Option<usize>,
}
/// A `MATCH` clause.
#[derive(Debug, Clone)]
pub struct MatchClause {
/// Comma-separated patterns, in source order (at least one).
pub patterns: Vec<Pattern>,
/// True when the clause was written as `OPTIONAL MATCH`.
pub optional: bool,
}
/// One pattern inside a `MATCH` clause.
#[derive(Debug, Clone)]
pub enum Pattern {
/// A single node pattern.
Node(NodePattern),
/// A node followed by zero or more relationship/node hops.
Path(PathPattern),
}
/// A node pattern, written `(variable:Label {props})`; every part is optional.
#[derive(Debug, Clone)]
pub struct NodePattern {
/// Name the matched node is bound to, if any.
pub variable: Option<String>,
/// Labels, each introduced by `:` in the source.
pub labels: Vec<String>,
/// Inline property map, if any.
pub properties: Option<MapLiteral>,
}
/// A path pattern. As built by `path_pattern`, `elements` starts with a node
/// and then alternates relationship, node, relationship, node, ...
#[derive(Debug, Clone)]
pub struct PathPattern {
pub elements: Vec<PathElement>,
}
/// One element of a `PathPattern`: either a node or the relationship
/// connecting two neighbouring nodes.
#[derive(Debug, Clone)]
pub enum PathElement {
Node(NodePattern),
Relationship(RelationshipPattern),
}
/// A relationship pattern such as `-[r:TYPE {props}]->`.
#[derive(Debug, Clone)]
pub struct RelationshipPattern {
/// Name the matched relationship is bound to, if any.
pub variable: Option<String>,
/// Relationship types, each introduced by `:` in the source.
pub types: Vec<String>,
/// Arrow direction derived from the presence of `<` and `>`.
pub direction: Direction,
/// Inline property map, if any.
pub properties: Option<MapLiteral>,
/// Variable-length specification, if any.
pub length: Option<RelationshipLength>,
}
/// Direction of a relationship pattern, as decided in `relationship_pattern`
/// from whether a leading `<` and/or a trailing `>` was present.
#[derive(Debug, Clone, Copy)]
pub enum Direction {
Left, // <-[..]-   (only `<` present)
Right, // -[..]->  (only `>` present)
Both, // <-[..]->  (both `<` and `>` present)
Undirected, // -[..]-  (neither present)
}
/// Hop-count bounds of a variable-length relationship.
/// NOTE(review): presumably parsed from `*min..max` by `relationship_length`,
/// which is not shown here — confirm how a missing bound is interpreted.
#[derive(Debug, Clone)]
pub struct RelationshipLength {
/// Lower bound, if given.
pub min: Option<usize>,
/// Upper bound, if given.
pub max: Option<usize>,
}
/// A `WHERE` clause: the predicate expression following the keyword.
#[derive(Debug, Clone)]
pub struct WhereClause {
pub expression: Expression,
}
/// Expression tree node.
///
/// Binary and unary operations are boxed so the enum can be recursive.
/// NOTE(review): `UnaryOp` is not defined in the visible part of this file,
/// and the parser functions shown here never build `UnaryOp`, `List` or `Map`.
#[derive(Debug, Clone)]
pub enum Expression {
Literal(Literal),
Property(PropertyAccess),
/// A query parameter, carried as a `String`.
Parameter(String),
FunctionCall(FunctionCall),
BinaryOp(Box<BinaryOp>),
UnaryOp(Box<UnaryOp>),
List(Vec<Expression>),
Map(MapLiteral),
}
/// A binary operation: `left op right`.
#[derive(Debug, Clone)]
pub struct BinaryOp {
pub op: BinaryOperator,
pub left: Expression,
pub right: Expression,
}
/// Binary operators known to the AST.
///
/// The parser functions shown in this file only produce the comparison
/// operators (`Eq`..`Ge`), `And`, `Or`, `VectorDistance` and
/// `VectorSimilarity`; the remaining variants are not produced by any parser
/// visible here.
#[derive(Debug, Clone, Copy)]
pub enum BinaryOperator {
Eq, Ne, Lt, Gt, Le, Ge,
And, Or, Xor,
Add, Sub, Mul, Div, Mod,
Contains, StartsWith, EndsWith,
In,
VectorDistance, // Custom: <-> for vectors
VectorSimilarity, // Custom: <=> for vectors
}
/// A reference to a variable and a chain of property names.
/// NOTE(review): presumably `variable.prop1.prop2`; the `property_access`
/// parser is not shown — confirm whether `properties` may be empty.
#[derive(Debug, Clone)]
pub struct PropertyAccess {
pub variable: String,
pub properties: Vec<String>,
}
/// A literal value appearing in a query.
#[derive(Debug, Clone)]
pub enum Literal {
Null,
Bool(bool),
/// Integer literal, stored as 64-bit signed.
Int(i64),
/// Floating-point literal, stored as 64-bit.
Float(f64),
String(String),
}
/// A map literal, kept as an ordered list of `(key, value expression)` pairs.
#[derive(Debug, Clone)]
pub struct MapLiteral {
pub entries: Vec<(String, Expression)>,
}
/// A function invocation inside an expression.
#[derive(Debug, Clone)]
pub struct FunctionCall {
/// Function name as written in the query.
pub name: String,
/// Argument expressions, in source order.
pub args: Vec<Expression>,
/// NOTE(review): presumably set for `f(DISTINCT ...)`; the parser is not shown.
pub distinct: bool,
}
/// A `RETURN` clause.
#[derive(Debug, Clone)]
pub struct ReturnClause {
/// Comma-separated return items, in source order (at least one).
pub items: Vec<ReturnItem>,
/// True when written as `RETURN DISTINCT ...`.
pub distinct: bool,
}
/// One item of a `RETURN` clause.
#[derive(Debug, Clone)]
pub struct ReturnItem {
/// Expression to evaluate for each result row.
pub expression: Expression,
/// Name given with `AS`; when present the executor uses it as the output key.
pub alias: Option<String>,
}
/// An `ORDER BY` clause: the list of sort keys.
#[derive(Debug, Clone)]
pub struct OrderByClause {
pub items: Vec<OrderByItem>,
}
/// One sort key of an `ORDER BY` clause.
#[derive(Debug, Clone)]
pub struct OrderByItem {
/// Expression to sort by.
pub expression: Expression,
/// True for ascending order, false for descending.
pub ascending: bool,
}
/// Parse a complete Cypher query.
///
/// Returns the parsed `CypherQuery`, or `ParseError::SyntaxError` when the
/// grammar rejects the text or when any non-whitespace input is left over
/// after the last recognised clause.
///
/// The left-over check matters because every clause in `cypher_query` is
/// optional: without it, text the grammar does not understand (a misspelled
/// keyword, clauses in an unexpected order) is silently dropped and the query
/// runs with different semantics than written, e.g. without its LIMIT.
pub fn parse_cypher(input: &str) -> Result<CypherQuery, ParseError> {
    let (rest, query) = cypher_query(input)
        .map_err(|e| ParseError::SyntaxError(format!("{:?}", e)))?;

    let rest = rest.trim_start();
    if !rest.is_empty() {
        return Err(ParseError::SyntaxError(format!(
            "unexpected input after end of query: {:?}",
            rest
        )));
    }

    Ok(query)
}
/// Parse the clauses of a read query in order:
/// `MATCH`, `WHERE`, `RETURN`, `ORDER BY`, then `SKIP`/`LIMIT`.
///
/// Every clause is optional and clauses may be separated by any amount of
/// whitespace. Cypher writes pagination as `SKIP n LIMIT m`; the historical
/// `LIMIT m SKIP n` order accepted by this parser is still supported, so SKIP
/// is tried both before and after LIMIT (the second attempt only when the
/// first one found nothing).
fn cypher_query(input: &str) -> IResult<&str, CypherQuery> {
    let (input, _) = multispace0(input)?;
    let (input, match_clause) = opt(match_clause)(input)?;
    let (input, _) = multispace0(input)?;
    let (input, where_clause) = opt(where_clause)(input)?;
    let (input, _) = multispace0(input)?;
    let (input, return_clause) = opt(return_clause)(input)?;
    let (input, _) = multispace0(input)?;
    let (input, order_by) = opt(order_by_clause)(input)?;
    let (input, _) = multispace0(input)?;

    // Standard order: SKIP before LIMIT.
    let (input, skip_before) = opt(skip_clause)(input)?;
    let (input, _) = multispace0(input)?;
    let (input, limit) = opt(limit_clause)(input)?;
    let (input, _) = multispace0(input)?;

    // Backward compatibility: SKIP after LIMIT.
    let (input, skip_after) = if skip_before.is_none() {
        opt(skip_clause)(input)?
    } else {
        (input, None)
    };
    let skip = skip_before.or(skip_after);

    Ok((input, CypherQuery {
        match_clause,
        where_clause,
        return_clause,
        order_by,
        limit,
        skip,
    }))
}
fn match_clause(input: &str) -> IResult<&str, MatchClause> {
let (input, optional) = opt(preceded(
tuple((tag_no_case("OPTIONAL"), multispace1)),
tag_no_case("")
))(input)?;
let (input, _) = tag_no_case("MATCH")(input)?;
let (input, _) = multispace1(input)?;
let (input, patterns) = separated_list1(
tuple((multispace0, char(','), multispace0)),
pattern
)(input)?;
Ok((input, MatchClause {
patterns,
optional: optional.is_some(),
}))
}
/// Parse one pattern, trying a path first and a bare node second.
///
/// Because `path_pattern` accepts a node followed by zero hops, it succeeds
/// whenever `node_pattern` would, so a lone node is reported as a one-element
/// `Pattern::Path`.
fn pattern(input: &str) -> IResult<&str, Pattern> {
    let as_path = map(path_pattern, Pattern::Path);
    let as_node = map(node_pattern, Pattern::Node);
    alt((as_path, as_node))(input)
}
fn node_pattern(input: &str) -> IResult<&str, NodePattern> {
let (input, _) = char('(')(input)?;
let (input, _) = multispace0(input)?;
let (input, variable) = opt(identifier)(input)?;
let (input, labels) = many0(preceded(char(':'), identifier))(input)?;
let (input, _) = multispace0(input)?;
let (input, properties) = opt(map_literal)(input)?;
let (input, _) = multispace0(input)?;
let (input, _) = char(')')(input)?;
Ok((input, NodePattern {
variable,
labels,
properties,
}))
}
/// Parse a path: one node followed by any number of `relationship node` hops.
///
/// The resulting element list starts with the first node and then alternates
/// relationship, node for every hop.
fn path_pattern(input: &str) -> IResult<&str, PathPattern> {
    let (rest, head) = node_pattern(input)?;
    let (rest, hops) = many0(pair(relationship_pattern, node_pattern))(rest)?;

    let elements = std::iter::once(PathElement::Node(head))
        .chain(hops.into_iter().flat_map(|(rel, node)| {
            [PathElement::Relationship(rel), PathElement::Node(node)]
        }))
        .collect();

    Ok((rest, PathPattern { elements }))
}
fn relationship_pattern(input: &str) -> IResult<&str, RelationshipPattern> {
// Handle different arrow types: -[r:TYPE]-> <-[r:TYPE]- -[r:TYPE]-
let (input, left_arrow) = opt(char('<'))(input)?;
let (input, _) = char('-')(input)?;
let (input, details) = opt(delimited(
char('['),
relationship_details,
char(']')
))(input)?;
let (input, _) = char('-')(input)?;
let (input, right_arrow) = opt(char('>'))(input)?;
let direction = match (left_arrow.is_some(), right_arrow.is_some()) {
(true, false) => Direction::Left,
(false, true) => Direction::Right,
(true, true) => Direction::Both,
(false, false) => Direction::Undirected,
};
let (variable, types, properties, length) = details.unwrap_or((None, vec![], None, None));
Ok((input, RelationshipPattern {
variable,
types,
direction,
properties,
length,
}))
}
fn relationship_details(
input: &str
) -> IResult<&str, (Option<String>, Vec<String>, Option<MapLiteral>, Option<RelationshipLength>)> {
let (input, _) = multispace0(input)?;
let (input, variable) = opt(identifier)(input)?;
let (input, types) = many0(preceded(char(':'), identifier))(input)?;
let (input, length) = opt(relationship_length)(input)?;
let (input, _) = multispace0(input)?;
let (input, properties) = opt(map_literal)(input)?;
let (input, _) = multispace0(input)?;
Ok((input, (variable, types, properties, length)))
}
fn where_clause(input: &str) -> IResult<&str, WhereClause> {
let (input, _) = tag_no_case("WHERE")(input)?;
let (input, _) = multispace1(input)?;
let (input, expression) = expression(input)?;
Ok((input, WhereClause { expression }))
}
fn return_clause(input: &str) -> IResult<&str, ReturnClause> {
let (input, _) = tag_no_case("RETURN")(input)?;
let (input, _) = multispace1(input)?;
let (input, distinct) = opt(preceded(
tag_no_case("DISTINCT"),
multispace1
))(input)?;
let (input, items) = separated_list1(
tuple((multispace0, char(','), multispace0)),
return_item
)(input)?;
Ok((input, ReturnClause {
items,
distinct: distinct.is_some(),
}))
}
fn return_item(input: &str) -> IResult<&str, ReturnItem> {
let (input, expression) = expression(input)?;
let (input, alias) = opt(preceded(
tuple((multispace1, tag_no_case("AS"), multispace1)),
identifier
))(input)?;
Ok((input, ReturnItem { expression, alias }))
}
/// Parse an identifier: one or more ASCII alphanumerics or underscores.
///
/// A leading digit is accepted, and keywords are not excluded.
fn identifier(input: &str) -> IResult<&str, String> {
    // One fragment: a run of alphanumerics, or a single underscore.
    fn fragment(i: &str) -> IResult<&str, &str> {
        alt((alphanumeric1, tag("_")))(i)
    }

    let (rest, matched) = recognize(pair(fragment, many0(fragment)))(input)?;
    Ok((rest, matched.to_string()))
}
/// Entry point of the expression grammar; delegates to `or_expression`,
/// the lowest-precedence level (OR, then AND, then comparison, then primary).
fn expression(input: &str) -> IResult<&str, Expression> {
// Simplified expression parser
or_expression(input)
}
fn or_expression(input: &str) -> IResult<&str, Expression> {
let (input, left) = and_expression(input)?;
let (input, rest) = many0(preceded(
tuple((multispace0, tag_no_case("OR"), multispace0)),
and_expression
))(input)?;
let result = rest.into_iter().fold(left, |acc, right| {
Expression::BinaryOp(Box::new(BinaryOp {
op: BinaryOperator::Or,
left: acc,
right,
}))
});
Ok((input, result))
}
fn and_expression(input: &str) -> IResult<&str, Expression> {
let (input, left) = comparison_expression(input)?;
let (input, rest) = many0(preceded(
tuple((multispace0, tag_no_case("AND"), multispace0)),
comparison_expression
))(input)?;
let result = rest.into_iter().fold(left, |acc, right| {
Expression::BinaryOp(Box::new(BinaryOp {
op: BinaryOperator::And,
left: acc,
right,
}))
});
Ok((input, result))
}
/// Parse `primary [op primary]`, where `op` is a comparison or vector
/// operator. Without an operator the primary expression is returned as is.
///
/// Whitespace before the operator is consumed only as part of a successful
/// `op primary` match. When no operator follows, the input is left untouched
/// right after the primary expression; callers such as `return_item` rely on
/// that whitespace still being there to recognise ` AS alias`.
fn comparison_expression(input: &str) -> IResult<&str, Expression> {
    let (input, left) = primary_expression(input)?;

    // `opt` restores the input (including leading whitespace) on failure.
    let (input, op_right) = opt(pair(
        preceded(multispace0, comparison_operator),
        preceded(multispace0, primary_expression)
    ))(input)?;

    let expr = match op_right {
        Some((op, right)) => Expression::BinaryOp(Box::new(BinaryOp {
            op,
            left,
            right,
        })),
        None => left,
    };

    Ok((input, expr))
}
/// Parse a comparison or vector operator into its `BinaryOperator`.
///
/// `alt` takes the first alternative that matches, so the order is
/// significant: the multi-character operators starting with `<` or `>`
/// (`<->`, `<=>`, `<=`, `>=`, `<>`) must come before the single-character
/// `<` and `>`. Both `<>` and `!=` map to `Ne`.
fn comparison_operator(input: &str) -> IResult<&str, BinaryOperator> {
alt((
map(tag("<->"), |_| BinaryOperator::VectorDistance),
map(tag("<=>"), |_| BinaryOperator::VectorSimilarity),
map(tag("<="), |_| BinaryOperator::Le),
map(tag(">="), |_| BinaryOperator::Ge),
map(tag("<>"), |_| BinaryOperator::Ne),
map(tag("!="), |_| BinaryOperator::Ne),
map(char('<'), |_| BinaryOperator::Lt),
map(char('>'), |_| BinaryOperator::Gt),
map(char('='), |_| BinaryOperator::Eq),
))(input)
}
/// Parse an operand: a literal, a parameter, a property access or a function
/// call, tried in that order (first match wins).
///
/// The four sub-parsers are defined elsewhere in the file.
/// NOTE(review): there is no alternative for parenthesised sub-expressions or
/// unary operators, and `property_access` is tried before `function_call` —
/// if `property_access` accepts a bare identifier it would shadow function
/// names; confirm against its definition.
fn primary_expression(input: &str) -> IResult<&str, Expression> {
alt((
map(literal, Expression::Literal),
map(parameter, Expression::Parameter),
map(property_access, Expression::Property),
map(function_call, Expression::FunctionCall),
))(input)
}
// ... Additional parser functions ...
3. Cypher Executor
// src/graph/cypher/executor.rs
use super::parser::*;
use std::collections::HashMap;
/// Executes parsed Cypher queries against one graph.
pub struct CypherExecutor {
/// Id of the graph the queries run against.
graph_id: i32,
/// Query parameters by name; empty until `with_params` is called.
params: HashMap<String, serde_json::Value>,
}
impl CypherExecutor {
pub fn new(graph_id: i32) -> Self {
Self {
graph_id,
params: HashMap::new(),
}
}
pub fn with_params(mut self, params: HashMap<String, serde_json::Value>) -> Self {
self.params = params;
self
}
/// Execute query and return results
pub fn execute(&self, query: &CypherQuery) -> Result<Vec<serde_json::Value>, ExecutionError> {
// Build execution plan
let plan = self.plan(query)?;
// Execute plan
let mut context = ExecutionContext::new(self.graph_id, &self.params);
let results = plan.execute(&mut context)?;
// Format results according to RETURN clause
self.format_results(results, query)
}
fn plan(&self, query: &CypherQuery) -> Result<ExecutionPlan, ExecutionError> {
let mut plan = ExecutionPlan::new();
// Add MATCH operations
if let Some(ref match_clause) = query.match_clause {
for pattern in &match_clause.patterns {
plan.add_operation(self.plan_pattern(pattern)?);
}
}
// Add WHERE filter
if let Some(ref where_clause) = query.where_clause {
plan.add_filter(where_clause.clone());
}
// Add ORDER BY
if let Some(ref order_by) = query.order_by {
plan.add_order_by(order_by.clone());
}
// Add LIMIT/SKIP
if let Some(limit) = query.limit {
plan.set_limit(limit);
}
if let Some(skip) = query.skip {
plan.set_skip(skip);
}
Ok(plan)
}
fn plan_pattern(&self, pattern: &Pattern) -> Result<PatternOperation, ExecutionError> {
match pattern {
Pattern::Node(node) => {
Ok(PatternOperation::ScanNodes {
variable: node.variable.clone(),
labels: node.labels.clone(),
properties: node.properties.clone(),
})
}
Pattern::Path(path) => {
self.plan_path_pattern(path)
}
}
}
fn plan_path_pattern(&self, path: &PathPattern) -> Result<PatternOperation, ExecutionError> {
let mut operations = Vec::new();
for (i, element) in path.elements.iter().enumerate() {
match element {
PathElement::Node(node) if i == 0 => {
operations.push(PatternOperation::ScanNodes {
variable: node.variable.clone(),
labels: node.labels.clone(),
properties: node.properties.clone(),
});
}
PathElement::Relationship(rel) => {
let next_node = match path.elements.get(i + 1) {
Some(PathElement::Node(n)) => n.clone(),
_ => return Err(ExecutionError::InvalidPattern),
};
operations.push(PatternOperation::Traverse {
rel_variable: rel.variable.clone(),
rel_types: rel.types.clone(),
direction: rel.direction,
target_variable: next_node.variable.clone(),
target_labels: next_node.labels.clone(),
});
}
_ => {}
}
}
Ok(PatternOperation::PathMatch { operations })
}
fn format_results(
&self,
results: Vec<ResultRow>,
query: &CypherQuery,
) -> Result<Vec<serde_json::Value>, ExecutionError> {
let return_clause = query.return_clause.as_ref()
.ok_or(ExecutionError::NoReturnClause)?;
results.into_iter()
.map(|row| {
let mut obj = serde_json::Map::new();
for item in &return_clause.items {
let key = item.alias.clone()
.unwrap_or_else(|| format_expression(&item.expression));
let value = evaluate_expression(&item.expression, &row)?;
obj.insert(key, value);
}
Ok(serde_json::Value::Object(obj))
})
.collect()
}
}
/// Execution plan: the pattern operations to run plus the optional
/// post-processing steps (filter, ordering, skip, limit).
struct ExecutionPlan {
/// Pattern operations, in the order they were added.
operations: Vec<PatternOperation>,
/// WHERE predicate applied to the combined rows.
filter: Option<WhereClause>,
/// ORDER BY specification.
order_by: Option<OrderByClause>,
/// Maximum number of rows to keep.
limit: Option<usize>,
/// Number of leading rows to drop.
skip: Option<usize>,
}
impl ExecutionPlan {
    /// Create an empty plan: no operations and no post-processing.
    fn new() -> Self {
        Self {
            operations: Vec::new(),
            filter: None,
            order_by: None,
            limit: None,
            skip: None,
        }
    }

    /// Append a pattern operation; operations run in insertion order.
    fn add_operation(&mut self, op: PatternOperation) {
        self.operations.push(op);
    }

    /// Set the WHERE filter (replaces any previous one).
    fn add_filter(&mut self, filter: WhereClause) {
        self.filter = Some(filter);
    }

    /// Set the ORDER BY specification (replaces any previous one).
    fn add_order_by(&mut self, order_by: OrderByClause) {
        self.order_by = Some(order_by);
    }

    /// Set the maximum number of rows to return.
    fn set_limit(&mut self, limit: usize) {
        self.limit = Some(limit);
    }

    /// Set the number of leading rows to drop.
    fn set_skip(&mut self, skip: usize) {
        self.skip = Some(skip);
    }

    /// Run the plan: execute and combine all pattern operations, then apply
    /// the filter, ORDER BY, SKIP and LIMIT, in that order.
    ///
    /// Returns an error when an operation or the sort fails.
    fn execute(&self, context: &mut ExecutionContext) -> Result<Vec<ResultRow>, ExecutionError> {
        // `None` means "no operation has run yet". An empty Vec must not be
        // used for that purpose: if the first pattern matches nothing, the
        // combined result has to stay empty instead of being replaced by the
        // rows of the next pattern.
        let mut combined: Option<Vec<ResultRow>> = None;
        for op in &self.operations {
            let op_results = op.execute(context)?;
            combined = Some(match combined {
                None => op_results,
                // Nothing to join against: the combination stays empty.
                Some(acc) if acc.is_empty() => acc,
                // Cross-product or join based on shared variables
                Some(acc) => join_results(acc, op_results, context),
            });
        }
        let mut results = combined.unwrap_or_default();

        // Apply filter. A predicate that fails to evaluate counts as
        // "no match" for that row rather than aborting the query.
        if let Some(ref filter) = self.filter {
            results = results.into_iter()
                .filter(|row| evaluate_predicate(&filter.expression, row).unwrap_or(false))
                .collect();
        }

        // Apply ORDER BY
        if let Some(ref order_by) = self.order_by {
            sort_results(&mut results, order_by)?;
        }

        // Apply SKIP before LIMIT so that LIMIT counts the rows actually returned
        if let Some(skip) = self.skip {
            results = results.into_iter().skip(skip).collect();
        }
        if let Some(limit) = self.limit {
            results = results.into_iter().take(limit).collect();
        }

        Ok(results)
    }
}
/// One step of an execution plan.
enum PatternOperation {
/// Scan nodes, optionally restricted by labels and property values, and
/// bind each match to `variable` when one is given.
ScanNodes {
variable: Option<String>,
labels: Vec<String>,
properties: Option<MapLiteral>,
},
/// Follow relationships of the given types and direction to target nodes
/// carrying the given labels.
Traverse {
rel_variable: Option<String>,
rel_types: Vec<String>,
direction: Direction,
target_variable: Option<String>,
target_labels: Vec<String>,
},
/// A whole path: the contained operations run in order and their results
/// are chained together.
PathMatch {
operations: Vec<PatternOperation>,
},
}
impl PatternOperation {
    /// Execute this operation and return its result rows.
    ///
    /// `ScanNodes` and `Traverse` delegate to `scan_nodes` and
    /// `traverse_edges`. `PathMatch` runs its steps in order and extends the
    /// paths found so far with each step's rows.
    fn execute(&self, context: &mut ExecutionContext) -> Result<Vec<ResultRow>, ExecutionError> {
        match self {
            PatternOperation::ScanNodes { variable, labels, properties } => {
                scan_nodes(context, variable, labels, properties)
            }
            PatternOperation::Traverse { rel_variable, rel_types, direction, target_variable, target_labels } => {
                traverse_edges(context, rel_variable, rel_types, *direction, target_variable, target_labels)
            }
            PatternOperation::PathMatch { operations } => {
                // `None` means "no step has run yet". An empty Vec must not
                // double as that marker: when an earlier step finds nothing,
                // the path has no matches and must not be restarted from the
                // rows of a later step.
                let mut paths: Option<Vec<ResultRow>> = None;
                for op in operations {
                    let step_rows = op.execute(context)?;
                    paths = Some(match paths {
                        None => step_rows,
                        // No partial paths left to extend.
                        Some(acc) if acc.is_empty() => acc,
                        Some(acc) => extend_paths(acc, step_rows)?,
                    });
                }
                Ok(paths.unwrap_or_default())
            }
        }
    }
}
/// Load the nodes of the current graph that match the given labels and
/// property values.
///
/// * `variable`   - name to bind each node to; when `None`, rows are still
///                  produced (one per node) but carry no binding.
/// * `labels`     - accepted `node_type` values; empty means any type.
/// * `properties` - `key = value` constraints on the node's JSONB properties,
///                  compared as text (`->>`).
///
/// All values, including the property keys, are passed as bind parameters;
/// nothing taken from the query is spliced into the SQL text. `vector_ref` is
/// cast to text in SQL because it is read back as a string.
fn scan_nodes(
    context: &ExecutionContext,
    variable: &Option<String>,
    labels: &[String],
    properties: &Option<MapLiteral>,
) -> Result<Vec<ResultRow>, ExecutionError> {
    Spi::connect(|client| {
        let mut query = String::from(
            "SELECT id, external_id, node_type, properties, vector_ref::text, collection_id \
             FROM ruvector.nodes WHERE graph_id = $1",
        );
        let mut params: Vec<_> = vec![context.graph_id.into()];

        // Filter by labels (node_type): one placeholder per label, starting at $2
        if !labels.is_empty() {
            let placeholders = (0..labels.len())
                .map(|i| format!("${}", i + 2))
                .collect::<Vec<_>>()
                .join(",");
            query.push_str(&format!(" AND node_type IN ({})", placeholders));
            for label in labels {
                params.push(label.clone().into());
            }
        }

        // Filter by properties: both the key and the value are bound
        if let Some(props) = properties {
            for (key, value) in &props.entries {
                let key_idx = params.len() + 1;
                let value_idx = key_idx + 1;
                query.push_str(&format!(
                    " AND properties ->> ${}::text = ${}",
                    key_idx, value_idx
                ));
                params.push(key.clone().into());
                params.push(literal_to_param(value));
            }
        }

        let results = client.select(&query, None, &params)?;

        // Column ordinals below are 1-based and follow the SELECT list above
        results.map(|row| {
            let mut result_row = ResultRow::new();
            if let Some(var) = variable {
                result_row.set(var, serde_json::json!({
                    "id": row.get::<i64>(1)?,
                    "external_id": row.get::<Option<String>>(2)?,
                    "labels": vec![row.get::<String>(3)?],
                    "properties": row.get::<pgrx::JsonB>(4)?.0,
                    "_vector_ref": row.get::<Option<String>>(5)?,
                    "_collection_id": row.get::<Option<i32>>(6)?,
                }));
            }
            Ok(result_row)
        }).collect()
    })
}
/// Edge traversal step of a path match — not implemented yet.
///
/// Calling this function panics via `todo!`, so any plan containing a
/// `Traverse` operation currently aborts at execution time. None of the
/// parameters are used so far.
fn traverse_edges(
context: &ExecutionContext,
rel_variable: &Option<String>,
rel_types: &[String],
direction: Direction,
target_variable: &Option<String>,
target_labels: &[String],
) -> Result<Vec<ResultRow>, ExecutionError> {
// Implementation would query ruvector.edges and join with nodes
// Based on direction and type constraints
todo!("Implement edge traversal")
}
4. SQL Functions
-- Runs a Cypher query against a named graph.
--   p_graph_name : name of the graph to query
--   p_query      : Cypher query text
--   p_params     : query parameters as a JSONB value (defaults to an empty object)
-- Returns a set of JSONB values. The implementation is the C-language symbol
-- 'ruvector_cypher' in the extension's shared library.
CREATE FUNCTION ruvector_cypher(
    p_graph_name TEXT,
    p_query TEXT,
    p_params JSONB DEFAULT '{}'::jsonb
)
RETURNS SETOF JSONB
LANGUAGE C
AS 'MODULE_PATHNAME', 'ruvector_cypher';
-- Registers a new graph.
--   p_name        : graph name
--   p_description : optional free-text description (NULL when omitted)
-- Returns the id of the row inserted into ruvector.graphs.
CREATE FUNCTION ruvector_graph_create(
    p_name TEXT,
    p_description TEXT DEFAULT NULL
)
RETURNS INTEGER
LANGUAGE plpgsql
AS $$
DECLARE
    v_new_graph_id INTEGER;
BEGIN
    INSERT INTO ruvector.graphs (name, description)
    VALUES (p_name, p_description)
    RETURNING id INTO v_new_graph_id;

    RETURN v_new_graph_id;
END;
$$;
-- Removes the graph with the given name from ruvector.graphs.
--   p_name : name of the graph to delete
-- Returns TRUE when at least one row was deleted, FALSE otherwise.
-- NOTE(review): what happens to the graph's nodes and edges depends on the
-- foreign-key actions declared on those tables, which are not shown here.
CREATE FUNCTION ruvector_graph_delete(p_name TEXT)
RETURNS BOOLEAN
LANGUAGE plpgsql
AS $$
DECLARE
    v_deleted_count BIGINT;
BEGIN
    DELETE FROM ruvector.graphs AS g
    WHERE g.name = p_name;

    GET DIAGNOSTICS v_deleted_count = ROW_COUNT;
    RETURN v_deleted_count > 0;
END;
$$;
-- Adds a node to a graph and optionally links it to a row that holds a vector.
--   p_graph_name    : name of an existing graph
--   p_external_id   : caller-supplied identifier for the node
--   p_node_type     : node label (defaults to 'default')
--   p_properties    : node properties as JSONB (defaults to an empty object)
--   p_vector_table  : optional table containing the node's vector
--   p_vector_column : optional vector column name; 'embedding' is assumed when NULL
--   p_vector_id     : optional value of the "id" column of the vector row, as text
-- Returns the id of the new row in ruvector.nodes.
-- Raises an exception when the graph does not exist.
-- The vector link is resolved only when both p_vector_table and p_vector_id
-- are given. If the row or the collection cannot be found, the corresponding
-- reference is stored as NULL.
CREATE FUNCTION ruvector_node_add(
    p_graph_name TEXT,
    p_external_id TEXT,
    p_node_type TEXT DEFAULT 'default',
    p_properties JSONB DEFAULT '{}'::jsonb,
    p_vector_table TEXT DEFAULT NULL,
    p_vector_column TEXT DEFAULT NULL,
    p_vector_id TEXT DEFAULT NULL
) RETURNS BIGINT AS $$
DECLARE
    v_graph_id INTEGER;
    v_node_id BIGINT;
    v_vector_ref TID;
    v_collection_id INTEGER;
BEGIN
    SELECT g.id INTO v_graph_id
    FROM ruvector.graphs AS g
    WHERE g.name = p_graph_name;
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Graph not found: %', p_graph_name;
    END IF;

    -- Get vector reference if specified
    IF p_vector_table IS NOT NULL AND p_vector_id IS NOT NULL THEN
        -- p_vector_id is bound as TEXT. Comparing on id::text keeps the lookup
        -- valid whatever the type of the table's id column; a plain
        -- "id = $1" fails for integer/bigint ids because there is no
        -- integer = text operator.
        -- %I quotes p_vector_table as a single identifier, so the name is
        -- resolved through the search_path.
        -- NOTE(review): a ctid changes when the row is updated or the table is
        -- rewritten, so the stored reference can go stale.
        EXECUTE format(
            'SELECT ctid FROM %I WHERE id::text = $1',
            p_vector_table
        ) INTO v_vector_ref USING p_vector_id;

        SELECT c.id INTO v_collection_id
        FROM ruvector.collections AS c
        WHERE c.table_name = p_vector_table
          AND c.column_name = COALESCE(p_vector_column, 'embedding');
    END IF;

    INSERT INTO ruvector.nodes (graph_id, external_id, node_type, properties, vector_ref, collection_id)
    VALUES (v_graph_id, p_external_id, p_node_type, p_properties, v_vector_ref, v_collection_id)
    RETURNING id INTO v_node_id;

    -- Keep the denormalized node counter on the graph row in step.
    UPDATE ruvector.graphs
    SET node_count = node_count + 1, updated_at = NOW()
    WHERE id = v_graph_id;

    RETURN v_node_id;
END;
$$ LANGUAGE plpgsql;
-- Adds a directed edge between two existing nodes of a graph.
--   p_graph_name         : name of an existing graph
--   p_source_external_id : external_id of the source node
--   p_target_external_id : external_id of the target node
--   p_edge_type          : edge type (defaults to 'default')
--   p_weight             : edge weight (defaults to 1.0)
--   p_properties         : edge properties as JSONB (defaults to an empty object)
-- Returns the id of the new row in ruvector.edges.
-- Raises an exception when the graph, the source node or the target node
-- cannot be found.
-- NOTE(review): nodes are looked up by (graph_id, external_id) only. If the
-- same external_id can exist under several node types in one graph, an
-- arbitrary match is used. Confirm the uniqueness rules on ruvector.nodes.
CREATE FUNCTION ruvector_edge_add(
    p_graph_name TEXT,
    p_source_external_id TEXT,
    p_target_external_id TEXT,
    p_edge_type TEXT DEFAULT 'default',
    p_weight REAL DEFAULT 1.0,
    p_properties JSONB DEFAULT '{}'::jsonb
) RETURNS BIGINT AS $$
DECLARE
    v_graph_id INTEGER;
    v_source_id BIGINT;
    v_target_id BIGINT;
    v_edge_id BIGINT;
BEGIN
    SELECT g.id INTO v_graph_id
    FROM ruvector.graphs AS g
    WHERE g.name = p_graph_name;
    -- Without this check an unknown graph leaves v_graph_id NULL and the
    -- failure is reported as a missing source node, which is misleading.
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Graph not found: %', p_graph_name;
    END IF;

    SELECT n.id INTO v_source_id
    FROM ruvector.nodes AS n
    WHERE n.graph_id = v_graph_id AND n.external_id = p_source_external_id;
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Source node not found: %', p_source_external_id;
    END IF;

    SELECT n.id INTO v_target_id
    FROM ruvector.nodes AS n
    WHERE n.graph_id = v_graph_id AND n.external_id = p_target_external_id;
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Target node not found: %', p_target_external_id;
    END IF;

    INSERT INTO ruvector.edges (graph_id, source_id, target_id, edge_type, weight, properties)
    VALUES (v_graph_id, v_source_id, v_target_id, p_edge_type, p_weight, p_properties)
    RETURNING id INTO v_edge_id;

    -- Keep the denormalized edge counter on the graph row in step.
    UPDATE ruvector.graphs
    SET edge_count = edge_count + 1, updated_at = NOW()
    WHERE id = v_graph_id;

    RETURN v_edge_id;
END;
$$ LANGUAGE plpgsql;
5. Relational Bridge Views
-- One row per node, enriched with its graph name and, when the node is linked
-- to a collection, the schema-qualified vector table and vector column.
-- Nodes without a collection keep NULL in vector_table and vector_column.
CREATE VIEW ruvector.nodes_view AS
SELECT
    nd.id,
    nd.graph_id,
    gr.name AS graph_name,
    nd.external_id,
    nd.node_type AS label,
    nd.properties,
    nd.created_at,
    coll.table_schema || '.' || coll.table_name AS vector_table,
    coll.column_name AS vector_column,
    nd.vector_ref
FROM ruvector.nodes AS nd
INNER JOIN ruvector.graphs AS gr
    ON gr.id = nd.graph_id
LEFT JOIN ruvector.collections AS coll
    ON coll.id = nd.collection_id;
-- One row per edge, with the graph name and the external id and label of
-- both endpoints resolved from ruvector.nodes.
CREATE VIEW ruvector.edges_view AS
SELECT
    ed.id,
    ed.graph_id,
    gr.name AS graph_name,
    ed.source_id,
    origin.external_id AS source_external_id,
    origin.node_type AS source_label,
    ed.target_id,
    dest.external_id AS target_external_id,
    dest.node_type AS target_label,
    ed.edge_type,
    ed.weight,
    ed.properties,
    ed.created_at
FROM ruvector.edges AS ed
INNER JOIN ruvector.graphs AS gr
    ON gr.id = ed.graph_id
INNER JOIN ruvector.nodes AS origin
    ON origin.id = ed.source_id
INNER JOIN ruvector.nodes AS dest
    ON dest.id = ed.target_id;
-- Compact (graph, source, edge type, target, weight) listing keyed by external
-- ids, intended for traversals written in plain SQL.
CREATE VIEW ruvector.adjacency_list AS
SELECT
    gr.name AS graph_name,
    origin.external_id AS source,
    ed.edge_type,
    dest.external_id AS target,
    ed.weight
FROM ruvector.edges AS ed
INNER JOIN ruvector.graphs AS gr
    ON gr.id = ed.graph_id
INNER JOIN ruvector.nodes AS origin
    ON origin.id = ed.source_id
INNER JOIN ruvector.nodes AS dest
    ON dest.id = ed.target_id;
-- Lists the direct neighbors of a node using plain SQL.
--   p_graph_name       : name of the graph
--   p_node_external_id : external_id of the node whose neighbors are wanted
--   p_edge_types       : optional edge-type filter; NULL means all types
--   p_direction        : 'out' (edges leaving the node), 'in' (edges entering
--                        it) or 'both' (default)
-- Returns one row per matching edge: the neighbor's id, external_id and label,
-- the edge type and weight, and the direction of the edge relative to the
-- node ('out' or 'in').
-- An unknown graph, an unknown node or an unrecognized p_direction value
-- yields an empty result rather than an error.
-- A self-loop is returned once. It is labelled 'in' when p_direction = 'in'
-- and 'out' otherwise.
CREATE FUNCTION ruvector_neighbors(
    p_graph_name TEXT,
    p_node_external_id TEXT,
    p_edge_types TEXT[] DEFAULT NULL,
    p_direction TEXT DEFAULT 'both' -- 'in', 'out', 'both'
) RETURNS TABLE (
    neighbor_id BIGINT,
    neighbor_external_id TEXT,
    neighbor_label TEXT,
    edge_type TEXT,
    edge_weight REAL,
    direction TEXT
) AS $$
BEGIN
    RETURN QUERY
    -- The graph addressed by name.
    WITH graph AS (
        SELECT gr.id
        FROM ruvector.graphs AS gr
        WHERE gr.name = p_graph_name
    ),
    -- The node whose neighbors are requested, restricted to that graph.
    source_node AS (
        SELECT n.id
        FROM ruvector.nodes AS n
        INNER JOIN graph AS g
            ON n.graph_id = g.id
        WHERE n.external_id = p_node_external_id
    )
    SELECT
        n.id,
        n.external_id,
        n.node_type,
        e.edge_type,
        e.weight,
        CASE
            -- With p_direction = 'in' every returned edge targets the node, so
            -- the label must be 'in' even for a self-loop, whose source is
            -- also the node.
            WHEN p_direction = 'in' THEN 'in'
            WHEN e.source_id = s.id THEN 'out'
            ELSE 'in'
        END
    FROM ruvector.edges AS e
    INNER JOIN source_node AS s
        ON (e.source_id = s.id OR e.target_id = s.id)
    -- The neighbor is whichever endpoint is not the requested node.
    INNER JOIN ruvector.nodes AS n
        ON n.id = CASE
            WHEN e.source_id = s.id THEN e.target_id
            ELSE e.source_id
        END
    WHERE (p_edge_types IS NULL OR e.edge_type = ANY(p_edge_types))
      AND (
          p_direction = 'both'
          OR (p_direction = 'out' AND e.source_id = s.id)
          OR (p_direction = 'in' AND e.target_id = s.id)
      );
END;
$$ LANGUAGE plpgsql;
Usage Examples
Basic Cypher Queries
-- Walkthrough: build a small 'social' graph, then query it with Cypher.
-- The statements are order-dependent: the graph must exist before nodes are
-- added, and both endpoint nodes must exist before an edge is added.

-- Create a graph (arguments: name, description); returns the new graph id
SELECT ruvector_graph_create('social', 'Social network graph');
-- Add nodes (arguments: graph name, external id, node type, JSONB properties)
SELECT ruvector_node_add('social', 'alice', 'Person', '{"name": "Alice", "age": 30}');
SELECT ruvector_node_add('social', 'bob', 'Person', '{"name": "Bob", "age": 25}');
SELECT ruvector_node_add('social', 'charlie', 'Person', '{"name": "Charlie", "age": 35}');
-- Add edges (arguments: graph name, source external id, target external id,
-- edge type, weight)
SELECT ruvector_edge_add('social', 'alice', 'bob', 'KNOWS', 1.0);
SELECT ruvector_edge_add('social', 'bob', 'charlie', 'KNOWS', 1.0);
SELECT ruvector_edge_add('social', 'alice', 'charlie', 'FOLLOWS', 0.5);
-- Query with Cypher: pairs of Person nodes joined by an outgoing KNOWS edge,
-- filtered on the age property of the first node
SELECT * FROM ruvector_cypher('social', '
MATCH (a:Person)-[:KNOWS]->(b:Person)
WHERE a.age > 25
RETURN a.name AS person, b.name AS knows
');
-- Path queries: variable-length pattern following 1 to 3 KNOWS hops
SELECT * FROM ruvector_cypher('social', '
MATCH path = (a:Person)-[:KNOWS*1..3]->(b:Person)
WHERE a.name = "Alice"
RETURN a.name, b.name, length(path) AS distance
');
Vector-Enriched Graph Queries
-- Create nodes linked to vectors
-- The last three arguments are p_vector_table, p_vector_column and p_vector_id
-- of ruvector_node_add.
-- NOTE(review): a node with external id 'alice' is already added to 'social'
-- in the basic example; whether a second insert is accepted depends on the
-- uniqueness constraints on ruvector.nodes, which are not shown here.
SELECT ruvector_node_add(
'social',
'alice',
'Person',
'{"name": "Alice"}',
'user_embeddings', -- table with vectors
'embedding', -- vector column
'1' -- user ID
);
-- Query by vector similarity
-- The third argument supplies Cypher parameters as JSONB; $query_vector in the
-- query text refers to the "query_vector" key.
-- NOTE: the trailing "..." in the array below is a documentation placeholder
-- and is not valid JSON; replace it with the full list of vector components
-- before running the statement.
-- NOTE(review): the expression is aliased "similarity" but is filtered with
-- "< 0.5" and sorted ascending, which reads like a distance - confirm the
-- semantics of <=> on _vector.
SELECT * FROM ruvector_cypher('social', '
MATCH (a:Person)
WHERE a._vector <=> $query_vector < 0.5
RETURN a.name, a._vector <=> $query_vector AS similarity
ORDER BY similarity
LIMIT 10
', '{"query_vector": [0.1, 0.2, 0.3, ...]}'::jsonb);
SQL-Graph Mixing
-- Per-node count of distinct outgoing targets in the 'social' graph, computed
-- from the bridge views. Nodes without outgoing edges are kept with a count
-- of 0 because of the LEFT JOIN.
SELECT
    node_v.external_id,
    node_v.label,
    node_v.properties->>'name' AS name,
    COUNT(DISTINCT edge_v.target_id) AS connection_count
FROM ruvector.nodes_view AS node_v
LEFT JOIN ruvector.edges_view AS edge_v
    ON edge_v.source_id = node_v.id
WHERE node_v.graph_name = 'social'
GROUP BY
    node_v.id,
    node_v.external_id,
    node_v.label,
    node_v.properties
ORDER BY connection_count DESC;

-- Join graph nodes to an application table: users.id, cast to text, is matched
-- against the node's external_id, and outgoing edges are counted per user.
SELECT
    usr.id,
    usr.username,
    nd.properties->>'score' AS graph_score,
    COUNT(ed.id) AS edge_count
FROM users AS usr
INNER JOIN ruvector.nodes AS nd
    ON nd.external_id = usr.id::text
LEFT JOIN ruvector.edges AS ed
    ON ed.source_id = nd.id
WHERE nd.graph_id = (
    SELECT id
    FROM ruvector.graphs
    WHERE name = 'social'
)
GROUP BY
    usr.id,
    usr.username,
    nd.properties;
Testing Requirements
Unit Tests
- Cypher parser coverage
- Expression evaluation
- Pattern matching
Integration Tests
- Graph CRUD operations
- Cypher execution
- Vector integration
- SQL view queries
Performance Tests
- Large graph traversals
- Index effectiveness
- Memory usage
Timeline
| Week | Deliverable |
|---|---|
| 9 | Graph storage schema |
| 10 | Cypher parser |
| 11 | Cypher executor |
| 12 | SQL functions and views |