Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
400
vendor/ruvector/benchmarks/graph/src/graph-data-generator.ts
vendored
Normal file
400
vendor/ruvector/benchmarks/graph/src/graph-data-generator.ts
vendored
Normal file
@@ -0,0 +1,400 @@
|
||||
/**
|
||||
* Graph data generator using agentic-synth
|
||||
* Generates synthetic graph datasets for benchmarking
|
||||
*/
|
||||
|
||||
import { AgenticSynth, createSynth } from '@ruvector/agentic-synth';
|
||||
import { writeFileSync, mkdirSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
|
||||
export interface GraphNode {
|
||||
id: string;
|
||||
labels: string[];
|
||||
properties: Record<string, unknown>;
|
||||
}
|
||||
|
||||
export interface GraphEdge {
|
||||
id: string;
|
||||
from: string;
|
||||
to: string;
|
||||
type: string;
|
||||
properties: Record<string, unknown>;
|
||||
}
|
||||
|
||||
export interface GraphDataset {
|
||||
nodes: GraphNode[];
|
||||
edges: GraphEdge[];
|
||||
metadata: {
|
||||
nodeCount: number;
|
||||
edgeCount: number;
|
||||
avgDegree: number;
|
||||
labels: string[];
|
||||
relationshipTypes: string[];
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate social network graph data
|
||||
*/
|
||||
export async function generateSocialNetwork(
|
||||
numUsers: number = 1000000,
|
||||
avgFriends: number = 10
|
||||
): Promise<GraphDataset> {
|
||||
console.log(`Generating social network: ${numUsers} users, avg ${avgFriends} friends...`);
|
||||
|
||||
const synth = createSynth({
|
||||
provider: 'gemini',
|
||||
model: 'gemini-2.0-flash-exp'
|
||||
});
|
||||
|
||||
const nodes: GraphNode[] = [];
|
||||
const edges: GraphEdge[] = [];
|
||||
|
||||
// Generate users in batches
|
||||
const batchSize = 10000;
|
||||
const numBatches = Math.ceil(numUsers / batchSize);
|
||||
|
||||
for (let batch = 0; batch < numBatches; batch++) {
|
||||
const batchStart = batch * batchSize;
|
||||
const batchEnd = Math.min(batchStart + batchSize, numUsers);
|
||||
const batchUsers = batchEnd - batchStart;
|
||||
|
||||
console.log(` Generating users ${batchStart}-${batchEnd}...`);
|
||||
|
||||
// Use agentic-synth to generate realistic user data
|
||||
const userResult = await synth.generateStructured({
|
||||
type: 'json',
|
||||
count: batchUsers,
|
||||
schema: {
|
||||
id: 'string',
|
||||
name: 'string',
|
||||
age: 'number',
|
||||
location: 'string',
|
||||
interests: 'array<string>',
|
||||
joinDate: 'timestamp'
|
||||
},
|
||||
prompt: `Generate realistic social media user profiles with diverse demographics,
|
||||
locations (cities worldwide), ages (18-80), and interests (hobbies, activities, topics).
|
||||
Make names culturally appropriate for their locations.`
|
||||
});
|
||||
|
||||
// Convert to graph nodes
|
||||
for (let i = 0; i < batchUsers; i++) {
|
||||
const userId = `user_${batchStart + i}`;
|
||||
const userData = userResult.data[i] as Record<string, unknown>;
|
||||
|
||||
nodes.push({
|
||||
id: userId,
|
||||
labels: ['Person', 'User'],
|
||||
properties: userData
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Generated ${nodes.length} user nodes`);
|
||||
|
||||
// Generate friendships (edges)
|
||||
const numEdges = Math.floor(numUsers * avgFriends / 2); // Undirected, so divide by 2
|
||||
console.log(`Generating ${numEdges} friendships...`);
|
||||
|
||||
// Use preferential attachment (scale-free network)
|
||||
const degrees = new Array(numUsers).fill(0);
|
||||
|
||||
for (let i = 0; i < numEdges; i++) {
|
||||
if (i % 100000 === 0) {
|
||||
console.log(` Generated ${i} edges...`);
|
||||
}
|
||||
|
||||
// Select nodes with preferential attachment
|
||||
let from = Math.floor(Math.random() * numUsers);
|
||||
let to = Math.floor(Math.random() * numUsers);
|
||||
|
||||
// Avoid self-loops
|
||||
while (to === from) {
|
||||
to = Math.floor(Math.random() * numUsers);
|
||||
}
|
||||
|
||||
const edgeId = `friendship_${i}`;
|
||||
const friendshipDate = new Date(
|
||||
Date.now() - Math.random() * 365 * 24 * 60 * 60 * 1000 * 5
|
||||
).toISOString();
|
||||
|
||||
edges.push({
|
||||
id: edgeId,
|
||||
from: `user_${from}`,
|
||||
to: `user_${to}`,
|
||||
type: 'FRIENDS_WITH',
|
||||
properties: {
|
||||
since: friendshipDate,
|
||||
strength: Math.random()
|
||||
}
|
||||
});
|
||||
|
||||
degrees[from]++;
|
||||
degrees[to]++;
|
||||
}
|
||||
|
||||
const avgDegree = degrees.reduce((a, b) => a + b, 0) / numUsers;
|
||||
console.log(`Average degree: ${avgDegree.toFixed(2)}`);
|
||||
|
||||
return {
|
||||
nodes,
|
||||
edges,
|
||||
metadata: {
|
||||
nodeCount: nodes.length,
|
||||
edgeCount: edges.length,
|
||||
avgDegree,
|
||||
labels: ['Person', 'User'],
|
||||
relationshipTypes: ['FRIENDS_WITH']
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate knowledge graph data
|
||||
*/
|
||||
export async function generateKnowledgeGraph(
|
||||
numEntities: number = 100000
|
||||
): Promise<GraphDataset> {
|
||||
console.log(`Generating knowledge graph: ${numEntities} entities...`);
|
||||
|
||||
const synth = createSynth({
|
||||
provider: 'gemini',
|
||||
model: 'gemini-2.0-flash-exp'
|
||||
});
|
||||
|
||||
const nodes: GraphNode[] = [];
|
||||
const edges: GraphEdge[] = [];
|
||||
|
||||
// Generate different entity types
|
||||
const entityTypes = [
|
||||
{ label: 'Person', count: 0.3, schema: { name: 'string', birthDate: 'date', nationality: 'string' } },
|
||||
{ label: 'Organization', count: 0.25, schema: { name: 'string', founded: 'number', industry: 'string' } },
|
||||
{ label: 'Location', count: 0.2, schema: { name: 'string', country: 'string', lat: 'number', lon: 'number' } },
|
||||
{ label: 'Event', count: 0.15, schema: { name: 'string', date: 'date', type: 'string' } },
|
||||
{ label: 'Concept', count: 0.1, schema: { name: 'string', domain: 'string', definition: 'string' } }
|
||||
];
|
||||
|
||||
let entityId = 0;
|
||||
|
||||
for (const entityType of entityTypes) {
|
||||
const count = Math.floor(numEntities * entityType.count);
|
||||
console.log(` Generating ${count} ${entityType.label} entities...`);
|
||||
|
||||
const result = await synth.generateStructured({
|
||||
type: 'json',
|
||||
count,
|
||||
schema: entityType.schema,
|
||||
prompt: `Generate realistic ${entityType.label} entities for a knowledge graph.
|
||||
Ensure diversity and real-world accuracy.`
|
||||
});
|
||||
|
||||
for (const entity of result.data) {
|
||||
nodes.push({
|
||||
id: `entity_${entityId++}`,
|
||||
labels: [entityType.label, 'Entity'],
|
||||
properties: entity as Record<string, unknown>
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Generated ${nodes.length} entity nodes`);
|
||||
|
||||
// Generate relationships
|
||||
const relationshipTypes = [
|
||||
'WORKS_AT',
|
||||
'LOCATED_IN',
|
||||
'PARTICIPATED_IN',
|
||||
'RELATED_TO',
|
||||
'INFLUENCED_BY'
|
||||
];
|
||||
|
||||
const numEdges = numEntities * 10; // 10 relationships per entity on average
|
||||
console.log(`Generating ${numEdges} relationships...`);
|
||||
|
||||
for (let i = 0; i < numEdges; i++) {
|
||||
if (i % 50000 === 0) {
|
||||
console.log(` Generated ${i} relationships...`);
|
||||
}
|
||||
|
||||
const from = Math.floor(Math.random() * nodes.length);
|
||||
const to = Math.floor(Math.random() * nodes.length);
|
||||
|
||||
if (from === to) continue;
|
||||
|
||||
const relType = relationshipTypes[Math.floor(Math.random() * relationshipTypes.length)];
|
||||
|
||||
edges.push({
|
||||
id: `rel_${i}`,
|
||||
from: nodes[from].id,
|
||||
to: nodes[to].id,
|
||||
type: relType,
|
||||
properties: {
|
||||
confidence: Math.random(),
|
||||
source: 'generated'
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
nodes,
|
||||
edges,
|
||||
metadata: {
|
||||
nodeCount: nodes.length,
|
||||
edgeCount: edges.length,
|
||||
avgDegree: (edges.length * 2) / nodes.length,
|
||||
labels: entityTypes.map(t => t.label),
|
||||
relationshipTypes
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate temporal event graph
|
||||
*/
|
||||
export async function generateTemporalGraph(
|
||||
numEvents: number = 500000,
|
||||
timeRangeDays: number = 365
|
||||
): Promise<GraphDataset> {
|
||||
console.log(`Generating temporal graph: ${numEvents} events over ${timeRangeDays} days...`);
|
||||
|
||||
const synth = createSynth({
|
||||
provider: 'gemini',
|
||||
model: 'gemini-2.0-flash-exp'
|
||||
});
|
||||
|
||||
const nodes: GraphNode[] = [];
|
||||
const edges: GraphEdge[] = [];
|
||||
|
||||
// Generate time-series events
|
||||
console.log(' Generating event data...');
|
||||
|
||||
const eventResult = await synth.generateTimeSeries({
|
||||
type: 'timeseries',
|
||||
count: numEvents,
|
||||
interval: Math.floor((timeRangeDays * 24 * 60 * 60 * 1000) / numEvents),
|
||||
schema: {
|
||||
eventType: 'string',
|
||||
severity: 'number',
|
||||
entity: 'string',
|
||||
state: 'string'
|
||||
},
|
||||
prompt: `Generate realistic system events including state changes, user actions,
|
||||
system alerts, and business events. Include severity levels 1-5.`
|
||||
});
|
||||
|
||||
for (let i = 0; i < numEvents; i++) {
|
||||
const eventData = eventResult.data[i] as Record<string, unknown>;
|
||||
|
||||
nodes.push({
|
||||
id: `event_${i}`,
|
||||
labels: ['Event'],
|
||||
properties: {
|
||||
...eventData,
|
||||
timestamp: new Date(Date.now() - Math.random() * timeRangeDays * 24 * 60 * 60 * 1000).toISOString()
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
console.log(`Generated ${nodes.length} event nodes`);
|
||||
|
||||
// Generate state transitions (temporal edges)
|
||||
console.log(' Generating state transitions...');
|
||||
|
||||
for (let i = 0; i < numEvents - 1; i++) {
|
||||
if (i % 50000 === 0) {
|
||||
console.log(` Generated ${i} transitions...`);
|
||||
}
|
||||
|
||||
// Connect events that are causally related (next event in sequence)
|
||||
if (Math.random() < 0.3) {
|
||||
edges.push({
|
||||
id: `transition_${i}`,
|
||||
from: `event_${i}`,
|
||||
to: `event_${i + 1}`,
|
||||
type: 'TRANSITIONS_TO',
|
||||
properties: {
|
||||
duration: Math.random() * 1000,
|
||||
probability: Math.random()
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Add some random connections for causality
|
||||
if (Math.random() < 0.1 && i > 10) {
|
||||
const target = Math.floor(Math.random() * i);
|
||||
edges.push({
|
||||
id: `caused_by_${i}`,
|
||||
from: `event_${i}`,
|
||||
to: `event_${target}`,
|
||||
type: 'CAUSED_BY',
|
||||
properties: {
|
||||
correlation: Math.random()
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
nodes,
|
||||
edges,
|
||||
metadata: {
|
||||
nodeCount: nodes.length,
|
||||
edgeCount: edges.length,
|
||||
avgDegree: (edges.length * 2) / nodes.length,
|
||||
labels: ['Event', 'State'],
|
||||
relationshipTypes: ['TRANSITIONS_TO', 'CAUSED_BY']
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Save dataset to files
|
||||
*/
|
||||
export function saveDataset(dataset: GraphDataset, name: string, outputDir: string = './data') {
|
||||
mkdirSync(outputDir, { recursive: true });
|
||||
|
||||
const nodesFile = join(outputDir, `${name}_nodes.json`);
|
||||
const edgesFile = join(outputDir, `${name}_edges.json`);
|
||||
const metadataFile = join(outputDir, `${name}_metadata.json`);
|
||||
|
||||
console.log(`Saving dataset to ${outputDir}...`);
|
||||
|
||||
writeFileSync(nodesFile, JSON.stringify(dataset.nodes, null, 2));
|
||||
writeFileSync(edgesFile, JSON.stringify(dataset.edges, null, 2));
|
||||
writeFileSync(metadataFile, JSON.stringify(dataset.metadata, null, 2));
|
||||
|
||||
console.log(` Nodes: ${nodesFile}`);
|
||||
console.log(` Edges: ${edgesFile}`);
|
||||
console.log(` Metadata: ${metadataFile}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Main function to generate all datasets
|
||||
*/
|
||||
export async function generateAllDatasets() {
|
||||
console.log('=== RuVector Graph Benchmark Data Generation ===\n');
|
||||
|
||||
// Social Network
|
||||
const socialNetwork = await generateSocialNetwork(1000000, 10);
|
||||
saveDataset(socialNetwork, 'social_network', './benchmarks/data/graph');
|
||||
|
||||
console.log('');
|
||||
|
||||
// Knowledge Graph
|
||||
const knowledgeGraph = await generateKnowledgeGraph(100000);
|
||||
saveDataset(knowledgeGraph, 'knowledge_graph', './benchmarks/data/graph');
|
||||
|
||||
console.log('');
|
||||
|
||||
// Temporal Graph
|
||||
const temporalGraph = await generateTemporalGraph(500000, 365);
|
||||
saveDataset(temporalGraph, 'temporal_events', './benchmarks/data/graph');
|
||||
|
||||
console.log('\n=== Data Generation Complete ===');
|
||||
}
|
||||
|
||||
// Run if called directly
|
||||
if (require.main === module) {
|
||||
generateAllDatasets().catch(console.error);
|
||||
}
|
||||
Reference in New Issue
Block a user