/**
 * Graph data generator using agentic-synth
 * Generates synthetic graph datasets for benchmarking
 */

import { AgenticSynth, createSynth } from '@ruvector/agentic-synth';
import { closeSync, mkdirSync, openSync, writeFileSync } from 'fs';
import { join } from 'path';

/**
 * Free-form property bag attached to nodes and edges.
 * Values are intentionally loosely typed: they are taken as-is from the
 * generator's JSON output or built from ad-hoc literals in this module.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type GraphProperties = Record<string, any>;

/** A labelled graph vertex. */
export interface GraphNode {
  id: string;
  labels: string[];
  properties: GraphProperties;
}

/** A directed, typed relationship between two node ids. */
export interface GraphEdge {
  id: string;
  from: string;
  to: string;
  type: string;
  properties: GraphProperties;
}

/** A complete generated graph plus summary statistics. */
export interface GraphDataset {
  nodes: GraphNode[];
  edges: GraphEdge[];
  metadata: {
    nodeCount: number;
    edgeCount: number;
    avgDegree: number;
    labels: string[];
    relationshipTypes: string[];
  };
}

const MS_PER_DAY = 24 * 60 * 60 * 1000;

/** Number of array elements serialized per write when streaming JSON arrays. */
const JSON_WRITE_CHUNK = 1000;

/** Normalizes a requested size to a non-negative integer (NaN, negatives and infinities become 0). */
function toCount(value: number): number {
  return Number.isFinite(value) && value > 0 ? Math.floor(value) : 0;
}

/** Uniform random integer in `[0, size)`; callers must pass `size > 0`. */
function randomIndex(size: number): number {
  return Math.floor(Math.random() * size);
}

/** Average degree `2E / N`, defined as 0 for an empty graph instead of NaN. */
function averageDegree(nodeCount: number, edgeCount: number): number {
  return nodeCount > 0 ? (edgeCount * 2) / nodeCount : 0;
}

/**
 * Writes `items` as a pretty-printed JSON array, a chunk at a time.
 *
 * The bytes produced are the same as `JSON.stringify(items, null, 2)`, but the
 * array is never materialized as one string, which for multi-million element
 * datasets can exceed the engine's maximum string length.
 */
function writeJsonArray(filePath: string, items: readonly unknown[]): void {
  if (items.length === 0) {
    writeFileSync(filePath, '[]');
    return;
  }

  const fd = openSync(filePath, 'w');
  try {
    // writeFileSync on a descriptor appends at the current position.
    writeFileSync(fd, '[\n');
    for (let start = 0; start < items.length; start += JSON_WRITE_CHUNK) {
      const end = Math.min(start + JSON_WRITE_CHUNK, items.length);
      const parts: string[] = [];
      for (let i = start; i < end; i++) {
        // Non-serializable array elements are emitted as null, like JSON.stringify does.
        const json = JSON.stringify(items[i], null, 2) ?? 'null';
        // Re-indent by one level; raw newlines only occur as formatting, never inside string values.
        parts.push('  ' + json.replace(/\n/g, '\n  '));
      }
      writeFileSync(fd, (start === 0 ? '' : ',\n') + parts.join(',\n'));
    }
    writeFileSync(fd, '\n]');
  } finally {
    closeSync(fd);
  }
}

/**
 * Generate social network graph data
 *
 * User nodes are produced in batches through agentic-synth; friendships are
 * generated locally. Each friendship picks its source uniformly and its target
 * proportionally to current degree, so well-connected users keep attracting
 * new links. Duplicate friendships between the same pair are not filtered.
 *
 * @param numUsers - Number of user nodes to create (floored; non-positive yields an empty graph).
 * @param avgFriends - Target average degree; `numUsers * avgFriends / 2` edges are created.
 * @returns The generated dataset. No edges are created when there are fewer than two users.
 */
export async function generateSocialNetwork(
  numUsers: number = 1000000,
  avgFriends: number = 10
): Promise<GraphDataset> {
  console.log(`Generating social network: ${numUsers} users, avg ${avgFriends} friends...`);

  const synth = createSynth({
    provider: 'gemini',
    model: 'gemini-2.0-flash-exp'
  });

  const userCount = toCount(numUsers);
  const nodes: GraphNode[] = [];
  const edges: GraphEdge[] = [];

  // Generate users in batches
  const batchSize = 10000;
  const numBatches = Math.ceil(userCount / batchSize);

  for (let batch = 0; batch < numBatches; batch++) {
    const batchStart = batch * batchSize;
    const batchEnd = Math.min(batchStart + batchSize, userCount);
    const batchUsers = batchEnd - batchStart;

    console.log(` Generating users ${batchStart}-${batchEnd}...`);

    // Use agentic-synth to generate realistic user data
    const userResult = await synth.generateStructured({
      type: 'json',
      count: batchUsers,
      schema: {
        id: 'string',
        name: 'string',
        age: 'number',
        location: 'string',
        interests: 'array',
        joinDate: 'timestamp'
      },
      prompt: `Generate realistic social media user profiles with diverse demographics, locations (cities worldwide), ages (18-80), and interests (hobbies, activities, topics).
      Make names culturally appropriate for their locations.`
    });

    // Convert to graph nodes. A node is created for every requested index so edge
    // endpoints always resolve; if fewer records come back, the node gets empty properties.
    for (let i = 0; i < batchUsers; i++) {
      const userData = (userResult.data[i] ?? {}) as GraphProperties;
      nodes.push({
        id: `user_${batchStart + i}`,
        labels: ['Person', 'User'],
        properties: userData
      });
    }
  }

  console.log(`Generated ${nodes.length} user nodes`);

  // Generate friendships (edges). Undirected, so divide by 2.
  // With fewer than two users no self-loop-free edge exists.
  const numEdges = userCount >= 2 ? toCount((userCount * avgFriends) / 2) : 0;
  console.log(`Generating ${numEdges} friendships...`);

  // Preferential attachment: every endpoint of every edge is recorded in a pool,
  // so a uniform draw from the pool selects a user with probability proportional
  // to its degree.
  const endpointPool = new Int32Array(numEdges * 2);
  let poolSize = 0;
  const pickTarget = (): number =>
    poolSize > 0
      ? endpointPool[randomIndex(poolSize)] ?? randomIndex(userCount)
      : randomIndex(userCount);

  const fiveYearsMs = 365 * MS_PER_DAY * 5;

  for (let i = 0; i < numEdges; i++) {
    if (i % 100000 === 0) {
      console.log(` Generated ${i} edges...`);
    }

    const from = randomIndex(userCount);
    let to = pickTarget();
    // Avoid self-loops. A non-empty pool always holds at least two distinct users
    // (the first edge has from !== to), so this terminates.
    while (to === from) {
      to = pickTarget();
    }

    const friendshipDate = new Date(Date.now() - Math.random() * fiveYearsMs).toISOString();

    edges.push({
      id: `friendship_${i}`,
      from: `user_${from}`,
      to: `user_${to}`,
      type: 'FRIENDS_WITH',
      properties: {
        since: friendshipDate,
        strength: Math.random()
      }
    });

    endpointPool[poolSize++] = from;
    endpointPool[poolSize++] = to;
  }

  const avgDegree = averageDegree(nodes.length, edges.length);
  console.log(`Average degree: ${avgDegree.toFixed(2)}`);

  return {
    nodes,
    edges,
    metadata: {
      nodeCount: nodes.length,
      edgeCount: edges.length,
      avgDegree,
      labels: ['Person', 'User'],
      relationshipTypes: ['FRIENDS_WITH']
    }
  };
}

/**
 * Generate knowledge graph data
 *
 * Entities are split across five types by fixed shares; the last type absorbs
 * the rounding remainder so the requested total is what gets asked of the
 * generator. Relationships connect uniformly random distinct entities, ten per
 * entity actually produced.
 *
 * @param numEntities - Total number of entities to request (floored; non-positive yields an empty graph).
 * @returns The generated dataset. No edges are created when fewer than two entities exist.
 */
export async function generateKnowledgeGraph(
  numEntities: number = 100000
): Promise<GraphDataset> {
  console.log(`Generating knowledge graph: ${numEntities} entities...`);

  const synth = createSynth({
    provider: 'gemini',
    model: 'gemini-2.0-flash-exp'
  });

  const entityCount = toCount(numEntities);
  const nodes: GraphNode[] = [];
  const edges: GraphEdge[] = [];

  // Generate different entity types; `count` is the share of the total for each type.
  const entityTypes = [
    { label: 'Person', count: 0.3, schema: { name: 'string', birthDate: 'date', nationality: 'string' } },
    { label: 'Organization', count: 0.25, schema: { name: 'string', founded: 'number', industry: 'string' } },
    { label: 'Location', count: 0.2, schema: { name: 'string', country: 'string', lat: 'number', lon: 'number' } },
    { label: 'Event', count: 0.15, schema: { name: 'string', date: 'date', type: 'string' } },
    { label: 'Concept', count: 0.1, schema: { name: 'string', domain: 'string', definition: 'string' } }
  ];

  let entityId = 0;
  let remaining = entityCount;

  for (const [index, entityType] of entityTypes.entries()) {
    // Flooring each share can drop entities, so the last type takes whatever is left.
    const isLast = index === entityTypes.length - 1;
    const count = isLast
      ? remaining
      : Math.min(remaining, Math.floor(entityCount * entityType.count));
    remaining -= count;

    if (count === 0) {
      continue;
    }

    console.log(` Generating ${count} ${entityType.label} entities...`);

    const result = await synth.generateStructured({
      type: 'json',
      count,
      schema: entityType.schema,
      prompt: `Generate realistic ${entityType.label} entities for a knowledge graph. Ensure diversity and real-world accuracy.`
    });

    for (const entity of result.data) {
      nodes.push({
        id: `entity_${entityId++}`,
        labels: [entityType.label, 'Entity'],
        properties: entity as GraphProperties
      });
    }
  }

  console.log(`Generated ${nodes.length} entity nodes`);

  // Generate relationships
  const relationshipTypes = [
    'WORKS_AT',
    'LOCATED_IN',
    'PARTICIPATED_IN',
    'RELATED_TO',
    'INFLUENCED_BY'
  ];

  // 10 relationships per entity on average. Sized from the entities actually
  // produced so a short generator response does not yield an over-dense graph.
  const numEdges = nodes.length >= 2 ? nodes.length * 10 : 0;
  console.log(`Generating ${numEdges} relationships...`);

  for (let i = 0; i < numEdges; i++) {
    if (i % 50000 === 0) {
      console.log(` Generated ${i} relationships...`);
    }

    const from = randomIndex(nodes.length);
    let to = randomIndex(nodes.length);
    // Resample instead of skipping so the edge count and ids stay contiguous.
    while (to === from) {
      to = randomIndex(nodes.length);
    }

    const relType = relationshipTypes[randomIndex(relationshipTypes.length)];

    edges.push({
      id: `rel_${i}`,
      from: nodes[from].id,
      to: nodes[to].id,
      type: relType,
      properties: {
        confidence: Math.random(),
        source: 'generated'
      }
    });
  }

  return {
    nodes,
    edges,
    metadata: {
      nodeCount: nodes.length,
      edgeCount: edges.length,
      avgDegree: averageDegree(nodes.length, edges.length),
      // Every node also carries the shared 'Entity' label.
      labels: [...entityTypes.map(t => t.label), 'Entity'],
      relationshipTypes
    }
  };
}

/**
 * Generate temporal event graph
 *
 * Event nodes are indexed in chronological order: timestamps are spread evenly
 * over the requested time range ending now. `TRANSITIONS_TO` edges link an
 * event to the next one in that order and `CAUSED_BY` edges point back to a
 * random earlier event.
 *
 * @param numEvents - Number of event nodes (floored; non-positive yields an empty graph).
 * @param timeRangeDays - Length of the window, in days, that timestamps are spread over.
 * @returns The generated dataset.
 */
export async function generateTemporalGraph(
  numEvents: number = 500000,
  timeRangeDays: number = 365
): Promise<GraphDataset> {
  console.log(`Generating temporal graph: ${numEvents} events over ${timeRangeDays} days...`);

  const synth = createSynth({
    provider: 'gemini',
    model: 'gemini-2.0-flash-exp'
  });

  const eventCount = toCount(numEvents);
  const rangeMs =
    Number.isFinite(timeRangeDays) && timeRangeDays > 0 ? timeRangeDays * MS_PER_DAY : 0;

  const nodes: GraphNode[] = [];
  const edges: GraphEdge[] = [];

  // Nothing to generate; also avoids a division by zero in the interval below.
  if (eventCount === 0) {
    return {
      nodes,
      edges,
      metadata: {
        nodeCount: 0,
        edgeCount: 0,
        avgDegree: 0,
        labels: ['Event'],
        relationshipTypes: ['TRANSITIONS_TO', 'CAUSED_BY']
      }
    };
  }

  // Generate time-series events
  console.log(' Generating event data...');

  const eventResult = await synth.generateTimeSeries({
    type: 'timeseries',
    count: eventCount,
    interval: Math.floor(rangeMs / eventCount),
    schema: {
      eventType: 'string',
      severity: 'number',
      entity: 'string',
      state: 'string'
    },
    prompt: `Generate realistic system events including state changes, user actions, system alerts, and business events. Include severity levels 1-5.`
  });

  // Timestamps increase with the event index so index order is time order.
  const startMs = Date.now() - rangeMs;

  for (let i = 0; i < eventCount; i++) {
    const eventData = eventResult.data[i] as GraphProperties | undefined;
    nodes.push({
      id: `event_${i}`,
      labels: ['Event'],
      properties: {
        ...eventData,
        timestamp: new Date(startMs + Math.floor((i * rangeMs) / eventCount)).toISOString()
      }
    });
  }

  console.log(`Generated ${nodes.length} event nodes`);

  // Generate state transitions (temporal edges)
  console.log(' Generating state transitions...');

  for (let i = 0; i < eventCount - 1; i++) {
    if (i % 50000 === 0) {
      console.log(` Generated ${i} transitions...`);
    }

    // Connect events that are causally related (next event in sequence)
    if (Math.random() < 0.3) {
      edges.push({
        id: `transition_${i}`,
        from: `event_${i}`,
        to: `event_${i + 1}`,
        type: 'TRANSITIONS_TO',
        properties: {
          duration: Math.random() * 1000,
          probability: Math.random()
        }
      });
    }

    // Add some random connections for causality; the cause is always an earlier event.
    if (Math.random() < 0.1 && i > 10) {
      const target = randomIndex(i);
      edges.push({
        id: `caused_by_${i}`,
        from: `event_${i}`,
        to: `event_${target}`,
        type: 'CAUSED_BY',
        properties: {
          correlation: Math.random()
        }
      });
    }
  }

  return {
    nodes,
    edges,
    metadata: {
      nodeCount: nodes.length,
      edgeCount: edges.length,
      avgDegree: averageDegree(nodes.length, edges.length),
      // Only labels that nodes actually carry are listed.
      labels: ['Event'],
      relationshipTypes: ['TRANSITIONS_TO', 'CAUSED_BY']
    }
  };
}

/**
 * Save dataset to files
 *
 * Writes `<name>_nodes.json`, `<name>_edges.json` and `<name>_metadata.json`
 * into `outputDir`, creating the directory if needed. The node and edge arrays
 * are streamed to disk in chunks; the resulting files are formatted exactly
 * like `JSON.stringify(value, null, 2)`.
 *
 * @param dataset - Dataset to persist.
 * @param name - File name prefix.
 * @param outputDir - Target directory.
 */
export function saveDataset(
  dataset: GraphDataset,
  name: string,
  outputDir: string = './data'
): void {
  mkdirSync(outputDir, { recursive: true });

  const nodesFile = join(outputDir, `${name}_nodes.json`);
  const edgesFile = join(outputDir, `${name}_edges.json`);
  const metadataFile = join(outputDir, `${name}_metadata.json`);

  console.log(`Saving dataset to ${outputDir}...`);

  writeJsonArray(nodesFile, dataset.nodes);
  writeJsonArray(edgesFile, dataset.edges);
  writeFileSync(metadataFile, JSON.stringify(dataset.metadata, null, 2));

  console.log(` Nodes: ${nodesFile}`);
  console.log(` Edges: ${edgesFile}`);
  console.log(` Metadata: ${metadataFile}`);
}

/**
 * Main function to generate all datasets
 *
 * Generates and saves the social network, knowledge graph and temporal graph
 * one after another under `./benchmarks/data/graph`.
 */
export async function generateAllDatasets(): Promise<void> {
  console.log('=== RuVector Graph Benchmark Data Generation ===\n');

  // Social Network
  const socialNetwork = await generateSocialNetwork(1000000, 10);
  saveDataset(socialNetwork, 'social_network', './benchmarks/data/graph');

  console.log('');

  // Knowledge Graph
  const knowledgeGraph = await generateKnowledgeGraph(100000);
  saveDataset(knowledgeGraph, 'knowledge_graph', './benchmarks/data/graph');

  console.log('');

  // Temporal Graph
  const temporalGraph = await generateTemporalGraph(500000, 365);
  saveDataset(temporalGraph, 'temporal_events', './benchmarks/data/graph');

  console.log('\n=== Data Generation Complete ===');
}

// Run if called directly
if (require.main === module) {
  generateAllDatasets().catch((error: unknown) => {
    console.error(error);
    // Signal failure to the calling shell/CI instead of exiting with status 0.
    process.exitCode = 1;
  });
}