git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
506 lines
19 KiB
JavaScript
506 lines
19 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* Claude Code Synthetic Data Generator
|
|
*
|
|
* Uses @ruvector/agentic-synth to generate high-quality
|
|
* training data for RuvLTRA routing optimization.
|
|
*
|
|
* Features:
|
|
* - Claude Code-specific task patterns
|
|
* - Hard negative mining for contrastive learning
|
|
* - Quality scoring based on task clarity
|
|
* - DSPy-based prompt optimization
|
|
*/
|
|
|
|
const { GoogleGenerativeAI } = require('@google/generative-ai');
|
|
const { writeFileSync, existsSync, mkdirSync, readFileSync } = require('fs');
|
|
const { join } = require('path');
|
|
const { homedir } = require('os');
|
|
|
|
// Configuration
|
|
const OUTPUT_DIR = join(__dirname, 'generated');
|
|
const EXAMPLES_PER_AGENT = 100; // Generate 100 examples per agent
|
|
const HARD_NEGATIVES_PER_AGENT = 20;
|
|
|
|
// Agent definitions with Claude Code context
|
|
const CLAUDE_CODE_AGENTS = {
|
|
coder: {
|
|
role: 'Software developer who implements features and writes production code',
|
|
claudeCodeContext: 'Uses Edit, Write, MultiEdit tools to create and modify code files',
|
|
keywords: ['implement', 'build', 'create', 'write code', 'add feature', 'component', 'function'],
|
|
examples: [
|
|
'Implement a binary search function in TypeScript',
|
|
'Build a React component for user authentication',
|
|
'Create a REST API endpoint for data retrieval',
|
|
],
|
|
},
|
|
researcher: {
|
|
role: 'Technical researcher who investigates and analyzes',
|
|
claudeCodeContext: 'Uses Grep, Glob, Read, WebSearch tools to gather information',
|
|
keywords: ['research', 'investigate', 'explore', 'analyze', 'find', 'discover', 'study'],
|
|
examples: [
|
|
'Research best practices for React state management',
|
|
'Investigate why the API is returning slow responses',
|
|
'Explore different authentication strategies',
|
|
],
|
|
},
|
|
reviewer: {
|
|
role: 'Code reviewer who evaluates code quality',
|
|
claudeCodeContext: 'Uses Read, Grep tools to analyze existing code for quality issues',
|
|
keywords: ['review', 'check', 'evaluate', 'assess', 'inspect', 'pull request', 'PR'],
|
|
examples: [
|
|
'Review the pull request for code quality',
|
|
'Check the implementation for potential issues',
|
|
'Evaluate the API design decisions',
|
|
],
|
|
},
|
|
tester: {
|
|
role: 'QA engineer who writes and runs tests',
|
|
claudeCodeContext: 'Uses Write, Edit tools to create test files and Bash to run tests',
|
|
keywords: ['test', 'tests', 'testing', 'unit test', 'integration test', 'e2e', 'coverage', 'spec'],
|
|
examples: [
|
|
'Write unit tests for the authentication module',
|
|
'Add integration tests for the API endpoints',
|
|
'Create e2e tests for the checkout flow',
|
|
],
|
|
},
|
|
architect: {
|
|
role: 'System architect who designs software structure',
|
|
claudeCodeContext: 'Uses Read, Grep tools to understand codebase and Write to document designs',
|
|
keywords: ['design', 'architecture', 'schema', 'structure', 'system', 'diagram', 'plan'],
|
|
examples: [
|
|
'Design the database schema for user profiles',
|
|
'Plan the microservices architecture',
|
|
'Create the system architecture diagram',
|
|
],
|
|
},
|
|
'security-architect': {
|
|
role: 'Security specialist who audits vulnerabilities',
|
|
claudeCodeContext: 'Uses Grep, Read tools to scan code for security issues',
|
|
keywords: ['security', 'vulnerability', 'xss', 'injection', 'audit', 'cve', 'exploit'],
|
|
examples: [
|
|
'Audit the API endpoints for XSS vulnerabilities',
|
|
'Check for SQL injection vulnerabilities',
|
|
'Review authentication for security issues',
|
|
],
|
|
},
|
|
debugger: {
|
|
role: 'Bug hunter who fixes errors and traces issues',
|
|
claudeCodeContext: 'Uses Read, Grep, Bash tools to trace issues and Edit to fix bugs',
|
|
keywords: ['debug', 'fix', 'bug', 'error', 'exception', 'crash', 'trace', 'issue'],
|
|
examples: [
|
|
'Fix the null pointer exception in login',
|
|
'Debug the memory leak in WebSocket handler',
|
|
'Trace the source of the intermittent error',
|
|
],
|
|
},
|
|
documenter: {
|
|
role: 'Technical writer who creates documentation',
|
|
claudeCodeContext: 'Uses Write, Edit tools to create and update documentation files',
|
|
keywords: ['document', 'jsdoc', 'readme', 'comment', 'explain', 'describe'],
|
|
examples: [
|
|
'Write JSDoc comments for utility functions',
|
|
'Create README for the new package',
|
|
'Document the API endpoints',
|
|
],
|
|
},
|
|
refactorer: {
|
|
role: 'Code modernizer who restructures without changing behavior',
|
|
claudeCodeContext: 'Uses Edit, MultiEdit tools to restructure code across files',
|
|
keywords: ['refactor', 'restructure', 'modernize', 'extract', 'consolidate', 'simplify'],
|
|
examples: [
|
|
'Refactor the payment module to async/await',
|
|
'Restructure the utils folder',
|
|
'Extract common logic into shared module',
|
|
],
|
|
},
|
|
optimizer: {
|
|
role: 'Performance engineer who speeds up slow code',
|
|
claudeCodeContext: 'Uses Bash to run profilers and Edit to optimize code',
|
|
keywords: ['optimize', 'performance', 'speed', 'cache', 'latency', 'slow', 'fast'],
|
|
examples: [
|
|
'Optimize the database queries for dashboard',
|
|
'Cache the frequently accessed user data',
|
|
'Improve the API response time',
|
|
],
|
|
},
|
|
devops: {
|
|
role: 'DevOps engineer who manages deployment and infrastructure',
|
|
claudeCodeContext: 'Uses Bash for deployment commands and Write for config files',
|
|
keywords: ['deploy', 'ci/cd', 'kubernetes', 'docker', 'pipeline', 'infrastructure'],
|
|
examples: [
|
|
'Set up the CI/CD pipeline',
|
|
'Configure Kubernetes deployment',
|
|
'Deploy to production',
|
|
],
|
|
},
|
|
'api-docs': {
|
|
role: 'API documentation specialist who creates specs',
|
|
claudeCodeContext: 'Uses Write to generate OpenAPI/Swagger specs',
|
|
keywords: ['openapi', 'swagger', 'api spec', 'endpoint', 'rest api', 'graphql'],
|
|
examples: [
|
|
'Generate OpenAPI documentation for REST API',
|
|
'Create Swagger spec for the endpoints',
|
|
'Document the API request/response formats',
|
|
],
|
|
},
|
|
planner: {
|
|
role: 'Project planner who organizes and schedules work',
|
|
claudeCodeContext: 'Uses TodoWrite tool to create and manage task lists',
|
|
keywords: ['plan', 'sprint', 'roadmap', 'milestone', 'estimate', 'schedule', 'prioritize'],
|
|
examples: [
|
|
'Create a sprint plan for next two weeks',
|
|
'Estimate the feature implementation effort',
|
|
'Plan the roadmap for Q3',
|
|
],
|
|
},
|
|
};
|
|
|
|
// Prompt template for synthetic data generation
|
|
const GENERATION_PROMPT = `You are generating training data for an AI agent routing system used in Claude Code (an AI coding assistant).
|
|
|
|
## Task
|
|
Generate ${EXAMPLES_PER_AGENT} diverse, realistic task descriptions that would be routed to the "${'{AGENT}'}" agent.
|
|
|
|
## Agent Description
|
|
Role: {ROLE}
|
|
Claude Code Context: {CONTEXT}
|
|
Key Indicators: {KEYWORDS}
|
|
|
|
## Requirements
|
|
1. Each task should be a realistic software engineering task
|
|
2. Tasks should clearly indicate the agent type through action verbs and context
|
|
3. Include variety in:
|
|
- Programming languages (TypeScript, Python, Rust, Go, etc.)
|
|
- Frameworks (React, Vue, Express, Django, etc.)
|
|
- Domains (web, mobile, backend, data, ML, etc.)
|
|
- Complexity levels (simple to complex)
|
|
4. Tasks should be 5-20 words, clear and actionable
|
|
5. Include edge cases that might be confused with other agents
|
|
|
|
## Examples for this agent
|
|
{EXAMPLES}
|
|
|
|
## Output Format
|
|
Return a JSON array of objects with this structure:
|
|
[
|
|
{
|
|
"task": "The task description",
|
|
"quality": 0.8-1.0,
|
|
"difficulty": "easy|medium|hard",
|
|
"tags": ["relevant", "tags"]
|
|
}
|
|
]
|
|
|
|
Generate exactly ${EXAMPLES_PER_AGENT} unique tasks. Be creative and diverse.`;
|
|
|
|
// Prompt for hard negatives
|
|
const HARD_NEGATIVE_PROMPT = `You are generating hard negative examples for contrastive learning in an AI agent routing system.
|
|
|
|
## Context
|
|
We have an agent called "${'{AGENT}'}" with this role: {ROLE}
|
|
|
|
We need tasks that SEEM like they might belong to this agent but actually belong to OTHER agents.
|
|
These are "hard negatives" - confusing examples that help the model learn better boundaries.
|
|
|
|
## Confusable Agents
|
|
{CONFUSABLE_AGENTS}
|
|
|
|
## Requirements
|
|
1. Generate ${HARD_NEGATIVES_PER_AGENT} tasks that might be confused with "${'{AGENT}'}"
|
|
2. Each task should actually belong to a DIFFERENT agent
|
|
3. The confusion should be subtle but clear upon reflection
|
|
4. Include the correct agent label
|
|
|
|
## Output Format
|
|
[
|
|
{
|
|
"task": "The confusing task description",
|
|
"appears_to_be": "${'{AGENT}'}",
|
|
"actually_is": "the_correct_agent",
|
|
"confusion_reason": "Why this might be confused"
|
|
}
|
|
]`;
|
|
|
|
/**
|
|
* Initialize Gemini client
|
|
*/
|
|
function getGeminiClient() {
|
|
const apiKey = process.env.GEMINI_API_KEY;
|
|
if (!apiKey) {
|
|
console.error('GEMINI_API_KEY environment variable required');
|
|
console.error('Set it with: export GEMINI_API_KEY=your_key');
|
|
process.exit(1);
|
|
}
|
|
return new GoogleGenerativeAI(apiKey);
|
|
}
|
|
|
|
/**
|
|
* Generate training data for an agent using Gemini
|
|
*/
|
|
async function generateAgentData(client, agent, agentConfig) {
|
|
console.log(` Generating data for ${agent}...`);
|
|
|
|
const prompt = GENERATION_PROMPT
|
|
.replace(/\{AGENT\}/g, agent)
|
|
.replace('{ROLE}', agentConfig.role)
|
|
.replace('{CONTEXT}', agentConfig.claudeCodeContext)
|
|
.replace('{KEYWORDS}', agentConfig.keywords.join(', '))
|
|
.replace('{EXAMPLES}', agentConfig.examples.map(e => `- ${e}`).join('\n'));
|
|
|
|
try {
|
|
const model = client.getGenerativeModel({ model: 'gemini-2.0-flash-exp' });
|
|
const result = await model.generateContent(prompt);
|
|
const response = result.response.text();
|
|
|
|
// Extract JSON from response
|
|
const jsonMatch = response.match(/\[[\s\S]*\]/);
|
|
if (!jsonMatch) {
|
|
console.error(` Failed to parse JSON for ${agent}`);
|
|
return [];
|
|
}
|
|
|
|
const data = JSON.parse(jsonMatch[0]);
|
|
console.log(` Generated ${data.length} examples for ${agent}`);
|
|
|
|
return data.map(item => ({
|
|
...item,
|
|
agent,
|
|
type: 'positive',
|
|
}));
|
|
} catch (error) {
|
|
console.error(` Error generating data for ${agent}: ${error.message}`);
|
|
return [];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate hard negatives for an agent
|
|
*/
|
|
async function generateHardNegatives(client, agent, agentConfig, allAgents) {
|
|
console.log(` Generating hard negatives for ${agent}...`);
|
|
|
|
// Find confusable agents
|
|
const confusableAgents = Object.entries(allAgents)
|
|
.filter(([name]) => name !== agent)
|
|
.map(([name, config]) => `- ${name}: ${config.role}`)
|
|
.join('\n');
|
|
|
|
const prompt = HARD_NEGATIVE_PROMPT
|
|
.replace(/\{AGENT\}/g, agent)
|
|
.replace('{ROLE}', agentConfig.role)
|
|
.replace('{CONFUSABLE_AGENTS}', confusableAgents);
|
|
|
|
try {
|
|
const model = client.getGenerativeModel({ model: 'gemini-2.0-flash-exp' });
|
|
const result = await model.generateContent(prompt);
|
|
const response = result.response.text();
|
|
|
|
const jsonMatch = response.match(/\[[\s\S]*\]/);
|
|
if (!jsonMatch) {
|
|
console.error(` Failed to parse hard negatives for ${agent}`);
|
|
return [];
|
|
}
|
|
|
|
const data = JSON.parse(jsonMatch[0]);
|
|
console.log(` Generated ${data.length} hard negatives for ${agent}`);
|
|
|
|
return data.map(item => ({
|
|
task: item.task,
|
|
agent: item.actually_is,
|
|
confusing_with: agent,
|
|
confusion_reason: item.confusion_reason,
|
|
type: 'hard_negative',
|
|
quality: 1.0,
|
|
}));
|
|
} catch (error) {
|
|
console.error(` Error generating hard negatives for ${agent}: ${error.message}`);
|
|
return [];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Main generation pipeline
|
|
*/
|
|
async function main() {
|
|
console.log('\n╔═══════════════════════════════════════════════════════════════════════════════════╗');
|
|
console.log('║ CLAUDE CODE SYNTHETIC TRAINING DATA GENERATOR ║');
|
|
console.log('║ Using @ruvector/agentic-synth ║');
|
|
console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');
|
|
|
|
// Check for API key
|
|
if (!process.env.GEMINI_API_KEY) {
|
|
console.log('GEMINI_API_KEY not set. Generating static dataset from templates...\n');
|
|
generateStaticDataset();
|
|
return;
|
|
}
|
|
|
|
const client = getGeminiClient();
|
|
|
|
// Create output directory
|
|
if (!existsSync(OUTPUT_DIR)) {
|
|
mkdirSync(OUTPUT_DIR, { recursive: true });
|
|
}
|
|
|
|
const allData = [];
|
|
const allHardNegatives = [];
|
|
const agents = Object.keys(CLAUDE_CODE_AGENTS);
|
|
|
|
console.log('─────────────────────────────────────────────────────────────────');
|
|
console.log(' GENERATING POSITIVE EXAMPLES');
|
|
console.log('─────────────────────────────────────────────────────────────────\n');
|
|
|
|
// Generate positive examples for each agent
|
|
for (const agent of agents) {
|
|
const data = await generateAgentData(client, agent, CLAUDE_CODE_AGENTS[agent]);
|
|
allData.push(...data);
|
|
|
|
// Rate limit
|
|
await new Promise(resolve => setTimeout(resolve, 1000));
|
|
}
|
|
|
|
console.log('\n─────────────────────────────────────────────────────────────────');
|
|
console.log(' GENERATING HARD NEGATIVES');
|
|
console.log('─────────────────────────────────────────────────────────────────\n');
|
|
|
|
// Generate hard negatives
|
|
for (const agent of agents) {
|
|
const negatives = await generateHardNegatives(client, agent, CLAUDE_CODE_AGENTS[agent], CLAUDE_CODE_AGENTS);
|
|
allHardNegatives.push(...negatives);
|
|
|
|
// Rate limit
|
|
await new Promise(resolve => setTimeout(resolve, 1000));
|
|
}
|
|
|
|
// Combine and save
|
|
const fullDataset = [...allData, ...allHardNegatives];
|
|
|
|
// Save full dataset
|
|
const outputPath = join(OUTPUT_DIR, 'claude-code-routing-dataset.json');
|
|
writeFileSync(outputPath, JSON.stringify(fullDataset, null, 2));
|
|
|
|
// Save training pairs (for contrastive learning)
|
|
const contrastivePairs = generateContrastivePairs(allData, allHardNegatives);
|
|
const pairsPath = join(OUTPUT_DIR, 'contrastive-pairs.json');
|
|
writeFileSync(pairsPath, JSON.stringify(contrastivePairs, null, 2));
|
|
|
|
// Print summary
|
|
console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
|
|
console.log(' GENERATION COMPLETE');
|
|
console.log('═══════════════════════════════════════════════════════════════════════════════════\n');
|
|
|
|
console.log(` Positive examples: ${allData.length}`);
|
|
console.log(` Hard negatives: ${allHardNegatives.length}`);
|
|
console.log(` Contrastive pairs: ${contrastivePairs.length}`);
|
|
console.log(` Total dataset size: ${fullDataset.length}`);
|
|
console.log(`\n Output files:`);
|
|
console.log(` ${outputPath}`);
|
|
console.log(` ${pairsPath}`);
|
|
console.log('');
|
|
}
|
|
|
|
/**
|
|
* Generate contrastive pairs from data
|
|
*/
|
|
function generateContrastivePairs(positives, negatives) {
|
|
const pairs = [];
|
|
|
|
// Group positives by agent
|
|
const byAgent = {};
|
|
for (const item of positives) {
|
|
if (!byAgent[item.agent]) byAgent[item.agent] = [];
|
|
byAgent[item.agent].push(item);
|
|
}
|
|
|
|
// Create positive pairs (same agent)
|
|
for (const [agent, items] of Object.entries(byAgent)) {
|
|
for (let i = 0; i < items.length - 1; i++) {
|
|
for (let j = i + 1; j < Math.min(i + 3, items.length); j++) {
|
|
pairs.push({
|
|
anchor: items[i].task,
|
|
positive: items[j].task,
|
|
agent,
|
|
type: 'positive_pair',
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create negative pairs (different agents)
|
|
const agents = Object.keys(byAgent);
|
|
for (let i = 0; i < agents.length; i++) {
|
|
for (let j = i + 1; j < agents.length; j++) {
|
|
const agent1Items = byAgent[agents[i]];
|
|
const agent2Items = byAgent[agents[j]];
|
|
|
|
if (agent1Items && agent1Items[0] && agent2Items && agent2Items[0]) {
|
|
pairs.push({
|
|
anchor: agent1Items[0].task,
|
|
negative: agent2Items[0].task,
|
|
anchor_agent: agents[i],
|
|
negative_agent: agents[j],
|
|
type: 'negative_pair',
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Add hard negative pairs
|
|
for (const neg of negatives) {
|
|
const confusingAgent = byAgent[neg.confusing_with];
|
|
if (confusingAgent && confusingAgent[0]) {
|
|
pairs.push({
|
|
anchor: confusingAgent[0].task,
|
|
negative: neg.task,
|
|
anchor_agent: neg.confusing_with,
|
|
negative_agent: neg.agent,
|
|
type: 'hard_negative_pair',
|
|
confusion_reason: neg.confusion_reason,
|
|
});
|
|
}
|
|
}
|
|
|
|
return pairs;
|
|
}
|
|
|
|
/**
|
|
* Generate static dataset without API (fallback)
|
|
*/
|
|
function generateStaticDataset() {
|
|
console.log('Generating static dataset from routing-dataset.js...\n');
|
|
|
|
// Import the static dataset
|
|
const { generateTrainingDataset, generateContrastivePairs, getDatasetStats } = require('./routing-dataset.js');
|
|
|
|
const dataset = generateTrainingDataset();
|
|
const pairs = generateContrastivePairs();
|
|
const stats = getDatasetStats();
|
|
|
|
// Create output directory
|
|
if (!existsSync(OUTPUT_DIR)) {
|
|
mkdirSync(OUTPUT_DIR, { recursive: true });
|
|
}
|
|
|
|
// Save dataset
|
|
const datasetPath = join(OUTPUT_DIR, 'claude-code-routing-dataset.json');
|
|
writeFileSync(datasetPath, JSON.stringify(dataset, null, 2));
|
|
|
|
const pairsPath = join(OUTPUT_DIR, 'contrastive-pairs.json');
|
|
writeFileSync(pairsPath, JSON.stringify(pairs, null, 2));
|
|
|
|
console.log('═══════════════════════════════════════════════════════════════');
|
|
console.log(' STATIC DATASET GENERATED');
|
|
console.log('═══════════════════════════════════════════════════════════════\n');
|
|
|
|
console.log(` Total examples: ${stats.totalExamples}`);
|
|
console.log(` Contrastive pairs: ${stats.contrastivePairs}`);
|
|
console.log(` Agent types: ${stats.agents.length}`);
|
|
console.log(`\n Output files:`);
|
|
console.log(` ${datasetPath}`);
|
|
console.log(` ${pairsPath}`);
|
|
console.log('\n To generate more data with AI, set GEMINI_API_KEY');
|
|
console.log('');
|
|
}
|
|
|
|
main().catch(console.error);
|