Files
wifi-densepose/npm/packages/ruvllm/scripts/training/claude-hard-negatives.js
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

223 lines
9.4 KiB
JavaScript

#!/usr/bin/env node
/**
* Claude-Powered Hard Negative Generator for SOTA Agent Routing
*
* Uses Claude Opus 4.5 to generate high-quality confusing triplets
* that push embedding-only accuracy toward 100%.
*/
const fs = require('fs');
const path = require('path');
require('dotenv').config({ path: path.resolve(__dirname, '../../../../../.env') });
// Pull the Anthropic credential out of the environment and stop immediately
// when it is absent: every request this script makes needs it.
const { ANTHROPIC_API_KEY } = process.env;
if (!ANTHROPIC_API_KEY) {
  console.error('Error: ANTHROPIC_API_KEY not found in .env');
  process.exit(1);
}
/**
 * Routing targets: each key is an agent type, each value a short summary of
 * the work that agent handles. The summaries are interpolated into prompts.
 */
const AGENTS = {
  'coder': 'Implements code, builds features, writes functions',
  'researcher': 'Investigates problems, explores documentation, gathers information',
  'reviewer': 'Reviews pull requests, checks code quality, suggests improvements',
  'tester': 'Writes tests, validates behavior, ensures coverage',
  'architect': 'Designs systems, creates schemas, plans architecture',
  'security-architect': 'Audits for vulnerabilities, checks security, reviews auth',
  'debugger': 'Fixes bugs, traces errors, diagnoses issues',
  'documenter': 'Writes documentation, adds comments, creates READMEs',
  'refactorer': 'Refactors code, modernizes patterns, improves structure',
  'optimizer': 'Optimizes performance, adds caching, improves speed',
  'devops': 'Deploys apps, sets up CI/CD, manages infrastructure',
  'api-docs': 'Generates OpenAPI specs, documents endpoints, creates Swagger',
  'planner': 'Creates sprint plans, estimates timelines, prioritizes tasks',
};
/**
 * Pairs of agent types whose responsibilities overlap enough to be mistaken
 * for one another. One batch of hard negatives is requested per pair.
 */
const CONFUSING_PAIRS = [
  // Both modify code
  ['coder', 'refactorer'],
  // Both do analysis
  ['researcher', 'architect'],
  // Both validate
  ['reviewer', 'tester'],
  // Both fix issues
  ['debugger', 'optimizer'],
  // Both write docs
  ['documenter', 'api-docs'],
  // Both plan
  ['architect', 'planner'],
  // Both check code
  ['security-architect', 'reviewer'],
  // Both write/fix code
  ['coder', 'debugger'],
  // Both find problems
  ['tester', 'debugger'],
  // Both improve systems
  ['optimizer', 'architect'],
];
/**
 * Sends one user prompt to the Anthropic Messages API and returns the reply text.
 *
 * @param {string} prompt - Text sent as the only user message.
 * @returns {Promise<string>} The reply text; when the response carries several
 *   text blocks they are concatenated in order.
 * @throws {Error} When the HTTP status is not OK, or when the response body
 *   contains no text content block.
 */
async function callClaude(prompt) {
  const response = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': ANTHROPIC_API_KEY,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: 'claude-opus-4-5-20251101',
      max_tokens: 4096,
      messages: [{
        role: 'user',
        content: prompt,
      }],
    }),
  });

  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Claude API error: ${response.status} - ${error}`);
  }

  const data = await response.json();

  // Do not assume content[0] exists or is a text block: an empty or missing
  // array would otherwise surface as an opaque TypeError, and a non-text first
  // block would make this function return undefined.
  const blocks = Array.isArray(data?.content) ? data.content : [];
  const textBlocks = blocks.filter(
    (block) => block?.type === 'text' && typeof block.text === 'string'
  );
  if (textBlocks.length === 0) {
    throw new Error(
      `Claude API error: response contained no text content (stop_reason: ${data?.stop_reason ?? 'unknown'})`
    );
  }
  return textBlocks.map((block) => block.text).join('');
}
/**
 * Asks Claude for task descriptions that are ambiguous between two agent types
 * and returns the usable ones as training triplets.
 *
 * @param {[string, string]} pair - Two agent type names (keys of AGENTS).
 * @param {number} [count=10] - Number of examples requested in the prompt.
 * @returns {Promise<Array<{anchor: string, positive: string, negative: string, isHard: boolean}>>}
 *   Validated triplets; may hold fewer than `count` entries when reply lines
 *   are malformed or fail validation.
 * @throws {Error} Whatever callClaude rejects with.
 */
async function generateHardNegatives(pair, count = 10) {
  const [agent1, agent2] = pair;
  const prompt = `You are helping train an AI routing model. Generate ${count} task descriptions that are AMBIGUOUS between "${agent1}" and "${agent2}" agents.
Agent descriptions:
- ${agent1}: ${AGENTS[agent1]}
- ${agent2}: ${AGENTS[agent2]}
Generate tasks that could reasonably be assigned to either agent but have a subtle preference for one.
Format each line as JSON:
{"anchor": "task description", "positive": "correct_agent", "negative": "wrong_agent", "isHard": true, "reason": "why this is confusing"}
Requirements:
1. Tasks should be realistic software development scenarios
2. The distinction should be subtle but learnable
3. Include edge cases and ambiguous wording
4. Mix which agent is the positive/negative
Generate exactly ${count} examples, one per line:`;
  const response = await callClaude(prompt);
  return parseHardNegativeLines(response);
}

/**
 * Extracts validated triplets from a reply holding one JSON object per line.
 * Lines that are not JSON objects, or whose fields fail validation, are skipped.
 */
function parseHardNegativeLines(response) {
  const isKnownAgent = (name) => Object.hasOwn(AGENTS, name);
  const triplets = [];

  for (const rawLine of String(response).split('\n')) {
    // Tolerate surrounding whitespace and an array-style trailing comma.
    const line = rawLine.trim().replace(/,$/, '');
    if (!line.startsWith('{')) {
      continue;
    }

    let candidate;
    try {
      candidate = JSON.parse(line);
    } catch {
      // Best effort: a malformed line should not discard the rest of the batch.
      continue;
    }

    const { anchor, positive, negative } = candidate;
    if (
      typeof anchor !== 'string' ||
      typeof positive !== 'string' ||
      typeof negative !== 'string'
    ) {
      continue;
    }

    const task = anchor.trim();
    const correct = positive.trim();
    const wrong = negative.trim();

    // Reject labels that are not real agent types (for example the
    // "correct_agent" / "wrong_agent" placeholders from the format example)
    // and degenerate triplets whose positive and negative are the same.
    if (task === '' || !isKnownAgent(correct) || !isKnownAgent(wrong) || correct === wrong) {
      continue;
    }

    triplets.push({
      anchor: task,
      positive: correct,
      negative: wrong,
      isHard: true,
    });
  }

  return triplets;
}
/**
 * Has Claude act as judge for a sample of tasks: it is asked which agent
 * should handle each one, and its raw answer is printed and returned.
 *
 * @param {Array<{anchor: string}>} triplets - Triplets to sample from; only
 *   the first ten anchors are included in the prompt.
 * @param {string} [model='keyword-first'] - Accepted for interface
 *   compatibility; not read by this function.
 * @returns {Promise<string>} Claude's unparsed reply.
 */
async function evaluateWithGRPO(triplets, model = 'keyword-first') {
  const agentNames = Object.keys(AGENTS).join(', ');
  const numberedTasks = triplets
    .slice(0, 10)
    .map((triplet, index) => `${index + 1}. "${triplet.anchor}"`)
    .join('\n');

  // Assemble the judge prompt one line at a time.
  const promptLines = [
    'You are evaluating an AI agent router. For each task, determine which agent should handle it.',
    `Agents: ${agentNames}`,
    'Tasks to evaluate:',
    numberedTasks,
    'For each task, respond with the agent name that should handle it and your confidence (0-1).',
    'Format: 1. agent_name (0.95)',
  ];

  const verdict = await callClaude(promptLines.join('\n'));
  console.log('\nGRPO Evaluation (Claude as judge):');
  console.log(verdict);
  return verdict;
}
/**
 * CLI entry point: generates hard-negative triplets for every confusing agent
 * pair, writes them to a JSONL file, optionally runs the Claude-as-judge
 * evaluation, and prints a sample plus follow-up instructions.
 *
 * Recognized arguments:
 *   --output=PATH  Destination file (default: ~/.ruvllm/training/claude-hard-negatives.jsonl)
 *   --count=N      Triplets requested per pair, a positive integer (default: 5)
 *   --grpo         Also run the evaluation step
 *
 * @returns {Promise<void>}
 * @throws {Error} When --count is not a positive integer, or when writing the
 *   output file fails. Per-pair generation errors are logged and skipped.
 */
async function main() {
  // Required locally so this function does not depend on process.env.HOME,
  // which is not set on every platform.
  const os = require('os');

  console.log('╔═══════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ Claude-Powered Hard Negative Generator for SOTA Agent Routing ║');
  console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');

  const args = process.argv.slice(2);

  // Returns everything after "--name=", so values that contain "=" stay intact.
  const readFlag = (name) => {
    const prefix = `--${name}=`;
    const match = args.find((arg) => arg.startsWith(prefix));
    return match === undefined ? undefined : match.slice(prefix.length);
  };

  const outputPath = readFlag('output')
    || path.join(os.homedir(), '.ruvllm/training/claude-hard-negatives.jsonl');

  const rawCount = readFlag('count') || '5';
  const tripletCount = Number.parseInt(rawCount, 10);
  if (!Number.isInteger(tripletCount) || tripletCount < 1) {
    throw new Error(`Invalid --count value "${rawCount}": expected a positive integer`);
  }

  const doGRPO = args.includes('--grpo');

  console.log(`Configuration:`);
  console.log(` Output: ${outputPath}`);
  console.log(` Triplets per pair: ${tripletCount}`);
  console.log(` Confusing pairs: ${CONFUSING_PAIRS.length}`);
  console.log(` Total expected: ~${CONFUSING_PAIRS.length * tripletCount} triplets`);
  console.log(` GRPO evaluation: ${doGRPO}`);
  console.log();

  const allTriplets = [];
  console.log('Generating hard negatives using Claude Opus 4.5...\n');
  for (const pair of CONFUSING_PAIRS) {
    console.log(` Generating for ${pair[0]} vs ${pair[1]}...`);
    try {
      const triplets = await generateHardNegatives(pair, tripletCount);
      allTriplets.push(...triplets);
      console.log(` ✓ Generated ${triplets.length} triplets`);
    } catch (error) {
      // One failed pair should not abort the remaining pairs.
      console.log(` ✗ Error: ${error.message}`);
    }
    // Rate limiting - wait between requests
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
  console.log(`\nTotal triplets generated: ${allTriplets.length}`);

  // Save triplets. With `recursive: true`, mkdirSync succeeds when the
  // directory already exists, so no separate existence check is needed.
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  // Terminate every record with a newline so the file can be concatenated
  // with other JSONL files (see NEXT STEPS) without fusing two records.
  const jsonl = allTriplets.map((triplet) => `${JSON.stringify(triplet)}\n`).join('');
  fs.writeFileSync(outputPath, jsonl);
  console.log(`Saved to: ${outputPath}`);

  // Optional GRPO evaluation
  if (doGRPO && allTriplets.length > 0) {
    console.log('\n─────────────────────────────────────────────────────────────────');
    console.log(' GRPO EVALUATION');
    console.log('─────────────────────────────────────────────────────────────────\n');
    await evaluateWithGRPO(allTriplets);
  }

  // Show sample
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' SAMPLE TRIPLETS');
  console.log('─────────────────────────────────────────────────────────────────\n');
  for (const triplet of allTriplets.slice(0, 5)) {
    console.log(` Task: "${triplet.anchor}"`);
    console.log(` → Correct: ${triplet.positive}, Wrong: ${triplet.negative}`);
    console.log();
  }

  console.log('═══════════════════════════════════════════════════════════════════════════════════');
  console.log(' NEXT STEPS');
  console.log('═══════════════════════════════════════════════════════════════════════════════════\n');
  console.log('1. Merge with existing triplets:');
  console.log(` cat ~/.ruvllm/training/ruvltra-finetuned/triplets.jsonl ${outputPath} > combined.jsonl`);
  console.log('\n2. Run training with enhanced data:');
  console.log(' cargo run --example train_contrastive --release -- --triplets combined.jsonl --epochs 30');
  console.log('\n3. Benchmark embedding-only accuracy improvement');
  console.log();
}
// Run the CLI. On an unhandled failure, print the error and set a non-zero
// exit code so shells and CI pipelines can detect that the run failed.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});