git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
288 lines
14 KiB
JavaScript
288 lines
14 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Ensemble Model Comparison
 *
 * Strategies:
 * 1. Task prefix - prepend context to make tasks more aligned with descriptions
 * 2. Ensemble voting - combine multiple description variants
 * 3. Agent-specific thresholds based on training patterns
 *    (listed as a strategy; no threshold logic appears in this script)
 */
|
|
|
|
const { execFileSync, execSync } = require('child_process');
const { existsSync } = require('fs');
const { homedir } = require('os');
const { join } = require('path');
|
|
|
|
// Directory under the user's home where the GGUF model files are looked up.
const MODELS_DIR = join(homedir(), '.ruvllm', 'models');
// Model under evaluation; main() exits when this file is missing.
const RUVLTRA_MODEL = join(MODELS_DIR, 'ruvltra-claude-code-0.5b-q4_k_m.gguf');
// Comparison model used for the Qwen columns of the results table.
const QWEN_MODEL = join(MODELS_DIR, 'qwen2.5-0.5b-instruct-q4_k_m.gguf');
|
|
|
|
// Original V1 descriptions (best baseline).
// Maps each agent name to a space-separated keyword list; each value is
// embedded as-is and compared against the task embedding.
const DESCRIPTIONS_V1 = {
  coder: 'implement create write build add code function class component feature',
  researcher: 'research find investigate analyze explore search discover examine',
  reviewer: 'review check evaluate assess inspect examine code quality',
  tester: 'test unit integration e2e coverage mock assertion spec',
  architect: 'design architecture schema system structure plan database',
  'security-architect': 'security vulnerability xss injection audit cve authentication',
  debugger: 'debug fix bug error issue broken crash exception trace',
  documenter: 'document readme jsdoc comment explain describe documentation',
  refactorer: 'refactor extract rename consolidate clean restructure simplify',
  optimizer: 'optimize performance slow fast cache speed memory latency',
  devops: 'deploy ci cd kubernetes docker pipeline container infrastructure',
  'api-docs': 'openapi swagger api documentation graphql schema endpoint',
  planner: 'plan estimate prioritize sprint roadmap schedule milestone',
};
|
|
|
|
// V6: Keywords reformulated as action phrases.
// Same agent keys as DESCRIPTIONS_V1; used as the second voter in the
// V1 + V6 ensemble.
const DESCRIPTIONS_V6 = {
  coder: 'implement new functionality write code build features create components',
  researcher: 'research and analyze investigate patterns explore best practices',
  reviewer: 'review code quality check pull requests evaluate implementations',
  tester: 'write tests create test coverage add unit and integration tests',
  architect: 'design system architecture plan database schemas structure systems',
  'security-architect': 'audit security vulnerabilities check xss and injection attacks',
  debugger: 'debug and fix bugs trace errors resolve exceptions',
  documenter: 'write documentation add jsdoc comments create readme files',
  refactorer: 'refactor code modernize to async await restructure modules',
  optimizer: 'optimize performance improve speed cache data reduce latency',
  devops: 'deploy to cloud setup ci cd pipelines manage containers kubernetes',
  'api-docs': 'generate openapi documentation create swagger api specs',
  planner: 'plan sprints create roadmaps estimate timelines schedule milestones',
};
|
|
|
|
// Task prefixes to try. Each one is prepended verbatim to the task text
// before it is embedded; the empty string is the no-prefix baseline.
const TASK_PREFIXES = [
  '', // No prefix (baseline)
  'Task: ', // Simple task prefix
  'The developer needs to: ', // Contextual prefix
  'Claude Code task - ', // Model-specific prefix
];
|
|
|
|
// Labelled routing cases: `task` is the text that gets embedded, `expected`
// is the agent key the router should pick for it.
const ROUTING_TESTS = [
  { task: 'Implement a binary search function in TypeScript', expected: 'coder' },
  { task: 'Write unit tests for the authentication module', expected: 'tester' },
  { task: 'Review the pull request for security vulnerabilities', expected: 'reviewer' },
  { task: 'Research best practices for React state management', expected: 'researcher' },
  { task: 'Design the database schema for user profiles', expected: 'architect' },
  { task: 'Fix the null pointer exception in the login handler', expected: 'debugger' },
  { task: 'Audit the API endpoints for XSS vulnerabilities', expected: 'security-architect' },
  { task: 'Write JSDoc comments for the utility functions', expected: 'documenter' },
  { task: 'Refactor the payment module to use async/await', expected: 'refactorer' },
  { task: 'Optimize the database queries for the dashboard', expected: 'optimizer' },
  { task: 'Set up the CI/CD pipeline for the microservices', expected: 'devops' },
  { task: 'Generate OpenAPI documentation for the REST API', expected: 'api-docs' },
  { task: 'Create a sprint plan for the next two weeks', expected: 'planner' },
  { task: 'Build a React component for user registration', expected: 'coder' },
  { task: 'Debug memory leak in the WebSocket handler', expected: 'debugger' },
  { task: 'Investigate slow API response times', expected: 'researcher' },
  { task: 'Check code for potential race conditions', expected: 'reviewer' },
  { task: 'Add integration tests for the payment gateway', expected: 'tester' },
  { task: 'Plan the architecture for real-time notifications', expected: 'architect' },
  { task: 'Cache the frequently accessed user data', expected: 'optimizer' },
];
|
|
|
|
/**
 * Computes an embedding for `text` by running the `llama-embedding` CLI.
 *
 * The command is executed directly with an argument vector (no shell), so
 * characters such as `$`, backticks, quotes or backslashes in the prompt or
 * the model path are passed through literally instead of being interpreted.
 *
 * @param {string} modelPath - Path to the GGUF model handed to `-m`.
 * @param {string} text - Prompt to embed; newlines are replaced by spaces.
 * @returns {number[]|null} The `embedding` of the last entry in the JSON
 *   output's `data` array, or `null` when the command fails or its output
 *   cannot be parsed.
 */
function getEmbedding(modelPath, text) {
  try {
    const prompt = text.replace(/\n/g, ' ');
    const result = execFileSync(
      'llama-embedding',
      ['-m', modelPath, '-p', prompt, '--embd-output-format', 'json'],
      {
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024,
        // Discard stderr (replaces the former `2>/dev/null` redirection).
        stdio: ['ignore', 'pipe', 'ignore'],
      }
    );
    const json = JSON.parse(result);
    return json.data[json.data.length - 1].embedding;
  } catch {
    // Best-effort by design: callers treat `null` as "no embedding available".
    return null;
  }
}
|
|
|
|
/**
 * Cosine similarity between two numeric vectors.
 *
 * @param {number[]|null|undefined} a - First vector.
 * @param {number[]|null|undefined} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|). Returns 0 when either vector is
 *   missing or the lengths differ, and 0 when either vector has zero norm
 *   (the denominator falls back to 1 in that case).
 */
function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  // `|| 1` avoids dividing by zero when one of the vectors is all zeros.
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
}
|
|
|
|
/**
 * Picks the agent whose description embedding is most similar to the task.
 *
 * @param {number[]|null} taskEmbedding - Embedding of the task text.
 * @param {Object<string, number[]|null>} agentEmbeddings - Agent name to
 *   description embedding.
 * @returns {{agent: string, confidence: number, scores: Object<string, number>}}
 *   The winning agent, its similarity, and the similarity of every agent.
 *   With no agents the result is `'coder'` with confidence -1; on equal
 *   scores the agent listed first wins.
 */
function routeTask(taskEmbedding, agentEmbeddings) {
  let bestAgent = 'coder';
  let bestSim = -1;
  const allScores = {};

  for (const [agent, emb] of Object.entries(agentEmbeddings)) {
    const sim = cosineSimilarity(taskEmbedding, emb);
    allScores[agent] = sim;
    if (sim > bestSim) {
      bestSim = sim;
      bestAgent = agent;
    }
  }

  return { agent: bestAgent, confidence: bestSim, scores: allScores };
}
|
|
|
|
/**
 * Ensemble routing - vote across multiple description sets.
 *
 * Every embedding set casts one vote for its best-matching agent. The agent
 * with the most votes wins; when several agents have the same number of
 * votes, the one whose winning votes carry the highest summed similarity is
 * chosen (with only two sets every disagreement is a tie, so this decides
 * the outcome by similarity rather than by key order).
 *
 * @param {number[]|null} taskEmbedding - Embedding of the task text.
 * @param {Array<Object<string, number[]|null>>} allAgentEmbeddings - One
 *   agent-to-embedding map per description set.
 * @returns {{agent: string, votes: Object<string, number>, voteCount: number}}
 *   Winning agent, the per-agent vote tally, and the winner's vote count.
 *   With no embedding sets the result is `'coder'` with zero votes.
 */
function routeTaskEnsemble(taskEmbedding, allAgentEmbeddings) {
  const votes = {};
  const confidenceTotals = {};

  // Seed the tally from the first set so agents with no votes still appear.
  const [firstSet = {}] = allAgentEmbeddings;
  for (const agent of Object.keys(firstSet)) votes[agent] = 0;

  // Each embedding set votes.
  for (const agentEmbeddings of allAgentEmbeddings) {
    const { agent, confidence } = routeTask(taskEmbedding, agentEmbeddings);
    votes[agent] = (votes[agent] || 0) + 1;
    confidenceTotals[agent] = (confidenceTotals[agent] || 0) + confidence;
  }

  // Most votes wins; ties go to the higher summed similarity.
  let bestAgent = 'coder';
  let maxVotes = 0;
  let bestConfidence = -Infinity;
  for (const [agent, count] of Object.entries(votes)) {
    if (count === 0) continue;
    const total = confidenceTotals[agent];
    if (count > maxVotes || (count === maxVotes && total > bestConfidence)) {
      maxVotes = count;
      bestConfidence = total;
      bestAgent = agent;
    }
  }

  return { agent: bestAgent, votes, voteCount: maxVotes };
}
|
|
|
|
/**
 * Routes every ROUTING_TESTS case with a single description set and scores it.
 *
 * @param {string} modelPath - Model passed to getEmbedding.
 * @param {Object<string, string>} descriptions - Agent name to description.
 * @param {string} [prefix=''] - Text prepended to each task before embedding.
 * @returns {{accuracy: number, correct: number, total: number, failed: number}}
 *   `accuracy` is correct / total (0 for an empty test list). `failed` counts
 *   tasks whose embedding could not be computed; those are scored as misses
 *   instead of being routed, because routing a missing embedding falls back
 *   to a default agent and could be counted as a correct answer.
 */
function runBenchmark(modelPath, descriptions, prefix = '') {
  const agentEmbeddings = {};
  for (const [agent, desc] of Object.entries(descriptions)) {
    agentEmbeddings[agent] = getEmbedding(modelPath, desc);
  }

  let correct = 0;
  let failed = 0;
  for (const test of ROUTING_TESTS) {
    const taskEmb = getEmbedding(modelPath, prefix + test.task);
    if (!taskEmb) {
      failed++;
      continue;
    }
    const { agent } = routeTask(taskEmb, agentEmbeddings);
    if (agent === test.expected) correct++;
  }

  const total = ROUTING_TESTS.length;
  return { accuracy: total ? correct / total : 0, correct, total, failed };
}
|
|
|
|
/**
 * Routes every ROUTING_TESTS case by ensemble voting over several
 * description sets and scores the outcome.
 *
 * @param {string} modelPath - Model passed to getEmbedding.
 * @param {Array<Object<string, string>>} descriptionSets - One agent-to-
 *   description map per voter.
 * @param {string} [prefix=''] - Text prepended to each task before embedding.
 * @returns {{accuracy: number, correct: number, total: number, failed: number,
 *   results: Array<{task: string, expected: string, got: (string|null),
 *   correct: boolean, votes: Object<string, number>}>}}
 *   `accuracy` is correct / total (0 for an empty test list). Tasks whose
 *   embedding could not be computed are counted in `failed` and recorded
 *   with `got: null` and no votes, rather than being routed to a default
 *   agent that might be scored as correct.
 */
function runEnsembleBenchmark(modelPath, descriptionSets, prefix = '') {
  // Precompute embeddings for all description sets.
  const allAgentEmbeddings = descriptionSets.map((descriptions) => {
    const embeds = {};
    for (const [agent, desc] of Object.entries(descriptions)) {
      embeds[agent] = getEmbedding(modelPath, desc);
    }
    return embeds;
  });

  let correct = 0;
  let failed = 0;
  const results = [];
  for (const { task, expected } of ROUTING_TESTS) {
    const taskEmb = getEmbedding(modelPath, prefix + task);
    if (!taskEmb) {
      failed++;
      results.push({ task, expected, got: null, correct: false, votes: {} });
      continue;
    }
    const { agent, votes } = routeTaskEnsemble(taskEmb, allAgentEmbeddings);
    const isCorrect = agent === expected;
    if (isCorrect) correct++;
    results.push({ task, expected, got: agent, correct: isCorrect, votes });
  }

  const total = ROUTING_TESTS.length;
  return { accuracy: total ? correct / total : 0, correct, total, failed, results };
}
|
|
|
|
/**
 * Runs the full comparison and prints a report to stdout:
 *   1. accuracy of each task prefix with the V1 descriptions (RuvLTRA),
 *   2. accuracy of the V1 + V6 voting ensemble (RuvLTRA),
 *   3. the same baseline and ensemble for Qwen, when its model file exists,
 *   4. a summary table, the best score per model, and per-task ensemble details.
 *
 * Exits the process with status 1 when the RuvLTRA model file is missing.
 * A missing Qwen model only skips the Qwen measurements (shown as "-").
 *
 * @returns {Promise<void>}
 */
async function main() {
  const pct = (v) => `${(v * 100).toFixed(1)}%`;
  const fmt = (v) => pct(v).padStart(10);

  console.log('\n╔═══════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ ENSEMBLE & PREFIX MODEL COMPARISON ║');
  console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');

  if (!existsSync(RUVLTRA_MODEL)) {
    console.error('RuvLTRA model not found.');
    process.exit(1);
  }
  // Without this check a missing Qwen file would silently yield meaningless
  // accuracy numbers, because failed embeddings do not raise.
  const hasQwen = existsSync(QWEN_MODEL);

  // Test prefix variations
  console.log('─────────────────────────────────────────────────────────────────');
  console.log(' PREFIX VARIATIONS (RuvLTRA)');
  console.log('─────────────────────────────────────────────────────────────────\n');

  const prefixResults = {};
  for (const prefix of TASK_PREFIXES) {
    const label = prefix || '(no prefix)';
    process.stdout.write(` Testing "${label.padEnd(25)}"... `);
    const result = runBenchmark(RUVLTRA_MODEL, DESCRIPTIONS_V1, prefix);
    prefixResults[label] = result;
    console.log(pct(result.accuracy));
  }

  // Find best prefix. `>=` keeps the earliest entry on ties, so the baseline
  // is reported unless a prefix is strictly better.
  const bestPrefix = Object.entries(prefixResults).reduce((a, b) =>
    a[1].accuracy >= b[1].accuracy ? a : b
  );

  console.log(`\n Best prefix: "${bestPrefix[0]}" = ${pct(bestPrefix[1].accuracy)}`);

  // Test ensemble voting
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' ENSEMBLE VOTING (RuvLTRA)');
  console.log('─────────────────────────────────────────────────────────────────\n');

  process.stdout.write(' Computing V1 + V6 ensemble... ');
  const ensembleResult = runEnsembleBenchmark(RUVLTRA_MODEL, [DESCRIPTIONS_V1, DESCRIPTIONS_V6], '');
  console.log(pct(ensembleResult.accuracy));

  // Compare with Qwen
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' QWEN COMPARISON');
  console.log('─────────────────────────────────────────────────────────────────\n');

  let qwenV1 = null;
  let qwenEnsemble = null;
  if (hasQwen) {
    process.stdout.write(' Qwen V1 baseline... ');
    qwenV1 = runBenchmark(QWEN_MODEL, DESCRIPTIONS_V1, '');
    console.log(pct(qwenV1.accuracy));

    process.stdout.write(' Qwen V1+V6 ensemble... ');
    qwenEnsemble = runEnsembleBenchmark(QWEN_MODEL, [DESCRIPTIONS_V1, DESCRIPTIONS_V6], '');
    console.log(pct(qwenEnsemble.accuracy));
  } else {
    console.log(' Qwen model not found - skipping comparison.');
  }

  // Final results table
  console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
  console.log(' FINAL RESULTS');
  console.log('═══════════════════════════════════════════════════════════════════════════════════\n');

  // Qwen cells show "-" when the Qwen measurements were skipped.
  const qwenCell = (result) => (result ? fmt(result.accuracy) : '-'.padStart(10));

  console.log('┌───────────────────────────────┬────────────┬────────────┐');
  console.log('│ Strategy │ RuvLTRA │ Qwen │');
  console.log('├───────────────────────────────┼────────────┼────────────┤');
  console.log(`│ V1 Baseline │${fmt(prefixResults['(no prefix)'].accuracy)} │${qwenCell(qwenV1)} │`);
  console.log(`│ V1 + Best Prefix │${fmt(bestPrefix[1].accuracy)} │ - │`);
  console.log(`│ V1+V6 Ensemble │${fmt(ensembleResult.accuracy)} │${qwenCell(qwenEnsemble)} │`);
  console.log('└───────────────────────────────┴────────────┴────────────┘');

  // Best overall
  const ruvBest = Math.max(
    prefixResults['(no prefix)'].accuracy,
    bestPrefix[1].accuracy,
    ensembleResult.accuracy
  );

  console.log(`\n RuvLTRA Best: ${pct(ruvBest)}`);
  if (hasQwen) {
    const qwenBest = Math.max(qwenV1.accuracy, qwenEnsemble.accuracy);
    const delta = (ruvBest - qwenBest) * 100;
    // Negative values already carry their own sign; only add "+" otherwise.
    const sign = delta < 0 ? '' : '+';
    console.log(` Qwen Best: ${pct(qwenBest)}`);
    console.log(` Advantage: RuvLTRA ${sign}${delta.toFixed(1)} points`);
  }

  // Show detailed ensemble results
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' ENSEMBLE VOTING DETAILS (RuvLTRA)');
  console.log('─────────────────────────────────────────────────────────────────\n');

  for (const r of ensembleResult.results) {
    const mark = r.correct ? '✓' : '✗';
    const task = r.task.slice(0, 45).padEnd(45);
    const exp = r.expected.padEnd(18);
    console.log(`${mark} ${task} ${exp}${r.correct ? '' : '→ ' + r.got}`);
  }

  console.log('\n');
}
|
|
|
|
// Script entry point: report any failure and make the process exit non-zero
// so shells and CI can detect that the benchmark did not complete.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|