#!/usr/bin/env node
/**
 * Hybrid Model Comparison
 *
 * Combines embedding similarity with keyword boosting.
 * This addresses the "reviewer overfit" problem by:
 * 1. Computing embedding similarity
 * 2. Boosting agents that have keyword matches in the task
 * 3. Using a weighted combination for the final score
 */
|
|
|
|
const { execFileSync, execSync } = require('child_process');
const { existsSync } = require('fs');
const { homedir } = require('os');
const { join } = require('path');
|
|
|
|
// Directory under the user's home folder that holds the GGUF model files.
const MODELS_DIR = join(homedir(), '.ruvllm', 'models');

// Absolute paths of the two models compared by this script.
const [RUVLTRA_MODEL, QWEN_MODEL] = [
  'ruvltra-claude-code-0.5b-q4_k_m.gguf',
  'qwen2.5-0.5b-instruct-q4_k_m.gguf',
].map((fileName) => join(MODELS_DIR, fileName));
|
|
|
|
// V1 agent descriptions: one space-separated bag of words per agent. Each
// description is embedded once per benchmark run and compared against the
// embedding of the task text. Entry order is preserved as object key order.
const DESCRIPTIONS_V1 = Object.fromEntries([
  ['coder', 'implement create write build add code function class component feature'],
  ['researcher', 'research find investigate analyze explore search discover examine'],
  ['reviewer', 'review check evaluate assess inspect examine code quality'],
  ['tester', 'test unit integration e2e coverage mock assertion spec'],
  ['architect', 'design architecture schema system structure plan database'],
  ['security-architect', 'security vulnerability xss injection audit cve authentication'],
  ['debugger', 'debug fix bug error issue broken crash exception trace'],
  ['documenter', 'document readme jsdoc comment explain describe documentation'],
  ['refactorer', 'refactor extract rename consolidate clean restructure simplify'],
  ['optimizer', 'optimize performance slow fast cache speed memory latency'],
  ['devops', 'deploy ci cd kubernetes docker pipeline container infrastructure'],
  ['api-docs', 'openapi swagger api documentation graphql schema endpoint'],
  ['planner', 'plan estimate prioritize sprint roadmap schedule milestone'],
]);
|
|
|
|
// Trigger keywords: words or phrases that strongly suggest a specific agent.
// Key order matters: the keyword routers walk this map in insertion order and
// only replace the current best on a strictly greater score, so on a tie the
// agent listed first here is kept.
const TRIGGER_KEYWORDS = {
  researcher: [
    'research', 'investigate', 'explore', 'discover', 'best practices',
    'patterns', 'analyze', 'look into', 'find out',
  ],
  coder: [
    'implement', 'build', 'create', 'component', 'function',
    'typescript', 'react', 'feature', 'write code',
  ],
  tester: [
    'test', 'tests', 'testing', 'unit test', 'integration test',
    'e2e', 'coverage', 'spec',
  ],
  reviewer: [
    'review', 'pull request', 'pr', 'code quality', 'code review', 'check code',
  ],
  debugger: [
    'debug', 'fix', 'bug', 'error', 'exception',
    'crash', 'trace', 'null pointer', 'memory leak',
  ],
  'security-architect': [
    'security', 'vulnerability', 'xss', 'injection',
    'csrf', 'cve', 'audit', 'exploit',
  ],
  refactorer: [
    'refactor', 'async/await', 'modernize', 'restructure', 'extract', 'legacy',
  ],
  // "slow" is intentionally absent (too generic); query-specific terms are
  // listed instead.
  optimizer: [
    'optimize', 'performance', 'cache', 'caching', 'speed up',
    'latency', 'faster', 'queries', 'reduce time',
  ],
  architect: [
    'design', 'architecture', 'schema', 'structure',
    'diagram', 'system design', 'plan architecture',
  ],
  documenter: [
    'jsdoc', 'comment', 'comments', 'readme',
    'documentation', 'document', 'explain',
  ],
  devops: [
    'deploy', 'ci/cd', 'kubernetes', 'docker',
    'pipeline', 'infrastructure', 'container',
  ],
  'api-docs': [
    'openapi', 'swagger', 'api doc', 'rest api', 'graphql', 'endpoint',
  ],
  planner: [
    'sprint', 'plan', 'roadmap', 'milestone',
    'estimate', 'schedule', 'prioritize',
  ],
};
|
|
|
|
// Intended disambiguation order when several agents match a task, highest
// priority first.
// NOTE(review): none of the routing functions visible in this file read this
// list; ties are currently resolved by TRIGGER_KEYWORDS key order or by
// embedding similarity. Confirm whether it should be wired in or removed.
const AGENT_PRIORITY = [
  // Specific intent verbs ("investigate", "fix", "test") come first.
  'researcher',
  'debugger',
  'tester',
  'security-architect',
  // General-purpose agents follow.
  'coder',
  'reviewer',
  'refactorer',
  'optimizer',
  'architect',
  'documenter',
  'devops',
  'api-docs',
  'planner',
];
|
|
|
|
// Benchmark fixture: each entry pairs a task sentence with the agent that a
// router is expected to pick for it.
const ROUTING_TESTS = [
  ['Implement a binary search function in TypeScript', 'coder'],
  ['Write unit tests for the authentication module', 'tester'],
  ['Review the pull request for security vulnerabilities', 'reviewer'],
  ['Research best practices for React state management', 'researcher'],
  ['Design the database schema for user profiles', 'architect'],
  ['Fix the null pointer exception in the login handler', 'debugger'],
  ['Audit the API endpoints for XSS vulnerabilities', 'security-architect'],
  ['Write JSDoc comments for the utility functions', 'documenter'],
  ['Refactor the payment module to use async/await', 'refactorer'],
  ['Optimize the database queries for the dashboard', 'optimizer'],
  ['Set up the CI/CD pipeline for the microservices', 'devops'],
  ['Generate OpenAPI documentation for the REST API', 'api-docs'],
  ['Create a sprint plan for the next two weeks', 'planner'],
  ['Build a React component for user registration', 'coder'],
  ['Debug memory leak in the WebSocket handler', 'debugger'],
  ['Investigate slow API response times', 'researcher'],
  ['Check code for potential race conditions', 'reviewer'],
  ['Add integration tests for the payment gateway', 'tester'],
  ['Plan the architecture for real-time notifications', 'architect'],
  ['Cache the frequently accessed user data', 'optimizer'],
].map(([task, expected]) => ({ task, expected }));
|
|
|
|
/**
 * Compute an embedding for `text` by invoking the `llama-embedding` CLI.
 *
 * The CLI is spawned directly (no shell) with an argument vector, so shell
 * metacharacters in the text or model path ($, backticks, quotes,
 * backslashes) are passed through literally instead of being interpreted.
 *
 * @param {string} modelPath - Path to the GGUF model file passed to `-m`.
 * @param {string} text - Prompt to embed; newlines are flattened to spaces.
 * @returns {number[]|null} The `embedding` array of the last entry in the
 *   CLI's JSON `data` list, or null when the CLI fails or its output cannot
 *   be parsed (best-effort by design).
 */
function getEmbedding(modelPath, text) {
  try {
    // Keep the prompt on a single line, as the original shell invocation did.
    const prompt = text.replace(/\n/g, ' ');
    const output = execFileSync(
      'llama-embedding',
      ['-m', modelPath, '-p', prompt, '--embd-output-format', 'json'],
      {
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024,
        // Discard stderr (replaces the former `2>/dev/null` redirect).
        stdio: ['pipe', 'pipe', 'ignore'],
      }
    );
    const json = JSON.parse(output);
    return json.data[json.data.length - 1].embedding;
  } catch {
    // Missing binary, non-zero exit, or malformed JSON: callers treat a null
    // embedding as "no similarity information".
    return null;
  }
}
|
|
|
|
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param {number[]|null|undefined} a - First vector.
 * @param {number[]|null|undefined} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|). Returns 0 when either vector is
 *   missing or the lengths differ; a zero denominator is replaced by 1, so
 *   all-zero vectors also yield 0.
 */
function cosineSimilarity(a, b) {
  if (!a || !b) {
    return 0;
  }
  if (a.length !== b.length) {
    return 0;
  }

  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;
  let index = 0;
  while (index < a.length) {
    const left = a[index];
    const right = b[index];
    dotProduct += left * right;
    squaredNormA += left * left;
    squaredNormB += right * right;
    index += 1;
  }

  const magnitude = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);
  return dotProduct / (magnitude || 1);
}
|
|
|
|
/**
 * Count keyword matches for each agent.
 *
 * Matching is case-insensitive. By default a keyword matches anywhere in the
 * task text (substring match), which means short keywords such as "pr" also
 * match inside longer words like "sprint". Pass `{ wholeWord: true }` to
 * require that a match is not directly preceded or followed by a word
 * character.
 *
 * @param {string} task - Task description to scan.
 * @param {Object<string, string[]>} [keywordMap=TRIGGER_KEYWORDS] - Agent
 *   name to keyword list; the result keeps this map's key order.
 * @param {{ wholeWord?: boolean }} [options] - Matching options.
 * @returns {Object<string, number>} Number of matching keywords per agent.
 */
function getKeywordScores(task, keywordMap = TRIGGER_KEYWORDS, { wholeWord = false } = {}) {
  const taskLower = task.toLowerCase();

  const isMatch = (keyword) => {
    const needle = keyword.toLowerCase();
    if (!wholeWord) {
      return taskLower.includes(needle);
    }
    // Escape regex metacharacters so keywords are matched literally.
    const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp(`(?<!\\w)${escaped}(?!\\w)`).test(taskLower);
  };

  const scores = {};
  for (const [agent, keywords] of Object.entries(keywordMap)) {
    scores[agent] = keywords.filter((keyword) => isMatch(keyword)).length;
  }
  return scores;
}
|
|
|
|
/**
 * Baseline router: pick the agent whose description embedding is most similar
 * to the task embedding.
 *
 * @param {number[]|null} taskEmbedding - Embedding of the task text.
 * @param {Object<string, number[]|null>} agentEmbeddings - Agent name to
 *   description embedding.
 * @returns {{ agent: string, confidence: number }} The best agent and its
 *   cosine similarity; `{ agent: 'coder', confidence: -1 }` when no agent
 *   scores above -1 (e.g. an empty map). Earlier agents win ties.
 */
function routeEmbeddingOnly(taskEmbedding, agentEmbeddings) {
  const fallback = { agent: 'coder', confidence: -1 };

  return Object.entries(agentEmbeddings).reduce((best, [agent, embedding]) => {
    const similarity = cosineSimilarity(taskEmbedding, embedding);
    return similarity > best.confidence ? { agent, confidence: similarity } : best;
  }, fallback);
}
|
|
|
|
/**
 * Keyword-only router: pick the agent with the most keyword matches.
 *
 * @param {string} task - Task description.
 * @returns {{ agent: string, confidence: number }} The winning agent and its
 *   match count. Falls back to `coder` with confidence 0 when nothing
 *   matches; on ties the agent encountered first is kept.
 */
function routeKeywordOnly(task) {
  const scores = getKeywordScores(task);
  const winner = { agent: 'coder', confidence: 0 };

  for (const agent of Object.keys(scores)) {
    if (scores[agent] > winner.confidence) {
      winner.agent = agent;
      winner.confidence = scores[agent];
    }
  }

  return winner;
}
|
|
|
|
/**
 * Hybrid router: weighted sum of embedding similarity and normalized keyword
 * score.
 *
 * @param {string} task - Task description (used for keyword matching).
 * @param {number[]|null} taskEmbedding - Embedding of the task text.
 * @param {Object<string, number[]|null>} agentEmbeddings - Agent name to
 *   description embedding; only agents present here are scored.
 * @param {number} [embeddingWeight=0.6] - Weight of the cosine similarity.
 * @param {number} [keywordWeight=0.4] - Weight of the keyword score.
 * @returns {{ agent: string, confidence: number, scores: Object }} Best
 *   agent, its combined score, and the per-agent breakdown
 *   (`embedding`, `keyword`, `combined`).
 */
function routeHybrid(task, taskEmbedding, agentEmbeddings, embeddingWeight = 0.6, keywordWeight = 0.4) {
  const rawCounts = getKeywordScores(task);

  // Scale counts into [0, 1]; the floor of 1 avoids dividing by zero when no
  // keyword matched at all.
  const peakCount = Math.max(...Object.values(rawCounts), 1);
  const scaledCounts = {};
  Object.keys(rawCounts).forEach((name) => {
    scaledCounts[name] = rawCounts[name] / peakCount;
  });

  const result = { agent: 'coder', confidence: -1, scores: {} };

  Object.entries(agentEmbeddings).forEach(([name, agentEmbedding]) => {
    const embedding = cosineSimilarity(taskEmbedding, agentEmbedding);
    const keyword = scaledCounts[name] || 0;
    const combined = embeddingWeight * embedding + keywordWeight * keyword;
    result.scores[name] = { embedding, keyword, combined };

    if (combined > result.confidence) {
      result.confidence = combined;
      result.agent = name;
    }
  });

  return result;
}
|
|
|
|
/**
 * Keyword-first router: keywords decide, embeddings only break ties.
 *
 * @param {string} task - Task description.
 * @param {number[]|null} taskEmbedding - Embedding of the task text.
 * @param {Object<string, number[]|null>} agentEmbeddings - Agent name to
 *   description embedding.
 * @returns {{ agent: string, confidence: number }} With a single top-scoring
 *   agent, that agent and its match count. With several tied agents, the one
 *   most similar by embedding, with confidence `count + similarity / 10`.
 *   With no keyword match at all, the result of `routeEmbeddingOnly`.
 */
function routeKeywordFirst(task, taskEmbedding, agentEmbeddings) {
  const counts = getKeywordScores(task);
  const topCount = Math.max(...Object.values(counts));

  // Nothing matched: defer entirely to the embedding baseline.
  if (!(topCount > 0)) {
    return routeEmbeddingOnly(taskEmbedding, agentEmbeddings);
  }

  const tied = Object.keys(counts).filter((name) => counts[name] === topCount);
  if (tied.length === 1) {
    return { agent: tied[0], confidence: topCount };
  }

  // Several agents share the top count: let embedding similarity decide.
  let chosen = tied[0];
  let chosenSimilarity = -1;
  tied.forEach((name) => {
    const similarity = cosineSimilarity(taskEmbedding, agentEmbeddings[name]);
    if (similarity > chosenSimilarity) {
      chosenSimilarity = similarity;
      chosen = name;
    }
  });

  return { agent: chosen, confidence: topCount + chosenSimilarity / 10 };
}
|
|
|
|
// Embeddings already computed in this process: model path -> (text -> vector).
// Every strategy embeds the same descriptions and tasks, so caching avoids
// re-spawning the embedding CLI for identical inputs.
const EMBEDDING_CACHE = new Map();

/**
 * Return the embedding for `text` under `modelPath`, computing it at most
 * once per process. Failed lookups (null) are not cached so a later call can
 * retry.
 */
function getCachedEmbedding(modelPath, text) {
  let perModel = EMBEDDING_CACHE.get(modelPath);
  if (!perModel) {
    perModel = new Map();
    EMBEDDING_CACHE.set(modelPath, perModel);
  }
  if (perModel.has(text)) {
    return perModel.get(text);
  }
  const embedding = getEmbedding(modelPath, text);
  if (embedding !== null) {
    perModel.set(text, embedding);
  }
  return embedding;
}

/**
 * Run one routing strategy over every case in ROUTING_TESTS.
 *
 * @param {string} modelPath - Model used to embed descriptions and tasks.
 * @param {(task: string, taskEmbedding: number[]|null, agentEmbeddings: Object) => { agent: string }} routerFn
 *   Strategy under test; only the returned `agent` is inspected.
 * @param {string} name - Label stored on the result.
 * @returns {{ accuracy: number, correct: number, total: number, results: Array, name: string }}
 *   Fraction of correct routes plus the per-task outcomes.
 */
function runBenchmark(modelPath, routerFn, name) {
  let failedEmbeddings = 0;
  const embed = (text) => {
    const embedding = getCachedEmbedding(modelPath, text);
    if (embedding === null) failedEmbeddings++;
    return embedding;
  };

  const agentEmbeddings = {};
  for (const [agent, desc] of Object.entries(DESCRIPTIONS_V1)) {
    agentEmbeddings[agent] = embed(desc);
  }

  let correct = 0;
  const results = [];

  for (const test of ROUTING_TESTS) {
    const taskEmb = embed(test.task);
    const { agent } = routerFn(test.task, taskEmb, agentEmbeddings);
    const isCorrect = agent === test.expected;
    if (isCorrect) correct++;
    results.push({ task: test.task, expected: test.expected, got: agent, correct: isCorrect });
  }

  // A null embedding scores as similarity 0, which silently skews accuracy;
  // surface it on stderr so the numbers are not taken at face value.
  if (failedEmbeddings > 0) {
    console.warn(`Warning: ${failedEmbeddings} embedding(s) could not be computed for "${name}" (${modelPath}); results may be unreliable.`);
  }

  return { accuracy: correct / ROUTING_TESTS.length, correct, total: ROUTING_TESTS.length, results, name };
}
|
|
|
|
/**
 * Entry point: benchmark every routing strategy on the RuvLTRA and Qwen
 * models, then print per-model results, a summary table, and the per-task
 * details of the best RuvLTRA strategy.
 *
 * Exits the process with status 1 when either model file is missing, because
 * benchmarking against a missing model would only produce meaningless scores.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('\n╔═══════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ HYBRID ROUTING: Embeddings + Keywords ║');
  console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');

  // Both models are benchmarked below, so both must be present up front.
  const requiredModels = [
    ['RuvLTRA', RUVLTRA_MODEL],
    ['Qwen', QWEN_MODEL],
  ];
  for (const [label, modelPath] of requiredModels) {
    if (!existsSync(modelPath)) {
      console.error(`${label} model not found.`);
      process.exit(1);
    }
  }

  console.log('Strategies:');
  console.log(' 1. Embedding Only (baseline)');
  console.log(' 2. Keyword Only (no model)');
  console.log(' 3. Hybrid 60/40 (60% embedding, 40% keyword)');
  console.log(' 4. Hybrid 40/60 (40% embedding, 60% keyword)');
  console.log(' 5. Keyword-First (keywords primary, embedding tiebreaker)\n');

  // RuvLTRA tests
  console.log('─────────────────────────────────────────────────────────────────');
  console.log(' RUVLTRA RESULTS');
  console.log('─────────────────────────────────────────────────────────────────\n');

  const ruvEmbedding = runBenchmark(RUVLTRA_MODEL,
    (task, taskEmb, agentEmbs) => routeEmbeddingOnly(taskEmb, agentEmbs),
    'Embedding Only');
  console.log(` Embedding Only: ${(ruvEmbedding.accuracy * 100).toFixed(1)}%`);

  const ruvKeyword = runBenchmark(RUVLTRA_MODEL,
    (task, taskEmb, agentEmbs) => routeKeywordOnly(task),
    'Keyword Only');
  console.log(` Keyword Only: ${(ruvKeyword.accuracy * 100).toFixed(1)}%`);

  const ruvHybrid60 = runBenchmark(RUVLTRA_MODEL,
    (task, taskEmb, agentEmbs) => routeHybrid(task, taskEmb, agentEmbs, 0.6, 0.4),
    'Hybrid 60/40');
  console.log(` Hybrid 60/40: ${(ruvHybrid60.accuracy * 100).toFixed(1)}%`);

  const ruvHybrid40 = runBenchmark(RUVLTRA_MODEL,
    (task, taskEmb, agentEmbs) => routeHybrid(task, taskEmb, agentEmbs, 0.4, 0.6),
    'Hybrid 40/60');
  console.log(` Hybrid 40/60: ${(ruvHybrid40.accuracy * 100).toFixed(1)}%`);

  const ruvKwFirst = runBenchmark(RUVLTRA_MODEL,
    (task, taskEmb, agentEmbs) => routeKeywordFirst(task, taskEmb, agentEmbs),
    'Keyword-First');
  console.log(` Keyword-First: ${(ruvKwFirst.accuracy * 100).toFixed(1)}%`);

  // Qwen tests
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' QWEN RESULTS');
  console.log('─────────────────────────────────────────────────────────────────\n');

  const qwenEmbedding = runBenchmark(QWEN_MODEL,
    (task, taskEmb, agentEmbs) => routeEmbeddingOnly(taskEmb, agentEmbs),
    'Embedding Only');
  console.log(` Embedding Only: ${(qwenEmbedding.accuracy * 100).toFixed(1)}%`);

  const qwenHybrid60 = runBenchmark(QWEN_MODEL,
    (task, taskEmb, agentEmbs) => routeHybrid(task, taskEmb, agentEmbs, 0.6, 0.4),
    'Hybrid 60/40');
  console.log(` Hybrid 60/40: ${(qwenHybrid60.accuracy * 100).toFixed(1)}%`);

  const qwenKwFirst = runBenchmark(QWEN_MODEL,
    (task, taskEmb, agentEmbs) => routeKeywordFirst(task, taskEmb, agentEmbs),
    'Keyword-First');
  console.log(` Keyword-First: ${(qwenKwFirst.accuracy * 100).toFixed(1)}%`);

  // Summary table
  console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
  console.log(' SUMMARY');
  console.log('═══════════════════════════════════════════════════════════════════════════════════\n');

  const fmt = (v) => `${(v * 100).toFixed(1)}%`.padStart(8);

  // Signed difference in percentage points; the sign comes from the value
  // itself so a RuvLTRA regression is not printed as "+-x.x".
  const delta = (ruv, qwen) => {
    const points = (ruv.accuracy - qwen.accuracy) * 100;
    return `${points >= 0 ? '+' : ''}${points.toFixed(1)} pts`;
  };

  const tableTop = '┌───────────────────────┬──────────┬──────────┬──────────────────┐';
  const tableMid = '├───────────────────────┼──────────┼──────────┼──────────────────┤';
  const tableBottom = '└───────────────────────┴──────────┴──────────┴──────────────────┘';

  // Column widths are taken from the border so every row lines up with it.
  const columnWidths = tableTop.slice(1, -1).split('┬').map((segment) => segment.length);
  const row = (cells) =>
    `│${cells.map((cell, i) => ` ${cell}`.padEnd(columnWidths[i])).join('│')}│`;

  console.log(tableTop);
  console.log(row(['Strategy', 'RuvLTRA', 'Qwen', 'RuvLTRA vs Qwen']));
  console.log(tableMid);
  console.log(row(['Embedding Only', fmt(ruvEmbedding.accuracy), fmt(qwenEmbedding.accuracy), delta(ruvEmbedding, qwenEmbedding)]));
  // Keyword-only routing does not use the model, so both columns are equal.
  console.log(row(['Keyword Only', fmt(ruvKeyword.accuracy), fmt(ruvKeyword.accuracy), 'same']));
  console.log(row(['Hybrid 60/40', fmt(ruvHybrid60.accuracy), fmt(qwenHybrid60.accuracy), delta(ruvHybrid60, qwenHybrid60)]));
  console.log(row(['Keyword-First', fmt(ruvKwFirst.accuracy), fmt(qwenKwFirst.accuracy), delta(ruvKwFirst, qwenKwFirst)]));
  console.log(tableBottom);

  // Best results (on equal accuracy the later strategy in the list is kept)
  const ruvBest = [ruvEmbedding, ruvKeyword, ruvHybrid60, ruvHybrid40, ruvKwFirst]
    .reduce((a, b) => a.accuracy > b.accuracy ? a : b);

  console.log(`\n BEST RuvLTRA: ${ruvBest.name} = ${(ruvBest.accuracy * 100).toFixed(1)}%`);
  console.log(` Improvement over embedding-only: +${((ruvBest.accuracy - ruvEmbedding.accuracy) * 100).toFixed(1)} points`);

  // Show best results details
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(` BEST STRATEGY DETAILS: ${ruvBest.name}`);
  console.log('─────────────────────────────────────────────────────────────────\n');

  for (const r of ruvBest.results) {
    const mark = r.correct ? '✓' : '✗';
    const task = r.task.slice(0, 45).padEnd(45);
    const exp = r.expected.padEnd(18);
    console.log(`${mark} ${task} ${exp}${r.correct ? '' : '→ ' + r.got}`);
  }

  console.log('\n');
}
|
|
|
|
// Run the benchmark; report any failure and make the process exit non-zero so
// callers (shells, CI) can detect it.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|