#!/usr/bin/env node
/**
 * Ensemble Model Comparison
 *
 * Strategies:
 * 1. Task prefix - prepend context to make tasks more aligned with descriptions
 * 2. Ensemble voting - combine multiple description variants
 * 3. Agent-specific thresholds based on training patterns
 *    NOTE(review): no threshold logic is present in this script - confirm
 *    whether it lives elsewhere or is still planned.
 */

const { execFileSync } = require('child_process');
const { existsSync } = require('fs');
const { join } = require('path');
const { homedir } = require('os');

const MODELS_DIR = join(homedir(), '.ruvllm', 'models');
const RUVLTRA_MODEL = join(MODELS_DIR, 'ruvltra-claude-code-0.5b-q4_k_m.gguf');
const QWEN_MODEL = join(MODELS_DIR, 'qwen2.5-0.5b-instruct-q4_k_m.gguf');

// Original V1 descriptions (best baseline)
const DESCRIPTIONS_V1 = {
  coder: 'implement create write build add code function class component feature',
  researcher: 'research find investigate analyze explore search discover examine',
  reviewer: 'review check evaluate assess inspect examine code quality',
  tester: 'test unit integration e2e coverage mock assertion spec',
  architect: 'design architecture schema system structure plan database',
  'security-architect': 'security vulnerability xss injection audit cve authentication',
  debugger: 'debug fix bug error issue broken crash exception trace',
  documenter: 'document readme jsdoc comment explain describe documentation',
  refactorer: 'refactor extract rename consolidate clean restructure simplify',
  optimizer: 'optimize performance slow fast cache speed memory latency',
  devops: 'deploy ci cd kubernetes docker pipeline container infrastructure',
  'api-docs': 'openapi swagger api documentation graphql schema endpoint',
  planner: 'plan estimate prioritize sprint roadmap schedule milestone',
};

// V6: Keywords reformulated as action phrases
const DESCRIPTIONS_V6 = {
  coder: 'implement new functionality write code build features create components',
  researcher: 'research and analyze investigate patterns explore best practices',
  reviewer: 'review code quality check pull requests evaluate implementations',
  tester: 'write tests create test coverage add unit and integration tests',
  architect: 'design system architecture plan database schemas structure systems',
  'security-architect': 'audit security vulnerabilities check xss and injection attacks',
  debugger: 'debug and fix bugs trace errors resolve exceptions',
  documenter: 'write documentation add jsdoc comments create readme files',
  refactorer: 'refactor code modernize to async await restructure modules',
  optimizer: 'optimize performance improve speed cache data reduce latency',
  devops: 'deploy to cloud setup ci cd pipelines manage containers kubernetes',
  'api-docs': 'generate openapi documentation create swagger api specs',
  planner: 'plan sprints create roadmaps estimate timelines schedule milestones',
};

// Task prefixes to try
const TASK_PREFIXES = [
  '', // No prefix (baseline)
  'Task: ', // Simple task prefix
  'The developer needs to: ', // Contextual prefix
  'Claude Code task - ', // Model-specific prefix
];

const ROUTING_TESTS = [
  { task: 'Implement a binary search function in TypeScript', expected: 'coder' },
  { task: 'Write unit tests for the authentication module', expected: 'tester' },
  { task: 'Review the pull request for security vulnerabilities', expected: 'reviewer' },
  { task: 'Research best practices for React state management', expected: 'researcher' },
  { task: 'Design the database schema for user profiles', expected: 'architect' },
  { task: 'Fix the null pointer exception in the login handler', expected: 'debugger' },
  { task: 'Audit the API endpoints for XSS vulnerabilities', expected: 'security-architect' },
  { task: 'Write JSDoc comments for the utility functions', expected: 'documenter' },
  { task: 'Refactor the payment module to use async/await', expected: 'refactorer' },
  { task: 'Optimize the database queries for the dashboard', expected: 'optimizer' },
  { task: 'Set up the CI/CD pipeline for the microservices', expected: 'devops' },
  { task: 'Generate OpenAPI documentation for the REST API', expected: 'api-docs' },
  { task: 'Create a sprint plan for the next two weeks', expected: 'planner' },
  { task: 'Build a React component for user registration', expected: 'coder' },
  { task: 'Debug memory leak in the WebSocket handler', expected: 'debugger' },
  { task: 'Investigate slow API response times', expected: 'researcher' },
  { task: 'Check code for potential race conditions', expected: 'reviewer' },
  { task: 'Add integration tests for the payment gateway', expected: 'tester' },
  { task: 'Plan the architecture for real-time notifications', expected: 'architect' },
  { task: 'Cache the frequently accessed user data', expected: 'optimizer' },
];

// Output layout. Column and banner widths are derived from these border
// strings so that rows always line up with the frames.
const SECTION_RULE = '─────────────────────────────────────────────────────────────────';
const RESULTS_RULE = '═══════════════════════════════════════════════════════════════════════════════════';
const BANNER_TOP = '╔═══════════════════════════════════════════════════════════════════════════════════╗';
const BANNER_BOTTOM = '╚═══════════════════════════════════════════════════════════════════════════════════╝';
const TABLE_TOP = '┌───────────────────────────────┬────────────┬────────────┐';
const TABLE_MID = '├───────────────────────────────┼────────────┼────────────┤';
const TABLE_BOTTOM = '└───────────────────────────────┴────────────┴────────────┘';
const COLUMN_WIDTHS = TABLE_TOP.slice(1, -1).split('┬').map((segment) => segment.length);

// Successful embeddings keyed by model path + text. The same description and
// task strings are embedded by several benchmark runs; caching avoids spawning
// a new process for each repeat.
const embeddingCache = new Map();

// Number of embedding calls that failed; reported at the end of main().
let embeddingFailures = 0;

/**
 * Compute an embedding by running the `llama-embedding` CLI.
 *
 * The CLI is invoked without a shell (argument array), so characters such as
 * `$`, backticks or quotes in `text` / `modelPath` reach the program verbatim.
 * Newlines in `text` are replaced with spaces before the call.
 *
 * @param {string} modelPath - Path passed to `-m`.
 * @param {string} text - Prompt passed to `-p`.
 * @returns {number[]|null} The `embedding` field of the last entry of the
 *   `data` array in the CLI's JSON output, or `null` if the call or the
 *   parsing failed for any reason.
 */
function getEmbedding(modelPath, text) {
  const cacheKey = `${modelPath}\u0000${text}`;
  if (embeddingCache.has(cacheKey)) {
    return embeddingCache.get(cacheKey);
  }

  try {
    const prompt = text.replace(/\n/g, ' ');
    const stdout = execFileSync(
      'llama-embedding',
      ['-m', modelPath, '-p', prompt, '--embd-output-format', 'json'],
      {
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024,
        // stderr is discarded, matching the previous `2>/dev/null`.
        stdio: ['pipe', 'pipe', 'ignore'],
      }
    );
    const json = JSON.parse(stdout);
    const embedding = json.data[json.data.length - 1].embedding;
    embeddingCache.set(cacheKey, embedding);
    return embedding;
  } catch {
    // Best effort: callers treat a missing embedding as similarity 0.
    // Failures are counted so main() can warn that results are unreliable.
    embeddingFailures += 1;
    return null;
  }
}

/**
 * Cosine similarity of two numeric vectors.
 *
 * @param {number[]|null} a
 * @param {number[]|null} b
 * @returns {number} The similarity, or 0 when either vector is missing, the
 *   lengths differ, or a norm is zero.
 */
function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  // `|| 1` avoids dividing by zero when either vector is all zeros.
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
}

/**
 * Pick the agent whose description embedding is most similar to the task.
 *
 * @param {number[]|null} taskEmbedding
 * @param {Object<string, number[]|null>} agentEmbeddings - Agent name -> embedding.
 * @returns {{agent: string, confidence: number, scores: Object<string, number>}}
 *   The winning agent (first one wins ties; 'coder' with confidence -1 when
 *   there are no agents), its similarity, and all similarities.
 */
function routeTask(taskEmbedding, agentEmbeddings) {
  let bestAgent = 'coder';
  let bestSim = -1;
  const allScores = {};

  for (const [agent, emb] of Object.entries(agentEmbeddings)) {
    const sim = cosineSimilarity(taskEmbedding, emb);
    allScores[agent] = sim;
    if (sim > bestSim) {
      bestSim = sim;
      bestAgent = agent;
    }
  }

  return { agent: bestAgent, confidence: bestSim, scores: allScores };
}

/**
 * Ensemble routing - vote across multiple description sets.
 *
 * Each embedding set casts one vote for its best agent. When several agents
 * have the same number of votes (always the case when two sets disagree), the
 * tie is broken by the summed similarity of the votes each agent received
 * rather than by key order.
 *
 * @param {number[]|null} taskEmbedding
 * @param {Array<Object<string, number[]|null>>} allAgentEmbeddings - One
 *   agent -> embedding map per description set.
 * @returns {{agent: string, votes: Object<string, number>, voteCount: number}}
 *   Winning agent ('coder' when nobody received a vote), the vote tally, and
 *   the winner's vote count.
 */
function routeTaskEnsemble(taskEmbedding, allAgentEmbeddings) {
  const votes = {};
  const support = {};
  for (const agent of Object.keys(allAgentEmbeddings[0] || {})) {
    votes[agent] = 0;
  }

  // Each embedding set votes
  for (const agentEmbeddings of allAgentEmbeddings) {
    const { agent, confidence } = routeTask(taskEmbedding, agentEmbeddings);
    votes[agent] = (votes[agent] || 0) + 1;
    support[agent] = (support[agent] || 0) + confidence;
  }

  // Return agent with most votes; summed similarity breaks ties
  let bestAgent = 'coder';
  let maxVotes = 0;
  let bestSupport = -Infinity;
  for (const [agent, count] of Object.entries(votes)) {
    if (count === 0) continue;
    const agentSupport = support[agent];
    if (count > maxVotes || (count === maxVotes && agentSupport > bestSupport)) {
      maxVotes = count;
      bestSupport = agentSupport;
      bestAgent = agent;
    }
  }

  return { agent: bestAgent, votes, voteCount: maxVotes };
}

/**
 * Embed every description in a set.
 *
 * @param {string} modelPath
 * @param {Object<string, string>} descriptions - Agent name -> description text.
 * @returns {Object<string, number[]|null>} Agent name -> embedding (null on failure).
 */
function embedDescriptions(modelPath, descriptions) {
  const embeddings = {};
  for (const [agent, desc] of Object.entries(descriptions)) {
    embeddings[agent] = getEmbedding(modelPath, desc);
  }
  return embeddings;
}

/**
 * Route every entry of ROUTING_TESTS with a single description set.
 *
 * @param {string} modelPath
 * @param {Object<string, string>} descriptions
 * @param {string} [prefix=''] - Text prepended to each task before embedding.
 * @returns {{accuracy: number, correct: number, total: number}}
 */
function runBenchmark(modelPath, descriptions, prefix = '') {
  const agentEmbeddings = embedDescriptions(modelPath, descriptions);

  let correct = 0;
  for (const test of ROUTING_TESTS) {
    const taskEmb = getEmbedding(modelPath, prefix + test.task);
    const { agent } = routeTask(taskEmb, agentEmbeddings);
    if (agent === test.expected) correct++;
  }

  return { accuracy: correct / ROUTING_TESTS.length, correct, total: ROUTING_TESTS.length };
}

/**
 * Route every entry of ROUTING_TESTS by voting across several description sets.
 *
 * @param {string} modelPath
 * @param {Array<Object<string, string>>} descriptionSets
 * @param {string} [prefix=''] - Text prepended to each task before embedding.
 * @returns {{accuracy: number, correct: number, total: number, results: Array<{task: string, expected: string, got: string, correct: boolean, votes: Object<string, number>}>}}
 */
function runEnsembleBenchmark(modelPath, descriptionSets, prefix = '') {
  // Precompute embeddings for all description sets
  const allAgentEmbeddings = descriptionSets.map((descriptions) =>
    embedDescriptions(modelPath, descriptions)
  );

  let correct = 0;
  const results = [];
  for (const test of ROUTING_TESTS) {
    const taskEmb = getEmbedding(modelPath, prefix + test.task);
    const { agent, votes } = routeTaskEnsemble(taskEmb, allAgentEmbeddings);
    const isCorrect = agent === test.expected;
    if (isCorrect) correct++;
    results.push({ task: test.task, expected: test.expected, got: agent, correct: isCorrect, votes });
  }

  return { accuracy: correct / ROUTING_TESTS.length, correct, total: ROUTING_TESTS.length, results };
}

/** Format a 0..1 fraction as a percentage with one decimal, e.g. `85.0%`. */
function formatPercent(fraction) {
  return `${(fraction * 100).toFixed(1)}%`;
}

/** Center `text` in a field of `width` characters (left-biased when uneven). */
function centerText(text, width) {
  const leftPad = Math.max(Math.floor((width - text.length) / 2), 0);
  return text.padStart(text.length + leftPad).padEnd(width);
}

/**
 * Build one results-table row padded to the widths of the table borders.
 * The label is left-aligned; the remaining cells are right-aligned.
 */
function formatTableRow(label, ...values) {
  const [labelWidth, ...valueWidths] = COLUMN_WIDTHS;
  const cells = [
    ` ${label}`.padEnd(labelWidth),
    ...values.map((value, i) => `${value} `.padStart(valueWidths[i])),
  ];
  return `│${cells.join('│')}│`;
}

/** Print a section heading framed by two rules, preceded by `lead`. */
function printSection(title, lead = '') {
  console.log(`${lead}${SECTION_RULE}`);
  console.log(` ${title}`);
  console.log(`${SECTION_RULE}\n`);
}

/**
 * Run the prefix, ensemble and Qwen comparisons and print the report.
 * Exits with status 1 when either model file is missing.
 */
async function main() {
  console.log(`\n${BANNER_TOP}`);
  console.log(`║${centerText('ENSEMBLE & PREFIX MODEL COMPARISON', BANNER_TOP.length - 2)}║`);
  console.log(`${BANNER_BOTTOM}\n`);

  if (!existsSync(RUVLTRA_MODEL)) {
    console.error('RuvLTRA model not found.');
    process.exit(1);
  }
  // Without this check a missing Qwen model would be benchmarked with null
  // embeddings and reported with a meaningless accuracy.
  if (!existsSync(QWEN_MODEL)) {
    console.error('Qwen model not found.');
    process.exit(1);
  }

  // Test prefix variations
  printSection('PREFIX VARIATIONS (RuvLTRA)');

  const prefixResults = {};
  for (const prefix of TASK_PREFIXES) {
    const label = prefix || '(no prefix)';
    process.stdout.write(` Testing "${label.padEnd(25)}"... `);
    const result = runBenchmark(RUVLTRA_MODEL, DESCRIPTIONS_V1, prefix);
    prefixResults[label] = result;
    console.log(formatPercent(result.accuracy));
  }

  // Find best prefix; `>=` keeps the earliest entry (the baseline) on ties
  const bestPrefix = Object.entries(prefixResults).reduce((a, b) =>
    a[1].accuracy >= b[1].accuracy ? a : b
  );
  console.log(`\n Best prefix: "${bestPrefix[0]}" = ${formatPercent(bestPrefix[1].accuracy)}`);

  // Test ensemble voting
  printSection('ENSEMBLE VOTING (RuvLTRA)', '\n');

  process.stdout.write(' Computing V1 + V6 ensemble... ');
  const ensembleResult = runEnsembleBenchmark(RUVLTRA_MODEL, [DESCRIPTIONS_V1, DESCRIPTIONS_V6], '');
  console.log(formatPercent(ensembleResult.accuracy));

  // Compare with Qwen
  printSection('QWEN COMPARISON', '\n');

  process.stdout.write(' Qwen V1 baseline... ');
  const qwenV1 = runBenchmark(QWEN_MODEL, DESCRIPTIONS_V1, '');
  console.log(formatPercent(qwenV1.accuracy));

  process.stdout.write(' Qwen V1+V6 ensemble... ');
  const qwenEnsemble = runEnsembleBenchmark(QWEN_MODEL, [DESCRIPTIONS_V1, DESCRIPTIONS_V6], '');
  console.log(formatPercent(qwenEnsemble.accuracy));

  // Final results table
  console.log(`\n${RESULTS_RULE}`);
  console.log(' FINAL RESULTS');
  console.log(`${RESULTS_RULE}\n`);

  const baseline = prefixResults['(no prefix)'];
  console.log(TABLE_TOP);
  console.log(formatTableRow('Strategy', 'RuvLTRA', 'Qwen'));
  console.log(TABLE_MID);
  console.log(formatTableRow('V1 Baseline', formatPercent(baseline.accuracy), formatPercent(qwenV1.accuracy)));
  console.log(formatTableRow('V1 + Best Prefix', formatPercent(bestPrefix[1].accuracy), '-'));
  console.log(
    formatTableRow('V1+V6 Ensemble', formatPercent(ensembleResult.accuracy), formatPercent(qwenEnsemble.accuracy))
  );
  console.log(TABLE_BOTTOM);

  // Best overall
  const ruvBest = Math.max(baseline.accuracy, bestPrefix[1].accuracy, ensembleResult.accuracy);
  const qwenBest = Math.max(qwenV1.accuracy, qwenEnsemble.accuracy);
  const advantage = (ruvBest - qwenBest) * 100;
  // toFixed() already emits '-' for negative values; only add the '+' sign.
  const advantageSign = advantage >= 0 ? '+' : '';

  console.log(`\n RuvLTRA Best: ${formatPercent(ruvBest)}`);
  console.log(` Qwen Best: ${formatPercent(qwenBest)}`);
  console.log(` Advantage: RuvLTRA ${advantageSign}${advantage.toFixed(1)} points`);

  // Show detailed ensemble results
  printSection('ENSEMBLE VOTING DETAILS (RuvLTRA)', '\n');

  for (const r of ensembleResult.results) {
    const mark = r.correct ? '✓' : '✗';
    const task = r.task.slice(0, 45).padEnd(45);
    const exp = r.expected.padEnd(18);
    console.log(`${mark} ${task} ${exp}${r.correct ? '' : `→ ${r.got}`}`);
  }

  if (embeddingFailures > 0) {
    console.warn(
      `\n Warning: ${embeddingFailures} embedding call(s) failed and were scored as similarity 0; ` +
        'the accuracies above may be understated.'
    );
  }

  console.log('\n');
}

main().catch((err) => {
  console.error(err);
  // Signal failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});