#!/usr/bin/env node
/**
 * Optimized Model Comparison
 *
 * Key insight: Shorter, more focused descriptions work better for embeddings.
 *
 * This script benchmarks three ways of describing agents, for two GGUF models,
 * against a fixed list of routing tasks:
 *   V1. Original keyword lists (baseline)
 *   V4. Focused discriminating keywords (less overlap between agents)
 *   V5. Multi-embedding approach: several short phrases per agent, where the
 *       agent's score is the maximum similarity over its phrases
 *
 * Embeddings are obtained by running the `llama-embedding` executable, which
 * is resolved through PATH.
 */

const { execFileSync } = require('child_process');
const { existsSync } = require('fs');
const { join } = require('path');
const { homedir } = require('os');

const MODELS_DIR = join(homedir(), '.ruvllm', 'models');
const QWEN_MODEL = join(MODELS_DIR, 'qwen2.5-0.5b-instruct-q4_k_m.gguf');
const RUVLTRA_MODEL = join(MODELS_DIR, 'ruvltra-claude-code-0.5b-q4_k_m.gguf');

// V1: Original keywords (baseline)
const DESCRIPTIONS_V1 = {
  coder: 'implement create write build add code function class component feature',
  researcher: 'research find investigate analyze explore search discover examine',
  reviewer: 'review check evaluate assess inspect examine code quality',
  tester: 'test unit integration e2e coverage mock assertion spec',
  architect: 'design architecture schema system structure plan database',
  'security-architect': 'security vulnerability xss injection audit cve authentication',
  debugger: 'debug fix bug error issue broken crash exception trace',
  documenter: 'document readme jsdoc comment explain describe documentation',
  refactorer: 'refactor extract rename consolidate clean restructure simplify',
  optimizer: 'optimize performance slow fast cache speed memory latency',
  devops: 'deploy ci cd kubernetes docker pipeline container infrastructure',
  'api-docs': 'openapi swagger api documentation graphql schema endpoint',
  planner: 'plan estimate prioritize sprint roadmap schedule milestone',
};

// V4: Focused discriminating keywords - remove overlap, add unique identifiers
const DESCRIPTIONS_V4 = {
  coder: 'implement build create function component feature typescript react',
  researcher: 'research investigate explore discover best practices patterns',
  reviewer: 'review pull request code quality style check pr',
  tester: 'test unit integration e2e tests testing coverage spec',
  architect: 'design architecture schema database system structure diagram',
  'security-architect': 'security vulnerability xss injection csrf audit cve',
  debugger: 'debug fix bug error exception crash trace null pointer',
  documenter: 'jsdoc comments readme documentation describe explain',
  refactorer: 'refactor async await modernize restructure extract',
  optimizer: 'optimize cache performance speed latency slow fast',
  devops: 'deploy ci cd kubernetes docker pipeline infrastructure',
  'api-docs': 'openapi swagger rest api spec endpoint documentation',
  planner: 'sprint plan roadmap milestone estimate schedule prioritize',
};

// V5: Multi-phrase approach - multiple short embeddings per agent, use max similarity
const MULTI_DESCRIPTIONS = {
  coder: [
    'implement function',
    'build component',
    'create typescript code',
    'write feature',
  ],
  researcher: [
    'research best practices',
    'investigate issue',
    'explore solutions',
    'analyze patterns',
  ],
  reviewer: [
    'review pull request',
    'check code quality',
    'evaluate code',
    'assess implementation',
  ],
  tester: [
    'write unit tests',
    'add integration tests',
    'create test coverage',
    'test authentication',
  ],
  architect: [
    'design database schema',
    'plan architecture',
    'system structure',
    'microservices design',
  ],
  'security-architect': [
    'audit xss vulnerability',
    'security audit',
    'check injection',
    'cve vulnerability',
  ],
  debugger: [
    'fix bug',
    'debug error',
    'trace exception',
    'fix null pointer',
  ],
  documenter: [
    'write jsdoc comments',
    'create readme',
    'document functions',
    'explain code',
  ],
  refactorer: [
    'refactor to async await',
    'restructure code',
    'modernize legacy',
    'extract function',
  ],
  optimizer: [
    'cache data',
    'optimize query',
    'improve performance',
    'reduce latency',
  ],
  devops: [
    'deploy kubernetes',
    'setup ci cd',
    'docker container',
    'infrastructure pipeline',
  ],
  'api-docs': [
    'generate openapi',
    'swagger documentation',
    'rest api spec',
    'api endpoint docs',
  ],
  planner: [
    'create sprint plan',
    'estimate timeline',
    'prioritize tasks',
    'roadmap milestone',
  ],
};

// Each task is routed to the most similar agent and compared with `expected`.
const ROUTING_TESTS = [
  { task: 'Implement a binary search function in TypeScript', expected: 'coder' },
  { task: 'Write unit tests for the authentication module', expected: 'tester' },
  { task: 'Review the pull request for security vulnerabilities', expected: 'reviewer' },
  { task: 'Research best practices for React state management', expected: 'researcher' },
  { task: 'Design the database schema for user profiles', expected: 'architect' },
  { task: 'Fix the null pointer exception in the login handler', expected: 'debugger' },
  { task: 'Audit the API endpoints for XSS vulnerabilities', expected: 'security-architect' },
  { task: 'Write JSDoc comments for the utility functions', expected: 'documenter' },
  { task: 'Refactor the payment module to use async/await', expected: 'refactorer' },
  { task: 'Optimize the database queries for the dashboard', expected: 'optimizer' },
  { task: 'Set up the CI/CD pipeline for the microservices', expected: 'devops' },
  { task: 'Generate OpenAPI documentation for the REST API', expected: 'api-docs' },
  { task: 'Create a sprint plan for the next two weeks', expected: 'planner' },
  { task: 'Build a React component for user registration', expected: 'coder' },
  { task: 'Debug memory leak in the WebSocket handler', expected: 'debugger' },
  { task: 'Investigate slow API response times', expected: 'researcher' },
  { task: 'Check code for potential race conditions', expected: 'reviewer' },
  { task: 'Add integration tests for the payment gateway', expected: 'tester' },
  { task: 'Plan the architecture for real-time notifications', expected: 'architect' },
  { task: 'Cache the frequently accessed user data', expected: 'optimizer' },
];

// Widths of the visible separators and of the results-table columns. Borders
// and rows are both generated from these so they cannot drift out of alignment.
const THICK_RULE = '═'.repeat(83);
const THIN_RULE = '─'.repeat(65);
const TABLE_COLUMN_WIDTHS = [25, 15, 15, 15];
const TABLE_VALUE_WIDTH = 12;

// Successful embeddings keyed by model path + prompt. The same task prompts are
// embedded once per strategy, and every lookup spawns a process, so reuse pays.
const embeddingCache = new Map();

// Model paths for which an embedding failure has already been reported.
const warnedModels = new Set();

/**
 * Compute the embedding of `text` with the model at `modelPath` by running
 * `llama-embedding` and parsing its JSON output.
 *
 * The executable is invoked without a shell, so quotes, `$`, backticks and
 * backslashes in `text` or `modelPath` reach it verbatim.
 *
 * @param {string} modelPath - Path to the GGUF model file.
 * @param {string} text - Text to embed; newlines are replaced with spaces.
 * @returns {number[]|null} The embedding of the last entry in the output's
 *   `data` array, or `null` if the command fails or its output is unusable.
 */
function getEmbedding(modelPath, text) {
  // Newlines are flattened exactly as the shell-based version did.
  const prompt = text.replace(/\n/g, ' ');
  const cacheKey = `${modelPath}\u0000${prompt}`;
  if (embeddingCache.has(cacheKey)) {
    return embeddingCache.get(cacheKey);
  }

  try {
    const stdout = execFileSync(
      'llama-embedding',
      ['-m', modelPath, '-p', prompt, '--embd-output-format', 'json'],
      {
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024,
        // stderr is discarded (the equivalent of the former `2>/dev/null`).
        stdio: ['ignore', 'pipe', 'ignore'],
      }
    );
    const { data } = JSON.parse(stdout);
    const { embedding } = data[data.length - 1];
    if (!Array.isArray(embedding)) {
      throw new Error('llama-embedding output contains no embedding array');
    }
    embeddingCache.set(cacheKey, embedding);
    return embedding;
  } catch (err) {
    // Stay best-effort (callers treat null as "similarity 0"), but say so once
    // per model: otherwise a broken setup yields plausible-looking accuracies.
    if (!warnedModels.has(modelPath)) {
      warnedModels.add(modelPath);
      console.warn(`\n  warning: embedding failed for ${modelPath}: ${err.message}`);
    }
    return null;
  }
}

/**
 * Cosine similarity of two numeric vectors.
 *
 * @param {number[]|null|undefined} a
 * @param {number[]|null|undefined} b
 * @returns {number} The cosine similarity, or 0 when either vector is missing
 *   or the lengths differ.
 */
function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  // `|| 1` is deliberate: a zero (or NaN) denominator must not divide.
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
}

/**
 * Pick the highest-scoring agent from `[agent, score]` pairs.
 * Ties keep the earlier agent; with no pair scoring above -1 the result is
 * `{ agent: 'coder', confidence: -1 }`.
 */
function pickBestAgent(agentScores) {
  let best = { agent: 'coder', confidence: -1 };
  for (const [agent, score] of agentScores) {
    if (score > best.confidence) {
      best = { agent, confidence: score };
    }
  }
  return best;
}

/**
 * Standard single-embedding routing.
 *
 * @param {number[]|null} taskEmbedding
 * @param {Object<string, number[]|null>} agentEmbeddings - One embedding per agent.
 * @returns {{agent: string, confidence: number}} Most similar agent and its similarity.
 */
function routeTaskSingle(taskEmbedding, agentEmbeddings) {
  const scores = Object.entries(agentEmbeddings).map(([agent, emb]) => [
    agent,
    cosineSimilarity(taskEmbedding, emb),
  ]);
  return pickBestAgent(scores);
}

/**
 * Multi-embedding routing - use max similarity across multiple phrases.
 *
 * @param {number[]|null} taskEmbedding
 * @param {Object<string, Array<number[]|null>>} multiAgentEmbeddings - Several
 *   phrase embeddings per agent.
 * @returns {{agent: string, confidence: number}} Agent whose best phrase is most similar.
 */
function routeTaskMulti(taskEmbedding, multiAgentEmbeddings) {
  const scores = Object.entries(multiAgentEmbeddings).map(([agent, embeddings]) => {
    // An agent's score is its best phrase; -1 when it has no phrases.
    const maxSim = embeddings.reduce((max, emb) => {
      const sim = cosineSimilarity(taskEmbedding, emb);
      return sim > max ? sim : max;
    }, -1);
    return [agent, maxSim];
  });
  return pickBestAgent(scores);
}

/**
 * Run single-embedding benchmark.
 *
 * @param {string} modelPath - Path to the GGUF model file.
 * @param {Object<string, string>} descriptions - One description per agent.
 * @param {string} version - Label used in progress output and in the result.
 * @returns {{accuracy: number, correct: number, total: number, version: string}}
 */
function runSingleBenchmark(modelPath, descriptions, version) {
  process.stdout.write(` [${version}] Computing embeddings... `);
  const agentEmbeddings = {};
  for (const [agent, desc] of Object.entries(descriptions)) {
    agentEmbeddings[agent] = getEmbedding(modelPath, desc);
  }
  console.log('done');

  let correct = 0;
  for (const test of ROUTING_TESTS) {
    const taskEmb = getEmbedding(modelPath, test.task);
    const { agent } = routeTaskSingle(taskEmb, agentEmbeddings);
    if (agent === test.expected) correct++;
  }

  return {
    accuracy: correct / ROUTING_TESTS.length,
    correct,
    total: ROUTING_TESTS.length,
    version,
  };
}

/**
 * Run multi-embedding benchmark.
 *
 * @param {string} modelPath - Path to the GGUF model file.
 * @param {Object<string, string[]>} multiDescriptions - Several phrases per agent.
 * @param {string} version - Label used in progress output and in the result.
 * @returns {{accuracy: number, correct: number, total: number, version: string,
 *   results: Array<{task: string, expected: string, got: string, correct: boolean}>}}
 */
function runMultiBenchmark(modelPath, multiDescriptions, version) {
  process.stdout.write(` [${version}] Computing multi-embeddings... `);
  const multiAgentEmbeddings = {};
  for (const [agent, phrases] of Object.entries(multiDescriptions)) {
    multiAgentEmbeddings[agent] = phrases.map((phrase) => getEmbedding(modelPath, phrase));
  }
  console.log('done');

  let correct = 0;
  const results = [];
  for (const test of ROUTING_TESTS) {
    const taskEmb = getEmbedding(modelPath, test.task);
    const { agent } = routeTaskMulti(taskEmb, multiAgentEmbeddings);
    const isCorrect = agent === test.expected;
    if (isCorrect) correct++;
    results.push({ task: test.task, expected: test.expected, got: agent, correct: isCorrect });
  }

  return {
    accuracy: correct / ROUTING_TESTS.length,
    correct,
    total: ROUTING_TESTS.length,
    version,
    results,
  };
}

/** Pad `text` on both sides so it is centred in a field of `width` characters. */
function centerText(text, width) {
  const leftPad = Math.max(Math.floor((width - text.length) / 2), 0);
  return text.padStart(text.length + leftPad).padEnd(width);
}

/** Build a horizontal table border from the three corner/junction glyphs. */
function tableBorder(left, junction, right) {
  return left + TABLE_COLUMN_WIDTHS.map((width) => '─'.repeat(width)).join(junction) + right;
}

/** Build a table row of left-aligned header/label cells. */
function tableHeaderRow(cells) {
  const padded = cells.map((cell, i) => ` ${cell}`.padEnd(TABLE_COLUMN_WIDTHS[i]));
  return `│${padded.join('│')}│`;
}

/** Build a table row: a left-aligned label followed by right-aligned values. */
function tableValueRow(label, values) {
  const [labelWidth, ...valueWidths] = TABLE_COLUMN_WIDTHS;
  const cells = values.map((value, i) => value.padStart(TABLE_VALUE_WIDTH).padEnd(valueWidths[i]));
  return `│${` ${label}`.padEnd(labelWidth)}│${cells.join('│')}│`;
}

/** Print a section title framed by thin rules, optionally after a blank line. */
function printSection(title, { leadingBlank = false } = {}) {
  if (leadingBlank) console.log('');
  console.log(THIN_RULE);
  console.log(title);
  console.log(THIN_RULE);
}

/** Print a heading framed by thick rules, preceded by a blank line. */
function printHeading(title) {
  console.log(`\n${THICK_RULE}`);
  console.log(title);
  console.log(THICK_RULE);
}

/** Highest-accuracy result; on a tie the later entry wins. */
function bestOf(results) {
  return results.reduce((a, b) => (a.accuracy > b.accuracy ? a : b));
}

/**
 * Entry point: verify both models exist, run all three strategies for each
 * model, and print the comparison table, the per-task V5 results for RuvLTRA,
 * and the overall winner. Exits with status 1 if a model file is missing.
 */
async function main() {
  console.log(`\n╔${THICK_RULE}╗`);
  console.log(`║${centerText('OPTIMIZED MODEL COMPARISON: Focused & Multi-Embedding', THICK_RULE.length)}║`);
  console.log(`╚${THICK_RULE}╝\n`);

  // Both models are benchmarked, so both must be present; a missing one would
  // otherwise be scored from null embeddings and reported as a real result.
  const requiredModels = [
    ['RuvLTRA', RUVLTRA_MODEL],
    ['Qwen', QWEN_MODEL],
  ];
  const missingModels = requiredModels.filter(([, modelPath]) => !existsSync(modelPath));
  if (missingModels.length > 0) {
    for (const [label] of missingModels) {
      console.error(`${label} model not found.`);
    }
    process.exit(1);
  }

  console.log('Strategies:');
  console.log(' V1: Original keywords (baseline)');
  console.log(' V4: Focused discriminating keywords');
  console.log(' V5: Multi-phrase (4 phrases per agent, max similarity)\n');

  // RuvLTRA tests
  printSection(' RUVLTRA CLAUDE CODE');
  const ruvV1 = runSingleBenchmark(RUVLTRA_MODEL, DESCRIPTIONS_V1, 'V1-Original');
  const ruvV4 = runSingleBenchmark(RUVLTRA_MODEL, DESCRIPTIONS_V4, 'V4-Focused');
  const ruvV5 = runMultiBenchmark(RUVLTRA_MODEL, MULTI_DESCRIPTIONS, 'V5-Multi');

  // Qwen tests
  printSection(' QWEN 0.5B BASE', { leadingBlank: true });
  const qwenV1 = runSingleBenchmark(QWEN_MODEL, DESCRIPTIONS_V1, 'V1-Original');
  const qwenV4 = runSingleBenchmark(QWEN_MODEL, DESCRIPTIONS_V4, 'V4-Focused');
  const qwenV5 = runMultiBenchmark(QWEN_MODEL, MULTI_DESCRIPTIONS, 'V5-Multi');

  // Results
  printHeading(' RESULTS');
  console.log('');

  const fmt = (v) => `${(v * 100).toFixed(1)}%`;
  const fmtDelta = (v, base) => {
    const delta = (v - base) * 100;
    const sign = delta >= 0 ? '+' : '';
    return `${sign}${delta.toFixed(1)}%`;
  };

  console.log(tableBorder('┌', '┬', '┐'));
  console.log(tableHeaderRow(['Strategy', 'RuvLTRA', 'Qwen Base', 'RuvLTRA Delta']));
  console.log(tableBorder('├', '┼', '┤'));
  console.log(tableValueRow('V1: Original', [
    fmt(ruvV1.accuracy),
    fmt(qwenV1.accuracy),
    'baseline',
  ]));
  console.log(tableValueRow('V4: Focused', [
    fmt(ruvV4.accuracy),
    fmt(qwenV4.accuracy),
    fmtDelta(ruvV4.accuracy, ruvV1.accuracy),
  ]));
  console.log(tableValueRow('V5: Multi-phrase', [
    fmt(ruvV5.accuracy),
    fmt(qwenV5.accuracy),
    fmtDelta(ruvV5.accuracy, ruvV1.accuracy),
  ]));
  console.log(tableBorder('└', '┴', '┘'));

  // Best result
  const allResults = [
    { model: 'RuvLTRA', ...ruvV1 },
    { model: 'RuvLTRA', ...ruvV4 },
    { model: 'RuvLTRA', ...ruvV5 },
    { model: 'Qwen', ...qwenV1 },
    { model: 'Qwen', ...qwenV4 },
    { model: 'Qwen', ...qwenV5 },
  ];
  const best = bestOf(allResults);
  console.log(`\n BEST: ${best.model} + ${best.version} = ${(best.accuracy * 100).toFixed(1)}%`);

  // Show V5 detailed results
  printSection(' V5 MULTI-PHRASE DETAILED (RuvLTRA)', { leadingBlank: true });
  for (const r of ruvV5.results) {
    const mark = r.correct ? '✓' : '✗';
    const task = r.task.slice(0, 50).padEnd(50);
    const exp = r.expected.padEnd(18);
    const got = r.got.padEnd(18);
    console.log(` ${mark} ${task} ${exp} ${r.correct ? '' : `→ ${got}`}`);
  }

  // Final comparison
  const ruvBest = bestOf([ruvV1, ruvV4, ruvV5]);
  const qwenBest = bestOf([qwenV1, qwenV4, qwenV5]);

  printHeading(' FINAL WINNER');
  console.log(`\n RuvLTRA best: ${ruvBest.version} = ${(ruvBest.accuracy * 100).toFixed(1)}%`);
  console.log(` Qwen best: ${qwenBest.version} = ${(qwenBest.accuracy * 100).toFixed(1)}%`);

  // Both results share the same task count, so the integer `correct` totals
  // decide the leader without any floating-point comparison.
  const lead = ruvBest.correct - qwenBest.correct;
  const points = (Math.abs(ruvBest.accuracy - qwenBest.accuracy) * 100).toFixed(1);
  if (lead > 0) {
    console.log(`\n Margin: RuvLTRA leads by ${points} points`);
  } else if (lead < 0) {
    console.log(`\n Margin: Qwen leads by ${points} points`);
  } else {
    console.log('\n Margin: tie (0.0 points)');
  }
  console.log('\n');
}

main().catch((err) => {
  console.error(err);
  // Report failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});