git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
289 lines
18 KiB
JavaScript
289 lines
18 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
|
|
* Improved Model Comparison - Enhanced Agent Descriptions
|
|
*
|
|
* Key improvements:
|
|
* 1. Semantic sentence descriptions instead of keyword lists
|
|
* 2. Example tasks embedded in descriptions
|
|
* 3. Unique discriminating phrases for each agent
|
|
* 4. Adjusted similarity scoring with top-k voting
|
|
*/
|
|
|
|
const { execSync } = require('child_process');
|
|
const { existsSync } = require('fs');
|
|
const { join } = require('path');
|
|
const { homedir } = require('os');
|
|
|
|
// Model paths
|
|
const MODELS_DIR = join(homedir(), '.ruvllm', 'models');
|
|
const QWEN_MODEL = join(MODELS_DIR, 'qwen2.5-0.5b-instruct-q4_k_m.gguf');
|
|
const RUVLTRA_MODEL = join(MODELS_DIR, 'ruvltra-claude-code-0.5b-q4_k_m.gguf');
|
|
|
|
// IMPROVED: Semantic sentence descriptions with examples
|
|
const AGENT_DESCRIPTIONS_V1 = {
|
|
coder: 'implement create write build add code function class component feature',
|
|
researcher: 'research find investigate analyze explore search discover examine',
|
|
reviewer: 'review check evaluate assess inspect examine code quality',
|
|
tester: 'test unit integration e2e coverage mock assertion spec',
|
|
architect: 'design architecture schema system structure plan database',
|
|
'security-architect': 'security vulnerability xss injection audit cve authentication',
|
|
debugger: 'debug fix bug error issue broken crash exception trace',
|
|
documenter: 'document readme jsdoc comment explain describe documentation',
|
|
refactorer: 'refactor extract rename consolidate clean restructure simplify',
|
|
optimizer: 'optimize performance slow fast cache speed memory latency',
|
|
devops: 'deploy ci cd kubernetes docker pipeline container infrastructure',
|
|
'api-docs': 'openapi swagger api documentation graphql schema endpoint',
|
|
planner: 'plan estimate prioritize sprint roadmap schedule milestone',
|
|
};
|
|
|
|
// V2: Semantic sentences with task context
|
|
const AGENT_DESCRIPTIONS_V2 = {
|
|
coder: 'I write new code and implement features. Create functions, build components, implement algorithms like binary search, build React components, write TypeScript code.',
|
|
researcher: 'I research and investigate topics. Find best practices, explore solutions, investigate performance issues, analyze patterns, discover new approaches.',
|
|
reviewer: 'I review existing code for quality. Check pull requests, evaluate code style, assess readability, inspect for bugs, examine code patterns.',
|
|
tester: 'I write tests for code. Create unit tests, add integration tests, write e2e tests, mock dependencies, check test coverage, write test specs.',
|
|
architect: 'I design system architecture. Plan database schemas, design API structures, create system diagrams, plan microservices, design data models.',
|
|
'security-architect': 'I audit security vulnerabilities. Check for XSS, SQL injection, CSRF, audit authentication, review security policies, scan for CVEs.',
|
|
debugger: 'I fix bugs and debug errors. Trace exceptions, fix crashes, resolve null pointer errors, debug memory leaks, fix runtime issues.',
|
|
documenter: 'I write documentation and comments. Add JSDoc comments, write README files, explain code functionality, describe APIs, create guides.',
|
|
refactorer: 'I refactor and restructure code. Modernize to async/await, extract functions, rename variables, consolidate duplicate code, simplify logic.',
|
|
optimizer: 'I optimize performance and speed. Cache data, improve query performance, reduce latency, optimize memory usage, speed up slow operations.',
|
|
devops: 'I handle deployment and infrastructure. Set up CI/CD pipelines, configure Kubernetes, manage Docker containers, deploy to cloud.',
|
|
'api-docs': 'I create API documentation specs. Generate OpenAPI specs, write Swagger docs, document REST endpoints, create GraphQL schemas.',
|
|
planner: 'I create project plans and estimates. Sprint planning, roadmap creation, milestone tracking, task prioritization, schedule estimation.',
|
|
};
|
|
|
|
// V3: Even more specific with negative space
|
|
const AGENT_DESCRIPTIONS_V3 = {
|
|
coder: 'Software developer who implements new features and writes production code. Tasks: implement binary search, build React components, create TypeScript functions, add new functionality to applications.',
|
|
researcher: 'Technical researcher who investigates and analyzes. Tasks: research best practices, explore state management options, investigate slow response times, analyze codebase patterns.',
|
|
reviewer: 'Code reviewer who evaluates existing code quality. Tasks: review pull requests, check for race conditions, assess code style, evaluate implementation approaches.',
|
|
tester: 'QA engineer who writes automated tests. Tasks: write unit tests, add integration tests, create e2e test suites, test payment gateways, verify authentication modules.',
|
|
architect: 'System architect who designs software structure. Tasks: design database schemas, plan real-time notification systems, architect microservices, model data relationships.',
|
|
'security-architect': 'Security specialist who audits vulnerabilities. Tasks: audit API endpoints for XSS, check SQL injection risks, review authentication security, scan for CSRF vulnerabilities.',
|
|
debugger: 'Bug hunter who fixes errors and traces issues. Tasks: fix null pointer exceptions, debug memory leaks, trace WebSocket errors, resolve crash bugs.',
|
|
documenter: 'Technical writer who creates documentation. Tasks: write JSDoc comments, create README files, document utility functions, explain complex code.',
|
|
refactorer: 'Code modernizer who restructures without changing behavior. Tasks: refactor to async/await, extract reusable functions, modernize legacy patterns, simplify complex logic.',
|
|
optimizer: 'Performance engineer who speeds up slow code. Tasks: cache frequently accessed data, optimize database queries, reduce API latency, improve memory efficiency.',
|
|
devops: 'DevOps engineer who manages deployment infrastructure. Tasks: set up CI/CD pipelines, configure Kubernetes clusters, manage Docker deployments, automate releases.',
|
|
'api-docs': 'API documentation specialist. Tasks: generate OpenAPI documentation, create Swagger specs, document REST API endpoints, write API reference guides.',
|
|
planner: 'Project planner who organizes work. Tasks: create sprint plans, estimate timelines, prioritize backlog, schedule milestones, plan roadmaps.',
|
|
};
|
|
|
|
// Test cases for routing
|
|
const ROUTING_TESTS = [
|
|
{ task: 'Implement a binary search function in TypeScript', expected: 'coder' },
|
|
{ task: 'Write unit tests for the authentication module', expected: 'tester' },
|
|
{ task: 'Review the pull request for security vulnerabilities', expected: 'reviewer' },
|
|
{ task: 'Research best practices for React state management', expected: 'researcher' },
|
|
{ task: 'Design the database schema for user profiles', expected: 'architect' },
|
|
{ task: 'Fix the null pointer exception in the login handler', expected: 'debugger' },
|
|
{ task: 'Audit the API endpoints for XSS vulnerabilities', expected: 'security-architect' },
|
|
{ task: 'Write JSDoc comments for the utility functions', expected: 'documenter' },
|
|
{ task: 'Refactor the payment module to use async/await', expected: 'refactorer' },
|
|
{ task: 'Optimize the database queries for the dashboard', expected: 'optimizer' },
|
|
{ task: 'Set up the CI/CD pipeline for the microservices', expected: 'devops' },
|
|
{ task: 'Generate OpenAPI documentation for the REST API', expected: 'api-docs' },
|
|
{ task: 'Create a sprint plan for the next two weeks', expected: 'planner' },
|
|
{ task: 'Build a React component for user registration', expected: 'coder' },
|
|
{ task: 'Debug memory leak in the WebSocket handler', expected: 'debugger' },
|
|
{ task: 'Investigate slow API response times', expected: 'researcher' },
|
|
{ task: 'Check code for potential race conditions', expected: 'reviewer' },
|
|
{ task: 'Add integration tests for the payment gateway', expected: 'tester' },
|
|
{ task: 'Plan the architecture for real-time notifications', expected: 'architect' },
|
|
{ task: 'Cache the frequently accessed user data', expected: 'optimizer' },
|
|
];
|
|
|
|
// Similarity test pairs
|
|
const SIMILARITY_TESTS = [
|
|
{ text1: 'implement user authentication', text2: 'create login functionality', expected: 'high' },
|
|
{ text1: 'write unit tests', text2: 'fix database bug', expected: 'low' },
|
|
{ text1: 'optimize query performance', text2: 'improve database speed', expected: 'high' },
|
|
{ text1: 'design system architecture', text2: 'plan software structure', expected: 'high' },
|
|
{ text1: 'deploy to kubernetes', text2: 'analyze user behavior', expected: 'low' },
|
|
{ text1: 'refactor legacy code', text2: 'restructure old module', expected: 'high' },
|
|
{ text1: 'debug memory leak', text2: 'fix memory consumption issue', expected: 'high' },
|
|
{ text1: 'document api endpoints', text2: 'write openapi spec', expected: 'high' },
|
|
];
|
|
|
|
/**
|
|
* Get embedding from model
|
|
*/
|
|
function getEmbedding(modelPath, text) {
|
|
try {
|
|
const sanitized = text.replace(/"/g, '\\"').replace(/\n/g, ' ');
|
|
const result = execSync(
|
|
`llama-embedding -m "${modelPath}" -p "${sanitized}" --embd-output-format json 2>/dev/null`,
|
|
{ encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }
|
|
);
|
|
const json = JSON.parse(result);
|
|
return json.data[json.data.length - 1].embedding;
|
|
} catch (err) {
|
|
console.error(`Error: ${err.message}`);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Cosine similarity
|
|
*/
|
|
function cosineSimilarity(a, b) {
|
|
if (!a || !b || a.length !== b.length) return 0;
|
|
let dot = 0, normA = 0, normB = 0;
|
|
for (let i = 0; i < a.length; i++) {
|
|
dot += a[i] * b[i];
|
|
normA += a[i] * a[i];
|
|
normB += b[i] * b[i];
|
|
}
|
|
return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
|
|
}
|
|
|
|
/**
|
|
* Route task with top-k analysis
|
|
*/
|
|
function routeTask(taskEmbedding, agentEmbeddings, topK = 3) {
|
|
const scores = [];
|
|
for (const [agent, embedding] of Object.entries(agentEmbeddings)) {
|
|
const sim = cosineSimilarity(taskEmbedding, embedding);
|
|
scores.push({ agent, similarity: sim });
|
|
}
|
|
scores.sort((a, b) => b.similarity - a.similarity);
|
|
|
|
return {
|
|
agent: scores[0].agent,
|
|
confidence: scores[0].similarity,
|
|
topK: scores.slice(0, topK),
|
|
margin: scores[0].similarity - scores[1].similarity,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Run benchmark for a specific description version
|
|
*/
|
|
function runBenchmark(modelPath, modelName, descriptions, version) {
|
|
console.log(`\n [${version}] Computing agent embeddings...`);
|
|
|
|
const agentEmbeddings = {};
|
|
for (const [agent, description] of Object.entries(descriptions)) {
|
|
process.stdout.write(` ${agent}... `);
|
|
agentEmbeddings[agent] = getEmbedding(modelPath, description);
|
|
console.log('done');
|
|
}
|
|
|
|
console.log(` [${version}] Running routing tests...`);
|
|
let correct = 0;
|
|
const failures = [];
|
|
|
|
for (const test of ROUTING_TESTS) {
|
|
const taskEmbedding = getEmbedding(modelPath, test.task);
|
|
const { agent, confidence, topK, margin } = routeTask(taskEmbedding, agentEmbeddings);
|
|
const isCorrect = agent === test.expected;
|
|
if (isCorrect) {
|
|
correct++;
|
|
} else {
|
|
failures.push({
|
|
task: test.task,
|
|
expected: test.expected,
|
|
got: agent,
|
|
topK,
|
|
margin,
|
|
});
|
|
}
|
|
}
|
|
|
|
const accuracy = correct / ROUTING_TESTS.length;
|
|
return { accuracy, correct, total: ROUTING_TESTS.length, failures, version };
|
|
}
|
|
|
|
/**
|
|
* Main comparison
|
|
*/
|
|
async function main() {
|
|
console.log('\n╔═══════════════════════════════════════════════════════════════════════════════════╗');
|
|
console.log('║ IMPROVED MODEL COMPARISON: Testing Description Strategies ║');
|
|
console.log('║ Semantic Descriptions vs Keyword Lists ║');
|
|
console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');
|
|
|
|
if (!existsSync(QWEN_MODEL) || !existsSync(RUVLTRA_MODEL)) {
|
|
console.error('Models not found. Run the original comparison first.');
|
|
process.exit(1);
|
|
}
|
|
|
|
console.log('Testing 3 description strategies:');
|
|
console.log(' V1: Keyword lists (baseline)');
|
|
console.log(' V2: Semantic sentences with examples');
|
|
console.log(' V3: Task-specific descriptions with context\n');
|
|
|
|
// Test all three versions with RuvLTRA
|
|
console.log('─────────────────────────────────────────────────────────────────');
|
|
console.log(' RUVLTRA CLAUDE CODE MODEL');
|
|
console.log('─────────────────────────────────────────────────────────────────');
|
|
|
|
const v1Results = runBenchmark(RUVLTRA_MODEL, 'RuvLTRA', AGENT_DESCRIPTIONS_V1, 'V1-Keywords');
|
|
const v2Results = runBenchmark(RUVLTRA_MODEL, 'RuvLTRA', AGENT_DESCRIPTIONS_V2, 'V2-Semantic');
|
|
const v3Results = runBenchmark(RUVLTRA_MODEL, 'RuvLTRA', AGENT_DESCRIPTIONS_V3, 'V3-TaskSpecific');
|
|
|
|
// Also test Qwen with best strategy
|
|
console.log('\n─────────────────────────────────────────────────────────────────');
|
|
console.log(' QWEN 0.5B BASE MODEL');
|
|
console.log('─────────────────────────────────────────────────────────────────');
|
|
|
|
const qwenV1 = runBenchmark(QWEN_MODEL, 'Qwen', AGENT_DESCRIPTIONS_V1, 'V1-Keywords');
|
|
const qwenV3 = runBenchmark(QWEN_MODEL, 'Qwen', AGENT_DESCRIPTIONS_V3, 'V3-TaskSpecific');
|
|
|
|
// Results summary
|
|
console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
|
|
console.log(' RESULTS COMPARISON');
|
|
console.log('═══════════════════════════════════════════════════════════════════════════════════\n');
|
|
|
|
console.log('┌─────────────────────────┬───────────────┬───────────────┬───────────────┐');
|
|
console.log('│ Strategy │ RuvLTRA │ Qwen Base │ Improvement │');
|
|
console.log('├─────────────────────────┼───────────────┼───────────────┼───────────────┤');
|
|
|
|
const formatPct = (v) => `${(v * 100).toFixed(1)}%`.padStart(12);
|
|
|
|
console.log(`│ V1: Keywords │${formatPct(v1Results.accuracy)} │${formatPct(qwenV1.accuracy)} │ baseline │`);
|
|
console.log(`│ V2: Semantic │${formatPct(v2Results.accuracy)} │ - │${formatPct(v2Results.accuracy - v1Results.accuracy)} │`);
|
|
console.log(`│ V3: Task-Specific │${formatPct(v3Results.accuracy)} │${formatPct(qwenV3.accuracy)} │${formatPct(v3Results.accuracy - v1Results.accuracy)} │`);
|
|
|
|
console.log('└─────────────────────────┴───────────────┴───────────────┴───────────────┘');
|
|
|
|
// Find best strategy
|
|
const best = [v1Results, v2Results, v3Results].reduce((a, b) => a.accuracy > b.accuracy ? a : b);
|
|
|
|
console.log(`\n BEST STRATEGY: ${best.version} with ${(best.accuracy * 100).toFixed(1)}% accuracy`);
|
|
console.log(` Improvement over V1: +${((best.accuracy - v1Results.accuracy) * 100).toFixed(1)} percentage points`);
|
|
|
|
// Show remaining failures for best strategy
|
|
if (best.failures.length > 0) {
|
|
console.log(`\n Remaining failures (${best.failures.length}):`);
|
|
for (const f of best.failures.slice(0, 5)) {
|
|
console.log(` "${f.task.slice(0, 45)}..."`);
|
|
console.log(` Expected: ${f.expected}, Got: ${f.got}`);
|
|
console.log(` Top-3: ${f.topK.map(t => `${t.agent}(${(t.similarity * 100).toFixed(0)}%)`).join(', ')}`);
|
|
}
|
|
}
|
|
|
|
// RuvLTRA vs Qwen with best strategy
|
|
console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
|
|
console.log(' FINAL COMPARISON (V3 Task-Specific Descriptions)');
|
|
console.log('═══════════════════════════════════════════════════════════════════════════════════\n');
|
|
|
|
console.log('┌─────────────────────────────┬───────────────┬───────────────┐');
|
|
console.log('│ Metric │ Qwen Base │ RuvLTRA │');
|
|
console.log('├─────────────────────────────┼───────────────┼───────────────┤');
|
|
|
|
const qwenWins = qwenV3.accuracy > v3Results.accuracy;
|
|
const ruvWins = v3Results.accuracy > qwenV3.accuracy;
|
|
console.log(`│ V3 Routing Accuracy │${qwenWins ? '✓' : ' '}${formatPct(qwenV3.accuracy)} │${ruvWins ? '✓' : ' '}${formatPct(v3Results.accuracy)} │`);
|
|
console.log('└─────────────────────────────┴───────────────┴───────────────┘');
|
|
|
|
const winner = ruvWins ? 'RuvLTRA' : qwenWins ? 'Qwen' : 'Tie';
|
|
const margin = Math.abs(v3Results.accuracy - qwenV3.accuracy) * 100;
|
|
|
|
console.log(`\n WINNER: ${winner} (${margin.toFixed(1)} point margin)`);
|
|
console.log('\n');
|
|
}
|
|
|
|
main().catch(console.error);
|