281 lines
15 KiB
JavaScript
281 lines
15 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Real Model Comparison - Qwen 0.5B vs RuvLTRA Claude Code
 *
 * Uses llama-embedding for actual model inference.
 */
|
|
|
|
const { execFileSync, execSync } = require('child_process');
const { existsSync } = require('fs');
const { join } = require('path');
const { homedir } = require('os');
|
|
|
|
// Locations of the two GGUF model files, both stored under ~/.ruvllm/models.
const MODELS_DIR = join(homedir(), '.ruvllm', 'models');
const [QWEN_MODEL, RUVLTRA_MODEL] = [
  'qwen2.5-0.5b-instruct-q4_k_m.gguf',
  'ruvltra-claude-code-0.5b-q4_k_m.gguf',
].map((fileName) => join(MODELS_DIR, fileName));
|
|
|
|
// Keyword profile for every routable agent, keyed by agent name.
// The routing benchmark embeds each profile and compares task embeddings
// against them, so the insertion order below is the order agents are tried.
const AGENT_DESCRIPTIONS = Object.fromEntries([
  ['coder', 'implement create write build add code function class component feature'],
  ['researcher', 'research find investigate analyze explore search discover examine'],
  ['reviewer', 'review check evaluate assess inspect examine code quality'],
  ['tester', 'test unit integration e2e coverage mock assertion spec'],
  ['architect', 'design architecture schema system structure plan database'],
  ['security-architect', 'security vulnerability xss injection audit cve authentication'],
  ['debugger', 'debug fix bug error issue broken crash exception trace'],
  ['documenter', 'document readme jsdoc comment explain describe documentation'],
  ['refactorer', 'refactor extract rename consolidate clean restructure simplify'],
  ['optimizer', 'optimize performance slow fast cache speed memory latency'],
  ['devops', 'deploy ci cd kubernetes docker pipeline container infrastructure'],
  ['api-docs', 'openapi swagger api documentation graphql schema endpoint'],
  ['planner', 'plan estimate prioritize sprint roadmap schedule milestone'],
]);
|
|
|
|
// Routing benchmark cases: each task description is paired with the agent
// that is expected to be selected for it.
const ROUTING_TESTS = [
  ['Implement a binary search function in TypeScript', 'coder'],
  ['Write unit tests for the authentication module', 'tester'],
  ['Review the pull request for security vulnerabilities', 'reviewer'],
  ['Research best practices for React state management', 'researcher'],
  ['Design the database schema for user profiles', 'architect'],
  ['Fix the null pointer exception in the login handler', 'debugger'],
  ['Audit the API endpoints for XSS vulnerabilities', 'security-architect'],
  ['Write JSDoc comments for the utility functions', 'documenter'],
  ['Refactor the payment module to use async/await', 'refactorer'],
  ['Optimize the database queries for the dashboard', 'optimizer'],
  ['Set up the CI/CD pipeline for the microservices', 'devops'],
  ['Generate OpenAPI documentation for the REST API', 'api-docs'],
  ['Create a sprint plan for the next two weeks', 'planner'],
  ['Build a React component for user registration', 'coder'],
  ['Debug memory leak in the WebSocket handler', 'debugger'],
  ['Investigate slow API response times', 'researcher'],
  ['Check code for potential race conditions', 'reviewer'],
  ['Add integration tests for the payment gateway', 'tester'],
  ['Plan the architecture for real-time notifications', 'architect'],
  ['Cache the frequently accessed user data', 'optimizer'],
].map(([task, expected]) => ({ task, expected }));
|
|
|
|
// Similarity benchmark cases: two phrases plus whether their embeddings are
// expected to be close ('high') or far apart ('low').
const SIMILARITY_TESTS = [
  ['implement user authentication', 'create login functionality', 'high'],
  ['write unit tests', 'fix database bug', 'low'],
  ['optimize query performance', 'improve database speed', 'high'],
  ['design system architecture', 'plan software structure', 'high'],
  ['deploy to kubernetes', 'analyze user behavior', 'low'],
  ['refactor legacy code', 'restructure old module', 'high'],
  ['debug memory leak', 'fix memory consumption issue', 'high'],
  ['document api endpoints', 'write openapi spec', 'high'],
].map(([text1, text2, expected]) => ({ text1, text2, expected }));
|
|
|
|
/**
 * Get an embedding for `text` by running the `llama-embedding` CLI.
 *
 * The binary is executed directly with an argument array (no shell), so
 * quotes, `$`, backticks and backslashes in `text` or `modelPath` are passed
 * through literally instead of being interpreted as shell syntax.
 *
 * @param {string} modelPath - Model file handed to the CLI via `-m`.
 * @param {string} text - Prompt to embed; newlines are replaced by spaces.
 * @returns {number[]|null} The `embedding` field of the last entry in the
 *   JSON output's `data` array, or `null` if the command or parsing failed
 *   (the error message is written to stderr).
 */
function getEmbedding(modelPath, text) {
  try {
    // Keep the prompt on a single line, exactly as the previous
    // shell-based invocation did.
    const prompt = text.replace(/\n/g, ' ');

    const stdout = execFileSync(
      'llama-embedding',
      ['-m', modelPath, '-p', prompt, '--embd-output-format', 'json'],
      {
        encoding: 'utf-8',
        maxBuffer: 10 * 1024 * 1024,
        // Replaces the former `2>/dev/null`: the CLI's stderr is discarded.
        stdio: ['pipe', 'pipe', 'ignore'],
      }
    );

    const json = JSON.parse(stdout);
    // Return the last embedding (the full prompt embedding)
    return json.data[json.data.length - 1].embedding;
  } catch (err) {
    console.error(`Error getting embedding: ${err.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Cosine similarity between two equal-length numeric vectors.
 *
 * @param {ArrayLike<number>|null|undefined} a - First vector.
 * @param {ArrayLike<number>|null|undefined} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when either vector is missing
 *   or the lengths differ. A zero (or NaN) magnitude product is replaced by 1
 *   so the division never produces Infinity.
 */
function cosineSimilarity(a, b) {
  const comparable = Boolean(a) && Boolean(b) && a.length === b.length;
  if (!comparable) {
    return 0;
  }

  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;
  let index = 0;
  while (index < a.length) {
    const x = a[index];
    const y = b[index];
    dotProduct += x * y;
    squaredNormA += x * x;
    squaredNormB += y * y;
    index += 1;
  }

  const magnitude = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);
  return dotProduct / (magnitude || 1);
}
|
|
|
|
/**
 * Pick the agent whose embedding is most similar to the task embedding.
 *
 * @param {number[]|null} taskEmbedding - Embedding of the task text.
 * @param {Object<string, number[]|null>} agentEmbeddings - Embedding per agent name.
 * @returns {{agent: string, confidence: number}} The best match and its
 *   cosine similarity. Falls back to `'coder'` with confidence -1 when no
 *   agent scores above -1; on equal scores the earlier agent is kept.
 */
function routeTask(taskEmbedding, agentEmbeddings) {
  return Object.entries(agentEmbeddings).reduce(
    (best, [agent, embedding]) => {
      const confidence = cosineSimilarity(taskEmbedding, embedding);
      // Strictly greater: a later agent never replaces an equal earlier one.
      return confidence > best.confidence ? { agent, confidence } : best;
    },
    { agent: 'coder', confidence: -1 }
  );
}
|
|
|
|
/**
 * Run the routing benchmark for one model.
 *
 * Embeds every AGENT_DESCRIPTIONS entry once, then embeds each ROUTING_TESTS
 * task and routes it to the most similar agent with routeTask.
 *
 * Embedding failures are never scored as hits: an agent whose description
 * could not be embedded is excluded from routing, and a task that could not
 * be embedded (or that has no agent to be routed to) is recorded as incorrect
 * with `actual: null` instead of falling through to the default agent.
 *
 * @param {string} modelPath - Model file passed to getEmbedding.
 * @param {string} modelName - Human-readable model name used in log output.
 * @returns {{accuracy: number, correct: number, total: number, results: Array<{task: string, expected: string, actual: (string|null), correct: boolean, confidence: number}>}}
 *   Aggregate and per-task outcome; `accuracy` is 0 when there are no tests.
 */
function runRoutingBenchmark(modelPath, modelName) {
  console.log(`\n Computing agent embeddings for ${modelName}...`);

  // Pre-compute agent embeddings, keeping only the ones that succeeded.
  const agentEmbeddings = {};
  for (const [agent, description] of Object.entries(AGENT_DESCRIPTIONS)) {
    process.stdout.write(` ${agent}... `);
    const embedding = getEmbedding(modelPath, description);
    if (embedding) {
      agentEmbeddings[agent] = embedding;
      console.log('done');
    } else {
      console.log('failed');
    }
  }
  const hasAgents = Object.keys(agentEmbeddings).length > 0;

  console.log(` Running routing tests...`);
  let correct = 0;
  const results = [];

  for (const test of ROUTING_TESTS) {
    process.stdout.write(` "${test.task.slice(0, 40)}..." `);
    const taskEmbedding = getEmbedding(modelPath, test.task);

    // Without a task embedding or any agent embedding there is nothing to
    // compare, so the case counts as a miss rather than a 'coder' prediction.
    if (!taskEmbedding || !hasAgents) {
      console.log(`embedding failed (expected: ${test.expected}) ✗`);
      results.push({ task: test.task, expected: test.expected, actual: null, correct: false, confidence: 0 });
      continue;
    }

    const { agent, confidence } = routeTask(taskEmbedding, agentEmbeddings);
    const isCorrect = agent === test.expected;
    if (isCorrect) correct++;
    console.log(`${agent} (expected: ${test.expected}) ${isCorrect ? '✓' : '✗'}`);
    results.push({ task: test.task, expected: test.expected, actual: agent, correct: isCorrect, confidence });
  }

  const total = ROUTING_TESTS.length;
  const accuracy = total > 0 ? correct / total : 0;
  return { accuracy, correct, total, results };
}
|
|
|
|
/**
 * Run the similarity benchmark for one model.
 *
 * Embeds both phrases of every SIMILARITY_TESTS entry, classifies the pair as
 * 'high' or 'low' from their cosine similarity, and compares that with the
 * expected label.
 *
 * A pair for which either embedding failed is recorded as incorrect with
 * `similarity: null` and `predicted: null`; it is not classified as 'low'.
 *
 * @param {string} modelPath - Model file passed to getEmbedding.
 * @param {string} modelName - Human-readable model name used in log output.
 * @returns {{accuracy: number, correct: number, total: number, results: Array<{text1: string, text2: string, similarity: (number|null), predicted: (string|null), expected: string, correct: boolean}>}}
 *   Aggregate and per-pair outcome; `accuracy` is 0 when there are no tests.
 */
function runSimilarityBenchmark(modelPath, modelName) {
  // Single cut-off: similarity above this is 'high', anything else is 'low'.
  const HIGH_SIMILARITY_THRESHOLD = 0.6;

  console.log(`\n Running similarity tests for ${modelName}...`);

  let correct = 0;
  const results = [];

  for (const test of SIMILARITY_TESTS) {
    process.stdout.write(` "${test.text1}" vs "${test.text2}"... `);

    const emb1 = getEmbedding(modelPath, test.text1);
    const emb2 = getEmbedding(modelPath, test.text2);

    // A missing embedding would otherwise yield similarity 0 and be scored
    // as a correct 'low' prediction.
    if (!emb1 || !emb2) {
      console.log(`embedding failed (expected: ${test.expected}) ✗`);
      results.push({ text1: test.text1, text2: test.text2, similarity: null, predicted: null, expected: test.expected, correct: false });
      continue;
    }

    const similarity = cosineSimilarity(emb1, emb2);
    const predicted = similarity > HIGH_SIMILARITY_THRESHOLD ? 'high' : 'low';
    const isCorrect = predicted === test.expected;
    if (isCorrect) correct++;

    console.log(`${(similarity * 100).toFixed(1)}% (${predicted}, expected: ${test.expected}) ${isCorrect ? '✓' : '✗'}`);
    results.push({ text1: test.text1, text2: test.text2, similarity, predicted, expected: test.expected, correct: isCorrect });
  }

  const total = SIMILARITY_TESTS.length;
  const accuracy = total > 0 ? correct / total : 0;
  return { accuracy, correct, total, results };
}
|
|
|
|
/**
 * Main comparison: benchmark both models and print a side-by-side report.
 *
 * Verifies that both model files exist (exiting with status 1 and a download
 * hint otherwise), runs the routing and similarity benchmarks for each model,
 * prints a results table with a 60/40 weighted overall score, and announces
 * the winner. Equal overall scores are reported as a tie rather than as a
 * win for either model.
 *
 * @returns {Promise<void>} Resolves once the report has been printed.
 */
async function main() {
  console.log('\n╔═══════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║ REAL MODEL COMPARISON: Qwen 0.5B vs RuvLTRA Claude Code ║');
  console.log('║ Using llama-embedding inference ║');
  console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');

  // Check models exist
  if (!existsSync(QWEN_MODEL)) {
    console.error(`Qwen model not found at: ${QWEN_MODEL}`);
    console.error('Download with: curl -L -o ~/.ruvllm/models/qwen2.5-0.5b-instruct-q4_k_m.gguf "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"');
    process.exit(1);
  }

  if (!existsSync(RUVLTRA_MODEL)) {
    console.error(`RuvLTRA model not found at: ${RUVLTRA_MODEL}`);
    console.error('Download with: ruvllm models download claude-code');
    process.exit(1);
  }

  console.log('Models found:');
  console.log(` Qwen: ${QWEN_MODEL}`);
  console.log(` RuvLTRA: ${RUVLTRA_MODEL}`);

  // Run benchmarks for both models
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' QWEN 0.5B BASE MODEL');
  console.log('─────────────────────────────────────────────────────────────────');

  const qwenRouting = runRoutingBenchmark(QWEN_MODEL, 'Qwen 0.5B');
  const qwenSimilarity = runSimilarityBenchmark(QWEN_MODEL, 'Qwen 0.5B');

  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log(' RUVLTRA CLAUDE CODE MODEL');
  console.log('─────────────────────────────────────────────────────────────────');

  const ruvltraRouting = runRoutingBenchmark(RUVLTRA_MODEL, 'RuvLTRA Claude Code');
  const ruvltraSimilarity = runSimilarityBenchmark(RUVLTRA_MODEL, 'RuvLTRA Claude Code');

  // Results summary
  console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
  console.log(' COMPARISON RESULTS');
  console.log('═══════════════════════════════════════════════════════════════════════════════════\n');

  // '✓' marks the strictly better score of a row; on equal scores neither
  // column is marked.
  const winMark = (score, otherScore) => (score > otherScore ? '✓' : ' ');
  const formatPct = (fraction) => `${(fraction * 100).toFixed(1)}%`.padStart(12);
  const printRow = (label, qwenScore, ruvltraScore) => {
    console.log(`│ ${label} │${winMark(qwenScore, ruvltraScore)}${formatPct(qwenScore)} │${winMark(ruvltraScore, qwenScore)}${formatPct(ruvltraScore)} │`);
  };

  console.log('┌─────────────────────────────┬───────────────┬───────────────┐');
  console.log('│ Metric │ Qwen Base │ RuvLTRA │');
  console.log('├─────────────────────────────┼───────────────┼───────────────┤');

  printRow('Routing Accuracy', qwenRouting.accuracy, ruvltraRouting.accuracy);
  printRow('Similarity Detection', qwenSimilarity.accuracy, ruvltraSimilarity.accuracy);

  // Overall score: routing weighted 60%, similarity 40%.
  const qwenOverall = (qwenRouting.accuracy * 0.6 + qwenSimilarity.accuracy * 0.4);
  const ruvltraOverall = (ruvltraRouting.accuracy * 0.6 + ruvltraSimilarity.accuracy * 0.4);
  console.log('├─────────────────────────────┼───────────────┼───────────────┤');
  printRow('Overall Score (60/40)', qwenOverall, ruvltraOverall);

  console.log('└─────────────────────────────┴───────────────┴───────────────┘');

  // Winner announcement; equal scores must not be credited to either model.
  let winner;
  if (ruvltraOverall > qwenOverall) {
    winner = 'RuvLTRA Claude Code';
  } else if (qwenOverall > ruvltraOverall) {
    winner = 'Qwen 0.5B Base';
  } else {
    winner = 'None (tie)';
  }
  const improvement = Math.abs(ruvltraOverall - qwenOverall) * 100;

  console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
  console.log(` WINNER: ${winner}`);
  console.log('═══════════════════════════════════════════════════════════════════════════════════');

  if (ruvltraOverall > qwenOverall) {
    console.log(`\n RuvLTRA outperforms Qwen base by ${improvement.toFixed(1)} percentage points.`);
    console.log(' Fine-tuning for Claude Code workflows provides measurable improvements.');
  } else if (qwenOverall > ruvltraOverall) {
    console.log(`\n Qwen base outperforms RuvLTRA by ${improvement.toFixed(1)} percentage points.`);
    console.log(' Consider additional fine-tuning or different training approach.');
  } else {
    console.log('\n Both models perform equally. Fine-tuning may need adjustment.');
  }

  console.log('\n');
}
|
|
|
|
// Run the comparison. An unexpected failure is logged and also reflected in
// the exit status, so a crashed run is not reported to the shell as success.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|