wifi-densepose/npm/packages/ruvllm/scripts/improved-model-compare.js

#!/usr/bin/env node
/**
 * Improved Model Comparison - Enhanced Agent Descriptions
 *
 * Key improvements:
 * 1. Semantic sentence descriptions instead of keyword lists
 * 2. Example tasks embedded in descriptions
 * 3. Unique discriminating phrases for each agent
 * 4. Adjusted similarity scoring with top-k voting
 */

const { execSync } = require('child_process');
const { existsSync } = require('fs');
const { join } = require('path');
const { homedir } = require('os');

// Model paths
const MODELS_DIR = join(homedir(), '.ruvllm', 'models');
const QWEN_MODEL = join(MODELS_DIR, 'qwen2.5-0.5b-instruct-q4_k_m.gguf');
const RUVLTRA_MODEL = join(MODELS_DIR, 'ruvltra-claude-code-0.5b-q4_k_m.gguf');

// IMPROVED: Semantic sentence descriptions with examples
const AGENT_DESCRIPTIONS_V1 = {
  coder: 'implement create write build add code function class component feature',
  researcher: 'research find investigate analyze explore search discover examine',
  reviewer: 'review check evaluate assess inspect examine code quality',
  tester: 'test unit integration e2e coverage mock assertion spec',
  architect: 'design architecture schema system structure plan database',
  'security-architect': 'security vulnerability xss injection audit cve authentication',
  debugger: 'debug fix bug error issue broken crash exception trace',
  documenter: 'document readme jsdoc comment explain describe documentation',
  refactorer: 'refactor extract rename consolidate clean restructure simplify',
  optimizer: 'optimize performance slow fast cache speed memory latency',
  devops: 'deploy ci cd kubernetes docker pipeline container infrastructure',
  'api-docs': 'openapi swagger api documentation graphql schema endpoint',
  planner: 'plan estimate prioritize sprint roadmap schedule milestone',
};

// V2: Semantic sentences with task context
const AGENT_DESCRIPTIONS_V2 = {
  coder: 'I write new code and implement features. Create functions, build components, implement algorithms like binary search, build React components, write TypeScript code.',
  researcher: 'I research and investigate topics. Find best practices, explore solutions, investigate performance issues, analyze patterns, discover new approaches.',
  reviewer: 'I review existing code for quality. Check pull requests, evaluate code style, assess readability, inspect for bugs, examine code patterns.',
  tester: 'I write tests for code. Create unit tests, add integration tests, write e2e tests, mock dependencies, check test coverage, write test specs.',
  architect: 'I design system architecture. Plan database schemas, design API structures, create system diagrams, plan microservices, design data models.',
  'security-architect': 'I audit security vulnerabilities. Check for XSS, SQL injection, CSRF, audit authentication, review security policies, scan for CVEs.',
  debugger: 'I fix bugs and debug errors. Trace exceptions, fix crashes, resolve null pointer errors, debug memory leaks, fix runtime issues.',
  documenter: 'I write documentation and comments. Add JSDoc comments, write README files, explain code functionality, describe APIs, create guides.',
  refactorer: 'I refactor and restructure code. Modernize to async/await, extract functions, rename variables, consolidate duplicate code, simplify logic.',
  optimizer: 'I optimize performance and speed. Cache data, improve query performance, reduce latency, optimize memory usage, speed up slow operations.',
  devops: 'I handle deployment and infrastructure. Set up CI/CD pipelines, configure Kubernetes, manage Docker containers, deploy to cloud.',
  'api-docs': 'I create API documentation specs. Generate OpenAPI specs, write Swagger docs, document REST endpoints, create GraphQL schemas.',
  planner: 'I create project plans and estimates. Sprint planning, roadmap creation, milestone tracking, task prioritization, schedule estimation.',
};

// V3: Even more specific with negative space
const AGENT_DESCRIPTIONS_V3 = {
  coder: 'Software developer who implements new features and writes production code. Tasks: implement binary search, build React components, create TypeScript functions, add new functionality to applications.',
  researcher: 'Technical researcher who investigates and analyzes. Tasks: research best practices, explore state management options, investigate slow response times, analyze codebase patterns.',
  reviewer: 'Code reviewer who evaluates existing code quality. Tasks: review pull requests, check for race conditions, assess code style, evaluate implementation approaches.',
  tester: 'QA engineer who writes automated tests. Tasks: write unit tests, add integration tests, create e2e test suites, test payment gateways, verify authentication modules.',
  architect: 'System architect who designs software structure. Tasks: design database schemas, plan real-time notification systems, architect microservices, model data relationships.',
  'security-architect': 'Security specialist who audits vulnerabilities. Tasks: audit API endpoints for XSS, check SQL injection risks, review authentication security, scan for CSRF vulnerabilities.',
  debugger: 'Bug hunter who fixes errors and traces issues. Tasks: fix null pointer exceptions, debug memory leaks, trace WebSocket errors, resolve crash bugs.',
  documenter: 'Technical writer who creates documentation. Tasks: write JSDoc comments, create README files, document utility functions, explain complex code.',
  refactorer: 'Code modernizer who restructures without changing behavior. Tasks: refactor to async/await, extract reusable functions, modernize legacy patterns, simplify complex logic.',
  optimizer: 'Performance engineer who speeds up slow code. Tasks: cache frequently accessed data, optimize database queries, reduce API latency, improve memory efficiency.',
  devops: 'DevOps engineer who manages deployment infrastructure. Tasks: set up CI/CD pipelines, configure Kubernetes clusters, manage Docker deployments, automate releases.',
  'api-docs': 'API documentation specialist. Tasks: generate OpenAPI documentation, create Swagger specs, document REST API endpoints, write API reference guides.',
  planner: 'Project planner who organizes work. Tasks: create sprint plans, estimate timelines, prioritize backlog, schedule milestones, plan roadmaps.',
};

// Test cases for routing
const ROUTING_TESTS = [
  { task: 'Implement a binary search function in TypeScript', expected: 'coder' },
  { task: 'Write unit tests for the authentication module', expected: 'tester' },
  { task: 'Review the pull request for security vulnerabilities', expected: 'reviewer' },
  { task: 'Research best practices for React state management', expected: 'researcher' },
  { task: 'Design the database schema for user profiles', expected: 'architect' },
  { task: 'Fix the null pointer exception in the login handler', expected: 'debugger' },
  { task: 'Audit the API endpoints for XSS vulnerabilities', expected: 'security-architect' },
  { task: 'Write JSDoc comments for the utility functions', expected: 'documenter' },
  { task: 'Refactor the payment module to use async/await', expected: 'refactorer' },
  { task: 'Optimize the database queries for the dashboard', expected: 'optimizer' },
  { task: 'Set up the CI/CD pipeline for the microservices', expected: 'devops' },
  { task: 'Generate OpenAPI documentation for the REST API', expected: 'api-docs' },
  { task: 'Create a sprint plan for the next two weeks', expected: 'planner' },
  { task: 'Build a React component for user registration', expected: 'coder' },
  { task: 'Debug memory leak in the WebSocket handler', expected: 'debugger' },
  { task: 'Investigate slow API response times', expected: 'researcher' },
  { task: 'Check code for potential race conditions', expected: 'reviewer' },
  { task: 'Add integration tests for the payment gateway', expected: 'tester' },
  { task: 'Plan the architecture for real-time notifications', expected: 'architect' },
  { task: 'Cache the frequently accessed user data', expected: 'optimizer' },
];

// Similarity test pairs
const SIMILARITY_TESTS = [
  { text1: 'implement user authentication', text2: 'create login functionality', expected: 'high' },
  { text1: 'write unit tests', text2: 'fix database bug', expected: 'low' },
  { text1: 'optimize query performance', text2: 'improve database speed', expected: 'high' },
  { text1: 'design system architecture', text2: 'plan software structure', expected: 'high' },
  { text1: 'deploy to kubernetes', text2: 'analyze user behavior', expected: 'low' },
  { text1: 'refactor legacy code', text2: 'restructure old module', expected: 'high' },
  { text1: 'debug memory leak', text2: 'fix memory consumption issue', expected: 'high' },
  { text1: 'document api endpoints', text2: 'write openapi spec', expected: 'high' },
];

/**
 * Get embedding from model
 */
function getEmbedding(modelPath, text) {
  try {
    const sanitized = text.replace(/"/g, '\\"').replace(/\n/g, ' ');
    const result = execSync(
      `llama-embedding -m "${modelPath}" -p "${sanitized}" --embd-output-format json 2>/dev/null`,
      { encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }
    );
    const json = JSON.parse(result);
    return json.data[json.data.length - 1].embedding;
  } catch (err) {
    console.error(`Error: ${err.message}`);
    return null;
  }
}

/**
 * Cosine similarity
 */
function cosineSimilarity(a, b) {
  if (!a || !b || a.length !== b.length) return 0;
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB) || 1);
}

/**
 * Route task with top-k analysis
 */
function routeTask(taskEmbedding, agentEmbeddings, topK = 3) {
  const scores = [];
  for (const [agent, embedding] of Object.entries(agentEmbeddings)) {
    const sim = cosineSimilarity(taskEmbedding, embedding);
    scores.push({ agent, similarity: sim });
  }
  scores.sort((a, b) => b.similarity - a.similarity);

  return {
    agent: scores[0].agent,
    confidence: scores[0].similarity,
    topK: scores.slice(0, topK),
    margin: scores[0].similarity - scores[1].similarity,
  };
}

/**
 * Run benchmark for a specific description version
 */
function runBenchmark(modelPath, modelName, descriptions, version) {
  console.log(`\n  [${version}] Computing agent embeddings...`);

  const agentEmbeddings = {};
  for (const [agent, description] of Object.entries(descriptions)) {
    process.stdout.write(`    ${agent}... `);
    agentEmbeddings[agent] = getEmbedding(modelPath, description);
    console.log('done');
  }

  console.log(`  [${version}] Running routing tests...`);
  let correct = 0;
  const failures = [];

  for (const test of ROUTING_TESTS) {
    const taskEmbedding = getEmbedding(modelPath, test.task);
    const { agent, confidence, topK, margin } = routeTask(taskEmbedding, agentEmbeddings);
    const isCorrect = agent === test.expected;
    if (isCorrect) {
      correct++;
    } else {
      failures.push({
        task: test.task,
        expected: test.expected,
        got: agent,
        topK,
        margin,
      });
    }
  }

  const accuracy = correct / ROUTING_TESTS.length;
  return { accuracy, correct, total: ROUTING_TESTS.length, failures, version };
}

/**
 * Main comparison
 */
async function main() {
  console.log('\n╔═══════════════════════════════════════════════════════════════════════════════════╗');
  console.log('║           IMPROVED MODEL COMPARISON: Testing Description Strategies               ║');
  console.log('║                     Semantic Descriptions vs Keyword Lists                        ║');
  console.log('╚═══════════════════════════════════════════════════════════════════════════════════╝\n');

  if (!existsSync(QWEN_MODEL) || !existsSync(RUVLTRA_MODEL)) {
    console.error('Models not found. Run the original comparison first.');
    process.exit(1);
  }

  console.log('Testing 3 description strategies:');
  console.log('  V1: Keyword lists (baseline)');
  console.log('  V2: Semantic sentences with examples');
  console.log('  V3: Task-specific descriptions with context\n');

  // Test all three versions with RuvLTRA
  console.log('─────────────────────────────────────────────────────────────────');
  console.log('                   RUVLTRA CLAUDE CODE MODEL');
  console.log('─────────────────────────────────────────────────────────────────');

  const v1Results = runBenchmark(RUVLTRA_MODEL, 'RuvLTRA', AGENT_DESCRIPTIONS_V1, 'V1-Keywords');
  const v2Results = runBenchmark(RUVLTRA_MODEL, 'RuvLTRA', AGENT_DESCRIPTIONS_V2, 'V2-Semantic');
  const v3Results = runBenchmark(RUVLTRA_MODEL, 'RuvLTRA', AGENT_DESCRIPTIONS_V3, 'V3-TaskSpecific');

  // Also test Qwen with best strategy
  console.log('\n─────────────────────────────────────────────────────────────────');
  console.log('                      QWEN 0.5B BASE MODEL');
  console.log('─────────────────────────────────────────────────────────────────');

  const qwenV1 = runBenchmark(QWEN_MODEL, 'Qwen', AGENT_DESCRIPTIONS_V1, 'V1-Keywords');
  const qwenV3 = runBenchmark(QWEN_MODEL, 'Qwen', AGENT_DESCRIPTIONS_V3, 'V3-TaskSpecific');

  // Results summary
  console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
  console.log('                              RESULTS COMPARISON');
  console.log('═══════════════════════════════════════════════════════════════════════════════════\n');

  console.log('┌─────────────────────────┬───────────────┬───────────────┬───────────────┐');
  console.log('│ Strategy                │ RuvLTRA       │ Qwen Base     │ Improvement   │');
  console.log('├─────────────────────────┼───────────────┼───────────────┼───────────────┤');

  const formatPct = (v) => `${(v * 100).toFixed(1)}%`.padStart(12);

  console.log(`│ V1: Keywords            │${formatPct(v1Results.accuracy)}  │${formatPct(qwenV1.accuracy)}  │    baseline   │`);
  console.log(`│ V2: Semantic            │${formatPct(v2Results.accuracy)}  │      -        │${formatPct(v2Results.accuracy - v1Results.accuracy)}  │`);
  console.log(`│ V3: Task-Specific       │${formatPct(v3Results.accuracy)}  │${formatPct(qwenV3.accuracy)}  │${formatPct(v3Results.accuracy - v1Results.accuracy)}  │`);

  console.log('└─────────────────────────┴───────────────┴───────────────┴───────────────┘');

  // Find best strategy
  const best = [v1Results, v2Results, v3Results].reduce((a, b) => a.accuracy > b.accuracy ? a : b);

  console.log(`\n  BEST STRATEGY: ${best.version} with ${(best.accuracy * 100).toFixed(1)}% accuracy`);
  console.log(`  Improvement over V1: +${((best.accuracy - v1Results.accuracy) * 100).toFixed(1)} percentage points`);

  // Show remaining failures for best strategy
  if (best.failures.length > 0) {
    console.log(`\n  Remaining failures (${best.failures.length}):`);
    for (const f of best.failures.slice(0, 5)) {
      console.log(`    "${f.task.slice(0, 45)}..."`);
      console.log(`      Expected: ${f.expected}, Got: ${f.got}`);
      console.log(`      Top-3: ${f.topK.map(t => `${t.agent}(${(t.similarity * 100).toFixed(0)}%)`).join(', ')}`);
    }
  }

  // RuvLTRA vs Qwen with best strategy
  console.log('\n═══════════════════════════════════════════════════════════════════════════════════');
  console.log('              FINAL COMPARISON (V3 Task-Specific Descriptions)');
  console.log('═══════════════════════════════════════════════════════════════════════════════════\n');

  console.log('┌─────────────────────────────┬───────────────┬───────────────┐');
  console.log('│ Metric                      │ Qwen Base     │ RuvLTRA       │');
  console.log('├─────────────────────────────┼───────────────┼───────────────┤');

  const qwenWins = qwenV3.accuracy > v3Results.accuracy;
  const ruvWins = v3Results.accuracy > qwenV3.accuracy;
  console.log(`│ V3 Routing Accuracy         │${qwenWins ? '✓' : ' '}${formatPct(qwenV3.accuracy)}  │${ruvWins ? '✓' : ' '}${formatPct(v3Results.accuracy)}  │`);
  console.log('└─────────────────────────────┴───────────────┴───────────────┘');

  const winner = ruvWins ? 'RuvLTRA' : qwenWins ? 'Qwen' : 'Tie';
  const margin = Math.abs(v3Results.accuracy - qwenV3.accuracy) * 100;

  console.log(`\n  WINNER: ${winner} (${margin.toFixed(1)} point margin)`);
  console.log('\n');
}

main().catch(console.error);