git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
1325 lines
56 KiB
JavaScript
1325 lines
56 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Ecosystem Training Data Generator for RuvLTRA
 *
 * Generates comprehensive triplet training data for the Claude Flow ecosystem:
 * - claude-flow: Multi-agent coordination and swarm orchestration
 * - agentic-flow: AI workflow orchestration and ONNX embeddings
 * - ruvector: High-performance vector database
 *
 * Features:
 * - Reads capability definitions from JSON files
 * - Generates 5-10 natural language prompts per capability
 * - Creates hard negatives for contrastive learning
 * - Outputs combined JSONL dataset for fine-tuning
 */
|
|
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
// ============================================================================
|
|
// CAPABILITY DEFINITIONS
|
|
// ============================================================================
|
|
|
|
/**
 * Claude Flow V3 capability catalog
 * (multi-agent swarm coordination, memory, hooks, workflows).
 *
 * Shape: `{ name, description, version, categories }`. Every category is
 * `{ description, commands }`, and every command (keyed by its CLI name) is
 * `{ description, keywords, parameters }`.
 */
const CLAUDE_FLOW_CAPABILITIES = (() => {
  // Local factories so that every entry is built with the same key order.
  const command = (description, keywords, parameters) => ({ description, keywords, parameters });
  const category = (description, commands) => ({ description, commands });

  return {
    name: 'claude-flow',
    description: 'Multi-agent swarm coordination and orchestration framework',
    version: '3.0.0',
    categories: {
      swarm: category('Multi-agent swarm coordination and topology management', {
        'swarm init': command(
          'Initialize a swarm with specified topology',
          ['swarm', 'init', 'initialize', 'topology', 'multi-agent', 'coordination'],
          ['--topology', '--max-agents', '--strategy'],
        ),
        'swarm status': command(
          'Get current swarm status and agent health',
          ['swarm', 'status', 'health', 'agents', 'monitoring'],
          ['--verbose'],
        ),
        'swarm shutdown': command(
          'Gracefully shutdown the swarm',
          ['swarm', 'shutdown', 'stop', 'terminate', 'graceful'],
          ['--graceful', '--force'],
        ),
      }),

      agent: category('Agent lifecycle management', {
        'agent spawn': command(
          'Spawn a new agent with specified type',
          ['agent', 'spawn', 'create', 'start', 'worker'],
          ['-t', '--type', '--name', '--model'],
        ),
        'agent list': command(
          'List all active agents',
          ['agent', 'list', 'show', 'active', 'running'],
          ['--status', '--domain'],
        ),
        'agent terminate': command(
          'Terminate a specific agent',
          ['agent', 'terminate', 'kill', 'stop', 'remove'],
          ['--force'],
        ),
        'agent status': command(
          'Get status of a specific agent',
          ['agent', 'status', 'info', 'details', 'health'],
          ['--agentId'],
        ),
      }),

      memory: category('Persistent memory with vector search', {
        'memory store': command(
          'Store a value in memory with optional namespace',
          ['memory', 'store', 'save', 'persist', 'key-value'],
          ['--key', '--value', '--namespace', '--ttl'],
        ),
        'memory retrieve': command(
          'Retrieve a value from memory by key',
          ['memory', 'retrieve', 'get', 'fetch', 'read'],
          ['--key', '--namespace'],
        ),
        'memory search': command(
          'Semantic vector search in memory',
          ['memory', 'search', 'query', 'find', 'semantic', 'vector'],
          ['--query', '--namespace', '--limit', '--threshold'],
        ),
        'memory list': command(
          'List memory entries',
          ['memory', 'list', 'entries', 'keys', 'show'],
          ['--namespace', '--limit'],
        ),
        'memory delete': command(
          'Delete a memory entry',
          ['memory', 'delete', 'remove', 'clear'],
          ['--key', '--namespace'],
        ),
      }),

      hooks: category('Self-learning hooks and background workers', {
        'hooks pre-task': command(
          'Get agent suggestions before starting a task',
          ['hooks', 'pre-task', 'routing', 'suggestions', 'before'],
          ['--description', '--taskId'],
        ),
        'hooks post-task': command(
          'Record task completion for learning',
          ['hooks', 'post-task', 'completion', 'learning', 'after'],
          ['--taskId', '--success', '--quality'],
        ),
        'hooks route': command(
          'Route task to optimal agent',
          ['hooks', 'route', 'routing', 'optimal', 'agent'],
          ['--task', '--context'],
        ),
        'hooks worker dispatch': command(
          'Dispatch a background worker',
          ['hooks', 'worker', 'dispatch', 'background', 'trigger'],
          ['--trigger', '--context', '--priority'],
        ),
        'hooks metrics': command(
          'View learning metrics dashboard',
          ['hooks', 'metrics', 'dashboard', 'stats', 'learning'],
          ['--period', '--format'],
        ),
        'hooks pretrain': command(
          'Bootstrap intelligence from repository',
          ['hooks', 'pretrain', 'bootstrap', 'intelligence', 'analyze'],
          ['--path', '--depth'],
        ),
      }),

      workflow: category('Workflow execution and templates', {
        'workflow create': command(
          'Create a new workflow',
          ['workflow', 'create', 'new', 'define'],
          ['--name', '--steps', '--description'],
        ),
        'workflow execute': command(
          'Execute a workflow',
          ['workflow', 'execute', 'run', 'start'],
          ['--workflowId', '--variables'],
        ),
        'workflow status': command(
          'Get workflow execution status',
          ['workflow', 'status', 'progress', 'state'],
          ['--workflowId', '--verbose'],
        ),
      }),

      hivemind: category('Hive-mind collective consensus', {
        'hive-mind init': command(
          'Initialize hive-mind collective',
          ['hive-mind', 'init', 'collective', 'consensus'],
          ['--topology', '--queenId'],
        ),
        'hive-mind spawn': command(
          'Spawn workers and join to hive-mind',
          ['hive-mind', 'spawn', 'workers', 'join'],
          ['--count', '--prefix', '--role'],
        ),
        'hive-mind consensus': command(
          'Propose or vote on consensus',
          ['hive-mind', 'consensus', 'vote', 'propose'],
          ['--action', '--proposalId', '--vote'],
        ),
        'hive-mind broadcast': command(
          'Broadcast message to all workers',
          ['hive-mind', 'broadcast', 'message', 'all'],
          ['--message', '--priority'],
        ),
      }),

      task: category('Task creation and management', {
        'task create': command(
          'Create a new task',
          ['task', 'create', 'new', 'add'],
          ['--type', '--description', '--priority'],
        ),
        'task list': command(
          'List all tasks',
          ['task', 'list', 'show', 'all'],
          ['--status', '--priority'],
        ),
        'task complete': command(
          'Mark task as complete',
          ['task', 'complete', 'done', 'finish'],
          ['--taskId', '--result'],
        ),
      }),

      session: category('Session state management', {
        'session save': command(
          'Save current session state',
          ['session', 'save', 'persist', 'state'],
          ['--name', '--description'],
        ),
        'session restore': command(
          'Restore a saved session',
          ['session', 'restore', 'load', 'resume'],
          ['--sessionId', '--name'],
        ),
        'session list': command(
          'List saved sessions',
          ['session', 'list', 'saved', 'history'],
          ['--limit'],
        ),
      }),

      neural: category('Neural pattern training and prediction', {
        'neural train': command(
          'Train a neural model',
          ['neural', 'train', 'model', 'learning'],
          ['--modelType', '--epochs', '--learningRate'],
        ),
        'neural predict': command(
          'Make predictions using neural model',
          ['neural', 'predict', 'inference', 'model'],
          ['--input', '--modelId'],
        ),
        'neural patterns': command(
          'Manage neural patterns',
          ['neural', 'patterns', 'store', 'search'],
          ['--action', '--patternId'],
        ),
      }),

      security: category('Security scanning and threat detection', {
        'aidefence scan': command(
          'Scan input for AI manipulation threats',
          ['aidefence', 'scan', 'security', 'threats', 'injection'],
          ['--input', '--quick'],
        ),
        'aidefence analyze': command(
          'Deep analysis for threat types',
          ['aidefence', 'analyze', 'deep', 'threats'],
          ['--input', '--searchSimilar'],
        ),
        'aidefence is_safe': command(
          'Quick boolean safety check',
          ['aidefence', 'safe', 'check', 'validate'],
          ['--input'],
        ),
      }),

      performance: category('Performance profiling and optimization', {
        'performance benchmark': command(
          'Run performance benchmarks',
          ['performance', 'benchmark', 'speed', 'test'],
          ['--suite', '--iterations'],
        ),
        'performance profile': command(
          'Profile specific component',
          ['performance', 'profile', 'analyze', 'bottleneck'],
          ['--target', '--duration'],
        ),
        'performance optimize': command(
          'Apply performance optimizations',
          ['performance', 'optimize', 'improve', 'speed'],
          ['--target', '--aggressive'],
        ),
      }),

      embeddings: category('Vector embeddings with ONNX', {
        'embeddings generate': command(
          'Generate embeddings for text',
          ['embeddings', 'generate', 'embed', 'vector'],
          ['--text', '--hyperbolic'],
        ),
        'embeddings compare': command(
          'Compare similarity between texts',
          ['embeddings', 'compare', 'similarity', 'distance'],
          ['--text1', '--text2', '--metric'],
        ),
        'embeddings search': command(
          'Semantic search across stored embeddings',
          ['embeddings', 'search', 'semantic', 'query'],
          ['--query', '--topK', '--threshold'],
        ),
      }),

      claims: category('Issue claiming and coordination', {
        'claims claim': command(
          'Claim an issue for work',
          ['claims', 'claim', 'issue', 'work', 'assign'],
          ['--issueId', '--claimant'],
        ),
        'claims release': command(
          'Release a claim on an issue',
          ['claims', 'release', 'unclaim', 'free'],
          ['--issueId', '--reason'],
        ),
        'claims handoff': command(
          'Handoff issue to another claimant',
          ['claims', 'handoff', 'transfer', 'pass'],
          ['--issueId', '--from', '--to'],
        ),
        'claims board': command(
          'View claims board',
          ['claims', 'board', 'view', 'overview'],
          [],
        ),
      }),
    },
  };
})();
|
|
|
|
/**
 * Agentic Flow capability catalog
 * (AI workflow orchestration and ONNX embeddings).
 *
 * Same shape as the other catalogs: categories map to
 * `{ description, commands }`, and commands map to
 * `{ description, keywords, parameters }`.
 */
const AGENTIC_FLOW_CAPABILITIES = (() => {
  // Local factories so that every entry is built with the same key order.
  const command = (description, keywords, parameters) => ({ description, keywords, parameters });
  const category = (description, commands) => ({ description, commands });

  return {
    name: 'agentic-flow',
    description: 'AI workflow orchestration with ONNX runtime and vector embeddings',
    version: '1.0.0',
    categories: {
      embeddings: category('High-performance ONNX embeddings', {
        'embed': command(
          'Generate embeddings for text using ONNX models',
          ['embed', 'embedding', 'vector', 'encode', 'onnx'],
          ['--text', '--model', '--normalize'],
        ),
        'batch-embed': command(
          'Batch embed multiple texts efficiently',
          ['batch', 'embed', 'multiple', 'parallel'],
          ['--texts', '--concurrency'],
        ),
        'similarity': command(
          'Compute similarity between embeddings',
          ['similarity', 'cosine', 'distance', 'compare'],
          ['--a', '--b', '--metric'],
        ),
      }),

      models: category('ONNX model management', {
        'model load': command(
          'Load an ONNX model for inference',
          ['model', 'load', 'onnx', 'initialize'],
          ['--path', '--name'],
        ),
        'model list': command(
          'List available models',
          ['model', 'list', 'available', 'show'],
          [],
        ),
        'model info': command(
          'Get model information and metadata',
          ['model', 'info', 'metadata', 'details'],
          ['--name'],
        ),
        'model quantize': command(
          'Quantize model for faster inference',
          ['model', 'quantize', 'compress', 'optimize'],
          ['--input', '--output', '--bits'],
        ),
      }),

      pipeline: category('Workflow pipeline orchestration', {
        'pipeline create': command(
          'Create a new processing pipeline',
          ['pipeline', 'create', 'workflow', 'chain'],
          ['--name', '--steps'],
        ),
        'pipeline run': command(
          'Execute a pipeline with input data',
          ['pipeline', 'run', 'execute', 'process'],
          ['--name', '--input'],
        ),
        'pipeline visualize': command(
          'Visualize pipeline structure',
          ['pipeline', 'visualize', 'graph', 'diagram'],
          ['--name', '--format'],
        ),
      }),

      cache: category('Embedding cache management', {
        'cache set': command(
          'Store embedding in cache',
          ['cache', 'set', 'store', 'save'],
          ['--key', '--embedding'],
        ),
        'cache get': command(
          'Retrieve embedding from cache',
          ['cache', 'get', 'retrieve', 'fetch'],
          ['--key'],
        ),
        'cache clear': command(
          'Clear embedding cache',
          ['cache', 'clear', 'flush', 'reset'],
          ['--namespace'],
        ),
        'cache stats': command(
          'Get cache statistics',
          ['cache', 'stats', 'statistics', 'info'],
          [],
        ),
      }),

      search: category('Vector search operations', {
        'search nearest': command(
          'Find nearest neighbors to query vector',
          ['search', 'nearest', 'neighbors', 'knn'],
          ['--query', '--k', '--threshold'],
        ),
        'search range': command(
          'Range search within distance threshold',
          ['search', 'range', 'radius', 'threshold'],
          ['--query', '--radius'],
        ),
        'search hybrid': command(
          'Hybrid search combining keyword and semantic',
          ['search', 'hybrid', 'combined', 'keyword', 'semantic'],
          ['--query', '--alpha'],
        ),
      }),
    },
  };
})();
|
|
|
|
/**
 * RuVector capability catalog (high-performance vector database).
 *
 * Same shape as the other catalogs: categories map to
 * `{ description, commands }`, and commands map to
 * `{ description, keywords, parameters }`.
 */
const RUVECTOR_CAPABILITIES = (() => {
  // Local factories so that every entry is built with the same key order.
  const command = (description, keywords, parameters) => ({ description, keywords, parameters });
  const category = (description, commands) => ({ description, commands });

  return {
    name: 'ruvector',
    description: 'High-performance vector database with HNSW indexing',
    version: '0.1.0',
    categories: {
      collections: category('Vector collection management', {
        'collection create': command(
          'Create a new vector collection',
          ['collection', 'create', 'new', 'database'],
          ['--name', '--dimension', '--metric'],
        ),
        'collection delete': command(
          'Delete a vector collection',
          ['collection', 'delete', 'drop', 'remove'],
          ['--name', '--confirm'],
        ),
        'collection info': command(
          'Get collection information',
          ['collection', 'info', 'stats', 'details'],
          ['--name'],
        ),
        'collection list': command(
          'List all collections',
          ['collection', 'list', 'all', 'show'],
          [],
        ),
      }),

      vectors: category('Vector CRUD operations', {
        'vector insert': command(
          'Insert vectors into collection',
          ['vector', 'insert', 'add', 'upsert'],
          ['--collection', '--vectors', '--ids'],
        ),
        'vector delete': command(
          'Delete vectors from collection',
          ['vector', 'delete', 'remove', 'drop'],
          ['--collection', '--ids'],
        ),
        'vector get': command(
          'Get vectors by ID',
          ['vector', 'get', 'fetch', 'retrieve'],
          ['--collection', '--ids'],
        ),
        'vector update': command(
          'Update existing vectors',
          ['vector', 'update', 'modify', 'change'],
          ['--collection', '--id', '--vector'],
        ),
      }),

      search: category('Vector search with HNSW', {
        'search knn': command(
          'K-nearest neighbor search',
          ['search', 'knn', 'nearest', 'similar'],
          ['--collection', '--query', '--k'],
        ),
        'search filter': command(
          'Filtered vector search with metadata',
          ['search', 'filter', 'metadata', 'conditional'],
          ['--collection', '--query', '--filter'],
        ),
        'search batch': command(
          'Batch search multiple queries',
          ['search', 'batch', 'multiple', 'parallel'],
          ['--collection', '--queries', '--k'],
        ),
      }),

      index: category('HNSW index management', {
        'index build': command(
          'Build HNSW index for collection',
          ['index', 'build', 'create', 'hnsw'],
          ['--collection', '--ef', '--m'],
        ),
        'index rebuild': command(
          'Rebuild existing index',
          ['index', 'rebuild', 'refresh', 'reindex'],
          ['--collection'],
        ),
        'index stats': command(
          'Get index statistics',
          ['index', 'stats', 'info', 'metrics'],
          ['--collection'],
        ),
        'index optimize': command(
          'Optimize index for search performance',
          ['index', 'optimize', 'tune', 'improve'],
          ['--collection', '--target'],
        ),
      }),

      persistence: category('Data persistence and backup', {
        'snapshot create': command(
          'Create a snapshot of collection',
          ['snapshot', 'create', 'backup', 'save'],
          ['--collection', '--path'],
        ),
        'snapshot restore': command(
          'Restore collection from snapshot',
          ['snapshot', 'restore', 'load', 'recover'],
          ['--path', '--collection'],
        ),
        'snapshot list': command(
          'List available snapshots',
          ['snapshot', 'list', 'backups', 'show'],
          ['--collection'],
        ),
      }),

      quantization: category('Vector quantization for memory efficiency', {
        'quantize apply': command(
          'Apply quantization to collection',
          ['quantize', 'apply', 'compress', 'reduce'],
          ['--collection', '--type', '--bits'],
        ),
        'quantize info': command(
          'Get quantization info',
          ['quantize', 'info', 'status', 'details'],
          ['--collection'],
        ),
      }),
    },
  };
})();
|
|
|
|
// ============================================================================
|
|
// PROMPT TEMPLATES
|
|
// ============================================================================
|
|
|
|
/**
 * Prompt phrasings grouped by tone.
 *
 * Placeholders are literal `{action}`, `{object}` and `{desc}` markers inside
 * the strings; this table performs no substitution itself.
 */
const PROMPT_TEMPLATES = {
  // Imperative: the user states the command outright.
  direct: [
    '{action} {object}',
    '{action} the {object}',
    'Please {action} {object}',
    'I need to {action} {object}',
  ],

  // Polite requests for help.
  request: [
    'Can you {action} {object}?',
    'Help me {action} {object}',
    'I want to {action} {object}',
    'Could you {action} {object} for me?',
  ],

  // "How do I ...?" phrasing.
  question: [
    'How do I {action} {object}?',
    "What's the best way to {action} {object}?",
    'How can I {action} {object}?',
    'Which command {desc}?',
  ],

  // The user describes the goal instead of the command.
  contextual: [
    "I'm trying to {desc}",
    'I need a way to {desc}',
    'My goal is to {desc}',
    'For this project, I need to {desc}',
  ],

  // Bare description with a minimal lead-in.
  descriptive: [
    '{desc}',
    'Help with {desc}',
    'I want {desc}',
    'Need {desc}',
  ],
};
|
|
|
|
/**
 * Synonym table: action word -> verbs/phrases that express the same intent.
 *
 * `generatePrompts` looks an action up here and treats the first entry of the
 * list as the primary verb.
 */
const ACTION_MAPPINGS = {
  // --- swarm / agent lifecycle ---
  swarm: ['initialize', 'start', 'create', 'set up', 'configure', 'launch'],
  agent: ['spawn', 'create', 'start', 'launch', 'deploy', 'run'],
  terminate: ['stop', 'kill', 'terminate', 'shutdown', 'end', 'close'],
  status: ['check', 'get', 'view', 'show', 'monitor', 'inspect'],
  list: ['list', 'show', 'display', 'enumerate', 'get all'],

  // --- memory ---
  store: ['store', 'save', 'persist', 'write', 'put', 'cache'],
  retrieve: ['retrieve', 'get', 'fetch', 'read', 'load'],
  search: ['search', 'find', 'query', 'look up', 'discover'],
  delete: ['delete', 'remove', 'clear', 'drop', 'erase'],

  // --- workflows ---
  create: ['create', 'make', 'build', 'define', 'set up'],
  execute: ['execute', 'run', 'start', 'trigger', 'launch'],

  // --- vectors ---
  embed: ['embed', 'encode', 'vectorize', 'generate embedding for'],
  insert: ['insert', 'add', 'upsert', 'put', 'store'],
  knn: ['find similar', 'search for nearest', 'query neighbors', 'find k-nearest'],

  // --- indexes ---
  build: ['build', 'create', 'construct', 'generate'],
  optimize: ['optimize', 'tune', 'improve', 'speed up'],
  rebuild: ['rebuild', 'regenerate', 'refresh', 'recreate'],
};
|
|
|
|
// ============================================================================
|
|
// HARD NEGATIVE PATTERNS
|
|
// ============================================================================
|
|
|
|
/**
 * Curated pairs of commands that sound similar but do different things.
 * Each side of a pair is a hard negative for the other.
 *
 * Every entry is written as `<tool> <command>`. `generateTriplets` discards
 * negatives that do not start with a tool name, so an unqualified entry here
 * would never reach the dataset.
 */
const CONFUSING_PAIRS = [
  // Confusion inside claude-flow
  { cmd1: 'claude-flow memory store', cmd2: 'claude-flow memory search', reason: 'both involve memory' },
  { cmd1: 'claude-flow agent spawn', cmd2: 'claude-flow hive-mind spawn', reason: 'both spawn workers' },
  { cmd1: 'claude-flow swarm init', cmd2: 'claude-flow hive-mind init', reason: 'both initialize coordination' },
  { cmd1: 'claude-flow hooks route', cmd2: 'claude-flow hooks pre-task', reason: 'both involve task routing' },
  { cmd1: 'claude-flow workflow execute', cmd2: 'claude-flow task create', reason: 'both start work' },
  { cmd1: 'claude-flow session save', cmd2: 'claude-flow memory store', reason: 'both persist data' },
  { cmd1: 'claude-flow neural train', cmd2: 'claude-flow hooks pretrain', reason: 'both involve training' },
  { cmd1: 'claude-flow embeddings search', cmd2: 'claude-flow memory search', reason: 'both search semantically' },
  { cmd1: 'claude-flow performance profile', cmd2: 'claude-flow performance benchmark', reason: 'both analyze performance' },
  { cmd1: 'claude-flow claims claim', cmd2: 'claude-flow task create', reason: 'both assign work' },

  // Cross-tool confusion
  { cmd1: 'claude-flow memory search', cmd2: 'ruvector search knn', reason: 'both do vector search' },
  { cmd1: 'claude-flow embeddings generate', cmd2: 'agentic-flow embed', reason: 'both generate embeddings' },
  { cmd1: 'ruvector collection create', cmd2: 'claude-flow memory store', reason: 'both store data' },
  { cmd1: 'agentic-flow cache set', cmd2: 'claude-flow memory store', reason: 'both cache data' },
  { cmd1: 'ruvector index build', cmd2: 'claude-flow hooks pretrain', reason: 'both build indexes' },
  { cmd1: 'agentic-flow search hybrid', cmd2: 'ruvector search filter', reason: 'both filtered search' },
  { cmd1: 'claude-flow swarm init', cmd2: 'agentic-flow pipeline create', reason: 'both orchestrate work' },

  // Category confusion
  { cmd1: 'ruvector vector insert', cmd2: 'agentic-flow cache set', reason: 'both store vectors' },
  { cmd1: 'claude-flow agent list', cmd2: 'agentic-flow model list', reason: 'both list resources' },
  { cmd1: 'ruvector snapshot create', cmd2: 'claude-flow session save', reason: 'both create backups' },
  { cmd1: 'agentic-flow model quantize', cmd2: 'ruvector quantize apply', reason: 'both quantize' },
];
|
|
|
|
// ============================================================================
|
|
// TRIPLET GENERATION
|
|
// ============================================================================
|
|
|
|
/**
 * Generate natural-language prompts for a single command.
 *
 * The first word of `command` is treated as the category (the "object" of the
 * sentence) and the second word as the action; a single-word command is used
 * for both. Prompts are built from the action's synonyms in ACTION_MAPPINGS,
 * from the command description and from its keywords, then whitespace-
 * normalized, de-duplicated, filtered and capped at 10.
 *
 * @param {string} capability - Tool name; only used for the
 *   `${capability} ${command}` prompt.
 * @param {string} command - Command name, e.g. 'swarm init'.
 * @param {{description?: string, keywords?: string[]}} config - Command
 *   definition. A missing description or keyword list is tolerated.
 * @returns {string[]} Up to 10 unique prompts, in generation order.
 */
function generatePrompts(capability, command, config) {
  const description = config?.description ?? '';
  const keywords = config?.keywords ?? [];
  const prompts = [];

  // Split "category action ..." into its two leading words.
  const [category, secondWord] = command.split(' ');
  const action = secondWord || category;
  const categoryLower = category.toLowerCase();

  // Own-property lookup only: a plain `ACTION_MAPPINGS[action]` would pick up
  // inherited members such as `constructor` and crash further down.
  const mapped = Object.prototype.hasOwnProperty.call(ACTION_MAPPINGS, action)
    ? ACTION_MAPPINGS[action]
    : undefined;
  const actionVariations = Array.isArray(mapped) && mapped.length > 0 ? mapped : [action];
  const primaryAction = actionVariations[0];

  if (primaryAction.toLowerCase() !== categoryLower) {
    // Verb + object phrasing.
    prompts.push(`${primaryAction} ${category}`);
    prompts.push(`${primaryAction} the ${category}`);
    prompts.push(`I need to ${primaryAction} ${category}`);
    prompts.push(`Can you ${primaryAction} ${category}?`);
    prompts.push(`Help me ${primaryAction} ${category}`);
  } else {
    // Verb and object are the same word: use the bare verb to avoid
    // phrases like "search search".
    prompts.push(`${primaryAction}`);
    prompts.push(`I need to ${primaryAction}`);
    prompts.push(`Help me ${primaryAction}`);
  }

  // Description-based prompts; skipped when there is no description so that
  // dangling phrases like "I want to" are never emitted.
  const descLower = description.toLowerCase().trim();
  if (descLower.length > 0) {
    prompts.push(`I want to ${descLower}`);
    prompts.push(`How do I ${descLower}?`);
    prompts.push(`I need a way to ${descLower}`);
  }

  // Second and third synonyms, unless they merely repeat the category.
  for (const actionVar of actionVariations.slice(1, 3)) {
    if (actionVar.toLowerCase() !== categoryLower) {
      prompts.push(`${actionVar} ${category}`);
      prompts.push(`I want to ${actionVar} ${category}`);
    }
  }

  // Keyword-based prompts, only for keywords not already used as the
  // category or as one of the action synonyms.
  const usedWords = new Set([
    primaryAction.toLowerCase(),
    categoryLower,
    ...actionVariations.map((a) => a.toLowerCase()),
  ]);
  for (const keyword of keywords) {
    const kwLower = keyword.toLowerCase();
    if (usedWords.has(kwLower)) continue;
    prompts.push(`${keyword} in ${category}`);
    prompts.push(`I need ${keyword} functionality`);
    usedWords.add(kwLower);
    // Stop once enough distinct words have been consumed.
    if (usedWords.size > keywords.length + 3) break;
  }

  // Literal command prompts.
  // NOTE(review): these are appended last, so for commands that already
  // produced 10 or more prompts the final slice drops them; confirm that
  // this is intended.
  prompts.push(`run ${command}`);
  prompts.push(`${capability} ${command}`);

  const isUsable = (p) => {
    if (p.length < 5) return false;
    if (p.includes('undefined')) return false;
    // Reject immediate word repetition such as "status status".
    const words = p.toLowerCase().split(' ');
    for (let i = 0; i < words.length - 1; i++) {
      if (words[i] === words[i + 1] && words[i].length > 2) return false;
    }
    return true;
  };

  // Normalize whitespace BEFORE de-duplicating, so prompts that differ only
  // in spacing collapse into one entry.
  const normalized = prompts.map((p) => p.trim().replace(/\s+/g, ' '));
  return [...new Set(normalized)].filter(isUsable).slice(0, 10);
}
|
|
|
|
/**
 * Find hard negatives for a command: other commands that a prompt meant for
 * `tool command` could plausibly be confused with.
 *
 * Candidates come from three sources, in this order:
 *   1. CONFUSING_PAIRS entries whose one side is exactly this command;
 *   2. commands of OTHER tools sharing at least two keywords with it;
 *   3. commands of the SAME tool, in a different category, with the same
 *      action word (second word of the command, or the only word).
 *
 * @param {string} tool - Name of the tool that owns the command.
 * @param {string} command - Command name without tool prefix, e.g. 'swarm init'.
 * @param {Array<object>} allCapabilities - All capability catalogs.
 * @returns {Array<{command: string, reason: string}>} Up to 5 unique
 *   negatives; the command itself is never included.
 */
function findHardNegatives(tool, command, allCapabilities) {
  const fullCommand = `${tool} ${command}`;
  const negatives = [];

  // A pair entry may be written relative to the current tool ('memory store')
  // or fully qualified ('ruvector search knn'). Relative entries that this
  // tool defines are qualified here; anything else is kept as written.
  const qualify = (entry) =>
    (getCommandConfig(tool, entry, allCapabilities) ? `${tool} ${entry}` : entry);

  // 1. Curated pairs. Exact comparison: substring matching would let 'embed'
  //    match 'claude-flow embeddings generate' and select the wrong side.
  for (const { cmd1, cmd2, reason } of CONFUSING_PAIRS) {
    const first = qualify(cmd1);
    const second = qualify(cmd2);
    if (first === fullCommand) {
      negatives.push({ command: second, reason });
    } else if (second === fullCommand) {
      negatives.push({ command: first, reason });
    }
  }

  // 2. Keyword overlap with commands of other tools. The source keywords do
  //    not change inside the loops, so look them up once.
  const sourceKeywords = new Set(
    getCommandConfig(tool, command, allCapabilities)?.keywords || [],
  );
  for (const cap of allCapabilities) {
    if (cap.name === tool) continue;

    for (const category of Object.values(cap.categories)) {
      for (const [cmdName, cmdConfig] of Object.entries(category.commands)) {
        const overlap = (cmdConfig.keywords || []).filter((k) => sourceKeywords.has(k));
        if (overlap.length >= 2) {
          negatives.push({
            command: `${cap.name} ${cmdName}`,
            reason: `keyword overlap: ${overlap.join(', ')}`,
          });
        }
      }
    }
  }

  // 3. Same action word in a different category of the same tool.
  const sourceCapability = allCapabilities.find((c) => c.name === tool);
  if (sourceCapability) {
    const [firstWord, secondWord] = command.split(' ');
    const sourceAction = secondWord || firstWord;

    for (const category of Object.values(sourceCapability.categories)) {
      // Skip the category that actually defines the command. Category keys
      // do not always equal the command's first word ('hivemind' vs
      // 'hive-mind'), so ownership is checked instead of names.
      if (Object.prototype.hasOwnProperty.call(category.commands, command)) continue;

      for (const cmdName of Object.keys(category.commands)) {
        const [head, tail] = cmdName.split(' ');
        const cmdAction = tail || head;
        if (cmdAction === sourceAction) {
          negatives.push({
            command: `${tool} ${cmdName}`,
            reason: `same action '${cmdAction}' different category`,
          });
        }
      }
    }
  }

  // De-duplicate (first reason wins) and cap at 5. Seeding the set with the
  // command itself keeps it from ever occupying one of the slots.
  const seen = new Set([fullCommand]);
  return negatives
    .filter((n) => {
      if (seen.has(n.command)) return false;
      seen.add(n.command);
      return true;
    })
    .slice(0, 5);
}
|
|
|
|
/**
 * Look up the definition of `command` in the capability catalog named `tool`.
 *
 * @param {string} tool - Capability name to search in.
 * @param {string} command - Command key, e.g. 'memory store'.
 * @param {Array<object>} allCapabilities - All capability catalogs.
 * @returns {object|null} The command definition from the first category that
 *   has a truthy entry for `command`, or null if the tool or command is unknown.
 */
function getCommandConfig(tool, command, allCapabilities) {
  const owner = allCapabilities.find(({ name }) => name === tool);
  if (!owner) {
    return null;
  }

  // Categories are scanned in declaration order; the first hit wins.
  const holder = Object.values(owner.categories).find((group) => group.commands[command]);
  return holder ? holder.commands[command] : null;
}
|
|
|
|
/**
|
|
* Generate triplets for all capabilities
|
|
*/
|
|
function generateTriplets(capabilities) {
|
|
const triplets = [];
|
|
|
|
for (const cap of capabilities) {
|
|
for (const [catName, category] of Object.entries(cap.categories)) {
|
|
for (const [cmdName, cmdConfig] of Object.entries(category.commands)) {
|
|
const fullCommand = `${cap.name} ${cmdName}`;
|
|
const prompts = generatePrompts(cap.name, cmdName, cmdConfig);
|
|
const negatives = findHardNegatives(cap.name, cmdName, capabilities);
|
|
|
|
// Create triplets
|
|
for (const prompt of prompts) {
|
|
// Skip malformed prompts
|
|
if (!prompt || prompt.length < 5) continue;
|
|
|
|
// For each prompt, create triplets with each negative
|
|
if (negatives.length > 0) {
|
|
for (const negative of negatives) {
|
|
// Ensure negative has full tool prefix
|
|
let negCommand = negative.command;
|
|
if (!negCommand.includes(' ') || (!negCommand.startsWith('claude-flow') &&
|
|
!negCommand.startsWith('agentic-flow') && !negCommand.startsWith('ruvector'))) {
|
|
continue; // Skip incomplete negatives
|
|
}
|
|
|
|
// Skip if negative equals positive
|
|
if (negCommand === fullCommand) continue;
|
|
|
|
triplets.push({
|
|
anchor: prompt,
|
|
positive: fullCommand,
|
|
negative: negCommand,
|
|
isHard: true,
|
|
category: catName,
|
|
tool: cap.name,
|
|
});
|
|
}
|
|
} else {
|
|
// Create triplet with random different tool command as negative
|
|
const otherCaps = capabilities.filter((c) => c.name !== cap.name);
|
|
if (otherCaps.length > 0) {
|
|
const randomCap = otherCaps[Math.floor(Math.random() * otherCaps.length)];
|
|
const randomCatName = Object.keys(randomCap.categories)[0];
|
|
const randomCmdName = Object.keys(randomCap.categories[randomCatName].commands)[0];
|
|
const negCommand = `${randomCap.name} ${randomCmdName}`;
|
|
|
|
// Skip if somehow equals positive
|
|
if (negCommand === fullCommand) continue;
|
|
|
|
triplets.push({
|
|
anchor: prompt,
|
|
positive: fullCommand,
|
|
negative: negCommand,
|
|
isHard: false,
|
|
category: catName,
|
|
tool: cap.name,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return triplets;
|
|
}
|
|
|
|
/**
 * Build the hand-curated contrastive examples, grouped by functional area.
 *
 * Each table row is `[anchor, positive, negatives]`. A row expands into one
 * triplet per listed negative, and every resulting triplet is flagged as a
 * hard negative.
 *
 * @returns {Array<{anchor: string, positive: string, negative: string,
 *   isHard: boolean}>} Triplets in table order.
 */
function generateCategoryExamples() {
  // ---- SWARM COORDINATION ----
  const swarmRows = [
    // Initialization
    ['Set up a multi-agent swarm for parallel processing', 'claude-flow swarm init', ['agentic-flow pipeline create', 'claude-flow agent spawn']],
    ['Initialize hierarchical agent coordination', 'claude-flow swarm init', ['claude-flow hive-mind init']],
    ['Create a mesh topology swarm with 10 agents', 'claude-flow swarm init', ['claude-flow hive-mind spawn']],
    ['Configure swarm consensus using raft protocol', 'claude-flow swarm init', ['claude-flow hive-mind consensus']],
    ['Start a queen-led hive-mind collective', 'claude-flow hive-mind init', ['claude-flow swarm init']],
    ['Coordinate multiple AI agents on a complex task', 'claude-flow swarm init', ['agentic-flow pipeline create']],
    ['Set up Byzantine fault-tolerant consensus', 'claude-flow hive-mind init', ['claude-flow swarm init']],
    ['Launch a distributed agent network', 'claude-flow swarm init', ['claude-flow agent spawn']],
    ['Create a star topology for centralized coordination', 'claude-flow swarm init', ['claude-flow hive-mind init']],
    // Status/monitoring
    ['Check the health of all running agents', 'claude-flow swarm status', ['claude-flow agent status']],
    ['Monitor swarm performance and throughput', 'claude-flow swarm status', ['claude-flow performance benchmark']],
    ['Get swarm coordination status', 'claude-flow swarm status', ['claude-flow agent list']],
  ];

  // ---- AGENT MANAGEMENT ----
  const agentRows = [
    ['Spawn a coder agent to implement features', 'claude-flow agent spawn', ['claude-flow hive-mind spawn', 'claude-flow task create']],
    ['Create a new worker agent for this task', 'claude-flow agent spawn', ['claude-flow hive-mind spawn']],
    ['Start a researcher agent to investigate', 'claude-flow agent spawn', ['claude-flow task create']],
    ['List all active agents in the system', 'claude-flow agent list', ['agentic-flow model list', 'ruvector collection list']],
    ['Show running agent processes', 'claude-flow agent list', ['claude-flow task list']],
    ['Kill a misbehaving agent', 'claude-flow agent terminate', ['claude-flow swarm shutdown']],
    ['Stop the agent that is stuck', 'claude-flow agent terminate', ['claude-flow task cancel']],
    ['Get details about a specific agent', 'claude-flow agent status', ['claude-flow swarm status']],
  ];

  // ---- MEMORY OPERATIONS ----
  const memoryRows = [
    // Store operations
    ['Store learned patterns for future reference', 'claude-flow memory store', ['ruvector vector insert', 'agentic-flow cache set']],
    ['Save task completion metrics to memory', 'claude-flow memory store', ['claude-flow session save']],
    ['Persist agent decisions for analysis', 'claude-flow memory store', ['ruvector vector insert']],
    ['Cache successful code patterns', 'claude-flow memory store', ['agentic-flow cache set']],
    ['Remember this debugging solution', 'claude-flow memory store', ['agentic-flow cache set']],
    ['Store API response for later retrieval', 'claude-flow memory store', ['ruvector vector insert']],
    ['Save this configuration for reuse', 'claude-flow memory store', ['claude-flow session save']],
    // Retrieve operations
    ['Get the stored pattern for authentication', 'claude-flow memory retrieve', ['claude-flow memory search', 'ruvector vector get']],
    ['Fetch previously saved configuration', 'claude-flow memory retrieve', ['claude-flow session restore']],
    ['Load cached data from memory', 'claude-flow memory retrieve', ['agentic-flow cache get']],
    // Search operations
    ['Search memory for similar patterns', 'claude-flow memory search', ['ruvector search knn', 'agentic-flow search nearest']],
    ['Find relevant past solutions', 'claude-flow memory search', ['ruvector search filter']],
    ['Query semantic memory for debugging tips', 'claude-flow memory search', ['agentic-flow search hybrid']],
    ['Look up related patterns in storage', 'claude-flow memory search', ['ruvector search knn']],
  ];

  // ---- VECTOR DATABASE (RUVECTOR) ----
  const vectorRows = [
    // Search operations
    ['Find k-nearest matches to this embedding', 'ruvector search knn', ['claude-flow memory search', 'agentic-flow search nearest']],
    ['Search vectors with metadata filters', 'ruvector search filter', ['claude-flow memory search', 'agentic-flow search hybrid']],
    ['Perform approximate nearest neighbor search', 'ruvector search knn', ['agentic-flow search nearest']],
    ['Query the vector database for similar items', 'ruvector search knn', ['claude-flow memory search']],
    ['Find similar embeddings in the collection', 'ruvector search knn', ['claude-flow embeddings search']],
    ['Batch search multiple query vectors', 'ruvector search batch', ['agentic-flow batch-embed']],
    // Collection operations
    ['Create a new vector collection for documents', 'ruvector collection create', ['claude-flow memory store']],
    ['Set up a database for embedding storage', 'ruvector collection create', ['agentic-flow cache set']],
    ['Delete the old vector collection', 'ruvector collection delete', ['claude-flow memory delete']],
    ['Get information about the collection', 'ruvector collection info', ['agentic-flow model info']],
    // Vector CRUD
    ['Insert embeddings into the database', 'ruvector vector insert', ['claude-flow memory store', 'agentic-flow cache set']],
    ['Add vectors to the collection', 'ruvector vector insert', ['claude-flow memory store']],
    ['Upsert vectors with metadata', 'ruvector vector insert', ['agentic-flow cache set']],
    ['Delete vectors by ID', 'ruvector vector delete', ['claude-flow memory delete']],
    // Index operations
    ['Build HNSW index for faster search', 'ruvector index build', ['claude-flow hooks pretrain']],
    ['Create search index for vectors', 'ruvector index build', ['claude-flow neural train']],
    ['Optimize index for query performance', 'ruvector index optimize', ['claude-flow performance optimize']],
    ['Rebuild search index after updates', 'ruvector index rebuild', ['claude-flow hooks pretrain']],
    ['Configure HNSW parameters for accuracy', 'ruvector index build', ['ruvector quantize apply']],
    // Persistence
    ['Create a snapshot backup of vectors', 'ruvector snapshot create', ['claude-flow session save']],
    ['Restore vectors from backup', 'ruvector snapshot restore', ['claude-flow session restore']],
  ];

  // ---- EMBEDDINGS (AGENTIC-FLOW) ----
  const embeddingRows = [
    ['Generate embeddings for these documents', 'agentic-flow embed', ['claude-flow embeddings generate']],
    ['Create vector representations of text', 'agentic-flow embed', ['claude-flow embeddings generate']],
    ['Encode sentences into embeddings', 'agentic-flow embed', ['claude-flow embeddings generate']],
    ['Vectorize code snippets for search', 'agentic-flow embed', ['ruvector vector insert']],
    ['Produce semantic embeddings from descriptions', 'agentic-flow embed', ['claude-flow embeddings generate']],
    ['Convert text to numerical vectors using ONNX', 'agentic-flow embed', ['claude-flow embeddings generate']],
    ['Batch embed multiple documents efficiently', 'agentic-flow batch-embed', ['ruvector search batch']],
    ['Embed large corpus in parallel', 'agentic-flow batch-embed', ['ruvector vector insert']],
    ['Compare similarity between two texts', 'agentic-flow similarity', ['claude-flow embeddings compare']],
    ['Calculate cosine distance between embeddings', 'agentic-flow similarity', ['claude-flow embeddings compare']],
    // Model operations
    ['Load the ONNX model for inference', 'agentic-flow model load', ['claude-flow neural train']],
    ['List available embedding models', 'agentic-flow model list', ['claude-flow agent list']],
    ['Quantize the model for faster inference', 'agentic-flow model quantize', ['ruvector quantize apply']],
    // Cache operations
    ['Cache the embedding for reuse', 'agentic-flow cache set', ['claude-flow memory store']],
    ['Get cached embedding', 'agentic-flow cache get', ['claude-flow memory retrieve']],
    ['Clear the embedding cache', 'agentic-flow cache clear', ['claude-flow memory delete']],
    // Search
    ['Find nearest neighbors to query', 'agentic-flow search nearest', ['ruvector search knn']],
    ['Hybrid keyword and semantic search', 'agentic-flow search hybrid', ['ruvector search filter']],
    // Pipeline
    ['Create an embedding pipeline', 'agentic-flow pipeline create', ['claude-flow workflow create']],
    ['Run the processing pipeline', 'agentic-flow pipeline run', ['claude-flow workflow execute']],
  ];

  // ---- HOOKS AND LEARNING ----
  const hookRows = [
    ['Route this task to the optimal agent', 'claude-flow hooks route', ['claude-flow agent spawn', 'claude-flow hooks pre-task']],
    ['Get agent suggestions before starting work', 'claude-flow hooks pre-task', ['claude-flow hooks route']],
    ['Record task completion for learning', 'claude-flow hooks post-task', ['claude-flow hooks metrics']],
    ['Analyze codebase to bootstrap intelligence', 'claude-flow hooks pretrain', ['claude-flow neural train', 'ruvector index build']],
    ['Track metrics from completed tasks', 'claude-flow hooks metrics', ['claude-flow performance benchmark']],
    ['Pre-train routing model on repository', 'claude-flow hooks pretrain', ['claude-flow neural train']],
    ['Dispatch a background worker for optimization', 'claude-flow hooks worker dispatch', ['claude-flow agent spawn']],
    ['Log task before starting', 'claude-flow hooks pre-task', ['claude-flow task create']],
  ];

  // ---- WORKFLOW AND TASKS ----
  const workflowRows = [
    ['Create a new workflow for code review', 'claude-flow workflow create', ['agentic-flow pipeline create', 'claude-flow task create']],
    ['Define a multi-step workflow', 'claude-flow workflow create', ['agentic-flow pipeline create']],
    ['Execute the deployment workflow', 'claude-flow workflow execute', ['agentic-flow pipeline run']],
    ['Run the CI/CD workflow', 'claude-flow workflow execute', ['claude-flow task create']],
    ['Check workflow execution status', 'claude-flow workflow status', ['claude-flow task status']],
    ['Create a task for the coder agent', 'claude-flow task create', ['claude-flow agent spawn']],
    ['Add a new task to the queue', 'claude-flow task create', ['claude-flow workflow create']],
    ['Mark this task as complete', 'claude-flow task complete', ['claude-flow hooks post-task']],
    ['List pending tasks', 'claude-flow task list', ['claude-flow agent list']],
  ];

  // ---- SESSION AND STATE ----
  const sessionRows = [
    ['Save the current session state', 'claude-flow session save', ['claude-flow memory store', 'ruvector snapshot create']],
    ['Persist session for later continuation', 'claude-flow session save', ['claude-flow memory store']],
    ['Restore previous session', 'claude-flow session restore', ['ruvector snapshot restore', 'claude-flow memory retrieve']],
    ['Continue where I left off', 'claude-flow session restore', ['claude-flow memory retrieve']],
    ['List saved sessions', 'claude-flow session list', ['claude-flow memory list']],
  ];

  // ---- NEURAL AND ML ----
  const neuralRows = [
    ['Train the routing model', 'claude-flow neural train', ['claude-flow hooks pretrain']],
    ['Train neural patterns for better routing', 'claude-flow neural train', ['claude-flow hooks pretrain']],
    ['Make prediction with neural model', 'claude-flow neural predict', ['claude-flow hooks route']],
    ['Get neural routing prediction', 'claude-flow neural predict', ['agentic-flow embed']],
    ['Store learned neural patterns', 'claude-flow neural patterns', ['claude-flow memory store']],
  ];

  // ---- PERFORMANCE AND SECURITY ----
  const perfRows = [
    ['Benchmark system performance', 'claude-flow performance benchmark', ['claude-flow hooks metrics']],
    ['Profile slow operations', 'claude-flow performance profile', ['claude-flow performance benchmark']],
    ['Optimize for lower latency', 'claude-flow performance optimize', ['ruvector index optimize']],
    ['Scan input for security threats', 'claude-flow aidefence scan', ['claude-flow hooks pre-task']],
    ['Check if input is safe', 'claude-flow aidefence is_safe', ['claude-flow aidefence scan']],
    ['Analyze potential prompt injection', 'claude-flow aidefence analyze', ['claude-flow hooks route']],
  ];

  // ---- CLAIMS AND COORDINATION ----
  const claimsRows = [
    ['Claim this issue for work', 'claude-flow claims claim', ['claude-flow task create']],
    ['Assign this issue to me', 'claude-flow claims claim', ['claude-flow task create']],
    ['Release my claim on this issue', 'claude-flow claims release', ['claude-flow task complete']],
    ['Hand off issue to another agent', 'claude-flow claims handoff', ['claude-flow claims release']],
    ['View the claims board', 'claude-flow claims board', ['claude-flow task list']],
  ];

  // Expand every row into one triplet per negative, keeping table order.
  return [
    swarmRows, agentRows, memoryRows, vectorRows,
    embeddingRows, hookRows, workflowRows, sessionRows,
    neuralRows, perfRows, claimsRows,
  ]
    .flat()
    .flatMap(([anchor, positive, negatives]) =>
      negatives.map((negative) => ({ anchor, positive, negative, isHard: true })));
}
|
|
|
|
// ============================================================================
|
|
// MAIN GENERATION
|
|
// ============================================================================
|
|
|
|
/**
 * Write the three capability definitions as pretty-printed JSON files.
 *
 * @param {string} outputDir - Directory that receives the JSON files.
 */
function saveCapabilities(outputDir) {
  // Target filename -> capability object to serialize.
  const payloadByFilename = {
    'claude-flow-capabilities.json': CLAUDE_FLOW_CAPABILITIES,
    'agentic-flow-capabilities.json': AGENTIC_FLOW_CAPABILITIES,
    'ruvector-capabilities.json': RUVECTOR_CAPABILITIES,
  };

  Object.entries(payloadByFilename).forEach(([filename, data]) => {
    const target = path.join(outputDir, filename);
    const serialized = JSON.stringify(data, null, 2);
    fs.writeFileSync(target, serialized);
    console.log(` Saved ${target}`);
  });
}
|
|
|
|
/**
 * Shuffle an array in place with the Fisher–Yates algorithm.
 *
 * Sorting with a random comparator is not used because an inconsistent
 * comparator yields a biased, engine-dependent order.
 *
 * @param {Array} items - Array to shuffle (mutated).
 * @returns {Array} The same array instance, shuffled.
 */
function shuffleInPlace(items) {
  for (let i = items.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [items[i], items[j]] = [items[j], items[i]];
  }
  return items;
}

/**
 * Extract the value of the `--output=<path>` CLI option.
 *
 * Everything after the first `=` is kept, so paths that themselves contain
 * `=` are not truncated.
 *
 * @param {string[]} args - Command-line arguments (without node/script).
 * @returns {string} The option value, or `''` when absent or empty.
 */
function readOutputOption(args) {
  const prefix = '--output=';
  const flag = args.find((a) => a.startsWith(prefix));
  return flag ? flag.slice(prefix.length) : '';
}

/**
 * Main entry point.
 *
 * Reads `--output=<path>` and `--save-capabilities` from `process.argv`,
 * generates capability-derived and hand-curated triplets, shuffles them,
 * writes `{anchor, positive, negative}` records as JSONL to the output file
 * (default: `ecosystem-triplets.jsonl` next to this script) and prints a
 * summary, sample triplets and suggested next steps.
 */
function main() {
  console.log('\n' + '='.repeat(80));
  console.log(' ECOSYSTEM TRAINING DATA GENERATOR FOR RUVLTRA');
  console.log('='.repeat(80) + '\n');

  const args = process.argv.slice(2);
  const outputDir = path.dirname(path.resolve(__filename));
  const outputFile = readOutputOption(args) ||
    path.join(outputDir, 'ecosystem-triplets.jsonl');
  const saveCapabilityFiles = args.includes('--save-capabilities');

  console.log('Configuration:');
  console.log(` Output: ${outputFile}`);
  console.log(` Save capability JSONs: ${saveCapabilityFiles}`);
  console.log();

  // All capabilities
  const allCapabilities = [
    CLAUDE_FLOW_CAPABILITIES,
    AGENTIC_FLOW_CAPABILITIES,
    RUVECTOR_CAPABILITIES,
  ];

  // Save capability definitions if requested
  if (saveCapabilityFiles) {
    console.log('Saving capability definitions...');
    saveCapabilities(outputDir);
    console.log();
  }

  // Generate triplets
  console.log('Generating training triplets...');

  const triplets = generateTriplets(allCapabilities);
  const categoryExamples = generateCategoryExamples();

  // Combine all triplets and shuffle for better training
  const allTriplets = shuffleInPlace([...triplets, ...categoryExamples]);

  // Statistics
  const stats = {
    total: allTriplets.length,
    byTool: {},
    byCategory: {},
    hardNegatives: 0,
  };

  for (const t of allTriplets) {
    if (t.tool) {
      stats.byTool[t.tool] = (stats.byTool[t.tool] || 0) + 1;
    }
    if (t.category) {
      stats.byCategory[t.category] = (stats.byCategory[t.category] || 0) + 1;
    }
    if (t.isHard) {
      stats.hardNegatives++;
    }
  }

  // Save triplets as JSONL (bookkeeping fields such as isHard are dropped)
  const jsonlContent = allTriplets.map((t) => JSON.stringify({
    anchor: t.anchor,
    positive: t.positive,
    negative: t.negative,
  })).join('\n');

  // A user-supplied --output may point into a directory that does not exist yet.
  fs.mkdirSync(path.dirname(outputFile), { recursive: true });
  fs.writeFileSync(outputFile, jsonlContent);

  // Print summary
  console.log('\n' + '-'.repeat(60));
  console.log(' GENERATION SUMMARY');
  console.log('-'.repeat(60) + '\n');

  // Avoid printing "NaN%" when nothing was generated.
  const hardPercent = stats.total > 0
    ? ((stats.hardNegatives / stats.total) * 100).toFixed(1)
    : '0.0';
  console.log(`Total triplets generated: ${stats.total}`);
  console.log(`Hard negatives: ${stats.hardNegatives} (${hardPercent}%)`);
  console.log();

  console.log('Triplets by tool:');
  for (const [tool, count] of Object.entries(stats.byTool)) {
    console.log(` ${tool.padEnd(20)} ${count}`);
  }
  console.log();

  console.log('Triplets by category:');
  const categoryEntries = Object.entries(stats.byCategory);
  for (const [category, count] of categoryEntries.slice(0, 10)) {
    console.log(` ${category.padEnd(20)} ${count}`);
  }
  if (categoryEntries.length > 10) {
    console.log(` ... and ${categoryEntries.length - 10} more categories`);
  }
  console.log();

  console.log(`Output saved to: ${outputFile}`);
  console.log();

  // Show sample triplets
  console.log('-'.repeat(60));
  console.log(' SAMPLE TRIPLETS');
  console.log('-'.repeat(60) + '\n');

  for (const triplet of allTriplets.slice(0, 5)) {
    console.log(` Anchor: "${triplet.anchor}"`);
    console.log(` Positive: ${triplet.positive}`);
    console.log(` Negative: ${triplet.negative}`);
    console.log();
  }

  console.log('='.repeat(80));
  console.log(' NEXT STEPS');
  console.log('='.repeat(80) + '\n');

  console.log('1. Merge with existing training data:');
  console.log(` cat ~/.ruvllm/training/ruvltra-finetuned/triplets.jsonl ${outputFile} > combined.jsonl`);
  console.log();
  console.log('2. Train with contrastive loss:');
  console.log(' cargo run --example train_contrastive --release -- --triplets combined.jsonl --epochs 30');
  console.log();
  console.log('3. Evaluate routing accuracy improvement');
  console.log();
}
|
|
|
|
// Export for testing
|
|
module.exports = {
|
|
CLAUDE_FLOW_CAPABILITIES,
|
|
AGENTIC_FLOW_CAPABILITIES,
|
|
RUVECTOR_CAPABILITIES,
|
|
generatePrompts,
|
|
findHardNegatives,
|
|
generateTriplets,
|
|
generateCategoryExamples,
|
|
CONFUSING_PAIRS,
|
|
};
|
|
|
|
// Run if called directly
|
|
if (require.main === module) {
|
|
main();
|
|
}
|