#!/usr/bin/env node
/**
 * Pretrain Intelligence System v2 - Enhanced with all v2 features
 *
 * Improvements over v1:
 * - Uses ALL available data (no arbitrary limits)
 * - Bootstraps Confidence Calibration from performance-metrics
 * - Adds Pattern Decay timestamps to Q-table
 * - Identifies Uncertain States for Active Learning
 * - Prepares A/B Testing baseline metrics
 */

import Database from 'better-sqlite3';
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
import { join, dirname, extname, basename } from 'path';
import { fileURLToPath } from 'url';
import { createHash } from 'crypto';

const __dirname = dirname(fileURLToPath(import.meta.url));
const DATA_DIR = join(__dirname, 'data');
const MEMORY_DB = '/workspaces/ruvector/.swarm/memory.db';

// Ensure data directory exists
if (!existsSync(DATA_DIR)) mkdirSync(DATA_DIR, { recursive: true });

/**
 * Text to embedding (same as in index.js).
 *
 * Lower-cases the text, keeps tokens longer than one character, and for each
 * distinct token adds a SHA-256-derived vector weighted by
 * 1 / ln(1 + frequency). The result is L2-normalised when non-zero.
 *
 * @param {string} text - Text to embed.
 * @param {number} [dims=128] - Number of embedding dimensions.
 * @returns {number[]} Plain array of `dims` numbers.
 */
function textToEmbedding(text, dims = 128) {
  const embedding = new Float32Array(dims).fill(0);
  const normalized = text.toLowerCase().replace(/[^a-z0-9\s]/g, ' ');
  const words = normalized.split(/\s+/).filter(w => w.length > 1);

  // Null-prototype object: with a plain `{}` the token "constructor" resolves
  // to the inherited Object function, which turns the frequency into a string
  // and the whole embedding into NaN. Object.entries order is unchanged, so
  // the floating-point summation order (and thus the output) stays the same.
  const wordFreq = Object.create(null);
  for (const word of words) {
    wordFreq[word] = (wordFreq[word] || 0) + 1;
  }

  for (const [word, freq] of Object.entries(wordFreq)) {
    const hash = createHash('sha256').update(word).digest();
    const idfWeight = 1 / Math.log(1 + freq);
    for (let i = 0; i < dims; i++) {
      const byteIdx = i % hash.length;
      // Map each hash byte from [0, 255] to [-1, 1].
      const val = ((hash[byteIdx] & 0xFF) / 127.5) - 1;
      embedding[i] += val * idfWeight;
    }
  }

  const magnitude = Math.sqrt(embedding.reduce((sum, v) => sum + v * v, 0));
  if (magnitude > 0) {
    for (let i = 0; i < dims; i++) embedding[i] /= magnitude;
  }

  return Array.from(embedding);
}

/**
 * Convert a `created_at` column value to an ISO-8601 string.
 * A falsy value falls back to the current time. Throws RangeError when the
 * value does not produce a valid Date (callers treat that as a malformed row).
 * NOTE(review): the value is multiplied by 1000, i.e. treated as epoch seconds - confirm against the writer.
 */
function toIsoTimestamp(createdAt) {
  return createdAt
    ? new Date(createdAt * 1000).toISOString()
    : new Date().toISOString();
}

/** Classify a command string into a coarse command type (first match wins). */
function classifyCommand(cmd) {
  if (cmd.startsWith('cargo')) return 'cargo';
  if (cmd.startsWith('npm')) return 'npm';
  if (cmd.startsWith('git')) return 'git';
  if (cmd.startsWith('wasm-pack')) return 'wasm';
  if (cmd.includes('test')) return 'test';
  if (cmd.includes('build')) return 'build';
  return 'other';
}

/** Detect the project context mentioned in a command string (first match wins). */
function detectContext(cmd) {
  const contexts = [
    'rvlite', 'ruvector-core', 'ruvector-graph', 'wasm', 'postgres',
    'mincut', 'gnn', 'attention', 'sona'
  ];
  return contexts.find(name => cmd.includes(name)) || 'general';
}

/**
 * Ensure a Q-table state exists and record one more update on it.
 *
 * Rows are read newest-first, so blindly assigning `lastUpdate` on every row
 * would leave it at the OLDEST timestamp and `firstSeen` at the newest.
 * Instead the newest timestamp is kept in `lastUpdate` and the oldest in
 * `firstSeen`, independent of processing order.
 *
 * @param {object} qTable - Q-table being built (mutated).
 * @param {string} state - State key.
 * @param {string} timestamp - ISO timestamp of the current row.
 * @param {object} [initialActions] - Action entries to seed a new state with.
 * @returns {number} The state's update count after this update.
 */
function touchState(qTable, state, timestamp, initialActions = {}) {
  if (!qTable[state]) {
    qTable[state] = {
      ...initialActions,
      _meta: { lastUpdate: timestamp, updateCount: 0, firstSeen: timestamp }
    };
  }
  const meta = qTable[state]._meta;
  meta.updateCount = (meta.updateCount || 0) + 1;

  const when = Date.parse(timestamp);
  if (when > Date.parse(meta.lastUpdate)) meta.lastUpdate = timestamp;
  if (when < Date.parse(meta.firstSeen)) meta.firstSeen = timestamp;

  return meta.updateCount;
}

/** 1. Extract Command Patterns → Q-Table with Decay Metadata. */
function extractCommandPatterns(db, ctx) {
  console.log('šŸ“Š Extracting command patterns (ALL data)...');

  // Get ALL commands (no limit)
  const commands = db.prepare(`
    SELECT key, value, created_at FROM memory_entries
    WHERE namespace = 'command-history'
    ORDER BY created_at DESC
  `).all();

  for (const row of commands) {
    try {
      const data = JSON.parse(row.value);
      const cmd = data.command || '';
      // Accept the exit code both as the string '0' and as the number 0.
      const success = data.success === true || data.exitCode === '0' || data.exitCode === 0;
      const timestamp = toIsoTimestamp(row.created_at);

      const state = `${classifyCommand(cmd)}_in_${detectContext(cmd)}`;
      const action = success ? 'command-succeeded' : 'command-failed';
      const reward = success ? 1.0 : -0.5;

      const stateCount = touchState(ctx.qTable, state, timestamp, {
        'command-succeeded': 0,
        'command-failed': 0
      });

      // Decaying learning rate with Q-value caps
      const learningRate = Math.max(0.01, 0.3 / Math.sqrt(stateCount));
      const currentQ = ctx.qTable[state][action] || 0;
      const newQ = currentQ + learningRate * (reward - currentQ);
      ctx.qTable[state][action] = Math.min(0.8, Math.max(-0.5, newQ));

      // Record trajectory with timestamp
      ctx.trajectories.push({
        id: `pretrain-cmd-${ctx.stats.commands}`,
        state,
        action,
        outcome: cmd.slice(0, 100),
        reward,
        timestamp
      });
      ctx.stats.commands++;
    } catch (e) {
      // Best-effort: skip malformed rows, but keep count so it can be reported.
      ctx.skipped++;
    }
  }
  console.log(` āœ… Processed ${ctx.stats.commands} commands`);
}

/**
 * 2. Extract Agent Assignments → Q-Table.
 * @returns {object[]} The raw assignment rows (reused for the coordination graph).
 */
function extractAgentAssignments(db, ctx) {
  console.log('šŸ¤– Extracting agent assignments (ALL data)...');

  const agentAssignments = db.prepare(`
    SELECT key, value, created_at FROM memory_entries
    WHERE namespace = 'agent-assignments'
    ORDER BY created_at DESC
  `).all();

  for (const row of agentAssignments) {
    try {
      const data = JSON.parse(row.value);
      const file = data.file || '';
      const ext = extname(file).slice(1) || 'unknown';
      const agentType = data.type || 'coder';
      const recommended = data.recommended === true;
      const timestamp = toIsoTimestamp(row.created_at);

      // Extract crate if applicable
      const crateMatch = file.match(/crates\/([^/]+)/);
      const crate = crateMatch ? crateMatch[1] : 'project';

      const state = `edit_${ext}_in_${crate}`;
      const action = agentType;
      const reward = recommended ? 1.0 : 0.5;

      const stateCount = touchState(ctx.qTable, state, timestamp);

      const learningRate = Math.max(0.01, 0.2 / Math.sqrt(stateCount));
      const currentQ = ctx.qTable[state][action] || 0;
      ctx.qTable[state][action] = Math.min(0.75, currentQ + learningRate * (reward - currentQ));

      ctx.trajectories.push({
        id: `pretrain-agent-${ctx.stats.agents}`,
        state,
        action,
        outcome: `recommended for ${basename(file)}`,
        reward,
        timestamp
      });
      ctx.stats.agents++;
    } catch (e) {
      ctx.skipped++;
    }
  }
  console.log(` āœ… Processed ${ctx.stats.agents} agent assignments`);

  return agentAssignments;
}

/**
 * 3. Bootstrap Calibration from Performance Metrics.
 * @returns {{buckets: object, predictions: Array}} Calibration data.
 */
function bootstrapCalibration(db, ctx) {
  console.log('šŸ“ˆ Bootstrapping confidence calibration...');

  const performanceMetrics = db.prepare(`
    SELECT key, value FROM memory_entries
    WHERE namespace = 'performance-metrics' AND key LIKE 'command-metrics:%'
  `).all();

  // Group by complexity (as a proxy for confidence).
  // A Map avoids matching inherited Object properties for odd complexity values.
  const complexityToConfidence = new Map([
    ['low', 0.9],
    ['medium', 0.7],
    ['high', 0.5]
  ]);

  // Format must match CalibrationTracker expected format:
  // { buckets: { "0.9": { total, correct } }, predictions: [] }
  // (CalibrationTracker uses "correct", not "accuracy".)
  const calibration = { buckets: {}, predictions: [] };

  for (const row of performanceMetrics) {
    try {
      const data = JSON.parse(row.value);
      const success = data.success === true;
      const complexity = data.complexity || 'medium';
      const confidence = complexityToConfidence.get(complexity) ?? 0.7;

      // Round to bucket (0.5, 0.6, 0.7, 0.8, 0.9)
      const bucket = (Math.round(confidence * 10) / 10).toFixed(1);
      if (!calibration.buckets[bucket]) {
        calibration.buckets[bucket] = { total: 0, correct: 0 };
      }
      calibration.buckets[bucket].total++;
      if (success) calibration.buckets[bucket].correct++;
      ctx.stats.calibration++;
    } catch (e) {
      ctx.skipped++;
    }
  }

  console.log(` āœ… Bootstrapped calibration from ${ctx.stats.calibration} metrics`);
  console.log(` šŸ“Š Calibration buckets: ${Object.keys(calibration.buckets).length}`);

  return calibration;
}

/** 4. Extract File History → Vector Memory. */
function extractFileHistory(db, ctx) {
  console.log('šŸ“ Extracting file edit history (ALL data)...');

  const fileHistory = db.prepare(`
    SELECT key, value, created_at FROM memory_entries
    WHERE namespace = 'file-history'
    ORDER BY created_at DESC
  `).all();

  for (const row of fileHistory) {
    try {
      const data = JSON.parse(row.value);
      const file = data.file || '';
      const ext = extname(file).slice(1);
      const crateMatch = file.match(/crates\/([^/]+)/);
      const crate = crateMatch ? crateMatch[1] : null;
      const timestamp = toIsoTimestamp(row.created_at);

      const content = `edit ${ext} file ${basename(file)} in ${crate || 'project'}`;

      ctx.memories.push({
        id: `pretrain-file-${ctx.stats.files}`,
        type: 'edit',
        content,
        embedding: textToEmbedding(content),
        metadata: { file, crate, ext, timestamp }
      });
      ctx.stats.files++;
    } catch (e) {
      ctx.skipped++;
    }
  }
  console.log(` āœ… Processed ${ctx.stats.files} file edits`);
}

/** 5. Extract Reasoning Patterns → Vector Memory. */
function extractReasoningPatterns(db, ctx) {
  console.log('🧩 Extracting reasoning patterns...');

  const patterns = db.prepare(`
    SELECT id, type, pattern_data, confidence, created_at FROM patterns
    ORDER BY confidence DESC
  `).all();

  for (const row of patterns) {
    try {
      const data = JSON.parse(row.pattern_data);
      const content = data.content || data.title || JSON.stringify(data).slice(0, 200);
      // Unlike memory_entries rows, created_at is stored as-is here.
      const timestamp = row.created_at || new Date().toISOString();

      ctx.memories.push({
        id: `pretrain-pattern-${ctx.stats.patterns}`,
        type: 'pattern',
        content,
        embedding: textToEmbedding(content),
        metadata: {
          patternId: row.id,
          patternType: row.type,
          confidence: row.confidence,
          timestamp
        }
      });
      ctx.stats.patterns++;
    } catch (e) {
      ctx.skipped++;
    }
  }
  console.log(` āœ… Processed ${ctx.stats.patterns} patterns`);
}

/**
 * 6. Identify Uncertain States for Active Learning.
 * A state is uncertain when its two best Q-values differ by less than 0.1
 * and the best one is positive.
 */
function findUncertainStates(qTable) {
  console.log('šŸŽÆ Identifying uncertain states...');

  const uncertainStates = [];
  for (const [state, actions] of Object.entries(qTable)) {
    const qValues = Object.entries(actions)
      .filter(([k, v]) => k !== '_meta' && k !== '_count' && typeof v === 'number')
      .map(([k, v]) => ({ action: k, q: v }))
      .sort((a, b) => b.q - a.q);

    if (qValues.length < 2) continue;

    const gap = qValues[0].q - qValues[1].q;
    if (gap < 0.1 && qValues[0].q > 0) {
      // Close Q-values = uncertain
      uncertainStates.push({
        state,
        bestAction: qValues[0].action,
        secondBest: qValues[1].action,
        gap: gap.toFixed(4),
        needsExploration: true
      });
    }
  }

  console.log(` āœ… Found ${uncertainStates.length} uncertain states for active learning`);
  return uncertainStates;
}

/**
 * 7. Build Swarm Coordination Graph.
 * Nodes are the agent types seen in the assignments; an edge connects two
 * agents that were assigned files in at least one common directory.
 */
function buildCoordinationGraph(agentAssignments, stats) {
  console.log('šŸ”— Building swarm coordination graph...');

  const nodes = {};
  const edges = {};

  const agents = new Set();
  for (const row of agentAssignments) {
    try {
      const data = JSON.parse(row.value);
      if (data.type) agents.add(data.type);
    } catch (e) { /* malformed rows were already counted during Q-table extraction */ }
  }

  const agentCapabilities = new Map([
    ['coder', ['rust', 'typescript', 'implementation']],
    ['technical-writer', ['documentation', 'markdown']],
    ['reviewer', ['code-review', 'security']],
    ['tester', ['unit-test', 'integration']],
    ['general-developer', ['general', 'debugging']],
    ['rust-developer', ['rust', 'cargo', 'wasm']],
    ['typescript-developer', ['typescript', 'javascript', 'node']],
    ['ml-developer', ['gnn', 'attention', 'neural']],
    ['documentation-writer', ['docs', 'readme', 'api-docs']]
  ]);

  for (const agent of agents) {
    nodes[agent] = {
      type: agent,
      capabilities: agentCapabilities.get(agent) ?? [agent],
      load: 0,
      active: true
    };
    stats.coordination++;
  }

  // Create edges based on common file edits.
  // Null-prototype object so an agent name can never hit an inherited property.
  const agentFiles = Object.create(null);
  for (const row of agentAssignments) {
    try {
      const data = JSON.parse(row.value);
      const agent = data.type;
      const file = data.file;
      // Skip records without an agent type (no node exists for them) or without
      // a string path (dirname() below would throw and abort the whole run).
      if (!agent || typeof file !== 'string') continue;
      if (!agentFiles[agent]) agentFiles[agent] = [];
      agentFiles[agent].push(file);
    } catch (e) { /* malformed rows were already counted during Q-table extraction */ }
  }

  // Compute each agent's directory set once instead of once per pair.
  const agentList = Object.keys(agentFiles);
  const agentDirs = agentList.map(agent => new Set(agentFiles[agent].map(f => dirname(f))));

  for (let i = 0; i < agentList.length; i++) {
    for (let j = i + 1; j < agentList.length; j++) {
      let overlap = 0;
      for (const dir of agentDirs[i]) {
        if (agentDirs[j].has(dir)) overlap++;
      }

      if (overlap > 0) {
        edges[`${agentList[i]}:${agentList[j]}`] = { weight: overlap, interactions: overlap };
      }
    }
  }

  console.log(` āœ… Built graph with ${Object.keys(nodes).length} agents, ${Object.keys(edges).length} edges`);
  return { nodes, edges };
}

/** 8. Save All Data as JSON files under DATA_DIR. */
function savePretrainedData({ qTable, trajectories, memories, calibration, uncertainStates, nodes, edges, stats }) {
  console.log('\nšŸ’¾ Saving pretrained data (v2)...');

  // Save Q-Table with decay metadata
  writeFileSync(join(DATA_DIR, 'patterns.json'), JSON.stringify(qTable, null, 2));
  console.log(` āœ… Q-Table: ${Object.keys(qTable).length} states (with decay metadata)`);

  // Save Trajectories (keep last 2000 for more history)
  // NOTE(review): rows are read newest-first and commands are pushed before
  // agent assignments, so the array tail kept here holds the agent assignments
  // plus the OLDEST commands - confirm that is the intended "last 2000".
  writeFileSync(join(DATA_DIR, 'trajectories.json'), JSON.stringify(trajectories.slice(-2000), null, 2));
  console.log(` āœ… Trajectories: ${Math.min(trajectories.length, 2000)} entries`);

  // Save Memories
  writeFileSync(join(DATA_DIR, 'memory.json'), JSON.stringify(memories, null, 2));
  console.log(` āœ… Vector Memory: ${memories.length} entries`);

  // Save Calibration (NEW)
  writeFileSync(join(DATA_DIR, 'calibration.json'), JSON.stringify(calibration, null, 2));
  console.log(` āœ… Calibration: ${Object.keys(calibration.buckets).length} buckets`);

  // Save Uncertain States for Active Learning (NEW)
  writeFileSync(
    join(DATA_DIR, 'uncertain-states.json'),
    JSON.stringify({ states: uncertainStates, lastUpdated: new Date().toISOString() }, null, 2)
  );
  console.log(` āœ… Uncertain States: ${uncertainStates.length} entries`);

  // Save Swarm Graph
  writeFileSync(
    join(DATA_DIR, 'coordination-graph.json'),
    JSON.stringify({ nodes, edges, lastUpdated: new Date().toISOString() }, null, 2)
  );
  console.log(` āœ… Swarm Graph: ${Object.keys(nodes).length} nodes`);

  // Save Swarm State
  writeFileSync(
    join(DATA_DIR, 'swarm-state.json'),
    JSON.stringify({
      tasks: [],
      optimizations: 0,
      pretrained: true,
      pretrainVersion: 2,
      pretrainedAt: new Date().toISOString(),
      stats,
      features: {
        patternDecay: true,
        calibration: true,
        activeLearning: true,
        uncertainStates: uncertainStates.length
      }
    }, null, 2)
  );

  // Initialize empty feedback tracking (suggestions must be array, followRates must be object)
  writeFileSync(
    join(DATA_DIR, 'feedback.json'),
    JSON.stringify({ suggestions: [], followRates: {}, lastUpdated: new Date().toISOString() }, null, 2)
  );
  console.log(` āœ… Feedback tracking initialized`);
}

/** Print the end-of-run summary. */
function printSummary({ stats, uncertainStates, nodes, qTable, memories }) {
  console.log('\nāœ… Pretraining v2 Complete!');
  console.log('===========================');
  console.log(` Commands processed: ${stats.commands.toLocaleString()}`);
  console.log(` Agent assignments: ${stats.agents}`);
  console.log(` File edits: ${stats.files.toLocaleString()}`);
  console.log(` Patterns: ${stats.patterns}`);
  console.log(` Calibration samples: ${stats.calibration.toLocaleString()}`);
  console.log(` Uncertain states: ${uncertainStates.length}`);
  console.log(` Swarm nodes: ${Object.keys(nodes).length}`);
  console.log(` Total Q-states: ${Object.keys(qTable).length}`);
  console.log(` Total memories: ${memories.length.toLocaleString()}`);
  console.log('\n🧠 Intelligence system v2 pretrained with:');
  console.log(' āœ… Pattern decay timestamps');
  console.log(' āœ… Confidence calibration bootstrap');
  console.log(' āœ… Active learning uncertain states');
  console.log(' āœ… All available training data\n');
}

/**
 * Main pretraining function.
 *
 * Reads the swarm memory database (read-only), derives the Q-table,
 * trajectories, vector memories, calibration buckets, uncertain states and
 * coordination graph, then writes them as JSON files into DATA_DIR.
 * Exits the process with status 1 when the database file does not exist.
 *
 * @returns {Promise<void>}
 */
async function pretrain() {
  console.log('🧠 RuVector Intelligence Pretraining v2');
  console.log('========================================\n');

  if (!existsSync(MEMORY_DB)) {
    console.error('āŒ Memory database not found:', MEMORY_DB);
    process.exit(1);
  }

  const ctx = {
    stats: { commands: 0, agents: 0, files: 0, patterns: 0, coordination: 0, calibration: 0 },
    qTable: {},
    trajectories: [],
    memories: [],
    skipped: 0
  };

  let calibration;
  let uncertainStates;
  let graph;

  const db = new Database(MEMORY_DB, { readonly: true });
  try {
    extractCommandPatterns(db, ctx);
    const agentAssignments = extractAgentAssignments(db, ctx);
    calibration = bootstrapCalibration(db, ctx);
    extractFileHistory(db, ctx);
    extractReasoningPatterns(db, ctx);
    uncertainStates = findUncertainStates(ctx.qTable);
    graph = buildCoordinationGraph(agentAssignments, ctx.stats);
  } finally {
    // Release the handle even when a query or processing step throws.
    db.close();
  }

  if (ctx.skipped > 0) {
    console.log(` Skipped ${ctx.skipped} malformed rows`);
  }

  const result = {
    qTable: ctx.qTable,
    trajectories: ctx.trajectories,
    memories: ctx.memories,
    stats: ctx.stats,
    calibration,
    uncertainStates,
    nodes: graph.nodes,
    edges: graph.edges
  };

  savePretrainedData(result);
  printSummary(result);
}

pretrain().catch((err) => {
  console.error(err);
  // Report failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});