Files
wifi-densepose/examples/ruvLLM/docs/sparc/02-pseudocode.md
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

31 KiB

RuvLLM: Algorithm Design

SPARC Phase 2: Pseudocode


1. Core Request Flow

1.1 Main Orchestrator

ALGORITHM ProcessQuery(query: String, session: Session) -> Response:
    // End-to-end handling of one user query: embed, retrieve, attend over the
    // graph neighbourhood, route, generate, judge, optionally write back,
    // and finally log telemetry.
    INPUT:
        query: User query string
        session: Session containing user context, history, constraints
    OUTPUT:
        response: Generated response with metadata

    // Wall-clock checkpoints feed the latency breakdown logged in Step 10.
    // NOW is in seconds (it is divided by SECONDS_PER_DAY elsewhere in this
    // document), hence the conversion to milliseconds below.
    t_start ← NOW

    // Step 1: Preprocessing and Embedding
    tokens ← Tokenize(query)
    query_embedding ← EmbedQuery(query)
    query_features ← ExtractQueryFeatures(tokens, query_embedding)

    // Step 2: Memory Retrieval via HNSW
    // HNSW can only return k neighbours when ef_search >= k, so the adaptive
    // value is floored at k (the fast tier alone would give 32 < 64).
    k ← 64
    candidates ← HNSWSearch(
        vector: query_embedding,
        k: k,
        ef_search: Max(k, GetAdaptiveEfSearch(session.latency_budget))
    )
    search_stats ← candidates.statistics()   // computed once, reused in Steps 4 and 10

    // Step 3: Graph Attention over Neighborhood
    // NOTE(review): section 3.2 declares GraphAttention(center_embedding,
    // subgraph, config); this call is the condensed form (neighbourhood
    // expansion + attention). Confirm a wrapper with this shape exists.
    graph_context ← GraphAttention(
        center_node: query_embedding,
        neighbors: candidates,
        hops: 2,
        attention_heads: 4
    )
    t_retrieval_done ← NOW

    // Step 4: Feature Extraction for Router
    router_features ← BuildRouterFeatures(
        query_features,
        search_stats,
        graph_context.summary(),
        session.constraints
    )

    // Step 5: FastGRNN Routing Decision
    routing_decision ← FastGRNNRoute(router_features, session.hidden_state)
    session.hidden_state ← routing_decision.new_hidden
    t_routing_done ← NOW

    // Step 6: Context Construction
    context ← BuildContext(
        graph_context.ranked_nodes,
        max_tokens: routing_decision.context_size,
        deduplicate: TRUE
    )

    // Step 7: LFM2 Generation
    // NOTE(review): section 5.1 declares LFM2Generate(model, prompt, config,
    // kv_cache) -> (String, KVCache). `response` below is the generated text
    // only; confirm how the loaded model and KV cache are threaded through.
    response ← LFM2Generate(
        model: routing_decision.model_selection,
        prompt: FormatPrompt(query, context),
        temperature: routing_decision.temperature,
        top_p: routing_decision.top_p,
        max_tokens: GetMaxTokens(routing_decision.model_selection)
    )
    t_generation_done ← NOW

    // Step 8: Quality Evaluation
    quality_score ← EvaluateQuality(query, response, context)

    // Step 9: Optional Writeback
    // `>=` matches MemoryWriteback's own gate (it rejects only scores strictly
    // below the threshold); with `>` a judge score of 4 (= 0.75) was never
    // written back.
    // NOTE(review): MemoryWriteback also takes the VectorDB handle as a fourth
    // argument; it is not in scope in this algorithm as written.
    t_writeback_start ← NOW
    IF quality_score >= QUALITY_THRESHOLD:
        MemoryWriteback(query, response, quality_score)
    t_end ← NOW

    // Step 10: Telemetry
    // Stage boundaries: retrieval = Steps 2-3, routing = Steps 4-5,
    // generation = Steps 6-7, writeback = Step 9. Quality evaluation is
    // only reflected in `total`.
    latency_breakdown ← LatencyBreakdown {
        total: (t_end - t_start) * 1000,
        retrieval: (t_retrieval_done - t_start) * 1000,
        routing: (t_routing_done - t_retrieval_done) * 1000,
        generation: (t_generation_done - t_routing_done) * 1000,
        writeback: (t_end - t_writeback_start) * 1000
    }
    LogTelemetry(
        routing_decision,
        search_stats,
        latency_breakdown,
        quality_score
    )

    RETURN Response {
        text: response,
        confidence: quality_score,
        sources: context.sources,
        routing_info: routing_decision
    }

1.2 Adaptive efSearch Selection

ALGORITHM GetAdaptiveEfSearch(latency_budget_ms: f32) -> u32:
    // Map the latency budget onto an HNSW ef_search tier: the tighter the
    // budget, the smaller the candidate list (faster search, lower recall).

    // (exclusive upper bound in ms, ef_search), checked in ascending order
    TIERS ← [
        (100, 32),     // Fast mode, lower recall
        (300, 64),     // Balanced mode
        (500, 128)     // High recall mode
    ]

    FOR (upper_bound_ms, ef) IN TIERS:
        IF latency_budget_ms < upper_bound_ms:
            RETURN ef

    // Budget of 500 ms or more: maximum recall mode
    RETURN 256

2. FastGRNN Router

2.1 Core FastGRNN Cell

ALGORITHM FastGRNNCell(x: Vector, h: Vector, params: FastGRNNParams) -> Vector:
    // One recurrent step: blends the previous hidden state with a freshly
    // computed candidate state under control of an update gate.
    INPUT:
        x: Input feature vector [input_dim]
        h: Hidden state [hidden_dim]
        params: {W_z, U_z, b_z, W_h, U_h, b_h, zeta, nu}
    OUTPUT:
        h_new: Updated hidden state [hidden_dim]

    // Update gate: how much of the previous state is kept
    gate ← Sigmoid(MatMul(params.W_z, x) + MatMul(params.U_z, h) + params.b_z)

    // Candidate hidden state proposed from the current input and state
    candidate ← Tanh(MatMul(params.W_h, x) + MatMul(params.U_h, h) + params.b_h)

    // Learned scalars zeta and nu set how strongly the candidate is mixed in
    candidate_scale ← params.zeta * (1 - gate) + params.nu

    RETURN candidate_scale ⊙ candidate + gate ⊙ h

2.2 Router Forward Pass

ALGORITHM FastGRNNRoute(features: Vector, hidden: Vector) -> RoutingDecision:
    // Runs one FastGRNN step over the router features and decodes the new
    // hidden state into a model choice, a context budget and sampling
    // parameters. Low-confidence decisions fall back to fixed defaults.
    INPUT:
        features: Router input features [128]
        hidden: Previous hidden state [64]
    OUTPUT:
        decision: RoutingDecision with model, context, temperature, top_p,
                  confidence, both head distributions and the new hidden state

    // Normalize input
    features_norm ← LayerNorm(features)

    // FastGRNN cell update
    h_new ← FastGRNNCell(features_norm, hidden, ROUTER_PARAMS)

    // Output heads
    model_logits ← Linear(h_new, W_model)           // [4] for 4 model sizes
    context_logits ← Linear(h_new, W_context)       // [5] for context bins
    temp_raw ← Linear(h_new, W_temp)                // [1] scalar
    top_p_raw ← Linear(h_new, W_top_p)              // [1] scalar
    confidence_raw ← Linear(h_new, W_confidence)    // [1] scalar

    // Activations
    model_probs ← Softmax(model_logits)
    context_probs ← Softmax(context_logits)
    temperature ← Sigmoid(temp_raw) * 2.0           // Scale to [0, 2]
    top_p ← Sigmoid(top_p_raw)                      // Scale to [0, 1]
    confidence ← Sigmoid(confidence_raw)

    // Decoding with confidence threshold
    IF confidence < CONFIDENCE_THRESHOLD:
        // Fall back to safe defaults
        model_idx ← FALLBACK_MODEL_IDX
        context_idx ← FALLBACK_CONTEXT_IDX
    ELSE:
        model_idx ← ArgMax(model_probs)
        context_idx ← ArgMax(context_probs)

    // model_probs / context_probs are exposed because LogTelemetry logs
    // routing.model_probs and EWCTrainingStep computes its losses from
    // pred.model_probs and pred.context_probs.
    RETURN RoutingDecision {
        model_selection: MODEL_SIZES[model_idx],
        context_size: CONTEXT_BINS[context_idx],
        temperature: temperature,
        top_p: top_p,
        confidence: confidence,
        model_probs: model_probs,
        context_probs: context_probs,
        new_hidden: h_new
    }

CONSTANTS:
    MODEL_SIZES = [350M, 700M, 1.2B, 2.6B]
    CONTEXT_BINS = [256, 512, 1024, 2048, 4096]
    CONFIDENCE_THRESHOLD = 0.7
    FALLBACK_MODEL_IDX = 2      // MODEL_SIZES[2] = 1.2B model
    FALLBACK_CONTEXT_IDX = 3    // CONTEXT_BINS[3] = 2048 tokens

2.3 Feature Extraction

ALGORITHM BuildRouterFeatures(
    query_features: QueryFeatures,
    search_stats: SearchStatistics,
    graph_summary: GraphSummary,
    constraints: SystemConstraints
) -> Vector:
    // Packs the four feature groups into one fixed-layout 128-dim vector.
    // `offset` is a running write cursor: every slice assignment is followed
    // by an increment of exactly the slice width, so statement order defines
    // the layout. Group widths: 32 + 16 + 48 + 32 = 128 (checked by the
    // final ASSERT).
    OUTPUT: features [128]

    features ← EmptyVector(128)
    offset ← 0

    // Query features [32 dims] = 1 + 8 + 16 + 1 + 6
    features[offset:offset+1] ← Normalize(query_features.token_count, 0, 512)
    offset += 1
    features[offset:offset+8] ← query_features.language_one_hot
    offset += 8
    features[offset:offset+16] ← query_features.domain_embedding
    offset += 16
    features[offset:offset+1] ← Normalize(query_features.user_frequency, 0, 1000)
    offset += 1
    features[offset:offset+6] ← query_features.query_type_probs
    offset += 6

    // Embedding statistics [16 dims] = 1 + 8 + 1 + 1 + 4 + 1 (padding)
    features[offset:offset+1] ← Normalize(query_features.embedding_l2_norm, 0, 10)
    offset += 1
    features[offset:offset+8] ← query_features.pca_components[:8]
    offset += 8
    // NOTE(review): entropy and sparsity are written without Normalize;
    // presumably they are already in [0, 1] — confirm.
    features[offset:offset+1] ← query_features.embedding_entropy
    offset += 1
    features[offset:offset+1] ← query_features.embedding_sparsity
    offset += 1
    features[offset:offset+4] ← query_features.cluster_soft_assignment
    offset += 4
    features[offset:offset+1] ← 0  // padding to round the group up to 16
    offset += 1

    // Search statistics [48 dims] = 1 + 4 + 1 + 1 + 1 + 16 + 24
    // (the last 40 come from graph_summary, not search_stats)
    features[offset:offset+1] ← Normalize(search_stats.k_retrieved, 0, 64)
    offset += 1
    features[offset:offset+4] ← [
        Normalize(search_stats.distance_mean, 0, 2),
        Normalize(search_stats.distance_std, 0, 1),
        Normalize(search_stats.distance_min, 0, 2),
        Normalize(search_stats.distance_max, 0, 2)
    ]
    offset += 4
    features[offset:offset+1] ← search_stats.distance_entropy
    offset += 1
    features[offset:offset+1] ← Normalize(search_stats.graph_depth, 0, 10)
    offset += 1
    features[offset:offset+1] ← search_stats.recall_estimate
    offset += 1
    features[offset:offset+16] ← graph_summary.neighborhood_density_histogram
    offset += 16
    features[offset:offset+24] ← graph_summary.semantic_coherence_features
    offset += 24

    // System constraints [32 dims] = 1 + 4 + 4 + 1 + 1 + 1 + 16 + 4 (padding)
    features[offset:offset+1] ← Normalize(constraints.latency_budget_ms, 0, 5000)
    offset += 1
    features[offset:offset+4] ← constraints.device_class_one_hot
    offset += 4
    features[offset:offset+4] ← constraints.privacy_level_one_hot
    offset += 4
    features[offset:offset+1] ← Normalize(constraints.memory_available_mb, 0, 16000)
    offset += 1
    features[offset:offset+1] ← Normalize(constraints.battery_level, 0, 100)
    offset += 1
    features[offset:offset+1] ← Normalize(constraints.concurrent_requests, 0, 100)
    offset += 1
    features[offset:offset+16] ← constraints.historical_accuracy_per_domain
    offset += 16
    features[offset:offset+4] ← [0, 0, 0, 0]  // padding to round the group up to 32
    offset += 4

    // Guards the layout: fails if any slice width and increment drift apart
    ASSERT offset == 128
    RETURN features

3. Graph Attention Engine

3.1 Two-Hop Neighborhood Expansion

ALGORITHM ExpandNeighborhood(
    center_nodes: List<Node>,
    db: VectorDB,
    max_hops: u32,
    max_per_hop: u32
) -> SubGraph:
    // Breadth-first expansion from the retrieved nodes along outgoing edges,
    // collecting every node at most once together with all traversed edges.
    INPUT:
        center_nodes: Initial retrieved nodes
        db: Vector database with graph structure
        max_hops: Maximum expansion hops (typically 2)
        max_per_hop: Maximum neighbors per node per hop
    OUTPUT:
        subgraph: Expanded subgraph with nodes and edges

    // visited: nodes whose outgoing edges have already been expanded
    // discovered: nodes already present in all_nodes. Marking a neighbour at
    // discovery time (not only when it is later expanded) keeps a node that
    // is reachable from several frontier nodes from being fetched and
    // appended more than once.
    visited ← HashSet<NodeID>()
    discovered ← HashSet<NodeID>(center_nodes.map(n => n.id))
    frontier ← center_nodes
    all_nodes ← center_nodes.clone()
    all_edges ← List<Edge>()

    FOR hop IN 1..=max_hops:
        next_frontier ← List<Node>()

        FOR node IN frontier:
            IF node.id IN visited:
                CONTINUE
            visited.add(node.id)

            // Get outgoing edges
            edges ← db.get_edges(node.id, limit: max_per_hop)
            all_edges.extend(edges)

            FOR edge IN edges:
                IF edge.dst NOT IN discovered:
                    discovered.add(edge.dst)
                    neighbor ← db.get_node(edge.dst)
                    next_frontier.append(neighbor)
                    all_nodes.append(neighbor)

        // Nothing new was reached: further hops cannot add anything
        IF next_frontier.is_empty():
            BREAK

        frontier ← next_frontier

    RETURN SubGraph {
        nodes: all_nodes,
        edges: all_edges,
        center_ids: center_nodes.map(n => n.id)
    }

3.2 Graph Attention Mechanism

ALGORITHM GraphAttention(
    center_embedding: Vector,
    subgraph: SubGraph,
    config: GraphAttentionConfig
) -> GraphContext:
    // Multi-head attention of the query embedding over all subgraph nodes.
    // Scores are biased by edges from the retrieved (center) nodes and
    // penalised by hop distance; nodes are then ranked by mean attention.
    INPUT:
        center_embedding: Query embedding
        subgraph: Expanded neighborhood
        config: {num_heads, head_dim, dropout, distance_decay}
    OUTPUT:
        context: Attended graph context

    // Nothing to attend over: pass the query embedding through unchanged
    IF subgraph.nodes.is_empty():
        RETURN GraphContext {
            embedding: center_embedding,
            ranked_nodes: [],
            attention_weights: [],
            summary: ExtractGraphSummary(subgraph, [])
        }

    // Build attention inputs
    node_embeddings ← subgraph.nodes.map(n => n.vector)
    adjacency ← BuildAdjacencyMatrix(subgraph)

    // Head-independent structure, computed once per node.
    // The query itself is not a graph node, so "center" means the retrieved
    // nodes in subgraph.center_ids.
    // NOTE(review): using the nearest center for hop distance and every
    // center->node edge for the edge bias is an interpretation of the
    // previously undefined `center_id`; confirm the intended semantics.
    hop_distances ← []
    center_edges ← []
    FOR node IN subgraph.nodes:
        hop_distances.append(
            Min(subgraph.center_ids.map(c => GetHopDistance(c, node.id, subgraph)))
        )
        center_edges.append(
            subgraph.center_ids
                .filter(c => EdgeExists(c, node.id, subgraph))
                .map(c => GetEdge(c, node.id, subgraph))
        )

    // Multi-head graph attention
    attended_embeddings ← []
    attention_weights ← []

    FOR head IN 0..config.num_heads:
        // Project Q, K, V for this head
        Q ← Linear(center_embedding, W_Q[head])
        K ← Linear_batch(node_embeddings, W_K[head])
        V ← Linear_batch(node_embeddings, W_V[head])

        // Compute attention scores with edge features
        scores ← []
        FOR i, node IN enumerate(subgraph.nodes):
            // Base attention
            score ← Dot(Q, K[i]) / Sqrt(config.head_dim)

            // Edge-aware modulation
            FOR edge IN center_edges[i]:
                edge_emb ← EdgeEmbed(edge.rel, edge.weight)
                score += Dot(Q, edge_emb)

            // Distance decay, applied additively in logit space. After the
            // softmax this scales the weight by Exp(-distance_decay * hops).
            // Multiplying the raw logit instead would *raise* negative
            // scores for distant nodes.
            score -= config.distance_decay * hop_distances[i]

            scores.append(score)

        // Normalize with softmax (masked for disconnected nodes)
        weights ← MaskedSoftmax(scores, adjacency)
        attention_weights.append(weights)

        // Weighted aggregation
        head_output ← WeightedSum(V, weights)
        attended_embeddings.append(head_output)

    // Concatenate heads and project
    concatenated ← Concat(attended_embeddings)
    output ← Linear(concatenated, W_O) + center_embedding  // Residual

    // Rank nodes by attention weight
    avg_weights ← Mean(attention_weights, axis=0)
    ranked_indices ← ArgSort(avg_weights, descending=TRUE)

    RETURN GraphContext {
        embedding: output,
        ranked_nodes: subgraph.nodes[ranked_indices],
        attention_weights: avg_weights[ranked_indices],
        summary: ExtractGraphSummary(subgraph, avg_weights)
    }

3.3 Edge Feature Encoding

ALGORITHM BuildEdgeFeatures(edges: List<Edge>) -> EdgeFeatures:
    // Encode edge relationships and metadata as one 20-dim vector per edge:
    // 16-dim relation embedding + recency + confidence + weight + log-age.

    features ← List<Vector>()

    FOR edge IN edges:
        // Relationship type embedding
        rel_emb ← RELATION_EMBEDDINGS[edge.rel]  // Learned embeddings

        // Temporal features
        // Clamp at 0: a timestamp slightly in the future (clock skew) would
        // otherwise give recency > 1, and Log(1 + age_days) is undefined once
        // age_days <= -1.
        age_days ← Max(0, (NOW - edge.metadata.timestamp) / SECONDS_PER_DAY)
        recency ← Exp(-age_days / DECAY_CONSTANT)

        // Confidence and weight
        confidence ← edge.metadata.confidence
        weight ← edge.weight

        // Combine features
        edge_feature ← Concat([
            rel_emb,                    // [16]
            [recency],                  // [1]
            [confidence],               // [1]
            [weight],                   // [1]
            [Log(1 + age_days) / 10]    // [1]
        ])

        features.append(edge_feature)

    RETURN EdgeFeatures { vectors: features, dim: 20 }

CONSTANTS:
    RELATION_EMBEDDINGS = LearnedEmbedding(num_relations=10, dim=16)
    DECAY_CONSTANT = 30.0  // days

4. Self-Learning Algorithms

4.1 Memory Writeback

ALGORITHM MemoryWriteback(
    query: String,
    response: String,
    quality_score: f32,
    db: VectorDB
) -> Result<Option<NodeID>>:
    // Stores a high-quality Q&A pair as a new memory node linked to related
    // existing nodes, or refreshes the metadata of a near-duplicate.
    INPUT:
        query, response: Q&A pair
        quality_score: Judge-evaluated quality [0, 1]
        db: Vector database
    OUTPUT:
        inserted_id: ID of new or updated node, or None if skipped

    // Quality gate
    IF quality_score < QUALITY_THRESHOLD:
        RETURN None

    // Create embedding
    combined_text ← Format("Q: {query}\nA: {response}")
    embedding ← EmbedText(combined_text)

    // One search serves both deduplication and linking. Results are assumed
    // to be ordered by descending score (the first hit is treated as the
    // best match).
    related ← db.search(embedding, k=5, threshold=RELATED_THRESHOLD)

    // Deduplication check
    duplicates ← related.filter(r => r.score >= DUPLICATE_THRESHOLD)
    IF duplicates.len() > 0:
        // Near-duplicate found
        best_match ← duplicates[0]

        IF quality_score > best_match.metadata.quality:
            // Update existing entry (better quality)
            // NOTE(review): only metadata is refreshed; the stored text and
            // vector remain those of the older answer. Confirm this is
            // intended.
            db.update_metadata(best_match.id, {
                quality: quality_score,
                updated_at: NOW,
                update_count: best_match.metadata.update_count + 1
            })
            RETURN Some(best_match.id)
        ELSE:
            // Skip - existing entry is better
            RETURN None

    // Insert new entry
    node ← Node {
        id: NewUUID(),
        vector: embedding,
        text: combined_text,
        type: NodeType::QAPair,
        source: "self_learning",
        metadata: {
            timestamp: NOW,
            quality: quality_score,
            domain: ClassifyDomain(query),
            version: 1,
            update_count: 0
        }
    }

    inserted_id ← db.insert(node)

    // Create edges to related existing nodes. These are the hits below the
    // duplicate threshold: iterating the duplicate list here would always
    // see an empty list, because any duplicate has already caused an early
    // return above.
    FOR neighbor IN related:
        edge ← Edge {
            src: inserted_id,
            dst: neighbor.id,
            rel: EdgeType::SameTopic,
            weight: neighbor.score,
            metadata: {
                timestamp: NOW,
                created_by: "self_learning"
            }
        }
        db.insert_edge(edge)

    RETURN Some(inserted_id)

CONSTANTS:
    QUALITY_THRESHOLD = 0.75    // judge score 4 of 5 under EvaluateQuality's (score - 1) / 4 mapping
    DUPLICATE_THRESHOLD = 0.95  // similarity at or above this counts as a near-duplicate
    RELATED_THRESHOLD = 0.80    // minimum similarity for a SameTopic edge (tunable)

4.2 Experience Replay Buffer

ALGORITHM ReservoirSampling:
    // Maintain fixed-size buffer with uniform sampling: after N calls to
    // add(), every entry seen so far is retained with probability
    // capacity / N.

    STRUCT ReplayBuffer:
        entries: List<ReplayEntry>
        capacity: u32
        total_seen: u64

    FUNCTION new(capacity: u32) -> ReplayBuffer:
        RETURN ReplayBuffer {
            entries: [],
            capacity: capacity,
            total_seen: 0
        }

    FUNCTION add(self, entry: ReplayEntry):
        self.total_seen += 1

        IF self.entries.len() < self.capacity:
            self.entries.append(entry)
        ELSE:
            // Reservoir sampling: replace with probability capacity/total_seen.
            // RandomInt draws uniformly from the half-open range
            // [0, total_seen); an inclusive upper bound would lower the
            // replacement probability to capacity / (total_seen + 1) and bias
            // the buffer towards old entries.
            idx ← RandomInt(0, self.total_seen)
            IF idx < self.capacity:
                self.entries[idx] ← entry

    FUNCTION sample(self, batch_size: u32) -> List<ReplayEntry>:
        // Fewer entries than requested: return everything that is available
        IF self.entries.len() < batch_size:
            RETURN self.entries.clone()

        indices ← RandomSample(0, self.entries.len(), batch_size, replace=FALSE)
        RETURN indices.map(i => self.entries[i].clone())

    FUNCTION distribution_stats(self) -> DistributionStats:
        // Analyze distribution for curriculum balancing
        domain_counts ← CountBy(self.entries, e => e.domain)
        quality_hist ← Histogram(self.entries.map(e => e.quality), bins=10)
        complexity_hist ← Histogram(self.entries.map(e => e.complexity), bins=10)

        RETURN DistributionStats {
            domain_counts,
            quality_hist,
            complexity_hist,
            // Fraction in [0, 1]; converted to float first because integer
            // division would truncate to 0 until every domain is covered.
            coverage: (domain_counts.len() as f32) / (TOTAL_DOMAINS as f32)
        }

4.3 EWC Training Update

ALGORITHM EWCTrainingStep(
    model: RouterModel,
    batch: List<TrainingSample>,
    ewc: ElasticWeightConsolidation,
    optimizer: Optimizer
) -> TrainingMetrics:
    // One optimisation step of the router on a replay batch. The loss is the
    // weighted task loss plus the EWC penalty from `ewc`.
    INPUT:
        model: FastGRNN router model
        batch: Training samples with labels
        ewc: EWC state with Fisher info and optimal weights
        optimizer: Adam optimizer
    OUTPUT:
        metrics: Loss and accuracy metrics

    // Empty batch: the batch losses below are undefined over zero samples,
    // so leave the model untouched and report zeroed metrics.
    IF batch.len() == 0:
        RETURN TrainingMetrics {
            total_loss: 0,
            task_loss: 0,
            ewc_loss: 0,
            model_accuracy: 0,
            context_accuracy: 0
        }

    // Forward pass
    // NOTE(review): BuildRouterFeatures is declared with four arguments
    // (query_features, search_stats, graph_summary, constraints); confirm how
    // a TrainingSample is unpacked into them.
    predictions ← []
    FOR sample IN batch:
        features ← BuildRouterFeatures(sample)
        pred ← model.forward(features, sample.hidden_state)
        predictions.append(pred)

    // Task loss
    model_loss ← CrossEntropy(
        predictions.map(p => p.model_probs),
        batch.map(s => s.label_model)
    )

    context_loss ← CrossEntropy(
        predictions.map(p => p.context_probs),
        batch.map(s => s.label_context)
    )

    temp_loss ← SmoothL1(
        predictions.map(p => p.temperature),
        batch.map(s => s.label_temperature)
    )

    top_p_loss ← SmoothL1(
        predictions.map(p => p.top_p),
        batch.map(s => s.label_top_p)
    )

    // The regression heads are down-weighted relative to the two
    // classification heads.
    task_loss ← model_loss + context_loss + ALPHA * temp_loss + BETA * top_p_loss

    // EWC regularization loss
    current_weights ← model.get_weights()
    ewc_loss ← ewc.regularization_loss(current_weights)

    // Total loss
    total_loss ← task_loss + ewc_loss

    // Backward pass
    gradients ← Backward(total_loss, model.parameters())

    // Optimizer step
    optimizer.step(model.parameters(), gradients)

    // Compute metrics
    accuracy ← ComputeAccuracy(predictions, batch)

    RETURN TrainingMetrics {
        total_loss,
        task_loss,
        ewc_loss,
        model_accuracy: accuracy.model,
        context_accuracy: accuracy.context
    }

CONSTANTS:
    ALPHA = 0.1  // Temperature loss weight
    BETA = 0.1   // Top-p loss weight

4.4 Fisher Information Update

ALGORITHM UpdateFisherInformation(
    model: RouterModel,
    dataset: List<Sample>,
    ewc: ElasticWeightConsolidation,
    num_samples: u32
) -> ElasticWeightConsolidation:
    // Compute Fisher information diagonal approximation (mean squared
    // gradient of the log-likelihood of the labelled model choice) and
    // record the current weights as the new EWC anchor.

    // Sample subset for efficiency; never ask for more than the dataset holds
    samples ← RandomSample(dataset, Min(num_samples, dataset.len()))

    // No data: keep the existing Fisher estimate and anchor weights
    IF samples.len() == 0:
        RETURN ewc

    // Accumulate squared gradients
    fisher_accum ← ZeroVector(model.num_parameters())

    FOR sample IN samples:
        features ← BuildRouterFeatures(sample)
        pred ← model.forward(features, sample.hidden_state)

        // Log-likelihood of the ground-truth label. This is taken for every
        // sample, whether or not the model predicted it correctly. LOG_EPS
        // keeps Log finite when the predicted probability underflows to 0.
        log_prob ← Log(pred.model_probs[sample.label_model] + LOG_EPS)
        gradients ← Backward(log_prob, model.parameters())

        // Accumulate squared gradients
        FOR i IN 0..model.num_parameters():
            fisher_accum[i] += gradients[i] ** 2

    // Average over the samples actually processed. This can be fewer than
    // num_samples when the dataset is small.
    fisher_diag ← fisher_accum / samples.len()

    // Update EWC state
    ewc.fisher_info ← fisher_diag
    ewc.optimal_weights ← model.get_weights().clone()

    RETURN ewc

CONSTANTS:
    LOG_EPS = 1e-8  // numerical floor inside Log

5. LFM2 Inference

5.1 Generation with KV Cache

ALGORITHM LFM2Generate(
    model: LFM2Model,
    prompt: String,
    config: GenerationConfig,
    kv_cache: Option<KVCache>
) -> (String, KVCache):
    // Autoregressive generation with temperature + nucleus sampling,
    // reusing a previous KV cache when the new prompt extends the cached text.
    INPUT:
        model: Loaded LFM2 model (350M/700M/1.2B/2.6B)
        prompt: Formatted prompt with context
        config: {temperature, top_p, max_tokens}
        kv_cache: Optional cached KV states from previous turn
    OUTPUT:
        response: Generated text
        updated_cache: KV cache for reuse

    // Tokenize prompt
    tokens ← Tokenize(prompt)

    // Determine cache reuse.
    // The cache is reusable only if the prompt extends the cached text AND
    // adds at least one new token; decode_step needs a freshly prefilled
    // position to produce logits from.
    // NOTE(review): reuse assumes the tokenization of the shared prefix is
    // identical in both prompts (no token merges across the boundary).
    can_reuse ← kv_cache IS NOT None
        AND prompt.starts_with(kv_cache.prefix)
        AND kv_cache.prefix_len < tokens.len()

    IF can_reuse:
        // Reuse cached KV states
        new_tokens ← tokens[kv_cache.prefix_len:]
        cache ← kv_cache.states
    ELSE:
        // Start fresh
        new_tokens ← tokens
        cache ← None

    // Prefill phase (process prompt)
    cache ← model.prefill(new_tokens, cache)

    // Decode phase (generate tokens)
    output_tokens ← []
    FOR _ IN 0..config.max_tokens:
        // Get next token logits
        logits ← model.decode_step(cache)

        IF config.temperature < MIN_TEMPERATURE:
            // Temperature ~ 0 means greedy decoding. Dividing by it would
            // overflow (the router's Sigmoid * 2.0 can get arbitrarily
            // close to 0).
            next_token ← ArgMax(logits)
        ELSE:
            // Apply temperature
            logits ← logits / config.temperature

            // Top-p (nucleus) sampling
            sorted_idx ← ArgSort(logits, descending=TRUE)
            cumsum ← CumulativeSum(Softmax(logits[sorted_idx]))
            cutoff_idx ← FirstWhere(cumsum > config.top_p)
            IF cutoff_idx IS None:
                // top_p >= 1: the cumulative mass never exceeds it, so keep
                // the full vocabulary.
                cutoff_idx ← sorted_idx.len() - 1
            valid_idx ← sorted_idx[:cutoff_idx + 1]

            // Sample from valid tokens
            probs ← Softmax(logits[valid_idx])
            next_token ← Sample(valid_idx, probs)

        // Check for EOS
        IF next_token == EOS_TOKEN:
            BREAK

        output_tokens.append(next_token)

        // Update cache
        cache ← model.update_cache(cache, next_token)

    // Decode to text
    response ← Detokenize(output_tokens)

    // Build updated cache.
    // `cache` now holds states for the prompt AND every generated token
    // (EOS excluded), so the recorded prefix must cover both. Recording only
    // the prompt would let a later call append its tokens after stale
    // generated positions.
    updated_cache ← KVCache {
        prefix: prompt + response,
        prefix_len: tokens.len() + output_tokens.len(),
        states: cache
    }

    RETURN (response, updated_cache)

CONSTANTS:
    MIN_TEMPERATURE = 0.001  // below this, decode greedily

5.2 Model Selection and Loading

ALGORITHM SelectAndLoadModel(
    model_size: ModelSize,
    device: DeviceType,
    memory_budget: u64
) -> LFM2Model:
    // Picks a quantization that fits the memory budget, resolves the model
    // file for it, and loads it with the runtime matching the device.
    INPUT:
        model_size: Enum {350M, 700M, 1.2B, 2.6B}
        device: Enum {CPU, GPU, NPU}
        memory_budget: Available memory in bytes
    OUTPUT:
        model: Loaded and optimized model

    // Quantization depends on device and memory; it also selects the file
    quant ← SelectQuantization(model_size, device, memory_budget)
    path ← MODEL_PATHS[model_size][quant]

    // Each device class has its own runtime and load options
    MATCH device:
        CPU:
            // llama.cpp, memory-mapped and not locked into RAM
            cpu_options ← {
                n_ctx: GetContextSize(model_size),
                n_threads: GetOptimalThreads(),
                use_mmap: TRUE,
                use_mlock: FALSE
            }
            RETURN LlamaCpp.load(path, cpu_options)

        GPU:
            // vLLM, sharded across all available GPUs
            gpu_options ← {
                tensor_parallel: GetGPUCount(),
                dtype: quant.dtype,
                max_model_len: GetContextSize(model_size)
            }
            RETURN VLLM.load(path, gpu_options)

        NPU:
            // ExecuTorch for edge devices
            RETURN ExecuTorch.load(path + ".pte")


ALGORITHM SelectQuantization(
    model_size: ModelSize,
    device: DeviceType,
    memory_budget: u64
) -> Quantization:
    // Chooses the highest-precision quantization whose approximate footprint
    // fits the memory budget, using the FP16 size as the reference.
    fp16_bytes ← MODEL_BASE_MEMORY[model_size]   // approximate requirement

    // GPU ladder: FP16 -> INT8 -> INT4
    IF device == GPU:
        IF memory_budget >= fp16_bytes:
            RETURN Quantization::FP16
        IF memory_budget >= fp16_bytes / 2:
            RETURN Quantization::INT8
        RETURN Quantization::INT4

    // Every other device takes the CPU (GGUF) ladder: Q5_K_M -> Q4_K_M -> Q2_K
    IF memory_budget >= fp16_bytes / 2:
        RETURN Quantization::Q5_K_M
    IF memory_budget >= fp16_bytes / 4:
        RETURN Quantization::Q4_K_M
    RETURN Quantization::Q2_K

CONSTANTS:
    MODEL_BASE_MEMORY = {
        350M: 700_000_000,    // ~700MB FP16
        700M: 1_400_000_000,  // ~1.4GB FP16
        1.2B: 2_400_000_000,  // ~2.4GB FP16
        2.6B: 5_200_000_000   // ~5.2GB FP16
    }

6. Utility Algorithms

6.1 Quality Evaluation

ALGORITHM EvaluateQuality(
    query: String,
    response: String,
    context: List<Document>
) -> f32:
    // Asks the judge model for a 1-5 rating of the response and maps it
    // linearly onto [0, 1] (1 -> 0.0, 3 -> 0.5, 5 -> 1.0).
    INPUT:
        query: Original user query
        response: Generated response
        context: Retrieved context documents
    OUTPUT:
        score: Quality score [0, 1]

    // Context documents are shown to the judge separated by a rule
    context_text ← context.map(d => d.text).join("\n---\n")

    // Build evaluation prompt
    eval_prompt ← Format("""
        Evaluate the following response on a scale of 1-5.

        === Context ===
        {context_text}

        === Query ===
        {query}

        === Response ===
        {response}

        === Evaluation Criteria ===
        1. Factual Accuracy: Is the response grounded in the context?
        2. Completeness: Does it fully address the query?
        3. Coherence: Is the response logically structured?
        4. Conciseness: Is it appropriately brief without being incomplete?

        Provide your evaluation as a single integer from 1 to 5:
    """)

    // Use judge model (typically 2.6B); only a short answer is expected
    verdict ← JUDGE_MODEL.generate(eval_prompt, max_tokens=10)

    // Parse score; anything unparseable or outside 1..5 counts as neutral
    rating ← ParseInteger(verdict.trim())
    is_valid ← rating IS NOT None AND rating >= 1 AND rating <= 5
    IF NOT is_valid:
        rating ← 3

    // Normalize to [0, 1]
    RETURN (rating - 1) / 4.0

6.2 Context Building

ALGORITHM BuildContext(
    ranked_nodes: List<Node>,
    max_tokens: u32,
    deduplicate: bool
) -> ContextResult:
    // Greedily packs the highest-ranked nodes into the token budget,
    // optionally dropping near-duplicates, and formats them as a numbered
    // context block with source attributions.
    INPUT:
        ranked_nodes: Attention-ranked nodes
        max_tokens: Maximum context token budget
        deduplicate: Whether to remove near-duplicate content
    OUTPUT:
        context: Constructed context with sources

    selected_nodes ← []
    // MinHash yields a signature of num_hashes values, not a single u64, and
    // signatures are compared by similarity rather than looked up by
    // equality, so they are kept in a plain list.
    seen_signatures ← List<MinHashSignature>()
    total_tokens ← 0

    FOR node IN ranked_nodes:
        // Token count
        node_tokens ← CountTokens(node.text)

        // Check budget. CONTINUE (not BREAK) lets a smaller, lower-ranked
        // node still fill the remaining space.
        IF total_tokens + node_tokens > max_tokens:
            CONTINUE

        // Deduplication
        IF deduplicate:
            signature ← MinHash(node.text, num_hashes=128)
            similar_seen ← seen_signatures.any(s => JaccardSimilarity(s, signature) > 0.8)
            IF similar_seen:
                CONTINUE
            seen_signatures.append(signature)

        selected_nodes.append(node)
        total_tokens += node_tokens

    // Format context as "[1] ...", "[2] ..." separated by blank lines
    context_text ← selected_nodes.enumerate()
        .map((i, node) => Format("[{i+1}] {node.text}"))
        .join("\n\n")

    // NOTE(review): nodes created by MemoryWriteback carry metadata.quality,
    // not metadata.confidence; confirm which field sources should expose.
    sources ← selected_nodes.map(n => Source {
        id: n.id,
        text_preview: n.text[:100],
        confidence: n.metadata.confidence
    })

    RETURN ContextResult {
        text: context_text,
        sources: sources,
        token_count: total_tokens,
        nodes_used: selected_nodes.len()
    }

6.3 Telemetry Logging

ALGORITHM LogTelemetry(
    routing: RoutingDecision,
    search_stats: SearchStatistics,
    latency: LatencyBreakdown,
    quality: f32
):
    // Records one request: builds a TelemetryEntry from the routing decision,
    // retrieval statistics, latency breakdown and quality score, sends it to
    // the metrics channel, then updates the Prometheus instruments.
    entry ← TelemetryEntry {
        timestamp: NOW,
        request_id: CurrentRequestID(),

        // Routing
        // NOTE(review): model_probs must be present on RoutingDecision for
        // this to resolve — confirm against FastGRNNRoute's return value.
        model_selected: routing.model_selection,
        model_probs: routing.model_probs,
        context_size: routing.context_size,
        temperature: routing.temperature,
        top_p: routing.top_p,
        router_confidence: routing.confidence,

        // Retrieval
        // NOTE(review): BuildRouterFeatures reads distance_mean/std/min/max
        // from SearchStatistics; `distances` is presumably the raw list or an
        // aggregate of those — confirm.
        k_retrieved: search_stats.k_retrieved,
        distance_stats: search_stats.distances,
        graph_depth: search_stats.graph_depth,

        // Latency (one field per pipeline stage plus the total)
        total_ms: latency.total,
        retrieval_ms: latency.retrieval,
        routing_ms: latency.routing,
        generation_ms: latency.generation,
        writeback_ms: latency.writeback,

        // Quality
        quality_score: quality,

        // System
        device_class: CurrentDevice(),
        memory_used: GetMemoryUsage()
    }

    // Async write to metrics store
    METRICS_CHANNEL.send(entry)

    // Prometheus metrics
    HISTOGRAM_LATENCY.observe(latency.total)
    COUNTER_REQUESTS.inc()
    // A gauge keeps only the most recent request's quality
    GAUGE_QUALITY.set(quality)
    HISTOGRAM_MODEL.observe(ModelSizeToInt(routing.model_selection))

7. Initialization and Shutdown

7.1 System Initialization

ALGORITHM InitializeRuvLLM(config: RuvLLMConfig) -> RuvLLMSystem:
    // Brings up every component in dependency order and returns the system
    // handle that request processing and shutdown operate on.

    // 1. Initialize vector database
    db ← VectorDB.open(config.db_path, {
        dimensions: config.embedding_dim,
        hnsw_m: config.hnsw_m,
        hnsw_ef_construction: config.hnsw_ef_construction
    })

    // 2. Load embedding model
    embedder ← EmbeddingAdapter.load(config.embedding_model_path)

    // 3. Initialize router
    router ← FastGRNNRouter.load(config.router_model_path)

    // 4. Load LFM2 models (lazy loading for memory efficiency)
    models ← LazyModelLoader {
        paths: config.lfm2_paths,
        loaded: HashMap::new(),
        max_loaded: config.max_concurrent_models
    }

    // 5. Initialize graph attention
    graph_attention ← GraphAttentionEngine.new({
        num_heads: config.attention_heads,
        head_dim: config.attention_head_dim
    })

    // 6. Initialize self-learning components
    replay_buffer ← ReplayBuffer.new(config.replay_capacity)
    ewc ← ElasticWeightConsolidation.load_or_new(config.ewc_path)
    optimizer ← Adam.new(router.parameters(), lr=config.learning_rate)

    // 7. Initialize quality judge
    // NOTE(review): models.get(...) presumably loads the 2.6B model right
    // here, so the largest model is resident from startup despite the lazy
    // loader — confirm this is intended.
    judge ← QualityJudge.new(models.get(ModelSize::2.6B))

    // 8. Start background services
    telemetry_service ← TelemetryService.start(config.metrics_endpoint)
    training_service ← TrainingService.start(
        router, replay_buffer, ewc, optimizer,
        config.training_interval
    )

    // config, accepting_requests and request_counter are part of the system
    // state because ShutdownRuvLLM reads the request flags from `system`
    // and needs the configured persistence paths.
    RETURN RuvLLMSystem {
        config,
        db, embedder, router, models,
        graph_attention, replay_buffer, ewc,
        judge, telemetry_service, training_service,
        accepting_requests: TRUE,
        request_counter: 0
    }

7.2 Graceful Shutdown

ALGORITHM ShutdownRuvLLM(system: RuvLLMSystem):
    // Drains traffic, stops background training, persists all learned state,
    // then releases the database and models.
    // NOTE(review): `config` is not a parameter of this algorithm; it is
    // assumed to be the RuvLLMConfig the system was initialized with.
    // Confirm how it is brought into scope.

    // 1. Stop accepting new requests
    system.accepting_requests ← FALSE

    // 2. Wait for in-flight requests (with timeout)
    // Presumably returns FALSE when the timeout elapses first; shutdown then
    // proceeds anyway, but the fact is logged.
    drained ← WaitWithTimeout(system.request_counter == 0, timeout=30s)
    IF NOT drained:
        LOG("RuvLLM shutdown: timed out waiting for in-flight requests")

    // 3. Stop background training before persisting anything. The training
    // service mutates the router, replay buffer and EWC state, so saving
    // while it runs could capture a mutually inconsistent snapshot.
    system.training_service.stop()

    // 4. Flush replay buffer
    system.replay_buffer.persist(config.replay_path)

    // 5. Save EWC state
    system.ewc.persist(config.ewc_path)

    // 6. Save router checkpoint
    system.router.save_checkpoint(config.router_checkpoint_path)

    // 7. Flush metrics
    system.telemetry_service.flush()

    // 8. Close database
    system.db.sync()
    system.db.close()

    // 9. Unload models
    system.models.unload_all()

    LOG("RuvLLM shutdown complete")

Document Version: 1.0 | Last Updated: 2025-12-02 | Author: RuvLLM Architecture Team