Files

ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900

2026-02-28 14:39:40 -05:00

31 KiB

Raw Blame History

RuvLLM: Algorithm Design

SPARC Phase 2: Pseudocode

1. Core Request Flow

1.1 Main Orchestrator

ALGORITHM ProcessQuery(query: String, session: Session) -> Response:
    // End-to-end request path: embed -> retrieve -> graph attention -> route
    // -> build context -> generate -> judge -> optional writeback -> telemetry.
    INPUT:
        query: User query string
        session: Session containing user context, history, constraints
    OUTPUT:
        response: Generated response with metadata

    // NOTE(review): SYSTEM stands for the RuvLLMSystem handle returned by
    // InitializeRuvLLM (db, models, graph_attention); confirm how it is
    // injected into the request path.
    // NOTE(review): LatencyTimer is a placeholder for whatever produces the
    // LatencyBreakdown that LogTelemetry expects; the original referenced
    // `latency_breakdown` without ever defining it.
    timer ← LatencyTimer.start()

    // Step 1: Preprocessing and Embedding
    tokens ← Tokenize(query)
    query_embedding ← EmbedQuery(query)
    query_features ← ExtractQueryFeatures(tokens, query_embedding)

    // Step 2: Memory Retrieval via HNSW
    // ef_search must never be smaller than k: the search keeps only ef_search
    // candidates, so ef_search = 32 could not yield 64 results.
    k ← 64
    ef_search ← Max(k, GetAdaptiveEfSearch(session.latency_budget))
    candidates ← HNSWSearch(
        vector: query_embedding,
        k: k,
        ef_search: ef_search
    )
    timer.mark("retrieval")

    // Step 3: Graph Attention over Neighborhood
    // GraphAttention consumes a SubGraph, so the retrieved nodes are expanded first.
    subgraph ← ExpandNeighborhood(
        center_nodes: candidates.nodes,
        db: SYSTEM.db,
        max_hops: 2,
        max_per_hop: 16               // NOTE(review): tuning value, confirm
    )
    graph_context ← GraphAttention(
        center_embedding: query_embedding,
        subgraph: subgraph,
        config: SYSTEM.graph_attention.config   // 4 attention heads
    )

    // Step 4: Feature Extraction for Router
    // `stats` and `summary` are fields (see LogTelemetry / GraphContext), not methods.
    router_features ← BuildRouterFeatures(
        query_features,
        candidates.stats,
        graph_context.summary,
        session.constraints
    )

    // Step 5: FastGRNN Routing Decision
    routing_decision ← FastGRNNRoute(router_features, session.hidden_state)
    session.hidden_state ← routing_decision.new_hidden
    timer.mark("routing")

    // Step 6: Context Construction
    context ← BuildContext(
        ranked_nodes: graph_context.ranked_nodes,
        max_tokens: routing_decision.context_size,
        deduplicate: TRUE
    )

    // Step 7: LFM2 Generation
    // LFM2Generate takes a loaded model plus a GenerationConfig and returns
    // (text, kv_cache). No cache is passed, matching the original behaviour;
    // the returned cache is discarded.
    model ← SYSTEM.models.get(routing_decision.model_selection)
    (response, _) ← LFM2Generate(
        model: model,
        prompt: FormatPrompt(query, context),
        config: GenerationConfig {
            temperature: routing_decision.temperature,
            top_p: routing_decision.top_p,
            max_tokens: GetMaxTokens(routing_decision.model_selection)
        },
        kv_cache: None
    )
    timer.mark("generation")

    // Step 8: Quality Evaluation
    quality_score ← EvaluateQuality(query, response, context)

    // Step 9: Optional Writeback
    // `>=` mirrors MemoryWriteback's own gate (it rejects only scores strictly
    // below the threshold). With `>`, a judge score of 4/5 (= 0.75) would
    // never be persisted.
    IF quality_score >= QUALITY_THRESHOLD:
        MemoryWriteback(query, response, quality_score, SYSTEM.db)
    timer.mark("writeback")

    // Step 10: Telemetry
    latency_breakdown ← timer.finish()
    LogTelemetry(
        routing_decision,
        candidates.stats,
        latency_breakdown,
        quality_score
    )

    RETURN Response {
        text: response,
        confidence: quality_score,
        sources: context.sources,
        routing_info: routing_decision
    }

1.2 Adaptive efSearch Selection

ALGORITHM GetAdaptiveEfSearch(latency_budget_ms: f32) -> u32:
    // Pick the HNSW ef_search value from the caller's latency budget:
    // tighter budgets trade recall for speed.
    //
    // Each tier is (exclusive upper bound in ms, ef_search); tiers are
    // scanned in ascending order and the first match wins.
    TIERS ← [
        (100, 32),     // Fast mode, lower recall
        (300, 64),     // Balanced mode
        (500, 128)     // High recall mode
    ]

    FOR (upper_bound_ms, ef) IN TIERS:
        IF latency_budget_ms < upper_bound_ms:
            RETURN ef

    // Budget of 500 ms or more: maximum recall mode
    RETURN 256

2. FastGRNN Router

2.1 Core FastGRNN Cell

ALGORITHM FastGRNNCell(x: Vector, h: Vector, params: FastGRNNParams) -> Vector:
    // One recurrent step: blend the previous hidden state with a tanh
    // candidate, gated by a sigmoid update gate and two scalars (zeta, nu).
    INPUT:
        x: Input feature vector [input_dim]
        h: Hidden state [hidden_dim]
        params: {W_z, U_z, b_z, W_h, U_h, b_h, zeta, nu}
    OUTPUT:
        h_new: Updated hidden state [hidden_dim]

    // Gate: how much of the previous state is carried forward
    gate_input ← MatMul(params.W_z, x) + MatMul(params.U_z, h) + params.b_z
    z ← Sigmoid(gate_input)

    // Candidate state proposed from the current input and previous state
    candidate_input ← MatMul(params.W_h, x) + MatMul(params.U_h, h) + params.b_h
    h_tilde ← Tanh(candidate_input)

    // Candidate weight: zeta scales the complement of the gate, nu is a
    // constant floor added on top
    candidate_weight ← params.zeta * (1 - z) + params.nu

    RETURN candidate_weight ⊙ h_tilde + z ⊙ h

2.2 Router Forward Pass

ALGORITHM FastGRNNRoute(features: Vector, hidden: Vector) -> RoutingDecision:
    // Run one router step and decode the model / context-size / sampling
    // choices, falling back to fixed defaults when the router is unsure.
    INPUT:
        features: Router input features [128]
        hidden: Previous hidden state [64]
    OUTPUT:
        decision: RoutingDecision with model, context, temperature, top_p,
                  confidence, the raw class distributions, and the new hidden state

    // Normalize input
    features_norm ← LayerNorm(features)

    // FastGRNN cell update
    h_new ← FastGRNNCell(features_norm, hidden, ROUTER_PARAMS)

    // Output heads
    model_logits ← Linear(h_new, W_model)           // [4] for 4 model sizes
    context_logits ← Linear(h_new, W_context)       // [5] for context bins
    temp_raw ← Linear(h_new, W_temp)                // [1] scalar
    top_p_raw ← Linear(h_new, W_top_p)              // [1] scalar
    confidence_raw ← Linear(h_new, W_confidence)    // [1] scalar

    // Activations
    model_probs ← Softmax(model_logits)
    context_probs ← Softmax(context_logits)
    temperature ← Sigmoid(temp_raw) * 2.0           // Open interval (0, 2)
    top_p ← Sigmoid(top_p_raw)                      // Open interval (0, 1)
    confidence ← Sigmoid(confidence_raw)

    // Decoding with confidence threshold
    IF confidence < CONFIDENCE_THRESHOLD:
        // Fall back to safe defaults
        model_idx ← 2  // 1.2B model
        context_idx ← 3  // 2048 tokens
    ELSE:
        model_idx ← ArgMax(model_probs)
        context_idx ← ArgMax(context_probs)

    // model_probs / context_probs are returned as well: LogTelemetry reads
    // routing.model_probs and EWCTrainingStep reads both distributions from
    // each prediction. They are the raw softmax outputs even when the
    // low-confidence fallback overrides the selected indices.
    RETURN RoutingDecision {
        model_selection: MODEL_SIZES[model_idx],
        context_size: CONTEXT_BINS[context_idx],
        temperature: temperature,
        top_p: top_p,
        confidence: confidence,
        model_probs: model_probs,
        context_probs: context_probs,
        new_hidden: h_new
    }

CONSTANTS:
    MODEL_SIZES = [350M, 700M, 1.2B, 2.6B]
    CONTEXT_BINS = [256, 512, 1024, 2048, 4096]
    CONFIDENCE_THRESHOLD = 0.7

2.3 Feature Extraction

ALGORITHM BuildRouterFeatures(
    query_features: QueryFeatures,
    search_stats: SearchStatistics,
    graph_summary: GraphSummary,
    constraints: SystemConstraints
) -> Vector:
    // Pack the four feature groups into one fixed-layout router input.
    // Layout: query [32] | embedding stats [16] | search stats [48] | constraints [32]
    OUTPUT: features [128]

    features ← EmptyVector(128)
    cursor ← 0

    // Local helper: write `values` (a scalar when width is 1, otherwise a
    // vector of length `width`) at the cursor, then advance the cursor.
    PROCEDURE Put(values, width):
        features[cursor:cursor+width] ← values
        cursor += width

    // ---- Query features [32 dims] ----
    Put(Normalize(query_features.token_count, 0, 512), 1)
    Put(query_features.language_one_hot, 8)
    Put(query_features.domain_embedding, 16)
    Put(Normalize(query_features.user_frequency, 0, 1000), 1)
    Put(query_features.query_type_probs, 6)

    // ---- Embedding statistics [16 dims] ----
    Put(Normalize(query_features.embedding_l2_norm, 0, 10), 1)
    Put(query_features.pca_components[:8], 8)
    Put(query_features.embedding_entropy, 1)
    Put(query_features.embedding_sparsity, 1)
    Put(query_features.cluster_soft_assignment, 4)
    Put(0, 1)  // padding

    // ---- Search statistics [48 dims] ----
    Put(Normalize(search_stats.k_retrieved, 0, 64), 1)
    Put([
        Normalize(search_stats.distance_mean, 0, 2),
        Normalize(search_stats.distance_std, 0, 1),
        Normalize(search_stats.distance_min, 0, 2),
        Normalize(search_stats.distance_max, 0, 2)
    ], 4)
    Put(search_stats.distance_entropy, 1)
    Put(Normalize(search_stats.graph_depth, 0, 10), 1)
    Put(search_stats.recall_estimate, 1)
    Put(graph_summary.neighborhood_density_histogram, 16)
    Put(graph_summary.semantic_coherence_features, 24)

    // ---- System constraints [32 dims] ----
    Put(Normalize(constraints.latency_budget_ms, 0, 5000), 1)
    Put(constraints.device_class_one_hot, 4)
    Put(constraints.privacy_level_one_hot, 4)
    Put(Normalize(constraints.memory_available_mb, 0, 16000), 1)
    Put(Normalize(constraints.battery_level, 0, 100), 1)
    Put(Normalize(constraints.concurrent_requests, 0, 100), 1)
    Put(constraints.historical_accuracy_per_domain, 16)
    Put([0, 0, 0, 0], 4)  // padding

    // Guards the layout: any change to a group width must keep the total at 128
    ASSERT cursor == 128
    RETURN features

3. Graph Attention Engine

3.1 Two-Hop Neighborhood Expansion

ALGORITHM ExpandNeighborhood(
    center_nodes: List<Node>,
    db: VectorDB,
    max_hops: u32,
    max_per_hop: u32
) -> SubGraph:
    // Breadth-first expansion from the retrieved nodes along outgoing edges.
    INPUT:
        center_nodes: Initial retrieved nodes
        db: Vector database with graph structure
        max_hops: Maximum expansion hops (typically 2)
        max_per_hop: Maximum neighbors per node per hop
    OUTPUT:
        subgraph: Expanded subgraph with nodes and edges; every node appears
                  in `nodes` exactly once

    // `discovered` holds every node already placed in all_nodes. Membership is
    // tested against discovery, not against "already expanded": otherwise a
    // node reachable from two frontier nodes, or a center node that is also
    // another center's neighbor, would be appended more than once and later
    // receive duplicate attention slots.
    discovered ← HashSet<NodeID>()
    all_nodes ← List<Node>()
    FOR node IN center_nodes:
        IF node.id NOT IN discovered:
            discovered.add(node.id)
            all_nodes.append(node)

    frontier ← all_nodes.clone()
    all_edges ← List<Edge>()

    FOR hop IN 1..=max_hops:
        // Nothing left to expand; skip the remaining hops
        IF frontier.is_empty():
            BREAK

        next_frontier ← List<Node>()

        // Each node enters a frontier at most once (on discovery), so every
        // node's edges are fetched at most once.
        FOR node IN frontier:
            // Get outgoing edges
            edges ← db.get_edges(node.id, limit: max_per_hop)
            all_edges.extend(edges)

            FOR edge IN edges:
                IF edge.dst NOT IN discovered:
                    discovered.add(edge.dst)
                    neighbor ← db.get_node(edge.dst)
                    next_frontier.append(neighbor)
                    all_nodes.append(neighbor)

        frontier ← next_frontier

    RETURN SubGraph {
        nodes: all_nodes,
        edges: all_edges,
        center_ids: center_nodes.map(n => n.id)
    }

3.2 Graph Attention Mechanism

ALGORITHM GraphAttention(
    center_embedding: Vector,
    subgraph: SubGraph,
    config: GraphAttentionConfig
) -> GraphContext:
    // Multi-head attention from the query embedding over the subgraph nodes,
    // biased by edges from the center nodes and penalised by hop distance.
    INPUT:
        center_embedding: Query embedding
        subgraph: Expanded neighborhood (nodes, edges, center_ids)
        config: {num_heads, head_dim, dropout, distance_decay}
    OUTPUT:
        context: Attended graph context

    nodes ← subgraph.nodes
    center_ids ← subgraph.center_ids

    // Nothing to attend over: pass the query embedding through unchanged
    IF nodes.is_empty():
        RETURN GraphContext {
            embedding: center_embedding,
            ranked_nodes: [],
            attention_weights: [],
            summary: ExtractGraphSummary(subgraph, [])
        }

    // Build attention inputs.
    // The per-edge vectors from BuildEdgeFeatures are not consumed by this
    // scoring path (EdgeEmbed is used instead), so they are not computed here.
    node_embeddings ← nodes.map(n => n.vector)
    adjacency ← BuildAdjacencyMatrix(subgraph)

    // Structural terms are identical for every head, so compute them once.
    // The original referenced an undefined `center_id`; the subgraph carries a
    // list of centers, so use the nearest center for the hop distance and the
    // first center that links to the node for the edge term.
    hop_distances ← []
    edge_embeddings ← []          // None where no center links to the node
    FOR node IN nodes:
        hop_distances.append(
            Min(center_ids.map(c => GetHopDistance(c, node.id, subgraph)))
        )
        linking_center ← center_ids.find(c => EdgeExists(c, node.id, subgraph))
        IF linking_center IS NOT None:
            edge ← GetEdge(linking_center, node.id, subgraph)
            edge_embeddings.append(EdgeEmbed(edge.rel, edge.weight))
        ELSE:
            edge_embeddings.append(None)

    // Multi-head graph attention
    attended_embeddings ← []
    attention_weights ← []

    FOR head IN 0..config.num_heads:
        // Project Q, K, V for this head
        Q ← Linear(center_embedding, W_Q[head])
        K ← Linear_batch(node_embeddings, W_K[head])
        V ← Linear_batch(node_embeddings, W_V[head])

        // Compute attention scores with edge features
        scores ← []
        FOR i IN 0..nodes.len():
            // Base attention
            score ← Dot(Q, K[i]) / Sqrt(config.head_dim)

            // Edge-aware modulation
            IF edge_embeddings[i] IS NOT None:
                score += Dot(Q, edge_embeddings[i])

            // Distance decay, applied additively in logit space. After the
            // softmax this multiplies the weight by exp(-decay * hops).
            // Multiplying the raw score instead would shrink NEGATIVE scores
            // toward zero and so boost far-away nodes.
            score -= config.distance_decay * hop_distances[i]

            scores.append(score)

        // Normalize with softmax (masked for disconnected nodes)
        weights ← MaskedSoftmax(scores, adjacency)
        attention_weights.append(weights)

        // Weighted aggregation
        head_output ← WeightedSum(V, weights)
        attended_embeddings.append(head_output)

    // Concatenate heads and project
    concatenated ← Concat(attended_embeddings)
    output ← Linear(concatenated, W_O) + center_embedding  // Residual

    // Rank nodes by attention weight
    avg_weights ← Mean(attention_weights, axis=0)
    ranked_indices ← ArgSort(avg_weights, descending=TRUE)

    RETURN GraphContext {
        embedding: output,
        ranked_nodes: nodes[ranked_indices],
        attention_weights: avg_weights[ranked_indices],
        summary: ExtractGraphSummary(subgraph, avg_weights)
    }

3.3 Edge Feature Encoding

ALGORITHM BuildEdgeFeatures(edges: List<Edge>) -> EdgeFeatures:
    // Encode each edge as a 20-dim vector: relation embedding [16] followed
    // by recency, confidence, weight and a log-scaled age.
    // An empty edge list yields EdgeFeatures with no vectors.

    features ← List<Vector>()

    FOR edge IN edges:
        // Relationship type embedding
        rel_emb ← RELATION_EMBEDDINGS[edge.rel]  // Learned embeddings

        // Temporal features.
        // Clamp at zero: a timestamp ahead of NOW (clock skew, bad import)
        // would otherwise give recency > 1 and make Log(1 + age_days)
        // undefined once age_days <= -1.
        age_days ← Max(0, (NOW - edge.metadata.timestamp) / SECONDS_PER_DAY)
        recency ← Exp(-age_days / DECAY_CONSTANT)

        // Confidence and weight
        confidence ← edge.metadata.confidence
        weight ← edge.weight

        // Combine features
        edge_feature ← Concat([
            rel_emb,                    // [16]
            [recency],                  // [1]
            [confidence],               // [1]
            [weight],                   // [1]
            [Log(1 + age_days) / 10]    // [1]
        ])

        features.append(edge_feature)

    RETURN EdgeFeatures { vectors: features, dim: 20 }

CONSTANTS:
    RELATION_EMBEDDINGS = LearnedEmbedding(num_relations=10, dim=16)
    DECAY_CONSTANT = 30.0  // days

4. Self-Learning Algorithms

4.1 Memory Writeback

ALGORITHM MemoryWriteback(
    query: String,
    response: String,
    quality_score: f32,
    db: VectorDB
) -> Result<Option<NodeID>>:
    // Persist a high-quality Q&A pair: skip low scores, upgrade an existing
    // near-duplicate if the new answer is better, otherwise insert a new node
    // and link it to its nearest neighbors.
    INPUT:
        query, response: Q&A pair
        quality_score: Judge-evaluated quality [0, 1]
        db: Vector database
    OUTPUT:
        inserted_id: Ok(Some(id)) of the new or updated node,
                     Ok(None) if skipped; database errors propagate as Err

    // Quality gate
    IF quality_score < QUALITY_THRESHOLD:
        RETURN Ok(None)

    // Create embedding
    combined_text ← Format("Q: {query}\nA: {response}")
    embedding ← EmbedText(combined_text)

    // Deduplication check
    similar ← db.search(embedding, k=5, threshold=0.95)?
    IF similar.len() > 0:
        // Near-duplicate found
        best_match ← similar[0]

        IF quality_score > best_match.metadata.quality:
            // Replace the stored answer together with its score. Raising the
            // score alone would attach the new quality to the old, lower
            // quality text.
            // NOTE(review): update_node is assumed to replace vector and text
            // in place; confirm against the VectorDB API.
            db.update_node(best_match.id, {
                vector: embedding,
                text: combined_text
            })?
            db.update_metadata(best_match.id, {
                quality: quality_score,
                updated_at: NOW,
                update_count: best_match.metadata.update_count + 1
            })?
            RETURN Ok(Some(best_match.id))
        ELSE:
            // Skip - existing entry is better
            RETURN Ok(None)

    // Nearest existing nodes for topic edges. This must be a separate,
    // unthresholded lookup: `similar` is always empty on this path, so looping
    // over it (as the original did) never created any edge. Searching before
    // the insert keeps the new node out of its own neighbor list.
    // NOTE(review): consider a minimum similarity so unrelated nodes are not
    // tagged SameTopic.
    related ← db.search(embedding, k=5)?

    // Insert new entry
    node ← Node {
        id: NewUUID(),
        vector: embedding,
        text: combined_text,
        type: NodeType::QAPair,
        source: "self_learning",
        metadata: {
            timestamp: NOW,
            quality: quality_score,
            domain: ClassifyDomain(query),
            version: 1,
            update_count: 0
        }
    }

    inserted_id ← db.insert(node)?

    // Create edges to related existing nodes
    FOR neighbor IN related:
        edge ← Edge {
            src: inserted_id,
            dst: neighbor.id,
            rel: EdgeType::SameTopic,
            weight: neighbor.score,
            metadata: {
                timestamp: NOW,
                created_by: "self_learning"
            }
        }
        db.insert_edge(edge)?

    RETURN Ok(Some(inserted_id))

CONSTANTS:
    QUALITY_THRESHOLD = 0.75  // judge score 4/5 under (score - 1) / 4 normalization

4.2 Experience Replay Buffer

ALGORITHM ReservoirSampling:
    // Fixed-capacity replay buffer. Once full, each incoming entry replaces a
    // uniformly chosen slot with probability capacity / total_seen, so every
    // entry seen so far is equally likely to be retained.

    STRUCT ReplayBuffer:
        entries: List<ReplayEntry>
        capacity: u32
        total_seen: u64        // entries ever offered, including discarded ones

    FUNCTION new(capacity: u32) -> ReplayBuffer:
        RETURN ReplayBuffer {
            entries: [],
            capacity: capacity,
            total_seen: 0
        }

    FUNCTION add(self, entry: ReplayEntry):
        self.total_seen += 1

        IF self.entries.len() < self.capacity:
            self.entries.append(entry)
        ELSE:
            // idx is uniform over the HALF-OPEN range [0, total_seen), i.e.
            // exactly total_seen outcomes of which `capacity` select a slot.
            // If RandomInt is inclusive of its upper bound, pass
            // self.total_seen - 1 instead; otherwise the replacement
            // probability is skewed.
            idx ← RandomInt(0, self.total_seen)
            IF idx < self.capacity:
                self.entries[idx] ← entry

    FUNCTION sample(self, batch_size: u32) -> List<ReplayEntry>:
        // Fewer entries than requested: return everything that is stored
        IF self.entries.len() < batch_size:
            RETURN self.entries.clone()

        indices ← RandomSample(0, self.entries.len(), batch_size, replace=FALSE)
        RETURN indices.map(i => self.entries[i].clone())

    FUNCTION distribution_stats(self) -> DistributionStats:
        // Analyze distribution for curriculum balancing
        domain_counts ← CountBy(self.entries, e => e.domain)
        quality_hist ← Histogram(self.entries.map(e => e.quality), bins=10)
        complexity_hist ← Histogram(self.entries.map(e => e.complexity), bins=10)

        // Fraction of known domains present in the buffer. Computed in
        // floating point: integer division would yield 0 for anything short
        // of full coverage.
        IF TOTAL_DOMAINS == 0:
            coverage ← 0.0
        ELSE:
            coverage ← AsFloat(domain_counts.len()) / AsFloat(TOTAL_DOMAINS)

        RETURN DistributionStats {
            domain_counts,
            quality_hist,
            complexity_hist,
            coverage: coverage
        }

4.3 EWC Training Update

ALGORITHM EWCTrainingStep(
    model: RouterModel,
    batch: List<TrainingSample>,
    ewc: ElasticWeightConsolidation,
    optimizer: Optimizer
) -> TrainingMetrics:
    // One optimisation step on the router: task loss over the four output
    // heads plus the EWC penalty from ewc.regularization_loss.
    INPUT:
        model: FastGRNN router model
        batch: Training samples with labels
        ewc: EWC state with Fisher info and optimal weights
        optimizer: Adam optimizer
    OUTPUT:
        metrics: Loss and accuracy metrics

    // An empty batch has no defined mean loss or accuracy; leave the model
    // untouched rather than stepping on undefined gradients.
    IF batch.is_empty():
        RETURN TrainingMetrics {
            total_loss: 0.0,
            task_loss: 0.0,
            ewc_loss: 0.0,
            model_accuracy: 0.0,
            context_accuracy: 0.0
        }

    // Forward pass
    predictions ← []
    FOR sample IN batch:
        // BuildRouterFeatures takes four feature groups, not the sample itself.
        // NOTE(review): assumes TrainingSample stores the groups under these
        // field names; confirm against the TrainingSample definition.
        features ← BuildRouterFeatures(
            sample.query_features,
            sample.search_stats,
            sample.graph_summary,
            sample.constraints
        )
        pred ← model.forward(features, sample.hidden_state)
        predictions.append(pred)

    // Task loss
    model_loss ← CrossEntropy(
        predictions.map(p => p.model_probs),
        batch.map(s => s.label_model)
    )

    context_loss ← CrossEntropy(
        predictions.map(p => p.context_probs),
        batch.map(s => s.label_context)
    )

    temp_loss ← SmoothL1(
        predictions.map(p => p.temperature),
        batch.map(s => s.label_temperature)
    )

    top_p_loss ← SmoothL1(
        predictions.map(p => p.top_p),
        batch.map(s => s.label_top_p)
    )

    // Classification heads carry full weight; the two regression heads are
    // down-weighted by ALPHA and BETA.
    task_loss ← model_loss + context_loss + ALPHA * temp_loss + BETA * top_p_loss

    // EWC regularization loss
    current_weights ← model.get_weights()
    ewc_loss ← ewc.regularization_loss(current_weights)

    // Total loss
    total_loss ← task_loss + ewc_loss

    // Backward pass
    gradients ← Backward(total_loss, model.parameters())

    // Optimizer step
    optimizer.step(model.parameters(), gradients)

    // Compute metrics
    accuracy ← ComputeAccuracy(predictions, batch)

    RETURN TrainingMetrics {
        total_loss,
        task_loss,
        ewc_loss,
        model_accuracy: accuracy.model,
        context_accuracy: accuracy.context
    }

CONSTANTS:
    ALPHA = 0.1  // Temperature loss weight
    BETA = 0.1   // Top-p loss weight

4.4 Fisher Information Update

ALGORITHM UpdateFisherInformation(
    model: RouterModel,
    dataset: List<Sample>,
    ewc: ElasticWeightConsolidation,
    num_samples: u32
) -> ElasticWeightConsolidation:
    // Estimate the diagonal of the Fisher information as the mean squared
    // gradient of the log-probability of each sample's labelled model class,
    // then snapshot the current weights as the new EWC anchor.
    // Replaces (does not accumulate into) the previous ewc.fisher_info.

    // Sample subset for efficiency; never request more than the dataset holds
    samples ← RandomSample(dataset, Min(num_samples, dataset.len()))

    // Without samples there is nothing to estimate. Keep the existing state
    // instead of overwriting it with zeros (or dividing by zero).
    IF samples.is_empty():
        RETURN ewc

    // Accumulate squared gradients
    fisher_accum ← ZeroVector(model.num_parameters())

    FOR sample IN samples:
        // BuildRouterFeatures takes four feature groups, not the sample itself.
        // NOTE(review): assumes Sample stores the groups under these field
        // names; confirm against the Sample definition.
        features ← BuildRouterFeatures(
            sample.query_features,
            sample.search_stats,
            sample.graph_summary,
            sample.constraints
        )
        pred ← model.forward(features, sample.hidden_state)

        // Gradient of the log-likelihood of the labelled class. Taken for
        // every sample, whether or not the model predicts it correctly.
        log_prob ← Log(pred.model_probs[sample.label_model])
        gradients ← Backward(log_prob, model.parameters())

        // Accumulate squared gradients
        FOR i IN 0..model.num_parameters():
            fisher_accum[i] += gradients[i] ** 2

    // Average over the samples actually drawn, which may be fewer than
    // num_samples when the dataset is small
    fisher_diag ← fisher_accum / samples.len()

    // Update EWC state
    ewc.fisher_info ← fisher_diag
    ewc.optimal_weights ← model.get_weights().clone()

    RETURN ewc

5. LFM2 Inference

5.1 Generation with KV Cache

ALGORITHM LFM2Generate(
    model: LFM2Model,
    prompt: String,
    config: GenerationConfig,
    kv_cache: Option<KVCache>
) -> (String, KVCache):
    // Autoregressive generation with temperature and nucleus sampling, and
    // optional reuse of KV states from an earlier call.
    INPUT:
        model: Loaded LFM2 model (350M/700M/1.2B/2.6B)
        prompt: Formatted prompt with context
        config: {temperature, top_p, max_tokens}
        kv_cache: Optional cached KV states from previous turn
    OUTPUT:
        response: Generated text
        updated_cache: KV cache for reuse; its prefix fields describe
                       everything held in `states` (prompt AND generated tokens)

    // Tokenize prompt
    tokens ← Tokenize(prompt)

    // Determine cache reuse.
    // Compare at token level: a string prefix does not guarantee a token
    // prefix, because tokenization can merge across the boundary. Require at
    // least one new token so prefill always has input to run on.
    can_reuse ← kv_cache IS NOT None
        AND kv_cache.prefix_len < tokens.len()
        AND tokens[:kv_cache.prefix_len] == kv_cache.prefix_tokens

    IF can_reuse:
        // Reuse cached KV states
        new_tokens ← tokens[kv_cache.prefix_len:]
        cache ← kv_cache.states
    ELSE:
        // Start fresh
        new_tokens ← tokens
        cache ← None

    // Prefill phase (process prompt)
    cache ← model.prefill(new_tokens, cache)

    // Decode phase (generate tokens)
    output_tokens ← []
    FOR _ IN 0..config.max_tokens:
        // Get next token logits
        logits ← model.decode_step(cache)

        IF config.temperature <= MIN_TEMPERATURE:
            // Temperature ~0 means greedy decoding; dividing by it would
            // produce infinities or NaNs.
            next_token ← ArgMax(logits)
        ELSE:
            // Apply temperature
            scaled ← logits / config.temperature

            // Top-p (nucleus) sampling
            sorted_idx ← ArgSort(scaled, descending=TRUE)
            cumsum ← CumulativeSum(Softmax(scaled[sorted_idx]))
            cutoff_idx ← FirstWhere(cumsum > config.top_p)
            // With top_p >= 1, or when rounding leaves the final cumulative
            // sum at or below top_p, nothing exceeds the threshold: keep the
            // full vocabulary.
            IF cutoff_idx IS None:
                cutoff_idx ← sorted_idx.len() - 1
            valid_idx ← sorted_idx[:cutoff_idx + 1]

            // Sample from valid tokens (renormalised over the nucleus)
            probs ← Softmax(scaled[valid_idx])
            next_token ← Sample(valid_idx, probs)

        // Check for EOS (EOS is neither emitted nor written to the cache)
        IF next_token == EOS_TOKEN:
            BREAK

        output_tokens.append(next_token)

        // Update cache
        cache ← model.update_cache(cache, next_token)

    // Decode to text
    response ← Detokenize(output_tokens)

    // Build updated cache.
    // `states` now covers the prompt tokens followed by every generated token
    // that was fed back, so the prefix metadata must cover both. Recording
    // only the prompt would make the next call append tokens after states it
    // does not know about.
    updated_cache ← KVCache {
        prefix: prompt + response,
        prefix_tokens: tokens + output_tokens,
        prefix_len: tokens.len() + output_tokens.len(),
        states: cache
    }

    RETURN (response, updated_cache)

CONSTANTS:
    MIN_TEMPERATURE = 1e-4  // at or below this, decode greedily

5.2 Model Selection and Loading

ALGORITHM SelectAndLoadModel(
    model_size: ModelSize,
    device: DeviceType,
    memory_budget: u64
) -> LFM2Model:
    // Resolve the model file for (size, quantization) and load it with the
    // runtime that matches the target device.
    INPUT:
        model_size: Enum {350M, 700M, 1.2B, 2.6B}
        device: Enum {CPU, GPU, NPU}
        memory_budget: Available memory in bytes
    OUTPUT:
        model: Loaded and optimized model

    // Quantization depends on device and memory; it also selects the file
    quantization ← SelectQuantization(model_size, device, memory_budget)
    model_path ← MODEL_PATHS[model_size][quantization]

    MATCH device:
        CPU:
            // llama.cpp backend, memory-mapped and not pinned in RAM
            cpu_options ← {
                n_ctx: GetContextSize(model_size),
                n_threads: GetOptimalThreads(),
                use_mmap: TRUE,
                use_mlock: FALSE
            }
            RETURN LlamaCpp.load(model_path, cpu_options)

        GPU:
            // vLLM backend, sharded across all visible GPUs
            gpu_options ← {
                tensor_parallel: GetGPUCount(),
                dtype: quantization.dtype,
                max_model_len: GetContextSize(model_size)
            }
            RETURN VLLM.load(model_path, gpu_options)

        NPU:
            // ExecuTorch for edge devices
            RETURN ExecuTorch.load(model_path + ".pte")


ALGORITHM SelectQuantization(
    model_size: ModelSize,
    device: DeviceType,
    memory_budget: u64
) -> Quantization:
    // Choose the highest-precision quantization whose approximate footprint
    // fits in memory_budget. The lowest tier is returned unconditionally.

    // Approximate FP16 footprint of the model
    base_memory ← MODEL_BASE_MEMORY[model_size]

    // Every non-GPU device (CPU and NPU alike) takes this path
    IF device != GPU:
        IF memory_budget >= base_memory / 2:
            RETURN Quantization::Q5_K_M
        IF memory_budget >= base_memory / 4:
            RETURN Quantization::Q4_K_M
        RETURN Quantization::Q2_K

    // GPU path
    IF memory_budget >= base_memory:
        RETURN Quantization::FP16
    IF memory_budget >= base_memory / 2:
        RETURN Quantization::INT8
    RETURN Quantization::INT4

CONSTANTS:
    MODEL_BASE_MEMORY = {
        350M: 700_000_000,    // ~700MB FP16
        700M: 1_400_000_000,  // ~1.4GB FP16
        1.2B: 2_400_000_000,  // ~2.4GB FP16
        2.6B: 5_200_000_000   // ~5.2GB FP16
    }

6. Utility Algorithms

6.1 Quality Evaluation

ALGORITHM EvaluateQuality(
    query: String,
    response: String,
    context: List<Document>
) -> f32:
    // Ask the judge model for a 1-5 rating of the response and map it onto
    // [0, 1]. An unparseable or out-of-range rating counts as 3.
    INPUT:
        query: Original user query
        response: Generated response
        context: Retrieved context documents
    OUTPUT:
        score: Quality score [0, 1]

    // Context documents, separated by a rule, go into the prompt verbatim
    context_text ← context.map(d => d.text).join("\n---\n")

    eval_prompt ← Format("""
        Evaluate the following response on a scale of 1-5.

        === Context ===
        {context_text}

        === Query ===
        {query}

        === Response ===
        {response}

        === Evaluation Criteria ===
        1. Factual Accuracy: Is the response grounded in the context?
        2. Completeness: Does it fully address the query?
        3. Coherence: Is the response logically structured?
        4. Conciseness: Is it appropriately brief without being incomplete?

        Provide your evaluation as a single integer from 1 to 5:
    """)

    // Use judge model (typically 2.6B); the reply should be one short integer
    judge_response ← JUDGE_MODEL.generate(eval_prompt, max_tokens=10)

    // Accept the parsed value only when it is a valid rating
    parsed ← ParseInteger(judge_response.trim())
    is_valid ← parsed IS NOT None AND parsed >= 1 AND parsed <= 5
    IF is_valid:
        score_int ← parsed
    ELSE:
        score_int ← 3  // Default to neutral on parse failure

    // Normalize to [0, 1]: 1 -> 0.0, 3 -> 0.5, 5 -> 1.0
    RETURN (score_int - 1) / 4.0

6.2 Context Building

ALGORITHM BuildContext(
    ranked_nodes: List<Node>,
    max_tokens: u32,
    deduplicate: bool
) -> ContextResult:
    // Greedily pack the highest-ranked nodes into the token budget, optionally
    // dropping nodes whose text is a near-duplicate of one already selected.
    INPUT:
        ranked_nodes: Attention-ranked nodes
        max_tokens: Maximum context token budget
        deduplicate: Whether to remove near-duplicate content
    OUTPUT:
        context: Constructed context with sources

    selected_nodes ← []
    // MinHash signatures (128 hash values each) of the selected texts. A plain
    // list, not a HashSet<u64>: signatures are compared pairwise by Jaccard
    // similarity, never looked up by equality.
    seen_signatures ← List<MinHashSignature>()
    total_tokens ← 0

    FOR node IN ranked_nodes:
        // Token count
        node_tokens ← CountTokens(node.text)

        // Check budget: skip nodes that do not fit but keep scanning, since a
        // smaller lower-ranked node may still fit
        IF total_tokens + node_tokens > max_tokens:
            CONTINUE

        // Deduplication
        IF deduplicate:
            signature ← MinHash(node.text, num_hashes=128)
            similar_seen ← seen_signatures.any(s => JaccardSimilarity(s, signature) > 0.8)
            IF similar_seen:
                CONTINUE
            seen_signatures.append(signature)

        selected_nodes.append(node)
        total_tokens += node_tokens

    // Format context as numbered passages, 1-based
    context_text ← selected_nodes.enumerate()
        .map((i, node) => Format("[{i+1}] {node.text}"))
        .join("\n\n")

    sources ← selected_nodes.map(n => Source {
        id: n.id,
        text_preview: n.text[:100],
        confidence: n.metadata.confidence
    })

    RETURN ContextResult {
        text: context_text,
        sources: sources,
        token_count: total_tokens,
        nodes_used: selected_nodes.len()
    }

6.3 Telemetry Logging

ALGORITHM LogTelemetry(
    routing: RoutingDecision,
    search_stats: SearchStatistics,
    latency: LatencyBreakdown,
    quality: f32
):
    // Record one request: a full TelemetryEntry goes to the metrics channel,
    // then the aggregate Prometheus instruments are updated.

    selected_model ← routing.model_selection
    total_latency ← latency.total

    entry ← TelemetryEntry {
        timestamp: NOW,
        request_id: CurrentRequestID(),

        // Routing decision as taken by the router
        model_selected: selected_model,
        model_probs: routing.model_probs,
        context_size: routing.context_size,
        temperature: routing.temperature,
        top_p: routing.top_p,
        router_confidence: routing.confidence,

        // Retrieval statistics
        k_retrieved: search_stats.k_retrieved,
        distance_stats: search_stats.distances,
        graph_depth: search_stats.graph_depth,

        // Per-stage latency
        total_ms: total_latency,
        retrieval_ms: latency.retrieval,
        routing_ms: latency.routing,
        generation_ms: latency.generation,
        writeback_ms: latency.writeback,

        // Judge score
        quality_score: quality,

        // Host state at logging time
        device_class: CurrentDevice(),
        memory_used: GetMemoryUsage()
    }

    // Async write to metrics store
    METRICS_CHANNEL.send(entry)

    // Prometheus metrics
    HISTOGRAM_LATENCY.observe(total_latency)
    COUNTER_REQUESTS.inc()
    GAUGE_QUALITY.set(quality)
    HISTOGRAM_MODEL.observe(ModelSizeToInt(selected_model))

7. Initialization and Shutdown

7.1 System Initialization

ALGORITHM InitializeRuvLLM(config: RuvLLMConfig) -> RuvLLMSystem:
    // Build every component in dependency order and return them as a single
    // handle. The handle also carries the config and request-tracking state
    // that ShutdownRuvLLM needs.

    // 1. Initialize vector database
    db ← VectorDB.open(config.db_path, {
        dimensions: config.embedding_dim,
        hnsw_m: config.hnsw_m,
        hnsw_ef_construction: config.hnsw_ef_construction
    })

    // 2. Load embedding model
    embedder ← EmbeddingAdapter.load(config.embedding_model_path)

    // 3. Initialize router
    router ← FastGRNNRouter.load(config.router_model_path)

    // 4. Load LFM2 models (lazy loading for memory efficiency)
    models ← LazyModelLoader {
        paths: config.lfm2_paths,
        loaded: HashMap::new(),
        max_loaded: config.max_concurrent_models
    }

    // 5. Initialize graph attention
    graph_attention ← GraphAttentionEngine.new({
        num_heads: config.attention_heads,
        head_dim: config.attention_head_dim
    })

    // 6. Initialize self-learning components
    replay_buffer ← ReplayBuffer.new(config.replay_capacity)
    ewc ← ElasticWeightConsolidation.load_or_new(config.ewc_path)
    optimizer ← Adam.new(router.parameters(), lr=config.learning_rate)

    // 7. Initialize quality judge
    // Note: this requests the 2.6B model from the lazy loader at startup.
    judge ← QualityJudge.new(models.get(ModelSize::2.6B))

    // 8. Start background services
    telemetry_service ← TelemetryService.start(config.metrics_endpoint)
    training_service ← TrainingService.start(
        router, replay_buffer, ewc, optimizer,
        config.training_interval
    )

    // config, optimizer, accepting_requests and request_counter are part of
    // the handle: shutdown reads the persistence paths from config and drains
    // in-flight requests through the two request-tracking fields.
    RETURN RuvLLMSystem {
        config,
        db, embedder, router, models,
        graph_attention, replay_buffer, ewc, optimizer,
        judge, telemetry_service, training_service,
        accepting_requests: TRUE,
        request_counter: 0
    }

7.2 Graceful Shutdown

ALGORITHM ShutdownRuvLLM(system: RuvLLMSystem):
    // Drain traffic, stop background mutation, persist learning state, then
    // release resources.
    // The persistence paths come from system.config; the original read a
    // `config` variable that is not in scope here.
    config ← system.config

    // 1. Stop accepting new requests
    system.accepting_requests ← FALSE

    // 2. Wait for in-flight requests (with timeout)
    drained ← WaitWithTimeout(system.request_counter == 0, timeout=30s)
    IF NOT drained:
        // Proceed anyway, but make the forced shutdown visible
        LOG("RuvLLM shutdown: timed out waiting for in-flight requests")

    // 3. Stop background training before persisting anything it mutates
    //    (router weights, replay buffer, EWC state); otherwise a training
    //    step can race with the saves below.
    // NOTE(review): assumes TrainingService exposes stop(); confirm.
    system.training_service.stop()

    // 4. Flush replay buffer
    system.replay_buffer.persist(config.replay_path)

    // 5. Save EWC state
    system.ewc.persist(config.ewc_path)

    // 6. Save router checkpoint
    system.router.save_checkpoint(config.router_checkpoint_path)

    // 7. Flush metrics
    system.telemetry_service.flush()

    // 8. Close database
    system.db.sync()
    system.db.close()

    // 9. Unload models
    system.models.unload_all()

    LOG("RuvLLM shutdown complete")

Document Version: 1.0 | Last Updated: 2025-12-02 | Author: RuvLLM Architecture Team

31 KiB Raw Blame History

RuvLLM: Algorithm Design

SPARC Phase 2: Pseudocode

1. Core Request Flow

1.1 Main Orchestrator

1.2 Adaptive efSearch Selection

2. FastGRNN Router

2.1 Core FastGRNN Cell

2.2 Router Forward Pass

2.3 Feature Extraction

3. Graph Attention Engine

3.1 Two-Hop Neighborhood Expansion

3.2 Graph Attention Mechanism

3.3 Edge Feature Encoding

4. Self-Learning Algorithms

4.1 Memory Writeback

4.2 Experience Replay Buffer

4.3 EWC Training Update

4.4 Fisher Information Update

5. LFM2 Inference

5.1 Generation with KV Cache

5.2 Model Selection and Loading

6. Utility Algorithms

6.1 Quality Evaluation

6.2 Context Building

6.3 Telemetry Logging

7. Initialization and Shutdown

7.1 System Initialization

7.2 Graceful Shutdown

31 KiB

Raw Blame History