Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

404
crates/ruvllm/tests/fixtures/mod.rs vendored Normal file
View File

@@ -0,0 +1,404 @@
//! Test Fixtures for RuvLTRA-Small
//!
//! This module provides test fixtures including sample prompts, expected patterns,
//! and perplexity baselines for validating the RuvLTRA-Small inference engine.
use std::collections::HashMap;
// ============================================================================
// Sample Prompts
// ============================================================================
/// Collection of test prompts organized by category.
///
/// Each submodule groups string constants for one evaluation scenario so
/// integration tests can reference prompts without allocating at runtime.
pub mod prompts {
    /// Simple text completion prompts.
    pub mod completion {
        pub const QUICK_BROWN_FOX: &str = "The quick brown fox";
        pub const ONCE_UPON_A_TIME: &str = "Once upon a time";
        pub const IN_THE_BEGINNING: &str = "In the beginning";
        pub const IT_WAS_A_DARK: &str = "It was a dark and stormy night";
    }

    /// Question-answering prompts in "Q: ... A:" format.
    pub mod qa {
        pub const CAPITAL_FRANCE: &str = "Q: What is the capital of France?\nA:";
        pub const TWO_PLUS_TWO: &str = "Q: What is 2 + 2?\nA:";
        pub const COLOR_SKY: &str = "Q: What color is the sky?\nA:";
        pub const LARGEST_PLANET: &str = "Q: What is the largest planet in our solar system?\nA:";
    }

    /// Instruction-following prompts.
    pub mod instruction {
        pub const WRITE_HAIKU: &str = "Write a haiku about programming:";
        pub const EXPLAIN_GRAVITY: &str = "Explain gravity in simple terms:";
        pub const LIST_PLANETS: &str = "List the planets in our solar system:";
        pub const DESCRIBE_OCEAN: &str = "Describe the ocean in three sentences:";
    }

    /// Code-generation prompts (Python function stubs).
    pub mod code {
        pub const FIBONACCI: &str = "def fibonacci(n):\n '''Return the nth Fibonacci number.'''\n";
        pub const HELLO_WORLD: &str = "# Python function to print hello world\ndef hello():";
        pub const FACTORIAL: &str = "def factorial(n):\n '''Return n factorial.'''\n";
        pub const SORT_LIST: &str = "def sort_list(items):\n '''Sort a list in ascending order.'''\n";
    }

    /// Conversation/chat prompts in "User: ... Assistant:" format.
    pub mod conversation {
        pub const GREETING: &str = "User: Hello!\nAssistant:";
        pub const TELL_JOKE: &str = "User: Tell me a joke.\nAssistant:";
        pub const WEATHER: &str = "User: What's the weather like today?\nAssistant:";
        pub const HELP: &str = "User: Can you help me?\nAssistant:";
    }

    /// Edge-case prompts exercising unusual or degenerate inputs.
    pub mod edge_cases {
        pub const EMPTY: &str = "";
        pub const SINGLE_CHAR: &str = "A";
        pub const SINGLE_WORD: &str = "Hello";
        pub const SPECIAL_CHARS: &str = "Translate: \"Hello, world!\" ->";
        // "Hello world" in Chinese, written with escapes to keep the source ASCII.
        pub const UNICODE: &str = "\u{4f60}\u{597d}\u{4e16}\u{754c}"; // 你好世界
        pub const NUMBERS_ONLY: &str = "1 2 3 4 5";
        // Five repetitions of the pangram followed by a continuation cue;
        // the trailing backslashes splice the lines into one string literal.
        pub const VERY_LONG: &str = "The quick brown fox jumps over the lazy dog. \
            The quick brown fox jumps over the lazy dog. \
            The quick brown fox jumps over the lazy dog. \
            The quick brown fox jumps over the lazy dog. \
            The quick brown fox jumps over the lazy dog. \
            Continue:";
    }
}
// ============================================================================
// Expected Output Patterns
// ============================================================================
/// Expected patterns in generated outputs.
///
/// Pattern slices are matched case-insensitively against generated text;
/// a single hit counts as success.
pub mod expected_patterns {
    /// Plausible continuations after "The quick brown fox".
    pub const FOX_COMPLETION: &[&str] = &[
        "jumps", "jumped", "runs", "ran", "over", "the", "lazy", "dog",
    ];
    /// Words likely to appear in a programming haiku.
    pub const HAIKU_PATTERNS: &[&str] = &[
        "code", "bug", "compile", "debug", "screen", "night", "lines", "function",
    ];
    /// Expected answer for the capital-of-France QA prompt.
    pub const FRANCE_CAPITAL: &str = "Paris";
    /// Expected answer for the 2 + 2 QA prompt.
    pub const TWO_PLUS_TWO: &str = "4";
    /// Structural tokens expected in generated Fibonacci code.
    pub const FIBONACCI_PATTERNS: &[&str] = &[
        "return", "if", "else", "n", "<=", "1", "+", "fibonacci",
    ];
    /// Words likely to appear in a greeting response.
    pub const GREETING_PATTERNS: &[&str] = &[
        "hello", "hi", "hey", "how", "help", "assist", "welcome",
    ];
    /// Structural tokens expected in generated factorial code.
    pub const FACTORIAL_PATTERNS: &[&str] = &[
        "return", "if", "n", "<=", "1", "*", "factorial",
    ];
}
// ============================================================================
// Perplexity Baselines
// ============================================================================
/// Perplexity baseline values for quality validation.
pub mod perplexity {
    /// Hard upper bound: output above this perplexity is treated as incoherent.
    pub const MAX_ACCEPTABLE: f32 = 50.0;
    /// Soft bound: perplexity above this is flagged as elevated.
    pub const WARNING_THRESHOLD: f32 = 30.0;
    /// Perplexity at or below this indicates high-quality output.
    pub const EXCELLENT: f32 = 15.0;

    /// Expected perplexity (min, max) ranges by task type.
    pub mod task_ranges {
        /// Simple completion: low perplexity expected.
        pub const COMPLETION: (f32, f32) = (5.0, 20.0);
        /// Code generation: moderate perplexity.
        pub const CODE: (f32, f32) = (8.0, 30.0);
        /// Creative writing: higher perplexity acceptable.
        pub const CREATIVE: (f32, f32) = (15.0, 45.0);
        /// Factual QA: low perplexity (confident answers).
        pub const FACTUAL: (f32, f32) = (3.0, 15.0);
    }

    /// Limits on quality loss introduced by quantization.
    pub mod degradation {
        /// Maximum allowed perplexity increase from quantization, in percent.
        pub const MAX_INCREASE_PCT: f32 = 20.0;
        /// Expected Q4_K degradation relative to F16, in percent.
        pub const Q4K_EXPECTED: f32 = 15.0;
        /// Expected Q8_0 degradation relative to F16, in percent.
        pub const Q8_EXPECTED: f32 = 3.0;
    }
}
// ============================================================================
// Token Probability Thresholds
// ============================================================================
/// Thresholds for token probability validation.
///
/// Used to detect garbled or degenerate sampling distributions.
pub mod probability_thresholds {
    /// Smallest acceptable probability for the single most likely token.
    pub const MIN_TOP1: f32 = 0.01;
    /// Smallest acceptable cumulative probability over the top-5 tokens.
    pub const MIN_TOP5_CUMULATIVE: f32 = 0.1;
    /// Largest entropy compatible with non-degenerate output.
    pub const MAX_ENTROPY: f32 = 10.0;
    /// Smallest confidence accepted for factual answers.
    pub const MIN_FACTUAL_CONFIDENCE: f32 = 0.5;
}
// ============================================================================
// Coherence Metrics
// ============================================================================
/// Coherence validation thresholds.
pub mod coherence {
    /// Longest run of identical consecutive words tolerated.
    pub const MAX_CONSECUTIVE_REPEATS: usize = 3;
    /// Largest tolerated ratio of repeated n-grams.
    pub const MAX_NGRAM_REPETITION: f32 = 0.3;
    /// Smallest alphanumeric-character ratio accepted as valid text.
    pub const MIN_ALPHANUMERIC_RATIO: f32 = 0.7;
    /// Largest tolerated special-character ratio.
    pub const MAX_SPECIAL_CHAR_RATIO: f32 = 0.2;
    /// Lower bound on sentence length.
    pub const MIN_SENTENCE_LENGTH: usize = 3;
    /// Upper bound on sentence length.
    pub const MAX_SENTENCE_LENGTH: usize = 200;
}
// ============================================================================
// Performance Baselines
// ============================================================================
/// Performance baseline values.
pub mod performance {
    /// Generation throughput baselines (tokens/second) by device.
    pub mod tokens_per_second {
        /// M4 Pro with ANE acceleration.
        pub const M4_PRO_ANE: f32 = 60.0;
        /// M4 Pro, NEON-only fallback.
        pub const M4_PRO_NEON: f32 = 45.0;
        /// M1 with ANE acceleration.
        pub const M1_ANE: f32 = 40.0;
        /// x86 CPU baseline (AVX2).
        pub const X86_AVX2: f32 = 15.0;
    }

    /// Latency thresholds, in milliseconds.
    pub mod latency_ms {
        /// Maximum acceptable time to first token.
        pub const MAX_FIRST_TOKEN: u64 = 500;
        /// Maximum acceptable inter-token latency.
        pub const MAX_INTER_TOKEN: u64 = 100;
        /// Target inter-token latency.
        pub const TARGET_INTER_TOKEN: u64 = 20;
    }

    /// Memory thresholds, in bytes.
    pub mod memory {
        /// Maximum model memory for the Q4_K quantization.
        pub const MAX_MODEL_Q4K: usize = 1_500_000_000;
        /// Maximum KV cache memory.
        pub const MAX_KV_CACHE: usize = 500_000_000;
        /// Maximum additional working memory during inference.
        pub const MAX_WORKING: usize = 200_000_000;
    }
}
// ============================================================================
// Test Data Generators
// ============================================================================
/// Generate a long prompt of specified length.
///
/// Cycles through a fixed 16-word vocabulary and joins exactly
/// `word_count` words with single spaces. Returns an empty string when
/// `word_count` is 0.
pub fn generate_long_prompt(word_count: usize) -> String {
    const WORDS: [&str; 16] = [
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "and", "then", "runs", "around", "park", "with", "great", "joy",
    ];
    let mut selected: Vec<&str> = Vec::with_capacity(word_count);
    selected.extend(WORDS.iter().cycle().take(word_count));
    selected.join(" ")
}
/// Generate a sequence of numbers for pattern completion tests.
///
/// Produces `count` consecutive integers beginning at `start`, joined by
/// ", " (e.g. `generate_number_sequence(1, 3)` yields `"1, 2, 3"`).
pub fn generate_number_sequence(start: i32, count: usize) -> String {
    let mut parts: Vec<String> = Vec::with_capacity(count);
    for offset in 0..count as i32 {
        parts.push((start + offset).to_string());
    }
    parts.join(", ")
}
/// Generate a repeated pattern prompt.
///
/// Repeats `word` exactly `count` times, separated by single spaces;
/// returns an empty string when `count` is 0.
pub fn generate_repetition_prompt(word: &str, count: usize) -> String {
    std::iter::repeat(word)
        .take(count)
        .collect::<Vec<_>>()
        .join(" ")
}
// ============================================================================
// Validation Helpers
// ============================================================================
/// Check if output contains any of the expected patterns.
///
/// Matching is case-insensitive on both sides; a single hit is enough.
/// Returns `false` for an empty `patterns` slice.
pub fn contains_expected_pattern(output: &str, patterns: &[&str]) -> bool {
    let haystack = output.to_lowercase();
    for pattern in patterns {
        if haystack.contains(&pattern.to_lowercase()) {
            return true;
        }
    }
    false
}
/// Calculate repetition ratio for n-grams.
///
/// Splits `text` on whitespace, counts every length-`n` word window, and
/// returns (occurrences of n-grams that appear more than once, counting
/// every occurrence) / (total n-grams). Returns 0.0 when the text has
/// fewer than `n` words.
pub fn calculate_ngram_repetition(text: &str, n: usize) -> f32 {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    if tokens.len() < n {
        return 0.0;
    }
    let mut counts: HashMap<Vec<&str>, usize> = HashMap::new();
    for gram in tokens.windows(n) {
        *counts.entry(gram.to_vec()).or_insert(0) += 1;
    }
    let duplicated: usize = counts.values().filter(|&&c| c > 1).sum();
    duplicated as f32 / (tokens.len() - n + 1) as f32
}
/// Count consecutive word repetitions.
///
/// Returns the length of the longest run of identical adjacent words,
/// measured in extra occurrences: "x x x" reports 2, and text with no
/// adjacent duplicates (or fewer than two words) reports 0.
pub fn count_consecutive_repeats(text: &str) -> usize {
    let tokens: Vec<&str> = text.split_whitespace().collect();
    let mut longest = 0;
    let mut run = 0;
    for pair in tokens.windows(2) {
        if pair[0] == pair[1] {
            run += 1;
            longest = longest.max(run);
        } else {
            run = 0;
        }
    }
    longest
}
/// Calculate the fraction of characters in `text` that are alphanumeric.
///
/// Returns 0.0 for an empty string. Both the numerator and the
/// denominator count `char`s: the previous implementation divided the
/// `char` count by `text.len()` (a *byte* count), which under-reported
/// the ratio for any non-ASCII text — e.g. a purely-CJK string scored
/// ~0.33 instead of 1.0 and would wrongly fail `is_coherent`.
pub fn alphanumeric_ratio(text: &str) -> f32 {
    let mut total = 0usize;
    let mut alphanumeric = 0usize;
    for c in text.chars() {
        total += 1;
        if c.is_alphanumeric() {
            alphanumeric += 1;
        }
    }
    // Empty input: avoid 0/0 and keep the original contract of returning 0.0.
    if total == 0 {
        return 0.0;
    }
    alphanumeric as f32 / total as f32
}
/// Check if text passes basic coherence checks.
///
/// A text is coherent when it simultaneously satisfies all thresholds in
/// the `coherence` module: a sufficient alphanumeric ratio, no excessive
/// consecutive word repeats, and limited 3-gram repetition.
pub fn is_coherent(text: &str) -> bool {
    let ratio_ok = alphanumeric_ratio(text) >= coherence::MIN_ALPHANUMERIC_RATIO;
    let repeats_ok = count_consecutive_repeats(text) <= coherence::MAX_CONSECUTIVE_REPEATS;
    let ngrams_ok = calculate_ngram_repetition(text, 3) <= coherence::MAX_NGRAM_REPETITION;
    ratio_ok && repeats_ok && ngrams_ok
}
// ============================================================================
// Tests for Fixtures Module
// ============================================================================
#[cfg(test)]
mod tests {
    use super::*;

    // generate_long_prompt must emit exactly the requested word count.
    #[test]
    fn test_generate_long_prompt() {
        let prompt = generate_long_prompt(100);
        let word_count = prompt.split_whitespace().count();
        assert_eq!(word_count, 100);
    }

    // The sequence generator formats consecutive integers with ", " separators.
    #[test]
    fn test_generate_number_sequence() {
        let seq = generate_number_sequence(1, 5);
        assert_eq!(seq, "1, 2, 3, 4, 5");
    }

    // A single pattern hit ("jumps") is enough for a match.
    #[test]
    fn test_contains_expected_pattern() {
        let output = "The fox jumps over the lazy dog";
        assert!(contains_expected_pattern(output, expected_patterns::FOX_COMPLETION));
    }

    // Varied text scores near zero; a degenerate word loop scores high.
    #[test]
    fn test_ngram_repetition() {
        let no_repeat = "the quick brown fox jumps over";
        assert!(calculate_ngram_repetition(no_repeat, 2) < 0.1);
        let high_repeat = "the the the the the the";
        assert!(calculate_ngram_repetition(high_repeat, 2) > 0.5);
    }

    // A run of k identical adjacent words yields k - 1 repeats.
    #[test]
    fn test_consecutive_repeats() {
        assert_eq!(count_consecutive_repeats("hello world"), 0);
        assert_eq!(count_consecutive_repeats("hello hello world"), 1);
        assert_eq!(count_consecutive_repeats("hello hello hello"), 2);
    }

    // Mostly-letter text scores high; pure punctuation scores near zero.
    #[test]
    fn test_alphanumeric_ratio() {
        assert!(alphanumeric_ratio("Hello World") > 0.8);
        assert!(alphanumeric_ratio("!@#$%^&*()") < 0.1);
    }

    // The coherence gate rejects symbol soup and degenerate repetition.
    #[test]
    fn test_coherence_check() {
        assert!(is_coherent("The quick brown fox jumps over the lazy dog."));
        assert!(!is_coherent("!@#$%^&*()!@#$%^&*()!@#$%^&*()"));
        assert!(!is_coherent("the the the the the the the"));
    }
}

View File

@@ -0,0 +1,161 @@
{
"metadata": {
"version": "1.0.0",
"description": "Perplexity baselines for RuvLTRA-Small quality validation",
"model": "ruvltra-small",
"quantization_tested": ["Q4_K", "Q5_K", "Q8_0", "F16"],
"last_updated": "2024-01-19"
},
"quality_thresholds": {
"max_acceptable_perplexity": 50.0,
"warning_perplexity": 30.0,
"excellent_perplexity": 15.0,
"notes": "Perplexity values vary by dataset and prompt type"
},
"baselines": {
"wikitext": {
"description": "WikiText-2 test set perplexity",
"dataset_url": "https://huggingface.co/datasets/wikitext",
"values": {
"F16": {
"perplexity": 8.5,
"tokens_evaluated": 250000,
"notes": "Full precision baseline"
},
"Q8_0": {
"perplexity": 8.7,
"degradation_pct": 2.4,
"notes": "8-bit quantization, minimal quality loss"
},
"Q5_K": {
"perplexity": 9.2,
"degradation_pct": 8.2,
"notes": "5-bit k-quant, good balance"
},
"Q4_K": {
"perplexity": 9.8,
"degradation_pct": 15.3,
"notes": "4-bit k-quant, most common deployment format"
},
"Q2_K": {
"perplexity": 14.5,
"degradation_pct": 70.6,
"notes": "2-bit extreme quantization, noticeable degradation"
}
}
},
"lambada": {
"description": "LAMBADA last-word prediction accuracy",
"metric": "accuracy",
"values": {
"F16": {
"accuracy": 0.72,
"notes": "Full precision accuracy"
},
"Q4_K": {
"accuracy": 0.68,
"degradation_pct": 5.6,
"notes": "Slight accuracy drop acceptable"
}
}
},
"hellaswag": {
"description": "HellaSwag commonsense reasoning",
"metric": "accuracy",
"values": {
"F16": {
"accuracy": 0.68
},
"Q4_K": {
"accuracy": 0.65,
"degradation_pct": 4.4
}
}
},
"custom_prompts": {
"description": "Perplexity on custom test prompts",
"values": {
"simple_completion": {
"expected_ppl_range": [5.0, 20.0],
"notes": "Common phrase continuation should have low perplexity"
},
"code_generation": {
"expected_ppl_range": [8.0, 30.0],
"notes": "Code has higher entropy but should still be coherent"
},
"creative_writing": {
"expected_ppl_range": [15.0, 45.0],
"notes": "Creative tasks have higher acceptable perplexity"
},
"factual_qa": {
"expected_ppl_range": [3.0, 15.0],
"notes": "Factual responses should be confident"
}
}
}
},
"degradation_limits": {
"max_perplexity_increase_pct": 20.0,
"max_accuracy_decrease_pct": 10.0,
"notes": "Quantization should not degrade quality beyond these limits"
},
"token_probability_thresholds": {
"min_top1_probability": 0.01,
"min_top5_cumulative": 0.1,
"max_entropy": 10.0,
"notes": "Thresholds for detecting garbled or degenerate output"
},
"repetition_metrics": {
"max_ngram_repetition_ratio": 0.3,
"max_consecutive_repeats": 3,
"ngram_window_sizes": [2, 3, 4],
"notes": "Detect excessive repetition in generated text"
},
"coherence_metrics": {
"min_sentence_length": 3,
"max_sentence_length": 200,
"punctuation_ratio_range": [0.01, 0.15],
"alphanumeric_ratio_min": 0.7,
"notes": "Basic structural coherence checks"
},
"speed_baselines": {
"description": "Token generation speed baselines (tokens/second)",
"device_baselines": {
"m4_pro_ane": {
"prompt_processing": 2000,
"generation": 60,
"notes": "M4 Pro with ANE acceleration"
},
"m4_pro_neon": {
"prompt_processing": 1500,
"generation": 45,
"notes": "M4 Pro NEON-only fallback"
},
"m1_ane": {
"prompt_processing": 1200,
"generation": 40,
"notes": "M1 with ANE"
},
"cpu_x86": {
"prompt_processing": 500,
"generation": 15,
"notes": "x86 CPU baseline (AVX2)"
}
}
},
"memory_baselines": {
"model_sizes_mb": {
"F16": 4000,
"Q8_0": 2200,
"Q4_K": 1200,
"Q2_K": 700
},
"kv_cache_per_token_bytes": {
"F16": 1100,
"Q8_0": 1100,
"notes": "KV cache typically stays in F16 for accuracy"
},
"peak_memory_multiplier": 1.5,
"notes": "Peak memory = model_size * multiplier during inference"
}
}

View File

@@ -0,0 +1,191 @@
{
"metadata": {
"version": "1.0.0",
"description": "Test prompts for RuvLTRA-Small validation",
"model": "ruvltra-small",
"last_updated": "2024-01-19"
},
"prompts": {
"simple_completion": {
"id": "simple_001",
"category": "completion",
"prompt": "The quick brown fox",
"expected_patterns": ["jumps", "jumped", "runs", "ran", "over", "lazy"],
"max_tokens": 50,
"temperature": 0.7,
"notes": "Classic completion test for basic language modeling"
},
"instruction_haiku": {
"id": "instruction_001",
"category": "instruction",
"prompt": "Write a haiku about programming:",
"expected_patterns": ["code", "bug", "compile", "debug", "screen", "night", "lines", "function"],
"max_tokens": 100,
"temperature": 0.8,
"notes": "Tests instruction-following ability"
},
"qa_capital": {
"id": "qa_001",
"category": "question_answering",
"prompt": "Q: What is the capital of France?\nA:",
"expected_output": "Paris",
"max_tokens": 20,
"temperature": 0.1,
"notes": "Simple factual QA with deterministic expected output"
},
"qa_math": {
"id": "qa_002",
"category": "question_answering",
"prompt": "Q: What is 2 + 2?\nA:",
"expected_output": "4",
"max_tokens": 10,
"temperature": 0.0,
"notes": "Simple math QA"
},
"code_fibonacci": {
"id": "code_001",
"category": "code_generation",
"prompt": "def fibonacci(n):\n '''Return the nth Fibonacci number.'''\n",
"expected_patterns": ["return", "if", "else", "n", "<=", "1", "+", "fibonacci"],
"max_tokens": 150,
"temperature": 0.3,
"notes": "Code generation with expected structural patterns"
},
"code_hello_world": {
"id": "code_002",
"category": "code_generation",
"prompt": "# Python function to print hello world\ndef",
"expected_patterns": ["print", "hello", "world", "def"],
"max_tokens": 50,
"temperature": 0.2,
"notes": "Simple code generation"
},
"conversation_greeting": {
"id": "conv_001",
"category": "conversation",
"prompt": "User: Hello!\nAssistant:",
"expected_patterns": ["hello", "hi", "how", "help", "can", "assist"],
"max_tokens": 50,
"temperature": 0.7,
"notes": "Basic conversation response"
},
"conversation_joke": {
"id": "conv_002",
"category": "conversation",
"prompt": "User: Tell me a joke.\nAssistant:",
"expected_patterns": ["why", "what", "because", "knock", "chicken"],
"max_tokens": 100,
"temperature": 0.9,
"notes": "Creative response generation"
},
"summarization": {
"id": "summary_001",
"category": "summarization",
"prompt": "Summarize the following in one sentence:\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.\nSummary:",
"expected_patterns": ["machine learning", "AI", "artificial intelligence", "learn", "data"],
"max_tokens": 50,
"temperature": 0.3,
"notes": "Tests summarization capability"
},
"translation": {
"id": "translation_001",
"category": "translation",
"prompt": "Translate to French: Hello, how are you?\nFrench:",
"expected_patterns": ["bonjour", "comment", "allez", "vous"],
"max_tokens": 30,
"temperature": 0.1,
"notes": "Basic translation test"
},
"sentiment": {
"id": "sentiment_001",
"category": "classification",
"prompt": "Classify the sentiment of this review as positive, negative, or neutral:\n\"This product is amazing! Best purchase I've ever made.\"\nSentiment:",
"expected_output": "positive",
"max_tokens": 10,
"temperature": 0.0,
"notes": "Sentiment classification"
},
"reasoning_chain": {
"id": "reasoning_001",
"category": "reasoning",
"prompt": "Question: If I have 3 apples and give away 1, how many do I have left?\nLet's think step by step:",
"expected_patterns": ["3", "1", "2", "subtract", "minus", "left", "remaining"],
"max_tokens": 100,
"temperature": 0.1,
"notes": "Chain-of-thought reasoning"
}
},
"edge_cases": {
"empty_prompt": {
"id": "edge_001",
"prompt": "",
"expected_behavior": "Should handle gracefully, may produce empty output or generic response",
"max_tokens": 20
},
"single_char": {
"id": "edge_002",
"prompt": "A",
"expected_behavior": "Should produce coherent completion",
"max_tokens": 30
},
"special_characters": {
"id": "edge_003",
"prompt": "Translate: \"Hello, world!\" ->",
"expected_behavior": "Should handle quotes and punctuation correctly",
"max_tokens": 30
},
"very_long_prompt": {
"id": "edge_004",
"prompt": "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. Continue:",
"expected_behavior": "Should handle long context without issues",
"max_tokens": 50
},
"unicode": {
"id": "edge_005",
"prompt": "Translate to English: \u4f60\u597d\u4e16\u754c",
"expected_patterns": ["hello", "world"],
"max_tokens": 20
},
"mixed_language": {
"id": "edge_006",
"prompt": "English and \u65e5\u672c\u8a9e mixed:",
"expected_behavior": "Should handle multilingual input",
"max_tokens": 50
},
"numbers": {
"id": "edge_007",
"prompt": "Continue the sequence: 1, 2, 3, 4,",
"expected_patterns": ["5", "6", "7"],
"max_tokens": 20
},
"repetitive": {
"id": "edge_008",
"prompt": "Hello hello hello hello hello",
"expected_behavior": "Should not amplify repetition excessively",
"max_tokens": 30
}
},
"stress_tests": {
"max_context": {
"id": "stress_001",
"description": "Test with maximum context length",
"prompt_length": 8192,
"max_tokens": 100,
"notes": "Generate prompt programmatically to fill context"
},
"long_generation": {
"id": "stress_002",
"description": "Generate many tokens",
"prompt": "Once upon a time",
"max_tokens": 2000,
"notes": "Test stability over long generation"
},
"rapid_requests": {
"id": "stress_003",
"description": "Many rapid sequential requests",
"num_requests": 100,
"prompt": "Hello",
"max_tokens": 10
}
}
}