Files
wifi-densepose/crates/ruvllm/tests/fixtures/perplexity_baselines.json
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

162 lines
4.5 KiB
JSON

{
"metadata": {
"version": "1.0.0",
"description": "Perplexity baselines for RuvLTRA-Small quality validation",
"model": "ruvltra-small",
"quantization_tested": ["Q4_K", "Q5_K", "Q8_0", "F16"],
"last_updated": "2024-01-19"
},
"quality_thresholds": {
"max_acceptable_perplexity": 50.0,
"warning_perplexity": 30.0,
"excellent_perplexity": 15.0,
"notes": "Perplexity values vary by dataset and prompt type"
},
"baselines": {
"wikitext": {
"description": "WikiText-2 test set perplexity",
"dataset_url": "https://huggingface.co/datasets/wikitext",
"values": {
"F16": {
"perplexity": 8.5,
"tokens_evaluated": 250000,
"notes": "Full precision baseline"
},
"Q8_0": {
"perplexity": 8.7,
"degradation_pct": 2.4,
"notes": "8-bit quantization, minimal quality loss"
},
"Q5_K": {
"perplexity": 9.2,
"degradation_pct": 8.2,
"notes": "5-bit k-quant, good balance"
},
"Q4_K": {
"perplexity": 9.8,
"degradation_pct": 15.3,
"notes": "4-bit k-quant, most common deployment format"
},
"Q2_K": {
"perplexity": 14.5,
"degradation_pct": 70.6,
"notes": "2-bit extreme quantization, noticeable degradation"
}
}
},
"lambada": {
"description": "LAMBADA last-word prediction accuracy",
"metric": "accuracy",
"values": {
"F16": {
"accuracy": 0.72,
"notes": "Full precision accuracy"
},
"Q4_K": {
"accuracy": 0.68,
"degradation_pct": 5.6,
"notes": "Slight accuracy drop acceptable"
}
}
},
"hellaswag": {
"description": "HellaSwag commonsense reasoning",
"metric": "accuracy",
"values": {
"F16": {
"accuracy": 0.68
},
"Q4_K": {
"accuracy": 0.65,
"degradation_pct": 4.4
}
}
},
"custom_prompts": {
"description": "Perplexity on custom test prompts",
"values": {
"simple_completion": {
"expected_ppl_range": [5.0, 20.0],
"notes": "Common phrase continuation should have low perplexity"
},
"code_generation": {
"expected_ppl_range": [8.0, 30.0],
"notes": "Code has higher entropy but should still be coherent"
},
"creative_writing": {
"expected_ppl_range": [15.0, 45.0],
"notes": "Creative tasks have higher acceptable perplexity"
},
"factual_qa": {
"expected_ppl_range": [3.0, 15.0],
"notes": "Factual responses should be confident"
}
}
}
},
"degradation_limits": {
"max_perplexity_increase_pct": 20.0,
"max_accuracy_decrease_pct": 10.0,
"notes": "Quantization should not degrade quality beyond these limits"
},
"token_probability_thresholds": {
"min_top1_probability": 0.01,
"min_top5_cumulative": 0.1,
"max_entropy": 10.0,
"notes": "Thresholds for detecting garbled or degenerate output"
},
"repetition_metrics": {
"max_ngram_repetition_ratio": 0.3,
"max_consecutive_repeats": 3,
"ngram_window_sizes": [2, 3, 4],
"notes": "Detect excessive repetition in generated text"
},
"coherence_metrics": {
"min_sentence_length": 3,
"max_sentence_length": 200,
"punctuation_ratio_range": [0.01, 0.15],
"alphanumeric_ratio_min": 0.7,
"notes": "Basic structural coherence checks"
},
"speed_baselines": {
"description": "Token generation speed baselines (tokens/second)",
"device_baselines": {
"m4_pro_ane": {
"prompt_processing": 2000,
"generation": 60,
"notes": "M4 Pro with ANE acceleration"
},
"m4_pro_neon": {
"prompt_processing": 1500,
"generation": 45,
"notes": "M4 Pro NEON-only fallback"
},
"m1_ane": {
"prompt_processing": 1200,
"generation": 40,
"notes": "M1 with ANE"
},
"cpu_x86": {
"prompt_processing": 500,
"generation": 15,
"notes": "x86 CPU baseline (AVX2)"
}
}
},
"memory_baselines": {
"model_sizes_mb": {
"F16": 4000,
"Q8_0": 2200,
"Q4_K": 1200,
"Q2_K": 700
},
"kv_cache_per_token_bytes": {
"F16": 1100,
"Q8_0": 1100,
"notes": "KV cache typically stays in F16 for accuracy"
},
"peak_memory_multiplier": 1.5,
"notes": "Peak memory = model_size * multiplier during inference"
}
}