{
  "model_info": {
    "name": "RuvLTRA-Small-0.5B",
    "version": "1.0.0",
    "description": "ANE-optimized small language model based on Qwen2.5-0.5B-Instruct for edge deployment",
    "base_model": "Qwen/Qwen2.5-0.5B-Instruct",
    "license": "Apache-2.0",
    "created": "2026-01-19",
    "author": "RuvVector Team"
  },
  "architecture": {
    "model_type": "qwen2",
    "architecture_class": "Qwen2ForCausalLM",
    "hidden_size": 896,
    "intermediate_size": 4864,
    "num_hidden_layers": 24,
    "num_attention_heads": 14,
    "num_key_value_heads": 2,
    "vocab_size": 151936,
    "max_position_embeddings": 32768,
    "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6,
    "hidden_act": "silu",
    "attention_dropout": 0.0,
    "tie_word_embeddings": true,
    "use_sliding_window": false,
    "sliding_window": 32768,
    "max_window_layers": 21,
    "bos_token_id": 151643,
    "eos_token_id": 151645
  },
  "parameters": {
    "total": "0.49B",
    "total_exact": 494000000,
    "non_embedding": "0.36B",
    "non_embedding_exact": 360000000
  },
  "features": {
    "attention_mechanism": "grouped_query_attention",
    "positional_encoding": "rotary_position_embedding",
    "activation_function": "swiglu",
    "normalization": "rmsnorm",
    "has_qkv_bias": true,
    "multilingual": true,
    "supported_languages": 29
  },
  "optimizations": {
    "ane_optimized": true,
    "sona_enabled": true,
    "flash_attention_compatible": true,
    "continuous_batching": true,
    "speculative_decoding_ready": true,
    "target_hardware": [
      "apple_neural_engine",
      "metal_gpu",
      "cpu_arm64",
      "cpu_x86_64"
    ],
    "memory_optimizations": [
      "kv_cache_compression",
      "activation_checkpointing",
      "weight_sharing"
    ]
  },
  "quantization": {
    "recommended": "Q4_K_M",
    "targets": {
      "Q4_K_M": {
        "description": "4-bit quantization with K-means clustering (medium)",
        "size_mb": 491,
        "quality": "good",
        "speed": "fast",
        "memory_reduction": "75%",
        "recommended_for": ["mobile", "edge", "resource_constrained"]
      },
      "Q5_K_M": {
        "description": "5-bit quantization with K-means clustering (medium)",
        "size_mb": 522,
        "quality": "better",
        "speed": "fast",
        "memory_reduction": "68%",
        "recommended_for": ["balanced", "quality_conscious"]
      },
      "Q8_0": {
        "description": "8-bit quantization (round-to-nearest)",
        "size_mb": 676,
        "quality": "best",
        "speed": "moderate",
        "memory_reduction": "50%",
        "recommended_for": ["accuracy_critical", "development"]
      }
    },
    "additional_quantizations": {
      "Q2_K": {"size_mb": 415, "quality": "acceptable"},
      "Q3_K_M": {"size_mb": 432, "quality": "fair"},
      "Q4_0": {"size_mb": 429, "quality": "good"},
      "Q5_0": {"size_mb": 490, "quality": "better"},
      "Q6_K": {"size_mb": 650, "quality": "very_good"}
    }
  },
  "download_urls": {
    "official_gguf": {
      "base_url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF",
      "files": {
        "Q4_K_M": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
        "Q5_K_M": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q5_k_m.gguf",
        "Q8_0": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf"
      }
    },
    "safetensors": {
      "base_url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct",
      "format": "safetensors",
      "dtype": "bfloat16"
    },
    "third_party_gguf": {
      "bartowski": "https://huggingface.co/bartowski/Qwen2.5-0.5B-Instruct-GGUF",
      "tensorblock": "https://huggingface.co/tensorblock/Qwen2.5-0.5B-GGUF"
    }
  },
  "cli_download_commands": {
    "Q4_K_M": "huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q4_k_m.gguf --local-dir . --local-dir-use-symlinks False",
    "Q5_K_M": "huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False",
    "Q8_0": "huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q8_0.gguf --local-dir . --local-dir-use-symlinks False"
  },
  "performance_targets": {
    "inference_latency_ms": {
      "first_token": 50,
      "per_token": 15
    },
    "throughput_tokens_per_sec": {
      "ane": 150,
      "metal_gpu": 120,
      "cpu": 40
    },
    "memory_usage_mb": {
      "Q4_K_M": 600,
      "Q5_K_M": 650,
      "Q8_0": 800
    },
    "context_window": {
      "default": 8192,
      "max": 32768
    }
  },
  "ruvllm_integration": {
    "loader": "gguf",
    "backend_priority": ["ane", "metal", "cpu"],
    "tokenizer": "qwen2",
    "chat_template": "qwen2_instruct",
    "system_prompt_support": true,
    "function_calling": false,
    "streaming": true
  },
  "benchmarks": {
    "mmlu": "pending",
    "humaneval": "pending",
    "gsm8k": "pending",
    "arc_challenge": "pending"
  }
}