# Cargo manifest for `ruvllm`: LLM serving runtime built on the Ruvector crates.
# Package metadata marked `.workspace = true` is inherited from the workspace root.
[package]
name = "ruvllm"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "LLM serving runtime with Ruvector integration - Paged attention, KV cache, and SONA learning"
keywords = ["llm", "inference", "paged-attention", "kv-cache", "ruvector"]
categories = ["science", "algorithms"]

[dependencies]
# Ruvector integration
ruvector-core = { version = "2.0", path = "../ruvector-core", default-features = false, features = ["storage", "hnsw", "parallel", "simd"] }
ruvector-sona = { version = "0.1.6", path = "../sona", default-features = false, features = ["serde-support"] }

# Optional Ruvector crates for advanced features
# (enabled through the `attention`, `graph` and `gnn` features below)
ruvector-attention = { version = "2.0", path = "../ruvector-attention", optional = true }
ruvector-graph = { version = "2.0", path = "../ruvector-graph", optional = true, default-features = false }
ruvector-gnn = { version = "2.0", path = "../ruvector-gnn", optional = true }

# Serialization
serde = { workspace = true }
serde_json = { workspace = true }

# Error handling
thiserror = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }

# Performance
dashmap = { workspace = true }
parking_lot = { workspace = true }
once_cell = { workspace = true }
smallvec = "1.13"

# Time and UUID
chrono = { workspace = true, features = ["serde"] }
uuid = { workspace = true, features = ["v4", "serde"] }

# Math
ndarray = { workspace = true }
rand = { workspace = true }

# Pattern matching
regex = "1.10"

# Parallelism (optional)
rayon = { version = "1.10", optional = true }

# Serialization (binary) - needs to match workspace for ruvector-core compatibility
bincode = { workspace = true }

# Async (optional for non-WASM)
tokio = { workspace = true, optional = true }

# Async traits and streams
async-trait = "0.1"
futures-core = "0.3"
tokio-stream = { version = "0.1", optional = true }

# Candle ML framework (optional)
candle-core = { version = "0.8", optional = true }
candle-nn = { version = "0.8", optional = true }
candle-transformers = { version = "0.8", optional = true }

# Tokenizers
tokenizers = { version = "0.20", optional = true, default-features = false, features = ["onig"] }

# HuggingFace Hub for model downloads
hf-hub = { version = "0.3", optional = true, features = ["tokio"] }

# mistral-rs backend for high-performance inference (optional)
# NOTE: mistralrs crate is not yet on crates.io - use git dependency when available:
# mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs", optional = true }
# Or when published to crates.io, uncomment:
# mistralrs = { version = "0.4", optional = true, default-features = false }
# mistralrs-core = { version = "0.4", optional = true }

# Directories for cache
dirs = "5.0"

# Half-precision floating point
half = "2.4"

# Memory mapping for efficient large file access (optional)
memmap2 = { version = "0.9", optional = true }

# SHA256 hashing for model integrity verification
sha2 = "0.10"

# MD5 hashing for input hashing in semantic cache
md5 = "0.7"

# Metal GPU acceleration (macOS only)
[target.'cfg(target_os = "macos")'.dependencies]
metal = { version = "0.29", optional = true }
objc = { version = "0.2", optional = true }
# Core ML bindings (macOS/iOS) - for Apple Neural Engine acceleration
# NOTE(review): this table's cfg is `target_os = "macos"` only, so these are
# not available on iOS targets as written - confirm whether iOS is intended.
objc2 = { version = "0.6", optional = true }
objc2-foundation = { version = "0.3", optional = true, features = ["NSString", "NSError", "NSURL", "NSArray", "NSDictionary", "NSData"] }
objc2-core-ml = { version = "0.3", optional = true, features = ["MLModel", "MLModelConfiguration", "MLFeatureProvider", "MLFeatureValue", "MLMultiArray", "MLPredictionOptions", "MLModelDescription", "MLFeatureDescription", "MLDictionaryFeatureProvider", "MLModelError"] }
block2 = { version = "0.6", optional = true }

[dev-dependencies]
criterion = { workspace = true }
tempfile = "3.13"
tracing-subscriber = { workspace = true }

[features]
# Default includes candle for working inference out of the box
default = ["async-runtime", "candle"]
async-runtime = ["tokio", "tokio-stream"]
# Minimal build without inference (for embedding/library use only).
# `default` already enables `candle`, so combine this with
# `--no-default-features` (or `default-features = false`) to actually drop it.
minimal = ["async-runtime"]
wasm = []
wasm-simd = []

# Ruvector integration features
attention = ["dep:ruvector-attention"]
graph = ["dep:ruvector-graph"]
gnn = ["dep:ruvector-gnn"]
# Full Ruvector integration (all optional crates)
ruvector-full = ["attention", "graph", "gnn"]

# Multi-threaded GEMM/GEMV with rayon (4-6x speedup on M4 Pro 10-core)
parallel = ["dep:rayon"]

# Candle backend for LLM inference (Rust-native, Metal acceleration on Mac)
candle = ["candle-core", "candle-nn", "candle-transformers", "tokenizers", "hf-hub"]

# Metal acceleration for Apple Silicon (M1/M2/M3/M4) via Candle.
# This feature only forwards to the candle crates' own `metal` features; it does
# not enable the optional `metal` crate, which is pulled in by `metal-compute`.
metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]

# Native Metal compute shaders (low-level, M4 Pro optimized)
metal-compute = ["dep:metal", "dep:objc"]

# CUDA acceleration for NVIDIA GPUs
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]

# Full inference backend with Metal (recommended for Mac)
inference-metal = ["candle", "metal"]

# Full Metal compute with native shaders (best performance on M4 Pro)
inference-metal-native = ["candle", "metal", "metal-compute"]

# Full inference backend with CUDA (recommended for NVIDIA)
inference-cuda = ["candle", "cuda"]

# Memory-mapped file access for efficient GGUF loading
mmap = ["dep:memmap2"]

# GGUF support with memory mapping (recommended for large models)
gguf-mmap = ["mmap"]

# Apple Accelerate framework for BLAS operations (macOS only, ~2x GEMV speedup)
accelerate = []

# Apple Neural Engine via Core ML (macOS/iOS, optimal for small models and batch inference)
# Provides 38 TOPS dedicated ML acceleration with 3-4x better power efficiency
coreml = ["dep:objc2", "dep:objc2-foundation", "dep:objc2-core-ml", "dep:block2"]

# Hybrid GPU+ANE pipeline (use ANE for MLP, GPU for attention)
hybrid-ane = ["metal-compute", "coreml"]

# mistral-rs backend feature (enables full mistral-rs integration)
# NOTE: Uncomment when mistralrs crate is available
# mistral-rs = ["dep:mistralrs", "dep:mistralrs-core", "tokenizers"]
# mistral-rs-metal = ["mistral-rs", "mistralrs/metal"]
# mistral-rs-cuda = ["mistral-rs", "mistralrs/cuda"]

[lib]
crate-type = ["rlib"]

# Benchmark configurations
# (`harness = false` on every target: each bench supplies its own entry point)
[[bench]]
name = "attention_bench"
harness = false

[[bench]]
name = "rope_bench"
harness = false

[[bench]]
name = "norm_bench"
harness = false

[[bench]]
name = "matmul_bench"
harness = false

[[bench]]
name = "lora_bench"
harness = false

[[bench]]
name = "e2e_bench"
harness = false

[[bench]]
name = "metal_bench"
harness = false

[[bench]]
name = "serving_bench"
harness = false

[[bench]]
name = "ane_bench"
harness = false

[[bench]]
name = "ruvltra_benchmark"
harness = false

# Test configurations
[[test]]
name = "real_model_test"
path = "tests/real_model_test.rs"

# Example binaries
[[example]]
name = "download_test_model"
path = "examples/download_test_model.rs"

[[example]]
name = "hub_cli"
path = "examples/hub_cli.rs"

[[example]]
name = "benchmark_model"
path = "examples/benchmark_model.rs"

# Only built when the async runtime (tokio + tokio-stream) is enabled.
[[example]]
name = "run_eval"
path = "examples/run_eval.rs"
required-features = ["async-runtime"]