243 lines
7.2 KiB
TOML
243 lines
7.2 KiB
TOML
[package]
name = "ruvllm"
# Keys written as `<key>.workspace = true` inherit their value from the
# workspace root manifest.
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "LLM serving runtime with Ruvector integration - Paged attention, KV cache, and SONA learning"
keywords = ["llm", "inference", "paged-attention", "kv-cache", "ruvector"]
categories = ["science", "algorithms"]

[dependencies]
# Ruvector integration
ruvector-core = { version = "2.0", path = "../ruvector-core", default-features = false, features = ["storage", "hnsw", "parallel", "simd"] }
ruvector-sona = { version = "0.1.6", path = "../sona", default-features = false, features = ["serde-support"] }

# Optional Ruvector crates for advanced features
ruvector-attention = { version = "2.0", path = "../ruvector-attention", optional = true }
ruvector-graph = { version = "2.0", path = "../ruvector-graph", optional = true, default-features = false }
ruvector-gnn = { version = "2.0", path = "../ruvector-gnn", optional = true }

# Serialization
serde = { workspace = true }
serde_json = { workspace = true }

# Error handling and diagnostics
thiserror = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }

# Performance
dashmap = { workspace = true }
parking_lot = { workspace = true }
once_cell = { workspace = true }
smallvec = "1.13"

# Time and UUID
chrono = { workspace = true, features = ["serde"] }
uuid = { workspace = true, features = ["v4", "serde"] }

# Math
ndarray = { workspace = true }
rand = { workspace = true }

# Pattern matching
regex = "1.10"

# Parallelism (optional)
rayon = { version = "1.10", optional = true }

# Serialization (binary) - needs to match workspace for ruvector-core compatibility
bincode = { workspace = true }

# Async (optional for non-WASM)
tokio = { workspace = true, optional = true }

# Async traits and streams
async-trait = "0.1"
futures-core = "0.3"
tokio-stream = { version = "0.1", optional = true }

# Candle ML framework (optional)
candle-core = { version = "0.8", optional = true }
candle-nn = { version = "0.8", optional = true }
candle-transformers = { version = "0.8", optional = true }

# Tokenizers (optional)
tokenizers = { version = "0.20", optional = true, default-features = false, features = ["onig"] }

# HuggingFace Hub for model downloads (optional)
hf-hub = { version = "0.3", optional = true, features = ["tokio"] }

# mistral-rs backend for high-performance inference (optional)
# NOTE: mistralrs crate is not yet on crates.io - use git dependency when available:
# mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs", optional = true }
# Or when published to crates.io, uncomment:
# mistralrs = { version = "0.4", optional = true, default-features = false }
# mistralrs-core = { version = "0.4", optional = true }

# Directories for cache
dirs = "5.0"

# Half-precision floating point
half = "2.4"

# Memory mapping for efficient large file access (optional)
memmap2 = { version = "0.9", optional = true }

# SHA256 hashing for model integrity verification
sha2 = "0.10"

# MD5 hashing for input hashing in semantic cache
md5 = "0.7"

# Metal GPU acceleration (macOS only)
[target.'cfg(target_os = "macos")'.dependencies]
# Pulled in by the `metal-compute` feature.
metal = { version = "0.29", optional = true }
objc = { version = "0.2", optional = true }

# Core ML bindings (macOS/iOS) - for Apple Neural Engine acceleration.
# Pulled in by the `coreml` feature.
# NOTE(review): this table is gated on target_os = "macos" only, so these
# crates are not resolved for iOS targets despite the comment above - confirm
# whether iOS support is intended.
objc2 = { version = "0.6", optional = true }
objc2-foundation = { version = "0.3", optional = true, features = ["NSString", "NSError", "NSURL", "NSArray", "NSDictionary", "NSData"] }
objc2-core-ml = { version = "0.3", optional = true, features = ["MLModel", "MLModelConfiguration", "MLFeatureProvider", "MLFeatureValue", "MLMultiArray", "MLPredictionOptions", "MLModelDescription", "MLFeatureDescription", "MLDictionaryFeatureProvider", "MLModelError"] }
block2 = { version = "0.6", optional = true }

# Dependencies used only by tests, examples, and benchmarks.
[dev-dependencies]
criterion = { workspace = true }
tempfile = "3.13"
tracing-subscriber = { workspace = true }

[features]
# Default includes candle for working inference out of the box
default = ["async-runtime", "candle"]
# The entries below name the optional dependencies directly (no `dep:` prefix),
# i.e. they enable Cargo's implicit `tokio` / `tokio-stream` features.
async-runtime = ["tokio", "tokio-stream"]

# Minimal build without inference (for embedding/library use only)
minimal = ["async-runtime"]
wasm = []
wasm-simd = []

# Ruvector integration features
attention = ["dep:ruvector-attention"]
graph = ["dep:ruvector-graph"]
gnn = ["dep:ruvector-gnn"]

# Full Ruvector integration (all optional crates)
ruvector-full = ["attention", "graph", "gnn"]

# Multi-threaded GEMM/GEMV with rayon (4-6x speedup on M4 Pro 10-core)
parallel = ["dep:rayon"]

# Candle backend for LLM inference (Rust-native, Metal acceleration on Mac).
# Like `async-runtime`, this enables the implicit per-dependency features.
candle = ["candle-core", "candle-nn", "candle-transformers", "tokenizers", "hf-hub"]

# Metal acceleration for Apple Silicon (M1/M2/M3/M4) via Candle.
# This feature shares its name with the optional `metal` crate; that is valid
# only because the crate is referenced as `dep:metal` (see `metal-compute`),
# which suppresses the implicit `metal` feature Cargo would otherwise create.
metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]

# Native Metal compute shaders (low-level, M4 Pro optimized)
metal-compute = ["dep:metal", "dep:objc"]

# CUDA acceleration for NVIDIA GPUs
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]

# Full inference backend with Metal (recommended for Mac)
inference-metal = ["candle", "metal"]

# Full Metal compute with native shaders (best performance on M4 Pro)
inference-metal-native = ["candle", "metal", "metal-compute"]

# Full inference backend with CUDA (recommended for NVIDIA)
inference-cuda = ["candle", "cuda"]

# Memory-mapped file access for efficient GGUF loading
mmap = ["dep:memmap2"]

# GGUF support with memory mapping (recommended for large models)
gguf-mmap = ["mmap"]

# Apple Accelerate framework for BLAS operations (macOS only, ~2x GEMV speedup).
# Enables no dependencies of its own; it only acts as a cfg switch.
accelerate = []

# Apple Neural Engine via Core ML (macOS/iOS, optimal for small models and batch inference)
# Provides 38 TOPS dedicated ML acceleration with 3-4x better power efficiency
coreml = ["dep:objc2", "dep:objc2-foundation", "dep:objc2-core-ml", "dep:block2"]

# Hybrid GPU+ANE pipeline (use ANE for MLP, GPU for attention)
hybrid-ane = ["metal-compute", "coreml"]

# mistral-rs backend feature (enables full mistral-rs integration)
# NOTE: Uncomment when mistralrs crate is available
# mistral-rs = ["dep:mistralrs", "dep:mistralrs-core", "tokenizers"]
# mistral-rs-metal = ["mistral-rs", "mistralrs/metal"]
# mistral-rs-cuda = ["mistral-rs", "mistralrs/cuda"]

[lib]
# Produce only a Rust library (`rlib`) artifact.
crate-type = ["rlib"]

# Benchmark configurations.
# `harness = false` turns off the built-in libtest bench harness so each
# benchmark provides its own entry point (presumably via the `criterion`
# dev-dependency - confirm against the bench sources).
[[bench]]
name = "attention_bench"
harness = false

[[bench]]
name = "rope_bench"
harness = false

[[bench]]
name = "norm_bench"
harness = false

[[bench]]
name = "matmul_bench"
harness = false

[[bench]]
name = "lora_bench"
harness = false

[[bench]]
name = "e2e_bench"
harness = false

[[bench]]
name = "metal_bench"
harness = false

[[bench]]
name = "serving_bench"
harness = false

[[bench]]
name = "ane_bench"
harness = false

[[bench]]
name = "ruvltra_benchmark"
harness = false

# Test configurations: explicitly declared integration test targets.
[[test]]
name = "real_model_test"
path = "tests/real_model_test.rs"

# Example binaries
[[example]]
name = "download_test_model"
path = "examples/download_test_model.rs"

[[example]]
name = "hub_cli"
path = "examples/hub_cli.rs"

[[example]]
name = "benchmark_model"
path = "examples/benchmark_model.rs"

[[example]]
name = "run_eval"
path = "examples/run_eval.rs"
# Cargo skips this example unless the `async-runtime` feature is enabled.
required-features = ["async-runtime"]