Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
242
vendor/ruvector/crates/ruvllm/Cargo.toml
vendored
Normal file
242
vendor/ruvector/crates/ruvllm/Cargo.toml
vendored
Normal file
@@ -0,0 +1,242 @@
[package]
name = "ruvllm"
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
authors.workspace = true
repository.workspace = true
readme = "README.md"
description = "LLM serving runtime with Ruvector integration - Paged attention, KV cache, and SONA learning"
keywords = ["llm", "inference", "paged-attention", "kv-cache", "ruvector"]
categories = ["science", "algorithms"]

[dependencies]
# Ruvector integration
ruvector-core = { version = "2.0", path = "../ruvector-core", default-features = false, features = ["storage", "hnsw", "parallel", "simd"] }
ruvector-sona = { version = "0.1.6", path = "../sona", default-features = false, features = ["serde-support"] }

# Optional Ruvector crates for advanced features
ruvector-attention = { version = "2.0", path = "../ruvector-attention", optional = true }
ruvector-graph = { version = "2.0", path = "../ruvector-graph", optional = true, default-features = false }
ruvector-gnn = { version = "2.0", path = "../ruvector-gnn", optional = true }

# Serialization
serde = { workspace = true }
serde_json = { workspace = true }

# Error handling
thiserror = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }

# Performance
dashmap = { workspace = true }
parking_lot = { workspace = true }
once_cell = { workspace = true }
smallvec = "1.13"

# Time and UUID
chrono = { workspace = true, features = ["serde"] }
uuid = { workspace = true, features = ["v4", "serde"] }

# Math
ndarray = { workspace = true }
rand = { workspace = true }

# Pattern matching
regex = "1.10"

# Parallelism (optional)
rayon = { version = "1.10", optional = true }

# Serialization (binary) - needs to match workspace for ruvector-core compatibility
bincode = { workspace = true }

# Async (optional for non-WASM)
tokio = { workspace = true, optional = true }

# Async traits and streams
async-trait = "0.1"
futures-core = "0.3"
tokio-stream = { version = "0.1", optional = true }

# Candle ML framework (optional)
candle-core = { version = "0.8", optional = true }
candle-nn = { version = "0.8", optional = true }
candle-transformers = { version = "0.8", optional = true }

# Tokenizers
tokenizers = { version = "0.20", optional = true, default-features = false, features = ["onig"] }

# HuggingFace Hub for model downloads
hf-hub = { version = "0.3", optional = true, features = ["tokio"] }

# mistral-rs backend for high-performance inference (optional)
# NOTE: mistralrs crate is not yet on crates.io - use git dependency when available:
# mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs", optional = true }
# Or when published to crates.io, uncomment:
# mistralrs = { version = "0.4", optional = true, default-features = false }
# mistralrs-core = { version = "0.4", optional = true }

# Directories for cache
dirs = "5.0"

# Half-precision floating point
half = "2.4"

# Memory mapping for efficient large file access (optional)
memmap2 = { version = "0.9", optional = true }

# SHA256 hashing for model integrity verification
sha2 = "0.10"

# MD5 hashing for input hashing in semantic cache
md5 = "0.7"

# Metal GPU acceleration (macOS only)
[target.'cfg(target_os = "macos")'.dependencies]
metal = { version = "0.29", optional = true }
objc = { version = "0.2", optional = true }

# Core ML bindings (macOS/iOS) - for Apple Neural Engine acceleration
objc2 = { version = "0.6", optional = true }
objc2-foundation = { version = "0.3", optional = true, features = ["NSString", "NSError", "NSURL", "NSArray", "NSDictionary", "NSData"] }
objc2-core-ml = { version = "0.3", optional = true, features = ["MLModel", "MLModelConfiguration", "MLFeatureProvider", "MLFeatureValue", "MLMultiArray", "MLPredictionOptions", "MLModelDescription", "MLFeatureDescription", "MLDictionaryFeatureProvider", "MLModelError"] }
block2 = { version = "0.6", optional = true }

[dev-dependencies]
criterion = { workspace = true }
tempfile = "3.13"
tracing-subscriber = { workspace = true }

[features]
# Default includes candle for working inference out of the box
default = ["async-runtime", "candle"]
async-runtime = ["tokio", "tokio-stream"]

# Minimal build without inference (for embedding/library use only)
minimal = ["async-runtime"]
wasm = []
wasm-simd = []

# Ruvector integration features
attention = ["dep:ruvector-attention"]
graph = ["dep:ruvector-graph"]
gnn = ["dep:ruvector-gnn"]

# Full Ruvector integration (all optional crates)
ruvector-full = ["attention", "graph", "gnn"]

# Multi-threaded GEMM/GEMV with rayon (4-6x speedup on M4 Pro 10-core)
parallel = ["dep:rayon"]

# Candle backend for LLM inference (Rust-native, Metal acceleration on Mac)
candle = ["candle-core", "candle-nn", "candle-transformers", "tokenizers", "hf-hub"]

# Metal acceleration for Apple Silicon (M1/M2/M3/M4) via Candle
metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]

# Native Metal compute shaders (low-level, M4 Pro optimized)
# NOTE: the target-specific `metal` crate must be referenced via `dep:` here
# because the `metal` feature name above shadows the implicit dependency feature.
metal-compute = ["dep:metal", "dep:objc"]

# CUDA acceleration for NVIDIA GPUs
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]

# Full inference backend with Metal (recommended for Mac)
inference-metal = ["candle", "metal"]

# Full Metal compute with native shaders (best performance on M4 Pro)
inference-metal-native = ["candle", "metal", "metal-compute"]

# Full inference backend with CUDA (recommended for NVIDIA)
inference-cuda = ["candle", "cuda"]

# Memory-mapped file access for efficient GGUF loading
mmap = ["dep:memmap2"]

# GGUF support with memory mapping (recommended for large models)
gguf-mmap = ["mmap"]

# Apple Accelerate framework for BLAS operations (macOS only, ~2x GEMV speedup)
accelerate = []

# Apple Neural Engine via Core ML (macOS/iOS, optimal for small models and batch inference)
# Provides 38 TOPS dedicated ML acceleration with 3-4x better power efficiency
coreml = ["dep:objc2", "dep:objc2-foundation", "dep:objc2-core-ml", "dep:block2"]

# Hybrid GPU+ANE pipeline (use ANE for MLP, GPU for attention)
hybrid-ane = ["metal-compute", "coreml"]

# mistral-rs backend feature (enables full mistral-rs integration)
# NOTE: Uncomment when mistralrs crate is available
# mistral-rs = ["dep:mistralrs", "dep:mistralrs-core", "tokenizers"]
# mistral-rs-metal = ["mistral-rs", "mistralrs/metal"]
# mistral-rs-cuda = ["mistral-rs", "mistralrs/cuda"]

[lib]
crate-type = ["rlib"]

# Benchmark configurations
[[bench]]
name = "attention_bench"
harness = false

[[bench]]
name = "rope_bench"
harness = false

[[bench]]
name = "norm_bench"
harness = false

[[bench]]
name = "matmul_bench"
harness = false

[[bench]]
name = "lora_bench"
harness = false

[[bench]]
name = "e2e_bench"
harness = false

[[bench]]
name = "metal_bench"
harness = false

[[bench]]
name = "serving_bench"
harness = false

[[bench]]
name = "ane_bench"
harness = false

[[bench]]
name = "ruvltra_benchmark"
harness = false

# Test configurations
[[test]]
name = "real_model_test"
path = "tests/real_model_test.rs"

# Example binaries
[[example]]
name = "download_test_model"
path = "examples/download_test_model.rs"

[[example]]
name = "hub_cli"
path = "examples/hub_cli.rs"

[[example]]
name = "benchmark_model"
path = "examples/benchmark_model.rs"

[[example]]
name = "run_eval"
path = "examples/run_eval.rs"
required-features = ["async-runtime"]
Reference in New Issue
Block a user