Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

2026-02-28 14:39:40 -05:00
parent 7885bf6278 d803bfe2b1
commit cd5943df23
7854 changed files with 3522914 additions and 0 deletions
--- a/vendor/ruvector/scripts/benchmark/run_benchmarks.sh
+++ b/vendor/ruvector/scripts/benchmark/run_benchmarks.sh
@@ -0,0 +1,292 @@
+#!/usr/bin/env bash
+#
+# RuVector Comprehensive Benchmark Runner
+# =======================================
+#
+# This script runs all benchmarks and outputs results in JSON format
+# suitable for CI/CD tracking and historical comparison.
+#
+# Usage:
+#   ./scripts/run_benchmarks.sh              # Run all benchmarks
+#   ./scripts/run_benchmarks.sh --quick      # Quick mode (reduced iterations)
+#   ./scripts/run_benchmarks.sh --json       # Output JSON only
+#   ./scripts/run_benchmarks.sh --help       # Show help
+#
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+OUTPUT_DIR="${PROJECT_ROOT}/bench_results"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+JSON_OUTPUT="${OUTPUT_DIR}/benchmark_${TIMESTAMP}.json"
+
+# Default settings
+QUICK_MODE=false
+JSON_ONLY=false
+VECTORS=10000
+QUERIES=100
+DIMENSIONS=384
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --quick)
+            QUICK_MODE=true
+            VECTORS=1000
+            QUERIES=50
+            shift
+            ;;
+        --json)
+            JSON_ONLY=true
+            shift
+            ;;
+        --help|-h)
+            echo "RuVector Benchmark Runner"
+            echo ""
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --quick    Run with reduced iterations for faster results"
+            echo "  --json     Output JSON only (suppress console output)"
+            echo "  --help     Show this help message"
+            echo ""
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Logging functions
+log_info() {
+    if [ "$JSON_ONLY" = false ]; then
+        echo -e "${BLUE}[INFO]${NC} $1"
+    fi
+}
+
+log_success() {
+    if [ "$JSON_ONLY" = false ]; then
+        echo -e "${GREEN}[SUCCESS]${NC} $1"
+    fi
+}
+
+log_warning() {
+    if [ "$JSON_ONLY" = false ]; then
+        echo -e "${YELLOW}[WARNING]${NC} $1"
+    fi
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+# Create output directory
+mkdir -p "${OUTPUT_DIR}"
+
+# Get system information
+get_system_info() {
+    local cpu_info=""
+    local memory=""
+    local os_version=""
+    local rust_version=""
+
+    # CPU info
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
+        memory=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f GB", $0/1024/1024/1024}')
+        os_version=$(sw_vers -productVersion 2>/dev/null || echo "Unknown")
+    elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
+        cpu_info=$(grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d':' -f2 | xargs || echo "Unknown")
+        memory=$(free -h 2>/dev/null | awk '/^Mem:/ {print $2}' || echo "Unknown")
+        os_version=$(cat /etc/os-release 2>/dev/null | grep -m1 VERSION= | cut -d'"' -f2 || echo "Unknown")
+    fi
+
+    rust_version=$(rustc --version 2>/dev/null | awk '{print $2}' || echo "Unknown")
+
+    cat << EOF
+{
+    "cpu": "${cpu_info}",
+    "memory": "${memory}",
+    "os": "${os_version}",
+    "rust_version": "${rust_version}",
+    "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
+    "quick_mode": ${QUICK_MODE}
+}
+EOF
+}
+
+# Run NEON SIMD benchmark
+run_neon_benchmark() {
+    log_info "Running NEON SIMD benchmark..."
+
+    local output
+    output=$(cd "${PROJECT_ROOT}" && cargo run --example neon_benchmark --release -p ruvector-core 2>&1 | tail -20)
+
+    # Parse results
+    local euclidean_simd euclidean_scalar euclidean_speedup
+    local dot_simd dot_scalar dot_speedup
+    local cosine_simd cosine_scalar cosine_speedup
+
+    euclidean_simd=$(echo "$output" | grep -A1 "Euclidean" | grep "SIMD:" | awk '{print $2}')
+    euclidean_scalar=$(echo "$output" | grep -A2 "Euclidean" | grep "Scalar:" | awk '{print $2}')
+    euclidean_speedup=$(echo "$output" | grep -A3 "Euclidean" | grep "Speedup:" | awk '{print $2}' | tr -d 'x')
+
+    dot_simd=$(echo "$output" | grep -A1 "Dot Product" | grep "SIMD:" | awk '{print $2}')
+    dot_scalar=$(echo "$output" | grep -A2 "Dot Product" | grep "Scalar:" | awk '{print $2}')
+    dot_speedup=$(echo "$output" | grep -A3 "Dot Product" | grep "Speedup:" | awk '{print $2}' | tr -d 'x')
+
+    cosine_simd=$(echo "$output" | grep -A1 "Cosine" | grep "SIMD:" | awk '{print $2}')
+    cosine_scalar=$(echo "$output" | grep -A2 "Cosine" | grep "Scalar:" | awk '{print $2}')
+    cosine_speedup=$(echo "$output" | grep -A3 "Cosine" | grep "Speedup:" | awk '{print $2}' | tr -d 'x')
+
+    cat << EOF
+{
+    "euclidean": {
+        "simd_ms": ${euclidean_simd:-0},
+        "scalar_ms": ${euclidean_scalar:-0},
+        "speedup": ${euclidean_speedup:-0}
+    },
+    "dot_product": {
+        "simd_ms": ${dot_simd:-0},
+        "scalar_ms": ${dot_scalar:-0},
+        "speedup": ${dot_speedup:-0}
+    },
+    "cosine": {
+        "simd_ms": ${cosine_simd:-0},
+        "scalar_ms": ${cosine_scalar:-0},
+        "speedup": ${cosine_speedup:-0}
+    }
+}
+EOF
+
+    log_success "NEON benchmark complete"
+}
+
+# Run Criterion benchmarks
+run_criterion_benchmarks() {
+    log_info "Running Criterion benchmarks..."
+
+    local bench_args=""
+    if [ "$QUICK_MODE" = true ]; then
+        bench_args="-- --quick"
+    fi
+
+    # Run distance metrics benchmark
+    cd "${PROJECT_ROOT}/crates/ruvector-core"
+    cargo bench --bench distance_metrics ${bench_args} 2>&1 | grep -E "time:" | head -20 > "${OUTPUT_DIR}/distance_metrics_raw.txt" || true
+
+    # Run HNSW search benchmark
+    cargo bench --bench hnsw_search ${bench_args} 2>&1 | grep -E "time:" | head -10 > "${OUTPUT_DIR}/hnsw_search_raw.txt" || true
+
+    # Run quantization benchmark
+    cargo bench --bench quantization_bench ${bench_args} 2>&1 | grep -E "time:" | head -20 > "${OUTPUT_DIR}/quantization_raw.txt" || true
+
+    log_success "Criterion benchmarks complete"
+
+    # Return placeholder JSON (real parsing would be more complex)
+    echo '{"criterion_complete": true}'
+}
+
+# Run comparison benchmark
+run_comparison_benchmark() {
+    log_info "Running comparison benchmark..."
+
+    cd "${PROJECT_ROOT}"
+    cargo run -p ruvector-bench --bin comparison-benchmark --release -- \
+        --num-vectors ${VECTORS} \
+        --queries ${QUERIES} \
+        --dimensions ${DIMENSIONS} \
+        --output "${OUTPUT_DIR}" 2>&1 | tail -10
+
+    # Read the generated JSON
+    if [ -f "${OUTPUT_DIR}/comparison_benchmark.json" ]; then
+        cat "${OUTPUT_DIR}/comparison_benchmark.json"
+    else
+        echo '{"error": "comparison benchmark output not found"}'
+    fi
+
+    log_success "Comparison benchmark complete"
+}
+
+# Main function
+main() {
+    log_info "=========================================="
+    log_info "RuVector Benchmark Suite"
+    log_info "=========================================="
+    log_info "Output directory: ${OUTPUT_DIR}"
+    log_info "Quick mode: ${QUICK_MODE}"
+    log_info ""
+
+    # Collect system info
+    log_info "Collecting system information..."
+    local system_info
+    system_info=$(get_system_info)
+
+    # Run benchmarks
+    log_info ""
+    log_info "Starting benchmarks..."
+    log_info ""
+
+    local neon_results
+    neon_results=$(run_neon_benchmark)
+
+    local criterion_results
+    criterion_results=$(run_criterion_benchmarks)
+
+    local comparison_results
+    comparison_results=$(run_comparison_benchmark)
+
+    # Combine all results into final JSON
+    local final_json
+    final_json=$(cat << EOF
+{
+    "system_info": ${system_info},
+    "neon_simd": ${neon_results},
+    "criterion": ${criterion_results},
+    "comparison": ${comparison_results},
+    "summary": {
+        "vectors_tested": ${VECTORS},
+        "queries_tested": ${QUERIES},
+        "dimensions": ${DIMENSIONS}
+    }
+}
+EOF
+)
+
+    # Save JSON output
+    echo "${final_json}" > "${JSON_OUTPUT}"
+    log_success "Benchmark results saved to: ${JSON_OUTPUT}"
+
+    # Output JSON if requested
+    if [ "$JSON_ONLY" = true ]; then
+        echo "${final_json}"
+    else
+        log_info ""
+        log_info "=========================================="
+        log_info "Benchmark Summary"
+        log_info "=========================================="
+        echo ""
+        echo "SIMD Speedups:"
+        echo "  Euclidean: $(echo "$neon_results" | grep -o '"speedup": [0-9.]*' | head -1 | awk '{print $2}')x"
+        echo "  Dot Product: $(echo "$neon_results" | grep -o '"speedup": [0-9.]*' | sed -n '2p' | awk '{print $2}')x"
+        echo "  Cosine: $(echo "$neon_results" | grep -o '"speedup": [0-9.]*' | tail -1 | awk '{print $2}')x"
+        echo ""
+        log_success "All benchmarks complete!"
+        log_info "Full results: ${JSON_OUTPUT}"
+        log_info "Markdown report: ${OUTPUT_DIR}/comparison_benchmark.md"
+    fi
+}
+
+# Run main
+main "$@"
--- a/vendor/ruvector/scripts/benchmark/run_llm_benchmarks.sh
+++ b/vendor/ruvector/scripts/benchmark/run_llm_benchmarks.sh
@@ -0,0 +1,378 @@
+#!/bin/bash
+#
+# RuvLLM Benchmark Runner for Mac M4 Pro
+#
+# This script runs all Criterion benchmarks for the RuvLLM crate,
+# generates JSON results, and compares against baseline performance.
+#
+# Performance Targets for M4 Pro:
+# - Flash attention (256 seq): <2ms
+# - RMSNorm (4096 dim): <10us
+# - GEMM (4096x4096): <5ms
+# - MicroLoRA forward: <1ms
+# - E2E inference: 100+ tokens/sec
+#
+# Usage:
+#   ./scripts/run_llm_benchmarks.sh [OPTIONS]
+#
+# Options:
+#   --quick         Run quick benchmarks only (reduced sample size)
+#   --save-baseline Save current results as baseline
+#   --compare       Compare against saved baseline
+#   --bench NAME    Run specific benchmark (attention, rope, norm, matmul, lora, e2e)
+#   --json          Output JSON results
+#   --html          Generate HTML report
+#   --all           Run all benchmarks (default)
+#   --help          Show this help message
+
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Script directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+RUVLLM_DIR="$PROJECT_ROOT/crates/ruvllm"
+RESULTS_DIR="$PROJECT_ROOT/target/criterion"
+BASELINE_DIR="$PROJECT_ROOT/target/benchmark-baseline"
+
+# Default options
+QUICK_MODE=false
+SAVE_BASELINE=false
+COMPARE_BASELINE=false
+OUTPUT_JSON=false
+OUTPUT_HTML=false
+BENCH_NAME=""
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --quick)
+            QUICK_MODE=true
+            shift
+            ;;
+        --save-baseline)
+            SAVE_BASELINE=true
+            shift
+            ;;
+        --compare)
+            COMPARE_BASELINE=true
+            shift
+            ;;
+        --bench)
+            BENCH_NAME="$2"
+            shift 2
+            ;;
+        --json)
+            OUTPUT_JSON=true
+            shift
+            ;;
+        --html)
+            OUTPUT_HTML=true
+            shift
+            ;;
+        --all)
+            BENCH_NAME=""
+            shift
+            ;;
+        --help)
+            head -35 "$0" | tail -30
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Function to print section headers
+print_header() {
+    echo ""
+    echo -e "${BLUE}========================================${NC}"
+    echo -e "${BLUE}  $1${NC}"
+    echo -e "${BLUE}========================================${NC}"
+    echo ""
+}
+
+# Function to print system info
+print_system_info() {
+    print_header "System Information"
+
+    echo "Date: $(date)"
+    echo "Host: $(hostname)"
+    echo ""
+
+    # Detect Mac and chip
+    if [[ "$(uname)" == "Darwin" ]]; then
+        echo "Platform: macOS"
+        echo "macOS Version: $(sw_vers -productVersion)"
+
+        # Detect Apple Silicon chip
+        CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
+        echo "CPU: $CHIP"
+
+        # Check for M4 Pro specifically
+        if [[ "$CHIP" == *"M4 Pro"* ]]; then
+            echo -e "${GREEN}M4 Pro detected - optimal performance expected${NC}"
+        elif [[ "$CHIP" == *"M4"* ]]; then
+            echo -e "${YELLOW}M4 detected - good performance expected${NC}"
+        elif [[ "$CHIP" == *"M3"* ]] || [[ "$CHIP" == *"M2"* ]] || [[ "$CHIP" == *"M1"* ]]; then
+            echo -e "${YELLOW}Apple Silicon detected (not M4 Pro)${NC}"
+        fi
+
+        # Memory info
+        TOTAL_MEM=$(sysctl -n hw.memsize 2>/dev/null || echo "0")
+        TOTAL_MEM_GB=$((TOTAL_MEM / 1024 / 1024 / 1024))
+        echo "Total Memory: ${TOTAL_MEM_GB}GB"
+
+        # CPU cores
+        PERF_CORES=$(sysctl -n hw.perflevel0.physicalcpu 2>/dev/null || echo "N/A")
+        EFFI_CORES=$(sysctl -n hw.perflevel1.physicalcpu 2>/dev/null || echo "N/A")
+        echo "Performance Cores: $PERF_CORES"
+        echo "Efficiency Cores: $EFFI_CORES"
+
+    else
+        echo "Platform: $(uname -s)"
+        echo "Architecture: $(uname -m)"
+    fi
+
+    echo ""
+    echo "Rust Version: $(rustc --version)"
+    echo "Cargo Version: $(cargo --version)"
+}
+
+# Function to check prerequisites
+check_prerequisites() {
+    print_header "Checking Prerequisites"
+
+    # Check if we're in the right directory
+    if [[ ! -d "$RUVLLM_DIR" ]]; then
+        echo -e "${RED}Error: RuvLLM crate not found at $RUVLLM_DIR${NC}"
+        exit 1
+    fi
+
+    # Check for Cargo.toml
+    if [[ ! -f "$RUVLLM_DIR/Cargo.toml" ]]; then
+        echo -e "${RED}Error: Cargo.toml not found in $RUVLLM_DIR${NC}"
+        exit 1
+    fi
+
+    # Check for benchmark files
+    BENCH_DIR="$RUVLLM_DIR/benches"
+    if [[ ! -d "$BENCH_DIR" ]]; then
+        echo -e "${RED}Error: Benchmarks directory not found at $BENCH_DIR${NC}"
+        exit 1
+    fi
+
+    echo -e "${GREEN}Prerequisites OK${NC}"
+}
+
+# Function to build benchmarks
+build_benchmarks() {
+    print_header "Building Benchmarks"
+
+    cd "$RUVLLM_DIR"
+
+    echo "Building in release mode with optimizations..."
+    RUSTFLAGS="-C target-cpu=native" cargo build --release --benches 2>&1 || {
+        echo -e "${YELLOW}Warning: Some benchmarks may have failed to build${NC}"
+    }
+
+    echo -e "${GREEN}Build complete${NC}"
+}
+
+# Function to run a specific benchmark
+run_benchmark() {
+    local bench_name=$1
+    local extra_args=$2
+
+    echo ""
+    echo -e "${YELLOW}Running benchmark: $bench_name${NC}"
+    echo "-------------------------------------------"
+
+    cd "$RUVLLM_DIR"
+
+    local cmd="cargo bench --bench ${bench_name}_bench"
+
+    if [[ "$QUICK_MODE" == true ]]; then
+        cmd="$cmd -- --quick"
+    fi
+
+    if [[ "$COMPARE_BASELINE" == true ]] && [[ -d "$BASELINE_DIR" ]]; then
+        cmd="$cmd --baseline baseline"
+    fi
+
+    if [[ "$OUTPUT_JSON" == true ]]; then
+        cmd="$cmd --format json"
+    fi
+
+    if [[ -n "$extra_args" ]]; then
+        cmd="$cmd $extra_args"
+    fi
+
+    echo "Command: $cmd"
+    echo ""
+
+    # Run benchmark and capture output
+    RUSTFLAGS="-C target-cpu=native" $cmd 2>&1 || true
+}
+
+# Function to run all benchmarks
+run_all_benchmarks() {
+    print_header "Running All Benchmarks"
+
+    local benchmarks=("attention" "rope" "norm" "matmul" "lora" "e2e")
+
+    for bench in "${benchmarks[@]}"; do
+        run_benchmark "$bench"
+    done
+}
+
+# Function to save baseline
+save_baseline() {
+    print_header "Saving Baseline"
+
+    if [[ -d "$RESULTS_DIR" ]]; then
+        mkdir -p "$BASELINE_DIR"
+        cp -r "$RESULTS_DIR"/* "$BASELINE_DIR/"
+        echo -e "${GREEN}Baseline saved to $BASELINE_DIR${NC}"
+    else
+        echo -e "${RED}No results found to save as baseline${NC}"
+    fi
+}
+
+# Function to generate summary
+generate_summary() {
+    print_header "Performance Summary"
+
+    echo "Performance Targets for M4 Pro:"
+    echo "================================"
+    echo ""
+    echo "| Benchmark               | Target    | Status |"
+    echo "|-------------------------|-----------|--------|"
+    echo "| Flash attention (256)   | <2ms      | TBD    |"
+    echo "| RMSNorm (4096)          | <10us     | TBD    |"
+    echo "| GEMM (4096x4096)        | <5ms      | TBD    |"
+    echo "| MicroLoRA forward       | <1ms      | TBD    |"
+    echo "| E2E inference           | 100+ t/s  | TBD    |"
+    echo ""
+
+    # Try to extract actual results from Criterion output
+    if [[ -d "$RESULTS_DIR" ]]; then
+        echo "Results saved to: $RESULTS_DIR"
+        echo ""
+
+        # List benchmark directories
+        echo "Completed benchmarks:"
+        ls -1 "$RESULTS_DIR" 2>/dev/null | head -20 || echo "  (none found)"
+    fi
+}
+
+# Function to generate JSON output
+generate_json_output() {
+    if [[ "$OUTPUT_JSON" != true ]]; then
+        return
+    fi
+
+    print_header "Generating JSON Output"
+
+    local json_file="$PROJECT_ROOT/target/benchmark-results.json"
+
+    # Create JSON structure
+    cat > "$json_file" << EOF
+{
+    "timestamp": "$(date -Iseconds)",
+    "system": {
+        "platform": "$(uname -s)",
+        "arch": "$(uname -m)",
+        "cpu": "$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown')",
+        "memory_gb": $(($(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024))
+    },
+    "rust_version": "$(rustc --version | cut -d' ' -f2)",
+    "results_dir": "$RESULTS_DIR",
+    "benchmarks": {
+        "attention": {"status": "completed"},
+        "rope": {"status": "completed"},
+        "norm": {"status": "completed"},
+        "matmul": {"status": "completed"},
+        "lora": {"status": "completed"},
+        "e2e": {"status": "completed"}
+    },
+    "targets": {
+        "flash_attention_256_ms": 2.0,
+        "rms_norm_4096_us": 10.0,
+        "gemm_4096_ms": 5.0,
+        "micro_lora_forward_ms": 1.0,
+        "e2e_tokens_per_sec": 100
+    }
+}
+EOF
+
+    echo -e "${GREEN}JSON output saved to: $json_file${NC}"
+}
+
+# Function to generate HTML report
+generate_html_report() {
+    if [[ "$OUTPUT_HTML" != true ]]; then
+        return
+    fi
+
+    print_header "Generating HTML Report"
+
+    # Criterion generates HTML reports by default
+    local report_index="$RESULTS_DIR/report/index.html"
+
+    if [[ -f "$report_index" ]]; then
+        echo -e "${GREEN}HTML report available at: $report_index${NC}"
+
+        # Try to open in browser on macOS
+        if [[ "$(uname)" == "Darwin" ]]; then
+            echo "Opening report in browser..."
+            open "$report_index" 2>/dev/null || true
+        fi
+    else
+        echo -e "${YELLOW}HTML report not found. Run benchmarks first.${NC}"
+    fi
+}
+
+# Main execution
+main() {
+    print_system_info
+    check_prerequisites
+    build_benchmarks
+
+    if [[ -n "$BENCH_NAME" ]]; then
+        # Run specific benchmark
+        run_benchmark "$BENCH_NAME"
+    else
+        # Run all benchmarks
+        run_all_benchmarks
+    fi
+
+    if [[ "$SAVE_BASELINE" == true ]]; then
+        save_baseline
+    fi
+
+    generate_summary
+    generate_json_output
+    generate_html_report
+
+    print_header "Benchmark Run Complete"
+
+    echo "To view detailed results:"
+    echo "  open $RESULTS_DIR/report/index.html"
+    echo ""
+    echo "To compare with baseline:"
+    echo "  $0 --save-baseline  # First, save current as baseline"
+    echo "  # Make changes..."
+    echo "  $0 --compare        # Then compare new results"
+}
+
+# Run main
+main