Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,198 @@
#!/bin/bash
# Download the RuVector Mathpix ONNX models into ./models (GitHub releases
# as the primary source) and verify SHA256 checksums.
#
# -u and pipefail added alongside -e: unset variables become hard errors,
# and failures inside pipelines (e.g. sha256sum | cut) are not masked.
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}Downloading RuVector Mathpix ONNX Models${NC}"
echo ""
# Configuration
MODELS_DIR="models"
GITHUB_REPO="ruvnet/ruvector"
RELEASE_TAG="scipix-models-v1.0.0"
# Model configurations: file name -> primary (GitHub release) download URL
declare -A MODELS=(
  ["scipix_encoder.onnx"]="https://github.com/${GITHUB_REPO}/releases/download/${RELEASE_TAG}/scipix_encoder.onnx"
  ["scipix_decoder.onnx"]="https://github.com/${GITHUB_REPO}/releases/download/${RELEASE_TAG}/scipix_decoder.onnx"
  ["scipix_tokenizer.onnx"]="https://github.com/${GITHUB_REPO}/releases/download/${RELEASE_TAG}/scipix_tokenizer.onnx"
)
# SHA256 checksums (these should be updated with actual checksums)
declare -A CHECKSUMS=(
  ["scipix_encoder.onnx"]="SHA256_PLACEHOLDER"
  ["scipix_decoder.onnx"]="SHA256_PLACEHOLDER"
  ["scipix_tokenizer.onnx"]="SHA256_PLACEHOLDER"
)
# Create models directory
mkdir -p "${MODELS_DIR}"
# Function to download a file with progress.
# Arguments:
#   $1 - source URL
#   $2 - output file path
# Prefers curl, falls back to wget; exits the whole script if neither tool
# is installed.
download_file() {
  local url=$1
  local output=$2
  if command -v curl &> /dev/null; then
    # -f: fail on HTTP errors (4xx/5xx) instead of saving the server's
    # error page as if it were the model file; -L: follow redirects.
    curl -fL --progress-bar -o "${output}" "${url}"
  elif command -v wget &> /dev/null; then
    wget --show-progress -O "${output}" "${url}"
  else
    # Diagnostic goes to stderr so it survives stdout redirection.
    echo -e "${RED}Error: Neither curl nor wget is available. Please install one.${NC}" >&2
    exit 1
  fi
}
# Function to verify a file's SHA256 checksum.
# Arguments:
#   $1 - path to the file
#   $2 - expected SHA256 hex digest, or "SHA256_PLACEHOLDER" to skip
# Returns 0 on match (or when verification is skipped because no checksum
# or no SHA256 tool is available), 1 on mismatch.
verify_checksum() {
  local file=$1
  local expected=$2
  # Declared once at function top; assignment kept separate from `local`
  # so a failing hash command's status is not masked (SC2155).
  local actual
  if [ "${expected}" = "SHA256_PLACEHOLDER" ]; then
    echo -e "${YELLOW}Warning: No checksum available for ${file}. Skipping verification.${NC}"
    return 0
  fi
  if command -v sha256sum &> /dev/null; then
    actual=$(sha256sum "${file}" | cut -d' ' -f1)
  elif command -v shasum &> /dev/null; then
    actual=$(shasum -a 256 "${file}" | cut -d' ' -f1)
  else
    echo -e "${YELLOW}Warning: No SHA256 tool available. Skipping verification.${NC}"
    return 0
  fi
  if [ "${actual}" = "${expected}" ]; then
    echo -e "${GREEN}Checksum verified for ${file}${NC}"
    return 0
  else
    echo -e "${RED}Checksum mismatch for ${file}!${NC}"
    echo -e "${RED}Expected: ${expected}${NC}"
    echo -e "${RED}Got: ${actual}${NC}"
    return 1
  fi
}
# Download each model
# Strategy per model: reuse an existing verified local copy; otherwise try
# the GitHub release URL; then fall back to a Hugging Face mirror; if all
# fail, write <model>.README with manual download instructions instead.
for model in "${!MODELS[@]}"; do
output_path="${MODELS_DIR}/${model}"
# Check if model already exists
if [ -f "${output_path}" ]; then
echo -e "${YELLOW}${model} already exists. Verifying...${NC}"
if verify_checksum "${output_path}" "${CHECKSUMS[$model]}"; then
echo -e "${GREEN}${model} is valid. Skipping download.${NC}"
continue
else
# Stale/corrupt copy: delete so the re-download starts clean.
echo -e "${YELLOW}${model} verification failed. Re-downloading...${NC}"
rm -f "${output_path}"
fi
fi
echo -e "${BLUE}Downloading ${model}...${NC}"
# Try to download from GitHub releases
if download_file "${MODELS[$model]}" "${output_path}"; then
echo -e "${GREEN}Downloaded ${model}${NC}"
# Verify checksum
# Primary-source downloads are strict: a bad checksum removes the file
# and aborts the whole script.
if ! verify_checksum "${output_path}" "${CHECKSUMS[$model]}"; then
echo -e "${RED}Failed to verify ${model}. Removing file.${NC}"
rm -f "${output_path}"
exit 1
fi
else
echo -e "${YELLOW}Failed to download from releases. Trying alternative sources...${NC}"
# Alternative: Download from Hugging Face (if available)
HF_URL="https://huggingface.co/ruvnet/scipix-models/resolve/main/${model}"
if download_file "${HF_URL}" "${output_path}"; then
echo -e "${GREEN}Downloaded ${model} from Hugging Face${NC}"
# Mirror downloads are best-effort: `|| true` keeps a checksum mismatch
# from aborting under `set -e`.
verify_checksum "${output_path}" "${CHECKSUMS[$model]}" || true
else
echo -e "${RED}Failed to download ${model} from all sources${NC}"
# Create a placeholder file with instructions
# (unquoted heredoc delimiter: ${model}, URLs and checksums expand here)
cat > "${output_path}.README" << EOF
Model: ${model}
This model file could not be downloaded automatically.
Please download it manually from one of these sources:
1. GitHub Releases: ${MODELS[$model]}
2. Hugging Face: https://huggingface.co/ruvnet/scipix-models
After downloading, place the file at:
${output_path}
Expected SHA256 checksum: ${CHECKSUMS[$model]}
EOF
echo -e "${YELLOW}Created instructions at ${output_path}.README${NC}"
fi
fi
done
# Create model configuration file
# The heredoc delimiter is unquoted, so $(date -u ...) below expands when
# the file is written, stamping created_at with the current UTC time.
# NOTE(review): input_shape/output_dim/vocab_size values are asserted here
# without a visible source -- confirm they match the shipped ONNX models.
echo -e "${BLUE}Creating model configuration...${NC}"
cat > "${MODELS_DIR}/config.json" << EOF
{
"models": {
"encoder": {
"path": "scipix_encoder.onnx",
"type": "image_encoder",
"input_shape": [1, 3, 224, 224],
"output_dim": 768
},
"decoder": {
"path": "scipix_decoder.onnx",
"type": "sequence_decoder",
"vocab_size": 50000,
"max_length": 512
},
"tokenizer": {
"path": "scipix_tokenizer.onnx",
"type": "tokenizer",
"vocab_size": 50000
}
},
"version": "1.0.0",
"created_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
EOF
# Verify all models are present
# Final inventory: list each expected model with its size (or flag it as
# missing) and exit 1 if anything is absent.
echo ""
echo -e "${BLUE}Verifying model files...${NC}"
missing_models=0
for model in "${!MODELS[@]}"; do
  if [ -f "${MODELS_DIR}/${model}" ]; then
    size=$(du -h "${MODELS_DIR}/${model}" | cut -f1)
    echo -e "${GREEN}${model} (${size})${NC}"
  else
    echo -e "${RED}${model} (missing)${NC}"
    # Plain arithmetic assignment instead of ((missing_models++)):
    # post-increment evaluates to the old value 0 on the first miss, which
    # gives (( )) a non-zero exit status and kills the script under `set -e`
    # before the summary is ever printed.
    missing_models=$((missing_models + 1))
  fi
done
echo ""
if [ "${missing_models}" -eq 0 ]; then
  echo -e "${GREEN}====================================${NC}"
  echo -e "${GREEN}All models downloaded successfully!${NC}"
  echo -e "${GREEN}====================================${NC}"
  echo ""
  echo -e "${BLUE}Models are located in: ${MODELS_DIR}/${NC}"
  echo -e "${BLUE}Configuration file: ${MODELS_DIR}/config.json${NC}"
  exit 0
else
  echo -e "${YELLOW}====================================${NC}"
  echo -e "${YELLOW}Warning: ${missing_models} model(s) missing${NC}"
  echo -e "${YELLOW}====================================${NC}"
  echo ""
  echo -e "${YELLOW}Please check the .README files in ${MODELS_DIR}/ for manual download instructions.${NC}"
  exit 1
fi

View File

@@ -0,0 +1,240 @@
#!/bin/bash
# ruvector-scipix Benchmark Suite Runner
# Comprehensive performance benchmarking with baseline tracking and regression detection
#
# -u and pipefail added alongside -e: unset variables become hard errors
# and pipeline failures are not masked.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
BENCHMARK_DIR="$PROJECT_DIR/target/criterion"
# Baseline name results are saved under (override: BASELINE=<name>, empty
# string disables saving).
BASELINE="${BASELINE:-main}"
# Whether to point the user at criterion's HTML reports at the end.
GENERATE_HTML="${GENERATE_HTML:-true}"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}=====================================${NC}"
echo -e "${BLUE}ruvector-scipix Benchmark Suite${NC}"
echo -e "${BLUE}=====================================${NC}"
echo ""
# Check if running in project directory
if [ ! -f "$PROJECT_DIR/Cargo.toml" ]; then
  # Diagnostic to stderr so callers capturing stdout still see the error.
  echo -e "${RED}Error: Must run from scipix project directory${NC}" >&2
  exit 1
fi
# Function to run a single benchmark
# Executes one criterion benchmark target via `cargo bench`, saving the
# results under $BASELINE when a baseline name is configured.
#   $1 - benchmark target name (passed to --bench)
#   $2 - one-line description shown to the user
run_benchmark() {
  local target=$1
  local summary=$2
  echo -e "${GREEN}Running ${target}...${NC}"
  echo -e "${YELLOW}${summary}${NC}"
  cd "$PROJECT_DIR"
  if [[ -z "$BASELINE" ]]; then
    cargo bench --bench "$target"
  else
    cargo bench --bench "$target" -- --save-baseline "$BASELINE"
  fi
  echo ""
}
# Function to compare with baseline
# Re-runs one benchmark target and lets criterion report the delta against
# a previously saved baseline.
#   $1 - benchmark target name
#   $2 - name of the saved baseline to compare against
compare_baseline() {
  local target=$1
  local against=$2
  echo -e "${BLUE}Comparing ${target} with baseline ${against}...${NC}"
  cd "$PROJECT_DIR"
  cargo bench --bench "$target" -- --baseline "$against"
  echo ""
}
# Function to check for regressions
# Prints the performance targets and, when criterion has written results,
# where they were saved. Informational only: criterion output is not
# actually parsed yet, so there is no hard regression gate here.
check_regressions() {
  echo -e "${BLUE}Checking for performance regressions...${NC}"
  # Target metrics
  echo -e "${YELLOW}Performance Targets:${NC}"
  local target_line
  for target_line in \
    " - Single image OCR: <100ms P95" \
    " - Batch (16 images): <500ms" \
    " - Preprocessing: <20ms" \
    " - LaTeX generation: <5ms"; do
    echo "$target_line"
  done
  echo ""
  # Parse criterion output for regressions
  # In production, this would parse actual benchmark results
  if [ -d "$BENCHMARK_DIR" ]; then
    echo -e "${GREEN}Benchmark results saved to: ${BENCHMARK_DIR}${NC}"
  fi
}
# Function to generate HTML reports
# Points the user at criterion's self-generated HTML reports. No-op unless
# GENERATE_HTML is "true"; stays silent about reports when no results
# directory exists yet.
generate_reports() {
  [ "$GENERATE_HTML" = "true" ] || return 0
  echo -e "${BLUE}Generating HTML reports...${NC}"
  if [ -d "$BENCHMARK_DIR" ]; then
    # Criterion automatically generates HTML reports
    echo -e "${GREEN}HTML reports generated in ${BENCHMARK_DIR}${NC}"
    echo -e "${YELLOW}Open ${BENCHMARK_DIR}/report/index.html in your browser${NC}"
  fi
}
# Parse command line arguments
# $1 selects the mode (default "all"); $2 is only consumed by "compare".
MODE="${1:-all}"
COMPARE_BASELINE_NAME="${2:-}"
# Dispatch on the requested mode.
case "$MODE" in
"all")
echo -e "${YELLOW}Running all benchmarks...${NC}\n"
run_benchmark "ocr_latency" "OCR latency benchmarks (single, batch, cold vs warm)"
run_benchmark "preprocessing" "Image preprocessing benchmarks (transforms, pipeline)"
run_benchmark "latex_generation" "LaTeX generation benchmarks (AST, string building)"
run_benchmark "inference" "Model inference benchmarks (detection, recognition, math)"
run_benchmark "cache" "Cache benchmarks (embedding, similarity search)"
run_benchmark "api" "API benchmarks (parsing, serialization, middleware)"
run_benchmark "memory" "Memory benchmarks (peak usage, growth, fragmentation)"
check_regressions
generate_reports
;;
# Single-suite modes: run exactly one benchmark target.
"latency")
run_benchmark "ocr_latency" "OCR latency benchmarks"
;;
"preprocessing")
run_benchmark "preprocessing" "Image preprocessing benchmarks"
;;
"latex")
run_benchmark "latex_generation" "LaTeX generation benchmarks"
;;
"inference")
run_benchmark "inference" "Model inference benchmarks"
;;
"cache")
run_benchmark "cache" "Cache benchmarks"
;;
"api")
run_benchmark "api" "API benchmarks"
;;
"memory")
run_benchmark "memory" "Memory benchmarks"
;;
# Compare every suite against a baseline previously saved with
# BASELINE=<name>; requires the baseline name as $2.
"compare")
if [ -z "$COMPARE_BASELINE_NAME" ]; then
echo -e "${RED}Error: Baseline name required for comparison${NC}"
echo "Usage: $0 compare <baseline-name>"
exit 1
fi
echo -e "${YELLOW}Comparing all benchmarks with baseline: ${COMPARE_BASELINE_NAME}${NC}\n"
compare_baseline "ocr_latency" "$COMPARE_BASELINE_NAME"
compare_baseline "preprocessing" "$COMPARE_BASELINE_NAME"
compare_baseline "latex_generation" "$COMPARE_BASELINE_NAME"
compare_baseline "inference" "$COMPARE_BASELINE_NAME"
compare_baseline "cache" "$COMPARE_BASELINE_NAME"
compare_baseline "api" "$COMPARE_BASELINE_NAME"
compare_baseline "memory" "$COMPARE_BASELINE_NAME"
;;
"quick")
echo -e "${YELLOW}Running quick benchmark suite (reduced samples)...${NC}\n"
# NOTE(review): CARGO_BENCH_OPTS is exported here but nothing visible in
# this script reads it (run_benchmark does not append it to the cargo
# invocation), so --quick may never reach criterion -- confirm intent.
export CARGO_BENCH_OPTS="-- --quick"
run_benchmark "ocr_latency" "Quick OCR latency check"
run_benchmark "preprocessing" "Quick preprocessing check"
;;
"ci")
echo -e "${YELLOW}Running CI benchmark suite...${NC}\n"
# Run benchmarks with minimal samples for CI
# NOTE(review): same CARGO_BENCH_OPTS concern as "quick" above.
export CARGO_BENCH_OPTS="-- --sample-size 10"
run_benchmark "ocr_latency" "CI OCR latency"
run_benchmark "preprocessing" "CI preprocessing"
run_benchmark "latex_generation" "CI LaTeX generation"
# Check for major regressions only
check_regressions
;;
"help"|"--help"|"-h")
echo "Usage: $0 [MODE] [OPTIONS]"
echo ""
echo "Modes:"
echo " all Run all benchmarks (default)"
echo " latency Run OCR latency benchmarks only"
echo " preprocessing Run preprocessing benchmarks only"
echo " latex Run LaTeX generation benchmarks only"
echo " inference Run model inference benchmarks only"
echo " cache Run cache benchmarks only"
echo " api Run API benchmarks only"
echo " memory Run memory benchmarks only"
echo " compare <name> Compare with saved baseline"
echo " quick Run quick benchmark suite"
echo " ci Run CI benchmark suite"
echo " help Show this help message"
echo ""
echo "Environment Variables:"
echo " BASELINE=<name> Save results as baseline (default: main)"
echo " GENERATE_HTML=<bool> Generate HTML reports (default: true)"
echo ""
echo "Examples:"
echo " $0 all # Run all benchmarks"
echo " $0 latency # Run latency benchmarks only"
echo " BASELINE=v1.0 $0 all # Save as v1.0 baseline"
echo " $0 compare v1.0 # Compare with v1.0 baseline"
echo " $0 quick # Quick benchmark suite"
;;
# Unknown mode: fail loudly rather than silently defaulting.
*)
echo -e "${RED}Error: Unknown mode '$MODE'${NC}"
echo "Use '$0 help' for usage information"
exit 1
;;
esac
echo ""
# Closing banner plus pointers to where criterion wrote its output; reached
# by every mode that does not exit early.
echo -e "${GREEN}=====================================${NC}"
echo -e "${GREEN}Benchmarks Complete!${NC}"
echo -e "${GREEN}=====================================${NC}"
# Print summary
# Only shown once at least one benchmark run has created the results dir.
if [ -d "$BENCHMARK_DIR" ]; then
echo ""
echo -e "${YELLOW}Results Summary:${NC}"
echo -e " Benchmark data: ${BENCHMARK_DIR}"
if [ "$GENERATE_HTML" = "true" ]; then
echo -e " HTML reports: ${BENCHMARK_DIR}/report/index.html"
fi
if [ "$BASELINE" != "" ]; then
echo -e " Saved baseline: ${BASELINE}"
fi
fi
echo ""

View File

@@ -0,0 +1,207 @@
#!/bin/bash
# Set up the RuVector Mathpix development environment: Rust toolchain and
# components, cargo developer tools, the WASM target, project directories,
# git hooks, and an initial build/test run.
#
# -u and pipefail added alongside -e so unset variables and failures inside
# pipelines (e.g. the rustup installer pipe below) abort the setup instead
# of being silently swallowed.
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}Setting up RuVector Mathpix Development Environment${NC}"
echo ""
# Check if Rust is installed
if ! command -v rustc &> /dev/null; then
  echo -e "${RED}Rust is not installed. Installing Rust...${NC}"
  # Official rustup bootstrap (pipes a remote script to sh); with pipefail
  # a failed download now aborts rather than continuing with no toolchain.
  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
  # Quoted: $HOME may contain spaces (SC2086).
  source "$HOME/.cargo/env"
else
  echo -e "${GREEN}Rust is already installed: $(rustc --version)${NC}"
fi
# Update Rust toolchain
echo -e "${BLUE}Updating Rust toolchain...${NC}"
rustup update stable
rustup default stable
# Install required components
echo -e "${BLUE}Installing Rust components...${NC}"
rustup component add rustfmt clippy

# Install a cargo-distributed developer tool unless it is already on PATH.
# Deduplicates the nine identical check-then-install blocks this script
# previously repeated.
#   $1 - binary name checked with `command -v` (also used in messages)
#   $2 - crate name passed to `cargo install` (defaults to $1)
ensure_cargo_tool() {
  local bin=$1
  local crate=${2:-$1}
  if ! command -v "$bin" &> /dev/null; then
    echo -e "${YELLOW}Installing ${bin}...${NC}"
    cargo install "$crate"
  else
    echo -e "${GREEN}${bin} is already installed${NC}"
  fi
}

# Install development tools
echo -e "${BLUE}Installing development tools...${NC}"
ensure_cargo_tool cargo-tarpaulin              # code coverage
ensure_cargo_tool cargo-audit                  # security audit
ensure_cargo_tool cargo-deny                   # dependency checker
ensure_cargo_tool cargo-license                # license checker
# WASM tools: wasm-pack ships its own installer rather than cargo install
if ! command -v wasm-pack &> /dev/null; then
  echo -e "${YELLOW}Installing wasm-pack...${NC}"
  curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
else
  echo -e "${GREEN}wasm-pack is already installed${NC}"
fi
ensure_cargo_tool critcmp                      # benchmark comparison tool
ensure_cargo_tool cargo-watch                  # rebuild on change during dev
ensure_cargo_tool cargo-flamegraph flamegraph  # profiling (crate name differs)
ensure_cargo_tool cargo-bloat                  # binary size analysis
ensure_cargo_tool cargo-outdated               # outdated dependency checker
# Install WASM target
echo -e "${BLUE}Installing WASM target...${NC}"
rustup target add wasm32-unknown-unknown
# Install Node.js if not present (for WASM testing)
if ! command -v node &> /dev/null; then
  echo -e "${YELLOW}Node.js not found. Please install Node.js for WASM testing.${NC}"
  echo -e "${YELLOW}Visit: https://nodejs.org/${NC}"
else
  echo -e "${GREEN}Node.js is installed: $(node --version)${NC}"
fi
# Create necessary directories
echo -e "${BLUE}Creating project directories...${NC}"
# Single mkdir call instead of five; -p creates parents and is idempotent.
mkdir -p models benchmarks/results coverage docs .github/workflows
# Download test models
echo -e "${BLUE}Downloading test models...${NC}"
if [ -f "./scripts/download_models.sh" ]; then
  chmod +x ./scripts/download_models.sh
  ./scripts/download_models.sh
else
  echo -e "${YELLOW}Model download script not found. Skipping model download.${NC}"
fi
# Initialize git hooks (if in git repo)
# Writes a pre-commit hook that runs `cargo fmt --check`, clippy with
# warnings-as-errors, and the test suite, refusing the commit on any
# failure. The heredoc delimiter is quoted ('EOF'), so $? and the rest of
# the hook body are written literally, not expanded by this script.
if [ -d ".git" ]; then
echo -e "${BLUE}Setting up git hooks...${NC}"
# Pre-commit hook
cat > .git/hooks/pre-commit << 'EOF'
#!/bin/bash
echo "Running pre-commit checks..."
# Format check
cargo fmt --check
if [ $? -ne 0 ]; then
echo "Code formatting check failed. Run 'cargo fmt' to fix."
exit 1
fi
# Clippy
cargo clippy -- -D warnings
if [ $? -ne 0 ]; then
echo "Clippy check failed."
exit 1
fi
# Tests
cargo test
if [ $? -ne 0 ]; then
echo "Tests failed."
exit 1
fi
echo "Pre-commit checks passed!"
EOF
chmod +x .git/hooks/pre-commit
echo -e "${GREEN}Git hooks installed${NC}"
fi
# Build the project
# A failure in either the build or the tests aborts setup here via the
# `set -e` at the top of the script.
echo -e "${BLUE}Building project...${NC}"
cargo build
# Run tests
echo -e "${BLUE}Running tests...${NC}"
cargo test
echo ""
echo -e "${GREEN}====================================${NC}"
echo -e "${GREEN}Development environment setup complete!${NC}"
echo -e "${GREEN}====================================${NC}"
echo ""
# Quick-reference help for the repo's Makefile targets.
# NOTE(review): these targets are assumed to exist in the project Makefile,
# which is not visible from this script -- verify they stay in sync.
echo -e "${BLUE}Available commands:${NC}"
echo -e " ${GREEN}make help${NC} - Show all available make commands"
echo -e " ${GREEN}make build${NC} - Build the project"
echo -e " ${GREEN}make test${NC} - Run tests"
echo -e " ${GREEN}make bench${NC} - Run benchmarks"
echo -e " ${GREEN}make coverage${NC} - Generate coverage report"
echo -e " ${GREEN}make wasm${NC} - Build WASM package"
echo -e " ${GREEN}make watch${NC} - Watch for changes and rebuild"
echo ""
echo -e "${BLUE}Quick start:${NC}"
echo -e " 1. Run ${GREEN}make test${NC} to verify everything works"
echo -e " 2. Run ${GREEN}make bench${NC} to see baseline performance"
echo -e " 3. Run ${GREEN}make coverage${NC} to check test coverage"
echo ""
echo -e "${GREEN}Happy coding!${NC}"