Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,198 @@
#!/bin/bash
# Download the RuVector Mathpix ONNX models into ./models (GitHub releases
# as the primary source) and verify SHA256 checksums.
#
# -u and pipefail added alongside -e: unset variables become hard errors,
# and failures inside pipelines (e.g. sha256sum | cut) are not masked.
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}Downloading RuVector Mathpix ONNX Models${NC}"
echo ""
# Configuration
MODELS_DIR="models"
GITHUB_REPO="ruvnet/ruvector"
RELEASE_TAG="scipix-models-v1.0.0"
# Model configurations: file name -> primary (GitHub release) download URL
declare -A MODELS=(
  ["scipix_encoder.onnx"]="https://github.com/${GITHUB_REPO}/releases/download/${RELEASE_TAG}/scipix_encoder.onnx"
  ["scipix_decoder.onnx"]="https://github.com/${GITHUB_REPO}/releases/download/${RELEASE_TAG}/scipix_decoder.onnx"
  ["scipix_tokenizer.onnx"]="https://github.com/${GITHUB_REPO}/releases/download/${RELEASE_TAG}/scipix_tokenizer.onnx"
)
# SHA256 checksums (these should be updated with actual checksums)
declare -A CHECKSUMS=(
  ["scipix_encoder.onnx"]="SHA256_PLACEHOLDER"
  ["scipix_decoder.onnx"]="SHA256_PLACEHOLDER"
  ["scipix_tokenizer.onnx"]="SHA256_PLACEHOLDER"
)
# Create models directory
mkdir -p "${MODELS_DIR}"
# Function to download a file with progress.
# Arguments:
#   $1 - source URL
#   $2 - output file path
# Prefers curl, falls back to wget; exits the whole script if neither tool
# is installed.
download_file() {
  local url=$1
  local output=$2
  if command -v curl &> /dev/null; then
    # -f: fail on HTTP errors (4xx/5xx) instead of saving the server's
    # error page as if it were the model file; -L: follow redirects.
    curl -fL --progress-bar -o "${output}" "${url}"
  elif command -v wget &> /dev/null; then
    wget --show-progress -O "${output}" "${url}"
  else
    # Diagnostic goes to stderr so it survives stdout redirection.
    echo -e "${RED}Error: Neither curl nor wget is available. Please install one.${NC}" >&2
    exit 1
  fi
}
# Function to verify a file's SHA256 checksum.
# Arguments:
#   $1 - path to the file
#   $2 - expected SHA256 hex digest, or "SHA256_PLACEHOLDER" to skip
# Returns 0 on match (or when verification is skipped because no checksum
# or no SHA256 tool is available), 1 on mismatch.
verify_checksum() {
  local file=$1
  local expected=$2
  # Declared once at function top; assignment kept separate from `local`
  # so a failing hash command's status is not masked (SC2155).
  local actual
  if [ "${expected}" = "SHA256_PLACEHOLDER" ]; then
    echo -e "${YELLOW}Warning: No checksum available for ${file}. Skipping verification.${NC}"
    return 0
  fi
  if command -v sha256sum &> /dev/null; then
    actual=$(sha256sum "${file}" | cut -d' ' -f1)
  elif command -v shasum &> /dev/null; then
    actual=$(shasum -a 256 "${file}" | cut -d' ' -f1)
  else
    echo -e "${YELLOW}Warning: No SHA256 tool available. Skipping verification.${NC}"
    return 0
  fi
  if [ "${actual}" = "${expected}" ]; then
    echo -e "${GREEN}Checksum verified for ${file}${NC}"
    return 0
  else
    echo -e "${RED}Checksum mismatch for ${file}!${NC}"
    echo -e "${RED}Expected: ${expected}${NC}"
    echo -e "${RED}Got: ${actual}${NC}"
    return 1
  fi
}
# Download each model
# Strategy per model: reuse an existing verified local copy; otherwise try
# the GitHub release URL; then fall back to a Hugging Face mirror; if all
# fail, write <model>.README with manual download instructions instead.
for model in "${!MODELS[@]}"; do
output_path="${MODELS_DIR}/${model}"
# Check if model already exists
if [ -f "${output_path}" ]; then
echo -e "${YELLOW}${model} already exists. Verifying...${NC}"
if verify_checksum "${output_path}" "${CHECKSUMS[$model]}"; then
echo -e "${GREEN}${model} is valid. Skipping download.${NC}"
continue
else
# Stale/corrupt copy: delete so the re-download starts clean.
echo -e "${YELLOW}${model} verification failed. Re-downloading...${NC}"
rm -f "${output_path}"
fi
fi
echo -e "${BLUE}Downloading ${model}...${NC}"
# Try to download from GitHub releases
if download_file "${MODELS[$model]}" "${output_path}"; then
echo -e "${GREEN}Downloaded ${model}${NC}"
# Verify checksum
# Primary-source downloads are strict: a bad checksum removes the file
# and aborts the whole script.
if ! verify_checksum "${output_path}" "${CHECKSUMS[$model]}"; then
echo -e "${RED}Failed to verify ${model}. Removing file.${NC}"
rm -f "${output_path}"
exit 1
fi
else
echo -e "${YELLOW}Failed to download from releases. Trying alternative sources...${NC}"
# Alternative: Download from Hugging Face (if available)
HF_URL="https://huggingface.co/ruvnet/scipix-models/resolve/main/${model}"
if download_file "${HF_URL}" "${output_path}"; then
echo -e "${GREEN}Downloaded ${model} from Hugging Face${NC}"
# Mirror downloads are best-effort: `|| true` keeps a checksum mismatch
# from aborting under `set -e`.
verify_checksum "${output_path}" "${CHECKSUMS[$model]}" || true
else
echo -e "${RED}Failed to download ${model} from all sources${NC}"
# Create a placeholder file with instructions
# (unquoted heredoc delimiter: ${model}, URLs and checksums expand here)
cat > "${output_path}.README" << EOF
Model: ${model}
This model file could not be downloaded automatically.
Please download it manually from one of these sources:
1. GitHub Releases: ${MODELS[$model]}
2. Hugging Face: https://huggingface.co/ruvnet/scipix-models
After downloading, place the file at:
${output_path}
Expected SHA256 checksum: ${CHECKSUMS[$model]}
EOF
echo -e "${YELLOW}Created instructions at ${output_path}.README${NC}"
fi
fi
done
# Create model configuration file
# The heredoc delimiter is unquoted, so $(date -u ...) below expands when
# the file is written, stamping created_at with the current UTC time.
# NOTE(review): input_shape/output_dim/vocab_size values are asserted here
# without a visible source -- confirm they match the shipped ONNX models.
echo -e "${BLUE}Creating model configuration...${NC}"
cat > "${MODELS_DIR}/config.json" << EOF
{
"models": {
"encoder": {
"path": "scipix_encoder.onnx",
"type": "image_encoder",
"input_shape": [1, 3, 224, 224],
"output_dim": 768
},
"decoder": {
"path": "scipix_decoder.onnx",
"type": "sequence_decoder",
"vocab_size": 50000,
"max_length": 512
},
"tokenizer": {
"path": "scipix_tokenizer.onnx",
"type": "tokenizer",
"vocab_size": 50000
}
},
"version": "1.0.0",
"created_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
EOF
# Verify all models are present
# Final inventory: list each expected model with its size (or flag it as
# missing) and exit 1 if anything is absent.
echo ""
echo -e "${BLUE}Verifying model files...${NC}"
missing_models=0
for model in "${!MODELS[@]}"; do
  if [ -f "${MODELS_DIR}/${model}" ]; then
    size=$(du -h "${MODELS_DIR}/${model}" | cut -f1)
    echo -e "${GREEN}${model} (${size})${NC}"
  else
    echo -e "${RED}${model} (missing)${NC}"
    # Plain arithmetic assignment instead of ((missing_models++)):
    # post-increment evaluates to the old value 0 on the first miss, which
    # gives (( )) a non-zero exit status and kills the script under `set -e`
    # before the summary is ever printed.
    missing_models=$((missing_models + 1))
  fi
done
echo ""
if [ "${missing_models}" -eq 0 ]; then
  echo -e "${GREEN}====================================${NC}"
  echo -e "${GREEN}All models downloaded successfully!${NC}"
  echo -e "${GREEN}====================================${NC}"
  echo ""
  echo -e "${BLUE}Models are located in: ${MODELS_DIR}/${NC}"
  echo -e "${BLUE}Configuration file: ${MODELS_DIR}/config.json${NC}"
  exit 0
else
  echo -e "${YELLOW}====================================${NC}"
  echo -e "${YELLOW}Warning: ${missing_models} model(s) missing${NC}"
  echo -e "${YELLOW}====================================${NC}"
  echo ""
  echo -e "${YELLOW}Please check the .README files in ${MODELS_DIR}/ for manual download instructions.${NC}"
  exit 1
fi

View File

@@ -0,0 +1,240 @@
#!/bin/bash
# ruvector-scipix Benchmark Suite Runner
# Comprehensive performance benchmarking with baseline tracking and regression detection
#
# -u and pipefail added alongside -e: unset variables become hard errors
# and pipeline failures are not masked.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
BENCHMARK_DIR="$PROJECT_DIR/target/criterion"
# Baseline name results are saved under (override: BASELINE=<name>, empty
# string disables saving).
BASELINE="${BASELINE:-main}"
# Whether to point the user at criterion's HTML reports at the end.
GENERATE_HTML="${GENERATE_HTML:-true}"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}=====================================${NC}"
echo -e "${BLUE}ruvector-scipix Benchmark Suite${NC}"
echo -e "${BLUE}=====================================${NC}"
echo ""
# Check if running in project directory
if [ ! -f "$PROJECT_DIR/Cargo.toml" ]; then
  # Diagnostic to stderr so callers capturing stdout still see the error.
  echo -e "${RED}Error: Must run from scipix project directory${NC}" >&2
  exit 1
fi
# Function to run a single benchmark
# Executes one criterion benchmark target via `cargo bench`, saving the
# results under $BASELINE when a baseline name is configured.
#   $1 - benchmark target name (passed to --bench)
#   $2 - one-line description shown to the user
run_benchmark() {
  local target=$1
  local summary=$2
  echo -e "${GREEN}Running ${target}...${NC}"
  echo -e "${YELLOW}${summary}${NC}"
  cd "$PROJECT_DIR"
  if [[ -z "$BASELINE" ]]; then
    cargo bench --bench "$target"
  else
    cargo bench --bench "$target" -- --save-baseline "$BASELINE"
  fi
  echo ""
}
# Function to compare with baseline
# Re-runs one benchmark target and lets criterion report the delta against
# a previously saved baseline.
#   $1 - benchmark target name
#   $2 - name of the saved baseline to compare against
compare_baseline() {
  local target=$1
  local against=$2
  echo -e "${BLUE}Comparing ${target} with baseline ${against}...${NC}"
  cd "$PROJECT_DIR"
  cargo bench --bench "$target" -- --baseline "$against"
  echo ""
}
# Function to check for regressions
# Prints the performance targets and, when criterion has written results,
# where they were saved. Informational only: criterion output is not
# actually parsed yet, so there is no hard regression gate here.
check_regressions() {
  echo -e "${BLUE}Checking for performance regressions...${NC}"
  # Target metrics
  echo -e "${YELLOW}Performance Targets:${NC}"
  local target_line
  for target_line in \
    " - Single image OCR: <100ms P95" \
    " - Batch (16 images): <500ms" \
    " - Preprocessing: <20ms" \
    " - LaTeX generation: <5ms"; do
    echo "$target_line"
  done
  echo ""
  # Parse criterion output for regressions
  # In production, this would parse actual benchmark results
  if [ -d "$BENCHMARK_DIR" ]; then
    echo -e "${GREEN}Benchmark results saved to: ${BENCHMARK_DIR}${NC}"
  fi
}
# Function to generate HTML reports
# Points the user at criterion's self-generated HTML reports. No-op unless
# GENERATE_HTML is "true"; stays silent about reports when no results
# directory exists yet.
generate_reports() {
  [ "$GENERATE_HTML" = "true" ] || return 0
  echo -e "${BLUE}Generating HTML reports...${NC}"
  if [ -d "$BENCHMARK_DIR" ]; then
    # Criterion automatically generates HTML reports
    echo -e "${GREEN}HTML reports generated in ${BENCHMARK_DIR}${NC}"
    echo -e "${YELLOW}Open ${BENCHMARK_DIR}/report/index.html in your browser${NC}"
  fi
}
# Parse command line arguments
# $1 selects the mode (default "all"); $2 is only consumed by "compare".
MODE="${1:-all}"
COMPARE_BASELINE_NAME="${2:-}"
# Dispatch on the requested mode.
case "$MODE" in
"all")
echo -e "${YELLOW}Running all benchmarks...${NC}\n"
run_benchmark "ocr_latency" "OCR latency benchmarks (single, batch, cold vs warm)"
run_benchmark "preprocessing" "Image preprocessing benchmarks (transforms, pipeline)"
run_benchmark "latex_generation" "LaTeX generation benchmarks (AST, string building)"
run_benchmark "inference" "Model inference benchmarks (detection, recognition, math)"
run_benchmark "cache" "Cache benchmarks (embedding, similarity search)"
run_benchmark "api" "API benchmarks (parsing, serialization, middleware)"
run_benchmark "memory" "Memory benchmarks (peak usage, growth, fragmentation)"
check_regressions
generate_reports
;;
# Single-suite modes: run exactly one benchmark target.
"latency")
run_benchmark "ocr_latency" "OCR latency benchmarks"
;;
"preprocessing")
run_benchmark "preprocessing" "Image preprocessing benchmarks"
;;
"latex")
run_benchmark "latex_generation" "LaTeX generation benchmarks"
;;
"inference")
run_benchmark "inference" "Model inference benchmarks"
;;
"cache")
run_benchmark "cache" "Cache benchmarks"
;;
"api")
run_benchmark "api" "API benchmarks"
;;
"memory")
run_benchmark "memory" "Memory benchmarks"
;;
# Compare every suite against a baseline previously saved with
# BASELINE=<name>; requires the baseline name as $2.
"compare")
if [ -z "$COMPARE_BASELINE_NAME" ]; then
echo -e "${RED}Error: Baseline name required for comparison${NC}"
echo "Usage: $0 compare <baseline-name>"
exit 1
fi
echo -e "${YELLOW}Comparing all benchmarks with baseline: ${COMPARE_BASELINE_NAME}${NC}\n"
compare_baseline "ocr_latency" "$COMPARE_BASELINE_NAME"
compare_baseline "preprocessing" "$COMPARE_BASELINE_NAME"
compare_baseline "latex_generation" "$COMPARE_BASELINE_NAME"
compare_baseline "inference" "$COMPARE_BASELINE_NAME"
compare_baseline "cache" "$COMPARE_BASELINE_NAME"
compare_baseline "api" "$COMPARE_BASELINE_NAME"
compare_baseline "memory" "$COMPARE_BASELINE_NAME"
;;
"quick")
echo -e "${YELLOW}Running quick benchmark suite (reduced samples)...${NC}\n"
# NOTE(review): CARGO_BENCH_OPTS is exported here but nothing visible in
# this script reads it (run_benchmark does not append it to the cargo
# invocation), so --quick may never reach criterion -- confirm intent.
export CARGO_BENCH_OPTS="-- --quick"
run_benchmark "ocr_latency" "Quick OCR latency check"
run_benchmark "preprocessing" "Quick preprocessing check"
;;
"ci")
echo -e "${YELLOW}Running CI benchmark suite...${NC}\n"
# Run benchmarks with minimal samples for CI
# NOTE(review): same CARGO_BENCH_OPTS concern as "quick" above.
export CARGO_BENCH_OPTS="-- --sample-size 10"
run_benchmark "ocr_latency" "CI OCR latency"
run_benchmark "preprocessing" "CI preprocessing"
run_benchmark "latex_generation" "CI LaTeX generation"
# Check for major regressions only
check_regressions
;;
"help"|"--help"|"-h")
echo "Usage: $0 [MODE] [OPTIONS]"
echo ""
echo "Modes:"
echo " all Run all benchmarks (default)"
echo " latency Run OCR latency benchmarks only"
echo " preprocessing Run preprocessing benchmarks only"
echo " latex Run LaTeX generation benchmarks only"
echo " inference Run model inference benchmarks only"
echo " cache Run cache benchmarks only"
echo " api Run API benchmarks only"
echo " memory Run memory benchmarks only"
echo " compare <name> Compare with saved baseline"
echo " quick Run quick benchmark suite"
echo " ci Run CI benchmark suite"
echo " help Show this help message"
echo ""
echo "Environment Variables:"
echo " BASELINE=<name> Save results as baseline (default: main)"
echo " GENERATE_HTML=<bool> Generate HTML reports (default: true)"
echo ""
echo "Examples:"
echo " $0 all # Run all benchmarks"
echo " $0 latency # Run latency benchmarks only"
echo " BASELINE=v1.0 $0 all # Save as v1.0 baseline"
echo " $0 compare v1.0 # Compare with v1.0 baseline"
echo " $0 quick # Quick benchmark suite"
;;
# Unknown mode: fail loudly rather than silently defaulting.
*)
echo -e "${RED}Error: Unknown mode '$MODE'${NC}"
echo "Use '$0 help' for usage information"
exit 1
;;
esac
echo ""
# Closing banner plus pointers to where criterion wrote its output; reached
# by every mode that does not exit early.
echo -e "${GREEN}=====================================${NC}"
echo -e "${GREEN}Benchmarks Complete!${NC}"
echo -e "${GREEN}=====================================${NC}"
# Print summary
# Only shown once at least one benchmark run has created the results dir.
if [ -d "$BENCHMARK_DIR" ]; then
echo ""
echo -e "${YELLOW}Results Summary:${NC}"
echo -e " Benchmark data: ${BENCHMARK_DIR}"
if [ "$GENERATE_HTML" = "true" ]; then
echo -e " HTML reports: ${BENCHMARK_DIR}/report/index.html"
fi
if [ "$BASELINE" != "" ]; then
echo -e " Saved baseline: ${BASELINE}"
fi
fi
echo ""

View File

@@ -0,0 +1,207 @@
#!/bin/bash
# Set up the RuVector Mathpix development environment: Rust toolchain and
# components, cargo developer tools, the WASM target, project directories,
# git hooks, and an initial build/test run.
#
# -u and pipefail added alongside -e so unset variables and failures inside
# pipelines (e.g. the rustup installer pipe below) abort the setup instead
# of being silently swallowed.
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}Setting up RuVector Mathpix Development Environment${NC}"
echo ""
# Check if Rust is installed
if ! command -v rustc &> /dev/null; then
  echo -e "${RED}Rust is not installed. Installing Rust...${NC}"
  # Official rustup bootstrap (pipes a remote script to sh); with pipefail
  # a failed download now aborts rather than continuing with no toolchain.
  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
  # Quoted: $HOME may contain spaces (SC2086).
  source "$HOME/.cargo/env"
else
  echo -e "${GREEN}Rust is already installed: $(rustc --version)${NC}"
fi
# Update Rust toolchain
echo -e "${BLUE}Updating Rust toolchain...${NC}"
rustup update stable
rustup default stable
# Install required components
echo -e "${BLUE}Installing Rust components...${NC}"
rustup component add rustfmt clippy

# Install a cargo-distributed developer tool unless it is already on PATH.
# Deduplicates the nine identical check-then-install blocks this script
# previously repeated.
#   $1 - binary name checked with `command -v` (also used in messages)
#   $2 - crate name passed to `cargo install` (defaults to $1)
ensure_cargo_tool() {
  local bin=$1
  local crate=${2:-$1}
  if ! command -v "$bin" &> /dev/null; then
    echo -e "${YELLOW}Installing ${bin}...${NC}"
    cargo install "$crate"
  else
    echo -e "${GREEN}${bin} is already installed${NC}"
  fi
}

# Install development tools
echo -e "${BLUE}Installing development tools...${NC}"
ensure_cargo_tool cargo-tarpaulin              # code coverage
ensure_cargo_tool cargo-audit                  # security audit
ensure_cargo_tool cargo-deny                   # dependency checker
ensure_cargo_tool cargo-license                # license checker
# WASM tools: wasm-pack ships its own installer rather than cargo install
if ! command -v wasm-pack &> /dev/null; then
  echo -e "${YELLOW}Installing wasm-pack...${NC}"
  curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
else
  echo -e "${GREEN}wasm-pack is already installed${NC}"
fi
ensure_cargo_tool critcmp                      # benchmark comparison tool
ensure_cargo_tool cargo-watch                  # rebuild on change during dev
ensure_cargo_tool cargo-flamegraph flamegraph  # profiling (crate name differs)
ensure_cargo_tool cargo-bloat                  # binary size analysis
ensure_cargo_tool cargo-outdated               # outdated dependency checker
# Install WASM target
echo -e "${BLUE}Installing WASM target...${NC}"
rustup target add wasm32-unknown-unknown
# Install Node.js if not present (for WASM testing)
if ! command -v node &> /dev/null; then
  echo -e "${YELLOW}Node.js not found. Please install Node.js for WASM testing.${NC}"
  echo -e "${YELLOW}Visit: https://nodejs.org/${NC}"
else
  echo -e "${GREEN}Node.js is installed: $(node --version)${NC}"
fi
# Create necessary directories
echo -e "${BLUE}Creating project directories...${NC}"
# Single mkdir call instead of five; -p creates parents and is idempotent.
mkdir -p models benchmarks/results coverage docs .github/workflows
# Download test models
echo -e "${BLUE}Downloading test models...${NC}"
if [ -f "./scripts/download_models.sh" ]; then
  chmod +x ./scripts/download_models.sh
  ./scripts/download_models.sh
else
  echo -e "${YELLOW}Model download script not found. Skipping model download.${NC}"
fi
# Initialize git hooks (if in git repo)
# Writes a pre-commit hook that runs `cargo fmt --check`, clippy with
# warnings-as-errors, and the test suite, refusing the commit on any
# failure. The heredoc delimiter is quoted ('EOF'), so $? and the rest of
# the hook body are written literally, not expanded by this script.
if [ -d ".git" ]; then
echo -e "${BLUE}Setting up git hooks...${NC}"
# Pre-commit hook
cat > .git/hooks/pre-commit << 'EOF'
#!/bin/bash
echo "Running pre-commit checks..."
# Format check
cargo fmt --check
if [ $? -ne 0 ]; then
echo "Code formatting check failed. Run 'cargo fmt' to fix."
exit 1
fi
# Clippy
cargo clippy -- -D warnings
if [ $? -ne 0 ]; then
echo "Clippy check failed."
exit 1
fi
# Tests
cargo test
if [ $? -ne 0 ]; then
echo "Tests failed."
exit 1
fi
echo "Pre-commit checks passed!"
EOF
chmod +x .git/hooks/pre-commit
echo -e "${GREEN}Git hooks installed${NC}"
fi
# Build the project
# A failure in either the build or the tests aborts setup here via the
# `set -e` at the top of the script.
echo -e "${BLUE}Building project...${NC}"
cargo build
# Run tests
echo -e "${BLUE}Running tests...${NC}"
cargo test
echo ""
echo -e "${GREEN}====================================${NC}"
echo -e "${GREEN}Development environment setup complete!${NC}"
echo -e "${GREEN}====================================${NC}"
echo ""
# Quick-reference help for the repo's Makefile targets.
# NOTE(review): these targets are assumed to exist in the project Makefile,
# which is not visible from this script -- verify they stay in sync.
echo -e "${BLUE}Available commands:${NC}"
echo -e " ${GREEN}make help${NC} - Show all available make commands"
echo -e " ${GREEN}make build${NC} - Build the project"
echo -e " ${GREEN}make test${NC} - Run tests"
echo -e " ${GREEN}make bench${NC} - Run benchmarks"
echo -e " ${GREEN}make coverage${NC} - Generate coverage report"
echo -e " ${GREEN}make wasm${NC} - Build WASM package"
echo -e " ${GREEN}make watch${NC} - Watch for changes and rebuild"
echo ""
echo -e "${BLUE}Quick start:${NC}"
echo -e " 1. Run ${GREEN}make test${NC} to verify everything works"
echo -e " 2. Run ${GREEN}make bench${NC} to see baseline performance"
echo -e " 3. Run ${GREEN}make coverage${NC} to check test coverage"
echo ""
echo -e "${GREEN}Happy coding!${NC}"