Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

189
vendor/ruvector/scripts/README.md vendored Normal file
View File

@@ -0,0 +1,189 @@
# RuVector Automation Scripts
This directory contains automation scripts organized by purpose.
## 📁 Directory Structure
```
scripts/
├── README.md # This file
├── benchmark/ # Performance benchmarking
├── build/ # Build utilities
├── ci/ # CI/CD automation
├── deploy/ # Deployment scripts
├── patches/ # Patch files
├── publish/ # Package publishing
├── test/ # Testing scripts
└── validate/ # Validation & verification
```
## 🚀 Deployment
Scripts for deploying to production.
| Script | Description |
|--------|-------------|
| `deploy/deploy.sh` | Comprehensive deployment (crates.io + npm) |
| `deploy/test-deploy.sh` | Test deployment without publishing |
| `deploy/DEPLOYMENT.md` | Full deployment documentation |
| `deploy/DEPLOYMENT-QUICKSTART.md` | Quick deployment guide |
**Usage:**
```bash
# Full deployment
./scripts/deploy/deploy.sh
# Dry run
./scripts/deploy/deploy.sh --dry-run
# Test deployment
./scripts/deploy/test-deploy.sh
```
## 📦 Publishing
Scripts for publishing packages to registries.
| Script | Description |
|--------|-------------|
| `publish/publish-all.sh` | Publish all packages |
| `publish/publish-crates.sh` | Publish Rust crates to crates.io |
| `publish/publish-cli.sh` | Publish CLI package |
| `publish/publish-router-wasm.sh` | Publish router WASM package |
| `publish/check-and-publish-router-wasm.sh` | Check and publish router WASM |
**Usage:**
```bash
# Set credentials first
export CRATES_API_KEY="your-crates-io-token"
export NPM_TOKEN="your-npm-token"
# Publish all
./scripts/publish/publish-all.sh
# Publish crates only
./scripts/publish/publish-crates.sh
```
## 📊 Benchmarking
Performance benchmarking scripts.
| Script | Description |
|--------|-------------|
| `benchmark/run_benchmarks.sh` | Run core benchmarks |
| `benchmark/run_llm_benchmarks.sh` | Run LLM inference benchmarks |
**Usage:**
```bash
# Run core benchmarks
./scripts/benchmark/run_benchmarks.sh
# Run LLM benchmarks
./scripts/benchmark/run_llm_benchmarks.sh
```
## 🧪 Testing
Testing and validation scripts.
| Script | Description |
|--------|-------------|
| `test/test-wasm.mjs` | Test WASM bindings |
| `test/test-graph-cli.sh` | Test graph CLI commands |
| `test/test-all-graph-commands.sh` | Test all graph commands |
| `test/test-docker-package.sh` | Test Docker packaging |
**Usage:**
```bash
# Test WASM
node ./scripts/test/test-wasm.mjs
# Test graph CLI
./scripts/test/test-graph-cli.sh
```
## ✅ Validation
Package and build verification scripts.
| Script | Description |
|--------|-------------|
| `validate/validate-packages.sh` | Validate package configs |
| `validate/validate-packages-simple.sh` | Simple package validation |
| `validate/verify-paper-impl.sh` | Verify paper implementation |
| `validate/verify_hnsw_build.sh` | Verify HNSW build |
**Usage:**
```bash
# Validate packages
./scripts/validate/validate-packages.sh
# Verify HNSW
./scripts/validate/verify_hnsw_build.sh
```
## 🔄 CI/CD
Continuous integration scripts.
| Script | Description |
|--------|-------------|
| `ci/ci-sync-lockfile.sh` | Auto-fix lock files in CI |
| `ci/sync-lockfile.sh` | Sync package-lock.json |
| `ci/install-hooks.sh` | Install git hooks |
**Usage:**
```bash
# Install git hooks (recommended)
./scripts/ci/install-hooks.sh
# Sync lockfile
./scripts/ci/sync-lockfile.sh
```
## 🛠️ Build
Build utility scripts located in `build/`.
## 🩹 Patches
Patch files for dependencies located in `patches/`.
## 🚀 Quick Start
### For Development
1. **Install git hooks** (recommended):
```bash
./scripts/ci/install-hooks.sh
```
2. **Run tests**:
```bash
node ./scripts/test/test-wasm.mjs
```
### For Deployment
1. **Set credentials**:
```bash
export CRATES_API_KEY="your-crates-io-token"
export NPM_TOKEN="your-npm-token"
```
2. **Dry run first**:
```bash
./scripts/deploy/deploy.sh --dry-run
```
3. **Deploy**:
```bash
./scripts/deploy/deploy.sh
```
## 🔐 Security
**Never commit credentials!** Always use environment variables or `.env` file.
See [deploy/DEPLOYMENT.md](deploy/DEPLOYMENT.md) for security best practices.

View File

@@ -0,0 +1,292 @@
#!/usr/bin/env bash
#
# RuVector Comprehensive Benchmark Runner
# =======================================
#
# This script runs all benchmarks and outputs results in JSON format
# suitable for CI/CD tracking and historical comparison.
#
# Usage:
#   ./scripts/run_benchmarks.sh           # Run all benchmarks
#   ./scripts/run_benchmarks.sh --quick   # Quick mode (reduced iterations)
#   ./scripts/run_benchmarks.sh --json    # Output JSON only
#   ./scripts/run_benchmarks.sh --help    # Show help
#
# Fail fast: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# ANSI colors for console output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration: results are collected under <root>/bench_results with one
# timestamped JSON document per run.
# NOTE(review): PROJECT_ROOT assumes this script sits one directory below
# the repo root; the README places it in scripts/benchmark/, which would
# need a second ".." — confirm the installed location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
OUTPUT_DIR="${PROJECT_ROOT}/bench_results"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
JSON_OUTPUT="${OUTPUT_DIR}/benchmark_${TIMESTAMP}.json"
# Default settings (--quick lowers VECTORS/QUERIES below)
QUICK_MODE=false   # reduced workload when true
JSON_ONLY=false    # emit JSON only; suppress console logging
VECTORS=10000      # vectors inserted by the comparison benchmark
QUERIES=100        # search queries issued by the comparison benchmark
DIMENSIONS=384     # vector dimensionality
# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --quick)
            # Smaller workload for fast CI feedback
            QUICK_MODE=true
            VECTORS=1000
            QUERIES=50
            shift
            ;;
        --json)
            JSON_ONLY=true
            shift
            ;;
        --help|-h)
            echo "RuVector Benchmark Runner"
            echo ""
            echo "Usage: $0 [OPTIONS]"
            echo ""
            echo "Options:"
            echo " --quick Run with reduced iterations for faster results"
            echo " --json Output JSON only (suppress console output)"
            echo " --help Show this help message"
            echo ""
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done
# Logging helpers
#
# All console logging is written to stderr. This matters because main()
# captures the run_* functions' stdout with $(...) to collect their JSON
# fragments; logging to stdout would be captured as well and corrupt the
# final JSON document. log_error always prints, even in --json mode.
log_info() {
    if [ "$JSON_ONLY" = false ]; then
        echo -e "${BLUE}[INFO]${NC} $1" >&2
    fi
}
log_success() {
    if [ "$JSON_ONLY" = false ]; then
        echo -e "${GREEN}[SUCCESS]${NC} $1" >&2
    fi
}
log_warning() {
    if [ "$JSON_ONLY" = false ]; then
        echo -e "${YELLOW}[WARNING]${NC} $1" >&2
    fi
}
log_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Create output directory
mkdir -p "${OUTPUT_DIR}"
# Get system information
#
# Probes the host (macOS via sysctl/sw_vers, Linux via /proc and
# /etc/os-release) and prints a single JSON object on stdout with CPU,
# memory, OS and rustc versions plus a UTC timestamp. Every probe falls
# back to "Unknown" (or empty) rather than failing, so this never aborts
# the run. Other OSTYPE values leave the platform fields empty.
get_system_info() {
    local cpu_info=""
    local memory=""
    local os_version=""
    local rust_version=""
    # CPU info (platform-specific)
    if [[ "$OSTYPE" == "darwin"* ]]; then
        cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
        memory=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f GB", $0/1024/1024/1024}')
        os_version=$(sw_vers -productVersion 2>/dev/null || echo "Unknown")
    elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
        cpu_info=$(grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d':' -f2 | xargs || echo "Unknown")
        memory=$(free -h 2>/dev/null | awk '/^Mem:/ {print $2}' || echo "Unknown")
        os_version=$(cat /etc/os-release 2>/dev/null | grep -m1 VERSION= | cut -d'"' -f2 || echo "Unknown")
    fi
    rust_version=$(rustc --version 2>/dev/null | awk '{print $2}' || echo "Unknown")
    # Emit JSON on stdout; callers capture it with $(...)
    cat << EOF
{
"cpu": "${cpu_info}",
"memory": "${memory}",
"os": "${os_version}",
"rust_version": "${rust_version}",
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"quick_mode": ${QUICK_MODE}
}
EOF
}
# Run NEON SIMD benchmark
#
# Runs the neon_benchmark example from ruvector-core and scrapes the last
# 20 lines of its console output for SIMD/scalar timings and speedups of
# the Euclidean, dot-product and cosine kernels, then prints them as a
# JSON object on stdout. Values the grep/awk pipeline cannot find default
# to 0 so the JSON stays numerically valid.
#
# NOTE(review): the parsing is tightly coupled to the example's exact
# output format ("SIMD:", "Scalar:", "Speedup:" within 3 lines of each
# metric heading) — revisit if that output ever changes.
run_neon_benchmark() {
    log_info "Running NEON SIMD benchmark..."
    local output
    output=$(cd "${PROJECT_ROOT}" && cargo run --example neon_benchmark --release -p ruvector-core 2>&1 | tail -20)
    # Parse results
    local euclidean_simd euclidean_scalar euclidean_speedup
    local dot_simd dot_scalar dot_speedup
    local cosine_simd cosine_scalar cosine_speedup
    euclidean_simd=$(echo "$output" | grep -A1 "Euclidean" | grep "SIMD:" | awk '{print $2}')
    euclidean_scalar=$(echo "$output" | grep -A2 "Euclidean" | grep "Scalar:" | awk '{print $2}')
    euclidean_speedup=$(echo "$output" | grep -A3 "Euclidean" | grep "Speedup:" | awk '{print $2}' | tr -d 'x')
    dot_simd=$(echo "$output" | grep -A1 "Dot Product" | grep "SIMD:" | awk '{print $2}')
    dot_scalar=$(echo "$output" | grep -A2 "Dot Product" | grep "Scalar:" | awk '{print $2}')
    dot_speedup=$(echo "$output" | grep -A3 "Dot Product" | grep "Speedup:" | awk '{print $2}' | tr -d 'x')
    cosine_simd=$(echo "$output" | grep -A1 "Cosine" | grep "SIMD:" | awk '{print $2}')
    cosine_scalar=$(echo "$output" | grep -A2 "Cosine" | grep "Scalar:" | awk '{print $2}')
    cosine_speedup=$(echo "$output" | grep -A3 "Cosine" | grep "Speedup:" | awk '{print $2}' | tr -d 'x')
    # Emit JSON; ${var:-0} guards against empty parse results.
    cat << EOF
{
"euclidean": {
"simd_ms": ${euclidean_simd:-0},
"scalar_ms": ${euclidean_scalar:-0},
"speedup": ${euclidean_speedup:-0}
},
"dot_product": {
"simd_ms": ${dot_simd:-0},
"scalar_ms": ${dot_scalar:-0},
"speedup": ${dot_speedup:-0}
},
"cosine": {
"simd_ms": ${cosine_simd:-0},
"scalar_ms": ${cosine_scalar:-0},
"speedup": ${cosine_speedup:-0}
}
}
EOF
    # NOTE(review): this success message is written by log_success; if the
    # log helpers write to stdout it is captured together with the JSON by
    # callers using $(...) — confirm logging goes to stderr.
    log_success "NEON benchmark complete"
}
# Run Criterion benchmarks
#
# Runs the distance-metrics, HNSW-search and quantization benches and
# saves their raw "time:" lines under ${OUTPUT_DIR} for later inspection.
# Each bench is allowed to fail (|| true) so one broken target does not
# abort the whole suite. Prints a placeholder JSON object on stdout.
run_criterion_benchmarks() {
    log_info "Running Criterion benchmarks..."
    # Build the optional harness arguments as an array instead of an
    # unquoted string, so the "--"/"--quick" pair is passed intact without
    # relying on implicit word-splitting.
    local -a bench_args=()
    if [ "$QUICK_MODE" = true ]; then
        bench_args=(-- --quick)
    fi
    # The ${arr[@]+...} form keeps `set -u` happy on bash < 4.4 when the
    # array is empty.
    cd "${PROJECT_ROOT}/crates/ruvector-core"
    # Run distance metrics benchmark
    cargo bench --bench distance_metrics ${bench_args[@]+"${bench_args[@]}"} 2>&1 | grep -E "time:" | head -20 > "${OUTPUT_DIR}/distance_metrics_raw.txt" || true
    # Run HNSW search benchmark
    cargo bench --bench hnsw_search ${bench_args[@]+"${bench_args[@]}"} 2>&1 | grep -E "time:" | head -10 > "${OUTPUT_DIR}/hnsw_search_raw.txt" || true
    # Run quantization benchmark
    cargo bench --bench quantization_bench ${bench_args[@]+"${bench_args[@]}"} 2>&1 | grep -E "time:" | head -20 > "${OUTPUT_DIR}/quantization_raw.txt" || true
    log_success "Criterion benchmarks complete"
    # Return placeholder JSON (real parsing would be more complex)
    echo '{"criterion_complete": true}'
}
# Run comparison benchmark
#
# Runs the ruvector-bench comparison binary with the configured workload
# and prints the JSON document it wrote to disk. Only the JSON goes to
# stdout: callers capture this function's output with $(...), so the
# cargo/tail progress chatter is routed to stderr instead of corrupting
# the captured document (the previous version wrote it to stdout).
run_comparison_benchmark() {
    log_info "Running comparison benchmark..."
    cd "${PROJECT_ROOT}"
    # Progress output -> stderr; stdout is reserved for the JSON below.
    cargo run -p ruvector-bench --bin comparison-benchmark --release -- \
        --num-vectors "${VECTORS}" \
        --queries "${QUERIES}" \
        --dimensions "${DIMENSIONS}" \
        --output "${OUTPUT_DIR}" 2>&1 | tail -10 >&2
    # Read the generated JSON
    if [ -f "${OUTPUT_DIR}/comparison_benchmark.json" ]; then
        cat "${OUTPUT_DIR}/comparison_benchmark.json"
    else
        echo '{"error": "comparison benchmark output not found"}'
    fi
    log_success "Comparison benchmark complete"
}
# Main function
#
# Orchestration: collect system info, run the three benchmark groups,
# merge their JSON fragments into one document, persist it to
# ${JSON_OUTPUT}, then print either the raw JSON (--json) or a summary.
#
# NOTE(review): the $(...) captures below assume each run_* function emits
# only JSON on stdout; any stray stdout (progress logs, cargo chatter)
# would end up inside the final document — confirm logging goes to stderr.
main() {
    log_info "=========================================="
    log_info "RuVector Benchmark Suite"
    log_info "=========================================="
    log_info "Output directory: ${OUTPUT_DIR}"
    log_info "Quick mode: ${QUICK_MODE}"
    log_info ""
    # Collect system info
    log_info "Collecting system information..."
    local system_info
    system_info=$(get_system_info)
    # Run benchmarks
    log_info ""
    log_info "Starting benchmarks..."
    log_info ""
    local neon_results
    neon_results=$(run_neon_benchmark)
    local criterion_results
    criterion_results=$(run_criterion_benchmarks)
    local comparison_results
    comparison_results=$(run_comparison_benchmark)
    # Combine all results into final JSON
    local final_json
    final_json=$(cat << EOF
{
"system_info": ${system_info},
"neon_simd": ${neon_results},
"criterion": ${criterion_results},
"comparison": ${comparison_results},
"summary": {
"vectors_tested": ${VECTORS},
"queries_tested": ${QUERIES},
"dimensions": ${DIMENSIONS}
}
}
EOF
)
    # Save JSON output
    echo "${final_json}" > "${JSON_OUTPUT}"
    log_success "Benchmark results saved to: ${JSON_OUTPUT}"
    # Output JSON if requested
    if [ "$JSON_ONLY" = true ]; then
        echo "${final_json}"
    else
        log_info ""
        log_info "=========================================="
        log_info "Benchmark Summary"
        log_info "=========================================="
        echo ""
        echo "SIMD Speedups:"
        # The three "speedup" fields occur in document order:
        # euclidean (first), dot product (second), cosine (last).
        echo " Euclidean: $(echo "$neon_results" | grep -o '"speedup": [0-9.]*' | head -1 | awk '{print $2}')x"
        echo " Dot Product: $(echo "$neon_results" | grep -o '"speedup": [0-9.]*' | sed -n '2p' | awk '{print $2}')x"
        echo " Cosine: $(echo "$neon_results" | grep -o '"speedup": [0-9.]*' | tail -1 | awk '{print $2}')x"
        echo ""
        log_success "All benchmarks complete!"
        log_info "Full results: ${JSON_OUTPUT}"
        log_info "Markdown report: ${OUTPUT_DIR}/comparison_benchmark.md"
    fi
}
# Run main
main "$@"

View File

@@ -0,0 +1,378 @@
#!/bin/bash
#
# RuvLLM Benchmark Runner for Mac M4 Pro
#
# This script runs all Criterion benchmarks for the RuvLLM crate,
# generates JSON results, and compares against baseline performance.
#
# Performance Targets for M4 Pro:
# - Flash attention (256 seq): <2ms
# - RMSNorm (4096 dim): <10us
# - GEMM (4096x4096): <5ms
# - MicroLoRA forward: <1ms
# - E2E inference: 100+ tokens/sec
#
# Usage:
# ./scripts/run_llm_benchmarks.sh [OPTIONS]
#
# Options:
# --quick Run quick benchmarks only (reduced sample size)
# --save-baseline Save current results as baseline
# --compare Compare against saved baseline
# --bench NAME Run specific benchmark (attention, rope, norm, matmul, lora, e2e)
# --json Output JSON results
# --html Generate HTML report
# --all Run all benchmarks (default)
# --help Show this help message
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# NOTE(review): PROJECT_ROOT assumes this script sits one directory below
# the repo root; the README places it in scripts/benchmark/, which would
# need a second dirname — confirm the installed location.
RUVLLM_DIR="$PROJECT_ROOT/crates/ruvllm"
RESULTS_DIR="$PROJECT_ROOT/target/criterion"
BASELINE_DIR="$PROJECT_ROOT/target/benchmark-baseline"
# Default options (flipped by the flags parsed below)
QUICK_MODE=false
SAVE_BASELINE=false
COMPARE_BASELINE=false
OUTPUT_JSON=false
OUTPUT_HTML=false
BENCH_NAME=""
# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --quick)
            QUICK_MODE=true
            shift
            ;;
        --save-baseline)
            SAVE_BASELINE=true
            shift
            ;;
        --compare)
            COMPARE_BASELINE=true
            shift
            ;;
        --bench)
            # Consumes a value: the benchmark short name
            BENCH_NAME="$2"
            shift 2
            ;;
        --json)
            OUTPUT_JSON=true
            shift
            ;;
        --html)
            OUTPUT_HTML=true
            shift
            ;;
        --all)
            # Empty BENCH_NAME means "run everything" (see main)
            BENCH_NAME=""
            shift
            ;;
        --help)
            # Prints the usage block from this file's header.
            # NOTE(review): head -35 | tail -30 is coupled to the header's
            # exact line count — re-check after editing the header above.
            head -35 "$0" | tail -30
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done
# Print a visually separated section banner to stdout.
# $1 - section title shown between the separator rules.
print_header() {
    local title=$1
    local rule="========================================"
    echo ""
    echo -e "${BLUE}${rule}${NC}"
    echo -e "${BLUE}  ${title}${NC}"
    echo -e "${BLUE}${rule}${NC}"
    echo ""
}
# Function to print system info
#
# Prints host details (platform, chip, memory and core counts on macOS)
# plus toolchain versions. macOS probing uses sysctl/sw_vers; each probe
# falls back to a placeholder rather than failing.
print_system_info() {
    print_header "System Information"
    echo "Date: $(date)"
    echo "Host: $(hostname)"
    echo ""
    # Detect Mac and chip
    if [[ "$(uname)" == "Darwin" ]]; then
        echo "Platform: macOS"
        echo "macOS Version: $(sw_vers -productVersion)"
        # Detect Apple Silicon chip
        CHIP=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
        echo "CPU: $CHIP"
        # Check for M4 Pro specifically (the tuning target of this suite)
        if [[ "$CHIP" == *"M4 Pro"* ]]; then
            echo -e "${GREEN}M4 Pro detected - optimal performance expected${NC}"
        elif [[ "$CHIP" == *"M4"* ]]; then
            echo -e "${YELLOW}M4 detected - good performance expected${NC}"
        elif [[ "$CHIP" == *"M3"* ]] || [[ "$CHIP" == *"M2"* ]] || [[ "$CHIP" == *"M1"* ]]; then
            echo -e "${YELLOW}Apple Silicon detected (not M4 Pro)${NC}"
        fi
        # Memory info (bytes -> whole GiB, integer division)
        TOTAL_MEM=$(sysctl -n hw.memsize 2>/dev/null || echo "0")
        TOTAL_MEM_GB=$((TOTAL_MEM / 1024 / 1024 / 1024))
        echo "Total Memory: ${TOTAL_MEM_GB}GB"
        # CPU cores (perflevel0 = performance, perflevel1 = efficiency)
        PERF_CORES=$(sysctl -n hw.perflevel0.physicalcpu 2>/dev/null || echo "N/A")
        EFFI_CORES=$(sysctl -n hw.perflevel1.physicalcpu 2>/dev/null || echo "N/A")
        echo "Performance Cores: $PERF_CORES"
        echo "Efficiency Cores: $EFFI_CORES"
    else
        echo "Platform: $(uname -s)"
        echo "Architecture: $(uname -m)"
    fi
    echo ""
    echo "Rust Version: $(rustc --version)"
    echo "Cargo Version: $(cargo --version)"
}
# Verify the RuvLLM crate layout exists before doing any work.
# Sets the global BENCH_DIR as a side effect and exits with status 1
# (after printing an error) on the first missing prerequisite.
check_prerequisites() {
    print_header "Checking Prerequisites"
    # Guard clauses: bail out on the first missing piece.
    [[ -d "$RUVLLM_DIR" ]] || {
        echo -e "${RED}Error: RuvLLM crate not found at $RUVLLM_DIR${NC}"
        exit 1
    }
    [[ -f "$RUVLLM_DIR/Cargo.toml" ]] || {
        echo -e "${RED}Error: Cargo.toml not found in $RUVLLM_DIR${NC}"
        exit 1
    }
    BENCH_DIR="$RUVLLM_DIR/benches"
    [[ -d "$BENCH_DIR" ]] || {
        echo -e "${RED}Error: Benchmarks directory not found at $BENCH_DIR${NC}"
        exit 1
    }
    echo -e "${GREEN}Prerequisites OK${NC}"
}
# Function to build benchmarks
#
# Pre-builds every bench target in release mode with -C target-cpu=native
# so individual runs don't pay compile time. Build failures are
# downgraded to a warning: benches that did build can still run.
build_benchmarks() {
    print_header "Building Benchmarks"
    cd "$RUVLLM_DIR"
    echo "Building in release mode with optimizations..."
    RUSTFLAGS="-C target-cpu=native" cargo build --release --benches 2>&1 || {
        echo -e "${YELLOW}Warning: Some benchmarks may have failed to build${NC}"
    }
    echo -e "${GREEN}Build complete${NC}"
}
# Run a single Criterion benchmark by short name.
#
# Arguments:
#   $1 - benchmark short name (e.g. "attention" -> bench target "attention_bench")
#   $2 - optional extra arguments appended to the harness argument list
# Globals read: RUVLLM_DIR, QUICK_MODE, COMPARE_BASELINE, BASELINE_DIR,
#               OUTPUT_JSON, YELLOW, NC
# Failures are tolerated (|| true) so one bench cannot abort the suite.
run_benchmark() {
    local bench_name=$1
    local extra_args=${2-}
    echo ""
    echo -e "${YELLOW}Running benchmark: $bench_name${NC}"
    echo "-------------------------------------------"
    cd "$RUVLLM_DIR"
    # Arguments before "--" go to cargo; arguments after "--" go to the
    # Criterion harness. --quick/--baseline/--format are harness flags, so
    # they must come after the separator — the previous string-built
    # command passed --baseline/--format directly to cargo whenever
    # --quick was off, which cargo rejects.
    local -a cmd=(cargo bench --bench "${bench_name}_bench")
    local -a harness_args=()
    if [[ "$QUICK_MODE" == true ]]; then
        harness_args+=(--quick)
    fi
    if [[ "$COMPARE_BASELINE" == true ]] && [[ -d "$BASELINE_DIR" ]]; then
        harness_args+=(--baseline baseline)
    fi
    if [[ "$OUTPUT_JSON" == true ]]; then
        harness_args+=(--format json)
    fi
    if [[ -n "$extra_args" ]]; then
        # extra_args is a whitespace-separated string by historical
        # contract; deliberate word-splitting preserves that behavior.
        # shellcheck disable=SC2206
        harness_args+=($extra_args)
    fi
    if [[ ${#harness_args[@]} -gt 0 ]]; then
        cmd+=(-- "${harness_args[@]}")
    fi
    echo "Command: ${cmd[*]}"
    echo ""
    # Run benchmark and capture output
    RUSTFLAGS="-C target-cpu=native" "${cmd[@]}" 2>&1 || true
}
# Run the full benchmark suite in a fixed, deterministic order.
run_all_benchmarks() {
    print_header "Running All Benchmarks"
    local suite="attention rope norm matmul lora e2e"
    local name
    for name in $suite; do
        run_benchmark "$name"
    done
}
# Snapshot the current Criterion results directory as the comparison
# baseline. Prints an error (but does not fail) when no results exist.
save_baseline() {
    print_header "Saving Baseline"
    if [[ ! -d "$RESULTS_DIR" ]]; then
        echo -e "${RED}No results found to save as baseline${NC}"
        return 0
    fi
    mkdir -p "$BASELINE_DIR"
    cp -r "$RESULTS_DIR"/* "$BASELINE_DIR/"
    echo -e "${GREEN}Baseline saved to $BASELINE_DIR${NC}"
}
# Function to generate summary
#
# Prints the static M4 Pro target table — the Status column is a "TBD"
# placeholder, not a value parsed from Criterion output — and then lists
# which benchmark result directories exist under $RESULTS_DIR.
generate_summary() {
    print_header "Performance Summary"
    echo "Performance Targets for M4 Pro:"
    echo "================================"
    echo ""
    echo "| Benchmark | Target | Status |"
    echo "|-------------------------|-----------|--------|"
    echo "| Flash attention (256) | <2ms | TBD |"
    echo "| RMSNorm (4096) | <10us | TBD |"
    echo "| GEMM (4096x4096) | <5ms | TBD |"
    echo "| MicroLoRA forward | <1ms | TBD |"
    echo "| E2E inference | 100+ t/s | TBD |"
    echo ""
    # Try to extract actual results from Criterion output
    if [[ -d "$RESULTS_DIR" ]]; then
        echo "Results saved to: $RESULTS_DIR"
        echo ""
        # List benchmark directories
        echo "Completed benchmarks:"
        ls -1 "$RESULTS_DIR" 2>/dev/null | head -20 || echo " (none found)"
    fi
}
# Function to generate JSON output
#
# Writes a machine-readable run manifest to target/benchmark-results.json
# when --json was requested; no-op otherwise.
# NOTE(review): the per-benchmark "status" values are hard-coded to
# "completed" rather than derived from the actual runs — treat this file
# as a run manifest, not as measured results.
generate_json_output() {
    if [[ "$OUTPUT_JSON" != true ]]; then
        return
    fi
    print_header "Generating JSON Output"
    local json_file="$PROJECT_ROOT/target/benchmark-results.json"
    # Create JSON structure (the heredoc is expanded: it embeds date,
    # uname, sysctl and rustc output at generation time)
    cat > "$json_file" << EOF
{
"timestamp": "$(date -Iseconds)",
"system": {
"platform": "$(uname -s)",
"arch": "$(uname -m)",
"cpu": "$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo 'Unknown')",
"memory_gb": $(($(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024))
},
"rust_version": "$(rustc --version | cut -d' ' -f2)",
"results_dir": "$RESULTS_DIR",
"benchmarks": {
"attention": {"status": "completed"},
"rope": {"status": "completed"},
"norm": {"status": "completed"},
"matmul": {"status": "completed"},
"lora": {"status": "completed"},
"e2e": {"status": "completed"}
},
"targets": {
"flash_attention_256_ms": 2.0,
"rms_norm_4096_us": 10.0,
"gemm_4096_ms": 5.0,
"micro_lora_forward_ms": 1.0,
"e2e_tokens_per_sec": 100
}
}
EOF
    echo -e "${GREEN}JSON output saved to: $json_file${NC}"
}
# Point the user at Criterion's self-generated HTML report and, on macOS,
# open it in the default browser. No-op unless --html was requested.
generate_html_report() {
    [[ "$OUTPUT_HTML" == true ]] || return 0
    print_header "Generating HTML Report"
    # Criterion generates HTML reports by default
    local index_page="$RESULTS_DIR/report/index.html"
    if [[ ! -f "$index_page" ]]; then
        echo -e "${YELLOW}HTML report not found. Run benchmarks first.${NC}"
        return 0
    fi
    echo -e "${GREEN}HTML report available at: $index_page${NC}"
    # Try to open in browser on macOS
    if [[ "$(uname)" == "Darwin" ]]; then
        echo "Opening report in browser..."
        open "$index_page" 2>/dev/null || true
    fi
}
# Main execution
#
# Order: system info -> prerequisite checks -> build -> run (one bench or
# all) -> optional baseline save -> summary -> optional JSON/HTML output.
main() {
    print_system_info
    check_prerequisites
    build_benchmarks
    if [[ -n "$BENCH_NAME" ]]; then
        # Run specific benchmark
        run_benchmark "$BENCH_NAME"
    else
        # Run all benchmarks
        run_all_benchmarks
    fi
    if [[ "$SAVE_BASELINE" == true ]]; then
        save_baseline
    fi
    generate_summary
    generate_json_output
    generate_html_report
    print_header "Benchmark Run Complete"
    echo "To view detailed results:"
    echo " open $RESULTS_DIR/report/index.html"
    echo ""
    echo "To compare with baseline:"
    echo " $0 --save-baseline # First, save current as baseline"
    echo " # Make changes..."
    echo " $0 --compare # Then compare new results"
}
# Run main (flags were already consumed by the parser above, so no
# arguments are forwarded here)
main

17
vendor/ruvector/scripts/build-solver.sh vendored Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
#
# Build the ruvector-solver crate natively and, when wasm-pack is
# available, its WASM bindings as well.
#
# Robustness: paths are resolved relative to this script's location so
# the build works from any working directory (previously it silently
# depended on being invoked from the repository root).
set -euo pipefail

# Repository root is the parent of the scripts/ directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "${SCRIPT_DIR}/.."

echo "Building ruvector-solver..."
# Native build
cargo build --release -p ruvector-solver
# WASM build (if wasm-pack available)
if command -v wasm-pack &> /dev/null; then
    echo "Building WASM..."
    cd crates/ruvector-solver-wasm
    wasm-pack build --target web --release
    cd ../..
fi
echo "Build complete!"

View File

@@ -0,0 +1,142 @@
#!/bin/bash
# Build NAPI-RS bindings for all platforms
# Usage: ./scripts/build/build-all-platforms.sh
#
# Attempts native + cross builds of the ruvector-node addon for five
# target triples and copies each produced library into the npm platform
# layout. Cross builds (macOS, Windows) are expected to fail without the
# matching toolchain and are reported as skipped, not failed.
set -e
# Resolve paths relative to this script (scripts/build/ -> repo root)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
NPM_PLATFORMS_DIR="$PROJECT_ROOT/npm/core/platforms"
NPM_NATIVE_DIR="$PROJECT_ROOT/npm/core/native"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo "=========================================="
echo " Ruvector NAPI-RS Multi-Platform Build"
echo "=========================================="
echo ""
# Ensure output directories exist before any copy
mkdir -p "$NPM_PLATFORMS_DIR"/{linux-x64-gnu,linux-arm64-gnu,darwin-x64,darwin-arm64,win32-x64-msvc}
mkdir -p "$NPM_NATIVE_DIR"/linux-x64
# Build the ruvector-node NAPI addon for one Rust target triple and copy
# the resulting shared library into the npm platform directory.
#
# Arguments:
#   $1 - Rust target triple (e.g. x86_64-unknown-linux-gnu)
#   $2 - npm platform directory name (e.g. linux-x64-gnu)
# Returns 0 on success, 1 when the build fails or the binary is missing.
build_target() {
    local target=$1
    local platform_dir=$2
    # Pick the platform-specific shared-library name cargo will produce.
    local binary_name
    case $target in
        *darwin*)
            binary_name="libruvector_node.dylib"
            ;;
        *windows*|*msvc*)
            binary_name="ruvector_node.dll"
            ;;
        *)
            binary_name="libruvector_node.so"
            ;;
    esac
    echo -e "${YELLOW}Building for $target...${NC}"
    if ! cargo build --release -p ruvector-node --target "$target" 2>&1; then
        echo -e "${RED}✗ Build failed for $target${NC}"
        return 1
    fi
    local src="$PROJECT_ROOT/target/$target/release/$binary_name"
    local dest="$NPM_PLATFORMS_DIR/$platform_dir/ruvector.node"
    if [ ! -f "$src" ]; then
        echo -e "${RED}✗ Binary not found at $src${NC}"
        return 1
    fi
    cp "$src" "$dest"
    echo -e "${GREEN}✓ Built and copied to $platform_dir${NC}"
    return 0
}
# Track per-platform build outcomes for the final summary
# (associative array: platform dir -> success | failed | skipped).
declare -A RESULTS
# Build Linux x64 (native)
echo ""
echo "--- Linux x64 GNU ---"
if build_target "x86_64-unknown-linux-gnu" "linux-x64-gnu"; then
    RESULTS["linux-x64-gnu"]="success"
    # Also copy to native directory for direct usage
    cp "$NPM_PLATFORMS_DIR/linux-x64-gnu/ruvector.node" "$NPM_NATIVE_DIR/linux-x64/ruvector.node"
else
    RESULTS["linux-x64-gnu"]="failed"
fi
# Build Linux ARM64
echo ""
echo "--- Linux ARM64 GNU ---"
if build_target "aarch64-unknown-linux-gnu" "linux-arm64-gnu"; then
    RESULTS["linux-arm64-gnu"]="success"
else
    RESULTS["linux-arm64-gnu"]="failed"
fi
# Build macOS x64 (cross-compile - may fail without proper toolchain)
echo ""
echo "--- macOS x64 (cross-compile) ---"
if build_target "x86_64-apple-darwin" "darwin-x64"; then
    RESULTS["darwin-x64"]="success"
else
    # Cross builds are expected to fail off-macOS: recorded as skipped
    RESULTS["darwin-x64"]="skipped"
    echo -e "${YELLOW}Note: macOS builds require osxcross or native macOS. Use CI for production builds.${NC}"
fi
# Build macOS ARM64 (cross-compile - may fail without proper toolchain)
echo ""
echo "--- macOS ARM64 (cross-compile) ---"
if build_target "aarch64-apple-darwin" "darwin-arm64"; then
    RESULTS["darwin-arm64"]="success"
else
    RESULTS["darwin-arm64"]="skipped"
    echo -e "${YELLOW}Note: macOS builds require osxcross or native macOS. Use CI for production builds.${NC}"
fi
# Build Windows x64 (cross-compile - may fail without proper toolchain)
echo ""
echo "--- Windows x64 MSVC (cross-compile) ---"
if build_target "x86_64-pc-windows-msvc" "win32-x64-msvc"; then
    RESULTS["win32-x64-msvc"]="success"
else
    RESULTS["win32-x64-msvc"]="skipped"
    echo -e "${YELLOW}Note: Windows MSVC builds require proper toolchain. Use CI for production builds.${NC}"
fi
# Summary
echo ""
echo "=========================================="
echo " Build Summary"
echo "=========================================="
# NOTE(review): the echo -e lines below print "${GREEN}${NC}" with no text
# between the color codes — possibly lost ✓/✗ status glyphs; confirm
# against the upstream source.
for platform in "${!RESULTS[@]}"; do
    status="${RESULTS[$platform]}"
    case $status in
        success)
            echo -e "${GREEN}${NC} $platform: $status"
            ;;
        failed)
            echo -e "${RED}${NC} $platform: $status"
            ;;
        skipped)
            echo -e "${YELLOW}${NC} $platform: $status (requires native toolchain)"
            ;;
    esac
done
echo ""
echo "Binaries located in: $NPM_PLATFORMS_DIR"
echo ""
# Show file sizes
echo "Binary sizes:"
find "$NPM_PLATFORMS_DIR" -name "*.node" -exec ls -lh {} \; 2>/dev/null || true
44
vendor/ruvector/scripts/build/build-linux.sh vendored Executable file
View File

@@ -0,0 +1,44 @@
#!/bin/bash
# Build NAPI-RS bindings for Linux platforms only
# Usage: ./scripts/build/build-linux.sh
set -e

# Resolve repository-relative paths so the script works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
NPM_PLATFORMS_DIR="$PROJECT_ROOT/npm/core/platforms"
NPM_NATIVE_DIR="$PROJECT_ROOT/npm/core/native"

echo "Building Ruvector NAPI-RS for Linux platforms..."

# Ensure the destination directories exist before copying binaries.
mkdir -p "$NPM_PLATFORMS_DIR"/{linux-x64-gnu,linux-arm64-gnu}
mkdir -p "$NPM_NATIVE_DIR"/linux-x64

# --- Linux x64 ---
X64_TARGET=x86_64-unknown-linux-gnu
X64_LIB="$PROJECT_ROOT/target/$X64_TARGET/release/libruvector_node.so"
echo "Building for $X64_TARGET..."
cargo build --release -p ruvector-node --target "$X64_TARGET"
# The addon is published under the generic name ruvector.node; native/
# receives an extra copy for direct (non-packaged) usage.
cp "$X64_LIB" "$NPM_PLATFORMS_DIR/linux-x64-gnu/ruvector.node"
cp "$X64_LIB" "$NPM_NATIVE_DIR/linux-x64/ruvector.node"
echo "✓ Linux x64 build complete"

# --- Linux ARM64 ---
ARM64_TARGET=aarch64-unknown-linux-gnu
ARM64_LIB="$PROJECT_ROOT/target/$ARM64_TARGET/release/libruvector_node.so"
echo "Building for $ARM64_TARGET..."
cargo build --release -p ruvector-node --target "$ARM64_TARGET"
cp "$ARM64_LIB" "$NPM_PLATFORMS_DIR/linux-arm64-gnu/ruvector.node"
echo "✓ Linux ARM64 build complete"

# Show results
echo ""
echo "Built binaries:"
ls -lh "$NPM_PLATFORMS_DIR"/linux-*/ruvector.node
ls -lh "$NPM_NATIVE_DIR"/linux-x64/ruvector.node

View File

@@ -0,0 +1,58 @@
#!/bin/bash
# Copy built binaries to npm package directories
# Usage: ./scripts/build/copy-binaries.sh
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
TARGET_DIR="$PROJECT_ROOT/target"
NPM_PLATFORMS_DIR="$PROJECT_ROOT/npm/core/platforms"
NPM_NATIVE_DIR="$PROJECT_ROOT/npm/core/native"

echo "Copying built binaries to npm packages..."

# Ensure destination directories exist
mkdir -p "$NPM_PLATFORMS_DIR"/{linux-x64-gnu,linux-arm64-gnu,darwin-x64,darwin-arm64,win32-x64-msvc}
mkdir -p "$NPM_NATIVE_DIR"/linux-x64

# Each entry: <rust-target-triple>:<built-library-name>:<npm-platform-dir>.
# Entries whose artifact is absent are skipped silently — that platform
# simply was not built on this machine.
PLATFORM_MAP=(
    "x86_64-unknown-linux-gnu:libruvector_node.so:linux-x64-gnu"
    "aarch64-unknown-linux-gnu:libruvector_node.so:linux-arm64-gnu"
    "x86_64-apple-darwin:libruvector_node.dylib:darwin-x64"
    "aarch64-apple-darwin:libruvector_node.dylib:darwin-arm64"
    "x86_64-pc-windows-msvc:ruvector_node.dll:win32-x64-msvc"
)

for entry in "${PLATFORM_MAP[@]}"; do
    IFS=':' read -r triple lib platform <<< "$entry"
    src="$TARGET_DIR/$triple/release/$lib"
    [ -f "$src" ] || continue
    cp "$src" "$NPM_PLATFORMS_DIR/$platform/ruvector.node"
    # The Linux x64 addon also gets a copy in native/ for direct usage.
    if [ "$platform" = "linux-x64-gnu" ]; then
        cp "$src" "$NPM_NATIVE_DIR/linux-x64/ruvector.node"
    fi
    echo "✓ Copied $platform"
done

echo ""
echo "Current npm platform binaries:"
find "$NPM_PLATFORMS_DIR" -name "ruvector.node" -exec ls -lh {} \;

View File

@@ -0,0 +1,38 @@
#!/bin/bash
# CI/CD script to auto-fix package-lock.json and create a commit
# Use this in GitHub Actions to automatically fix lock file issues
set -e

echo "🔍 Checking package-lock.json sync for CI/CD..."

cd npm

# Try npm ci first to check if lock file is in sync
if npm ci --dry-run 2>&1 | grep -q "can only install packages when your package.json and package-lock.json"; then
    echo "❌ Lock file out of sync - fixing automatically..."
    # Update lock file
    npm install
    # Check whether the lock file actually changed.
    # NOTE: the cwd is npm/ here, so git pathspecs must be relative to it
    # ("package-lock.json"); the previous "npm/package-lock.json" pathspec
    # matched nothing from inside npm/, so changes were never detected or
    # staged.
    if git diff --quiet -- package-lock.json; then
        echo "✅ Lock file is now in sync (no changes needed)"
    else
        echo "✅ Lock file updated"
        # If running in GitHub Actions, commit and push
        if [ -n "$GITHUB_ACTIONS" ]; then
            git config user.name "github-actions[bot]"
            git config user.email "github-actions[bot]@users.noreply.github.com"
            git add -- package-lock.json
            git commit -m "chore: Auto-sync package-lock.json [skip ci]"
            git push
            echo "✅ Lock file committed and pushed"
        else
            echo "⚠️ Lock file updated but not committed (not in GitHub Actions)"
        fi
    fi
else
    echo "✅ Lock file is already in sync"
fi

28
vendor/ruvector/scripts/ci/install-hooks.sh vendored Executable file
View File

@@ -0,0 +1,28 @@
#!/bin/bash
# Install git hooks for automatic lock file syncing
set -e

echo "🔧 Installing git hooks..."

# Robustness: the relative paths below (.git/, .githooks/) only make sense
# from the repository root, so fail fast with a clear message otherwise.
if [ ! -d ".git" ]; then
    echo "❌ .git directory not found - run this script from the repository root"
    exit 1
fi

# Create .git/hooks directory if it doesn't exist
mkdir -p .git/hooks

# Install pre-commit hook
if [ -f ".githooks/pre-commit" ]; then
    # Symlink (path is relative to .git/hooks/) so future edits to the
    # tracked hook take effect without reinstalling.
    ln -sf ../../.githooks/pre-commit .git/hooks/pre-commit
    chmod +x .git/hooks/pre-commit
    chmod +x .githooks/pre-commit
    echo "✅ Pre-commit hook installed"
else
    echo "❌ Pre-commit hook file not found"
    exit 1
fi

echo ""
echo "✨ Git hooks installed successfully!"
echo ""
echo "The following hooks are now active:"
echo " • pre-commit: Automatically syncs package-lock.json when package.json changes"
echo ""
echo "To disable, run: rm .git/hooks/pre-commit"

51
vendor/ruvector/scripts/ci/sync-lockfile.sh vendored Executable file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Automatically sync package-lock.json with package.json changes
# Can be used as git hook, CI/CD step, or manual script
set -e

echo "🔍 Checking package-lock.json sync..."

# This script lives in scripts/ci/, so the repository root is two levels
# up. (A single ".." resolved to scripts/, where no npm/ directory exists,
# so the script always exited at the check below.)
ROOT_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
NPM_DIR="$ROOT_DIR/npm"
if [ ! -d "$NPM_DIR" ]; then
    echo "✅ No npm directory found, skipping sync"
    exit 0
fi

cd "$NPM_DIR"

# Check if package.json or any workspace package.json changed
CHANGED_PACKAGES=$(git diff --cached --name-only | grep -E 'package\.json$' || true)
if [ -n "$CHANGED_PACKAGES" ]; then
    echo "📦 Package.json changes detected:"
    echo "$CHANGED_PACKAGES"
    echo ""
    echo "🔄 Running npm install to sync lock file..."
    # Run npm install to update lock file
    # Use --ignore-optional to skip platform-specific optional deps (darwin-arm64 on linux, etc.)
    npm install --ignore-optional || {
        echo "⚠️ npm install had warnings (likely platform-specific optional deps)"
        echo " Continuing with lock file sync..."
    }
    # Check if lock file changed
    if git diff --name-only | grep -q 'package-lock.json'; then
        echo "✅ Lock file updated successfully"
        # If running as pre-commit hook, add the lock file
        if [ "${GIT_HOOK}" = "pre-commit" ]; then
            # Stage from the repo root with an absolute path so this does
            # not depend on npm/ sitting directly below the cwd.
            cd "$ROOT_DIR"
            git add npm/package-lock.json
            echo "✅ Lock file staged for commit"
        else
            echo "⚠️ Lock file modified but not staged"
            echo " Run: git add npm/package-lock.json"
        fi
    else
        echo "✅ Lock file already in sync"
    fi
else
    echo "✅ No package.json changes detected"
fi

View File

@@ -0,0 +1,150 @@
# Quick Deployment Guide
This is a condensed quick-reference guide. For full documentation, see [DEPLOYMENT.md](DEPLOYMENT.md).
## Prerequisites Checklist
- [ ] Rust toolchain installed (`rustc`, `cargo`)
- [ ] Node.js v18+ and npm installed
- [ ] `wasm-pack` installed
- [ ] `jq` installed
- [ ] crates.io API token obtained
- [ ] NPM authentication token obtained
## 5-Minute Setup
```bash
# 1. Install missing tools (if needed)
curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
sudo apt-get install jq # or: brew install jq
# 2. Set credentials
export CRATES_API_KEY="your-crates-io-token"
export NPM_TOKEN="your-npm-token"
# 3. Test deployment script
./scripts/deploy/test-deploy.sh
# 4. Dry run
./scripts/deploy/deploy.sh --dry-run
# 5. Deploy!
./scripts/deploy/deploy.sh
```
## Common Commands
```bash
# Full deployment
./scripts/deploy/deploy.sh
# Dry run (no publishing)
./scripts/deploy/deploy.sh --dry-run
# Skip tests (faster, but risky)
./scripts/deploy/deploy.sh --skip-tests
# Publish only to crates.io
./scripts/deploy/deploy.sh --skip-npm
# Publish only to npm
./scripts/deploy/deploy.sh --skip-crates
# Set explicit version
./scripts/deploy/deploy.sh --version 0.2.0
# Help
./scripts/deploy/deploy.sh --help
```
## Quick Troubleshooting
| Problem | Solution |
|---------|----------|
| Tests failing | `cargo test --all --verbose` to see details |
| Clippy errors | `cargo clippy --all-targets --fix` |
| Format issues | `cargo fmt --all` |
| Missing tools | Check Prerequisites section above |
| WASM build fails | `curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf \| sh` |
| Already published | Bump version in `Cargo.toml` |
## Publishing Workflow
```mermaid
graph TD
A[Start] --> B[Check Prerequisites]
B --> C[Get Workspace Version]
C --> D[Sync All Package Versions]
D --> E{Run Tests?}
E -->|Yes| F[cargo test --all]
E -->|Skip| G
F --> G[Run Clippy]
G --> H[Check Formatting]
H --> I[Build WASM Packages]
I --> J{Publish Crates?}
J -->|Yes| K[Publish to crates.io]
J -->|Skip| L
K --> L{Publish NPM?}
L -->|Yes| M[Build Native Modules]
M --> N[Publish to npm]
L -->|Skip| O
N --> O[Trigger GitHub Actions]
O --> P[Done!]
```
## Environment Variables
```bash
# Required for crate publishing
export CRATES_API_KEY="your-token"
# Required for npm publishing
export NPM_TOKEN="your-token"
# Optional for GitHub Actions trigger
export GITHUB_TOKEN="your-token"
```
## Security Warning
**NEVER commit these to git:**
- API tokens
- NPM tokens
- GitHub tokens
- `.env` files with credentials
## What Gets Published
### crates.io (29 crates)
- `ruvector-core`, `ruvector-graph`, `ruvector-gnn`
- `ruvector-cluster`, `ruvector-raft`, `ruvector-replication`
- `ruvector-node`, `ruvector-wasm`, and 21 more...
### npm (8 packages)
- `@ruvector/node`
- `@ruvector/wasm`
- `@ruvector/gnn`
- `@ruvector/gnn-wasm`
- `@ruvector/graph-node`
- `@ruvector/graph-wasm`
- `@ruvector/tiny-dancer`
- `@ruvector/tiny-dancer-wasm`
## Logs
Deployment logs: `logs/deployment/deploy-YYYYMMDD-HHMMSS.log`
```bash
# View latest log
ls -t logs/deployment/*.log | head -1 | xargs cat
# Follow live log
tail -f logs/deployment/deploy-*.log
```
## Getting Help
- Full docs: [DEPLOYMENT.md](DEPLOYMENT.md)
- Script help: `./scripts/deploy/deploy.sh --help`
- Test script: `./scripts/deploy/test-deploy.sh`
- Issues: https://github.com/ruvnet/ruvector/issues

View File

@@ -0,0 +1,392 @@
# RuVector Deployment Guide
This guide covers the comprehensive deployment process for ruvector using the `deploy.sh` script.
## Prerequisites
### Required Tools
- **Rust toolchain** (rustc, cargo) - v1.77 or later
- **Node.js** - v18 or later
- **npm** - Latest version
- **wasm-pack** - For WASM builds
- **jq** - For JSON manipulation
Install missing tools:
```bash
# Install Rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# Install Node.js and npm (using nvm)
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash
nvm install 18
nvm use 18
# Install wasm-pack
curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
# Install jq (Ubuntu/Debian)
sudo apt-get install jq
# Install jq (macOS)
brew install jq
```
### Required Credentials
1. **crates.io API Token**
- Visit https://crates.io/me
- Generate a new API token
- Set as environment variable: `export CRATES_API_KEY="your-token"`
2. **NPM Authentication Token**
- Login to npm: `npm login`
- Or create token: `npm token create`
- Set as environment variable: `export NPM_TOKEN="your-token"`
3. **GitHub Personal Access Token** (Optional, for GitHub Actions)
- Visit https://github.com/settings/tokens
- Generate token with `repo` and `workflow` scopes
- Set as environment variable: `export GITHUB_TOKEN="your-token"`
## Quick Start
### Full Deployment
```bash
# Export required credentials
export CRATES_API_KEY="your-crates-io-token"
export NPM_TOKEN="your-npm-token"
# Run deployment
./scripts/deploy/deploy.sh
```
### Dry Run (Test Without Publishing)
```bash
./scripts/deploy/deploy.sh --dry-run
```
## Usage Options
### Command-Line Flags
| Flag | Description |
|------|-------------|
| `--dry-run` | Test deployment without publishing |
| `--skip-tests` | Skip test suite execution |
| `--skip-crates` | Skip crates.io publishing |
| `--skip-npm` | Skip NPM publishing |
| `--skip-checks` | Skip clippy and formatting checks |
| `--force` | Skip confirmation prompts |
| `--version VERSION` | Set explicit version (default: read from Cargo.toml) |
| `-h, --help` | Show help message |
### Common Scenarios
**Publish only to crates.io:**
```bash
./scripts/deploy.sh --skip-npm
```
**Publish only to npm:**
```bash
./scripts/deploy.sh --skip-crates
```
**Quick deployment (skip all checks):**
```bash
# ⚠️ Not recommended for production
./scripts/deploy.sh --skip-tests --skip-checks --force
```
**Test deployment process:**
```bash
./scripts/deploy.sh --dry-run
```
**Deploy specific version:**
```bash
./scripts/deploy.sh --version 0.2.0
```
## Deployment Process
The script performs the following steps in order:
### 1. Prerequisites Check
- Verifies required tools (cargo, npm, wasm-pack, jq)
- Checks for required environment variables
- Displays version information
### 2. Version Management
- Reads version from workspace `Cargo.toml`
- Synchronizes version to all `package.json` files
- Updates:
- Root `package.json`
- `crates/ruvector-node/package.json`
- `crates/ruvector-wasm/package.json`
- All other NPM package manifests
### 3. Pre-Deployment Checks
- **Test Suite**: `cargo test --all`
- **Clippy Linter**: `cargo clippy --all-targets --all-features`
- **Format Check**: `cargo fmt --all -- --check`
### 4. WASM Package Builds
Builds all WASM packages:
- `ruvector-wasm`
- `ruvector-gnn-wasm`
- `ruvector-graph-wasm`
- `ruvector-tiny-dancer-wasm`
### 5. Crate Publishing
Publishes crates to crates.io in dependency order:
**Core crates:**
- `ruvector-core`
- `ruvector-metrics`
- `ruvector-filter`
**Cluster crates:**
- `ruvector-collections`
- `ruvector-snapshot`
- `ruvector-raft`
- `ruvector-cluster`
- `ruvector-replication`
**Graph and GNN:**
- `ruvector-graph`
- `ruvector-gnn`
**Router:**
- `ruvector-router-core`
- `ruvector-router-ffi`
- `ruvector-router-wasm`
- `ruvector-router-cli`
**Tiny Dancer:**
- `ruvector-tiny-dancer-core`
- `ruvector-tiny-dancer-wasm`
- `ruvector-tiny-dancer-node`
**Bindings:**
- `ruvector-node`
- `ruvector-wasm`
- `ruvector-gnn-node`
- `ruvector-gnn-wasm`
- `ruvector-graph-node`
- `ruvector-graph-wasm`
**CLI/Server:**
- `ruvector-cli`
- `ruvector-server`
- `ruvector-bench`
### 6. NPM Publishing
Publishes NPM packages:
- `@ruvector/node`
- `@ruvector/wasm`
- `@ruvector/gnn`
- `@ruvector/gnn-wasm`
- `@ruvector/graph-node`
- `@ruvector/graph-wasm`
- `@ruvector/tiny-dancer`
- `@ruvector/tiny-dancer-wasm`
### 7. GitHub Actions Trigger
Triggers cross-platform native builds (if `GITHUB_TOKEN` set)
## Version Management
### Automatic Version Sync
The script automatically synchronizes versions across all package manifests:
1. Reads version from workspace `Cargo.toml`
2. Updates all `package.json` files
3. Ensures consistency across the monorepo
### Manual Version Update
To bump version manually:
```bash
# 1. Update workspace Cargo.toml
sed -i 's/^version = .*/version = "0.2.0"/' Cargo.toml
# 2. Run deployment (will sync all packages)
./scripts/deploy.sh
```
### Semantic Versioning
Follow [Semantic Versioning](https://semver.org/):
- **MAJOR** (0.x.0): Breaking changes
- **MINOR** (x.1.0): New features, backward compatible
- **PATCH** (x.x.1): Bug fixes, backward compatible
## Troubleshooting
### Common Issues
**1. "CRATES_API_KEY not set"**
```bash
export CRATES_API_KEY="your-token"
```
**2. "NPM_TOKEN not set"**
```bash
export NPM_TOKEN="your-token"
```
**3. "Tests failed"**
```bash
# Run tests manually to see details
cargo test --all --verbose
# Skip tests if needed (not recommended)
./scripts/deploy.sh --skip-tests
```
**4. "Clippy found issues"**
```bash
# Fix clippy warnings
cargo clippy --all-targets --all-features --fix
# Or skip checks (not recommended)
./scripts/deploy.sh --skip-checks
```
**5. "Code formatting issues"**
```bash
# Format code
cargo fmt --all
# Then retry deployment
./scripts/deploy.sh
```
**6. "Crate already published"**
The script automatically skips already-published crates. If you need to publish a new version:
```bash
# Bump version in Cargo.toml
./scripts/deploy.sh --version 0.2.1
```
**7. "WASM build failed"**
```bash
# Install wasm-pack
curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
# Build manually to see errors
cd crates/ruvector-wasm
wasm-pack build --target web --release
```
### Logs
Deployment logs are saved to `logs/deployment/deploy-YYYYMMDD-HHMMSS.log`
View recent logs:
```bash
ls -lt logs/deployment/
tail -f logs/deployment/deploy-*.log
```
## CI/CD Integration
### GitHub Actions
Create `.github/workflows/deploy.yml`:
```yaml
name: Deploy
on:
push:
tags:
- 'v*'
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
- name: Setup Node.js
uses: actions/setup-node@v3
with:
node-version: 18
- name: Install wasm-pack
run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
- name: Install jq
run: sudo apt-get install -y jq
- name: Deploy
env:
CRATES_API_KEY: ${{ secrets.CRATES_API_KEY }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: ./scripts/deploy/deploy.sh --force
```
### Manual Deployment Checklist
- [ ] All tests passing locally
- [ ] Code formatted (`cargo fmt --all`)
- [ ] No clippy warnings
- [ ] Version bumped in `Cargo.toml`
- [ ] CHANGELOG updated
- [ ] Environment variables set
- [ ] Dry run successful
- [ ] Ready to publish
## Security Best Practices
### Credentials Management
**Never commit credentials to git!**
Use environment variables or secure vaults:
```bash
# Use .env file (add to .gitignore)
cat > .env << EOF
CRATES_API_KEY=your-token
NPM_TOKEN=your-token
GITHUB_TOKEN=your-token
EOF
# Source before deployment
source .env
./scripts/deploy.sh
```
Or use a password manager:
```bash
# Example with pass
export CRATES_API_KEY=$(pass show crates-io/api-key)
export NPM_TOKEN=$(pass show npm/token)
```
## Support
For issues or questions:
- **GitHub Issues**: https://github.com/ruvnet/ruvector/issues
- **Documentation**: https://github.com/ruvnet/ruvector
- **Deployment Logs**: `logs/deployment/`
## License
MIT License - See LICENSE file for details

789
vendor/ruvector/scripts/deploy/deploy.sh vendored Executable file
View File

@@ -0,0 +1,789 @@
#!/bin/bash
################################################################################
# RuVector Comprehensive Deployment Script
#
# This script orchestrates the complete deployment process for ruvector:
# - Version management and synchronization
# - Pre-deployment checks (tests, linting, formatting)
# - WASM package builds
# - Crate publishing to crates.io
# - NPM package publishing
# - GitHub Actions trigger for cross-platform native builds
#
# Usage:
#   ./scripts/deploy/deploy.sh [OPTIONS]
#
# Options:
# --dry-run Run without actually publishing
# --skip-tests Skip test suite execution
# --skip-crates Skip crates.io publishing
# --skip-npm Skip NPM publishing
# --skip-checks Skip pre-deployment checks
# --force Skip confirmation prompts
# --version VERSION Set explicit version (otherwise read from Cargo.toml)
#
# Environment Variables:
# CRATES_API_KEY API key for crates.io (required for crate publishing)
# NPM_TOKEN NPM authentication token (required for npm publishing)
# GITHUB_TOKEN GitHub token for Actions API (optional)
#
################################################################################
set -euo pipefail

# Color codes for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly NC='\033[0m' # No Color

# Configuration (can be overridden via environment or command-line flags)
DRY_RUN=${DRY_RUN:-false}
SKIP_TESTS=${SKIP_TESTS:-false}
SKIP_CHECKS=${SKIP_CHECKS:-false}
PUBLISH_CRATES=${PUBLISH_CRATES:-true}
PUBLISH_NPM=${PUBLISH_NPM:-true}
FORCE=${FORCE:-false}
VERSION=""

# Project root: this script lives in scripts/deploy/, so the repository
# root is TWO directory levels up. (The previous "$SCRIPT_DIR/.." dated
# from when the script lived directly in scripts/; it made PROJECT_ROOT
# point at scripts/, breaking every path derived from it, e.g. Cargo.toml
# lookup and the logs/ directory.)
readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Log files
readonly LOG_DIR="$PROJECT_ROOT/logs/deployment"
readonly LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d-%H%M%S).log"
################################################################################
# Logging Functions
################################################################################
#######################################
# Mirrors all stdout/stderr to the timestamped log file while still
# printing to the terminal. The process substitution keeps a tee running
# for the lifetime of the script.
# Globals:  LOG_DIR, LOG_FILE (read)
#######################################
setup_logging() {
  mkdir -p "$LOG_DIR"
  exec 1> >(tee -a "$LOG_FILE")
  exec 2>&1
  echo -e "${CYAN}Logging to: $LOG_FILE${NC}"
}
# --- Logging helpers --------------------------------------------------------
# Each prints a tagged, colorized line built from all of its arguments.
# log_error writes to stderr so diagnostics can be separated when piped.

log_info()    { echo -e "${BLUE}[INFO]${NC} $*"; }

log_success() { echo -e "${GREEN}[SUCCESS]${NC} $*"; }

log_warning() { echo -e "${YELLOW}[WARNING]${NC} $*"; }

log_error()   { echo -e "${RED}[ERROR]${NC} $*" >&2; }

# Prints a banner separating major deployment phases.
log_step() {
  local banner="${BOLD}${CYAN}========================================${NC}"
  echo ""
  echo -e "$banner"
  echo -e "${BOLD}${CYAN}$*${NC}"
  echo -e "$banner"
}
################################################################################
# Utility Functions
################################################################################
#######################################
# Parses command-line flags into the global configuration variables.
# Globals:   DRY_RUN, SKIP_TESTS, PUBLISH_CRATES, PUBLISH_NPM,
#            SKIP_CHECKS, FORCE, VERSION (all written)
# Arguments: the script's "$@"
# Exits:     0 after --help; 1 on unknown option or missing flag argument
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dry-run)
        DRY_RUN=true
        log_warning "DRY RUN MODE: No actual publishing will occur"
        shift
        ;;
      --skip-tests)
        SKIP_TESTS=true
        log_warning "Skipping test suite"
        shift
        ;;
      --skip-crates)
        PUBLISH_CRATES=false
        log_info "Skipping crates.io publishing"
        shift
        ;;
      --skip-npm)
        PUBLISH_NPM=false
        log_info "Skipping NPM publishing"
        shift
        ;;
      --skip-checks)
        SKIP_CHECKS=true
        log_warning "Skipping pre-deployment checks"
        shift
        ;;
      --force)
        FORCE=true
        log_warning "Force mode: Skipping confirmation prompts"
        shift
        ;;
      --version)
        # Require an explicit value. Without this guard, a trailing
        # --version crashed with an opaque "unbound variable" error
        # from set -u instead of a usable message.
        if [[ $# -lt 2 ]]; then
          log_error "--version requires an argument"
          show_help
          exit 1
        fi
        VERSION="$2"
        log_info "Using explicit version: $VERSION"
        shift 2
        ;;
      --help|-h)
        show_help
        exit 0
        ;;
      *)
        log_error "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done
}
#######################################
# Prints CLI usage to stdout. The heredoc is unquoted on purpose so that
# $0 expands to the invoked script path.
# Outputs:  help text to stdout
#######################################
show_help() {
  cat << EOF
RuVector Deployment Script
Usage: $0 [OPTIONS]
Options:
  --dry-run           Run without actually publishing
  --skip-tests        Skip test suite execution
  --skip-crates       Skip crates.io publishing
  --skip-npm          Skip NPM publishing
  --skip-checks       Skip pre-deployment checks
  --force             Skip confirmation prompts
  --version VERSION   Set explicit version
  -h, --help          Show this help message
Environment Variables:
  CRATES_API_KEY      API key for crates.io (required for crate publishing)
  NPM_TOKEN           NPM authentication token (required for npm publishing)
  GITHUB_TOKEN        GitHub token for Actions API (optional)
Examples:
  # Full deployment with all checks
  $0
  # Dry run to test the process
  $0 --dry-run
  # Publish only to crates.io
  $0 --skip-npm
  # Quick deployment skipping tests (not recommended for production)
  $0 --skip-tests --force
EOF
}
#######################################
# Asks the user to confirm before continuing; a no-answer aborts.
# Skipped entirely in --force mode.
# Globals:   FORCE (read)
# Arguments: $1 - message to display before the prompt
# Exits:     1 if the user declines
#######################################
confirm_action() {
  local prompt_message="$1"
  # --force answers "yes" on the user's behalf.
  [[ "$FORCE" == "true" ]] && return 0
  echo -e "${YELLOW}$prompt_message${NC}"
  read -p "Continue? [y/N] " -n 1 -r
  echo
  case "$REPLY" in
    y|Y) ;;
    *)
      log_error "Deployment cancelled by user"
      exit 1
      ;;
  esac
}
################################################################################
# Prerequisites Check
################################################################################
#######################################
# Fails fast when a required tool or publishing credential is missing,
# then prints the detected toolchain versions.
# Globals:  PUBLISH_CRATES, PUBLISH_NPM, CRATES_API_KEY, NPM_TOKEN (read)
# Exits:    1 on any missing tool or credential
#######################################
check_prerequisites() {
  log_step "Checking Prerequisites"
  local missing_tools=()
  local tool
  # Every binary the deployment pipeline shells out to.
  for tool in cargo rustc npm node wasm-pack jq; do
    command -v "$tool" >/dev/null 2>&1 || missing_tools+=("$tool")
  done
  if [[ ${#missing_tools[@]} -gt 0 ]]; then
    log_error "Missing required tools: ${missing_tools[*]}"
    log_error "Please install them and try again"
    exit 1
  fi
  log_success "All required tools found"
  # Credentials are validated up front so we never fail mid-publish.
  if [[ "$PUBLISH_CRATES" == "true" && -z "${CRATES_API_KEY:-}" ]]; then
    log_error "CRATES_API_KEY environment variable not set"
    log_error "Either set it or use --skip-crates flag"
    exit 1
  fi
  if [[ "$PUBLISH_NPM" == "true" && -z "${NPM_TOKEN:-}" ]]; then
    log_error "NPM_TOKEN environment variable not set"
    log_error "Either set it or use --skip-npm flag"
    exit 1
  fi
  # Record toolchain versions in the deployment log.
  log_info "Rust version: $(rustc --version)"
  log_info "Cargo version: $(cargo --version)"
  log_info "Node version: $(node --version)"
  log_info "NPM version: $(npm --version)"
  log_info "wasm-pack version: $(wasm-pack --version)"
}
################################################################################
# Version Management
################################################################################
#######################################
# Determines the version to deploy: an explicit --version flag wins,
# otherwise the first `version = "..."` line of the workspace Cargo.toml.
# Globals:  VERSION (read/write), PROJECT_ROOT (read)
# Exits:    1 when no version can be determined
#######################################
get_workspace_version() {
  log_step "Reading Workspace Version"
  cd "$PROJECT_ROOT"
  if [[ -n "$VERSION" ]]; then
    log_info "Using explicit version: $VERSION"
    return
  fi
  # Strip the surrounding `version = "..."` wrapper from the manifest line.
  local manifest_version
  manifest_version=$(grep -m1 '^version = ' Cargo.toml | sed 's/version = "\(.*\)"/\1/')
  VERSION="$manifest_version"
  if [[ -z "$VERSION" ]]; then
    log_error "Could not determine version from Cargo.toml"
    exit 1
  fi
  log_success "Workspace version: $VERSION"
}
#######################################
# Writes the workspace VERSION into every NPM package manifest so the
# Rust and NPM artifacts ship with identical version numbers.
# Globals:  VERSION, PROJECT_ROOT (read)
#######################################
sync_package_versions() {
  log_step "Synchronizing Package Versions"
  cd "$PROJECT_ROOT"
  # Declared separately from the assignments below: `local v=$(cmd)` masks
  # the command's exit status (ShellCheck SC2155), so a failing mktemp
  # would previously have gone unnoticed under `set -e`.
  local temp_file
  # Update root package.json
  if [[ -f "package.json" ]]; then
    log_info "Updating root package.json to version $VERSION"
    temp_file=$(mktemp)
    jq --arg version "$VERSION" '.version = $version' package.json > "$temp_file"
    mv "$temp_file" package.json
    log_success "Root package.json updated"
  fi
  # Update NPM package versions
  local npm_packages=(
    "crates/ruvector-node"
    "crates/ruvector-wasm"
    "crates/ruvector-gnn-node"
    "crates/ruvector-gnn-wasm"
    "crates/ruvector-graph-node"
    "crates/ruvector-graph-wasm"
    "crates/ruvector-tiny-dancer-node"
    "crates/ruvector-tiny-dancer-wasm"
  )
  local pkg
  for pkg in "${npm_packages[@]}"; do
    if [[ -f "$pkg/package.json" ]]; then
      log_info "Updating $pkg/package.json to version $VERSION"
      temp_file=$(mktemp)
      jq --arg version "$VERSION" '.version = $version' "$pkg/package.json" > "$temp_file"
      mv "$temp_file" "$pkg/package.json"
    fi
  done
  log_success "All package versions synchronized to $VERSION"
}
################################################################################
# Pre-Deployment Checks
################################################################################
#######################################
# Runs the full cargo test suite unless --skip-tests was given.
# Globals:  SKIP_TESTS, PROJECT_ROOT (read)
# Exits:    1 when any test fails
#######################################
run_tests() {
  if [[ "$SKIP_TESTS" == "true" ]]; then
    log_warning "Skipping tests (--skip-tests flag set)"
    return
  fi
  log_step "Running Test Suite"
  cd "$PROJECT_ROOT"
  log_info "Running cargo test --all..."
  cargo test --all --verbose || {
    log_error "Tests failed"
    exit 1
  }
  log_success "All tests passed"
}
#######################################
# Lints the workspace with clippy, treating warnings as errors,
# unless --skip-checks was given.
# Globals:  SKIP_CHECKS, PROJECT_ROOT (read)
# Exits:    1 when clippy reports issues
#######################################
run_clippy() {
  if [[ "$SKIP_CHECKS" == "true" ]]; then
    log_warning "Skipping clippy checks"
    return
  fi
  log_step "Running Clippy Linter"
  cd "$PROJECT_ROOT"
  log_info "Running cargo clippy --all-targets..."
  cargo clippy --all-targets --all-features -- -D warnings || {
    log_error "Clippy found issues"
    exit 1
  }
  log_success "Clippy checks passed"
}
#######################################
# Verifies rustfmt compliance without modifying files,
# unless --skip-checks was given.
# Globals:  SKIP_CHECKS, PROJECT_ROOT (read)
# Exits:    1 on formatting violations
#######################################
check_formatting() {
  if [[ "$SKIP_CHECKS" == "true" ]]; then
    log_warning "Skipping formatting check"
    return
  fi
  log_step "Checking Code Formatting"
  cd "$PROJECT_ROOT"
  log_info "Running cargo fmt --check..."
  cargo fmt --all -- --check || {
    log_error "Code formatting issues found"
    log_error "Run 'cargo fmt --all' to fix"
    exit 1
  }
  log_success "Code formatting is correct"
}
#######################################
# Builds every WASM package, choosing per package (in order of priority):
# a package-local build.sh, an npm "build" script, or plain wasm-pack.
# Globals:  PROJECT_ROOT (read)
#######################################
build_wasm_packages() {
  log_step "Building WASM Packages"
  cd "$PROJECT_ROOT"
  local wasm_packages=(
    "crates/ruvector-wasm"
    "crates/ruvector-gnn-wasm"
    "crates/ruvector-graph-wasm"
    "crates/ruvector-tiny-dancer-wasm"
  )
  for pkg in "${wasm_packages[@]}"; do
    if [[ -d "$pkg" ]]; then
      log_info "Building WASM package: $pkg"
      cd "$PROJECT_ROOT/$pkg"
      # Dispatch on whichever build mechanism the package provides.
      if [[ -f "build.sh" ]]; then
        log_info "Using build script for $pkg"
        bash build.sh
      elif [[ -f "package.json" ]] && grep -q '"build"' package.json; then
        log_info "Using npm build for $pkg"
        npm run build
      else
        log_info "Using wasm-pack for $pkg"
        wasm-pack build --target web --release
      fi
      log_success "Built WASM package: $pkg"
    fi
  done
  cd "$PROJECT_ROOT"
  log_success "All WASM packages built"
}
################################################################################
# Crate Publishing
################################################################################
#######################################
# Publishes all workspace crates to crates.io in dependency order.
# Crates already published at this version, and crate directories that do
# not exist, are skipped; a failed crate is recorded and reported at the
# end instead of aborting the loop immediately.
# Globals:  PUBLISH_CRATES, DRY_RUN, CRATES_API_KEY, VERSION, PROJECT_ROOT
# Exits:    1 if any crate fails verification or publishing
#######################################
publish_crates() {
  if [[ "$PUBLISH_CRATES" != "true" ]]; then
    log_warning "Skipping crates.io publishing"
    return
  fi
  log_step "Publishing Crates to crates.io"
  cd "$PROJECT_ROOT"
  # Configure cargo authentication
  # NOTE(review): passing the token as an argument exposes it in `ps`
  # output; piping it to `cargo login` via stdin would be safer — confirm
  # the minimum supported cargo version before changing.
  log_info "Configuring cargo authentication..."
  if [[ "$DRY_RUN" != "true" ]]; then
    cargo login "$CRATES_API_KEY"
  fi
  # Crates in dependency order (dependencies must land before dependents).
  local crates=(
    # Core crates (no dependencies)
    "crates/ruvector-core"
    "crates/ruvector-metrics"
    "crates/ruvector-filter"
    # Cluster and replication (depend on core)
    "crates/ruvector-collections"
    "crates/ruvector-snapshot"
    "crates/ruvector-raft"
    "crates/ruvector-cluster"
    "crates/ruvector-replication"
    # Graph and GNN (depend on core)
    "crates/ruvector-graph"
    "crates/ruvector-gnn"
    # Router (depend on core)
    "crates/ruvector-router-core"
    "crates/ruvector-router-ffi"
    "crates/ruvector-router-wasm"
    "crates/ruvector-router-cli"
    # Tiny Dancer (depend on core)
    "crates/ruvector-tiny-dancer-core"
    "crates/ruvector-tiny-dancer-wasm"
    "crates/ruvector-tiny-dancer-node"
    # Bindings (depend on core)
    "crates/ruvector-node"
    "crates/ruvector-wasm"
    "crates/ruvector-gnn-node"
    "crates/ruvector-gnn-wasm"
    "crates/ruvector-graph-node"
    "crates/ruvector-graph-wasm"
    # CLI and server (depend on everything)
    "crates/ruvector-cli"
    "crates/ruvector-server"
    "crates/ruvector-bench"
  )
  local success_count=0
  local failed_crates=()
  local skipped_crates=()
  local crate crate_name
  for crate in "${crates[@]}"; do
    if [[ ! -d "$crate" ]]; then
      log_warning "Crate directory not found: $crate (skipping)"
      skipped_crates+=("$crate")
      continue
    fi
    # Assigned separately from 'local' so a failure is not masked (SC2155).
    crate_name=$(basename "$crate")
    log_info "Publishing $crate_name..."
    cd "$PROJECT_ROOT/$crate"
    # Skip crates that crates.io already has at this exact version.
    if cargo search "$crate_name" --limit 1 | grep -q "^$crate_name = \"$VERSION\""; then
      log_warning "$crate_name v$VERSION already published (skipping)"
      # BUGFIX: was ((success_count++)). The post-increment evaluates to 0
      # on the first use, so (( )) returns status 1 and the whole script
      # aborted under `set -e` as soon as the first crate was skipped.
      success_count=$((success_count + 1))
      skipped_crates+=("$crate_name")
      continue
    fi
    # Verify the package can be packed before attempting to publish it.
    log_info "Verifying package: $crate_name"
    if ! cargo package --allow-dirty; then
      log_error "Package verification failed: $crate_name"
      failed_crates+=("$crate_name")
      continue
    fi
    # Publish (or merely announce, in dry-run mode).
    if [[ "$DRY_RUN" == "true" ]]; then
      log_info "DRY RUN: Would publish $crate_name"
      success_count=$((success_count + 1))
    else
      log_info "Publishing $crate_name to crates.io..."
      if cargo publish --allow-dirty; then
        log_success "Published $crate_name v$VERSION"
        success_count=$((success_count + 1))
        # Give crates.io time to index so dependents can resolve this version.
        log_info "Waiting 30 seconds for crates.io indexing..."
        sleep 30
      else
        log_error "Failed to publish $crate_name"
        failed_crates+=("$crate_name")
      fi
    fi
  done
  cd "$PROJECT_ROOT"
  # Summary
  log_step "Crates Publishing Summary"
  log_info "Total crates: ${#crates[@]}"
  log_success "Successfully published: $success_count"
  log_warning "Skipped: ${#skipped_crates[@]}"
  if [[ ${#failed_crates[@]} -gt 0 ]]; then
    log_error "Failed to publish: ${#failed_crates[@]}"
    for crate in "${failed_crates[@]}"; do
      log_error "  - $crate"
    done
    exit 1
  fi
  log_success "All crates published successfully!"
}
################################################################################
# NPM Publishing
################################################################################
#######################################
# Builds the napi-based native Node modules for the current platform.
# Globals:  PROJECT_ROOT (read)
#######################################
build_native_modules() {
  log_step "Building Native Modules for Current Platform"
  cd "$PROJECT_ROOT"
  local module_dir
  for module_dir in \
    "crates/ruvector-node" \
    "crates/ruvector-gnn-node" \
    "crates/ruvector-graph-node" \
    "crates/ruvector-tiny-dancer-node"; do
    [[ -d "$module_dir" ]] || continue
    log_info "Building native module: $module_dir"
    cd "$PROJECT_ROOT/$module_dir"
    # Install dependencies only when they have not been installed yet.
    if [[ ! -d "node_modules" ]]; then
      log_info "Installing npm dependencies for $module_dir"
      npm install
    fi
    log_info "Building native module with napi"
    npm run build
    log_success "Built native module: $module_dir"
  done
  cd "$PROJECT_ROOT"
}
#######################################
# Publishes all NPM packages at the current VERSION. Packages already on
# the registry at this version are skipped; failures are collected and
# reported at the end.
# Globals:  PUBLISH_NPM, DRY_RUN, NPM_TOKEN, VERSION, PROJECT_ROOT
# Exits:    1 if any package fails to publish
#######################################
publish_npm() {
  if [[ "$PUBLISH_NPM" != "true" ]]; then
    log_warning "Skipping NPM publishing"
    return
  fi
  log_step "Publishing NPM Packages"
  cd "$PROJECT_ROOT"
  # Configure npm authentication
  # NOTE(review): this overwrites any existing ~/.npmrc — confirm that is
  # acceptable everywhere this script runs (CI image vs. developer box).
  log_info "Configuring npm authentication..."
  if [[ "$DRY_RUN" != "true" ]]; then
    echo "//registry.npmjs.org/:_authToken=${NPM_TOKEN}" > ~/.npmrc
  fi
  local npm_packages=(
    "crates/ruvector-node"
    "crates/ruvector-wasm"
    "crates/ruvector-gnn-node"
    "crates/ruvector-gnn-wasm"
    "crates/ruvector-graph-node"
    "crates/ruvector-graph-wasm"
    "crates/ruvector-tiny-dancer-node"
    "crates/ruvector-tiny-dancer-wasm"
  )
  local success_count=0
  local failed_packages=()
  local pkg pkg_name
  for pkg in "${npm_packages[@]}"; do
    if [[ ! -d "$pkg" ]] || [[ ! -f "$pkg/package.json" ]]; then
      log_warning "Package not found: $pkg (skipping)"
      continue
    fi
    # Assigned separately from 'local' so a jq failure is not masked (SC2155).
    pkg_name=$(jq -r '.name' "$pkg/package.json")
    log_info "Publishing $pkg_name..."
    cd "$PROJECT_ROOT/$pkg"
    # Skip versions that already exist on the registry.
    if npm view "$pkg_name@$VERSION" version >/dev/null 2>&1; then
      log_warning "$pkg_name@$VERSION already published (skipping)"
      # BUGFIX: was ((success_count++)). The post-increment evaluates to 0
      # on the first use, so (( )) returns status 1 and the script aborted
      # under `set -e` as soon as the first package was skipped.
      success_count=$((success_count + 1))
      continue
    fi
    # Publish (or merely announce, in dry-run mode).
    if [[ "$DRY_RUN" == "true" ]]; then
      log_info "DRY RUN: Would publish $pkg_name"
      success_count=$((success_count + 1))
    else
      log_info "Publishing $pkg_name to npm..."
      if npm publish --access public; then
        log_success "Published $pkg_name@$VERSION"
        success_count=$((success_count + 1))
      else
        log_error "Failed to publish $pkg_name"
        failed_packages+=("$pkg_name")
      fi
    fi
  done
  cd "$PROJECT_ROOT"
  # Summary
  log_step "NPM Publishing Summary"
  log_success "Successfully published: $success_count/${#npm_packages[@]}"
  if [[ ${#failed_packages[@]} -gt 0 ]]; then
    log_error "Failed to publish: ${#failed_packages[@]}"
    for pkg in "${failed_packages[@]}"; do
      log_error "  - $pkg"
    done
    exit 1
  fi
  log_success "All NPM packages published successfully!"
}
################################################################################
# GitHub Actions Integration
################################################################################
#######################################
# Triggers the GitHub Actions workflow that builds native modules for all
# platforms. No-op when GITHUB_TOKEN is absent or in dry-run mode.
# Globals:  GITHUB_TOKEN, DRY_RUN, VERSION (read)
#######################################
trigger_github_builds() {
  log_step "Triggering GitHub Actions for Cross-Platform Builds"
  if [[ -z "${GITHUB_TOKEN:-}" ]]; then
    log_warning "GITHUB_TOKEN not set, skipping GitHub Actions trigger"
    log_info "You can manually trigger the workflow from GitHub Actions UI"
    return
  fi
  if [[ "$DRY_RUN" == "true" ]]; then
    log_info "DRY RUN: Would trigger GitHub Actions workflow"
    return
  fi
  local repo_owner="ruvnet"
  local repo_name="ruvector"
  local workflow_name="native-builds.yml"
  log_info "Triggering workflow: $workflow_name"
  log_info "Repository: $repo_owner/$repo_name"
  log_info "Version tag: v$VERSION"
  # Create GitHub API request.
  # The workflow-dispatch endpoint returns 204 No Content on success, so
  # an empty response body is treated as success below; any body is the
  # API's error payload.
  local response=$(curl -s -X POST \
    -H "Accept: application/vnd.github+json" \
    -H "Authorization: Bearer $GITHUB_TOKEN" \
    -H "X-GitHub-Api-Version: 2022-11-28" \
    "https://api.github.com/repos/$repo_owner/$repo_name/actions/workflows/$workflow_name/dispatches" \
    -d "{\"ref\":\"main\",\"inputs\":{\"version\":\"$VERSION\"}}")
  if [[ -z "$response" ]]; then
    log_success "GitHub Actions workflow triggered successfully"
    log_info "Check status at: https://github.com/$repo_owner/$repo_name/actions"
  else
    log_error "Failed to trigger GitHub Actions workflow"
    log_error "Response: $response"
  fi
}
################################################################################
# Deployment Summary
################################################################################
#######################################
# Prints a human-readable recap of what this run deployed.
# Globals:  VERSION, DRY_RUN, PUBLISH_CRATES, PUBLISH_NPM (read)
#######################################
print_deployment_summary() {
  log_step "Deployment Summary"
  printf '\n'
  printf '%b\n' "${BOLD}Version:${NC} $VERSION"
  printf '%b\n' "${BOLD}Dry Run:${NC} $DRY_RUN"
  printf '\n'
  if [[ "$PUBLISH_CRATES" == "true" ]]; then
    printf '%b\n' "${GREEN}✓${NC} Crates published to crates.io"
    printf '%b\n' "  View at: ${CYAN}https://crates.io/crates/ruvector-core${NC}"
  fi
  if [[ "$PUBLISH_NPM" == "true" ]]; then
    printf '%b\n' "${GREEN}✓${NC} NPM packages published"
    printf '%b\n' "  View at: ${CYAN}https://www.npmjs.com/package/@ruvector/node${NC}"
  fi
  printf '\n'
  printf '%b\n' "${BOLD}${GREEN}Deployment completed successfully!${NC}"
  printf '\n'
  if [[ "$DRY_RUN" == "true" ]]; then
    printf '%b\n' "${YELLOW}NOTE: This was a dry run. No actual publishing occurred.${NC}"
    printf '%b\n' "${YELLOW}Run without --dry-run to perform actual deployment.${NC}"
  fi
}
################################################################################
# Main Deployment Flow
################################################################################
#######################################
# Deployment entry point: parses flags, validates the environment, syncs
# versions, runs quality gates, builds artifacts, publishes, and reports.
# Arguments: the script's command line ("$@")
#######################################
main() {
  echo -e "${BOLD}${CYAN}"
  cat << "EOF"
╔═══════════════════════════════════════════════════════════════╗
║                                                               ║
║          RuVector Comprehensive Deployment Script             ║
║                                                               ║
╚═══════════════════════════════════════════════════════════════╝
EOF
  echo -e "${NC}"
  # Setup
  # NOTE(review): logging is wired up before argument parsing, so even a
  # plain --help invocation creates a log file — confirm this is intended.
  setup_logging
  parse_args "$@"
  # Prerequisites
  check_prerequisites
  # Version management
  get_workspace_version
  sync_package_versions
  # Confirmation (auto-approved with --force)
  confirm_action "Ready to deploy version $VERSION. This will:
  - Run tests and quality checks
  - Build WASM packages
  - Publish $([ "$PUBLISH_CRATES" == "true" ] && echo "crates.io" || echo "")$([ "$PUBLISH_CRATES" == "true" ] && [ "$PUBLISH_NPM" == "true" ] && echo " and ")$([ "$PUBLISH_NPM" == "true" ] && echo "NPM packages" || echo "")"
  # Pre-deployment checks
  run_tests
  run_clippy
  check_formatting
  build_wasm_packages
  # Publishing
  publish_crates
  build_native_modules
  publish_npm
  # GitHub Actions
  trigger_github_builds
  # Summary
  print_deployment_summary
  log_info "Deployment log saved to: $LOG_FILE"
}

# Run main function
main "$@"

237
vendor/ruvector/scripts/deploy/test-deploy.sh vendored Executable file
View File

@@ -0,0 +1,237 @@
#!/bin/bash
################################################################################
# Test script for deploy.sh
#
# This script validates the deployment script without actually publishing
# anything. It runs through all deployment steps in dry-run mode and checks
# for common issues.
#
# Usage: ./scripts/test-deploy.sh
################################################################################
# Strict mode: abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail
# ANSI color codes for the pass/fail reporting helpers below.
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# NOTE(review): the repository README places this script at
# scripts/deploy/test-deploy.sh; if so, "$SCRIPT_DIR/.." resolves to the
# scripts/ directory, not the repository root -- confirm against the layout.
readonly PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
echo -e "${BLUE}╔═══════════════════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ Testing RuVector Deployment Script ║${NC}"
echo -e "${BLUE}╚═══════════════════════════════════════════════════════════════╝${NC}"
echo ""
# Test counter
tests_passed=0
tests_failed=0
# Announce the test that is about to run.
# Globals:   BLUE, NC (read)
# Arguments: $1 - human-readable test description
test_step() {
  local label=$1
  printf '%b\n' "${BLUE}Testing:${NC} ${label}"
}
# Report a passing test and bump the pass counter.
# Globals: GREEN, NC (read); tests_passed (written)
# BUG FIX: "((tests_passed++))" returns exit status 1 while the counter is 0
# (post-increment evaluates to the old value), which aborts the whole script
# under "set -e" on the very first passing test. A plain arithmetic
# assignment always succeeds.
test_pass() {
  echo -e "${GREEN}✓ PASS${NC}"
  tests_passed=$((tests_passed + 1))
  echo ""
}
# Report a failing test and bump the failure counter.
# Globals:   RED, NC (read); tests_failed (written)
# Arguments: $1 - reason for the failure
# BUG FIX: "((tests_failed++))" returns exit status 1 while the counter is 0,
# which aborts the whole script under "set -e" on the first failure instead
# of continuing to the remaining tests. Use an arithmetic assignment.
test_fail() {
  local reason="$1"
  echo -e "${RED}✗ FAIL: $reason${NC}"
  tests_failed=$((tests_failed + 1))
  echo ""
}
# Test 1: Script exists and is executable
test_step "Deployment script exists and is executable"
if [[ -x "$SCRIPT_DIR/deploy.sh" ]]; then
test_pass
else
test_fail "deploy.sh is not executable or doesn't exist"
fi
# Test 2: Required tools
# Collect every missing tool so a single failure lists them all at once.
test_step "Required tools are installed"
missing_tools=()
for tool in cargo rustc npm node wasm-pack jq; do
if ! command -v "$tool" >/dev/null 2>&1; then
missing_tools+=("$tool")
fi
done
if [[ ${#missing_tools[@]} -eq 0 ]]; then
test_pass
else
test_fail "Missing tools: ${missing_tools[*]}"
fi
# Test 3: Help message
test_step "Help message displays correctly"
if "$SCRIPT_DIR/deploy.sh" --help >/dev/null 2>&1; then
test_pass
else
test_fail "Help message not working"
fi
# Test 4: Workspace Cargo.toml exists
test_step "Workspace Cargo.toml exists"
if [[ -f "$PROJECT_ROOT/Cargo.toml" ]]; then
test_pass
else
test_fail "Cargo.toml not found"
fi
# Test 5: Version can be extracted
test_step "Version extraction from Cargo.toml"
cd "$PROJECT_ROOT"
# "-m1" takes the first version line (the workspace version). The trailing
# "|| echo """ keeps the assignment from aborting under set -e/pipefail
# when no version line matches; the empty result is reported below instead.
version=$(grep -m1 '^version = ' Cargo.toml | sed 's/version = "\(.*\)"/\1/' || echo "")
if [[ -n "$version" ]]; then
echo " Found version: $version"
test_pass
else
test_fail "Could not extract version"
fi
# Test 6: package.json files exist for the NPM-facing crates.
test_step "NPM package.json files exist"
package_count=0
for pkg in crates/ruvector-node crates/ruvector-wasm crates/ruvector-gnn-node; do
  if [[ -f "$PROJECT_ROOT/$pkg/package.json" ]]; then
    # BUG FIX: "((package_count++))" returns status 1 while the counter is 0
    # (post-increment yields the old value), aborting the script under
    # "set -e". A plain arithmetic assignment always succeeds.
    package_count=$((package_count + 1))
  fi
done
if [[ $package_count -gt 0 ]]; then
  echo " Found $package_count package.json files"
  test_pass
else
  test_fail "No package.json files found"
fi
# Test 7: Crate directories exist
test_step "Crate directories exist"
crate_count=0
for crate in crates/ruvector-core crates/ruvector-node crates/ruvector-graph; do
  if [[ -d "$PROJECT_ROOT/$crate" ]]; then
    # Same set -e hazard as above: avoid "((crate_count++))".
    crate_count=$((crate_count + 1))
  fi
done
if [[ $crate_count -gt 0 ]]; then
  echo " Found $crate_count crate directories"
  test_pass
else
  test_fail "No crate directories found"
fi
# Test 8: Dry run without credentials (should work)
test_step "Dry run without credentials"
cd "$PROJECT_ROOT"
# The pipeline is the condition of an "if", so set -e / pipefail do not
# abort the script when deploy.sh fails; the else branch reports it instead.
if PUBLISH_CRATES=false PUBLISH_NPM=false "$SCRIPT_DIR/deploy.sh" --dry-run --skip-tests --skip-checks --force 2>&1 | grep -q "Deployment completed successfully"; then
test_pass
else
test_fail "Dry run failed even with skips"
fi
# Test 9: Check logging directory creation
# NOTE(review): relies on the deploy.sh run from Test 8 having created
# logs/deployment -- confirm if the tests are ever reordered.
test_step "Log directory creation"
if [[ -d "$PROJECT_ROOT/logs/deployment" ]]; then
log_count=$(find "$PROJECT_ROOT/logs/deployment" -name "deploy-*.log" 2>/dev/null | wc -l)
echo " Found $log_count deployment logs"
test_pass
else
test_fail "Log directory not created"
fi
# Test 10: Version flag works
test_step "Version flag parsing"
cd "$PROJECT_ROOT"
if PUBLISH_CRATES=false PUBLISH_NPM=false "$SCRIPT_DIR/deploy.sh" --version 9.9.9 --dry-run --skip-tests --skip-checks --force 2>&1 | grep -q "9.9.9"; then
test_pass
else
test_fail "Version flag not working"
fi
# Test 11: JSON manipulation with jq
# Exercises the jq idiom the deploy script uses to sync package versions.
# NOTE(review): the temp file is not trap-cleaned, so it leaks if the script
# aborts between mktemp and rm.
test_step "Version synchronization (jq test)"
temp_json=$(mktemp)
echo '{"version":"0.0.0"}' > "$temp_json"
jq --arg version "1.2.3" '.version = $version' "$temp_json" > "${temp_json}.new"
mv "${temp_json}.new" "$temp_json"
result=$(jq -r '.version' "$temp_json")
rm "$temp_json"
if [[ "$result" == "1.2.3" ]]; then
test_pass
else
test_fail "jq version update failed"
fi
# Test 12: Build scripts exist for WASM packages.
test_step "WASM build scripts exist"
wasm_build_count=0
for pkg in crates/ruvector-wasm crates/ruvector-gnn-wasm; do
  if [[ -f "$PROJECT_ROOT/$pkg/build.sh" ]] || [[ -f "$PROJECT_ROOT/$pkg/package.json" ]]; then
    # BUG FIX: "((wasm_build_count++))" returns status 1 while the counter
    # is 0, aborting the script under "set -e"; use an assignment instead.
    wasm_build_count=$((wasm_build_count + 1))
  fi
done
if [[ $wasm_build_count -gt 0 ]]; then
  echo " Found build scripts for $wasm_build_count WASM packages"
  test_pass
else
  test_fail "No WASM build scripts found"
fi
# Test 13: Dependency order validation.
# Checks that "ruvector-core" is mentioned before "ruvector-node" in
# deploy.sh, i.e. the core crate is published before crates depending on it.
test_step "Crate dependency order validation"
deploy_script_content=$(cat "$SCRIPT_DIR/deploy.sh")
# BUG FIX: under "set -euo pipefail" a non-matching grep would fail the
# pipeline and abort the whole script instead of reaching test_fail; the
# "|| true" guards keep an empty result flowing into the check below.
core_line=$(echo "$deploy_script_content" | grep -n "ruvector-core" | head -1 | cut -d: -f1) || true
node_line=$(echo "$deploy_script_content" | grep -n "ruvector-node" | grep -v "gnn-node" | head -1 | cut -d: -f1) || true
if [[ -n "$core_line" ]] && [[ -n "$node_line" ]] && [[ $core_line -lt $node_line ]]; then
  echo " Dependency order is correct (core before bindings)"
  test_pass
else
  test_fail "Dependency order may be incorrect"
fi
# Summary: report totals and exit 1 if any test failed, 0 otherwise.
echo ""
echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
echo -e "${BLUE} Test Summary ${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
echo ""
total_tests=$((tests_passed + tests_failed))
echo -e "Total tests: $total_tests"
echo -e "${GREEN}Passed: $tests_passed${NC}"
if [[ $tests_failed -gt 0 ]]; then
  echo -e "${RED}Failed: $tests_failed${NC}"
  echo ""
  echo -e "${RED}Some tests failed. Please review the output above.${NC}"
  exit 1
else
  # FIX: print the zero failure count in green -- the original used RED here,
  # which made a fully successful run look like an error.
  echo -e "${GREEN}Failed: $tests_failed${NC}"
  echo ""
  echo -e "${GREEN}All tests passed! The deployment script is ready to use.${NC}"
  echo ""
  echo "Next steps:"
  echo " 1. Set required environment variables:"
  echo " export CRATES_API_KEY='your-token'"
  echo " export NPM_TOKEN='your-token'"
  echo ""
  echo " 2. Test with dry run:"
  echo " ./scripts/deploy.sh --dry-run"
  echo ""
  echo " 3. Deploy:"
  echo " ./scripts/deploy.sh"
  exit 0
fi

View File

@@ -0,0 +1,9 @@
target/**
Runs
Cargo.lock
rls*
dumpreloadtest*
*.pdf
*.html
.idea/
.vscode/

View File

@@ -0,0 +1,111 @@
[package]
name = "hnsw_rs"
version = "0.3.3"
authors = ["jeanpierre.both@gmail.com"]
description = "Ann based on Hierarchical Navigable Small World Graphs from Yu.A. Malkov and D.A Yashunin"
license = "MIT/Apache-2.0"
readme = "README.md"
keywords = ["algorithms", "ann", "hnsw"]
repository = "https://github.com/jean-pierreBoth/hnswlib-rs"
documentation = "https://docs.rs/hnsw_rs"
edition = "2024"
# declare a feature with no dependency to get some modulated debug print
# to be run with cargo build --features verbose_1
#verbose_1 = [ ]
[profile.release]
lto = true
opt-level = 3
[lib]
# cargo rustc --lib -- --crate-type cdylib [or staticlib] or rlib (default)
# if we want to avoid specifying in advance crate-type
path = "src/lib.rs"
#crate-type = ["cdylib"]
[[example]]
name = "random"
path = "examples/random.rs"
[[example]]
name = "ann-glove"
path = "examples/ann-glove25-angular.rs"
[[example]]
name = "ann-mnist"
path = "examples/ann-mnist-784-euclidean.rs"
[[example]]
name = "ann-sift1m"
path = "examples/ann-sift1m-128-euclidean.rs"
[[example]]
name = "levenshtein"
path = "examples/levensthein.rs"
[dependencies]
# the default version spec is ^, meaning cargo can update up to the max compatible version number
# cargo doc --no-deps avoid dependencies doc generation
#
serde = { version = "1.0", features = ["derive"] }
bincode = { version = "1.3" }
cfg-if = { version = "1.0" }
# for //
parking_lot = "0.12"
rayon = { version = "1.11" }
num_cpus = { version = "1.16" }
cpu-time = { version = "1.0" }
num-traits = { version = "0.2" }
# for hashing. hashbrown still needed because of get_key_value(&key)
hashbrown = { version = "0.15" }
indexmap = { version = ">= 2.11, < 2.13" }
rand = { version = "0.8" }
lazy_static = { version = "1.4" }
#
mmap-rs = { version = "0.6" }
#
# decreasing order of log for debug build : (max_level_)trace debug info warn error off
# decreasing order of log for release build (release_max_level_) .. idem
#log = { version = "0.4", features = ["max_level_debug", "release_max_level_info"] }
log = { version = "0.4" }
env_logger = { version = "0.11" }
anyhow = { version = "1.0" }
# anndists = { path = "../anndists" }
anndists = { version = "0.1" }
# anndists = { git = "https://github.com/jean-pierreBoth/anndists" }
# for benchmark reading, so the library does not depend on hdf5 or ndarray
[dev-dependencies]
# hdf5 = { version = "0.8" }
# metno is needed as hdf5 is blocked to hdfsys 1.12
hdf5 = {package = "hdf5-metno", version = "0.10.0" }
ndarray = { version = ">=0.16.0, <0.18" }
skiplist = { version = "0.6" }
tempfile = { version = "3" }
itertools = {version = "0.14"}
[features]
default = []
# feature for std simd on nightly
stdsimd = ["anndists/stdsimd"]
# feature for simd on stable for x86*
simdeez_f = ["anndists/simdeez_f"]

View File

@@ -0,0 +1,56 @@
- version 0.3.3
small fix on filter (thanks to VillSnow). include ndarray 0.17 as possible dep. fixed compiler warning on elided lifetimes
- version 0.3.2
update dependencies to ndarray 0.16 , rand 0.9 indexmap 2.9, hdf5. edition=2024
- version 0.3.1
Possibility to reduce the number of levels used Hnsw structure with the function hnsw::modify_level_scale.
This often increases significantly recall while incurring a moderate cpu cost. It is also possible
to have same recall with smaller *max_nb_conn* parameters so reducing memory usage.
See README.md at [bigann](https://github.com/jean-pierreBoth/bigann).
Modification inspired by the article by [Munyampirwa](https://arxiv.org/abs/2412.01940)
Clippy cleaning and minor arguments change (PathBuf to Path String to &str) in dump/reload
with the help of bwsw (https://github.com/bwsw)
- **version 0.3.0**:
The distances implementation is now in a separate crate [anndists](https://crates.io/crates/anndists). Using hnsw_rs::prelude::* should make the change transparent.
The mmap implementation makes it possible to use the [coreset](https://github.com/jean-pierreBoth/coreset) crate to compute coreset and clusters of data stored in hnsw dumps.
- version 0.2.1:
when using mmap, the points less frequently used (points in lower layers) are preferentially mmap-ed while upper layers are preferentially
explicitly read from file.
Hnswio is now Sync.
feature stdsimd, based on std::simd, runs with nightly on Hamming with u32,u64 and DisL1,DistL2, DistDot with f32
- The **version 0.2** introduces
1. possibility to use mmap on the data file storing the vectors represented in the hnsw structure. This is mostly useful for
large vectors, where data needs more space than the graph part.
As a consequence the format of this file changed. Old format can be read but new dumps will be in the new format.
In case of mmap usage, a dump after inserting new elements must ensure that the old file is not overwritten, so a unique file name is
generated if necessary. See documentation of module Hnswio
1. the filtering trait
- Upgrade of many dependencies. Change from simple_logger to env_logger. The logger is initialized one for all in file src/lib.rs and cannot be intialized twice. The level of log can be modulated by the RUST_LOG env variable on a module basis or switched off. See the *env_logger* crate doc.
- A rust crate *edlib_rs* provides an interface to the *excellent* edlib C++ library [(Cf edlib)](https://github.com/Martinsos/edlib) can be found at [edlib_rs](https://github.com/jean-pierreBoth/edlib-rs) or on crate.io. It can be used to define a user adhoc distance on &[u8] with normal, prefix or infix mode (which is useful in genomics alignment).
- The library no longer depends on hdf5 and ndarray. They are dev-dependencies needed for the examples; this simplifies compatibility issues.
- Added insertion methods for slices for easier use with the ndarray crate.
- simd/avx2 requires now the feature "simdeez_f". So by default the crate can compile on M1 chip and transitions to std::simd.
- Added DistPtr and possibility to dump/reload with this distance type. (See *load_hnsw_with_dist* function)
- Implementation of Hamming for f64 exclusively in the context SuperMinHash in crate [probminhash](https://crates.io/crates/probminhash)

View File

@@ -0,0 +1,13 @@
Copyright 2020 jean-pierre.both
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,25 @@
Copyright (c) 2020 jean-pierre.both
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,168 @@
# hnsw-rs
This crate provides a Rust implementation of the paper by Yu.A. Malkov and D.A Yashunin:
"Efficient and Robust approximate nearest neighbours using Hierarchical Navigable Small World Graphs" (2016,2018)
[arxiv](https://arxiv.org/abs/1603.09320)
## Functionalities
The crate is built on top of the [anndists](https://crates.io/crates/anndists) and can use the following distances:
* usual distances as L1, L2, Cosine, Jaccard, Hamming for vectors of standard numeric types, Levenshtein distance on u16.
* Hellinger distance and Jeffreys divergence between probability distributions (f32 and f64). It must be noted that the Jeffreys divergence
(a symmetrized Kullback-Leibler divergence) does not satisfy the triangle inequality. (Neither does the Cosine distance!)
* Jensen-Shannon distance between probability distributions (f32 and f64). It is defined as the **square root** of the Jensen-Shannon divergence and is a bounded metric. See [Nielsen F. in Entropy 2019, 21(5), 485](https://doi.org/10.3390/e21050485).
* A Trait to enable the user to implement its own distances.
It takes as data slices of types T satisfying T:Serialize+Clone+Send+Sync. It is also possible to use C extern functions or closures.
* An interface towards C and more specifically to the [Julia](https://julialang.org/) language.
See the companion Julia package [HnswAnn.jl](https://gitlab.com/jpboth/HnswAnn.jl) and the building paragraph for some help for Julia users.
The hnsw implementation provides:
* Multithreaded insertion and search requests.
* Dump and reload functions (*See module hnswio*) to store the data and the graph once it is built. These facilities rely partly on Serde so T needs to implement Serialize and Deserialized as derived by Serde.
It is also possible to reload only the graph and not the data themselves. A specific type (struct NoData, associated to the NoDist distance is dedicated to this functionality.
* A flattening conversion of the Hnsw structure to keep only neighborhood relationships between points (without their internal data) internal to the Hnsw structure (*see module flatten.rs, FlatPoint and FlatNeighborhood*). It is thus possible to keep some topology information with low memory usage.
* Filtering: It is possible to add filters so only results which satisfies the filter is in the result set. The filtering is done during the search, so it is not a post filter. There is currently two ways of using the filter, one can add allowed ids in a sorted vector and send as a parameter, or one can define a function which will be called before an id is added to the result set.
Examples on both these strategies are in the examples or tests directory. One can also implement the trait Filterable for new types, if one would like the filter to be kept in a bitvector, for example.
* Possibility to use mmap on dumped data (not on the graph part), which is useful for large data vectors. This enables coreset and cluster computation in streaming, see [coreset](https://github.com/jean-pierreBoth/coreset) and soon on [crates.io](https://crates.io/crates).
## Implementation
The graph construction and searches are multithreaded with the **parking_lot** crate (See **parallel_insert_data** and **parallel_search_neighbours** functions and also examples files).
Distances are provided by the crate [anndists](https://github.com/jean-pierreBoth/anndists), see *Building*.
## Building
### Simd
Two features activate simd in the crate **anndists** :
* The feature "simdeez_f" provide simd for x86_64 processors.
Compile with **cargo build --release --features "simdeez_f"** or change the default features in Cargo.toml.
To compile this crate on a M1 chip just do not activate this feature.
* The feature "stdsimd" provides portable simd through std::simd but **requires rust nightly**.
Setting this feature in features default (or by cargo command) activates the portable_simd feature on rust nightly.
Not all couples (Distance, type) are provided yet. (See the crate anndists)
### Julia interface
By default the crate is a standalone project and builds a static library and an executable.
To be used with the companion Julia package it is necessary to build a dynamic library.
This can be done by just uncommenting (i.e get rid of the #) in file Cargo.toml the line:
*#crate-type = ["cdylib"]*
and rerun the command: cargo build --release.
This will generate a .so file in the target/release directory.
## Algorithm and Input Parameters
The algorithm stores points in layers (at most 16), and a graph is constructed to enable a search from less densely populated levels to most densely populated levels by constructing links from less dense layers to the most dense layer (level 0).
Roughly the algorithm goes along runs as follows:
Upon insertion, the level ***l*** of a new point is sampled with an exponential law, limiting the number of levels to 16,
so that level 0 is the most densely populated layer, upper layers being exponentially less populated as level increases.
The nearest neighbour of the point is searched in lookup tables from the upper level to the level just above its layer (***l***), so we should arrive near the new point at its level at a relatively low cost. Then the ***max_nb_connection*** nearest neighbours are searched in neighbours of neighbours table (with a reverse updating of tables) recursively from its layer ***l*** down to the most populated level 0.
The parameter of the exponential law to sample point levels is set to `ln(max_nb_connection)/scale`.
By default *scale* is set to 1. It is possible to reduce the *scale* parameter and thus reduce the number of levels used (See Hnsw::modify_level_scale) without increasing max_nb_connection.
This often provides better recall without increasing *max_nb_connection* and thus spares memory usage. (See examples)
The main parameters occurring in constructing the graph or in searching are:
* max_nb_connection (in hnsw initialization)
The maximum number of links from one point to others. Values ranging from 16 to 64 are standard initialising values, the higher the more time consuming.
* ef_construction (in hnsw initialization)
This parameter controls the width of the search for neighbours during insertion. Values from 200 to 800 are standard initialising values, the higher the more time consuming.
* max_layer (in hnsw initialization)
The maximum number of layers in graph. Must be less or equal than 16.
* ef_arg (in search methods)
This parameter controls the width of the search in the lowest level, it must be greater than number of neighbours asked but can be less than ***ef_construction***.
As a rule of thumb could be between the number of neighbours we will ask for (knbn arg in search method) and max_nb_connection.
* keep_pruned and extend_candidates.
These parameters are described in the paper by Malkov and Yashunin can be used to
modify the search strategy. The interested user should check the paper to see the impact. By default
the values are as recommended in the paper.
## Benchmarks and Examples [(examples)](./examples)
Some examples are taken from the [ann-benchmarks site](https://github.com/erikbern/ann-benchmarks)
and recall rates and request/s are given in comments in the examples files for some input parameters.
The annhdf5 module implements reading the standardized data files
of the [ann-benchmarks site](https://github.com/erikbern/ann-benchmarks),
just download the necessary benchmark data files and modify path in sources accordingly.
Then run: cargo build --release --features="simdeez_f" --examples .
It is possible in these examples to change from parallel searches to serial searches to check for speeds
or modify parameters to see the impact on performance.
With a i9-13900HX 24 cores laptop we get the following results:
1. fashion-mnist-784-euclidean : search requests run at 62000 req/s with a recall rate of 0.977
2. ann-glove-25-angular : search for the first 100 neighbours run with recall 0.979 at 12000 req/s
3. sift1m benchmark: (1 million points in 128 dimension) search requests for the 10 first neighbours runs at 15000 req/s with a recall rate of 0.9907 or at 8300 req/s with a recall rate of 0.9959, depending on the parameters.
Moreover a tiny crate [bigann](https://github.com/jean-pierreBoth/bigann)
gives results on the first 10 Million points of the [BIGANN](https://big-ann-benchmarks.com/neurips21.html) benchmark. The benchmark is also described at [IRISA](http://corpus-texmex.irisa.fr/). This crate can used to play with parameters on this data. Results give a recall between 0.92 and 0.99 depending on number of requests and parameters.
Some lines extracted from this Mnist benchmark show how it works for f32 and L2 norm
```rust
// reading data
let anndata = AnnBenchmarkData::new(fname).unwrap();
let nb_elem = anndata.train_data.len();
let max_nb_connection = 24;
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
let ef_c = 400;
// allocating network
let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2{});
hnsw.set_extend_candidates(false);
// parallel insertion of train data
let data_for_par_insertion = anndata.train_data.iter().map( |x| (&x.0, x.1)).collect();
hnsw.parallel_insert(&data_for_par_insertion);
//
hnsw.dump_layer_info();
// Now the bench with 10 neighbours
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
hnsw.set_searching_mode(true);
let knbn = 10;
let ef_c = max_nb_connection;
// search 10 nearest neighbours for test data
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_c);
....
```
## Contributions
[Sannsyn](https://sannsyn.com/en/) contributed to Drop implementation and FilterT trait.
Petter Egesund added the DistLevenshtein distance.
## Evolutions are described [here](./Changes.md)
## License
Licensed under either of
* Apache License, Version 2.0, [LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>
* MIT license [LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>
at your option.

View File

@@ -0,0 +1,220 @@
#![allow(clippy::needless_range_loop)]
use cpu_time::ProcessTime;
use std::time::{Duration, SystemTime};
// glove 25 // 2.7 Ghz 4 cores 8Mb L3 k = 10
// ============================================
//
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
// 24 800 64 1. 1 0 0.928 4090 1.003
// 24 800 64 1. 1 1 0.927 4594 1.003
// 24 400, 48 1. 1 0 0.919 6349 1.0044
// 24 800 48 1 1 1 0.918 5785 1.005
// 24 400 32 1. 0 0 0.898 8662
// 24 400 64 1. 1 0 0.930 4711 1.0027
// 24 400 64 1. 1 1 0.921 4550 1.0039
// 24 1600 48 1 1 0 0.924 5380 1.0034
// 32 400 48 1 1 0 0.93 4706 1.0026
// 32 800 64 1 1 0 0.94 3780. 1.0015
// 32 1600 48 1 1 0 0.934 4455 1.0023
// 48 1600 48 1 1 0 0.945 3253 1.00098
// 24 400 48 1 1 0 0.92 6036. 1.0038
// 48 800 48 1 1 0 0.935 4018 1.002
// 48 800 64 1 1 0 0.942 3091 1.0014
// 48 800 64 1 1 1 0.9435 2640 1.00126
// k = 100
// 24 800 48 1 1 0 0.96 2432 1.004
// 48 800 128 1 1 0 0.979 1626 1.001
// glove 25 // 8 cores i7 2.3 Ghz 8Mb L3 knbn = 100
// ==================================================
// 48 800 48 1 1 0 0.935 13400 1.002
// 48 800 128 1 1 0 0.979 5227 1.002
// 24 core Core(TM) i9-13900HX simdeez knbn = 10
// ==================================================
// 48 800 48 1 1 0 0.936 30748 1.002
// 24 core Core(TM) i9-13900HX simdeez knbn = 100
// ==================================================
// 48 800 128 1 1 0 0.979 12000 1.002
// results with scale modification 0.5
//====================================
// 24 core Core(TM) i9-13900HX simdeez knbn = 10
// ==================================================
// 24 800 48 0.5 1 0 0.931 40700 1.002
// 48 800 48 0.5 1 0 0.941 30001 1.001
// 24 core Core(TM) i9-13900HX simdeez knbn = 100
// ==================================================
// 24 800 128 0.5 1 0 0.974 16521 1.002
// 48 800 128 0.5 1 0 0.985 11484 1.001
use anndists::dist::*;
use hnsw_rs::prelude::*;
use log::info;
mod utils;
use utils::*;
/// Benchmark driver for the glove-25-angular dataset from ann-benchmarks.
///
/// Loads the HDF5 benchmark file from a hard-coded path, L2-normalizes the
/// vectors so `DistDot` behaves like the Cosine distance, builds an HNSW
/// index (in parallel by default), then measures recall and throughput for
/// 10 and 100 requested neighbours via `search`.
pub fn main() {
    // FIX: `try_init()` returns Err when a logger is already installed; the
    // original `.unwrap()` would panic in that case, defeating the point of
    // using try_init. The result is deliberately ignored instead.
    let _ = env_logger::builder().is_test(true).try_init();
    let parallel = true;
    //
    // NOTE(review): hard-coded dataset path; adjust to your local copy.
    let fname = String::from("/home/jpboth/Data/ANN/glove-25-angular.hdf5");
    println!("\n\n test_load_hdf5 {:?}", fname);
    // now recall that data are stored in row order.
    let mut anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
    // pre normalisation to use Dot computations instead of Cosine
    anndata.do_l2_normalization();
    // run bench
    let nb_elem = anndata.train_data.len();
    let knbn_max = anndata.test_distances.dim().1;
    info!(
        "Train size : {}, test size : {}",
        nb_elem,
        anndata.test_data.len()
    );
    info!("Nb neighbours answers for test data : {} \n\n", knbn_max);
    //
    let max_nb_connection = 24;
    let ef_c = 800;
    println!(
        " max_nb_conn : {:?}, ef_construction : {:?} ",
        max_nb_connection, ef_c
    );
    // cap the number of layers at 16, as recommended by the HNSW paper.
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    println!(
        " number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
        nb_elem, nb_layer, ef_c
    );
    let nb_search = anndata.test_data.len();
    println!(" number of search {:?}", nb_search);
    // Hnsw allocation
    let mut hnsw =
        Hnsw::<f32, DistDot>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistDot {});
    //
    hnsw.set_extend_candidates(true);
    // reduce the level-sampling scale (see Hnsw::modify_level_scale in the
    // crate docs): fewer levels, usually better recall for the same memory.
    hnsw.modify_level_scale(0.5);
    //
    // parallel insertion
    let start = ProcessTime::now();
    let now = SystemTime::now();
    let data_for_par_insertion = anndata
        .train_data
        .iter()
        .map(|x| (x.0.as_slice(), x.1))
        .collect();
    if parallel {
        println!(" \n parallel insertion");
        hnsw.parallel_insert_slice(&data_for_par_insertion);
    } else {
        println!(" \n serial insertion");
        for d in data_for_par_insertion {
            hnsw.insert_slice(d);
        }
    }
    let cpu_time: Duration = start.elapsed();
    //
    println!(
        "\n hnsw data insertion cpu time {:?} system time {:?} ",
        cpu_time,
        now.elapsed()
    );
    hnsw.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
    //
    // Now the bench with 10 neighbours, then again with 100.
    //
    let knbn = 10;
    let ef_search = 48;
    search(&mut hnsw, knbn, ef_search, &anndata);
    let knbn = 100;
    let ef_search = 128;
    search(&mut hnsw, knbn, ef_search, &anndata);
}
/// Runs k-NN requests over the whole test set and prints recall statistics.
///
/// * `hnsw` - the populated index; only `set_searching_mode` mutates it here.
/// * `knbn` - number of neighbours requested per query.
/// * `ef_search` - width of the candidate list during search.
/// * `anndata` - benchmark data holding queries and ground-truth distances.
///
/// Prints: mean fraction of neighbours returned, mean ratio of the last
/// returned distance to the true k-th distance, recall rate, and requests/s.
pub fn search<Dist>(
hnsw: &mut Hnsw<f32, Dist>,
knbn: usize,
ef_search: usize,
anndata: &annhdf5::AnnBenchmarkData,
) where
Dist: Distance<f32> + Send + Sync,
{
println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
// hard-coded switch: the serial branch below is currently dead code,
// kept to allow quick serial-vs-parallel timing comparisons.
let parallel = true;
//
let nb_elem = anndata.train_data.len();
let nb_search = anndata.test_data.len();
//
// per-query accumulators for the statistics printed at the end.
let mut recalls = Vec::<usize>::with_capacity(nb_elem);
let mut nb_returned = Vec::<usize>::with_capacity(nb_elem);
let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_elem);
let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_elem);
hnsw.set_searching_mode(true);
println!("searching with ef : {:?}", ef_search);
let start = ProcessTime::now();
let now = SystemTime::now();
// search
if parallel {
println!(" \n parallel search");
knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
} else {
println!(" \n serial search");
for i in 0..anndata.test_data.len() {
let knn_neighbours: Vec<Neighbour> =
hnsw.search(&anndata.test_data[i], knbn, ef_search);
knn_neighbours_for_tests.push(knn_neighbours);
}
}
let cpu_time = start.elapsed();
let search_cpu_time = cpu_time.as_micros() as f32;
let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
println!(
"total cpu time for search requests {:?} , system time {:?} ",
search_cpu_time,
now.elapsed()
);
// now compute recall rate
for i in 0..anndata.test_data.len() {
// ground-truth distance of the knbn-th true neighbour for query i.
// NOTE(review): assumes test_distances rows are sorted ascending, as in
// the ann-benchmarks ground-truth files -- confirm for other datasets.
let max_dist = anndata.test_distances.row(i)[knbn - 1];
let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i]
.iter()
.map(|p| p.distance)
.collect();
nb_returned.push(knn_neighbours_d.len());
// a returned neighbour counts as a hit when it is no farther than the
// true k-th neighbour.
let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
recalls.push(recall);
// ratio of the farthest returned distance to the true k-th distance
// (1.0 means the last returned neighbour matches the ground truth).
let mut ratio = 0.;
if !knn_neighbours_d.is_empty() {
ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
}
last_distances_ratio.push(ratio);
}
// recall averaged over all queries and all knbn requested neighbours.
let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
println!(
"\n mean fraction nb returned by search {:?} ",
(nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
);
println!(
"\n last distances ratio {:?} ",
last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
);
// throughput uses wall-clock (system) time in microseconds, not cpu time.
println!(
"\n recall rate for {:?} is {:?} , nb req /s {:?}",
anndata.fname,
mean_recall,
(nb_search as f32) * 1.0e+6_f32 / search_sys_time
);
}

View File

@@ -0,0 +1,162 @@
#![allow(clippy::needless_range_loop)]
use cpu_time::ProcessTime;
use std::time::{Duration, SystemTime};
// search in serial mode i7-core @2.7Ghz for 10 fist neighbours
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
//
// 12 400 12 1 0 0 0.917 6486 1.005
// 24 400 24 1 1 0 0.9779 3456 1.001
// parallel mode 4 i7-core @2.7Ghz
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
// 24 400 24 1 0 0 0.977 12566 1.001
// 24 400 12 1 0 0 0.947 18425 1.003
// 8 hyperthreaded i7-core @ 2.3 Ghz
// 24 400 24 1 0 0 0.977 22197 1.001
// 24 core Core(TM) i9-13900HX simdeez
// 24 400 24 1 0 0 0.977 62000 1.001
// 24 core Core(TM) i9-13900HX simdeez with modify_level_scale at 0.5
// 24 400 24 0.5 0 0 0.990 58722 1.000
use anndists::dist::*;
use hnsw_rs::prelude::*;
use log::info;
mod utils;
use utils::*;
/// Insertion + search benchmark on the ann-benchmarks fashion-mnist dataset.
/// Edit `fname` to point at a locally downloaded hdf5 file.
pub fn main() {
    // run both insertion and search in parallel mode
    let parallel = true;
    //
    let fname = String::from("/home/jpboth/Data/ANN/fashion-mnist-784-euclidean.hdf5");
    println!("\n\n test_load_hdf5 {:?}", fname);
    // now recall that data are stored in row order.
    let anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
    let knbn_max = anndata.test_distances.dim().1;
    let nb_elem = anndata.train_data.len();
    info!(
        "Train size : {}, test size : {}",
        nb_elem,
        anndata.test_data.len()
    );
    info!("Nb neighbours answers for test data : {}", knbn_max);
    //
    // hnsw construction parameters (see the table in the header comments)
    let max_nb_connection = 24;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let ef_c = 400;
    println!(
        " number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
        nb_elem, nb_layer, ef_c
    );
    println!(
        " ====================================================================================="
    );
    let nb_search = anndata.test_data.len();
    println!(" number of search {:?}", nb_search);
    let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    hnsw.set_extend_candidates(false);
    //
    hnsw.modify_level_scale(0.25);
    // parallel insertion
    let mut start = ProcessTime::now();
    let mut now = SystemTime::now();
    let data_for_par_insertion = anndata
        .train_data
        .iter()
        .map(|x| (x.0.as_slice(), x.1))
        .collect();
    if parallel {
        println!(" \n parallel insertion");
        hnsw.parallel_insert_slice(&data_for_par_insertion);
    } else {
        println!(" \n serial insertion");
        for d in data_for_par_insertion {
            hnsw.insert_slice(d);
        }
    }
    let mut cpu_time: Duration = start.elapsed();
    //
    println!(
        "\n hnsw data insertion cpu time {:?} system time {:?} ",
        cpu_time,
        now.elapsed()
    );
    hnsw.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
    //
    // Now the bench with 10 neighbours
    //
    // One entry per test vector : reserve nb_search slots
    // (the original reserved the much larger train set size nb_elem).
    let mut recalls = Vec::<usize>::with_capacity(nb_search);
    let mut nb_returned = Vec::<usize>::with_capacity(nb_search);
    let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_search);
    let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_search);
    hnsw.set_searching_mode(true);
    let knbn = 10;
    // search width : reuse max_nb_connection as ef, as in the header table
    let ef_search = max_nb_connection;
    println!("\n searching with ef : {:?}", ef_search);
    start = ProcessTime::now();
    now = SystemTime::now();
    // search
    if parallel {
        println!(" \n parallel search");
        knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
    } else {
        println!(" \n serial search");
        for i in 0..anndata.test_data.len() {
            let knn_neighbours: Vec<Neighbour> =
                hnsw.search(&anndata.test_data[i], knbn, ef_search);
            knn_neighbours_for_tests.push(knn_neighbours);
        }
    }
    cpu_time = start.elapsed();
    let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
    let search_cpu_time = cpu_time.as_micros() as f32;
    println!(
        "total cpu time for search requests {:?} , system time {:?} ",
        search_cpu_time, search_sys_time
    );
    // now compute recall rate
    for i in 0..anndata.test_data.len() {
        let true_distances = anndata.test_distances.row(i);
        // distance of the true knbn-th neighbour : any returned neighbour
        // within this distance counts as a hit
        let max_dist = true_distances[knbn - 1];
        let knn_neighbours_dist: Vec<f32> = knn_neighbours_for_tests[i]
            .iter()
            .map(|p| p.distance)
            .collect();
        nb_returned.push(knn_neighbours_dist.len());
        // count how many returned distances are within the true knbn-th distance
        let recall = knn_neighbours_dist
            .iter()
            .filter(|x| *x <= &max_dist)
            .count();
        recalls.push(recall);
        // ratio of the worst returned distance to the true knbn-th distance
        let mut ratio = 0.;
        if !knn_neighbours_dist.is_empty() {
            ratio = knn_neighbours_dist[knn_neighbours_dist.len() - 1] / max_dist;
        }
        last_distances_ratio.push(ratio);
    }
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    println!(
        "\n mean fraction nb returned by search {:?} ",
        (nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
    );
    println!(
        "\n last distances ratio {:?} ",
        last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
    );
    println!(
        "\n recall rate for {:?} is {:?} , nb req /s {:?}",
        anndata.fname,
        mean_recall,
        (nb_search as f32) * 1.0e+6_f32 / search_sys_time
    );
}

View File

@@ -0,0 +1,196 @@
#![allow(clippy::needless_range_loop)]
use cpu_time::ProcessTime;
use env_logger::Builder;
use std::time::{Duration, SystemTime};
use anndists::dist::*;
use log::info;
// search in parallel mode 8 core i7-10875H @2.3Ghz time 100 neighbours
// max_nb_conn ef_cons ef_search scale_factor extend keep pruned recall req/s last ratio
//
// 64 800 64 1 0 0 0.976 4894 1.001
// 64 800 128 1 0 0 0.985 3811 1.00064
// 64 800 128 1 1 0 0.9854 3765 1.0
// 64 1600 64 1 0 0 0.9877 3419. 1.0005
// search in parallel mode 8 core i7-10875H @2.3Ghz time for 10 neighbours
// 64 1600 64 1 0 0 0.9907 6100 1.0004
// 64 1600 128 1 0 0 0.9959 3077. 1.0001
// 24 core Core(TM) i9-13900HX simdeez
// 64 1600 64 1 0 0 0.9907 15258 1.0004
// 64 1600 128 1 0 0 0.9957 8296 1.0002
// 24 core Core(TM) i9-13900HX simdeez with level scale modification factor 0.5
//=============================================================================
// 48 1600 64 0.5 0 0 0.9938 14073 1.0002
// 48 1600 128 0.5 0 0 0.9992 7906 1.0000
// with an AMD ryzen 9 7950X 16-Core simdeez with level scale modification factor 0.5
//=============================================================================
// 48 1600 64 0.5 0 0 0.9938 17000 1.0002
// 48 1600 128 0.5 0 0 0.9992 9600 1.0000
use hnsw_rs::prelude::*;
mod utils;
use utils::*;
/// Insertion benchmark on the ann-benchmarks sift1m dataset followed by two
/// search passes (ef = 64 then ef = 128). Edit `fname` to a local copy.
pub fn main() {
    //
    Builder::from_default_env().init();
    //
    let parallel = true;
    //
    let fname = String::from("/home/jpboth/Data/ANN/sift1m-128-euclidean.hdf5");
    println!("\n\n test_load_hdf5 {:?}", fname);
    // now recall that data are stored in row order.
    let anndata = annhdf5::AnnBenchmarkData::new(fname).unwrap();
    // run bench
    let knbn_max = anndata.test_distances.dim().1;
    let nb_elem = anndata.train_data.len();
    info!(
        " train size : {}, test size : {}",
        nb_elem,
        anndata.test_data.len()
    );
    info!(" nb neighbours answers for test data : {}", knbn_max);
    //
    // hnsw construction parameters (see the table in the header comments)
    let max_nb_connection = 48;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let ef_c = 1600;
    //
    println!(
        " number of elements to insert {:?} , setting max nb layer to {:?} ef_construction {:?}",
        nb_elem, nb_layer, ef_c
    );
    println!(
        " ====================================================================================="
    );
    //
    let mut hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    //
    let extend_flag = false;
    info!("extend flag = {:?} ", extend_flag);
    hnsw.set_extend_candidates(extend_flag);
    hnsw.modify_level_scale(0.5);
    //
    // parallel insertion
    let start = ProcessTime::now();
    let now = SystemTime::now();
    let data_for_par_insertion = anndata
        .train_data
        .iter()
        .map(|x| (x.0.as_slice(), x.1))
        .collect();
    if parallel {
        println!(" \n parallel insertion");
        hnsw.parallel_insert_slice(&data_for_par_insertion);
    } else {
        println!(" \n serial insertion");
        for d in data_for_par_insertion {
            hnsw.insert_slice(d);
        }
    }
    let cpu_time: Duration = start.elapsed();
    //
    println!(
        "\n hnsw data insertion cpu time {:?} system time {:?} ",
        cpu_time,
        now.elapsed()
    );
    hnsw.dump_layer_info();
    println!(" hnsw data nb point inserted {:?}", hnsw.get_nb_point());
    //
    // first search pass, ef = 64
    //
    let knbn = 10.min(knbn_max);
    let ef_search = 64;
    println!("searching with ef = {}", ef_search);
    search(&mut hnsw, knbn, ef_search, &anndata);
    //
    // second search pass, ef = 128.
    // Fix : rebind ef_search *before* printing it; the original announced
    // "ef = 64" for the ef = 128 run.
    let ef_search = 128;
    println!("searching with ef = {}", ef_search);
    search(&mut hnsw, knbn, ef_search, &anndata);
}
/// Runs knbn-nearest-neighbour searches (width `ef_search`) over the whole
/// test set of `anndata` and prints recall statistics:
/// mean recall, mean fraction of neighbours returned, mean ratio of the worst
/// returned distance to the true knbn-th distance, and requests per second.
pub fn search<Dist>(
    hnsw: &mut Hnsw<f32, Dist>,
    knbn: usize,
    ef_search: usize,
    anndata: &annhdf5::AnnBenchmarkData,
) where
    Dist: Distance<f32> + Send + Sync,
{
    println!("\n\n ef_search : {:?} knbn : {:?} ", ef_search, knbn);
    let parallel = true;
    //
    let nb_search = anndata.test_data.len();
    //
    // One entry per test vector : reserve nb_search slots
    // (the original reserved the much larger train set size).
    let mut recalls = Vec::<usize>::with_capacity(nb_search);
    let mut nb_returned = Vec::<usize>::with_capacity(nb_search);
    let mut last_distances_ratio = Vec::<f32>::with_capacity(nb_search);
    let mut knn_neighbours_for_tests = Vec::<Vec<Neighbour>>::with_capacity(nb_search);
    hnsw.set_searching_mode(true);
    println!("searching with ef : {:?}", ef_search);
    let start = ProcessTime::now();
    let now = SystemTime::now();
    // search
    if parallel {
        println!(" \n parallel search");
        knn_neighbours_for_tests = hnsw.parallel_search(&anndata.test_data, knbn, ef_search);
    } else {
        println!(" \n serial search");
        for i in 0..anndata.test_data.len() {
            let knn_neighbours: Vec<Neighbour> =
                hnsw.search(&anndata.test_data[i], knbn, ef_search);
            knn_neighbours_for_tests.push(knn_neighbours);
        }
    }
    let cpu_time = start.elapsed();
    let search_cpu_time = cpu_time.as_micros() as f32;
    let search_sys_time = now.elapsed().unwrap().as_micros() as f32;
    println!(
        "total cpu time for search requests {:?} , system time {:?} ",
        search_cpu_time,
        now.elapsed()
    );
    // now compute recall rate
    for i in 0..anndata.test_data.len() {
        // distance of the true knbn-th neighbour : any returned neighbour
        // within this distance counts as a hit
        let max_dist = anndata.test_distances.row(i)[knbn - 1];
        let knn_neighbours_d: Vec<f32> = knn_neighbours_for_tests[i]
            .iter()
            .map(|p| p.distance)
            .collect();
        nb_returned.push(knn_neighbours_d.len());
        let recall = knn_neighbours_d.iter().filter(|d| *d <= &max_dist).count();
        recalls.push(recall);
        // ratio of the worst returned distance to the true knbn-th distance
        let mut ratio = 0.;
        if !knn_neighbours_d.is_empty() {
            ratio = knn_neighbours_d[knn_neighbours_d.len() - 1] / max_dist;
        }
        last_distances_ratio.push(ratio);
    }
    let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
    println!(
        "\n mean fraction nb returned by search {:?} ",
        (nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
    );
    println!(
        "\n last distances ratio {:?} ",
        last_distances_ratio.iter().sum::<f32>() / last_distances_ratio.len() as f32
    );
    println!(
        "\n recall rate for {:?} is {:?} , nb req /s {:?}",
        anndata.fname,
        mean_recall,
        (nb_search as f32) * 1.0e+6_f32 / search_sys_time
    );
} // end of search

View File

@@ -0,0 +1,63 @@
use anndists::dist::*;
use hnsw_rs::prelude::*;
use rand::Rng;
use std::iter;
fn generate(len: usize) -> String {
const CHARSET: &[u8] = b"abcdefghij";
let mut rng = rand::rng();
let one_char = || CHARSET[rng.random_range(0..CHARSET.len())] as char;
iter::repeat_with(one_char).take(len).collect()
}
/// Levenshtein-distance example : indexes random 5-letter words (plus one
/// word close to the query) and searches with and without an id filter.
fn main() {
    let nb_elem = 500000; // number of possible words in the dictionary
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let ef_c = 200;
    let nb_words = 1000;
    let hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    // nb_words - 1 random words, plus one word near the query "abcde"
    let mut words = vec![];
    for _n in 1..nb_words {
        let tw = generate(5);
        words.push(tw);
    }
    words.push(String::from("abcdj"));
    // insert each word as a vector of u16 code points, id = rank in `words`
    for (i, w) in words.iter().enumerate() {
        let vec: Vec<u16> = w.chars().map(|c| c as u16).collect();
        hns.insert((&vec, i));
    }
    // create a filter restricting the search to ids 1..=99
    // (collect the range directly instead of a manual push loop)
    let filter: Vec<usize> = (1..100).collect();
    //
    let ef_search: usize = 30;
    let tosearch: Vec<u16> = "abcde".chars().map(|c| c as u16).collect();
    //
    println!("========== search with filter ");
    let res = hns.search_filter(&tosearch, 10, ef_search, Some(&filter));
    for r in res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    println!("========== search without filter ");
    let res3 = hns.search(&tosearch, 10, ef_search);
    for r in res3 {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
}

View File

@@ -0,0 +1,80 @@
#![allow(clippy::needless_range_loop)]
#![allow(clippy::range_zip_with_len)]
use cpu_time::ProcessTime;
use rand::distr::Uniform;
use rand::prelude::*;
use std::time::{Duration, SystemTime};
use anndists::dist::*;
use hnsw_rs::prelude::*;
/// Compares parallel vs serial insertion of random vectors, then issues a few
/// random searches on the serially-built index.
fn main() {
    env_logger::Builder::from_default_env().init();
    //
    let nb_elem = 500000;
    let dim = 25;
    // generate nb_elem column vectors of dimension dim
    let mut rng = rand::rng();
    let unif = rand::distr::StandardUniform;
    let mut data = Vec::with_capacity(nb_elem);
    for _ in 0..nb_elem {
        let column = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
        data.push(column);
    }
    // give an id to each data
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
    let ef_c = 200;
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let hns = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    let mut start = ProcessTime::now();
    let mut begin_t = SystemTime::now();
    hns.parallel_insert(&data_with_id);
    let mut cpu_time: Duration = start.elapsed();
    println!(" hnsw data insertion cpu time {:?}", cpu_time);
    println!(
        " hnsw data insertion parallel, system time {:?} \n",
        begin_t.elapsed().unwrap()
    );
    hns.dump_layer_info();
    println!(
        " parallel hnsw data nb point inserted {:?}",
        hns.get_nb_point()
    );
    //
    // serial insertion on a fresh index, for comparison
    //
    let hns = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    start = ProcessTime::now();
    begin_t = SystemTime::now();
    // iterate directly instead of indexing (same insertion order)
    for d in &data_with_id {
        hns.insert((d.0.as_slice(), d.1))
    }
    cpu_time = start.elapsed();
    println!("\n\n serial hnsw data insertion {:?}", cpu_time);
    println!(
        " hnsw data insertion serial, system time {:?}",
        begin_t.elapsed().unwrap()
    );
    hns.dump_layer_info();
    println!(
        " serial hnsw data nb point inserted {:?}",
        hns.get_nb_point()
    );
    let ef_search = max_nb_connection * 2;
    let knbn = 10;
    //
    // random queries; rng and the distribution are loop-invariant, so they
    // are built once instead of once per iteration as before
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    for _iter in 0..100 {
        let mut r_vec = Vec::<f32>::with_capacity(dim);
        for _ in 0..dim {
            r_vec.push(unif.sample(&mut rng));
        }
        //
        let _neighbours = hns.search(&r_vec, knbn, ef_search);
    }
}

View File

@@ -0,0 +1,233 @@
//! This file provides hdf5 utilities to load ann-benchmarks hdf5 data files
//! As the library does not depend on hdf5 nor on ndarray, it is nearly the same for both
//! ann benchmarks.
use ndarray::Array2;
use ::hdf5::*;
use log::debug;
// datasets
// . distances (nbojects, dim) f32 matrix for tests objects
// . neighbors (nbobjects, nbnearest) int32 matrix giving the num of nearest neighbors in train data
// . test (nbobjects, dim) f32 matrix test data
// . train (nbobjects, dim) f32 matrix train data
/// a structure to load hdf5 data file benchmarks from https://github.com/erikbern/ann-benchmarks
pub struct AnnBenchmarkData {
    /// path of the hdf5 file the data was loaded from
    pub fname: String,
    /// distances from each test object to its nearest neighbours.
    pub test_distances: Array2<f32>,
    /// for each test data , id of its nearest neighbours
    #[allow(unused)]
    pub test_neighbours: Array2<i32>,
    /// list of vectors for which we will search ann.
    pub test_data: Vec<Vec<f32>>,
    /// list of data vectors and id
    pub train_data: Vec<(Vec<f32>, usize)>,
    /// searched results. first neighbours for each test data.
    #[allow(unused)]
    pub searched_neighbours: Vec<Vec<i32>>,
    /// distances of neighbours obtained of each test
    #[allow(unused)]
    pub searched_distances: Vec<Vec<f32>>,
}
impl AnnBenchmarkData {
    /// Opens the hdf5 benchmark file `fname` and loads the four datasets
    /// (distances, neighbors, test, train) into memory.
    /// Panics with an explicit message if the file or a dataset is missing
    /// or has an unexpected type or shape.
    pub fn new(fname: String) -> Result<AnnBenchmarkData> {
        let res = hdf5::File::open(fname.clone());
        if res.is_err() {
            println!("you must download file {:?}", fname);
            panic!(
                "download benchmark file some where and modify examples source file accordingly"
            );
        }
        let file = res.ok().unwrap();
        //
        // get test distances
        //
        let res_distances = file.dataset("distances");
        if res_distances.is_err() {
            panic!("error getting distances dataset");
        }
        let distances = res_distances.unwrap();
        let shape = distances.shape();
        assert_eq!(shape.len(), 2);
        let dataf32 = distances.dtype().unwrap().is::<f32>();
        if !dataf32 {
            panic!("error getting type distances dataset");
        }
        // really read the data
        let res = distances.read_2d::<f32>();
        if res.is_err() {
            panic!("error reading distances dataset");
        }
        let test_distances = res.unwrap();
        // a check for row order
        debug!(
            "First 2 distances for first test {:?} {:?} ",
            test_distances.get((0, 0)).unwrap(),
            test_distances.get((0, 1)).unwrap()
        );
        //
        // read neighbours
        //
        let res_neighbours = file.dataset("neighbors");
        if res_neighbours.is_err() {
            panic!("error getting neighbours");
        }
        let neighbours = res_neighbours.unwrap();
        let shape = neighbours.shape();
        assert_eq!(shape.len(), 2);
        println!("neighbours shape : {:?}", shape);
        let datai32 = neighbours.dtype().unwrap().is::<i32>();
        if !datai32 {
            panic!("error getting type neighbours");
        }
        // really read the data
        let res = neighbours.read_2d::<i32>();
        if res.is_err() {
            panic!("error reading neighbours dataset");
        }
        let test_neighbours = res.unwrap();
        debug!(
            "First 2 neighbours for first test {:?} {:?} ",
            test_neighbours.get((0, 0)).unwrap(),
            test_neighbours.get((0, 1)).unwrap()
        );
        println!("\n 10 first neighbours for first vector : ");
        for i in 0..10 {
            print!(" {:?} ", test_neighbours.get((0, i)).unwrap());
        }
        println!("\n 10 first neighbours for second vector : ");
        for i in 0..10 {
            print!(" {:?} ", test_neighbours.get((1, i)).unwrap());
        }
        //
        // read test data
        // ===============
        //
        let res_testdata = file.dataset("test");
        if res_testdata.is_err() {
            // fix : panic message was garbled ("test de notataset")
            panic!("error getting test dataset");
        }
        let test_data = res_testdata.unwrap();
        let shape = test_data.shape(); // nota shape returns a slice, dim returns a t-uple
        assert_eq!(shape.len(), 2);
        let dataf32 = test_data.dtype().unwrap().is::<f32>();
        if !dataf32 {
            panic!("error getting type of test dataset");
        }
        // really read the data
        let res = test_data.read_2d::<f32>();
        if res.is_err() {
            panic!("error reading test dataset");
        }
        let test_data_2d = res.unwrap();
        // fix : reserve one slot per row (shape[0]); shape[1] is the dimension
        let mut test_data = Vec::<Vec<f32>>::with_capacity(shape[0]);
        let (nbrow, nbcolumn) = test_data_2d.dim();
        println!(" test data, nb element {:?}, dim : {:?}", nbrow, nbcolumn);
        for i in 0..nbrow {
            let mut vec = Vec::with_capacity(nbcolumn);
            for j in 0..nbcolumn {
                vec.push(*test_data_2d.get((i, j)).unwrap());
            }
            test_data.push(vec);
        }
        //
        // load train data
        //
        let res_traindata = file.dataset("train");
        if res_traindata.is_err() {
            panic!("error getting train dataset");
        }
        let train_data = res_traindata.unwrap();
        let train_shape = train_data.shape();
        // fix : check the train shape, not the (already checked) test shape
        assert_eq!(train_shape.len(), 2);
        if test_data_2d.dim().1 != train_shape[1] {
            println!("test and train have not the same dimension");
            panic!();
        }
        println!(
            "\n train data shape : {:?}, nbvector {:?} ",
            train_shape, train_shape[0]
        );
        let dataf32 = train_data.dtype().unwrap().is::<f32>();
        if !dataf32 {
            panic!("error getting type of train dataset");
        }
        // really read the data
        let res = train_data.read_2d::<f32>();
        if res.is_err() {
            panic!("error reading train dataset");
        }
        let train_data_2d = res.unwrap();
        // fix : reserve one slot per train row, not the test column count
        let mut train_data = Vec::<(Vec<f32>, usize)>::with_capacity(train_shape[0]);
        let (nbrow, nbcolumn) = train_data_2d.dim();
        for i in 0..nbrow {
            let mut vec = Vec::with_capacity(nbcolumn);
            for j in 0..nbcolumn {
                vec.push(*train_data_2d.get((i, j)).unwrap());
            }
            train_data.push((vec, i));
        }
        //
        // now allocate arrays for results
        //
        println!(
            " allocating vector for search neighbours answer : {:?}",
            test_data.len()
        );
        let searched_neighbours = Vec::<Vec<i32>>::with_capacity(test_data.len());
        let searched_distances = Vec::<Vec<f32>>::with_capacity(test_data.len());
        //
        Ok(AnnBenchmarkData {
            fname: fname.clone(),
            test_distances,
            test_neighbours,
            test_data,
            train_data,
            searched_neighbours,
            searched_distances,
        })
    } // end new

    /// do l2 normalisation of test and train vectors to use the DistDot metric
    /// instead of DistCosine to spare cpu
    #[allow(unused)]
    pub fn do_l2_normalization(&mut self) {
        for v in self.test_data.iter_mut() {
            anndists::dist::l2_normalize(v);
        }
        for t in self.train_data.iter_mut() {
            anndists::dist::l2_normalize(&mut t.0);
        }
    } // end of do_l2_normalization
} // end of impl block
#[cfg(test)]
mod tests {
    use super::*;
    // NOTE(review): this test reads a hard-coded local path and will fail
    // unless the glove-25-angular benchmark file is present there.
    #[test]
    fn test_load_hdf5() {
        env_logger::Builder::from_default_env().init();
        //
        let fname = String::from("/home.2/Data/ANN/glove-25-angular.hdf5");
        println!("\n\n test_load_hdf5 {:?}", fname);
        // now recall that data are stored in row order.
        // loading panics on any structural problem, so unwrap is the assertion
        let _anndata = AnnBenchmarkData::new(fname).unwrap();
        //
    } // end of test_load_hdf5
} // end of module test

View File

@@ -0,0 +1,3 @@
//! hdf5 utilities for examples
pub mod annhdf5;

View File

@@ -0,0 +1,87 @@
//! Api for external language.
//! This file provides a trait to be used as an opaque pointer for C or Julia calls used in file libext.rs
use std::path::Path;
use serde::{de::DeserializeOwned, Serialize};
use crate::hnsw::*;
use crate::hnswio::*;
use anndists::dist::distances::Distance;
use log::info;
/// Trait implemented by Hnsw, used as an opaque interface for the C / Julia
/// bindings in libext.rs.
pub trait AnnT {
    /// type of data vectors
    type Val;
    /// inserts the vector `data` under identifier `id`
    fn insert_data(&mut self, data: &[Self::Val], id: usize);
    /// returns the `knbn` nearest neighbours of `data`, searching with width `ef_s`
    fn search_neighbours(&self, data: &[Self::Val], knbn: usize, ef_s: usize) -> Vec<Neighbour>;
    /// inserts a batch of (vector, id) pairs, possibly in parallel
    fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]);
    /// batched version of [Self::search_neighbours], one result Vec per query
    fn parallel_search_neighbours(
        &self,
        data: &[Vec<Self::Val>],
        knbn: usize,
        ef_s: usize,
    ) -> Vec<Vec<Neighbour>>;
    ///
    /// dumps data and graph in 2 files.
    /// Data are dumped in file filename.hnsw.data and graph in filename.hnsw.graph
    ///
    /// **We do not overwrite old files if they are currently in use by memory map**
    /// If these files already exist , they are not overwritten and a unique filename is generated by concatenating a random number to filename.
    /// The function returns the basename used for the dump
    fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String>;
}
impl<T, D> AnnT for Hnsw<'_, T, D>
where
T: Serialize + DeserializeOwned + Clone + Send + Sync,
D: Distance<T> + Send + Sync,
{
type Val = T;
//
fn insert_data(&mut self, data: &[Self::Val], id: usize) {
self.insert((data, id));
}
//
fn search_neighbours(&self, data: &[T], knbn: usize, ef_s: usize) -> Vec<Neighbour> {
self.search(data, knbn, ef_s)
}
fn parallel_insert_data(&mut self, data: &[(&Vec<Self::Val>, usize)]) {
self.parallel_insert(data);
}
fn parallel_search_neighbours(
&self,
data: &[Vec<Self::Val>],
knbn: usize,
ef_s: usize,
) -> Vec<Vec<Neighbour>> {
self.parallel_search(data, knbn, ef_s)
}
// The main entry point to do a dump.
// It will generate two files one for the graph part of the data. The other for the real data points of the structure.
// The names of file are $filename.hnsw.graph for the graph and $filename.hnsw.data.
fn file_dump(&self, path: &Path, file_basename: &str) -> anyhow::Result<String> {
info!("In Hnsw::file_dump");
//
// do not overwrite if mmap is active
let overwrite = !self.get_datamap_opt();
let mut dumpinit = DumpInit::new(path, file_basename, overwrite);
let dumpname = dumpinit.get_basename().clone();
//
let res = self.dump(DumpMode::Full, &mut dumpinit);
//
dumpinit.flush()?;
info!("\n End of dump, file basename : {}\n", &dumpname);
if res.is_ok() {
Ok(dumpname)
} else {
Err(anyhow::anyhow!("unexpected error"))
}
} // end of dump
} // end of impl block AnnT for Hnsw<T,D>

View File

@@ -0,0 +1,457 @@
//! This module provides a memory mapping of Data vectors filling the Hnsw structure.
//! It is used by the module [hnswio] and also gives access to an iterator over data without loading the graph.
//!
//! We mmap the file and provide
//! - a Hashmap from DataId to address
//! - an interface for retrieving just data vectors loaded in the hnsw structure.
use std::io::BufReader;
use std::fs::{File, OpenOptions};
use std::path::{Path, PathBuf};
use indexmap::map::IndexMap;
use log::{debug, error, info, trace};
use mmap_rs::{Mmap, MmapOptions};
use crate::hnsw::DataId;
use crate::hnswio;
use crate::hnswio::MAGICDATAP;
/// This structure uses the data part of the dump of a Hnsw structure to retrieve the data.
/// The data is accessed via a mmap of the data file, so memory is spared at the expense of page loading.
// possibly to be used in graph to spare memory?
pub struct DataMap {
    /// File containing Points data
    _datapath: PathBuf,
    /// The mmap structure over the data file
    mmap: Mmap,
    /// map a dataId to an address (offset in the mmap) where we get a bson encoded vector of type T
    hmap: IndexMap<DataId, usize>,
    /// type name of Data
    t_name: String,
    /// dimension of data vector
    dimension: usize,
    /// name of the distance, as stored in the dump description
    distname: String,
} // end of DataMap
impl DataMap {
// TODO: specifiy mmap option
/// The fname argument corresponds to the basename of the dump.
/// To reload from file fname.hnsw.data just pass fname as argument.
/// The dir argument is the directory where the fname.hnsw.data and fname.hnsw.graph reside.
pub fn from_hnswdump<T: std::fmt::Debug>(
dir: &Path,
file_name: &str,
) -> Result<DataMap, String> {
// reload description to have data type, and check for dump version
let mut graphpath = PathBuf::from(dir);
graphpath.push(dir);
let mut filename = file_name.to_string();
filename.push_str(".hnsw.graph");
graphpath.push(filename);
let graphfileres = OpenOptions::new().read(true).open(&graphpath);
if graphfileres.is_err() {
println!("DataMap: could not open file {:?}", graphpath.as_os_str());
std::process::exit(1);
}
let graphfile = graphfileres.unwrap();
let mut graph_in = BufReader::new(graphfile);
// we need to call load_description first to get distance name
let hnsw_description = hnswio::load_description(&mut graph_in).unwrap();
if hnsw_description.format_version <= 2 {
let msg = String::from(
"from_hnsw::from_hnsw : data mapping is only possible for dumps with the version > 0.1.19 of this crate",
);
error!(
"Data mapping is only possible for dumps with the version > 0.1.19 of this crate"
);
return Err(msg);
}
let distname = hnsw_description.distname.clone();
let t_name = hnsw_description.get_typename();
// check typename coherence
info!("Got typename from reload : {:?}", t_name);
if std::any::type_name::<T>() != t_name {
error!(
"Description has typename {:?}, function type argument is : {:?}",
t_name,
std::any::type_name::<T>()
);
return Err(String::from("type error"));
}
// get dimension as declared in description
let descr_dimension = hnsw_description.get_dimension();
drop(graph_in);
//
// we know data filename is hnswdump.hnsw.data
//
let mut datapath = PathBuf::new();
datapath.push(dir);
let mut filename = file_name.to_string();
filename.push_str(".hnsw.data");
datapath.push(filename);
//
let meta = std::fs::metadata(&datapath);
if meta.is_err() {
error!("Could not open file : {:?}", &datapath);
std::process::exit(1);
}
let fsize = meta.unwrap().len().try_into().unwrap();
//
let file_res = File::open(&datapath);
if file_res.is_err() {
error!("Could not open file : {:?}", &datapath);
std::process::exit(1);
}
let file = file_res.unwrap();
let offset = 0;
//
let mmap_opt = MmapOptions::new(fsize).unwrap();
let mmap_opt = unsafe { mmap_opt.with_file(&file, offset) };
let mapping_res = mmap_opt.map();
if mapping_res.is_err() {
error!("Could not memory map : {:?}", &datapath);
std::process::exit(1);
}
let mmap = mapping_res.unwrap();
//
info!("Mmap done on file : {:?}", &datapath);
//
// where are we in decoding mmap slice? at beginning
//
let mapped_slice = mmap.as_slice();
//
// where are we in decoding mmap slice?
let mut current_mmap_addr = 0usize;
let mut usize_slice = [0u8; std::mem::size_of::<usize>()];
// check magic
let mut u32_slice = [0u8; std::mem::size_of::<u32>()];
u32_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
);
current_mmap_addr += std::mem::size_of::<u32>();
let magic = u32::from_ne_bytes(u32_slice);
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
// get dimension
usize_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<usize>()],
);
current_mmap_addr += std::mem::size_of::<usize>();
let dimension = usize::from_ne_bytes(usize_slice);
if dimension != descr_dimension {
error!(
"Description and data do not agree on dimension, data got : {:?}, description got : {:?}",
dimension, descr_dimension
);
return Err(String::from(
"description and data do not agree on dimension",
));
} else {
info!("Got dimension : {:?}", dimension);
}
//
// now we know that each record consists in
// - MAGICDATAP (u32), DataId (u64), dimension (u64) and then (length of type in bytes * dimension)
//
let record_size = std::mem::size_of::<u32>()
+ 2 * std::mem::size_of::<u64>()
+ dimension * std::mem::size_of::<T>();
let residual = mmap.size() - current_mmap_addr;
info!(
"Mmap size {}, current_mmap_addr {}, residual : {}",
mmap.size(),
current_mmap_addr,
residual
);
let nb_record = residual / record_size;
debug!("Record size : {}, nb_record : {}", record_size, nb_record);
// allocate hmap with correct capacity
let mut hmap = IndexMap::<DataId, usize>::with_capacity(nb_record);
// fill hmap to have address of each data point in file
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
//
// now we loop on records
//
for i in 0..nb_record {
debug!("Record i : {}, addr : {}", i, current_mmap_addr);
// decode Magic
u32_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u32>()],
);
current_mmap_addr += std::mem::size_of::<u32>();
let magic = u32::from_ne_bytes(u32_slice);
assert_eq!(magic, MAGICDATAP, "magic not equal to MAGICDATAP in mmap");
// decode DataId
u64_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
);
current_mmap_addr += std::mem::size_of::<u64>();
let data_id = u64::from_ne_bytes(u64_slice) as usize;
debug!(
"Inserting in hmap : got dataid : {:?} current map address : {:?}",
data_id, current_mmap_addr
);
// Note we store address where we have to decode dimension*size_of::<T> and full bson encoded vector
hmap.insert(data_id, current_mmap_addr);
// now read serialized length
u64_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
);
current_mmap_addr += std::mem::size_of::<u64>();
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
if i == 0 {
debug!("serialized bytes len to reload {:?}", serialized_len);
}
let mut v_serialized = vec![0; serialized_len];
v_serialized.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + serialized_len],
);
current_mmap_addr += serialized_len;
let slice_t =
unsafe { std::slice::from_raw_parts(v_serialized.as_ptr() as *const T, dimension) };
trace!(
"Deserialized v : {:?} address : {:?} ",
slice_t,
v_serialized.as_ptr() as *const T
);
} // end of for on record
//
debug!("End of DataMap::from_hnsw.");
//
let datamap = DataMap {
_datapath: datapath,
mmap,
hmap,
t_name,
dimension: descr_dimension,
distname,
};
//
Ok(datamap)
} // end of from_datas
//
/// returns true if type T corresponds to type as retrieved in DataMap.
/// This function can (should!) be used before calling [Self::get_data()]
pub fn check_data_type<T>(&self) -> bool
where
T: 'static + Sized,
{
// we check last part of name of type
let tname_vec = self.t_name.rsplit_terminator("::").collect::<Vec<&str>>();
if tname_vec.last().is_none() {
let errmsg = "DataMap::check_data_type() cannot determine data type name ";
error!("DataMap::check_data_type() cannot determine data type name ");
std::panic!("DataMap::check_data_type(), {}", errmsg);
}
let tname_last = tname_vec.last().unwrap();
//
let datat_name_arg = std::any::type_name::<T>().to_string();
let datat_name_vec = datat_name_arg
.rsplit_terminator("::")
.collect::<Vec<&str>>();
let datat_name_arg_last = datat_name_vec.last().unwrap();
//
if datat_name_arg_last == tname_last {
true
} else {
info!(
"Data type in DataMap : {}, type arg = {}",
tname_last, datat_name_arg_last
);
false
}
} // end of check_data_type
//
/// return the data corresponding to dataid. Access is done using mmap.
/// Function returns None if address is invalid
/// This function requires you know the type T.
/// **As mmap loading calls an unsafe function it is recommended to check the type name with [Self::check_data_type()]**
pub fn get_data<'a, T: Clone + std::fmt::Debug>(&'a self, dataid: &DataId) -> Option<&'a [T]> {
//
trace!("In DataMap::get_data, dataid : {:?}", dataid);
let address = self.hmap.get(dataid)?;
debug!("Address for id : {}, address : {:?}", dataid, address);
let mut current_mmap_addr = *address;
let mapped_slice = self.mmap.as_slice();
let mut u64_slice = [0u8; std::mem::size_of::<u64>()];
u64_slice.copy_from_slice(
&mapped_slice[current_mmap_addr..current_mmap_addr + std::mem::size_of::<u64>()],
);
let serialized_len = u64::from_ne_bytes(u64_slice) as usize;
current_mmap_addr += std::mem::size_of::<u64>();
trace!("Serialized bytes len to reload {:?}", serialized_len);
let slice_t = unsafe {
std::slice::from_raw_parts(
mapped_slice[current_mmap_addr..].as_ptr() as *const T,
self.dimension,
)
};
Some(slice_t)
}
/// returns Keys in order they are in the file, thus optimizing file/memory access.
/// Note that in case of parallel insertion this can be different from insertion order.
pub fn get_dataid_iter(&self) -> indexmap::map::Keys<'_, DataId, usize> {
    self.hmap.keys()
}
/// returns the full (path-qualified) name of the data type stored in the map
pub fn get_data_typename(&self) -> String {
    self.t_name.clone()
}
/// returns the full name of the distance used by the dumped structure
/// (the previous comment was a copy-paste of get_data_typename's)
pub fn get_distname(&self) -> String {
    self.distname.clone()
}
/// return the number of data items addressable through the mmap
pub fn get_nb_data(&self) -> usize {
    self.hmap.len()
}
} // end of impl DataMap
//=====================================================================================
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hnswio::HnswIo;
    use anndists::dist::*;
    pub use crate::api::AnnT;
    use crate::prelude::*;
    use rand::distr::{Distribution, Uniform};

    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // Dumps a small random Hnsw into a temp directory, then checks random
    // access and iteration through a DataMap built from the dump.
    #[test]
    fn test_file_mmap() {
        println!("\n\n test_file_mmap");
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        // nbcolumn vectors of size nbrow f32
        let nbcolumn = 50;
        let nbrow = 11;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
            debug!("j : {:?}, data : {:?} ", j, &data[j]);
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "mmap_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        let check_reload = false;
        if check_reload {
            // We check we can reload.
            // BUGFIX: reload must read from the same directory the dump was
            // written to; the previous code created a fresh (empty) tempdir
            // here, which would make the reload fail if this branch were enabled.
            debug!("HNSW reload.");
            let mut reloader = HnswIo::new(directory.path(), fname);
            let hnsw_loaded: Hnsw<f32, DistL1> = reloader.load_hnsw::<f32, DistL1>().unwrap();
            check_graph_equality(&hnsw_loaded, &hnsw);
            info!("========= reload success, going to mmap reloading =========");
        }
        //
        // now we have check that datamap seems ok, test reload of hnsw with mmap
        let datamap: DataMap = DataMap::from_hnswdump::<f32>(directory.path(), fname).unwrap();
        let nb_test = 30;
        info!("Checking random access of id , nb test : {}", nb_test);
        for _ in 0..nb_test {
            // sample an id in 0..nb_data
            let unif = Uniform::<usize>::new(0, nbcolumn).unwrap();
            let id = unif.sample(&mut rng);
            let d = datamap.get_data::<f32>(&id);
            assert!(d.is_some());
            if d.is_some() {
                debug!("id = {}, v = {:?}", id, d.as_ref().unwrap());
                assert_eq!(d.as_ref().unwrap(), &data[id]);
            }
        }
        // test iterator from datamap
        let keys = datamap.get_dataid_iter();
        for k in keys {
            let _data = datamap.get_data::<f32>(k);
        }
    } // end of test_file_mmap

    // Checks the type guard and that get_dataid_iter yields ids whose mapped
    // data matches what was inserted.
    #[test]
    fn test_mmap_iter() {
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<u32>::new(0, 10000).unwrap();
        // nbcolumn vectors of size nbrow u32
        let nbcolumn = 50;
        let nbrow = 11;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
            debug!("j : {:?}, data : {:?} ", j, &data[j]);
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<u32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "mmap_order_test";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // now we have check that datamap seems ok, test reload of hnsw with mmap
        let datamap: DataMap = DataMap::from_hnswdump::<u32>(directory.path(), fname).unwrap();
        // testing type check
        assert!(datamap.check_data_type::<u32>());
        assert!(!datamap.check_data_type::<f32>());
        info!("Datamap iteration order checking");
        let keys = datamap.get_dataid_iter();
        for (i, dataid) in keys.enumerate() {
            let v = datamap.get_data::<u32>(dataid).unwrap();
            assert_eq!(v, &data[*dataid], "dataid = {}, ukey = {}", dataid, i);
        }
        // BUGFIX: the dump files live in `directory` (a TempDir) and are
        // removed when it drops. The previous explicit remove_file calls
        // targeted the *current working directory* instead and could have
        // deleted unrelated files with the same name; they were removed.
    }
    //
} // end of mod tests

View File

@@ -0,0 +1,24 @@
//! defines a trait for filtering requests.
//! See examples in tests/filtertest.rs
use crate::prelude::DataId;
/// Only queries returning true are taken into account along the search
/// Only queries returning true are taken into account along the search.
pub trait FilterT {
    /// returns true if the point identified by `id` may appear in search results
    fn hnsw_filter(&self, id: &DataId) -> bool;
}
// A sorted vector of allowed ids acts as a filter.
// NOTE: `binary_search` requires the vector to be sorted; an unsorted vector
// gives unspecified results (std `slice::binary_search` contract).
impl FilterT for Vec<usize> {
    fn hnsw_filter(&self, id: &DataId) -> bool {
        self.binary_search(id).is_ok()
    }
}
// Any closure/function taking &DataId and returning bool is usable as a filter.
impl<F> FilterT for F
where
    F: Fn(&DataId) -> bool,
{
    fn hnsw_filter(&self, id: &DataId) -> bool {
        self(id)
    }
}

View File

@@ -0,0 +1,200 @@
//! This module provides conversion of a Point structure to a FlatPoint containing just the Id of a point
//! and those of its neighbours.
//! The whole Hnsw structure is then flattened into a Hashtable associating the data ID of a point to
//! its corresponding FlatPoint.
//! It can be used, for example, when reloading only the graph part of the data to have knowledge
//! of relative proximity of points as described just by their DataId
//!
use hashbrown::HashMap;
use std::cmp::Ordering;
use crate::hnsw;
use anndists::dist::distances::Distance;
use hnsw::*;
use log::error;
// an ordering of Neighbour of a Point : equality is by distance only, the
// point ids are deliberately ignored.
impl PartialEq for Neighbour {
    fn eq(&self, other: &Neighbour) -> bool {
        self.distance == other.distance
    } // end eq
}
// NOTE(review): Eq on a float-keyed equality is only sound because NaN
// distances panic in Ord::cmp below — confirm distances can never be NaN.
impl Eq for Neighbour {}
// order points by distance to self.
#[allow(clippy::non_canonical_partial_ord_impl)]
impl PartialOrd for Neighbour {
    fn partial_cmp(&self, other: &Neighbour) -> Option<Ordering> {
        self.distance.partial_cmp(&other.distance)
    } // end cmp
} // end impl PartialOrd
impl Ord for Neighbour {
    // total order over distances; panics if either distance is NaN
    fn cmp(&self, other: &Neighbour) -> Ordering {
        if !self.distance.is_nan() && !other.distance.is_nan() {
            self.distance.partial_cmp(&other.distance).unwrap()
        } else {
            panic!("got a NaN in a distance");
        }
    } // end cmp
}
/// a reduced version of point inserted in the Hnsw structure.
/// It contains original id of point as submitted to the struct Hnsw,
/// an ordered (by distance) list of neighbours to the point,
/// and its position in layers.
#[derive(Clone)]
pub struct FlatPoint {
    /// an id coming from client using hnsw, should identify point uniquely
    origin_id: DataId,
    /// a point id identifying point as stored in our structure
    p_id: PointId,
    /// neighbours info, sorted by increasing distance
    neighbours: Vec<Neighbour>,
}
impl FlatPoint {
    /// returns the neighbours ordered by increasing distance.
    pub fn get_neighbours(&self) -> &Vec<Neighbour> {
        &self.neighbours
    }
    /// returns the origin id of the point (the id supplied by the client)
    pub fn get_id(&self) -> DataId {
        self.origin_id
    }
    /// returns the internal point id (position in layers)
    pub fn get_p_id(&self) -> PointId {
        self.p_id
    }
} // end impl block for FlatPoint
/// Builds a [FlatPoint] from a [Point] : neighbours from every layer are
/// merged into a single list sorted by increasing distance.
fn flatten_point<T: Clone + Send + Sync>(point: &Point<T>) -> FlatPoint {
    // flatten the per-layer neighbour lists into one vector
    // (idiomatic Iterator::flatten replaces the former nested loops)
    let mut flat_neighbours: Vec<Neighbour> = point
        .get_neighborhood_id()
        .into_iter()
        .flatten()
        .collect();
    flat_neighbours.sort_unstable();
    FlatPoint {
        origin_id: point.get_origin_id(),
        p_id: point.get_point_id(),
        neighbours: flat_neighbours,
    }
} // end of flatten_point
/// A structure providing neighbourhood information of a point stored in the Hnsw structure given its DataId.
/// The structure uses the [FlatPoint] structure.
/// This structure can be obtained by FlatNeighborhood::from<&Hnsw<T,D>>
pub struct FlatNeighborhood {
    // maps each client-supplied DataId to its flattened neighbourhood
    hash_t: HashMap<DataId, FlatPoint>,
}
impl FlatNeighborhood {
    /// get neighbours of a point given its id.
    /// The neighbours are sorted in increasing distance from the point.
    /// Returns None if the id is unknown.
    pub fn get_neighbours(&self, p_id: DataId) -> Option<Vec<Neighbour>> {
        match self.hash_t.get(&p_id) {
            Some(flat_point) => Some(flat_point.get_neighbours().clone()),
            None => None,
        }
    }
} // end impl block for FlatNeighborhood
impl<T: Clone + Send + Sync, D: Distance<T> + Send + Sync> From<&Hnsw<'_, T, D>>
    for FlatNeighborhood
{
    /// extract from the Hnsw structure a hashtable mapping original DataId into a FlatPoint structure gathering its neighbourhood information.
    /// Useful after reloading from a dump with T=NoData and D = NoDist as points are then reloaded with neighbourhood information only.
    fn from(hnsw: &Hnsw<T, D>) -> Self {
        let mut hash_t = HashMap::new();
        let pt_iter = hnsw.get_point_indexation().into_iter();
        //
        for point in pt_iter {
            // insert returns the previous value if the key was already present
            let res_insert = hash_t.insert(point.get_origin_id(), flatten_point(&point));
            if let Some(old_point) = res_insert {
                error!("2 points with same origin id {:?}", old_point.origin_id);
            }
        }
        FlatNeighborhood { hash_t }
    }
} // end of From implementation
#[cfg(test)]
mod tests {
    use super::*;
    use anndists::dist::distances::*;
    use log::debug;
    use crate::api::AnnT;
    use crate::hnswio::*;
    use rand::distr::{Distribution, Uniform};

    fn log_init_test() {
        let _ = env_logger::builder().is_test(true).try_init();
    }

    // Builds an index, snapshots the flat neighbourhood of a point, dumps the
    // index to disk, reloads it as (NoData, NoDist) and checks that the
    // reloaded graph yields the same flat neighbourhood.
    #[test]
    fn test_dump_reload_graph_flatten() {
        println!("\n\n test_dump_reload_graph_flatten");
        log_init_test();
        // generate a random test
        let mut rng = rand::rng();
        let unif = Uniform::<f32>::new(0., 1.).unwrap();
        // 1000 vectors of size 10 f32
        let nbcolumn = 1000;
        let nbrow = 10;
        let mut xsi;
        let mut data = Vec::with_capacity(nbcolumn);
        for j in 0..nbcolumn {
            data.push(Vec::with_capacity(nbrow));
            for _ in 0..nbrow {
                xsi = unif.sample(&mut rng);
                data[j].push(xsi);
            }
        }
        // define hnsw
        let ef_construct = 25;
        let nb_connection = 10;
        let hnsw = Hnsw::<f32, DistL1>::new(nb_connection, nbcolumn, 16, ef_construct, DistL1 {});
        for (i, d) in data.iter().enumerate() {
            hnsw.insert((d, i));
        }
        // some loggin info
        hnsw.dump_layer_info();
        // get flat neighbours of point 2 before the dump, as reference
        let neighborhood_before_dump = FlatNeighborhood::from(&hnsw);
        let nbg_2_before = neighborhood_before_dump.get_neighbours(2).unwrap();
        println!("voisins du point 2 {:?}", nbg_2_before);
        // dump in a file. Must take care of name as tests runs in // !!!
        let fname = "dumpreloadtestflat";
        let directory = tempfile::tempdir().unwrap();
        let _res = hnsw.file_dump(directory.path(), fname);
        // This will dump in 2 files named dumpreloadtest.hnsw.graph and dumpreloadtest.hnsw.data
        //
        // reload
        debug!("HNSW reload");
        // we will need a procedural macro to get from distance name to its instantiation.
        // from now on we test with DistL1
        let mut reloader = HnswIo::new(directory.path(), fname);
        let hnsw_loaded: Hnsw<NoData, NoDist> = reloader.load_hnsw().unwrap();
        let neighborhood_after_dump = FlatNeighborhood::from(&hnsw_loaded);
        let nbg_2_after = neighborhood_after_dump.get_neighbours(2).unwrap();
        println!("Neighbors of point 2 {:?}", nbg_2_after);
        // test equality of neighborhood : same ids in same order, same distances
        assert_eq!(nbg_2_after.len(), nbg_2_before.len());
        for i in 0..nbg_2_before.len() {
            assert_eq!(nbg_2_before[i].p_id, nbg_2_after[i].p_id);
            assert_eq!(nbg_2_before[i].distance, nbg_2_after[i].distance);
        }
        check_graph_equality(&hnsw_loaded, &hnsw);
    } // end of test_dump_reload
} // end module test

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
#![cfg_attr(feature = "stdsimd", feature(portable_simd))]
//
// for logging (debug mostly, switched at compile time in cargo.toml)
use env_logger::Builder;
use lazy_static::lazy_static;
pub mod api;
pub mod datamap;
pub mod filter;
pub mod flatten;
pub mod hnsw;
pub mod hnswio;
pub mod libext;
pub mod prelude;
// we impose our version of anndists
pub use anndists;
lazy_static! {
    // evaluated on first access; forces logger initialization exactly once
    static ref LOG: u64 = init_log();
}
// install a logger facility
// Reads the RUST_LOG environment variable (env_logger convention);
// returns a dummy value so it can seed the lazy_static above.
#[allow(unused)]
fn init_log() -> u64 {
    Builder::from_default_env().init();
    println!("\n ************** initializing logger *****************\n");
    1
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
// gathers modules to include and re-exorts all of anndists!
pub use crate::api::*;
pub use crate::hnsw::*;
#[allow(unused)]
pub use crate::filter::*;
pub use crate::hnswio::*;
pub use anndists::dist::distances::*;

View File

@@ -0,0 +1,34 @@
use env_logger::Builder;
use anndists::dist::DistL1;
use hnsw_rs::hnsw::Hnsw;
// A test program to see if memory from insertions gets deallocated.
// This program sets up a process that iteratively builds a new model and lets it go out of scope.
// Since the models go out of scope, the desired behavior is that memory consumption is constant while this program is running.
// NOTE: runs forever by design; observe RSS externally and stop with Ctrl-C.
fn main() {
    //
    Builder::from_default_env().init();
    //
    let mut counter: usize = 0;
    // infinite loop : each iteration builds and drops a whole Hnsw
    loop {
        let hnsw: Hnsw<f32, DistL1> = Hnsw::new(15, 100_000, 20, 500_000, DistL1 {});
        let s1 = [1.0, 0.0, 0.0, 0.0];
        hnsw.insert_slice((&s1, 0));
        let s2 = [0.0, 1.0, 1.0];
        hnsw.insert_slice((&s2, 1));
        let s3 = [0.0, 0.0, 1.0];
        hnsw.insert_slice((&s3, 2));
        let s4 = [1.0, 0.0, 0.0, 1.0];
        hnsw.insert_slice((&s4, 3));
        let s5 = [1.0, 1.0, 1.0];
        hnsw.insert_slice((&s5, 4));
        let s6 = [1.0, -1.0, 1.0];
        hnsw.insert_slice((&s6, 5));
        // progress heartbeat every million iterations
        if counter % 1_000_000 == 0 {
            println!("counter : {}", counter)
        }
        counter += 1;
    }
}

View File

@@ -0,0 +1,266 @@
#![allow(clippy::needless_range_loop)]
#![allow(clippy::range_zip_with_len)]
use anndists::dist::*;
use hnsw_rs::prelude::*;
use rand::{Rng, distr::Uniform};
use std::iter;
// initialize env_logger for tests; try_init so repeated calls are harmless
#[allow(unused)]
fn log_init_test() {
    let _ = env_logger::builder().is_test(true).try_init();
}
// Shows two ways to do filtering, by a sorted vector or with a closure
// We define a hnsw-index with 500 entries
// Only ids within 300-400 should be in the result-set
// Used to create a random string
fn generate_random_string(len: usize) -> String {
const CHARSET: &[u8] = b"abcdefghij";
let mut rng = rand::rng();
let one_char = || CHARSET[rng.random_range(0..CHARSET.len())] as char;
iter::repeat_with(one_char).take(len).collect()
}
// this function wraps a sorted vector of allowed ids in a closure filter
fn search_closure_filter(
    word: &str,
    hns: &Hnsw<u16, DistLevenshtein>,
    words: &[String],
    filter_vector: &[usize],
) {
    // transform string to u16 values
    let vec: Vec<u16> = word.chars().map(|c| c as u16).collect();
    // now create a closure using this filter_vector
    // here we can of course implement more advanced filter logic
    let filter = |id: &usize| -> bool { filter_vector.binary_search(id).is_ok() };
    // Now let us do the search by using the defined closure, which in turn uses our vector.
    // ids not in the vector will not be included in the search results
    println!("========== Search with closure filter");
    let ef_search = 30;
    let res = hns.search_possible_filter(&vec, 10, ef_search, Some(&filter));
    for r in res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
}
// Compares a filtered search on a full index against an unfiltered search on
// a small index containing only the ids 300..400 allowed by the filter.
#[test]
fn filter_levenstein() {
    let nb_elem = 500000; // sizing hint for the index, not the number of inserted words
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let ef_c = 200;
    let hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    // 999 random 8-char words are actually inserted
    let mut words = vec![];
    for _n in 1..1000 {
        let tw = generate_random_string(8);
        words.push(tw);
    }
    for (i, w) in words.iter().enumerate() {
        let vec: Vec<u16> = w.chars().map(|c| c as u16).collect();
        hns.insert((&vec, i));
        if i % 1000 == 0 {
            println!("Inserting: {:?}", i);
        }
    }
    // Create a sorted vector of ids
    // the ids in the vector will be used as a filter;
    // filtered_hns is the reference index built from only those ids
    let filtered_hns = Hnsw::<u16, DistLevenshtein>::new(
        max_nb_connection,
        nb_elem,
        nb_layer,
        ef_c,
        DistLevenshtein {},
    );
    let mut filter_vector: Vec<usize> = Vec::new();
    for i in 300..400 {
        filter_vector.push(i);
        let v: Vec<u16> = words[i].chars().map(|c| c as u16).collect();
        filtered_hns.insert((&v, i));
    }
    //
    let ef_search = 30;
    let tosearch = "abcdefg";
    let knbn = 10;
    let vec_tosearch: Vec<u16> = tosearch.chars().map(|c| c as u16).collect();
    //
    println!("========== Search in full hns with filter");
    let vec_res = hns.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &vec_res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    //
    println!("========== Search in restricted_hns but without filter");
    //
    let vec: Vec<u16> = tosearch.chars().map(|c| c as u16).collect();
    let res: Vec<Neighbour> = filtered_hns.search(&vec, knbn, ef_search);
    for r in &res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    //
    // search with filter
    // first with closure
    println!("========== Search in full hns with closure filter");
    search_closure_filter(tosearch, &hns, &words, &filter_vector);
    //
    // now with vector filter and estimate recall
    //
    println!("========== Search in full hns with vector filter");
    let filter_vec_res = hns.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &filter_vec_res {
        println!(
            "Word: {:?} Id: {:?} Distance: {:?}",
            words[r.d_id], r.d_id, r.distance
        );
    }
    // how many neighbours in res are in filter_vec_res;
    // matching ids must come with identical distances
    let mut nb_found: usize = 0;
    for n in &res {
        let found = filter_vec_res.iter().find(|&&m| m.d_id == n.d_id);
        if found.is_some() {
            nb_found += 1;
            assert_eq!(n.distance, found.unwrap().distance);
        }
    }
    println!(" recall : {}", nb_found as f32 / res.len() as f32);
    println!(
        " last distances ratio : {} ",
        res.last().unwrap().distance / filter_vec_res.last().unwrap().distance
    );
}
// A test with random uniform data vectors and L2 distance
// We compare a search of a random vector in hnsw structure with a filter to a filtered_hnsw
// containing only the data fitting the filter
#[test]
fn filter_l2() {
    let nb_elem = 5000;
    let dim = 25;
    // generate nb_elem colmuns vectors of dimension dim
    let mut rng = rand::rng();
    let unif = Uniform::<f32>::new(0., 1.).unwrap();
    let mut data = Vec::with_capacity(nb_elem);
    for _ in 0..nb_elem {
        let column = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
        data.push(column);
    }
    // give an id to each data
    let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
    let ef_c = 200;
    let max_nb_connection = 15;
    let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
    let hnsw = Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    hnsw.parallel_insert(&data_with_id);
    //
    let ef_search = 30;
    let knbn = 10;
    let vec_tosearch = (0..dim).map(|_| rng.sample(unif)).collect::<Vec<f32>>();
    //
    // Create a sorted vector of ids
    // the ids in the vector will be used as a filter;
    // filtered_hns holds only those ids and serves as the reference
    let filtered_hns =
        Hnsw::<f32, DistL2>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL2 {});
    let mut filter_vector: Vec<usize> = Vec::new();
    for i in 300..400 {
        filter_vector.push(i);
        filtered_hns.insert((&data[i], i));
    }
    //
    println!("========== Search in full hnsw with filter");
    let filter_vec_res = hnsw.search_filter(&vec_tosearch, knbn, ef_search, Some(&filter_vector));
    for r in &filter_vec_res {
        println!("Id: {:?} Distance: {:?}", r.d_id, r.distance);
    }
    //
    println!("========== Search in restricted_hns but without filter");
    let res: Vec<Neighbour> = filtered_hns.search(&vec_tosearch, knbn, ef_search);
    for r in &res {
        println!("Id: {:?} Distance: {:?}", r.d_id, r.distance);
    }
    // how many neighbours in res are in filter_vec_res and what is the distance gap
    // (distances may differ by float rounding, hence the relative tolerance)
    let mut nb_found: usize = 0;
    for n in &res {
        let found = filter_vec_res.iter().find(|&&m| m.d_id == n.d_id);
        if found.is_some() {
            nb_found += 1;
            assert!((1. - n.distance / found.unwrap().distance).abs() < 1.0e-5);
        }
    }
    println!(" recall : {}", nb_found as f32 / res.len() as f32);
    println!(
        " last distances ratio : {} ",
        res.last().unwrap().distance / filter_vec_res.last().unwrap().distance
    );
} // end of filter_l2
//
use std::collections::HashMap;
// Regression test (reported by villsnow) : a filtered search on a dense 2-D
// grid must honor the filter even when very few (or zero) points pass it.
#[test]
fn filter_villsnow() {
    println!("\n\n in test villsnow");
    log_init_test();
    //
    let grid_size = 100;
    let mut hnsw = Hnsw::<f64, DistL2>::new(4, grid_size * grid_size, 16, 100, DistL2::default());
    let mut points = HashMap::new();
    {
        // fill a regular grid of cell centers in the unit square
        for (id, (i, j)) in itertools::iproduct!(0..grid_size, 0..grid_size,).enumerate() {
            let data = [
                (i as f64 + 0.5) / grid_size as f64,
                (j as f64 + 0.5) / grid_size as f64,
            ];
            hnsw.insert((&data, id));
            points.insert(id, data);
        }
        hnsw.set_searching_mode(true);
    }
    {
        println!("first case");
        // first case : filter keeps only points very close to (1,1)
        let filter = |id: &usize| DistL2::default().eval(&points[id], &[1.0, 1.0]) < 1e-2;
        dbg!(points.keys().filter(|x| filter(x)).count()); // -> 1
        let hit = hnsw.search_filter(&[0.0, 0.0], 10, 4, Some(&filter));
        if !hit.is_empty() {
            log::info!("got point : {:?}", points.get(&hit[0].d_id));
            log::info!("got {:?}, must be true", filter(&hit[0].d_id)); // -> sometimes false
        } else {
            log::info!("found no point");
        }
        assert!(hit.len() <= 1);
    }
    {
        println!("second case");
        // second case : filter rejects everything, result must be empty
        let filter = |_id: &usize| false;
        dbg!(points.keys().filter(|x| filter(x)).count()); // -> 0, obviously
        let hit = hnsw.search_filter(&[0.0, 0.0], 10, 64, Some(&filter));
        println!("villsnow , {:?}", hit.len());
        log::info!("got {:?}, must be 0", hit.len()); // -> 1
        assert_eq!(hit.len(), 0);
    }
}

View File

@@ -0,0 +1,328 @@
#![allow(clippy::range_zip_with_len)]
//! some testing utilities.
//! run with to get output statistics : cargo test --release -- --nocapture --test test_parallel.
//! serial test corresponds to random-10nn-euclidean(k=10)
//! parallel test corresponds to random data in 25 dimensions k = 10, dist Cosine
use rand::distr::Uniform;
use rand::prelude::*;
use skiplist::OrderedSkipList;
use anndists::dist;
use hnsw_rs::prelude::*;
use serde::{de::DeserializeOwned, Serialize};
/// returns a vector of `nbrow` f32 values drawn uniformly in [0, 1).
pub fn gen_random_vector_f32(nbrow: usize) -> Vec<f32> {
    let mut generator = rand::rng();
    let distribution = Uniform::<f32>::new(0., 1.).unwrap();
    let mut v = Vec::with_capacity(nbrow);
    for _ in 0..nbrow {
        v.push(generator.sample(distribution));
    }
    v
}
/// return nbcolumn vectors of dimension nbrow, entries uniform in [0, 1).
pub fn gen_random_matrix_f32(nbrow: usize, nbcolumn: usize) -> Vec<Vec<f32>> {
    let mut generator = rand::rng();
    let distribution = Uniform::<f32>::new(0., 1.).unwrap();
    (0..nbcolumn)
        .map(|_| {
            (0..nbrow)
                .map(|_| generator.sample(distribution))
                .collect::<Vec<f32>>()
        })
        .collect()
}
/// Exhaustive scan of all indexed points, keeping the `nb_neighbours`
/// closest to `data` (by `distance`) in an ordered skip list.
/// Used as the ground truth against which hnsw search recall is measured.
fn brute_force_neighbours<T: Serialize + DeserializeOwned + Copy + Send + Sync>(
    nb_neighbours: usize,
    refdata: &PointIndexation<T>,
    distance: PointDistance<T>,
    data: &[T],
) -> OrderedSkipList<PointIdWithOrder> {
    let mut neighbours = OrderedSkipList::<PointIdWithOrder>::with_capacity(refdata.get_nb_point());
    // idiomatic for-loop replaces the former `while more { if let Some(..) }` flag loop;
    // insert unconditionally, then drop the furthest entry once over capacity
    // (equivalent to the former insert/else-insert-pop branches).
    for point in refdata.into_iter() {
        let dist_p = distance.eval(data, point.get_v());
        neighbours.insert(PointIdWithOrder::new(point.get_point_id(), dist_p));
        if neighbours.len() > nb_neighbours {
            neighbours.pop_back();
        }
    }
    neighbours
} // end of brute_force_neighbours
//================================================================================================
mod tests {
    use cpu_time::ProcessTime;
    use std::time::Duration;

    use super::*;
    use dist::l2_normalize;

    // Serial/parallel insertion with DistL1, recall measured against brute force.
    #[test]
    fn test_serial() {
        //
        //
        let nb_elem = 1000;
        let dim = 10;
        let knbn = 10;
        let ef = 20;
        let parallel = true;
        //
        println!("\n\n test_serial nb_elem {:?}", nb_elem);
        //
        let data = gen_random_matrix_f32(dim, nb_elem);
        let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
        let ef_c = 400;
        let max_nb_connection = 32;
        let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
        let mut hns = Hnsw::<f32, dist::DistL1>::new(
            max_nb_connection,
            nb_elem,
            nb_layer,
            ef_c,
            dist::DistL1 {},
        );
        hns.set_extend_candidates(true);
        hns.set_keeping_pruned(true);
        let mut start = ProcessTime::now();
        if parallel {
            println!("parallel insertion");
            hns.parallel_insert(&data_with_id);
        } else {
            println!("serial insertion");
            for (i, d) in data.iter().enumerate() {
                hns.insert((d, i));
            }
        }
        let mut cpu_time: Duration = start.elapsed();
        println!(" hnsw serial data insertion {:?}", cpu_time);
        hns.dump_layer_info();
        println!(" hnsw data nb point inserted {:?}", hns.get_nb_point());
        //
        let nbtest = 300;
        let mut recalls = Vec::<usize>::with_capacity(nbtest);
        // BUGFIX: capacity was nb_elem; one entry is pushed per test, so nbtest
        let mut nb_returned = Vec::<usize>::with_capacity(nbtest);
        let mut search_times = Vec::<f32>::with_capacity(nbtest);
        for _itest in 0..nbtest {
            //
            let mut r_vec = Vec::<f32>::with_capacity(dim);
            let mut rng = rand::rng();
            let unif = Uniform::<f32>::new(0., 1.).unwrap();
            for _ in 0..dim {
                r_vec.push(rng.sample(unif));
            }
            start = ProcessTime::now();
            let brute_neighbours = brute_force_neighbours(
                knbn,
                hns.get_point_indexation(),
                Box::new(dist::DistL1 {}),
                &r_vec,
            );
            cpu_time = start.elapsed();
            if nbtest <= 100 {
                println!("\n\n **************** test {:?}", _itest);
                println!("\n brute force neighbours :");
                println!("======================");
                println!(" brute force computing {:?} \n ", cpu_time);
                for i in 0..brute_neighbours.len() {
                    let p = brute_neighbours[i].point_id;
                    println!(" {:?} {:?} ", p, brute_neighbours[i].dist_to_ref);
                }
            }
            //
            hns.set_searching_mode(true);
            start = ProcessTime::now();
            let knn_neighbours = hns.search(&r_vec, knbn, ef);
            cpu_time = start.elapsed();
            search_times.push(cpu_time.as_micros() as f32);
            if nbtest <= 100 {
                println!("\n\n hnsw searching {:?} \n", cpu_time);
                println!("\n knn neighbours");
                println!("======================");
                for n in &knn_neighbours {
                    println!(" {:?} {:?} {:?} ", n.d_id, n.p_id, n.distance);
                }
            }
            // compute recall : how many returned distances are within the
            // brute-force knbn-th distance ball
            let knn_neighbours_dist: Vec<f32> = knn_neighbours.iter().map(|p| p.distance).collect();
            let max_dist = brute_neighbours[knbn - 1].dist_to_ref;
            let recall = knn_neighbours_dist
                .iter()
                .filter(|d| *d <= &max_dist)
                .count();
            if nbtest <= 100 {
                println!("recall {:?}", (recall as f32) / (knbn as f32));
            }
            recalls.push(recall);
            nb_returned.push(knn_neighbours.len());
        } // end on nbtest
        //
        // compute recall
        //
        let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
        let mean_search_time = (search_times.iter().sum::<f32>()) / (search_times.len() as f32);
        println!(
            "\n mean fraction (of knbn) returned by search {:?} ",
            (nb_returned.iter().sum::<usize>() as f32) / ((nb_returned.len() * knbn) as f32)
        );
        println!(
            "\n nb element {:?} nb search : {:?} recall rate is {:?} search time inverse {:?} ",
            nb_elem,
            nbtest,
            mean_recall,
            1.0e+6_f32 / mean_search_time
        );
    } // end test1

    // Parallel insertion with DistDot on l2-normalized data;
    // recall measured both by distance ball and by point ids.
    #[test]
    fn test_parallel() {
        //
        let nb_elem = 1000;
        let dim = 25;
        let knbn = 10;
        let ef_c = 800;
        let max_nb_connection = 48;
        let ef = 20;
        //
        //
        let mut data = gen_random_matrix_f32(dim, nb_elem);
        for v in &mut data {
            l2_normalize(v);
        }
        let data_with_id = data.iter().zip(0..data.len()).collect::<Vec<_>>();
        let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
        let mut hns = Hnsw::<f32, dist::DistDot>::new(
            max_nb_connection,
            nb_elem,
            nb_layer,
            ef_c,
            dist::DistDot {},
        );
        // !
        // hns.set_extend_candidates(true);
        let mut start = ProcessTime::now();
        let now = std::time::SystemTime::now();
        // parallel insertion
        hns.parallel_insert(&data_with_id);
        let mut cpu_time: Duration = start.elapsed();
        println!(
            "\n hnsw data parallel insertion cpu time {:?} , system time {:?}",
            cpu_time,
            now.elapsed()
        );
        // one serial more to check
        // NOTE(review): this uses id nb_point + 1, leaving id nb_point unused — confirm intended
        let mut v = gen_random_vector_f32(dim);
        l2_normalize(&mut v);
        hns.insert((&v, hns.get_nb_point() + 1));
        //
        hns.dump_layer_info();
        println!(" hnsw data nb point inserted {:?}", hns.get_nb_point());
        //
        println!("\n hnsw testing requests ...");
        let nbtest = 100;
        let mut recalls = Vec::<usize>::with_capacity(nbtest);
        let mut recalls_id = Vec::<usize>::with_capacity(nbtest);
        let mut search_times = Vec::<f32>::with_capacity(nbtest);
        for _itest in 0..nbtest {
            let mut r_vec = Vec::<f32>::with_capacity(dim);
            let mut rng = rand::rng();
            let unif = Uniform::<f32>::new(0., 1.).unwrap();
            for _ in 0..dim {
                r_vec.push(rng.sample(unif));
            }
            l2_normalize(&mut r_vec);
            start = ProcessTime::now();
            let brute_neighbours = brute_force_neighbours(
                knbn,
                hns.get_point_indexation(),
                Box::new(dist::DistDot),
                &r_vec,
            );
            cpu_time = start.elapsed();
            if nbtest <= 100 {
                println!("\n\n test_par nb_elem {:?}", nb_elem);
                println!("\n brute force neighbours :");
                println!("======================");
                println!(" brute force computing {:?} \n", cpu_time);
                for i in 0..brute_neighbours.len() {
                    println!(
                        " {:?} {:?} ",
                        brute_neighbours[i].point_id, brute_neighbours[i].dist_to_ref
                    );
                }
            }
            //
            let knbn = 10;
            hns.set_searching_mode(true);
            start = ProcessTime::now();
            let knn_neighbours = hns.search(&r_vec, knbn, ef);
            cpu_time = start.elapsed();
            search_times.push(cpu_time.as_micros() as f32);
            if nbtest <= 100 {
                println!("\n knn neighbours");
                println!("======================");
                println!(" hnsw searching {:?} \n", cpu_time);
                for n in &knn_neighbours {
                    println!(" {:?} \t {:?} \t {:?}", n.d_id, n.p_id, n.distance);
                }
            }
            // compute recall with balls
            let knn_neighbours_dist: Vec<f32> = knn_neighbours.iter().map(|p| p.distance).collect();
            let max_dist = brute_neighbours[knbn - 1].dist_to_ref;
            let recall = knn_neighbours_dist
                .iter()
                .filter(|d| *d <= &max_dist)
                .count();
            if nbtest <= 100 {
                println!("recall {:?}", (recall as f32) / (knbn as f32));
            }
            recalls.push(recall);
            // compute recall with id
            let mut recall_id = 0;
            let mut knn_neighbours_id: Vec<PointId> =
                knn_neighbours.iter().map(|p| p.p_id).collect();
            knn_neighbours_id.sort_unstable();
            let snbn = knbn.min(brute_neighbours.len());
            for j in 0..snbn {
                let to_search = brute_neighbours[j].point_id;
                if knn_neighbours_id.binary_search(&to_search).is_ok() {
                    recall_id += 1;
                }
            }
            recalls_id.push(recall_id);
        } // end on nbtest
        //
        // compute recall
        //
        let mean_recall = (recalls.iter().sum::<usize>() as f32) / ((knbn * recalls.len()) as f32);
        let mean_search_time = (search_times.iter().sum::<f32>()) / (search_times.len() as f32);
        println!(
            "\n nb search {:?} recall rate is {:?} search time inverse {:?} ",
            nbtest,
            mean_recall,
            1.0e+6_f32 / mean_search_time
        );
        // BUGFIX: the id-based recall must be computed from recalls_id;
        // the previous code summed `recalls` again, so the printed
        // "recall rate with point ids" duplicated the distance-ball recall.
        let mean_recall_id =
            (recalls_id.iter().sum::<usize>() as f32) / ((knbn * recalls_id.len()) as f32);
        println!("mean recall rate with point ids {:?}", mean_recall_id);
        //
        // assert!(1==0);
    } // end test_par
}

56
vendor/ruvector/scripts/publish-rvf.sh vendored Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Publish all RVF crates to crates.io in dependency order.
# Usage: ./scripts/publish-rvf.sh [--dry-run]
#
# Publishing order (each crate depends on those before it):
#   1. rvf-types    (no internal deps)
#   2. rvf-wire     (depends on rvf-types)
#   3. rvf-manifest (depends on rvf-types)
#   4. rvf-index    (no internal deps currently)
#   5. rvf-quant    (depends on rvf-types)
#   6. rvf-crypto   (depends on rvf-types)
#   7. rvf-runtime  (depends on rvf-types)
#   8. rvf-wasm     (depends on rvf-types)
#   9. rvf-node     (depends on rvf-runtime, rvf-types)
#  10. rvf-server   (depends on rvf-runtime, rvf-types)
set -euo pipefail

# Optional cargo flags are kept in an array so that when no flag is given the
# expansion yields zero arguments instead of an empty-string argument
# (the previous unquoted ${DRY_RUN} relied on word-splitting; ShellCheck SC2086).
dry_run=false
dry_run_args=()
if [[ "${1:-}" == "--dry-run" ]]; then
  dry_run=true
  dry_run_args+=("--dry-run")
  echo "=== DRY RUN MODE ==="
fi

CRATES_DIR="$(cd "$(dirname "$0")/../crates/rvf" && pwd)"
readonly CRATES_DIR
readonly DELAY_SECONDS=30

readonly CRATES=(
  rvf-types
  rvf-wire
  rvf-manifest
  rvf-index
  rvf-quant
  rvf-crypto
  rvf-runtime
  rvf-wasm
  rvf-node
  rvf-server
)

for crate in "${CRATES[@]}"; do
  echo ""
  echo "=== Publishing ${crate} ==="
  # ${arr[@]+...} guards the empty-array case under `set -u` on bash < 4.4
  cargo publish \
    --manifest-path "${CRATES_DIR}/${crate}/Cargo.toml" \
    --allow-dirty \
    ${dry_run_args[@]+"${dry_run_args[@]}"}
  if [[ "${dry_run}" == false ]]; then
    echo "Waiting ${DELAY_SECONDS}s for crates.io index to update..."
    sleep "${DELAY_SECONDS}"
  fi
done

echo ""
echo "=== All RVF crates published successfully ==="

View File

@@ -0,0 +1,32 @@
#!/bin/bash
# Quick check and publish script for router-wasm
# Run this manually when router-core v0.1.1 is confirmed published
set -e

# Repo-local env file holding the crates.io token.
ENV_FILE="/workspaces/ruvector/.env"

echo "Checking router-core v0.1.1 availability..."
# Anchor the pattern to the crate's own result line (`router-core = "0.1.1"`):
# the previous pattern `router-core.*0\.1\.1` also matched any other search
# hit whose name or description merely mentioned the crate and version.
if cargo search router-core 2>&1 | grep -q '^router-core = "0\.1\.1"'; then
    echo "✓ router-core v0.1.1 is available!"
    echo ""
    echo "Proceeding with router-wasm publication..."
    echo ""
    # Load API key without word-splitting the value: the old
    # `export $(grep ... | xargs)` form breaks on tokens containing '=' or
    # spaces. cut -f2- keeps everything after the first '='.
    CRATES_API_KEY="$(grep "^CRATES_API_KEY=" "$ENV_FILE" | head -1 | cut -d= -f2-)"
    export CRATES_API_KEY
    # Login
    cargo login "$CRATES_API_KEY"
    # Publish
    cd /workspaces/ruvector/crates/router-wasm
    cargo publish --allow-dirty
    echo ""
    echo "✓ router-wasm v0.1.1 published successfully!"
else
    echo "✗ router-core v0.1.1 not yet available on crates.io"
    echo "  Current version: $(cargo search router-core 2>&1 | grep 'router-core =' | head -1)"
    echo ""
    echo "Please wait for router-core v0.1.1 to be published first."
    exit 1
fi

View File

@@ -0,0 +1,68 @@
#!/bin/bash
# RuVector - Publish All Packages Script
# Triggers GitHub Actions workflow to build and publish for all platforms
#
# Usage: ./publish-all.sh [VERSION] [DRY_RUN]
#   VERSION  version to publish (default: 0.1.31)
#   DRY_RUN  "true" to exercise the workflow without publishing (default: false)
#
# -u catches typo'd variables, pipefail stops a failed pipeline stage from
# being masked; both are safe additions to the original `set -e`.
set -euo pipefail

VERSION="${1:-0.1.31}"
DRY_RUN="${2:-false}"

echo "🚀 RuVector Publish All Packages"
echo "================================"
echo "Version: $VERSION"
echo "Dry Run: $DRY_RUN"
echo ""

# Check if gh CLI is available
if ! command -v gh &> /dev/null; then
    echo "❌ GitHub CLI (gh) is required. Install with: brew install gh"
    exit 1
fi

# Check if logged in
if ! gh auth status &> /dev/null; then
    echo "❌ Not logged into GitHub. Run: gh auth login"
    exit 1
fi

echo "📦 Packages to publish:"
echo " crates.io:"
echo " - ruvector-math v$VERSION"
echo " - ruvector-attention v$VERSION"
echo " - ruvector-math-wasm v$VERSION"
echo " - ruvector-attention-wasm v$VERSION"
echo ""
echo " npm:"
echo " - ruvector-math-wasm v$VERSION"
echo " - @ruvector/attention v$VERSION"
echo " - @ruvector/attention-wasm v$VERSION"
echo " - @ruvector/attention-linux-x64-gnu v$VERSION"
echo " - @ruvector/attention-linux-arm64-gnu v$VERSION"
echo " - @ruvector/attention-darwin-x64 v$VERSION"
echo " - @ruvector/attention-darwin-arm64 v$VERSION"
echo " - @ruvector/attention-win32-x64-msvc v$VERSION"
echo ""

# Interactive confirmation before touching CI; reads a single keystroke.
read -p "Continue? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Aborted."
    exit 0
fi

echo ""
echo "🔄 Triggering GitHub Actions workflow..."
gh workflow run publish-all.yml \
  --field version="$VERSION" \
  --field publish_crates=true \
  --field publish_npm=true \
  --field dry_run="$DRY_RUN"

echo ""
echo "✅ Workflow triggered!"
echo ""
echo "📊 Monitor progress at:"
echo " https://github.com/ruvnet/ruvector/actions/workflows/publish-all.yml"
echo ""
echo "Or run: gh run list --workflow=publish-all.yml"

View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Publish ruvector-cli to crates.io.
# Strict mode for consistency with the sibling publish scripts: fail fast
# on errors or unset variables instead of continuing silently.
set -euo pipefail

cargo publish -p ruvector-cli --allow-dirty

View File

@@ -0,0 +1,165 @@
#!/bin/bash
set -e

# Ruvector Crates Publishing Script
# This script publishes all Ruvector crates to crates.io in the correct dependency order
#
# Prerequisites:
# - Rust and Cargo installed
# - CRATES_API_KEY set in .env file
# - All crates build successfully
# - All tests pass
#
# NOTE: run from the repository root -- .env, Cargo.toml and the crates/
# paths below are all resolved relative to the current directory.

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Load environment variables from .env.
# Extract the value after the first '=' instead of word-splitting the whole
# line through xargs, which breaks on tokens containing '=' or spaces.
if [ -f .env ]; then
    CRATES_API_KEY="$(grep -v '^#' .env | grep '^CRATES_API_KEY=' | head -1 | cut -d= -f2-)"
    export CRATES_API_KEY
else
    echo -e "${RED}Error: .env file not found${NC}"
    exit 1
fi

# Check if CRATES_API_KEY is set
if [ -z "$CRATES_API_KEY" ]; then
    echo -e "${RED}Error: CRATES_API_KEY not found in .env${NC}"
    exit 1
fi

echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE} Ruvector Crates Publishing Script${NC}"
echo -e "${BLUE}========================================${NC}"
echo ""

# Configure cargo authentication
echo -e "${YELLOW}Configuring cargo authentication...${NC}"
cargo login "$CRATES_API_KEY"
echo -e "${GREEN}✓ Authentication configured${NC}"
echo ""

# Publish one crate: verify it packages, publish it, then wait for the
# crates.io index so dependent crates can resolve it.
# Arguments: $1 - path to the crate directory
# Returns:   0 on success, 1 on packaging/publish failure
publish_crate() {
    local crate_path=$1
    local crate_name=$(basename "$crate_path")
    echo -e "${BLUE}========================================${NC}"
    echo -e "${BLUE}Publishing: ${crate_name}${NC}"
    echo -e "${BLUE}========================================${NC}"
    cd "$crate_path"
    # Verify the package
    echo -e "${YELLOW}Verifying package...${NC}"
    if cargo package --allow-dirty; then
        echo -e "${GREEN}✓ Package verification successful${NC}"
    else
        echo -e "${RED}✗ Package verification failed${NC}"
        cd - > /dev/null
        return 1
    fi
    # Publish the package
    echo -e "${YELLOW}Publishing to crates.io...${NC}"
    if cargo publish --allow-dirty; then
        echo -e "${GREEN}${crate_name} published successfully${NC}"
    else
        echo -e "${RED}✗ Failed to publish ${crate_name}${NC}"
        cd - > /dev/null
        return 1
    fi
    cd - > /dev/null
    # Wait a bit for crates.io to index the crate
    echo -e "${YELLOW}Waiting 30 seconds for crates.io to index...${NC}"
    sleep 30
    echo ""
}

# Check whether a crate version is already on crates.io.
# Arguments: $1 - crate name, $2 - version
# Returns:   0 if already published, 1 otherwise
check_published() {
    local crate_name=$1
    local version=$2
    if cargo search "$crate_name" --limit 1 | grep -q "^$crate_name = \"$version\""; then
        return 0  # Already published
    else
        return 1  # Not published
    fi
}

# Get version from workspace
VERSION=$(grep '^version = ' Cargo.toml | head -1 | sed 's/version = "\(.*\)"/\1/')
echo -e "${BLUE}Publishing version: ${VERSION}${NC}"
echo ""

# Publishing order (dependencies first)
CRATES=(
    # Base dependencies (no internal dependencies)
    "crates/ruvector-core"
    "crates/router-core"
    # Depends on ruvector-core
    "crates/ruvector-node"
    "crates/ruvector-wasm"
    "crates/ruvector-cli"
    "crates/ruvector-bench"
    # Depends on router-core
    "crates/router-cli"
    "crates/router-ffi"
    "crates/router-wasm"
)

# Track success/failure
SUCCESS_COUNT=0
FAILED_CRATES=()

# Publish each crate
for crate in "${CRATES[@]}"; do
    if [ ! -d "$crate" ]; then
        echo -e "${YELLOW}Warning: $crate directory not found, skipping${NC}"
        continue
    fi
    crate_name=$(basename "$crate")
    # Check if already published
    if check_published "$crate_name" "$VERSION"; then
        echo -e "${YELLOW}$crate_name v$VERSION already published, skipping${NC}"
        # NOTE: not ((SUCCESS_COUNT++)) -- under `set -e` the post-increment
        # returns status 1 when the counter is 0, which aborted the script
        # right after the first success.
        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
        echo ""
        continue
    fi
    if publish_crate "$crate"; then
        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    else
        FAILED_CRATES+=("$crate_name")
    fi
done

# Summary
echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE} Publishing Summary${NC}"
echo -e "${BLUE}========================================${NC}"
echo -e "${GREEN}Successfully published: ${SUCCESS_COUNT}/${#CRATES[@]}${NC}"
if [ ${#FAILED_CRATES[@]} -gt 0 ]; then
    echo -e "${RED}Failed to publish:${NC}"
    for crate in "${FAILED_CRATES[@]}"; do
        echo -e "${RED} - $crate${NC}"
    done
    exit 1
else
    echo -e "${GREEN}All crates published successfully! 🎉${NC}"
fi
echo ""
echo -e "${BLUE}View your crates at: https://crates.io/users/ruvector${NC}"

View File

@@ -0,0 +1,80 @@
#!/bin/bash
# Script to publish router-wasm v0.1.1 to crates.io
# This script waits for router-core v0.1.1 to be available
set -e

ENV_FILE="/workspaces/ruvector/.env"

echo "=========================================="
echo "router-wasm v0.1.1 Publication Script"
echo "=========================================="
echo ""

# Load environment variables
if [ -f "$ENV_FILE" ]; then
    echo "✓ Loading CRATES_API_KEY from .env..."
    # Take everything after the first '=' so tokens containing '=' or
    # spaces survive (the old `export $(grep ... | xargs)` split them).
    CRATES_API_KEY="$(grep "^CRATES_API_KEY=" "$ENV_FILE" | head -1 | cut -d= -f2-)"
    export CRATES_API_KEY
else
    echo "✗ Error: .env file not found"
    exit 1
fi
if [ -z "$CRATES_API_KEY" ]; then
    echo "✗ Error: CRATES_API_KEY not found in .env"
    exit 1
fi
echo "✓ CRATES_API_KEY loaded"
echo ""

# Step 1: Poll crates.io until router-core v0.1.1 appears (or give up).
echo "Step 1: Checking for router-core v0.1.1..."
MAX_ATTEMPTS=30
ATTEMPT=0
while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
    ATTEMPT=$((ATTEMPT + 1))
    echo "  Check $ATTEMPT/$MAX_ATTEMPTS ($(date +%H:%M:%S))"
    # Anchored to the crate's own result line; the old unanchored pattern
    # `router-core.*0\.1\.1` matched unrelated search hits too.
    if cargo search router-core 2>&1 | grep -q '^router-core = "0\.1\.1"'; then
        echo "✓ router-core v0.1.1 found on crates.io!"
        break
    fi
    if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then
        echo "✗ Timeout: router-core v0.1.1 not found after $MAX_ATTEMPTS attempts"
        echo "  Current version: $(cargo search router-core 2>&1 | grep "router-core =" | head -1)"
        exit 1
    fi
    sleep 10
done
echo ""

# Step 2: Login to crates.io
echo "Step 2: Logging in to crates.io..."
cargo login "$CRATES_API_KEY"
echo "✓ Successfully logged in"
echo ""

# Step 3: Navigate to router-wasm directory
echo "Step 3: Navigating to router-wasm directory..."
cd /workspaces/ruvector/crates/router-wasm
echo "✓ Current directory: $(pwd)"
echo ""

# Step 4: Verify package (show the first few files that would be shipped).
echo "Step 4: Verifying package..."
cargo package --list --allow-dirty | head -20
echo "..."
echo ""

# Step 5: Publish
echo "Step 5: Publishing router-wasm v0.1.1..."
echo ""
cargo publish --allow-dirty
echo ""
echo "=========================================="
echo "✓ SUCCESS: router-wasm v0.1.1 published!"
echo "=========================================="

176
vendor/ruvector/scripts/run_mincut_bench.sh vendored Executable file
View File

@@ -0,0 +1,176 @@
#!/usr/bin/env bash
# run_mincut_bench.sh -- 1k-sample grid runner for min-cut gating vs softmax
#
# Usage:
#   ./scripts/run_mincut_bench.sh [--samples N] [--output-dir DIR]
#                                 [--lambda "L1 L2 ..."] [--tau "T1 T2 ..."]
#                                 [--seed N]
#
# Options:
#   --samples N      total sample count, split 50/50 short/long (default 1000)
#   --output-dir DIR results directory (default results/mincut-bench)
#   --lambda LIST    space-separated lambda grid (default "0.3 0.5 0.7")
#   --tau LIST       space-separated tau grid (default "0 2")
#   --seed N         RNG seed for reproducible runs (default 42)
#
# Runs a grid search over lambda and tau parameters, collecting:
#   - Coherence delta metrics
#   - Memory pressure profiles
#   - Power/energy measurements
#   - Latency distributions (p50/p95/p99)
#   - Witness chain (JSONL + RVF bundle)
#
# NOTE(review): the actual runner/aggregator/packer invocations are still
# placeholders (see the commented `cargo run` blocks below); this script
# currently only sets up directories and CSV headers.
set -euo pipefail
# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
SAMPLES=1000
# SHORT_SAMPLES/LONG_SAMPLES are recomputed after argument parsing below.
SHORT_SAMPLES=500
LONG_SAMPLES=500
SHORT_MAX_LEN=128
LONG_MIN_LEN=256
LONG_MAX_LEN=1024
OUTPUT_DIR="results/mincut-bench"
LAMBDA_GRID="0.3 0.5 0.7"
TAU_GRID="0 2"
# Epsilon forwarded to the bench runner (see placeholder --eps flag below).
EPS=0.01
SEED=42
# ---------------------------------------------------------------------------
# Parse arguments
# ---------------------------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case $1 in
    --samples) SAMPLES="$2"; shift 2 ;;
    --output-dir) OUTPUT_DIR="$2"; shift 2 ;;
    --lambda) LAMBDA_GRID="$2"; shift 2 ;;
    --tau) TAU_GRID="$2"; shift 2 ;;
    --seed) SEED="$2"; shift 2 ;;
    *) echo "Unknown option: $1"; exit 1 ;;
  esac
done
# Recompute the 50/50 short/long split so a user-supplied --samples applies.
SHORT_SAMPLES=$((SAMPLES / 2))
LONG_SAMPLES=$((SAMPLES - SHORT_SAMPLES))
# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# NOTE(review): PROJECT_ROOT is not referenced below -- presumably kept for
# the future bench-runner invocations; confirm before removing.
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
mkdir -p "$OUTPUT_DIR"/{csv,witness,figs}
echo "============================================="
echo "Min-Cut Gating Benchmark"
echo "============================================="
echo "Samples: $SAMPLES ($SHORT_SAMPLES short + $LONG_SAMPLES long)"
echo "Lambda grid: $LAMBDA_GRID"
echo "Tau grid: $TAU_GRID"
echo "Epsilon: $EPS"
echo "Seed: $SEED"
echo "Output: $OUTPUT_DIR"
echo "============================================="
# ---------------------------------------------------------------------------
# Build (release mode for accurate benchmarks)
# ---------------------------------------------------------------------------
echo "[1/5] Building in release mode..."
cargo build --release \
  -p ruvector-attn-mincut \
  -p ruvector-coherence \
  -p ruvector-profiler \
  2>&1 | tail -5
# ---------------------------------------------------------------------------
# Run baseline (softmax)
# ---------------------------------------------------------------------------
echo "[2/5] Running baseline (softmax) on $SAMPLES samples..."
BASELINE_CSV="$OUTPUT_DIR/csv/baseline.csv"
echo "sample_id,seq_len,wall_time_us,peak_mem_bytes,energy_j" > "$BASELINE_CSV"
# Placeholder: in a real run, this would invoke the benchmark binary
# cargo run --release -p ruvector-bench-runner -- \
#   --mode softmax \
#   --short-samples $SHORT_SAMPLES --short-max-len $SHORT_MAX_LEN \
#   --long-samples $LONG_SAMPLES --long-min-len $LONG_MIN_LEN --long-max-len $LONG_MAX_LEN \
#   --seed $SEED \
#   --output "$BASELINE_CSV"
echo "  (baseline runner placeholder -- implement with bench binary)"
# ---------------------------------------------------------------------------
# Run grid search (min-cut gating)
# ---------------------------------------------------------------------------
echo "[3/5] Running min-cut gating grid search..."
RESULTS_CSV="$OUTPUT_DIR/csv/results.csv"
echo "setting,lambda,tau,coherence_delta,kv_cache_reduction,peak_mem_reduction,energy_reduction,p95_latency_us,accuracy" > "$RESULTS_CSV"
# Intentional unquoted expansion: the grids are space-separated word lists.
for lambda in $LAMBDA_GRID; do
  for tau in $TAU_GRID; do
    SETTING="mincut_l${lambda}_t${tau}"
    echo "  Running $SETTING..."
    RUN_CSV="$OUTPUT_DIR/csv/${SETTING}.csv"
    WITNESS_FILE="$OUTPUT_DIR/witness/${SETTING}.jsonl"
    # Placeholder: invoke bench binary with min-cut params
    # cargo run --release -p ruvector-bench-runner -- \
    #   --mode mincut \
    #   --lambda $lambda --tau $tau --eps $EPS \
    #   --short-samples $SHORT_SAMPLES --short-max-len $SHORT_MAX_LEN \
    #   --long-samples $LONG_SAMPLES --long-min-len $LONG_MIN_LEN --long-max-len $LONG_MAX_LEN \
    #   --seed $SEED \
    #   --output "$RUN_CSV" \
    #   --witness "$WITNESS_FILE"
    echo "  (grid runner placeholder -- implement with bench binary)"
  done
done
# ---------------------------------------------------------------------------
# Compute aggregate metrics
# ---------------------------------------------------------------------------
echo "[4/5] Computing aggregate metrics..."
# Placeholder: post-processing script would:
#   1. Read all CSV files
#   2. Compute mean +/- 95% CI for coherence delta
#   3. Compare memory, energy, latency vs baseline
#   4. Write summary to results.csv
echo "  (aggregation placeholder -- implement with post-processor)"
# ---------------------------------------------------------------------------
# Pack witness bundle (RVF)
# ---------------------------------------------------------------------------
echo "[5/5] Packing witness bundle..."
WITNESS_BUNDLE="$OUTPUT_DIR/witness/witness.rvf"
# Placeholder: concatenate witness JSONL files into RVF bundle
# The RVF format includes:
#   - Header: config hash, model commit, weights hash
#   - Body: per-sample witness entries with hash chain
#   - Footer: aggregate stats, signature
echo "  (RVF packer placeholder -- implement with witness tool)"
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
echo "============================================="
echo "Benchmark complete"
echo "============================================="
echo "Results: $OUTPUT_DIR/csv/results.csv"
echo "Witness: $OUTPUT_DIR/witness/"
echo "Figures: $OUTPUT_DIR/figs/ (generate with plot script)"
echo ""
echo "Expected results table:"
echo ""
echo "Setting | dCoherence | KV-Cache | Peak Mem | Energy/sample | p95 Latency"
echo "---------------------|------------|----------|----------|---------------|------------"
echo "Softmax (baseline) | -- | -- | -- | -- | --"
echo "Min-cut l=0.3, t=0 | +??% | -??% | -??% | -??% | ??us"
echo "Min-cut l=0.3, t=2 | +??% | -??% | -??% | -??% | ??us"
echo "Min-cut l=0.5, t=0 | +??% | -??% | -??% | -??% | ??us"
echo "Min-cut l=0.5, t=2 | +??% | -??% | -??% | -??% | ??us"
echo "Min-cut l=0.7, t=0 | +??% | -??% | -??% | -??% | ??us"
echo "Min-cut l=0.7, t=2 | +??% | -??% | -??% | -??% | ??us"
echo ""
echo "Success criteria:"
echo " >= 5% coherence delta with <= 1% accuracy loss"
echo " >= 15% KV-cache reduction"
echo " >= 10% energy/sample drop"
echo " p95 latency within +/-10% of baseline"
echo " Deterministic witness reproducible on second machine"

1
vendor/ruvector/scripts/sync-lockfile.sh vendored Symbolic link
View File

@@ -0,0 +1 @@
ci/sync-lockfile.sh

View File

@@ -0,0 +1,59 @@
#!/bin/bash
# Comprehensive test of all RuVector graph CLI commands
set -e

CLI="./target/debug/ruvector"
TEST_DB="/tmp/ruvector-graph-test.db"

# Start from a clean slate so `graph create` cannot trip over a stale
# database left behind by a previous run (makes the script idempotent).
rm -f -- "$TEST_DB"

echo "=========================================="
echo "RuVector Graph CLI - Full Command Test"
echo "=========================================="
echo ""

# Test 1: Create
echo "1. Testing: graph create"
"$CLI" graph create --path "$TEST_DB" --name test-graph --indexed
echo ""

# Test 2: Info
echo "2. Testing: graph info"
"$CLI" graph info --db "$TEST_DB" --detailed
echo ""

# Test 3: Query
echo "3. Testing: graph query"
"$CLI" graph query --db "$TEST_DB" --cypher "MATCH (n) RETURN n" --format table
echo ""

# Test 4: Query with explain
echo "4. Testing: graph query --explain"
"$CLI" graph query --db "$TEST_DB" --cypher "MATCH (n:Person) WHERE n.age > 25 RETURN n" --explain
echo ""

# Test 5: Benchmark
echo "5. Testing: graph benchmark"
"$CLI" graph benchmark --db "$TEST_DB" --queries 100 --bench-type traverse
echo ""

# Test 6: Serve (won't actually start, just test args); `|| true` is
# deliberate -- timeout kills the server, which exits non-zero.
echo "6. Testing: graph serve (dry run)"
timeout 2 "$CLI" graph serve --db "$TEST_DB" --host 127.0.0.1 --http-port 8080 --grpc-port 50051 --graphql 2>&1 || true
echo ""

echo "=========================================="
echo "All Tests Completed Successfully!"
echo "=========================================="
echo ""
echo "Summary of implemented commands:"
echo " ✓ graph create - Create new graph database"
echo " ✓ graph query - Execute Cypher queries (-q flag)"
echo " ✓ graph shell - Interactive REPL (use Ctrl+C to exit)"
echo " ✓ graph import - Import from files (-i flag)"
echo " ✓ graph export - Export to files (-o flag)"
echo " ✓ graph info - Show statistics (--detailed flag)"
echo " ✓ graph benchmark - Performance tests (-n, -t flags)"
echo " ✓ graph serve - HTTP/gRPC server (--graphql flag)"
echo ""
echo "All commands use -b for --db (not -d, which is for --debug)"
echo "Query uses -q for --cypher (not -c, which is for --config)"

View File

@@ -0,0 +1,108 @@
#!/bin/bash
# Test ruvector npm package in Docker container
set -e

# Repository root; override with RUVECTOR_ROOT when not in the devcontainer.
RUVECTOR_ROOT="${RUVECTOR_ROOT:-/workspaces/ruvector}"

echo "=== Creating test package ==="

# Create temporary test directory and guarantee cleanup on ANY exit
# (success, `set -e` failure, or interrupt) -- previously a failed test
# leaked the temp dir because cleanup only ran at the end of the script.
TEST_DIR=$(mktemp -d)
cleanup() { cd /; rm -rf -- "$TEST_DIR"; }
trap cleanup EXIT
cd "$TEST_DIR"

# Create package.json
cat > package.json << 'EOF'
{
  "name": "ruvector-test",
  "version": "1.0.0",
  "type": "module",
  "main": "test.mjs"
}
EOF

# Create test script (quoted heredoc: no shell expansion inside)
cat > test.mjs << 'EOF'
import ruvector from '@ruvector/core';
const { VectorDB, CollectionManager, version, hello, getHealth, getMetrics } = ruvector;
console.log('=== Ruvector Package Test ===\n');
// Test version and hello
console.log('Version:', version());
console.log('Hello:', hello());
// Test health
console.log('\n--- Health Check ---');
const health = getHealth();
console.log('Status:', health.status);
console.log('Version:', health.version);
// Test metrics
console.log('\n--- Metrics ---');
const metrics = getMetrics();
console.log('Metrics available:', metrics.length > 0 ? 'Yes' : 'No');
// Test VectorDB
console.log('\n--- VectorDB Test ---');
const db = VectorDB.withDimensions(4);
console.log('Created VectorDB with 4 dimensions');
// Insert vectors
const id1 = await db.insert({ vector: new Float32Array([1.0, 0.0, 0.0, 0.0]) });
const id2 = await db.insert({ vector: new Float32Array([0.0, 1.0, 0.0, 0.0]) });
const id3 = await db.insert({ vector: new Float32Array([0.9, 0.1, 0.0, 0.0]) });
console.log('Inserted 3 vectors:', id1, id2, id3);
// Search
const results = await db.search({ vector: new Float32Array([1.0, 0.0, 0.0, 0.0]), k: 2 });
console.log('Search results:', results);
// Verify correct order
if (results[0].id === id1 && results[1].id === id3) {
  console.log('✓ Search results correct!');
} else {
  console.log('✗ Search results incorrect');
  process.exit(1);
}
// Test CollectionManager
console.log('\n--- CollectionManager Test ---');
try {
  const manager = new CollectionManager('./test-collections');
  console.log('Created CollectionManager');
  await manager.createCollection('test_vectors', { dimensions: 128 });
  console.log('Created collection: test_vectors');
  const collections = await manager.listCollections();
  console.log('Collections:', collections);
  const stats = await manager.getStats('test_vectors');
  console.log('Stats:', stats);
  await manager.deleteCollection('test_vectors');
  console.log('Deleted collection: test_vectors');
  console.log('✓ CollectionManager works!');
} catch (err) {
  console.log('CollectionManager error:', err.message);
}
console.log('\n=== All Tests Passed! ===');
EOF

echo "=== Test files created in $TEST_DIR ==="

# Copy local package
echo "=== Copying local package ==="
mkdir -p node_modules/@ruvector
cp -r "$RUVECTOR_ROOT/npm/core" node_modules/@ruvector/

# Run test
echo ""
echo "=== Running test ==="
node test.mjs

# Cleanup happens via the EXIT trap above.
echo ""
echo "=== Test completed successfully ==="

View File

@@ -0,0 +1,47 @@
#!/bin/bash
# Test script for RuVector Graph CLI commands
set -e

echo "============================================"
echo "RuVector Graph CLI - Command Tests"
echo "============================================"
echo ""

# Build the CLI.
# Capture the build output and check cargo's OWN exit status: in the old
# `cargo build ... | grep -v warning | head -5` pipeline the status came
# from `head`, so a failed build was silently ignored despite `set -e`.
echo "Building CLI..."
if ! BUILD_OUT=$(cargo build --package ruvector-cli --bin ruvector --quiet 2>&1); then
    printf '%s\n' "$BUILD_OUT" >&2
    echo "Build failed" >&2
    exit 1
fi
# Show at most the first 5 non-warning lines of build output; `|| true`
# because grep -v returns 1 when nothing survives the filter.
printf '%s\n' "$BUILD_OUT" | grep -v "warning:" | head -5 || true

CLI="./target/debug/ruvector"

echo ""
echo "1. Testing main help..."
"$CLI" --help | grep -A 2 "graph"

echo ""
echo "2. Testing graph command help..."
"$CLI" graph --help 2>&1 | head -20 || echo "Failed to show graph help"

echo ""
echo "3. Testing graph create..."
"$CLI" graph create --path /tmp/test-graph.db --name test --indexed 2>&1 | grep -v "warning:" || true

echo ""
echo "4. Testing graph info..."
"$CLI" graph info --db /tmp/test-graph.db 2>&1 | grep -v "warning:" || true

echo ""
echo "5. Listing available graph commands..."
echo " - create : Create new graph database"
echo " - query : Execute Cypher queries"
echo " - shell : Interactive REPL"
echo " - import : Import from CSV/JSON/Cypher"
echo " - export : Export to various formats"
echo " - info : Show database statistics"
echo " - benchmark : Run performance tests"
echo " - serve : Start HTTP/gRPC server"
echo ""
echo "============================================"
echo "All graph commands are registered!"
echo "============================================"

176
vendor/ruvector/scripts/test/test-wasm.mjs vendored Executable file
View File

@@ -0,0 +1,176 @@
#!/usr/bin/env node
/**
* WASM Package Test Script
* Tests ruvector-math-wasm and ruvector-attention-wasm in Node.js
*/
import { readFileSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
console.log('🧪 Testing RuVector WASM Packages\n');
// ============================================================================
// Test ruvector-math-wasm
// ============================================================================
/**
 * Smoke-tests the ruvector-math-wasm package.
 *
 * Loads the compiled WASM binary and JS bindings from the local
 * `crates/ruvector-math-wasm/pkg` build output, then exercises
 * SlicedWasserstein, ProductManifold and SphericalSpace with tiny
 * fixed inputs and sanity-checks the returned distances.
 *
 * @returns {Promise<boolean>} true if all checks passed; false on any
 *   error (errors are caught and logged rather than propagated).
 */
async function testMathWasm() {
  console.log('📦 Testing ruvector-math-wasm...');
  const pkgPath = join(__dirname, '../crates/ruvector-math-wasm/pkg');
  try {
    // Load WASM module
    const wasmPath = join(pkgPath, 'ruvector_math_wasm_bg.wasm');
    const wasmBuffer = readFileSync(wasmPath);
    // Import the JS bindings
    const mathWasm = await import(join(pkgPath, 'ruvector_math_wasm.js'));
    // Initialize with WASM bytes
    await mathWasm.default(wasmBuffer);
    // Test 1: Sliced Wasserstein Distance (constructor arg: projection count)
    console.log(' ├─ Testing SlicedWasserstein...');
    const sw = new mathWasm.WasmSlicedWasserstein(100);
    // Create test point clouds (3 points in 2D each)
    const source = new Float64Array([0, 0, 1, 0, 0, 1]);
    const target = new Float64Array([2, 0, 3, 0, 2, 1]);
    const distance = sw.distance(source, target, 2);
    console.log(` │ Distance: ${distance.toFixed(4)}`);
    // Loose range check only -- the exact value depends on random projections.
    if (distance > 0 && distance < 10) {
      console.log(' │ ✅ SlicedWasserstein works!');
    } else {
      throw new Error(`Unexpected distance: ${distance}`);
    }
    // Test 2: Product Manifold
    console.log(' ├─ Testing ProductManifold...');
    const manifold = new mathWasm.WasmProductManifold(4, 2, 2); // E^4 x H^2 x S^2
    // Create test points (8D total)
    const pointA = new Float64Array([1, 0, 0, 0, 0.1, 0.1, 1, 0]);
    const pointB = new Float64Array([0, 1, 0, 0, 0.2, 0.1, 0, 1]);
    const manifoldDist = manifold.distance(pointA, pointB);
    console.log(` │ Manifold distance: ${manifoldDist.toFixed(4)}`);
    // Only positivity is asserted; the exact metric value is not pinned.
    if (manifoldDist > 0) {
      console.log(' │ ✅ ProductManifold works!');
    } else {
      throw new Error(`Unexpected manifold distance: ${manifoldDist}`);
    }
    // Test 3: Spherical Space -- orthogonal unit vectors must be π/2 apart.
    console.log(' ├─ Testing SphericalSpace...');
    const sphere = new mathWasm.WasmSphericalSpace(3);
    const vecA = new Float64Array([1, 0, 0]);
    const vecB = new Float64Array([0, 1, 0]);
    const sphereDist = sphere.distance(vecA, vecB);
    console.log(` │ Spherical distance: ${sphereDist.toFixed(4)} (expected: ~1.5708 = π/2)`);
    if (Math.abs(sphereDist - Math.PI/2) < 0.01) {
      console.log(' │ ✅ SphericalSpace works!');
    } else {
      throw new Error(`Unexpected spherical distance: ${sphereDist}`);
    }
    console.log(' └─ ✅ ruvector-math-wasm: All tests passed!\n');
    return true;
  } catch (error) {
    console.error(' └─ ❌ Error:', error.message);
    return false;
  }
}
// ============================================================================
// Test ruvector-attention-wasm
// ============================================================================
/**
 * Smoke-tests the ruvector-attention-wasm package.
 *
 * If the `pkg/` build output is missing, the package is built on the fly
 * with wasm-pack (so this test can modify the working tree). It then only
 * verifies that the module loads and that the expected classes can be
 * constructed -- no numeric attention outputs are checked.
 *
 * @returns {Promise<boolean>} true if the package loaded; false on error.
 */
async function testAttentionWasm() {
  console.log('📦 Testing ruvector-attention-wasm...');
  const pkgPath = join(__dirname, '../crates/ruvector-attention-wasm/pkg');
  try {
    // Check if pkg exists (need to build first)
    const wasmPath = join(pkgPath, 'ruvector_attention_wasm_bg.wasm');
    let wasmBuffer;
    try {
      wasmBuffer = readFileSync(wasmPath);
    } catch {
      // Missing build output: build synchronously, streaming wasm-pack's
      // output to this terminal, then retry the read.
      console.log(' └─ ⚠️ Package not built. Building now...');
      const { execSync } = await import('child_process');
      execSync('wasm-pack build crates/ruvector-attention-wasm --target web --release', {
        cwd: join(__dirname, '..'),
        stdio: 'inherit'
      });
      wasmBuffer = readFileSync(wasmPath);
    }
    // Import the JS bindings
    const attentionWasm = await import(join(pkgPath, 'ruvector_attention_wasm.js'));
    // Initialize with WASM bytes
    await attentionWasm.default(wasmBuffer);
    // Test 1: Scaled Dot Product Attention (construction only; a missing
    // export is reported as a warning, not a failure).
    console.log(' ├─ Testing ScaledDotProductAttention...');
    if (attentionWasm.WasmScaledDotProductAttention) {
      const attention = new attentionWasm.WasmScaledDotProductAttention(64);
      console.log(' │ ✅ ScaledDotProductAttention initialized');
    } else {
      console.log(' │ ⚠️ ScaledDotProductAttention not exported');
    }
    // Test 2: Flash Attention (if available)
    console.log(' ├─ Testing FlashAttention...');
    if (attentionWasm.WasmFlashAttention) {
      const flash = new attentionWasm.WasmFlashAttention(64, 64);
      console.log(' │ ✅ FlashAttention initialized');
    } else {
      console.log(' │ ⚠️ FlashAttention not exported');
    }
    // List available exports (anything following the Wasm* naming scheme).
    console.log(' ├─ Available exports:');
    const exports = Object.keys(attentionWasm).filter(k => k.startsWith('Wasm'));
    exports.forEach(e => console.log(` │ - ${e}`));
    console.log(' └─ ✅ ruvector-attention-wasm: Package loaded successfully!\n');
    return true;
  } catch (error) {
    console.error(' └─ ❌ Error:', error.message);
    return false;
  }
}
// ============================================================================
// Run all tests
// ============================================================================
/**
 * Runs both package test suites and exits with status 0 only when both
 * pass, so CI can rely on the process exit code.
 */
async function main() {
  const results = {
    math: await testMathWasm(),
    attention: await testAttentionWasm()
  };
  console.log('═══════════════════════════════════════');
  console.log('📊 Test Results:');
  console.log(` ruvector-math-wasm: ${results.math ? '✅ PASS' : '❌ FAIL'}`);
  console.log(` ruvector-attention-wasm: ${results.attention ? '✅ PASS' : '❌ FAIL'}`);
  console.log('═══════════════════════════════════════');
  process.exit(results.math && results.attention ? 0 : 1);
}

main().catch((err) => {
  // BUG FIX: `.catch(console.error)` only logged an unexpected rejection
  // and left the exit code at 0, so CI reported success on a crash.
  console.error(err);
  process.exit(1);
});

View File

@@ -0,0 +1,100 @@
#!/bin/bash
# Pre-publish validation script for ruvector packages (without jq dependency)
# Checks required files and package.json fields for two npm packages and
# exits non-zero if any hard requirement is missing.
set -e

echo "🔍 Validating ruvector packages for npm publishing..."
echo ""

PASSED=0
FAILED=0
WARNINGS=0

# Counter helpers.
# BUG FIX: these used ((COUNTER++)), whose exit status is 1 when the counter
# is 0. Under `set -e` with the `check && pass || fail` pattern below, the
# very first passing check therefore executed BOTH branches and then killed
# the script. Arithmetic *assignment* always returns 0.
pass() { echo "$1"; PASSED=$((PASSED + 1)); }
fail() { echo "$1"; FAILED=$((FAILED + 1)); }
warn() { echo "$1"; WARNINGS=$((WARNINGS + 1)); }

echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " @ruvector/psycho-symbolic-integration"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
cd packages/psycho-symbolic-integration
[ -f "package.json" ] && pass "package.json exists" || fail "package.json missing"
[ -f "README.md" ] && pass "README.md exists" || fail "README.md missing"
[ -f "LICENSE" ] && pass "LICENSE exists" || warn "LICENSE missing"
[ -f ".npmignore" ] && pass ".npmignore exists" || warn ".npmignore missing"
[ -f "tsconfig.json" ] && pass "tsconfig.json exists" || warn "tsconfig.json missing"
[ -d "src" ] && pass "src/ directory exists" || fail "src/ directory missing"
[ -d "node_modules" ] && pass "dependencies installed" || warn "run npm install first"
grep -q '"name":' package.json && pass "name field exists" || fail "name field missing"
grep -q '"version":' package.json && pass "version field exists" || fail "version field missing"
grep -q '"description":' package.json && pass "description field exists" || fail "description field missing"
grep -q '"repository":' package.json && pass "repository field exists" || warn "repository field missing"
grep -q '"publishConfig":' package.json && pass "publishConfig exists" || warn "publishConfig missing"
cd ../..

echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " @ruvector/psycho-synth-examples"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
cd packages/psycho-synth-examples
[ -f "package.json" ] && pass "package.json exists" || fail "package.json missing"
[ -f "README.md" ] && pass "README.md exists" || fail "README.md missing"
[ -f "LICENSE" ] && pass "LICENSE exists" || warn "LICENSE missing"
[ -f ".npmignore" ] && pass ".npmignore exists" || warn ".npmignore missing"
[ -f "tsconfig.json" ] && pass "tsconfig.json exists" || warn "tsconfig.json missing"
[ -d "src" ] && pass "src/ directory exists" || fail "src/ directory missing"
[ -d "bin" ] && pass "bin/ directory exists" || fail "bin/ directory missing"
[ -d "examples" ] && pass "examples/ directory exists" || fail "examples/ directory missing"
[ -d "node_modules" ] && pass "dependencies installed" || warn "run npm install first"
[ -f "bin/cli.js" ] && pass "CLI file exists" || fail "CLI file missing"
[ -x "bin/cli.js" ] && pass "CLI is executable" || warn "CLI not executable"
if head -1 bin/cli.js | grep -q "^#!/usr/bin/env node"; then
    pass "CLI has correct shebang"
else
    fail "CLI missing shebang"
fi
grep -q '"name":' package.json && pass "name field exists" || fail "name field missing"
grep -q '"version":' package.json && pass "version field exists" || fail "version field missing"
grep -q '"bin":' package.json && pass "bin field exists" || fail "bin field missing"
grep -q '"repository":' package.json && pass "repository field exists" || warn "repository field missing"
grep -q '"publishConfig":' package.json && pass "publishConfig exists" || warn "publishConfig missing"

# Test CLI end-to-end (output discarded; only the exit status matters).
echo ""
if node bin/cli.js list > /dev/null 2>&1; then
    pass "CLI 'list' command works"
else
    fail "CLI 'list' command failed"
fi
cd ../..

echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " Summary"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "Passed: $PASSED"
echo "Warnings: $WARNINGS"
echo "Failed: $FAILED"
echo ""
if [ $FAILED -gt 0 ]; then
    echo "❌ Validation failed with $FAILED errors"
    exit 1
elif [ $WARNINGS -gt 0 ]; then
    echo "⚠️ Validation passed with $WARNINGS warnings"
    exit 0
else
    echo "✅ All validations passed!"
    exit 0
fi

View File

@@ -0,0 +1,221 @@
#!/bin/bash
# Pre-publish validation script for ruvector packages
#
# Validates the npm packages under packages/ for publishing readiness
# (metadata fields, README/LICENSE, bin entries, files allow-list) and
# exits non-zero if any hard check fails. Requires: jq, node.
set -e
# Resolve this script's directory and the repository root relative to the
# script file itself, so it works regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(dirname "$SCRIPT_DIR")"
echo "🔍 Validating ruvector packages for npm publishing..."
echo ""
# Colors (ANSI escape sequences, expanded by `echo -e` at use sites)
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Validation counters (updated only through the pass/fail/warn helpers)
PASSED=0
FAILED=0
WARNINGS=0
# Helper functions
# Record a passing check: print the message in green and bump the counter.
# NOTE(review): the check-mark glyph between ${GREEN} and ${NC} appears to
# have been lost in extraction — confirm against the original file.
pass() {
  echo -e "${GREEN}${NC} $1"
  # Bug fix: ((PASSED++)) returns the *pre*-increment value, so with
  # PASSED=0 it exits with status 1 and, under `set -e`, aborted the whole
  # script on the very first passing check. Plain assignment always
  # succeeds and leaves the function returning 0.
  PASSED=$((PASSED + 1))
}
# Record a failing check: print the message in red and bump the counter.
# NOTE(review): the cross glyph between ${RED} and ${NC} appears to have
# been lost in extraction — confirm against the original file.
fail() {
  echo -e "${RED}${NC} $1"
  # Bug fix: ((FAILED++)) exits with status 1 when FAILED is 0 (post-
  # increment yields the old value), which tripped `set -e` on the first
  # bare call. Plain assignment always returns 0.
  FAILED=$((FAILED + 1))
}
# Record an advisory check: print the message in yellow and bump the counter.
# NOTE(review): the warning glyph between ${YELLOW} and ${NC} appears to
# have been lost in extraction — confirm against the original file.
warn() {
  echo -e "${YELLOW}${NC} $1"
  # Bug fix: ((WARNINGS++)) exits with status 1 when WARNINGS is 0, which
  # tripped `set -e` on the first bare call. Plain assignment returns 0.
  WARNINGS=$((WARNINGS + 1))
}
# Print a blank line followed by a section heading boxed between two
# heavy-rule divider lines.
section() {
  local divider="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  printf '\n%s\n %s\n%s\n' "$divider" "$1" "$divider"
}
#######################################
# Validate one npm package directory for publishing readiness.
# Globals:
#   ROOT_DIR - repository root; restored as cwd before returning
#   PASSED / FAILED / WARNINGS - updated via the pass/fail/warn helpers
# Arguments:
#   $1 - package directory (absolute path)
#   $2 - display name used in the section header
# Returns:
#   1 when package.json is missing; 0 otherwise. Individual check results
#   are accumulated in the global counters.
#######################################
validate_package() {
  local PKG_DIR="$1"
  local PKG_NAME="$2"
  section "Validating: $PKG_NAME"
  cd "$PKG_DIR"

  # package.json is the one hard prerequisite; nothing else applies without it.
  if [ -f "package.json" ]; then
    pass "package.json exists"
  else
    fail "package.json missing"
    return 1
  fi

  # Required metadata fields. Declarations are separated from the command
  # substitutions so a jq failure (e.g. malformed JSON) is not masked by
  # `local`'s own (always-zero) exit status.
  local name version description license repository
  name=$(jq -r '.name' package.json)
  version=$(jq -r '.version' package.json)
  description=$(jq -r '.description' package.json)
  license=$(jq -r '.license' package.json)
  repository=$(jq -r '.repository.url' package.json)
  [ "$name" != "null" ] && pass "name: $name" || fail "name missing"
  [ "$version" != "null" ] && pass "version: $version" || fail "version missing"
  [ "$description" != "null" ] && pass "description exists" || fail "description missing"
  [ "$license" != "null" ] && pass "license: $license" || fail "license missing"
  [ "$repository" != "null" ] && pass "repository URL set" || warn "repository URL missing"

  # README: must exist and be long enough to be useful on the npm page.
  if [ -f "README.md" ]; then
    # Arithmetic expansion strips any padding BSD `wc` adds to the count.
    local readme_size
    readme_size=$(( $(wc -c < README.md) ))
    if [ "$readme_size" -gt 500 ]; then
      # numfmt accepts the number as an argument; no echo pipeline needed.
      pass "README.md exists ($(numfmt --to=iec-i --suffix=B "$readme_size"))"
    else
      warn "README.md exists but seems short (${readme_size} bytes)"
    fi
  else
    fail "README.md missing"
  fi

  # Optional-but-recommended files.
  if [ -f "LICENSE" ]; then
    pass "LICENSE exists"
  else
    warn "LICENSE missing"
  fi
  if [ -f ".npmignore" ]; then
    pass ".npmignore exists"
  else
    warn ".npmignore missing (npm will use .gitignore)"
  fi
  if [ -f "tsconfig.json" ]; then
    pass "tsconfig.json exists"
  else
    warn "tsconfig.json missing"
  fi

  # Source tree.
  if [ -d "src" ]; then
    local src_files
    src_files=$(find src -name "*.ts" -type f | wc -l)
    pass "src/ directory exists ($src_files TypeScript files)"
  else
    fail "src/ directory missing"
  fi

  # Installed dependencies (needed for the CLI smoke test run later).
  if [ -d "node_modules" ]; then
    pass "node_modules exists (dependencies installed)"
  else
    warn "node_modules missing - run npm install"
  fi

  # Package scripts.
  local has_build
  has_build=$(jq -r '.scripts.build' package.json)
  [ "$has_build" != "null" ] && pass "build script defined" || warn "build script missing"

  # CLI packages: the bin entry must point at an executable node script.
  local has_bin
  has_bin=$(jq -r '.bin' package.json)
  if [ "$has_bin" != "null" ]; then
    pass "bin entry defined (CLI package)"
    # bin may be a plain string or an object; take the first mapped file.
    local bin_file
    bin_file=$(jq -r '.bin | if type=="object" then .[keys[0]] else . end' package.json)
    if [ -f "$bin_file" ]; then
      pass "bin file exists: $bin_file"
      # npm-installed bins require a node shebang on the first line.
      if head -1 "$bin_file" | grep -q "^#!/usr/bin/env node"; then
        pass "bin file has correct shebang"
      else
        fail "bin file missing shebang: #!/usr/bin/env node"
      fi
      # Executable bit is advisory: npm restores it at install time.
      if [ -x "$bin_file" ]; then
        pass "bin file is executable"
      else
        warn "bin file not executable - will be fixed by npm"
      fi
    else
      fail "bin file missing: $bin_file"
    fi
  fi

  # Scoped packages must opt in to public access explicitly.
  local publish_access
  publish_access=$(jq -r '.publishConfig.access' package.json)
  [ "$publish_access" == "public" ] && pass "publishConfig.access: public" || warn "publishConfig.access not set to public (scoped packages need this)"

  # files allow-list. Bug fix: the original piped jq into `while read`,
  # which ran the loop body in a subshell — the pass/warn counter updates
  # made inside it were silently discarded. Process substitution keeps the
  # loop in the current shell so the counters survive.
  local files
  files=$(jq -r '.files' package.json)
  if [ "$files" != "null" ]; then
    pass "files field defined"
    local file
    while IFS= read -r file; do
      # "dist" is allowed to be absent because the build step creates it.
      if [ -e "$file" ] || [ "$file" == "dist" ]; then
        pass " - $file exists (or will be created by build)"
      else
        warn " - $file listed but not found"
      fi
    done < <(jq -r '.files[]' package.json)
  else
    warn "files field not defined (npm will include everything not in .npmignore)"
  fi

  cd "$ROOT_DIR"
}
# ---------------------------------------------------------------------------
# Main driver: run the full check battery on each published package, smoke-
# test the examples CLI end to end, then print a summary and pick the exit
# code (failures are fatal, warnings are advisory).
# ---------------------------------------------------------------------------
cd "$ROOT_DIR"

# Both packages get identical treatment from validate_package.
validate_package "$ROOT_DIR/packages/psycho-symbolic-integration" "@ruvector/psycho-symbolic-integration"
validate_package "$ROOT_DIR/packages/psycho-synth-examples" "@ruvector/psycho-synth-examples"

# CLI smoke test: the 'list' subcommand must run cleanly.
section "Testing CLI Functionality"
cd "$ROOT_DIR/packages/psycho-synth-examples"
if node bin/cli.js list > /dev/null 2>&1; then
  pass "CLI 'list' command works"
else
  fail "CLI 'list' command failed"
fi
cd "$ROOT_DIR"

# Summary. printf '%b' expands the \033 color escapes exactly as `echo -e`
# did; the strings themselves are unchanged.
section "Validation Summary"
echo ""
printf '%b\n' "${GREEN}Passed:${NC} $PASSED"
printf '%b\n' "${YELLOW}Warnings:${NC} $WARNINGS"
printf '%b\n' "${RED}Failed:${NC} $FAILED"
echo ""
if (( FAILED > 0 )); then
  printf '%b\n' "${RED}❌ Validation failed with $FAILED errors${NC}"
  echo "Please fix the errors before publishing."
  exit 1
elif (( WARNINGS > 0 )); then
  printf '%b\n' "${YELLOW}⚠️ Validation passed with $WARNINGS warnings${NC}"
  echo "Consider addressing warnings before publishing."
  exit 0
else
  printf '%b\n' "${GREEN}✅ All validations passed!${NC}"
  echo "Packages are ready for publishing."
  exit 0
fi

View File

@@ -0,0 +1,123 @@
#!/bin/bash
# ============================================================================
# Verification script for the LocalKCut paper implementation.
#
# Checks, in order: implementation and docs exist, the module is wired into
# the crate, unit tests pass, the expected test count is present, the public
# API matches the paper specification, the implementation is deterministic,
# and the witness API is integrated. Exits non-zero on the first failure.
# Run from the repository root.
# ============================================================================
set -e

# Paths referenced by multiple checks below.
IMPL="crates/ruvector-mincut/src/localkcut/paper_impl.rs"
MOD_FILE="crates/ruvector-mincut/src/localkcut/mod.rs"
DOC_FILE="docs/localkcut-paper-implementation.md"

# Assert that $IMPL contains pattern $1, reporting it as item $2.
# Bug fix: the original had no else branches for these checks, so a missing
# API item was silently skipped and the script still reported full success.
require_in_impl() {
  if grep -q "$1" "$IMPL"; then
    echo " ✓ $2"
  else
    echo " ✗ $2 missing"
    exit 1
  fi
}

echo "==============================================="
echo "LocalKCut Paper Implementation Verification"
echo "==============================================="
echo ""

echo "1. Checking files exist..."
if [ -f "$IMPL" ]; then
  echo " ✓ paper_impl.rs created"
  wc -l "$IMPL"
else
  echo " ✗ paper_impl.rs not found"
  exit 1
fi
if [ -f "$DOC_FILE" ]; then
  echo " ✓ Documentation created"
  wc -l "$DOC_FILE"
else
  echo " ✗ Documentation not found"
  exit 1
fi
echo ""

echo "2. Verifying module structure..."
if grep -q "pub mod paper_impl;" "$MOD_FILE"; then
  echo " ✓ paper_impl module exported"
else
  echo " ✗ Module export missing"
  exit 1
fi
if grep -q "LocalKCutQuery" "$MOD_FILE"; then
  echo " ✓ API types re-exported"
else
  echo " ✗ API types not exported"
  exit 1
fi
echo ""

echo "3. Running unit tests..."
cargo test -p ruvector-mincut --lib localkcut::paper_impl::tests --quiet
echo ""

echo "4. Checking test count..."
# grep -c replaces the grep|wc -l pipeline. It still prints 0 when nothing
# matches but exits 1, so `|| true` keeps `set -e` from aborting.
TEST_COUNT=$(cargo test -p ruvector-mincut --lib localkcut::paper_impl::tests -- --list 2>/dev/null | grep -c "test" || true)
echo " Found $TEST_COUNT tests"
if [ "$TEST_COUNT" -ge 16 ]; then
  echo " ✓ All tests present ($TEST_COUNT >= 16)"
else
  echo " ✗ Missing tests ($TEST_COUNT < 16)"
  exit 1
fi
echo ""

echo "5. Verifying API compliance..."
require_in_impl "pub struct LocalKCutQuery" "LocalKCutQuery struct"
require_in_impl "pub enum LocalKCutResult" "LocalKCutResult enum"
require_in_impl "pub trait LocalKCutOracle" "LocalKCutOracle trait"
require_in_impl "pub struct DeterministicLocalKCut" "DeterministicLocalKCut implementation"
require_in_impl "pub struct DeterministicFamilyGenerator" "DeterministicFamilyGenerator"
echo ""

echo "6. Verifying determinism..."
if grep -q "sort_unstable()" "$IMPL"; then
  echo " ✓ Uses sorted traversal for determinism"
else
  echo " ✗ Missing deterministic ordering"
  exit 1
fi
# Any `use ... rand` import would indicate a randomized (non-deterministic)
# implementation.
if ! grep -q "use.*rand" "$IMPL"; then
  echo " ✓ No randomness detected"
else
  echo " ✗ Uses randomness (not deterministic)"
  exit 1
fi
echo ""

echo "7. Checking witness integration..."
require_in_impl "WitnessHandle::new" "Creates WitnessHandle"
require_in_impl "boundary_size" "Uses boundary_size API"
echo ""

echo "==============================================="
echo "✓ All verifications passed!"
echo "==============================================="
echo ""
echo "Summary:"
echo " - Implementation: crates/ruvector-mincut/src/localkcut/paper_impl.rs"
echo " - Tests: 16 comprehensive unit tests"
echo " - Documentation: docs/localkcut-paper-implementation.md"
echo " - API: Strictly compliant with paper specification"
echo " - Determinism: Verified (no randomness)"
echo " - Integration: Exports available at crate root"
echo ""
echo "Usage:"
echo " cargo test -p ruvector-mincut --lib localkcut::paper_impl"
echo ""
View File

@@ -0,0 +1,164 @@
#!/bin/bash
# ============================================================================
# HNSW Index Build Verification Script
# ============================================================================
# Verifies that the HNSW index implementation compiles and tests pass.
# Run from the ruvector repository root.
set -e # Exit on error
# Bug fix: pipefail is essential here. The cargo stages below are piped
# through `tee`, and without it each `if` tested tee's (always-zero) exit
# status — compile, test, and pgrx failures could never reach the error
# branch.
set -o pipefail

echo "=================================="
echo "HNSW Index Build Verification"
echo "=================================="
echo ""

# Color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check we're in the right directory
if [ ! -f "Cargo.toml" ]; then
  echo -e "${RED}Error: Must run from ruvector root directory${NC}"
  exit 1
fi

# Step 1: compile check (the log feeds the warning scan in step 7).
echo -e "${YELLOW}Step 1: Checking Rust compilation...${NC}"
cd crates/ruvector-postgres
if cargo check --all-features 2>&1 | tee /tmp/hnsw_check.log; then
  echo -e "${GREEN}✓ Rust code compiles successfully${NC}"
else
  echo -e "${RED}✗ Rust compilation failed${NC}"
  echo "See /tmp/hnsw_check.log for details"
  exit 1
fi
echo ""

# Step 2: unit tests.
echo -e "${YELLOW}Step 2: Running Rust unit tests...${NC}"
if cargo test --lib 2>&1 | tee /tmp/hnsw_test.log; then
  echo -e "${GREEN}✓ Rust tests passed${NC}"
else
  echo -e "${RED}✗ Rust tests failed${NC}"
  echo "See /tmp/hnsw_test.log for details"
  exit 1
fi
echo ""

# Step 3: package the PostgreSQL extension with pgrx.
echo -e "${YELLOW}Step 3: Building pgrx extension...${NC}"
if cargo pgrx package 2>&1 | tee /tmp/hnsw_pgrx.log; then
  echo -e "${GREEN}✓ pgrx extension built successfully${NC}"
else
  echo -e "${RED}✗ pgrx build failed${NC}"
  echo "See /tmp/hnsw_pgrx.log for details"
  exit 1
fi
echo ""

# Step 4: required SQL artifacts (paths relative to crates/ruvector-postgres).
echo -e "${YELLOW}Step 4: Verifying SQL files...${NC}"
SQL_FILES=(
  "sql/ruvector--0.1.0.sql"
  "sql/hnsw_index.sql"
  "tests/hnsw_index_tests.sql"
)
ALL_SQL_EXIST=true
for file in "${SQL_FILES[@]}"; do
  if [ -f "$file" ]; then
    echo -e "${GREEN}✓ Found: $file${NC}"
  else
    echo -e "${RED}✗ Missing: $file${NC}"
    ALL_SQL_EXIST=false
  fi
done
if [ "$ALL_SQL_EXIST" = false ]; then
  echo -e "${RED}Some SQL files are missing${NC}"
  exit 1
fi
echo ""

# Step 5: required Rust source files.
echo -e "${YELLOW}Step 5: Verifying Rust source files...${NC}"
RUST_FILES=(
  "src/index/hnsw.rs"
  "src/index/hnsw_am.rs"
  "src/index/mod.rs"
)
ALL_RUST_EXIST=true
for file in "${RUST_FILES[@]}"; do
  if [ -f "$file" ]; then
    echo -e "${GREEN}✓ Found: $file${NC}"
  else
    echo -e "${RED}✗ Missing: $file${NC}"
    ALL_RUST_EXIST=false
  fi
done
if [ "$ALL_RUST_EXIST" = false ]; then
  echo -e "${RED}Some Rust files are missing${NC}"
  exit 1
fi
echo ""

# Step 6: documentation. Missing docs are a warning, not a hard failure.
echo -e "${YELLOW}Step 6: Verifying documentation...${NC}"
cd ../.. # Back to root
DOC_FILES=(
  "docs/HNSW_INDEX.md"
)
ALL_DOCS_EXIST=true
for file in "${DOC_FILES[@]}"; do
  if [ -f "$file" ]; then
    echo -e "${GREEN}✓ Found: $file${NC}"
  else
    echo -e "${RED}✗ Missing: $file${NC}"
    ALL_DOCS_EXIST=false
  fi
done
# Bug fix: the original computed ALL_DOCS_EXIST but never acted on it;
# surface the result so the check is not silently dead.
if [ "$ALL_DOCS_EXIST" = false ]; then
  echo -e "${YELLOW}⚠ Some documentation files are missing${NC}"
fi
echo ""

# Step 7: warning scan over the step-1 compile log.
echo -e "${YELLOW}Step 7: Checking for warnings...${NC}"
# grep -c prints 0 (and exits 1) when nothing matches; `|| true` keeps
# `set -e` happy, and the :-0 default guards against a missing log file
# (grep exit 2, empty output).
WARNING_COUNT=$(grep -c "warning:" /tmp/hnsw_check.log 2>/dev/null || true)
WARNING_COUNT=${WARNING_COUNT:-0}
if [ "$WARNING_COUNT" -eq 0 ]; then
  echo -e "${GREEN}✓ No compilation warnings${NC}"
else
  echo -e "${YELLOW}⚠ Found $WARNING_COUNT warnings${NC}"
  echo "Check /tmp/hnsw_check.log for details"
fi
echo ""

# Summary
echo "=================================="
echo -e "${GREEN}All verification checks passed!${NC}"
echo "=================================="
echo ""
echo "Next steps:"
echo "1. Install extension: cargo pgrx install"
echo "2. Run SQL tests: psql -d testdb -f crates/ruvector-postgres/tests/hnsw_index_tests.sql"
echo "3. Create index: CREATE INDEX ON table USING hnsw (column hnsw_l2_ops);"
echo ""
echo "Documentation: docs/HNSW_INDEX.md"
echo ""