Files
wifi-densepose/crates/ruvector-bench/scripts/download_datasets.sh
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

103 lines
3.9 KiB
Bash
Executable File

#!/bin/bash
#
# Guide for obtaining the ANN benchmark datasets SIFT1M, GIST1M and Deep1M.

# Abort as soon as any command exits with a non-zero status.
set -o errexit

# Directory for dataset archives. The path is relative, so it resolves
# against the directory the script is invoked from.
DATASETS_DIR="datasets"
mkdir -p "${DATASETS_DIR}"

# Title banner followed by one blank line.
printf '%s\n' \
  "╔════════════════════════════════════════╗" \
  "║ ANN Benchmark Dataset Downloader ║" \
  "╚════════════════════════════════════════╝" \
  ""
#######################################
# Download a dataset archive into $DATASETS_DIR and unpack it there.
# Globals:   DATASETS_DIR (read) - directory receiving the archive and its
#            extracted contents
# Arguments: $1 - human-readable dataset name (used in messages only)
#            $2 - URL of the archive; its basename becomes the local filename
# Outputs:   progress messages on stdout, failures and warnings on stderr
# Returns:   0 on success (or when the file is not a recognised archive),
#            non-zero if the download or the extraction fails
#######################################
download_dataset() {
  local name=$1
  local url=$2
  # Declared separately so a failing basename is not hidden by `local`.
  local file
  file=$(basename "$url") || return
  local target="$DATASETS_DIR/$file"
  local partial="$target.part"

  echo "Downloading $name..."
  if [ -f "$target" ]; then
    echo " ✓ Already downloaded: $file"
  else
    # wget -O creates its output file before any data arrives, so fetch into
    # a side file and rename only on success. Otherwise an interrupted
    # download would be reported as "Already downloaded" on the next run.
    if ! wget -q --show-progress -O "$partial" "$url"; then
      rm -f -- "$partial"
      echo " ✗ Download failed: $url" >&2
      return 1
    fi
    mv -- "$partial" "$target"
    echo " ✓ Downloaded: $file"
  fi

  echo "Extracting $name..."
  if [[ $file == *.tar.gz ]]; then
    if ! tar -xzf "$target" -C "$DATASETS_DIR"; then
      echo " ✗ Extraction failed: $file" >&2
      return 1
    fi
  elif [[ $file == *.gz ]]; then
    # -k keeps the archive so the "already downloaded" check still works;
    # -f overwrites output left by a previous run instead of failing on it.
    if ! gunzip -kf "$target"; then
      echo " ✗ Extraction failed: $file" >&2
      return 1
    fi
  else
    # Nothing was unpacked, so do not claim success.
    echo " ⚠ Unrecognised archive type, nothing extracted: $file" >&2
    echo ""
    return 0
  fi
  echo " ✓ Extracted successfully"
  echo ""
}
# Real datasets. Nothing here is fetched automatically; the text only tells
# the user where each dataset can be obtained by hand.
#   1. SIFT1M - 128D, 1M vectors (http://corpus-texmex.irisa.fr/)
#   2. GIST1M - 960D, 1M vectors
#   3. Deep1M -  96D, 1M vectors
cat <<'NOTICE'
1. SIFT1M Dataset (128 dimensions, 1M vectors)
 Download from: http://corpus-texmex.irisa.fr/
 Note: Direct download requires manual intervention due to terms of service
 Please visit the website and download sift.tar.gz manually to datasets/

2. GIST1M Dataset (960 dimensions, 1M vectors)
 Download from: http://corpus-texmex.irisa.fr/
 Note: Direct download requires manual intervention due to terms of service
 Please visit the website and download gist.tar.gz manually to datasets/

3. Deep1M Dataset (96 dimensions, 1M vectors)
 Download from: http://sites.skoltech.ru/compvision/noimi/
 Note: This dataset may require registration

NOTICE
# Synthetic-data alternative: tells the user that the benchmark tools can
# generate their own datasets and shows the command line for doing so.
printf '%s\n' \
  "═══════════════════════════════════════════════════════════════" \
  "ALTERNATIVE: Generate Synthetic Datasets" \
  "═══════════════════════════════════════════════════════════════" \
  "" \
  "If you prefer to use synthetic data for benchmarking, the" \
  "benchmark tools will automatically generate appropriate datasets." \
  "" \
  "To run with synthetic data:" \
  " cargo run --release --bin ann-benchmark -- --dataset synthetic" \
  ""
# Dependency probe: an h5dump command being resolvable is taken as evidence
# that the HDF5 command-line tools are installed.
printf '%s\n' "Checking dependencies..."
if ! command -v h5dump >/dev/null 2>&1; then
  # Missing tools only produce a warning with install hints; nothing aborts.
  printf '%s\n' \
    " ⚠ HDF5 tools not found. Install with:" \
    " Ubuntu/Debian: sudo apt-get install hdf5-tools" \
    " macOS: brew install hdf5" \
    " Note: HDF5 is optional for synthetic benchmarks"
else
  printf '%s\n' " ✓ HDF5 tools installed"
fi
printf '\n'
# Closing summary: the three ways forward (manual download, synthetic data,
# running the benchmarks). The here-document is unquoted on purpose so that
# $DATASETS_DIR is expanded in the "Place in" and "Extract" lines.
cat <<SUMMARY
════════════════════════════════════════
Setup Instructions:
════════════════════════════════════════

1. Manual Download (for real datasets):
 - Visit http://corpus-texmex.irisa.fr/
 - Download sift.tar.gz, gist.tar.gz
 - Place in: ${DATASETS_DIR}/
 - Extract: tar -xzf ${DATASETS_DIR}/sift.tar.gz -C ${DATASETS_DIR}/

2. Synthetic Datasets (recommended for testing):
 - No download required
 - Generated automatically by benchmark tools
 - Suitable for performance testing and profiling

3. Run Benchmarks:
 cd crates/ruvector-bench
 cargo run --release --bin ann-benchmark

✓ Setup guide complete!
SUMMARY