# Benchmark workflow for the ruvllm crate.
# Runs ANE/NEON benchmarks on Apple Silicon, a NEON-only baseline on Linux,
# and posts a cross-platform comparison on pull requests.
name: RuvLLM Benchmarks

on:
  pull_request:
    paths:
      - 'crates/ruvllm/**'
      - '.github/workflows/ruvllm-benchmarks.yml'
  push:
    branches:
      - main
      - develop
    paths:
      - 'crates/ruvllm/**'
  workflow_dispatch:
    inputs:
      run_ane_benchmarks:
        description: 'Run ANE benchmarks (macOS only)'
        required: false
        # boolean-typed inputs take boolean defaults, not strings
        default: true
        type: boolean
      run_full_suite:
        description: 'Run full benchmark suite (takes longer)'
        required: false
        default: false
        type: boolean

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1

permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  # macOS ARM64 benchmarks (Apple Silicon with ANE)
  macos-arm64-benchmarks:
    name: macOS ARM64 Benchmarks (M-series)
    runs-on: macos-14  # M1/M2 runner
    timeout-minutes: 45
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable
        with:
          targets: aarch64-apple-darwin

      - name: Cache cargo registry
        uses: actions/cache@v4
        with:
          path: ~/.cargo/registry
          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-registry-

      - name: Cache cargo build
        uses: actions/cache@v4
        with:
          path: target
          key: ${{ runner.os }}-cargo-build-ruvllm-bench-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-build-ruvllm-bench-
            ${{ runner.os }}-cargo-build-

      - name: Build ruvllm with ANE support
        run: |
          cargo build --release -p ruvllm --features "coreml,accelerate"

      # github.event.inputs.* are strings (and empty on push/PR), so this
      # gate runs the step everywhere except an explicit dispatch opt-out.
      - name: Run ANE vs NEON benchmarks
        if: github.event.inputs.run_ane_benchmarks != 'false'
        working-directory: crates/ruvllm
        run: |
          # pipefail: don't let `tee` mask a failing `cargo bench`
          set -o pipefail
          # Run the ANE comparison benchmarks
          cargo bench --features "coreml,accelerate" --bench ane_bench -- \
            --output-format bencher 2>&1 | tee ../../ane_bench_results.txt

      - name: Run crossover detection benchmark
        if: github.event.inputs.run_full_suite == 'true'
        working-directory: crates/ruvllm
        run: |
          set -o pipefail
          cargo bench --features "coreml,accelerate" --bench ane_bench -- \
            crossover_detection --output-format bencher 2>&1 | tee -a ../../ane_bench_results.txt

      - name: Run hybrid pipeline benchmark
        if: github.event.inputs.run_full_suite == 'true'
        working-directory: crates/ruvllm
        run: |
          set -o pipefail
          cargo bench --features "coreml,accelerate" --bench ane_bench -- \
            hybrid_pipeline --output-format bencher 2>&1 | tee -a ../../ane_bench_results.txt

      - name: Run matmul benchmarks
        working-directory: crates/ruvllm
        run: |
          set -o pipefail
          cargo bench --features "coreml,accelerate" --bench matmul_bench -- \
            --output-format bencher 2>&1 | tee ../../matmul_bench_results.txt

      - name: Run attention benchmarks
        working-directory: crates/ruvllm
        run: |
          set -o pipefail
          cargo bench --features "coreml,accelerate" --bench attention_bench -- \
            --output-format bencher 2>&1 | tee ../../attention_bench_results.txt

      - name: Generate benchmark summary
        run: |
          cat > benchmark_summary.md << 'EOF'
          # RuvLLM Benchmark Results (macOS ARM64 with ANE)

          ## System Information
          - Runner: macOS 14 (Apple Silicon M-series)
          - Features: coreml, accelerate

          ## ANE vs NEON Performance
          The ANE (Apple Neural Engine) benchmarks measure:
          - Matrix multiplication at various sizes
          - Activation functions (SiLU, GELU, Softmax)
          - Normalization (LayerNorm, RMSNorm)
          - Hybrid pipeline (ANE + GPU coordination)

          ### Expected Performance Characteristics (M4 Pro)
          | Matrix Size | ANE Advantage |
          |-------------|---------------|
          | < 512 | +30-50% faster |
          | 512-1024 | +10-30% faster |
          | 1024-1536 | ~Similar |
          | 1536-2048 | GPU preferred |
          | > 2048 | GPU wins 30-50%|

          ## Results

          ### ANE Benchmark Results
          ```
          EOF
          head -n 100 ane_bench_results.txt >> benchmark_summary.md
          cat >> benchmark_summary.md << 'EOF'
          ```

          ### Matrix Multiplication Results
          ```
          EOF
          head -n 50 matmul_bench_results.txt >> benchmark_summary.md
          cat >> benchmark_summary.md << 'EOF'
          ```

          ### Attention Results
          ```
          EOF
          head -n 50 attention_bench_results.txt >> benchmark_summary.md
          echo '```' >> benchmark_summary.md

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: ruvllm-macos-arm64-benchmarks
          path: |
            ane_bench_results.txt
            matmul_bench_results.txt
            attention_bench_results.txt
            benchmark_summary.md
          retention-days: 30

      - name: Comment PR with results
        if: github.event_name == 'pull_request'
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summary = fs.readFileSync('benchmark_summary.md', 'utf8');
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: summary
            });

  # Linux benchmarks (NEON only baseline)
  linux-benchmarks:
    name: Linux Benchmarks (NEON baseline)
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable

      - name: Cache cargo
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            target
          key: ${{ runner.os }}-cargo-ruvllm-bench-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-ruvllm-bench-

      - name: Run matmul benchmarks (NEON simulation)
        working-directory: crates/ruvllm
        run: |
          set -o pipefail
          cargo bench --bench matmul_bench -- --output-format bencher 2>&1 | tee ../../linux_matmul_bench.txt

      - name: Run attention benchmarks
        working-directory: crates/ruvllm
        run: |
          set -o pipefail
          cargo bench --bench attention_bench -- --output-format bencher 2>&1 | tee ../../linux_attention_bench.txt

      - name: Upload Linux benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: ruvllm-linux-benchmarks
          path: |
            linux_matmul_bench.txt
            linux_attention_bench.txt
          retention-days: 30

  # Benchmark comparison job
  benchmark-comparison:
    name: Compare Benchmarks
    runs-on: ubuntu-latest
    needs: [macos-arm64-benchmarks, linux-benchmarks]
    if: github.event_name == 'pull_request'
    steps:
      - name: Download macOS results
        uses: actions/download-artifact@v4
        with:
          name: ruvllm-macos-arm64-benchmarks
          path: macos-results

      - name: Download Linux results
        uses: actions/download-artifact@v4
        with:
          name: ruvllm-linux-benchmarks
          path: linux-results

      - name: Generate comparison report
        run: |
          cat > comparison.md << 'EOF'
          # Cross-Platform Benchmark Comparison

          ## macOS ARM64 (Apple Silicon with ANE)
          ```
          EOF
          head -n 30 macos-results/ane_bench_results.txt >> comparison.md
          cat >> comparison.md << 'EOF'
          ```

          ## Linux x86_64 (Baseline)
          ```
          EOF
          head -n 30 linux-results/linux_matmul_bench.txt >> comparison.md
          echo '```' >> comparison.md

      - name: Upload comparison
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-comparison
          path: comparison.md
          retention-days: 30