//! Attention Mechanism Latency Benchmarks
//!
//! Benchmark each attention mechanism at 100 tokens.
//! Target: <100 microseconds per mechanism.
//!
//! Run with: cargo bench --bench attention_latency
use std::hint::black_box;

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
/// Deterministically generate a pseudo-random `f32` vector in `[-1, 1)`.
///
/// Mixes `seed` with the element index via a multiplicative hash so the
/// benchmark inputs are reproducible without pulling in an RNG crate.
fn random_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut out = Vec::with_capacity(dim);
    for i in 0..dim {
        let mixed = seed.wrapping_mul(i as u64 + 1).wrapping_mul(0x5DEECE66D);
        let bucket = (mixed % 1000) as f32;
        out.push(bucket / 500.0 - 1.0); // map [0, 1000) onto [-1, 1)
    }
    out
}
/// Generate batch of random vectors
|
|
fn random_vectors(count: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
|
|
(0..count)
|
|
.map(|i| random_vector(dim, seed.wrapping_add(i as u64)))
|
|
.collect()
|
|
}
|
|
|
|
fn bench_all_attention_mechanisms(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("attention_mechanisms");
|
|
|
|
// Test parameters
|
|
let dim = 64;
|
|
let num_heads = 8;
|
|
let seq_len = 100; // Target: 100 tokens
|
|
|
|
// Generate test data
|
|
let queries = random_vectors(seq_len, dim, 42);
|
|
let keys = random_vectors(seq_len, dim, 123);
|
|
let values = random_vectors(seq_len, dim, 456);
|
|
|
|
// Set throughput for tokens/second calculation
|
|
group.throughput(Throughput::Elements(seq_len as u64));
|
|
|
|
// ========================================================================
|
|
// Multi-Head Attention Benchmark
|
|
// ========================================================================
|
|
|
|
group.bench_function("multi_head_attention", |b| {
|
|
// TODO: When implemented:
|
|
// let attention = MultiHeadAttention::new(dim, num_heads);
|
|
|
|
b.iter(|| {
|
|
// TODO: Replace with actual attention computation
|
|
// attention.forward(&queries, &keys, &values)
|
|
|
|
// Placeholder: simulate attention computation
|
|
let mut output = vec![0.0f32; dim];
|
|
for q in &queries {
|
|
for (k, v) in keys.iter().zip(values.iter()) {
|
|
let score: f32 = q.iter().zip(k.iter()).map(|(a, b)| a * b).sum();
|
|
for (o, vi) in output.iter_mut().zip(v.iter()) {
|
|
*o += score * vi * 0.001;
|
|
}
|
|
}
|
|
}
|
|
output
|
|
});
|
|
});
|
|
|
|
// ========================================================================
|
|
// Mamba SSM Benchmark
|
|
// ========================================================================
|
|
|
|
group.bench_function("mamba_ssm", |b| {
|
|
// TODO: When implemented:
|
|
// let mamba = MambaSSM::new(dim);
|
|
|
|
b.iter(|| {
|
|
// TODO: Replace with actual Mamba SSM computation
|
|
// mamba.forward(&queries)
|
|
|
|
// Placeholder: simulate O(n) selective scan
|
|
let mut hidden = vec![0.0f32; dim];
|
|
for input in &queries {
|
|
for (h, x) in hidden.iter_mut().zip(input.iter()) {
|
|
*h = *h * 0.9 + *x * 0.1;
|
|
}
|
|
}
|
|
hidden
|
|
});
|
|
});
|
|
|
|
// ========================================================================
|
|
// RWKV Attention Benchmark
|
|
// ========================================================================
|
|
|
|
group.bench_function("rwkv_attention", |b| {
|
|
// TODO: When implemented:
|
|
// let rwkv = RWKVAttention::new(dim);
|
|
|
|
b.iter(|| {
|
|
// TODO: Replace with actual RWKV computation
|
|
// rwkv.forward(&queries)
|
|
|
|
// Placeholder: simulate linear attention
|
|
let mut state = vec![0.0f32; dim];
|
|
for input in &queries {
|
|
for (s, x) in state.iter_mut().zip(input.iter()) {
|
|
*s = *s * 0.95 + *x;
|
|
}
|
|
}
|
|
state
|
|
});
|
|
});
|
|
|
|
// ========================================================================
|
|
// Flash Attention Approximation Benchmark
|
|
// ========================================================================
|
|
|
|
group.bench_function("flash_attention_approx", |b| {
|
|
// TODO: When implemented:
|
|
// let flash = FlashAttention::new(dim);
|
|
|
|
b.iter(|| {
|
|
// TODO: Replace with actual Flash Attention
|
|
// flash.forward(&queries, &keys, &values)
|
|
|
|
// Placeholder: simulate tiled computation
|
|
let tile_size = 16;
|
|
let mut output = vec![0.0f32; dim];
|
|
for tile_start in (0..seq_len).step_by(tile_size) {
|
|
let tile_end = (tile_start + tile_size).min(seq_len);
|
|
for i in tile_start..tile_end {
|
|
for j in 0..dim {
|
|
output[j] += queries[i][j] * 0.01;
|
|
}
|
|
}
|
|
}
|
|
output
|
|
});
|
|
});
|
|
|
|
// ========================================================================
|
|
// Hyperbolic Attention Benchmark
|
|
// ========================================================================
|
|
|
|
group.bench_function("hyperbolic_attention", |b| {
|
|
// TODO: When implemented:
|
|
// let hyp_attn = HyperbolicAttention::new(dim, -1.0);
|
|
|
|
b.iter(|| {
|
|
// TODO: Replace with actual hyperbolic attention
|
|
// hyp_attn.forward(&queries[0], &keys, &values)
|
|
|
|
// Placeholder: simulate Poincare operations
|
|
let query = &queries[0];
|
|
let mut output = vec![0.0f32; dim];
|
|
for (k, v) in keys.iter().zip(values.iter()) {
|
|
// Simplified Poincare distance
|
|
let dist: f32 = query.iter().zip(k.iter())
|
|
.map(|(a, b)| (a - b).powi(2))
|
|
.sum::<f32>()
|
|
.sqrt();
|
|
let weight = (-dist).exp();
|
|
for (o, vi) in output.iter_mut().zip(v.iter()) {
|
|
*o += weight * vi;
|
|
}
|
|
}
|
|
output
|
|
});
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn bench_attention_scaling(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("attention_scaling");
|
|
|
|
let dim = 64;
|
|
|
|
// Test different sequence lengths
|
|
for seq_len in [32, 64, 128, 256, 512].iter() {
|
|
let queries = random_vectors(*seq_len, dim, 42);
|
|
let keys = random_vectors(*seq_len, dim, 123);
|
|
let values = random_vectors(*seq_len, dim, 456);
|
|
|
|
group.throughput(Throughput::Elements(*seq_len as u64));
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("multi_head", seq_len),
|
|
&(&queries, &keys, &values),
|
|
|b, (q, k, v)| {
|
|
b.iter(|| {
|
|
// TODO: Replace with actual attention
|
|
let mut output = vec![0.0f32; dim];
|
|
for qi in q.iter() {
|
|
for (ki, vi) in k.iter().zip(v.iter()) {
|
|
let score: f32 = qi.iter().zip(ki.iter())
|
|
.map(|(a, b)| a * b).sum();
|
|
for (o, vij) in output.iter_mut().zip(vi.iter()) {
|
|
*o += score * vij * 0.001;
|
|
}
|
|
}
|
|
}
|
|
output
|
|
});
|
|
},
|
|
);
|
|
|
|
group.bench_with_input(
|
|
BenchmarkId::new("mamba_ssm", seq_len),
|
|
&(&queries,),
|
|
|b, (input,)| {
|
|
b.iter(|| {
|
|
// TODO: Replace with actual Mamba SSM
|
|
let mut hidden = vec![0.0f32; dim];
|
|
for inp in input.iter() {
|
|
for (h, x) in hidden.iter_mut().zip(inp.iter()) {
|
|
*h = *h * 0.9 + *x * 0.1;
|
|
}
|
|
}
|
|
hidden
|
|
});
|
|
},
|
|
);
|
|
}
|
|
|
|
group.finish();
|
|
}
|
|
|
|
fn bench_attention_memory(c: &mut Criterion) {
|
|
let mut group = c.benchmark_group("attention_memory");
|
|
|
|
// Test memory-efficient vs standard attention
|
|
let dim = 64;
|
|
let seq_len = 256;
|
|
|
|
let queries = random_vectors(seq_len, dim, 42);
|
|
let keys = random_vectors(seq_len, dim, 123);
|
|
let values = random_vectors(seq_len, dim, 456);
|
|
|
|
group.bench_function("standard_attention", |b| {
|
|
b.iter(|| {
|
|
// Full attention matrix: O(n^2) memory
|
|
let mut attn_matrix = vec![vec![0.0f32; seq_len]; seq_len];
|
|
for i in 0..seq_len {
|
|
for j in 0..seq_len {
|
|
attn_matrix[i][j] = queries[i].iter()
|
|
.zip(keys[j].iter())
|
|
.map(|(a, b)| a * b)
|
|
.sum();
|
|
}
|
|
}
|
|
attn_matrix
|
|
});
|
|
});
|
|
|
|
group.bench_function("memory_efficient_attention", |b| {
|
|
b.iter(|| {
|
|
// Compute attention row by row: O(n) memory
|
|
let mut output = vec![vec![0.0f32; dim]; seq_len];
|
|
for i in 0..seq_len {
|
|
let mut scores = vec![0.0f32; seq_len];
|
|
for j in 0..seq_len {
|
|
scores[j] = queries[i].iter()
|
|
.zip(keys[j].iter())
|
|
.map(|(a, b)| a * b)
|
|
.sum();
|
|
}
|
|
// Softmax
|
|
let max = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
|
|
let exp_sum: f32 = scores.iter().map(|s| (s - max).exp()).sum();
|
|
for (j, score) in scores.iter().enumerate() {
|
|
let weight = (score - max).exp() / exp_sum;
|
|
for (k, v) in output[i].iter_mut().zip(values[j].iter()) {
|
|
*k += weight * v;
|
|
}
|
|
}
|
|
}
|
|
output
|
|
});
|
|
});
|
|
|
|
group.finish();
|
|
}
|
|
|
|
// Register every benchmark group with the Criterion harness and generate
// the `main` entry point that `cargo bench` invokes.
criterion_group!(
    benches,
    bench_all_attention_mechanisms,
    bench_attention_scaling,
    bench_attention_memory
);

criterion_main!(benches);