Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions
--- a/crates/ruvector-attention/benches/attention_bench.rs
+++ b/crates/ruvector-attention/benches/attention_bench.rs
@@ -0,0 +1,329 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use ruvector_attention::{
+    attention::ScaledDotProductAttention,
+    graph::{
+        DualSpaceAttention, DualSpaceConfig, EdgeFeaturedAttention, EdgeFeaturedConfig, GraphRoPE,
+        RoPEConfig,
+    },
+    hyperbolic::{HyperbolicAttention, HyperbolicAttentionConfig},
+    moe::{MoEAttention, MoEConfig},
+    sparse::{FlashAttention, LinearAttention, LocalGlobalAttention},
+    training::{Adam, InfoNCELoss, Loss, Optimizer},
+    traits::Attention,
+};
+
+fn bench_scaled_dot_product(c: &mut Criterion) {
+    let mut group = c.benchmark_group("scaled_dot_product");
+
+    for dim in [64, 128, 256, 512] {
+        let attention = ScaledDotProductAttention::new(dim);
+
+        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
+            let query = vec![0.5; dim];
+            let keys: Vec<Vec<f32>> = (0..100)
+                .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+                .collect();
+            let values: Vec<Vec<f32>> = (0..100)
+                .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+                .collect();
+            let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+            let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+            b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_flash_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("flash_attention");
+
+    for seq_len in [64, 256, 512, 1024] {
+        let dim = 256;
+        let attention = FlashAttention::new(dim, 64);
+
+        group.bench_with_input(
+            BenchmarkId::new("seq_len", seq_len),
+            &seq_len,
+            |b, &seq_len| {
+                let query = vec![0.5; dim];
+                let keys: Vec<Vec<f32>> = (0..seq_len)
+                    .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+                    .collect();
+                let values: Vec<Vec<f32>> = (0..seq_len)
+                    .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+                    .collect();
+                let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+                let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+                b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_linear_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("linear_attention");
+
+    for seq_len in [256, 512, 1024, 2048] {
+        let dim = 256;
+        let attention = LinearAttention::new(dim, 64);
+
+        group.bench_with_input(
+            BenchmarkId::new("seq_len", seq_len),
+            &seq_len,
+            |b, &seq_len| {
+                let query = vec![0.5; dim];
+                let keys: Vec<Vec<f32>> = (0..seq_len)
+                    .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+                    .collect();
+                let values: Vec<Vec<f32>> = (0..seq_len)
+                    .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+                    .collect();
+                let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+                let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+                b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_local_global_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("local_global_attention");
+
+    for window_size in [16, 32, 64, 128] {
+        let dim = 256;
+        let attention = LocalGlobalAttention::new(dim, window_size, 4);
+
+        group.bench_with_input(
+            BenchmarkId::new("window", window_size),
+            &window_size,
+            |b, _| {
+                let query = vec![0.5; dim];
+                let keys: Vec<Vec<f32>> = (0..512)
+                    .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+                    .collect();
+                let values: Vec<Vec<f32>> = (0..512)
+                    .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+                    .collect();
+                let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+                let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+                b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_moe_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("moe_attention");
+
+    for num_experts in [2, 4, 8] {
+        let config = MoEConfig::builder()
+            .dim(256)
+            .num_experts(num_experts)
+            .top_k(2)
+            .build();
+        let attention = MoEAttention::new(config);
+
+        group.bench_with_input(
+            BenchmarkId::new("experts", num_experts),
+            &num_experts,
+            |b, _| {
+                let query = vec![0.5; 256];
+                let keys: Vec<Vec<f32>> = (0..100)
+                    .map(|i| vec![(i as f32 * 0.01) % 1.0; 256])
+                    .collect();
+                let values: Vec<Vec<f32>> = (0..100)
+                    .map(|i| vec![(i as f32 * 0.02) % 1.0; 256])
+                    .collect();
+                let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+                let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+                b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_hyperbolic_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("hyperbolic_attention");
+
+    for dim in [64, 128, 256] {
+        let config = HyperbolicAttentionConfig {
+            dim,
+            curvature: -1.0,
+            ..Default::default()
+        };
+        let attention = HyperbolicAttention::new(config);
+
+        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
+            let query = vec![0.1; dim];
+            let keys: Vec<Vec<f32>> = (0..100)
+                .map(|i| vec![(i as f32 * 0.001) % 0.5; dim])
+                .collect();
+            let values: Vec<Vec<f32>> = (0..100)
+                .map(|i| vec![(i as f32 * 0.002) % 0.5; dim])
+                .collect();
+            let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+            let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+            b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_edge_featured_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("edge_featured_attention");
+
+    for num_heads in [1, 2, 4, 8] {
+        let config = EdgeFeaturedConfig::builder()
+            .node_dim(256)
+            .edge_dim(32)
+            .num_heads(num_heads)
+            .build();
+        let attention = EdgeFeaturedAttention::new(config);
+
+        group.bench_with_input(BenchmarkId::new("heads", num_heads), &num_heads, |b, _| {
+            let query = vec![0.5; 256];
+            let keys: Vec<Vec<f32>> = (0..64)
+                .map(|i| vec![(i as f32 * 0.01) % 1.0; 256])
+                .collect();
+            let values: Vec<Vec<f32>> = (0..64)
+                .map(|i| vec![(i as f32 * 0.02) % 1.0; 256])
+                .collect();
+            let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+            let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+            b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_graph_rope(c: &mut Criterion) {
+    let mut group = c.benchmark_group("graph_rope");
+
+    for dim in [64, 128, 256] {
+        let config = RoPEConfig::builder().dim(dim).max_position(1024).build();
+        let attention = GraphRoPE::new(config);
+
+        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
+            let query = vec![0.5; dim];
+            let keys: Vec<Vec<f32>> = (0..256)
+                .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+                .collect();
+            let values: Vec<Vec<f32>> = (0..256)
+                .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+                .collect();
+            let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+            let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+            b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_dual_space_attention(c: &mut Criterion) {
+    let mut group = c.benchmark_group("dual_space_attention");
+
+    for dim in [64, 128, 256] {
+        let config = DualSpaceConfig::builder()
+            .dim(dim)
+            .euclidean_weight(0.5)
+            .hyperbolic_weight(0.5)
+            .build();
+        let attention = DualSpaceAttention::new(config);
+
+        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
+            let query = vec![0.1; dim];
+            let keys: Vec<Vec<f32>> = (0..100)
+                .map(|i| vec![(i as f32 * 0.001) % 0.3; dim])
+                .collect();
+            let values: Vec<Vec<f32>> = (0..100)
+                .map(|i| vec![(i as f32 * 0.002) % 0.3; dim])
+                .collect();
+            let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+            let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+            b.iter(|| black_box(attention.compute(&query, &keys_refs, &values_refs).unwrap()));
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_infonce_loss(c: &mut Criterion) {
+    let mut group = c.benchmark_group("infonce_loss");
+
+    for num_negatives in [10, 50, 100, 200] {
+        let loss = InfoNCELoss::new(0.07);
+
+        group.bench_with_input(
+            BenchmarkId::new("negatives", num_negatives),
+            &num_negatives,
+            |b, &num_neg| {
+                let anchor = vec![0.5; 128];
+                let positive = vec![0.6; 128];
+                let negatives: Vec<Vec<f32>> = (0..num_neg)
+                    .map(|i| vec![(i as f32 * 0.01) % 1.0; 128])
+                    .collect();
+                let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
+
+                b.iter(|| black_box(loss.compute(&anchor, &positive, &neg_refs)));
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_adam_optimizer(c: &mut Criterion) {
+    let mut group = c.benchmark_group("adam_optimizer");
+
+    for dim in [128, 256, 512, 1024] {
+        group.bench_with_input(BenchmarkId::new("dim", dim), &dim, |b, &dim| {
+            let mut optimizer = Adam::new(dim, 0.001);
+            let mut params = vec![0.5; dim];
+            let gradients = vec![0.01; dim];
+
+            b.iter(|| {
+                optimizer.step(&mut params, &gradients);
+                black_box(&params)
+            });
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_scaled_dot_product,
+    bench_flash_attention,
+    bench_linear_attention,
+    bench_local_global_attention,
+    bench_moe_attention,
+    bench_hyperbolic_attention,
+    bench_edge_featured_attention,
+    bench_graph_rope,
+    bench_dual_space_attention,
+    bench_infonce_loss,
+    bench_adam_optimizer,
+);
+criterion_main!(benches);
--- a/crates/ruvector-attention/benches/attention_benchmarks.rs
+++ b/crates/ruvector-attention/benches/attention_benchmarks.rs
@@ -0,0 +1,303 @@
+//! Benchmarks for ruvector-attention
+//!
+//! Run with: cargo bench -p ruvector-attention
+
+use std::time::Instant;
+
+use ruvector_attention::{
+    attention::ScaledDotProductAttention,
+    graph::{
+        DualSpaceAttention, DualSpaceConfig, EdgeFeaturedAttention, EdgeFeaturedConfig, GraphRoPE,
+        RoPEConfig,
+    },
+    hyperbolic::{HyperbolicAttention, HyperbolicAttentionConfig},
+    moe::{MoEAttention, MoEConfig},
+    sparse::{FlashAttention, LinearAttention, LocalGlobalAttention},
+    training::{Adam, InfoNCELoss, Loss, Optimizer},
+    traits::Attention,
+};
+
+fn main() {
+    println!("=== ruvector-attention Benchmarks ===\n");
+
+    // Configuration
+    let dim = 256;
+    let seq_len = 512;
+    let iterations = 100;
+
+    // Generate test data
+    let query = vec![0.5f32; dim];
+    let keys: Vec<Vec<f32>> = (0..seq_len)
+        .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+        .collect();
+    let values: Vec<Vec<f32>> = (0..seq_len)
+        .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+        .collect();
+    let keys_refs: Vec<&[f32]> = keys.iter().map(|k| k.as_slice()).collect();
+    let values_refs: Vec<&[f32]> = values.iter().map(|v| v.as_slice()).collect();
+
+    println!("Configuration:");
+    println!("  Dimension: {}", dim);
+    println!("  Sequence Length: {}", seq_len);
+    println!("  Iterations: {}", iterations);
+    println!();
+
+    // 1. Scaled Dot-Product Attention
+    {
+        let attention = ScaledDotProductAttention::new(dim);
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Scaled Dot-Product Attention:");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 2. Flash Attention
+    {
+        let attention = FlashAttention::new(dim, 64);
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Flash Attention (block_size=64):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 3. Linear Attention
+    {
+        let attention = LinearAttention::new(dim, 64);
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Linear Attention (num_features=64):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 4. Local-Global Attention
+    {
+        let attention = LocalGlobalAttention::new(dim, 32, 4);
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Local-Global Attention (window=32, global=4):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 5. MoE Attention
+    {
+        let config = MoEConfig::builder()
+            .dim(dim)
+            .num_experts(4)
+            .top_k(2)
+            .build();
+        let attention = MoEAttention::new(config);
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("MoE Attention (4 experts, top-2):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 6. Hyperbolic Attention
+    {
+        let config = HyperbolicAttentionConfig {
+            dim,
+            curvature: -1.0,
+            ..Default::default()
+        };
+        let attention = HyperbolicAttention::new(config);
+        // Use smaller values for Poincaré ball
+        let hyp_query = vec![0.1f32; dim];
+        let hyp_keys: Vec<Vec<f32>> = (0..seq_len)
+            .map(|i| vec![(i as f32 * 0.001) % 0.5; dim])
+            .collect();
+        let hyp_values: Vec<Vec<f32>> = (0..seq_len)
+            .map(|i| vec![(i as f32 * 0.002) % 0.5; dim])
+            .collect();
+        let hyp_keys_refs: Vec<&[f32]> = hyp_keys.iter().map(|k| k.as_slice()).collect();
+        let hyp_values_refs: Vec<&[f32]> = hyp_values.iter().map(|v| v.as_slice()).collect();
+
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention
+                .compute(&hyp_query, &hyp_keys_refs, &hyp_values_refs)
+                .unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Hyperbolic Attention (curvature=1.0):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 7. Edge-Featured Graph Attention
+    {
+        let config = EdgeFeaturedConfig::builder()
+            .node_dim(dim)
+            .edge_dim(32)
+            .num_heads(4)
+            .build();
+        let attention = EdgeFeaturedAttention::new(config);
+
+        let graph_keys: Vec<Vec<f32>> = (0..64)
+            .map(|i| vec![(i as f32 * 0.01) % 1.0; dim])
+            .collect();
+        let graph_values: Vec<Vec<f32>> = (0..64)
+            .map(|i| vec![(i as f32 * 0.02) % 1.0; dim])
+            .collect();
+        let graph_keys_refs: Vec<&[f32]> = graph_keys.iter().map(|k| k.as_slice()).collect();
+        let graph_values_refs: Vec<&[f32]> = graph_values.iter().map(|v| v.as_slice()).collect();
+
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention
+                .compute(&query, &graph_keys_refs, &graph_values_refs)
+                .unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Edge-Featured Graph Attention (4 heads):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 8. Graph RoPE
+    {
+        let config = RoPEConfig::builder().dim(dim).max_position(1024).build();
+        let attention = GraphRoPE::new(config);
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention.compute(&query, &keys_refs, &values_refs).unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Graph RoPE Attention:");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 9. Dual-Space Attention
+    {
+        let config = DualSpaceConfig::builder()
+            .dim(dim)
+            .euclidean_weight(0.5)
+            .hyperbolic_weight(0.5)
+            .build();
+        let attention = DualSpaceAttention::new(config);
+
+        // Use smaller values for hyperbolic component
+        let dual_query = vec![0.1f32; dim];
+        let dual_keys: Vec<Vec<f32>> = (0..seq_len)
+            .map(|i| vec![(i as f32 * 0.001) % 0.3; dim])
+            .collect();
+        let dual_values: Vec<Vec<f32>> = (0..seq_len)
+            .map(|i| vec![(i as f32 * 0.002) % 0.3; dim])
+            .collect();
+        let dual_keys_refs: Vec<&[f32]> = dual_keys.iter().map(|k| k.as_slice()).collect();
+        let dual_values_refs: Vec<&[f32]> = dual_values.iter().map(|v| v.as_slice()).collect();
+
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = attention
+                .compute(&dual_query, &dual_keys_refs, &dual_values_refs)
+                .unwrap();
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("Dual-Space Attention (Euclidean + Hyperbolic):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 10. Training: InfoNCE Loss
+    {
+        let loss = InfoNCELoss::new(0.07);
+        let anchor = vec![0.5f32; 128];
+        let positive = vec![0.6f32; 128];
+        let negatives: Vec<Vec<f32>> = (0..50)
+            .map(|i| vec![(i as f32 * 0.01) % 1.0; 128])
+            .collect();
+        let neg_refs: Vec<&[f32]> = negatives.iter().map(|n| n.as_slice()).collect();
+
+        let start = Instant::now();
+        for _ in 0..iterations {
+            let _ = loss.compute(&anchor, &positive, &neg_refs);
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / iterations as f64;
+        println!("InfoNCE Loss (50 negatives):");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    // 11. Training: Adam Optimizer
+    {
+        let mut optimizer = Adam::new(dim, 0.001);
+        let mut params = vec![0.5f32; dim];
+        let gradients = vec![0.01f32; dim];
+
+        let start = Instant::now();
+        for _ in 0..iterations * 10 {
+            optimizer.step(&mut params, &gradients);
+        }
+        let elapsed = start.elapsed();
+        let avg_us = elapsed.as_micros() as f64 / (iterations * 10) as f64;
+        println!("Adam Optimizer Step:");
+        println!("  Total: {:?}", elapsed);
+        println!("  Per iteration: {:.2} µs", avg_us);
+        println!("  Throughput: {:.0} ops/sec", 1_000_000.0 / avg_us);
+        println!();
+    }
+
+    println!("=== Benchmark Complete ===");
+
+    // Summary
+    println!("\n=== Summary ===");
+    println!("All attention mechanisms functional and benchmarked.");
+    println!("Module coverage:");
+    println!("  - Core: ScaledDotProductAttention, MultiHeadAttention");
+    println!("  - Sparse: FlashAttention, LinearAttention, LocalGlobalAttention");
+    println!("  - MoE: MoEAttention with learned routing");
+    println!("  - Graph: EdgeFeaturedAttention, GraphRoPE, DualSpaceAttention");
+    println!("  - Hyperbolic: HyperbolicAttention, MixedCurvatureAttention");
+    println!("  - Training: InfoNCE, ContrastiveLoss, Adam/AdamW/SGD, Curriculum");
+}