//! Benchmark suite for j-Tree + BMSSP optimizations
//!
//! Measures before/after performance for each optimization:
//! - DSpar: 5.9x target
//! - Cache: 10x target
//! - SIMD: 2-4x target
//! - Pool: 50-75% memory reduction
//! - Parallel: Near-linear scaling
//! - WASM Batch: 10x FFI reduction
//!
//! Target: Combined 10x speedup

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use ruvector_mincut::graph::DynamicGraph;
use ruvector_mincut::optimization::{
    BatchConfig, BenchmarkSuite, CacheConfig, DegreePresparse, DistanceArray, LevelData, LevelPool,
    LevelUpdateResult, ParallelConfig, ParallelLevelUpdater, PathDistanceCache, PoolConfig,
    PresparseConfig, SimdDistanceOps, WasmBatchOps,
};
use std::collections::HashSet;

/// Create test graph with specified size
fn create_test_graph(vertices: usize, edges: usize) -> DynamicGraph {
    let graph = DynamicGraph::new();

    for i in 0..vertices {
        graph.add_vertex(i as u64);
    }

    let mut edge_count = 0;
    for i in 0..vertices {
        for j in (i + 1)..vertices {
            if edge_count >= edges {
                break;
            }
            let _ = graph.insert_edge(i as u64, j as u64, 1.0);
            edge_count += 1;
        }
        if edge_count >= edges {
            break;
        }
    }

    graph
}

/// Benchmark DSpar sparsification
fn bench_dspar(c: &mut Criterion) {
    let mut group = c.benchmark_group("DSpar");

    for size in [100, 1000, 5000].iter() {
        let graph = create_test_graph(*size, size * 5);

        group.bench_with_input(BenchmarkId::new("baseline", size), size, |b, _| {
            b.iter(|| {
                let edges: Vec<_> = graph.edges().collect();
                black_box(edges.len())
            })
        });

        let mut dspar = DegreePresparse::with_config(PresparseConfig {
            target_sparsity: 0.1,
            ..Default::default()
        });

        group.bench_with_input(BenchmarkId::new("optimized", size), size, |b, _| {
            b.iter(|| {
                let result = dspar.presparse(&graph);
                black_box(result.edges.len())
            })
        });
    }

    group.finish();
}

/// Benchmark path distance cache
fn bench_cache(c: &mut Criterion) {
    let mut group = c.benchmark_group("PathCache");

    for size in [100, 1000, 5000].iter() {
        group.bench_with_input(
            BenchmarkId::new("baseline_no_cache", size),
            size,
            |b, &size| {
                b.iter(|| {
                    let mut total = 0.0;
                    for i in 0..size {
                        total += (i as f64 * 1.414).sqrt();
                    }
                    black_box(total)
                })
            },
        );

        let cache = PathDistanceCache::with_config(CacheConfig {
            max_entries: *size,
            ..Default::default()
        });

        // Pre-populate cache
        for i in 0..*size {
            cache.insert(i as u64, (i + 1) as u64, (i as f64).sqrt());
        }

        group.bench_with_input(
            BenchmarkId::new("optimized_with_cache", size),
            size,
            |b, &size| {
                b.iter(|| {
                    let mut total = 0.0;
                    for i in 0..size {
                        if let Some(d) = cache.get(i as u64, (i + 1) as u64) {
                            total += d;
                        }
                    }
                    black_box(total)
                })
            },
        );
    }

    group.finish();
}

/// Benchmark SIMD distance operations
fn bench_simd(c: &mut Criterion) {
    let mut group = c.benchmark_group("SIMD");

    for size in [100, 1000, 10000].iter() {
        let mut arr = DistanceArray::new(*size);
        for i in 0..*size {
            arr.set(i as u64, (i as f64) * 0.5 + 1.0);
        }
        arr.set((*size / 2) as u64, 0.1);

        group.bench_with_input(BenchmarkId::new("find_min_naive", size), &arr, |b, arr| {
            b.iter(|| {
                let data = arr.as_slice();
                let mut min_val = f64::INFINITY;
                let mut min_idx = 0;
                for (i, &d) in data.iter().enumerate() {
                    if d < min_val {
                        min_val = d;
                        min_idx = i;
                    }
                }
                black_box((min_val, min_idx))
            })
        });

        group.bench_with_input(BenchmarkId::new("find_min_simd", size), &arr, |b, arr| {
            b.iter(|| black_box(SimdDistanceOps::find_min(arr)))
        });

        let neighbors: Vec<_> = (0..(size / 10).min(100))
            .map(|i| ((i * 10) as u64, 1.0))
            .collect();

        group.bench_with_input(
            BenchmarkId::new("relax_batch_naive", size),
            size,
            |b, &size| {
                let mut arr = DistanceArray::new(size);
                b.iter(|| {
                    let data = arr.as_mut_slice();
                    for &(idx, weight) in &neighbors {
                        let idx = idx as usize;
                        if idx < data.len() {
                            let new_dist = 0.0 + weight;
                            if new_dist < data[idx] {
                                data[idx] = new_dist;
                            }
                        }
                    }
                    black_box(())
                })
            },
        );

        group.bench_with_input(
            BenchmarkId::new("relax_batch_simd", size),
            size,
            |b, &size| {
                let mut arr = DistanceArray::new(size);
                b.iter(|| black_box(SimdDistanceOps::relax_batch(&mut arr, 0.0, &neighbors)))
            },
        );
    }

    group.finish();
}

/// Benchmark pool allocation
fn bench_pool(c: &mut Criterion) {
    let mut group = c.benchmark_group("Pool");

    for size in [100, 1000].iter() {
        group.bench_with_input(
            BenchmarkId::new("baseline_alloc_dealloc", size),
            size,
            |b, &size| {
                b.iter(|| {
                    let mut levels = Vec::new();
                    for i in 0..10 {
                        levels.push(LevelData::new(i, size));
                    }
                    black_box(levels.len())
                })
            },
        );

        let pool = LevelPool::with_config(PoolConfig {
            max_materialized_levels: 5,
            lazy_dealloc: true,
            ..Default::default()
        });

        group.bench_with_input(
            BenchmarkId::new("optimized_pool", size),
            size,
            |b, &size| {
                b.iter(|| {
                    for i in 0..10 {
                        let level = pool.allocate_level(i, size);
                        pool.materialize(i, level);
                    }
                    black_box(pool.stats().materialized_levels)
                })
            },
        );
    }

    group.finish();
}

/// Benchmark parallel processing
fn bench_parallel(c: &mut Criterion) {
    let mut group = c.benchmark_group("Parallel");

    let levels: Vec<usize> = (0..100).collect();

    for work_size in [10, 100, 1000].iter() {
        group.bench_with_input(
            BenchmarkId::new("sequential", work_size),
            work_size,
            |b, &work_size| {
                b.iter(|| {
                    let _results: Vec<_> = levels
                        .iter()
                        .map(|&level| {
                            let mut sum = 0.0;
                            for i in 0..work_size {
                                sum += (i as f64).sqrt();
                            }
                            LevelUpdateResult {
                                level,
                                cut_value: sum,
                                partition: HashSet::new(),
                                time_us: 0,
                            }
                        })
                        .collect();
                    black_box(())
                })
            },
        );

        let updater = ParallelLevelUpdater::with_config(ParallelConfig {
            min_parallel_size: 10,
            ..Default::default()
        });

        group.bench_with_input(
            BenchmarkId::new("parallel_rayon", work_size),
            work_size,
            |b, &work_size| {
                b.iter(|| {
                    let _results = updater.process_parallel(&levels, |level| {
                        let mut sum = 0.0;
                        for i in 0..work_size {
                            sum += (i as f64).sqrt();
                        }
                        LevelUpdateResult {
                            level,
                            cut_value: sum,
                            partition: HashSet::new(),
                            time_us: 0,
                        }
                    });
                    black_box(())
                })
            },
        );
    }

    group.finish();
}

/// Benchmark WASM batch operations
fn bench_wasm_batch(c: &mut Criterion) {
    let mut group = c.benchmark_group("WASM_Batch");

    for size in [100, 1000, 5000].iter() {
        let edges: Vec<_> = (0..*size)
            .map(|i| (i as u64, (i + 1) as u64, 1.0))
            .collect();

        group.bench_with_input(
            BenchmarkId::new("individual_ops", size),
            &edges,
            |b, edges| {
                b.iter(|| {
                    for edge in edges {
                        black_box(edge);
                    }
                })
            },
        );

        let mut batch = WasmBatchOps::with_config(BatchConfig {
            max_batch_size: 1024,
            ..Default::default()
        });

        group.bench_with_input(BenchmarkId::new("batched_ops", size), &edges, |b, edges| {
            b.iter(|| {
                batch.queue_insert_edges(edges.clone());
                let results = batch.execute_batch();
                black_box(results.len())
            })
        });
    }

    group.finish();
}

/// Run complete benchmark suite
fn bench_complete_suite(c: &mut Criterion) {
    let mut group = c.benchmark_group("Complete_Suite");

    group.bench_function("full_optimization_suite", |b| {
        b.iter(|| {
            let mut suite = BenchmarkSuite::new()
                .with_sizes(vec![100])
                .with_iterations(1);

            let results = suite.run_all();
            let combined = suite.combined_speedup();
            black_box((results.len(), combined))
        })
    });

    group.finish();
}

criterion_group!(
    benches,
    bench_dspar,
    bench_cache,
    bench_simd,
    bench_pool,
    bench_parallel,
    bench_wasm_batch,
    bench_complete_suite,
);

criterion_main!(benches);