//! Comprehensive benchmarks for cognitum-gate-kernel
//!
//! Target latencies:
//! - Single edge insert: < 100ns
//! - Batch 1000 edges: < 100us
//! - Single tick: < 500us
//! - Tick under 10K edges: < 5ms
//! - TileReport serialization: < 1us
//! - E-value update: < 50ns
//! - Mixture e-value (SIMD): < 500ns for 16 hypotheses

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

use cognitum_gate_kernel::{
    delta::{Delta, Observation},
    evidence::{
        f32_to_log_e, EvidenceAccumulator, HypothesisState, LogEValue, LOG_LR_CONNECTIVITY_POS,
    },
    report::TileReport,
    shard::{CompactGraph, MAX_SHARD_VERTICES},
    TileState, MAX_DELTA_BUFFER,
};

// ============================================================================
// Edge Operations Benchmarks
// ============================================================================

/// Benchmark single edge insertion
fn bench_edge_insert(c: &mut Criterion) {
    let mut group = c.benchmark_group("edge_operations");
    group.throughput(Throughput::Elements(1));

    // Benchmark on empty graph
    group.bench_function("insert_single_empty", |b| {
        b.iter_batched(
            CompactGraph::new,
            |mut graph| {
                black_box(graph.add_edge(0, 1, 100));
                graph
            },
            criterion::BatchSize::SmallInput,
        )
    });

    // Benchmark on partially filled graph
    group.bench_function("insert_single_partial", |b| {
        b.iter_batched(
            || {
                let mut graph = CompactGraph::new();
                for i in 0..100u16 {
                    graph.add_edge(i, i + 1, 100);
                }
                graph
            },
            |mut graph| {
                black_box(graph.add_edge(200, 201, 100));
                graph
            },
            criterion::BatchSize::SmallInput,
        )
    });

    // Benchmark edge removal
    group.bench_function("remove_single", |b| {
        b.iter_batched(
            || {
                let mut graph = CompactGraph::new();
                graph.add_edge(0, 1, 100);
                graph.add_edge(1, 2, 100);
                graph.add_edge(2, 3, 100);
                graph
            },
            |mut graph| {
                black_box(graph.remove_edge(1, 2));
                graph
            },
            criterion::BatchSize::SmallInput,
        )
    });

    // Benchmark edge lookup
    group.bench_function("find_edge", |b| {
        let mut graph = CompactGraph::new();
        for i in 0..200u16 {
            graph.add_edge(i, i + 1, 100);
        }
        b.iter(|| black_box(graph.find_edge(100, 101)))
    });

    // Benchmark weight update
    group.bench_function("update_weight", |b| {
        let mut graph = CompactGraph::new();
        for i in 0..100u16 {
            graph.add_edge(i, i + 1, 100);
        }
        b.iter(|| {
            black_box(graph.update_weight(50, 51, 200));
        })
    });

    group.finish();
}

/// Benchmark batch edge insertion (1000 edges)
fn bench_edge_batch(c: &mut Criterion) {
    let mut group = c.benchmark_group("edge_batch");

    for batch_size in [100, 500, 1000] {
        group.throughput(Throughput::Elements(batch_size as u64));

        group.bench_with_input(
            BenchmarkId::new("insert_batch", batch_size),
            &batch_size,
            |b, &size| {
                b.iter_batched(
                    CompactGraph::new,
                    |mut graph| {
                        for i in 0..size as u16 {
                            // Use modular arithmetic to create varied edges within bounds
                            let src = i % 200;
                            let dst = (i % 200) + 1;
                            graph.add_edge(src, dst, 100);
                        }
                        black_box(graph)
                    },
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }

    // Benchmark batch with recompute_components
    group.bench_function("batch_1000_with_components", |b| {
        b.iter_batched(
            CompactGraph::new,
            |mut graph| {
                for i in 0..500u16 {
                    let src = i % 200;
                    let dst = (i % 200) + 1;
                    graph.add_edge(src, dst, 100);
                }
                graph.recompute_components();
                black_box(graph)
            },
            criterion::BatchSize::SmallInput,
        )
    });

    group.finish();
}

// ============================================================================
// Tick Cycle Benchmarks
// ============================================================================

/// Benchmark single tick cycle
fn bench_tick(c: &mut Criterion) {
    let mut group = c.benchmark_group("tick_cycle");
    group.throughput(Throughput::Elements(1));

    // Empty tick (no deltas)
    group.bench_function("tick_empty", |b| {
        let mut tile = TileState::new(0);
        b.iter(|| black_box(tile.tick(black_box(1))))
    });

    // Tick with small graph
    group.bench_function("tick_small_graph", |b| {
        let mut tile = TileState::new(0);
        // Add some edges
        for i in 0..10u16 {
            tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
        }
        tile.tick(0); // Initial tick to process deltas

        b.iter(|| black_box(tile.tick(black_box(1))))
    });

    // Tick with pending deltas
    group.bench_function("tick_with_deltas", |b| {
        b.iter_batched(
            || {
                let mut tile = TileState::new(0);
                for i in 0..10u16 {
                    tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
                }
                tile
            },
            |mut tile| black_box(tile.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });

    // Tick with observations
    group.bench_function("tick_with_observations", |b| {
        b.iter_batched(
            || {
                let mut tile = TileState::new(0);
                tile.evidence.add_connectivity_hypothesis(5);
                for _ in 0..5 {
                    let obs = Observation::connectivity(5, true);
                    tile.ingest_delta(&Delta::observation(obs));
                }
                tile
            },
            |mut tile| black_box(tile.tick(1)),
            criterion::BatchSize::SmallInput,
        )
    });

    group.finish();
}

/// Benchmark tick under heavy load (10K edges simulated via max graph)
fn bench_tick_under_load(c: &mut Criterion) {
    let mut group = c.benchmark_group("tick_under_load");
    group.sample_size(50); // Reduce sample size for expensive benchmarks

    // Create a densely connected graph (approaching limits)
    for edge_count in [500, 800, 1000] {
        group.throughput(Throughput::Elements(edge_count as u64));

        group.bench_with_input(
            BenchmarkId::new("edges", edge_count),
            &edge_count,
            |b, &count| {
                b.iter_batched(
                    || {
                        let mut tile = TileState::new(0);
                        // Create a connected graph
                        for i in 0..count.min(1000) as u16 {
                            let src = i % 250;
                            let dst = (i + 1) % 250;
                            if src != dst {
                                tile.ingest_delta(&Delta::edge_add(src, dst, 100));
                            }
                        }
                        tile.tick(0); // Process initial deltas

                        // Add some pending work
                        tile.ingest_delta(&Delta::edge_add(0, 100, 150));
                        tile.ingest_delta(&Delta::observation(Observation::connectivity(0, true)));
                        tile
                    },
                    |mut tile| black_box(tile.tick(1)),
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }

    // Benchmark connected components recomputation at scale
    group.bench_function("recompute_components_800", |b| {
        b.iter_batched(
            || {
                let mut graph = CompactGraph::new();
                // Create 4 disconnected clusters of 50 nodes each
                for cluster in 0..4u16 {
                    let base = cluster * 60;
                    for i in 0..50u16 {
                        graph.add_edge(base + i, base + (i + 1) % 50, 100);
                    }
                }
                graph
            },
            |mut graph| {
                black_box(graph.recompute_components());
                graph
            },
            criterion::BatchSize::SmallInput,
        )
    });

    group.finish();
}

// ============================================================================
// Report Serialization Benchmarks
// ============================================================================

/// Benchmark `TileReport` serialization-related operations: a raw 64-byte
/// copy of a report, report construction, and three accessor calls.
fn bench_report_serialize(c: &mut Criterion) {
    /// Number of bytes copied by the `raw_copy_64_bytes` baseline.
    const RAW_COPY_LEN: usize = 64;

    let mut group = c.benchmark_group("report_serialization");
    group.throughput(Throughput::Elements(1));

    // Produces a report by ticking tile 42 after ingesting a 20-edge path.
    let create_report = || {
        let mut tile = TileState::new(42);
        for i in 0..20u16 {
            tile.ingest_delta(&Delta::edge_add(i, i + 1, 100));
        }
        tile.tick(1)
    };

    // Shared by the accessor benchmarks below.
    let report = create_report();

    // Raw memory copy (baseline)
    group.bench_function("raw_copy_64_bytes", |b| {
        let report = create_report();

        // The copy below reads RAW_COPY_LEN bytes starting at the report's
        // address; refuse to run if the type is smaller than that instead of
        // reading past the end of the value.
        assert!(
            core::mem::size_of::<TileReport>() >= RAW_COPY_LEN,
            "TileReport ({} bytes) is smaller than the {}-byte raw copy",
            core::mem::size_of::<TileReport>(),
            RAW_COPY_LEN
        );

        b.iter(|| {
            let mut buffer = [0u8; RAW_COPY_LEN];
            // Hide the source from the optimizer so the copy is not folded away.
            let src = (black_box(&report) as *const TileReport).cast::<u8>();
            // SAFETY: `src` points to a live `TileReport` that is at least
            // RAW_COPY_LEN bytes long (asserted above); `buffer` is a separate
            // local array of exactly RAW_COPY_LEN bytes, so the two regions
            // cannot overlap.
            // NOTE(review): this reads the report's raw bytes, which is only
            // well-defined if `TileReport` has no padding — confirm its layout.
            unsafe {
                core::ptr::copy_nonoverlapping(src, buffer.as_mut_ptr(), RAW_COPY_LEN);
            }
            black_box(buffer)
        })
    });

    // Report creation from scratch
    group.bench_function("create_new", |b| {
        b.iter(|| black_box(TileReport::new(black_box(42))))
    });

    // Report field access patterns
    group.bench_function("access_witness", |b| {
        b.iter(|| black_box(report.get_witness()))
    });

    group.bench_function("access_connected", |b| {
        b.iter(|| black_box(report.is_connected()))
    });

    group.bench_function("e_value_approx", |b| {
        b.iter(|| black_box(report.e_value_approx()))
    });

    group.finish();
}

// ============================================================================
// E-Value Computation Benchmarks
// ============================================================================

/// Benchmarks e-value bookkeeping: single-hypothesis updates (from an `f32`
/// and from a precomputed log likelihood ratio), the `f32_to_log_e`
/// conversion, and observation processing in an `EvidenceAccumulator` holding
/// 1 to 16 connectivity hypotheses.
fn bench_evalue_update(c: &mut Criterion) {
    let mut group = c.benchmark_group("evalue_update");
    group.throughput(Throughput::Elements(1));

    // Both hypothesis benchmarks start from the same kind of fresh state.
    let new_hypothesis = || HypothesisState::new(0, HypothesisState::TYPE_CONNECTIVITY);

    // Update driven by a raw f32 value.
    group.bench_function("hypothesis_update_f32", |b| {
        let mut state = new_hypothesis();
        b.iter(|| {
            let outcome = state.update(black_box(1.5));
            black_box(outcome)
        })
    });

    // Update driven by the precomputed LOG_LR_CONNECTIVITY_POS constant.
    group.bench_function("hypothesis_update_log_lr", |b| {
        let mut state = new_hypothesis();
        b.iter(|| {
            let outcome = state.update_with_log_lr(black_box(LOG_LR_CONNECTIVITY_POS));
            black_box(outcome)
        })
    });

    // Conversion of 1.5 to the log representation.
    group.bench_function("f32_to_log_e", |b| {
        b.iter(|| {
            let converted = f32_to_log_e(black_box(1.5));
            black_box(converted)
        })
    });

    // Conversion of 2.0 to the log representation (labelled as the fast path).
    group.bench_function("f32_to_log_e_fast_path", |b| {
        b.iter(|| {
            let converted = f32_to_log_e(black_box(2.0));
            black_box(converted)
        })
    });

    // One hypothesis on vertex 5 and a matching positive observation.
    group.bench_function("accumulator_process_obs", |b| {
        let sample = Observation::connectivity(5, true);
        let mut accumulator = EvidenceAccumulator::new();
        accumulator.add_connectivity_hypothesis(5);

        b.iter(|| {
            accumulator.process_observation(black_box(sample), black_box(1));
        })
    });

    // Same observation processing with hypotheses registered on vertices 0..n;
    // the observation always refers to vertex 0.
    for hypotheses in [1, 4, 8, 16] {
        group.bench_with_input(
            BenchmarkId::new("process_obs_hypotheses", hypotheses),
            &hypotheses,
            |b, &n| {
                let mut accumulator = EvidenceAccumulator::new();
                (0..n as u16).for_each(|vertex| {
                    accumulator.add_connectivity_hypothesis(vertex);
                });
                let sample = Observation::connectivity(0, true);

                b.iter(|| {
                    accumulator.process_observation(black_box(sample), black_box(1));
                })
            },
        );
    }

    group.finish();
}

/// Benchmark mixture e-value computation (potential SIMD opportunity).
///
/// Compares several ways of summing log e-values: a plain scalar sum, a
/// four-lane accumulation, a chunked sum, a 255-element sum widened to `i64`,
/// and a 16-element sum rescaled to `f64`.
///
/// Every input is passed through `black_box` inside the measured closure.
/// The inputs are compile-time constants, so without that the optimizer is
/// free to pre-compute the sums and the benchmarks would measure nothing.
fn bench_mixture_evalue(c: &mut Criterion) {
    /// Input shared by all 16-value benchmarks, so they sum identical data.
    const LOG_E_VALUES_16: [LogEValue; 16] = [
        65536, 38550, -65536, 65536, 38550, 65536, 38550, -32768, 65536, 65536, 38550, -65536,
        65536, 38550, 65536, 38550,
    ];

    let mut group = c.benchmark_group("mixture_evalue");

    // Scalar baseline
    group.bench_function("aggregate_16_scalar", |b| {
        let log_e_values = LOG_E_VALUES_16;

        b.iter(|| {
            let sum: LogEValue = black_box(&log_e_values).iter().copied().sum();
            black_box(sum)
        })
    });

    // Parallel lanes pattern (SIMD-friendly)
    group.bench_function("aggregate_16_parallel_lanes", |b| {
        let log_e_values = LOG_E_VALUES_16;

        b.iter(|| {
            // Accumulate into 4 independent lanes (value i goes to lane i % 4),
            // then reduce the lanes to one total.
            let mut lanes = [0i32; 4];
            for (i, &val) in black_box(&log_e_values).iter().enumerate() {
                lanes[i % 4] = lanes[i % 4].saturating_add(val);
            }
            let sum = lanes.iter().sum::<i32>();
            black_box(sum)
        })
    });

    // Chunked processing (auto-vectorization friendly)
    group.bench_function("aggregate_16_chunked", |b| {
        let log_e_values = LOG_E_VALUES_16;

        b.iter(|| {
            let mut total = 0i32;
            for chunk in black_box(&log_e_values).chunks(4) {
                let chunk_sum: i32 = chunk.iter().copied().sum();
                total = total.saturating_add(chunk_sum);
            }
            black_box(total)
        })
    });

    // Scale to 255 values; they cycle through -65536, 0 and +65536.
    group.bench_function("aggregate_255_tiles", |b| {
        let log_e_values: Vec<LogEValue> = (0..255)
            .map(|i| (i as i32 % 3 - 1) * 65536) // Varying positive/negative evidence
            .collect();

        b.iter(|| {
            // Widen to i64 before summing so the total cannot overflow.
            let sum: i64 = black_box(log_e_values.as_slice())
                .iter()
                .map(|&v| v as i64)
                .sum();
            black_box(sum)
        })
    });

    // Product of e-values expressed in the log domain.
    group.bench_function("mixture_product_16", |b| {
        let log_e_values = LOG_E_VALUES_16;

        b.iter(|| {
            // A product of e-values is the sum of their logs.
            let log_sum: i64 = black_box(&log_e_values).iter().map(|&v| v as i64).sum();
            // Rescale the summed log by 1/65536. No exponentiation is applied,
            // so the result stays in the log domain.
            // NOTE(review): 65536 is presumably the fixed-point unit of LogEValue — confirm.
            let approx_result = (log_sum as f64) / 65536.0;
            black_box(approx_result)
        })
    });

    group.finish();
}

// ============================================================================
// Additional Performance Benchmarks
// ============================================================================

/// Benchmarks `TileState::ingest_delta`: a single ingest into a freshly reset
/// tile, and `MAX_DELTA_BUFFER` consecutive ingests into a new tile.
fn bench_delta_ingestion(c: &mut Criterion) {
    let mut group = c.benchmark_group("delta_ingestion");
    group.throughput(Throughput::Elements(1));

    // The tile is reset at the start of every iteration, so the measured time
    // covers `reset` as well as the ingest itself.
    group.bench_function("ingest_single", |b| {
        let edge_delta = Delta::edge_add(0, 1, 100);
        let mut tile = TileState::new(0);

        b.iter(|| {
            tile.reset();
            let accepted = tile.ingest_delta(&edge_delta);
            black_box(accepted)
        })
    });

    // Ingest MAX_DELTA_BUFFER edge deltas into a tile created by the setup closure.
    group.bench_function("fill_buffer_64", |b| {
        let delta_count = MAX_DELTA_BUFFER as u16;

        b.iter_batched(
            || TileState::new(0),
            |mut empty_tile| {
                (0..delta_count).for_each(|v| {
                    empty_tile.ingest_delta(&Delta::edge_add(v, v + 1, 100));
                });
                black_box(empty_tile)
            },
            criterion::BatchSize::SmallInput,
        )
    });

    group.finish();
}

/// Benchmarks `CompactGraph::neighbors` on a graph that contains one hub
/// vertex and one chain: a single hub query, a single chain query, and a
/// sweep over vertices 0..50.
fn bench_neighbor_iteration(c: &mut Criterion) {
    let mut group = c.benchmark_group("neighbor_iteration");

    // One graph shared by all three benchmarks.
    let mut graph = CompactGraph::new();
    // Hub: vertex 0 is joined to each of the vertices 1..=24.
    (1..25u16).for_each(|spoke| {
        graph.add_edge(0, spoke, 100);
    });
    // Chain: consecutive vertices from 30 up to 50 are joined.
    (30..50u16).for_each(|link| {
        graph.add_edge(link, link + 1, 100);
    });

    // Query the hub vertex.
    group.bench_function("neighbors_hub_24", |b| {
        b.iter(|| black_box(graph.neighbors(0).len()))
    });

    // Query a vertex from the interior of the chain.
    group.bench_function("neighbors_chain_2", |b| {
        b.iter(|| black_box(graph.neighbors(35).len()))
    });

    // Sum the neighbor counts of vertices 0..50.
    group.bench_function("iterate_all_neighbors", |b| {
        b.iter(|| {
            let total: usize = (0..50u16).map(|v| graph.neighbors(v).len()).sum();
            black_box(total)
        })
    });

    group.finish();
}

// ============================================================================
// Memory and Cache Benchmarks
// ============================================================================

/// Benchmarks direct reads of `CompactGraph` storage: an in-order pass over
/// the first 256 vertex slots, a strided pass over 100 vertex slots, and a
/// full pass over the edge storage.
fn bench_memory_patterns(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_patterns");

    // Count active vertices among slots 0..256, visited in index order.
    group.bench_function("sequential_vertex_scan", |b| {
        let mut graph = CompactGraph::new();
        (0..200u16).for_each(|v| {
            graph.add_edge(v, v + 1, 100);
        });

        b.iter(|| {
            let active = (0..256usize)
                .filter(|&slot| graph.vertices[slot].is_active())
                .count() as u16;
            black_box(active)
        })
    });

    // Sum vertex degrees (wrapping in u8) at 100 slots visited in a
    // non-sequential order.
    group.bench_function("random_vertex_access", |b| {
        let mut graph = CompactGraph::new();
        (0..200u16).for_each(|v| {
            graph.add_edge(v, v + 1, 100);
        });

        // Deterministic scatter: slot k is (k * 37) mod 256.
        let slots: Vec<u16> = (0..100u16).map(|k| (k * 37) % 256).collect();

        b.iter(|| {
            let degree_sum = slots.iter().fold(0u8, |acc, &slot| {
                acc.wrapping_add(graph.vertices[slot as usize].degree)
            });
            black_box(degree_sum)
        })
    });

    // Count active entries across the whole edge storage.
    group.bench_function("edge_array_scan", |b| {
        let mut graph = CompactGraph::new();
        (0..500u16).for_each(|i| {
            let src = i % 200;
            let dst = src + 1;
            if src != dst {
                graph.add_edge(src, dst, 100);
            }
        });

        b.iter(|| {
            let active = graph.edges.iter().filter(|edge| edge.is_active()).count() as u16;
            black_box(active)
        })
    });

    group.finish();
}

// ============================================================================
// Criterion Groups
// ============================================================================

// Single-edge operations and batch edge insertion.
criterion_group!(edge_benches, bench_edge_insert, bench_edge_batch,);

// Tick cycle on small tiles and on tiles built from many edge deltas.
criterion_group!(tick_benches, bench_tick, bench_tick_under_load,);

// E-value updates and log e-value aggregation.
criterion_group!(evidence_benches, bench_evalue_update, bench_mixture_evalue,);

// Report handling, delta ingestion, neighbor queries and storage scans.
criterion_group!(
    misc_benches,
    bench_report_serialize,
    bench_delta_ingestion,
    bench_neighbor_iteration,
    bench_memory_patterns,
);

// Benchmark binary entry point: runs every group declared above.
criterion_main!(edge_benches, tick_benches, evidence_benches, misc_benches);