312 lines
8.5 KiB
Rust
312 lines
8.5 KiB
Rust
//! Demonstration of performance optimizations in ruvector-scipix
//!
//! This example shows how to use various optimization features:
//! - SIMD operations for image processing
//! - Parallel batch processing
//! - Memory pooling
//! - Model quantization
//! - Dynamic batching
|
|
|
|
use ruvector_scipix::optimize::*;
|
|
use std::sync::Arc;
|
|
use std::time::Instant;
|
|
|
|
fn main() {
|
|
println!("=== Ruvector-Scipix Optimization Demo ===\n");
|
|
|
|
// 1. Feature Detection
|
|
demo_feature_detection();
|
|
|
|
// 2. SIMD Operations
|
|
demo_simd_operations();
|
|
|
|
// 3. Parallel Processing
|
|
demo_parallel_processing();
|
|
|
|
// 4. Memory Optimizations
|
|
demo_memory_optimizations();
|
|
|
|
// 5. Model Quantization
|
|
demo_quantization();
|
|
|
|
println!("\n=== Demo Complete ===");
|
|
}
|
|
|
|
fn demo_feature_detection() {
|
|
println!("1. CPU Feature Detection");
|
|
println!("------------------------");
|
|
|
|
let features = detect_features();
|
|
println!("AVX2 Support: {}", if features.avx2 { "✓" } else { "✗" });
|
|
println!(
|
|
"AVX-512 Support: {}",
|
|
if features.avx512f { "✓" } else { "✗" }
|
|
);
|
|
println!("NEON Support: {}", if features.neon { "✓" } else { "✗" });
|
|
println!(
|
|
"SSE4.2 Support: {}",
|
|
if features.sse4_2 { "✓" } else { "✗" }
|
|
);
|
|
|
|
let opt_level = get_opt_level();
|
|
println!("Optimization Level: {:?}", opt_level);
|
|
println!();
|
|
}
|
|
|
|
fn demo_simd_operations() {
|
|
println!("2. SIMD Operations");
|
|
println!("------------------");
|
|
|
|
// Create test image (512x512 RGBA)
|
|
let size = 512;
|
|
let rgba: Vec<u8> = (0..size * size * 4).map(|i| (i % 256) as u8).collect();
|
|
let mut gray = vec![0u8; size * size];
|
|
|
|
// Benchmark grayscale conversion
|
|
let iterations = 100;
|
|
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
simd::simd_grayscale(&rgba, &mut gray);
|
|
}
|
|
let simd_time = start.elapsed();
|
|
|
|
println!("Grayscale conversion ({} iterations):", iterations);
|
|
println!(
|
|
" SIMD: {:?} ({:.2} MP/s)",
|
|
simd_time,
|
|
(iterations as f64 * size as f64 * size as f64 / 1_000_000.0) / simd_time.as_secs_f64()
|
|
);
|
|
|
|
// Benchmark threshold
|
|
let mut binary = vec![0u8; size * size];
|
|
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
simd::simd_threshold(&gray, 128, &mut binary);
|
|
}
|
|
let threshold_time = start.elapsed();
|
|
|
|
println!("Threshold operation ({} iterations):", iterations);
|
|
println!(
|
|
" SIMD: {:?} ({:.2} MP/s)",
|
|
threshold_time,
|
|
(iterations as f64 * size as f64 * size as f64 / 1_000_000.0)
|
|
/ threshold_time.as_secs_f64()
|
|
);
|
|
|
|
// Benchmark normalization
|
|
let mut data: Vec<f32> = (0..8192).map(|i| i as f32).collect();
|
|
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
simd::simd_normalize(&mut data);
|
|
}
|
|
let normalize_time = start.elapsed();
|
|
|
|
println!("Normalization ({} iterations):", iterations);
|
|
println!(" SIMD: {:?}", normalize_time);
|
|
println!();
|
|
}
|
|
|
|
fn demo_parallel_processing() {
|
|
println!("3. Parallel Processing");
|
|
println!("----------------------");
|
|
|
|
let data: Vec<i32> = (0..10000).collect();
|
|
|
|
// Sequential processing
|
|
let start = Instant::now();
|
|
let _seq_result: Vec<i32> = data.iter().map(|&x| expensive_computation(x)).collect();
|
|
let seq_time = start.elapsed();
|
|
|
|
// Parallel processing
|
|
let start = Instant::now();
|
|
let _par_result =
|
|
parallel::parallel_map_chunked(data.clone(), 100, |x| expensive_computation(x));
|
|
let par_time = start.elapsed();
|
|
|
|
println!("Processing 10,000 items:");
|
|
println!(" Sequential: {:?}", seq_time);
|
|
println!(" Parallel: {:?}", par_time);
|
|
println!(
|
|
" Speedup: {:.2}x",
|
|
seq_time.as_secs_f64() / par_time.as_secs_f64()
|
|
);
|
|
|
|
let threads = parallel::optimal_thread_count();
|
|
println!(" Using {} threads", threads);
|
|
println!();
|
|
}
|
|
|
|
/// Simulates a unit of work: starting from `x`, adds every integer in
/// `0..100` using wrapping arithmetic and returns the result.
fn expensive_computation(x: i32) -> i32 {
    let mut total = x;
    for step in 0..100 {
        // Wrapping add so seeds near `i32::MAX` never panic on overflow.
        total = total.wrapping_add(step);
    }
    total
}
|
|
|
|
fn demo_memory_optimizations() {
|
|
println!("4. Memory Optimizations");
|
|
println!("-----------------------");
|
|
|
|
let pools = memory::GlobalPools::get();
|
|
|
|
// Benchmark buffer pool vs direct allocation
|
|
let iterations = 10000;
|
|
|
|
// Pooled allocation
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
let mut buf = pools.acquire_small();
|
|
buf.extend_from_slice(&[0u8; 512]);
|
|
}
|
|
let pooled_time = start.elapsed();
|
|
|
|
// Direct allocation
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
let mut buf = Vec::with_capacity(1024);
|
|
buf.extend_from_slice(&[0u8; 512]);
|
|
}
|
|
let direct_time = start.elapsed();
|
|
|
|
println!("Buffer allocation ({} iterations):", iterations);
|
|
println!(" Pooled: {:?}", pooled_time);
|
|
println!(" Direct: {:?}", direct_time);
|
|
println!(
|
|
" Speedup: {:.2}x",
|
|
direct_time.as_secs_f64() / pooled_time.as_secs_f64()
|
|
);
|
|
|
|
// Arena allocation
|
|
let mut arena = memory::Arena::with_capacity(1024 * 1024);
|
|
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
arena.reset();
|
|
for _ in 0..10 {
|
|
let _slice = arena.alloc(1024, 8);
|
|
}
|
|
}
|
|
let arena_time = start.elapsed();
|
|
|
|
println!(
|
|
"\nArena allocation ({} iterations, 10 allocs each):",
|
|
iterations
|
|
);
|
|
println!(" Time: {:?}", arena_time);
|
|
println!();
|
|
}
|
|
|
|
fn demo_quantization() {
|
|
println!("5. Model Quantization");
|
|
println!("---------------------");
|
|
|
|
// Create model weights
|
|
let size = 100_000;
|
|
let weights: Vec<f32> = (0..size)
|
|
.map(|i| ((i as f32 / size as f32) * 2.0 - 1.0))
|
|
.collect();
|
|
|
|
println!(
|
|
"Original model: {} weights ({:.2} MB)",
|
|
weights.len(),
|
|
(weights.len() * std::mem::size_of::<f32>()) as f64 / 1_048_576.0
|
|
);
|
|
|
|
// Quantize
|
|
let start = Instant::now();
|
|
let (quantized, params) = quantize::quantize_weights(&weights);
|
|
let quant_time = start.elapsed();
|
|
|
|
println!(
|
|
"Quantized: {} weights ({:.2} MB)",
|
|
quantized.len(),
|
|
(quantized.len() * std::mem::size_of::<i8>()) as f64 / 1_048_576.0
|
|
);
|
|
println!(
|
|
"Compression: {:.2}x",
|
|
(weights.len() * std::mem::size_of::<f32>()) as f64
|
|
/ (quantized.len() * std::mem::size_of::<i8>()) as f64
|
|
);
|
|
println!("Quantization time: {:?}", quant_time);
|
|
|
|
// Check quality
|
|
let error = quantize::quantization_error(&weights, &quantized, params);
|
|
let snr = quantize::sqnr(&weights, &quantized, params);
|
|
|
|
println!("Quality metrics:");
|
|
println!(" MSE: {:.6}", error);
|
|
println!(" SQNR: {:.2} dB", snr);
|
|
|
|
// Benchmark dequantization
|
|
let iterations = 100;
|
|
let start = Instant::now();
|
|
for _ in 0..iterations {
|
|
let _restored = quantize::dequantize(&quantized, params);
|
|
}
|
|
let dequant_time = start.elapsed();
|
|
|
|
println!(
|
|
"Dequantization ({} iterations): {:?}",
|
|
iterations, dequant_time
|
|
);
|
|
|
|
// Per-channel quantization
|
|
let weights_2d: Vec<f32> = (0..10_000).map(|i| i as f32).collect();
|
|
let shape = vec![100, 100]; // 100 channels, 100 values each
|
|
|
|
let start = Instant::now();
|
|
let per_channel = quantize::PerChannelQuant::from_f32(&weights_2d, shape);
|
|
let per_channel_time = start.elapsed();
|
|
|
|
println!("\nPer-channel quantization:");
|
|
println!(" Channels: {}", per_channel.params.len());
|
|
println!(" Time: {:?}", per_channel_time);
|
|
println!();
|
|
}
|
|
|
|
// Async batching demo (would need tokio runtime)
|
|
#[allow(dead_code)]
|
|
async fn demo_batching() {
|
|
println!("6. Dynamic Batching");
|
|
println!("-------------------");
|
|
|
|
use batch::{BatchConfig, DynamicBatcher};
|
|
|
|
let config = BatchConfig {
|
|
max_batch_size: 32,
|
|
max_wait_ms: 50,
|
|
max_queue_size: 1000,
|
|
preferred_batch_size: 16,
|
|
};
|
|
|
|
let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
|
|
// Simulate batch processing
|
|
items.into_iter().map(|x| Ok(x * 2)).collect()
|
|
}));
|
|
|
|
// Start processing loop
|
|
let batcher_clone = batcher.clone();
|
|
tokio::spawn(async move {
|
|
batcher_clone.run().await;
|
|
});
|
|
|
|
// Add items
|
|
let mut handles = vec![];
|
|
for i in 0..100 {
|
|
let batcher = batcher.clone();
|
|
handles.push(tokio::spawn(async move { batcher.add(i).await }));
|
|
}
|
|
|
|
// Wait for results
|
|
for handle in handles {
|
|
let _ = handle.await;
|
|
}
|
|
|
|
let stats = batcher.stats().await;
|
|
println!("Queue size: {}", stats.queue_size);
|
|
println!("Max wait: {:?}", stats.max_wait_time);
|
|
|
|
batcher.shutdown().await;
|
|
}
|