Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,396 @@
//! Dynamic batching for throughput optimization
//!
//! Provides intelligent batching to maximize GPU/CPU utilization while
//! maintaining acceptable latency.
use std::collections::VecDeque;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::{oneshot, Mutex};
use tokio::time::sleep;
/// Item in the batching queue
///
/// Couples a payload with the oneshot channel over which the processing
/// loop delivers its result back to the caller that enqueued it.
pub struct BatchItem<T, R> {
    // Payload handed to the batch processor.
    pub data: T,
    // Completion channel; the batch loop sends exactly one result here.
    pub response: oneshot::Sender<BatchResult<R>>,
    // Enqueue timestamp, used by `stats()` to report the oldest wait time.
    pub enqueued_at: Instant,
}
/// Result of batch processing
pub type BatchResult<T> = std::result::Result<T, BatchError>;

/// Batch processing errors
#[derive(Debug, Clone)]
pub enum BatchError {
    /// The response channel was dropped before a result arrived.
    Timeout,
    /// The processor reported a per-item failure.
    ProcessingFailed(String),
    /// The queue is at `max_queue_size`; the item was rejected.
    QueueFull,
}

impl std::fmt::Display for BatchError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Self::Timeout => f.write_str("Batch processing timeout"),
            Self::QueueFull => f.write_str("Queue is full"),
            Self::ProcessingFailed(msg) => write!(f, "Processing failed: {}", msg),
        }
    }
}

impl std::error::Error for BatchError {}
/// Dynamic batcher configuration
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// Maximum items in a batch
    pub max_batch_size: usize,
    /// Maximum time to wait before processing partial batch
    pub max_wait_ms: u64,
    /// Maximum queue size
    pub max_queue_size: usize,
    /// Minimum batch size to prefer
    // NOTE(review): currently unused by the scheduling logic in
    // `DynamicBatcher::run` — any non-empty queue is flushed after
    // `max_wait_ms` regardless of this value.
    pub preferred_batch_size: usize,
}

impl Default for BatchConfig {
    // Defaults tuned for moderate latency: batches of up to 32 items,
    // flushed at least every 50 ms, back-pressure after 1000 queued items.
    fn default() -> Self {
        Self {
            max_batch_size: 32,
            max_wait_ms: 50,
            max_queue_size: 1000,
            preferred_batch_size: 16,
        }
    }
}
/// Dynamic batcher for throughput optimization
///
/// Collects items into a shared FIFO queue; a separate `run()` task drains
/// it in batches and fans results back out over per-item oneshot channels.
pub struct DynamicBatcher<T, R> {
    config: BatchConfig,
    // FIFO of pending items, shared between `add()` callers and `run()`.
    queue: Arc<Mutex<VecDeque<BatchItem<T, R>>>>,
    // User callback: must return one `Result` per input, in input order.
    processor: Arc<dyn Fn(Vec<T>) -> Vec<std::result::Result<R, String>> + Send + Sync>,
    // Cooperative stop flag polled by `run()`.
    shutdown: Arc<Mutex<bool>>,
}
impl<T, R> DynamicBatcher<T, R>
where
    T: Send + 'static,
    R: Send + 'static,
{
    /// Create new dynamic batcher.
    ///
    /// `processor` must return exactly one result per input item, in the
    /// same order; results are paired back to waiting callers by position.
    pub fn new<F>(config: BatchConfig, processor: F) -> Self
    where
        F: Fn(Vec<T>) -> Vec<std::result::Result<R, String>> + Send + Sync + 'static,
    {
        Self {
            config,
            queue: Arc::new(Mutex::new(VecDeque::new())),
            processor: Arc::new(processor),
            shutdown: Arc::new(Mutex::new(false)),
        }
    }

    /// Add item to batch queue and wait for its result.
    ///
    /// # Errors
    /// - `BatchError::QueueFull` if the queue is at capacity.
    /// - `BatchError::Timeout` if the response channel is dropped (e.g. the
    ///   batcher shut down before this item was processed).
    pub async fn add(&self, item: T) -> BatchResult<R> {
        let (tx, rx) = oneshot::channel();
        let batch_item = BatchItem {
            data: item,
            response: tx,
            enqueued_at: Instant::now(),
        };
        {
            let mut queue = self.queue.lock().await;
            if queue.len() >= self.config.max_queue_size {
                return Err(BatchError::QueueFull);
            }
            queue.push_back(batch_item);
        }
        // Wait for the processing loop to deliver this item's result.
        rx.await.map_err(|_| BatchError::Timeout)?
    }

    /// Start batch processing loop; runs until `shutdown()` is requested,
    /// then drains remaining items once before returning.
    pub async fn run(&self) {
        let mut last_process = Instant::now();
        loop {
            // Check if shutdown requested.
            {
                let shutdown = self.shutdown.lock().await;
                if *shutdown {
                    break;
                }
            }
            // Flush when the batch is full, or when anything has been
            // waiting longer than max_wait_ms since the last flush.
            //
            // FIX: the original also tested `len >= preferred_batch_size &&
            // elapsed >= max_wait_ms`, which is subsumed by the
            // `!is_empty() && elapsed >= max_wait_ms` clause and was dead
            // code; `preferred_batch_size` plays no role in scheduling.
            let should_process = {
                let queue = self.queue.lock().await;
                queue.len() >= self.config.max_batch_size
                    || (!queue.is_empty()
                        && last_process.elapsed().as_millis() >= self.config.max_wait_ms as u128)
            };
            if should_process {
                self.process_batch().await;
                last_process = Instant::now();
            } else {
                // Sleep briefly to avoid busy waiting.
                sleep(Duration::from_millis(1)).await;
            }
        }
        // Process remaining items before shutdown.
        self.process_batch().await;
    }

    /// Drain up to `max_batch_size` items and run them through the processor.
    async fn process_batch(&self) {
        let items = {
            let mut queue = self.queue.lock().await;
            let batch_size = self.config.max_batch_size.min(queue.len());
            if batch_size == 0 {
                return;
            }
            queue.drain(..batch_size).collect::<Vec<_>>()
        };
        // (The redundant `items.is_empty()` re-check was removed: the
        // batch_size == 0 early return above already guarantees non-empty.)
        // Split payloads from their response channels, preserving order.
        let (data, responses): (Vec<_>, Vec<_>) = items
            .into_iter()
            .map(|item| (item.data, item.response))
            .unzip();
        // Process batch.
        let results = (self.processor)(data);
        // Pair results back to callers by position; a dropped receiver
        // (caller gave up) is ignored.
        for (response_tx, result) in responses.into_iter().zip(results) {
            let _ = response_tx.send(result.map_err(BatchError::ProcessingFailed));
        }
    }

    /// Gracefully shutdown the batcher; `run()` exits after a final drain.
    pub async fn shutdown(&self) {
        let mut shutdown = self.shutdown.lock().await;
        *shutdown = true;
    }

    /// Get current queue size.
    pub async fn queue_size(&self) -> usize {
        self.queue.lock().await.len()
    }

    /// Get current queue statistics.
    pub async fn stats(&self) -> BatchStats {
        let queue = self.queue.lock().await;
        BatchStats {
            queue_size: queue.len(),
            // FIFO queue: front() is the oldest item, hence the longest wait.
            max_wait_time: queue
                .front()
                .map(|item| item.enqueued_at.elapsed())
                .unwrap_or(Duration::from_secs(0)),
        }
    }
}
/// Batch statistics
#[derive(Debug, Clone)]
pub struct BatchStats {
    // Number of items currently waiting in the queue.
    pub queue_size: usize,
    // Elapsed wait of the oldest queued item (zero when the queue is empty).
    pub max_wait_time: Duration,
}
/// Adaptive batcher that adjusts batch size based on latency
pub struct AdaptiveBatcher<T, R> {
    inner: DynamicBatcher<T, R>,
    // Adapted configuration. NOTE(review): this is a separate copy from the
    // config held inside `inner`; see `add()` for the implications.
    config: Arc<Mutex<BatchConfig>>,
    // Sliding window of the last 100 end-to-end request latencies.
    latency_history: Arc<Mutex<VecDeque<Duration>>>,
    // Latency target the adaptation steers toward.
    target_latency: Duration,
}
impl<T, R> AdaptiveBatcher<T, R>
where
    T: Send + 'static,
    R: Send + 'static,
{
    /// Create adaptive batcher with target latency
    pub fn new<F>(initial_config: BatchConfig, target_latency: Duration, processor: F) -> Self
    where
        F: Fn(Vec<T>) -> Vec<Result<R, String>> + Send + Sync + 'static,
    {
        // NOTE(review): `initial_config` is cloned into `config` AND moved
        // into the inner batcher; from here on the two copies are
        // independent. The adaptation below mutates only the `config` copy.
        let config = Arc::new(Mutex::new(initial_config.clone()));
        let inner = DynamicBatcher::new(initial_config, processor);
        Self {
            inner,
            config,
            latency_history: Arc::new(Mutex::new(VecDeque::with_capacity(100))),
            target_latency,
        }
    }
    /// Add item and adapt batch size
    ///
    /// Measures end-to-end latency of this request; when the rolling average
    /// exceeds the target, `max_batch_size` is nudged down (floor 1), and
    /// when it is under half the target, up (cap 128).
    ///
    /// NOTE(review): the adapted `max_batch_size` is written only to
    /// `self.config`, which the inner `DynamicBatcher` never reads — so the
    /// adaptation currently has no effect on actual batching. Confirm
    /// intent; sharing one config between the two would make it effective.
    pub async fn add(&self, item: T) -> Result<R, BatchError> {
        let start = Instant::now();
        let result = self.inner.add(item).await;
        let latency = start.elapsed();
        // Record latency in the bounded (100-entry) sliding window.
        {
            let mut history = self.latency_history.lock().await;
            history.push_back(latency);
            if history.len() > 100 {
                history.pop_front();
            }
        }
        // Adapt batch size every 10 requests.
        // (Once the window is pinned at 100 entries, this condition holds on
        // every request, so adaptation then runs each time.)
        {
            let history = self.latency_history.lock().await;
            if history.len() % 10 == 0 && history.len() >= 10 {
                let avg_latency: Duration = history.iter().sum::<Duration>() / history.len() as u32;
                let mut config = self.config.lock().await;
                if avg_latency > self.target_latency {
                    // Reduce batch size to lower latency
                    config.max_batch_size = (config.max_batch_size * 9 / 10).max(1);
                } else if avg_latency < self.target_latency / 2 {
                    // Increase batch size for better throughput
                    config.max_batch_size = (config.max_batch_size * 11 / 10).min(128);
                }
            }
        }
        result
    }
    /// Run the batcher
    pub async fn run(&self) {
        self.inner.run().await;
    }
    /// Get current configuration
    pub async fn current_config(&self) -> BatchConfig {
        self.config.lock().await.clone()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_dynamic_batcher() {
        let config = BatchConfig {
            max_batch_size: 4,
            max_wait_ms: 100,
            max_queue_size: 100,
            preferred_batch_size: 2,
        };
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(|x| Ok(x * 2)).collect()
        }));
        // Start processing loop.
        let batcher_clone = batcher.clone();
        tokio::spawn(async move {
            batcher_clone.run().await;
        });
        // Add items concurrently.
        let mut handles = vec![];
        for i in 0..8 {
            let batcher = batcher.clone();
            handles.push(tokio::spawn(async move { batcher.add(i).await }));
        }
        // Wait for results (order matches spawn order).
        for (i, handle) in handles.into_iter().enumerate() {
            let result = handle.await.unwrap().unwrap();
            assert_eq!(result, (i as i32) * 2);
        }
        batcher.shutdown().await;
    }

    #[tokio::test]
    async fn test_batch_stats() {
        let config = BatchConfig::default();
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(Ok).collect()
        }));
        // BUG FIX: the original wrote `let _ = batcher.add(n);` — futures are
        // lazy, so nothing was ever enqueued and the assertion could not
        // hold. Spawn the adds so they enqueue, then block waiting for a
        // response that never comes (no run loop is started).
        for i in 1..=3 {
            let b = batcher.clone();
            tokio::spawn(async move {
                let _ = b.add(i).await;
            });
        }
        // Give the spawned tasks time to reach the queue.
        tokio::time::sleep(Duration::from_millis(50)).await;
        let stats = batcher.stats().await;
        assert_eq!(stats.queue_size, 3);
    }

    #[tokio::test]
    async fn test_queue_full() {
        let config = BatchConfig {
            max_queue_size: 2,
            ..Default::default()
        };
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(Ok).collect()
        }));
        // BUG FIX: the original's unpolled `add()` futures never filled the
        // queue, and the final `add(3).await` then hung forever waiting on a
        // response with no run loop. Fill the queue via spawned tasks.
        for i in 0..2 {
            let b = batcher.clone();
            tokio::spawn(async move {
                let _ = b.add(i).await;
            });
        }
        tokio::time::sleep(Duration::from_millis(50)).await;
        // Queue now holds 2 items and nothing drains it, so this add must be
        // rejected immediately.
        let result = batcher.add(3).await;
        assert!(matches!(result, Err(BatchError::QueueFull)));
    }

    #[tokio::test]
    async fn test_adaptive_batcher() {
        let config = BatchConfig {
            max_batch_size: 8,
            max_wait_ms: 50,
            max_queue_size: 100,
            preferred_batch_size: 4,
        };
        let batcher = Arc::new(AdaptiveBatcher::new(
            config,
            Duration::from_millis(100),
            |items: Vec<i32>| items.into_iter().map(|x| Ok(x * 2)).collect(),
        ));
        let batcher_clone = batcher.clone();
        tokio::spawn(async move {
            batcher_clone.run().await;
        });
        // Process some requests.
        for i in 0..20 {
            let result = batcher.add(i).await.unwrap();
            assert_eq!(result, i * 2);
        }
        // Configuration should have adapted (and never hit an invalid size).
        let final_config = batcher.current_config().await;
        assert!(final_config.max_batch_size > 0);
    }
}

View File

@@ -0,0 +1,409 @@
//! Memory optimization utilities
//!
//! Provides object pooling, memory-mapped file loading, and zero-copy operations.
use memmap2::{Mmap, MmapOptions};
use std::collections::VecDeque;
use std::fs::File;
use std::path::Path;
use std::sync::{Arc, Mutex};
use super::memory_opt_enabled;
use crate::error::{Result, ScipixError};
/// Object pool for reusable buffers
///
/// Hands out `PooledBuffer` RAII guards; while memory optimizations are
/// enabled, dropped guards return their buffer to this pool for reuse.
pub struct BufferPool<T> {
    // Shared free list; also referenced by every outstanding PooledBuffer.
    pool: Arc<Mutex<VecDeque<T>>>,
    // Creates a fresh buffer when the pool is empty.
    factory: Arc<dyn Fn() -> T + Send + Sync>,
    // NOTE(review): never enforced — returned buffers are pushed back
    // unconditionally (see `Drop for PooledBuffer`), so the pool can grow
    // past this bound.
    #[allow(dead_code)]
    max_size: usize,
}
impl<T: Send + 'static> BufferPool<T> {
    /// Create a new buffer pool, pre-filling it with `initial_size` buffers
    /// when memory optimizations are enabled.
    pub fn new<F>(factory: F, initial_size: usize, max_size: usize) -> Self
    where
        F: Fn() -> T + Send + Sync + 'static,
    {
        let factory: Arc<dyn Fn() -> T + Send + Sync> = Arc::new(factory);
        let pool = Arc::new(Mutex::new(VecDeque::with_capacity(max_size)));
        if memory_opt_enabled() {
            // Warm the pool up front so early acquires skip the factory.
            let mut guard = pool.lock().unwrap();
            guard.extend((0..initial_size).map(|_| factory()));
        }
        Self {
            pool,
            factory,
            max_size,
        }
    }

    /// Acquire a buffer, reusing a pooled one when possible.
    pub fn acquire(&self) -> PooledBuffer<T> {
        let buffer = if memory_opt_enabled() {
            let recycled = self.pool.lock().unwrap().pop_front();
            recycled.unwrap_or_else(|| (self.factory)())
        } else {
            // Optimizations off: always construct a fresh buffer.
            (self.factory)()
        };
        PooledBuffer {
            buffer: Some(buffer),
            pool: Arc::clone(&self.pool),
        }
    }

    /// Number of idle buffers currently held by the pool.
    pub fn size(&self) -> usize {
        self.pool.lock().unwrap().len()
    }

    /// Drop all idle buffers.
    pub fn clear(&self) {
        self.pool.lock().unwrap().clear();
    }
}
/// RAII guard for pooled buffers
pub struct PooledBuffer<T> {
    // Always `Some` until Drop takes the buffer back.
    buffer: Option<T>,
    // Pool this buffer is returned to on drop.
    pool: Arc<Mutex<VecDeque<T>>>,
}

impl<T> PooledBuffer<T> {
    /// Get mutable reference to buffer
    pub fn get_mut(&mut self) -> &mut T {
        // Invariant: `buffer` is only None after Drop has run.
        self.buffer.as_mut().unwrap()
    }
    /// Get immutable reference to buffer
    pub fn get(&self) -> &T {
        self.buffer.as_ref().unwrap()
    }
}
// Return the buffer to the pool on drop (only while memory optimizations
// are enabled; otherwise the buffer is simply freed).
//
// NOTE(review): this pushes back unconditionally — `BufferPool::max_size`
// is never checked here, so a burst of concurrent acquires can grow the
// pool beyond its nominal cap permanently. Confirm whether the cap should
// be enforced on return.
impl<T> Drop for PooledBuffer<T> {
    fn drop(&mut self) {
        if memory_opt_enabled() {
            if let Some(buffer) = self.buffer.take() {
                let mut pool = self.pool.lock().unwrap();
                pool.push_back(buffer);
            }
        }
    }
}
// Deref/DerefMut let a PooledBuffer be used directly as its inner buffer
// (e.g. calling Vec methods on a pooled Vec<u8>).
impl<T> std::ops::Deref for PooledBuffer<T> {
    type Target = T;
    fn deref(&self) -> &Self::Target {
        self.buffer.as_ref().unwrap()
    }
}

impl<T> std::ops::DerefMut for PooledBuffer<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.buffer.as_mut().unwrap()
    }
}
/// Memory-mapped model file
pub struct MmapModel {
    // Keeps the mapping alive; `data`/`len` are a cached view into it.
    _mmap: Mmap,
    data: *const u8,
    len: usize,
}

// SAFETY: the mapping is created read-only via `MmapOptions::map` and never
// written through this type, and `_mmap` owns the region for the struct's
// entire lifetime, so concurrent reads from multiple threads cannot race.
// NOTE(review): as with any file-backed mmap, external truncation of the
// file can still fault reads — confirm callers control the file's lifetime.
unsafe impl Send for MmapModel {}
unsafe impl Sync for MmapModel {}
impl MmapModel {
    /// Load model from file using memory mapping.
    ///
    /// # Errors
    /// Returns `ScipixError::Io` if the file cannot be opened or mapped.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path.as_ref()).map_err(ScipixError::Io)?;
        // SAFETY: mapping a file we just opened; the resulting `Mmap` is
        // stored in the struct, so the region outlives all views of it.
        let mmap = unsafe { MmapOptions::new().map(&file).map_err(ScipixError::Io)? };
        let data = mmap.as_ptr();
        let len = mmap.len();
        Ok(Self {
            _mmap: mmap,
            data,
            len,
        })
    }

    /// Get slice of model data.
    // FIX: borrow through the owned `Mmap` (which derefs to `[u8]`) instead
    // of rebuilding the slice from the raw pointer — same bytes, no unsafe.
    pub fn as_slice(&self) -> &[u8] {
        &self._mmap[..]
    }

    /// Get size of mapped region.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Check if empty.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
}
/// Zero-copy image view
///
/// Borrowed view over interleaved 8-bit pixel data in row-major order.
pub struct ImageView<'a> {
    data: &'a [u8],
    width: u32,
    height: u32,
    // Bytes per pixel (e.g. 4 for RGBA); pixels are one byte per channel.
    channels: u8,
}
impl<'a> ImageView<'a> {
    /// Create new image view from raw data.
    ///
    /// # Errors
    /// Returns `ScipixError::InvalidInput` if `data.len()` does not equal
    /// `width * height * channels`, or if that product overflows.
    pub fn new(data: &'a [u8], width: u32, height: u32, channels: u8) -> Result<Self> {
        // FIX: compute the expected length with checked usize arithmetic —
        // the original multiplied in u32, which can overflow for large
        // dimensions (wrapping silently in release builds).
        let expected_len = (width as usize)
            .checked_mul(height as usize)
            .and_then(|n| n.checked_mul(channels as usize))
            .ok_or_else(|| ScipixError::InvalidInput("Image dimensions overflow".to_string()))?;
        if data.len() != expected_len {
            return Err(ScipixError::InvalidInput(format!(
                "Invalid data length: expected {}, got {}",
                expected_len,
                data.len()
            )));
        }
        Ok(Self {
            data,
            width,
            height,
            channels,
        })
    }

    /// Get pixel at (x, y).
    ///
    /// # Panics
    /// Panics (via slice indexing) when (x, y) lies outside the image.
    pub fn pixel(&self, x: u32, y: u32) -> &[u8] {
        let offset = ((y * self.width + x) * self.channels as u32) as usize;
        &self.data[offset..offset + self.channels as usize]
    }

    /// Get raw data slice.
    pub fn data(&self) -> &[u8] {
        self.data
    }

    /// Get dimensions.
    pub fn dimensions(&self) -> (u32, u32) {
        (self.width, self.height)
    }

    /// Get number of channels.
    pub fn channels(&self) -> u8 {
        self.channels
    }

    /// Create subview (region of interest).
    ///
    /// # Errors
    /// Returns `ScipixError::InvalidInput` when the region exceeds the image.
    pub fn subview(&self, x: u32, y: u32, width: u32, height: u32) -> Result<Self> {
        // FIX: check bounds in u64 so `x + width` cannot overflow u32 (which
        // panicked in debug builds and wrapped — passing the check — in
        // release builds).
        if x as u64 + width as u64 > self.width as u64
            || y as u64 + height as u64 > self.height as u64
        {
            return Err(ScipixError::InvalidInput(
                "Subview out of bounds".to_string(),
            ));
        }
        // For simplicity, this creates a copy. True zero-copy would need stride support.
        let mut subview_data = Vec::new();
        for row in y..y + height {
            let start = ((row * self.width + x) * self.channels as u32) as usize;
            let end = start + (width * self.channels as u32) as usize;
            subview_data.extend_from_slice(&self.data[start..end]);
        }
        // FIXME: Box::leak makes every subview a permanent allocation — this
        // leaks memory on each call. Replace with an arena allocator or an
        // owned image type in production.
        let leaked = Box::leak(subview_data.into_boxed_slice());
        Ok(Self {
            data: leaked,
            width,
            height,
            channels: self.channels,
        })
    }
}
/// Arena allocator for temporary allocations
///
/// Bump allocator over a single growable byte buffer: `alloc` hands out
/// zero-initialized, aligned slices; `reset` reclaims everything at once
/// while keeping the backing capacity.
pub struct Arena {
    buffer: Vec<u8>,
    // Bump pointer: next free byte within `buffer`.
    offset: usize,
}

impl Arena {
    /// Create new arena with capacity.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            buffer: Vec::with_capacity(capacity),
            offset: 0,
        }
    }

    /// Allocate an aligned, zero-initialized slice of `size` bytes.
    ///
    /// # Panics
    /// Panics if `align` is zero (modulo by zero, as before).
    pub fn alloc(&mut self, size: usize, align: usize) -> &mut [u8] {
        // Round the bump pointer up to the requested alignment.
        let padding = (align - (self.offset % align)) % align;
        let start = self.offset + padding;
        let end = start + size;
        if end > self.buffer.len() {
            // FIX: the original used `reserve` + unsafe `set_len`, which
            // handed out uninitialized memory — undefined behavior to read.
            // `resize` grows and zero-fills instead.
            self.buffer.resize(end, 0);
        }
        self.offset = end;
        &mut self.buffer[start..end]
    }

    /// Reset arena (keeps capacity).
    pub fn reset(&mut self) {
        self.offset = 0;
        self.buffer.clear();
    }

    /// Get current usage (bytes handed out, including alignment padding).
    pub fn usage(&self) -> usize {
        self.offset
    }

    /// Get capacity of the backing buffer.
    pub fn capacity(&self) -> usize {
        self.buffer.capacity()
    }
}
/// Global buffer pools for common sizes
pub struct GlobalPools {
    small: BufferPool<Vec<u8>>,  // 1KB buffers
    medium: BufferPool<Vec<u8>>, // 64KB buffers
    large: BufferPool<Vec<u8>>,  // 1MB buffers
}

impl GlobalPools {
    // Pool depths scale inversely with buffer size: small buffers are cheap
    // to keep around, large ones are capped tightly.
    fn new() -> Self {
        Self {
            small: BufferPool::new(|| Vec::with_capacity(1024), 10, 100),
            medium: BufferPool::new(|| Vec::with_capacity(64 * 1024), 5, 50),
            large: BufferPool::new(|| Vec::with_capacity(1024 * 1024), 2, 20),
        }
    }
    /// Get the global pools instance (lazily created, process-wide).
    pub fn get() -> &'static Self {
        static POOLS: std::sync::OnceLock<GlobalPools> = std::sync::OnceLock::new();
        POOLS.get_or_init(GlobalPools::new)
    }
    /// Acquire small buffer (1KB)
    pub fn acquire_small(&self) -> PooledBuffer<Vec<u8>> {
        self.small.acquire()
    }
    /// Acquire medium buffer (64KB)
    pub fn acquire_medium(&self) -> PooledBuffer<Vec<u8>> {
        self.medium.acquire()
    }
    /// Acquire large buffer (1MB)
    pub fn acquire_large(&self) -> PooledBuffer<Vec<u8>> {
        self.large.acquire()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    #[test]
    fn test_buffer_pool() {
        let pool = BufferPool::new(|| Vec::with_capacity(1024), 2, 10);
        assert_eq!(pool.size(), 2);
        let mut buf1 = pool.acquire();
        assert_eq!(buf1.capacity(), 1024);
        buf1.extend_from_slice(b"test");
        drop(buf1);
        // BUG FIX: the original asserted 3 here, but acquire() removed one of
        // the two pre-allocated buffers and drop() merely returned it, so the
        // pool is back at 2 — it cannot net-gain a buffer from a round trip.
        assert_eq!(pool.size(), 2); // returned to pool
    }

    #[test]
    fn test_mmap_model() {
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(b"test model data").unwrap();
        temp.flush().unwrap();
        let mmap = MmapModel::from_file(temp.path()).unwrap();
        assert_eq!(mmap.as_slice(), b"test model data");
        assert_eq!(mmap.len(), 15);
    }

    #[test]
    fn test_image_view() {
        // 2x2 RGBA image, one pixel per row below.
        let data = vec![
            255, 0, 0, 255, // Red pixel
            0, 255, 0, 255, // Green pixel
            0, 0, 255, 255, // Blue pixel
            255, 255, 255, 255, // White pixel
        ];
        let view = ImageView::new(&data, 2, 2, 4).unwrap();
        assert_eq!(view.dimensions(), (2, 2));
        assert_eq!(view.pixel(0, 0), &[255, 0, 0, 255]);
        assert_eq!(view.pixel(1, 1), &[255, 255, 255, 255]);
    }

    #[test]
    fn test_arena() {
        let mut arena = Arena::with_capacity(1024);
        let slice1 = arena.alloc(100, 8);
        assert_eq!(slice1.len(), 100);
        let slice2 = arena.alloc(200, 8);
        assert_eq!(slice2.len(), 200);
        // Usage includes alignment padding, so >= the raw byte total.
        assert!(arena.usage() >= 300);
        arena.reset();
        assert_eq!(arena.usage(), 0);
    }

    #[test]
    fn test_global_pools() {
        let pools = GlobalPools::get();
        let small = pools.acquire_small();
        assert!(small.capacity() >= 1024);
        let medium = pools.acquire_medium();
        assert!(medium.capacity() >= 64 * 1024);
        let large = pools.acquire_large();
        assert!(large.capacity() >= 1024 * 1024);
    }
}

View File

@@ -0,0 +1,169 @@
//! Performance optimization utilities for scipix OCR
//!
//! This module provides runtime feature detection and optimized code paths
//! for different CPU architectures and capabilities.
pub mod batch;
pub mod memory;
pub mod parallel;
pub mod quantize;
pub mod simd;
use std::sync::OnceLock;
/// CPU features detected at runtime
///
/// Flags for features that do not exist on the current architecture are
/// always `false` (e.g. `neon` on x86_64).
#[derive(Debug, Clone, Copy)]
pub struct CpuFeatures {
    pub avx2: bool,
    pub avx512f: bool,
    pub neon: bool,
    pub sse4_2: bool,
}
// Detection result is cached here after the first call.
static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();

/// Detect CPU features at runtime
///
/// Probes once and caches the result; subsequent calls are a cheap read.
pub fn detect_features() -> CpuFeatures {
    *CPU_FEATURES.get_or_init(|| {
        #[cfg(target_arch = "x86_64")]
        {
            CpuFeatures {
                avx2: is_x86_feature_detected!("avx2"),
                avx512f: is_x86_feature_detected!("avx512f"),
                neon: false, // ARM-only feature
                sse4_2: is_x86_feature_detected!("sse4.2"),
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            CpuFeatures {
                avx2: false,
                avx512f: false,
                neon: std::arch::is_aarch64_feature_detected!("neon"),
                sse4_2: false,
            }
        }
        // Unknown architectures: report no SIMD capabilities at all.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            CpuFeatures {
                avx2: false,
                avx512f: false,
                neon: false,
                sse4_2: false,
            }
        }
    })
}

/// Get the detected CPU features
pub fn get_features() -> CpuFeatures {
    detect_features()
}
/// Runtime dispatch to optimized implementation
pub trait OptimizedOp<T> {
    /// Execute the operation with the best available implementation
    fn execute(&self, input: T) -> T;
    /// Execute with SIMD if available, fallback to scalar
    ///
    /// Default strategy: take the SIMD path when any SIMD extension (AVX2,
    /// AVX-512F, or NEON) was detected. Note this consults CPU detection
    /// only — not the global `OptLevel` / `simd_enabled()` setting.
    fn execute_auto(&self, input: T) -> T {
        let features = get_features();
        if features.avx2 || features.avx512f || features.neon {
            self.execute_simd(input)
        } else {
            self.execute_scalar(input)
        }
    }
    /// SIMD implementation
    fn execute_simd(&self, input: T) -> T;
    /// Scalar fallback implementation
    fn execute_scalar(&self, input: T) -> T;
}
/// Optimization level configuration
// MODERNIZED: `Default` is now derived via the `#[default]` variant
// attribute (stable since Rust 1.62; the file already uses OnceLock-era
// Rust) instead of a hand-written `impl Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OptLevel {
    /// No optimizations, scalar code only
    None,
    /// Use SIMD when available
    Simd,
    /// Use SIMD + parallel processing
    Parallel,
    /// All optimizations including memory optimizations
    #[default]
    Full,
}

/// Global optimization configuration (write-once).
static OPT_LEVEL: OnceLock<OptLevel> = OnceLock::new();

/// Set the optimization level.
///
/// Silently has no effect if the level was already set, or if any reader
/// already initialized it to the default via `get_opt_level`.
pub fn set_opt_level(level: OptLevel) {
    OPT_LEVEL.set(level).ok();
}

/// Get the current optimization level, initializing to `Full` on first read.
pub fn get_opt_level() -> OptLevel {
    *OPT_LEVEL.get_or_init(OptLevel::default)
}

/// Check if SIMD optimizations are enabled
pub fn simd_enabled() -> bool {
    matches!(
        get_opt_level(),
        OptLevel::Simd | OptLevel::Parallel | OptLevel::Full
    )
}

/// Check if parallel optimizations are enabled
pub fn parallel_enabled() -> bool {
    matches!(get_opt_level(), OptLevel::Parallel | OptLevel::Full)
}

/// Check if memory optimizations are enabled
pub fn memory_opt_enabled() -> bool {
    matches!(get_opt_level(), OptLevel::Full)
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_feature_detection() {
        let features = detect_features();
        println!("Detected features: {:?}", features);
        // Should always succeed on any platform
        // (the condition is a tautology: either some flag is set or none are;
        // the test's real value is that detection does not panic).
        assert!(
            features.avx2
                || features.avx512f
                || features.neon
                || features.sse4_2
                || (!features.avx2 && !features.avx512f && !features.neon && !features.sse4_2)
        );
    }
    #[test]
    fn test_opt_level() {
        // First read initializes the OnceLock to the default (Full).
        assert_eq!(get_opt_level(), OptLevel::Full);
        set_opt_level(OptLevel::Simd);
        // Can't change after first init, should still be Full
        assert_eq!(get_opt_level(), OptLevel::Full);
    }
    #[test]
    fn test_optimization_checks() {
        // NOTE(review): relies on OPT_LEVEL initializing to Full; tests
        // share the process-wide global, so a test elsewhere calling
        // set_opt_level before the first get would break these.
        assert!(simd_enabled());
        assert!(parallel_enabled());
        assert!(memory_opt_enabled());
    }
}

View File

@@ -0,0 +1,335 @@
//! Parallel processing utilities for OCR pipeline
//!
//! Provides parallel image preprocessing, batch OCR, and pipelined execution.
use image::DynamicImage;
use rayon::prelude::*;
use std::sync::Arc;
use tokio::sync::Semaphore;
use super::parallel_enabled;
/// Parallel preprocessing of multiple images
///
/// Maps `preprocess_fn` over every image — on rayon's work-stealing pool
/// when parallel optimizations are enabled, sequentially otherwise.
pub fn parallel_preprocess<F>(images: Vec<DynamicImage>, preprocess_fn: F) -> Vec<DynamicImage>
where
    F: Fn(DynamicImage) -> DynamicImage + Sync + Send,
{
    if parallel_enabled() {
        images.into_par_iter().map(preprocess_fn).collect()
    } else {
        images.into_iter().map(preprocess_fn).collect()
    }
}
/// Parallel processing with error handling
///
/// Like `parallel_preprocess`, but each image may fail independently; the
/// output carries one `Result` per input, in input order.
pub fn parallel_preprocess_result<F, E>(
    images: Vec<DynamicImage>,
    preprocess_fn: F,
) -> Vec<std::result::Result<DynamicImage, E>>
where
    F: Fn(DynamicImage) -> std::result::Result<DynamicImage, E> + Sync + Send,
    E: Send,
{
    if parallel_enabled() {
        images.into_par_iter().map(preprocess_fn).collect()
    } else {
        images.into_iter().map(preprocess_fn).collect()
    }
}
/// Pipeline parallel execution for OCR workflow
///
/// Executes stages in a pipeline: preprocess | detect | recognize
/// Each stage can start processing the next item while previous stages
/// continue with subsequent items.
pub struct PipelineExecutor<T, U, V> {
    stage1: Arc<dyn Fn(T) -> U + Send + Sync>,
    stage2: Arc<dyn Fn(U) -> V + Send + Sync>,
}

impl<T, U, V> PipelineExecutor<T, U, V>
where
    T: Send,
    U: Send,
    V: Send,
{
    /// Build a two-stage pipeline from the stage functions.
    pub fn new<F1, F2>(stage1: F1, stage2: F2) -> Self
    where
        F1: Fn(T) -> U + Send + Sync + 'static,
        F2: Fn(U) -> V + Send + Sync + 'static,
    {
        Self {
            stage1: Arc::new(stage1),
            stage2: Arc::new(stage2),
        }
    }

    /// Run every input through stage1 then stage2, in parallel when
    /// parallel optimizations are enabled.
    pub fn execute_batch(&self, inputs: Vec<T>) -> Vec<V> {
        // Compose both stages into a single per-item function.
        let run = |input: T| (self.stage2)((self.stage1)(input));
        if parallel_enabled() {
            inputs.into_par_iter().map(run).collect()
        } else {
            inputs.into_iter().map(run).collect()
        }
    }
}
/// Three-stage pipeline executor
pub struct Pipeline3<T, U, V, W> {
    stage1: Arc<dyn Fn(T) -> U + Send + Sync>,
    stage2: Arc<dyn Fn(U) -> V + Send + Sync>,
    stage3: Arc<dyn Fn(V) -> W + Send + Sync>,
}

impl<T, U, V, W> Pipeline3<T, U, V, W>
where
    T: Send,
    U: Send,
    V: Send,
    W: Send,
{
    /// Build a three-stage pipeline from the stage functions.
    pub fn new<F1, F2, F3>(stage1: F1, stage2: F2, stage3: F3) -> Self
    where
        F1: Fn(T) -> U + Send + Sync + 'static,
        F2: Fn(U) -> V + Send + Sync + 'static,
        F3: Fn(V) -> W + Send + Sync + 'static,
    {
        Self {
            stage1: Arc::new(stage1),
            stage2: Arc::new(stage2),
            stage3: Arc::new(stage3),
        }
    }

    /// Run every input through stage1 -> stage2 -> stage3, in parallel
    /// when parallel optimizations are enabled.
    pub fn execute_batch(&self, inputs: Vec<T>) -> Vec<W> {
        // Compose the three stages into a single per-item function.
        let run = |input: T| (self.stage3)((self.stage2)((self.stage1)(input)));
        if parallel_enabled() {
            inputs.into_par_iter().map(run).collect()
        } else {
            inputs.into_iter().map(run).collect()
        }
    }
}
/// Parallel map with configurable chunk size
///
/// `chunk_size` sets rayon's minimum number of items handled per work unit,
/// bounding scheduling overhead when per-item work is cheap.
pub fn parallel_map_chunked<T, U, F>(items: Vec<T>, chunk_size: usize, map_fn: F) -> Vec<U>
where
    T: Send,
    U: Send,
    F: Fn(T) -> U + Sync + Send,
{
    if parallel_enabled() {
        items
            .into_par_iter()
            .with_min_len(chunk_size)
            .map(map_fn)
            .collect()
    } else {
        items.into_iter().map(map_fn).collect()
    }
}
/// Async parallel executor with concurrency limit
///
/// Spawns one tokio task per input but gates task *start* behind a
/// semaphore, so at most `max_concurrent` tasks run at once.
pub struct AsyncParallelExecutor {
    semaphore: Arc<Semaphore>,
}

impl AsyncParallelExecutor {
    /// Create executor with maximum concurrency limit
    pub fn new(max_concurrent: usize) -> Self {
        Self {
            semaphore: Arc::new(Semaphore::new(max_concurrent)),
        }
    }
    /// Execute async tasks with concurrency limit
    ///
    /// Results are collected in input order. Tasks that panic are silently
    /// omitted, so the output can be shorter than `tasks`.
    pub async fn execute<T, F, Fut>(&self, tasks: Vec<T>, executor: F) -> Vec<Fut::Output>
    where
        T: Send + 'static,
        F: Fn(T) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future + Send + 'static,
        Fut::Output: Send + 'static,
    {
        let mut handles = Vec::new();
        for task in tasks {
            // Acquire before spawning: this loop itself stalls once
            // max_concurrent permits are out, bounding in-flight tasks.
            // unwrap(): the semaphore is never closed, so acquire can't fail.
            let permit = self.semaphore.clone().acquire_owned().await.unwrap();
            let executor = executor.clone();
            let handle = tokio::spawn(async move {
                let result = executor(task).await;
                drop(permit); // Release semaphore
                result
            });
            handles.push(handle);
        }
        // Wait for all tasks to complete
        let mut results = Vec::new();
        for handle in handles {
            if let Ok(result) = handle.await {
                results.push(result);
            }
        }
        results
    }
    /// Execute with error handling
    ///
    /// Same scheduling as `execute`, for fallible task functions: per-task
    /// `Err` values are kept in the output, but panicked tasks are dropped
    /// entirely.
    pub async fn execute_result<T, F, Fut, R, E>(
        &self,
        tasks: Vec<T>,
        executor: F,
    ) -> Vec<std::result::Result<R, E>>
    where
        T: Send + 'static,
        F: Fn(T) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future<Output = std::result::Result<R, E>> + Send + 'static,
        R: Send + 'static,
        E: Send + 'static,
    {
        let mut handles = Vec::new();
        for task in tasks {
            let permit = self.semaphore.clone().acquire_owned().await.unwrap();
            let executor = executor.clone();
            let handle = tokio::spawn(async move {
                let result = executor(task).await;
                drop(permit);
                result
            });
            handles.push(handle);
        }
        let mut results = Vec::new();
        for handle in handles {
            match handle.await {
                Ok(result) => results.push(result),
                Err(_) => continue, // Task panicked
            }
        }
        results
    }
}
/// Work-stealing parallel iterator for unbalanced workloads
///
/// Equivalent to `parallel_map_chunked` with a minimum chunk of 1: every
/// item is its own work unit, so idle threads can steal even a single
/// expensive item.
pub fn parallel_unbalanced<T, U, F>(items: Vec<T>, map_fn: F) -> Vec<U>
where
    T: Send,
    U: Send,
    F: Fn(T) -> U + Sync + Send,
{
    if parallel_enabled() {
        // Minimum chunk of 1 allows fine-grained work stealing.
        items.into_par_iter().with_min_len(1).map(map_fn).collect()
    } else {
        items.into_iter().map(map_fn).collect()
    }
}
/// Get optimal thread count for current system
///
/// NOTE(review): this reports the size of rayon's *current* pool (which
/// defaults to the logical CPU count), not an independent hardware query —
/// after `set_thread_count`, it reflects the configured value instead.
pub fn optimal_thread_count() -> usize {
    rayon::current_num_threads()
}
/// Set global thread pool size
///
/// Silently does nothing if the global rayon pool was already initialized:
/// `build_global` can succeed only once per process, and the error is
/// discarded with `.ok()`.
pub fn set_thread_count(threads: usize) {
    rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build_global()
        .ok();
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_parallel_map() {
        let data: Vec<i32> = (0..100).collect();
        let result = parallel_map_chunked(data, 10, |x| x * 2);
        // Output preserves input order regardless of parallel execution.
        assert_eq!(result.len(), 100);
        assert_eq!(result[0], 0);
        assert_eq!(result[50], 100);
        assert_eq!(result[99], 198);
    }
    #[test]
    fn test_pipeline_executor() {
        let pipeline = PipelineExecutor::new(|x: i32| x + 1, |x: i32| x * 2);
        let inputs = vec![1, 2, 3, 4, 5];
        let results = pipeline.execute_batch(inputs);
        // (x + 1) * 2 for each input.
        assert_eq!(results, vec![4, 6, 8, 10, 12]);
    }
    #[test]
    fn test_pipeline3() {
        let pipeline = Pipeline3::new(|x: i32| x + 1, |x: i32| x * 2, |x: i32| x - 1);
        let inputs = vec![1, 2, 3];
        let results = pipeline.execute_batch(inputs);
        // (1+1)*2-1 = 3, (2+1)*2-1 = 5, (3+1)*2-1 = 7
        assert_eq!(results, vec![3, 5, 7]);
    }
    #[tokio::test]
    async fn test_async_executor() {
        let executor = AsyncParallelExecutor::new(2);
        let tasks = vec![1, 2, 3, 4, 5];
        let results = executor
            .execute(tasks, |x| async move {
                tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
                x * 2
            })
            .await;
        // All five tasks complete despite the concurrency cap of 2.
        assert_eq!(results.len(), 5);
        assert!(results.contains(&2));
        assert!(results.contains(&10));
    }
    #[test]
    fn test_optimal_threads() {
        let threads = optimal_thread_count();
        // Rayon's default pool size never exceeds the logical CPU count.
        assert!(threads > 0);
        assert!(threads <= num_cpus::get());
    }
}

View File

@@ -0,0 +1,339 @@
//! Model quantization utilities
//!
//! Provides INT8 quantization for model weights and activations to reduce
//! memory usage and improve inference speed.
use std::f32;
/// Quantization parameters
///
/// Affine i8 quantization: `q = round(x / scale + zero_point)`,
/// reconstructed as `x ≈ (q - zero_point) * scale`.
#[derive(Debug, Clone, Copy)]
pub struct QuantParams {
    pub scale: f32,
    pub zero_point: i8,
}

impl QuantParams {
    /// Calculate quantization parameters from min/max values.
    ///
    /// FIX: guards the degenerate case `max == min` (and non-finite inputs,
    /// e.g. from `from_data` on an empty slice), where the original produced
    /// `scale == 0` and `min / scale` then yielded a NaN/inf zero point.
    pub fn from_range(min: f32, max: f32) -> Self {
        let qmin = i8::MIN as f32;
        let qmax = i8::MAX as f32;
        let mut scale = (max - min) / (qmax - qmin);
        if !(scale.is_finite() && scale > 0.0) {
            // Degenerate/invalid range: fall back to a unit scale so the
            // round-trip stays finite and exact for the single value.
            scale = 1.0;
        }
        // Clamp before casting so the zero point is always a valid i8 and
        // NaN can never reach the cast.
        let zero_point = (qmin - min / scale).round().clamp(qmin, qmax) as i8;
        Self { scale, zero_point }
    }
    /// Calculate from data statistics (min/max over the slice).
    pub fn from_data(data: &[f32]) -> Self {
        let min = data.iter().copied().fold(f32::INFINITY, f32::min);
        let max = data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        Self::from_range(min, max)
    }
    /// Symmetric quantization (zero_point = 0)
    // NOTE(review): `abs_max == 0` still yields scale 0 here, as in the
    // original — confirm whether symmetric callers can pass all-zero data.
    pub fn symmetric(abs_max: f32) -> Self {
        let scale = abs_max / 127.0;
        Self {
            scale,
            zero_point: 0,
        }
    }
}
/// Quantize f32 weights to i8
///
/// Derives parameters from the data's min/max, then quantizes; returns the
/// quantized values together with the parameters needed to dequantize.
pub fn quantize_weights(weights: &[f32]) -> (Vec<i8>, QuantParams) {
    let params = QuantParams::from_data(weights);
    (quantize_with_params(weights, params), params)
}

/// Quantize with given parameters
pub fn quantize_with_params(weights: &[f32], params: QuantParams) -> Vec<i8> {
    weights
        .iter()
        .copied()
        .map(|w| quantize_value(w, params))
        .collect()
}

/// Quantize single value
#[inline]
pub fn quantize_value(value: f32, params: QuantParams) -> i8 {
    let shifted = value / params.scale + f32::from(params.zero_point);
    shifted.round().clamp(i8::MIN as f32, i8::MAX as f32) as i8
}

/// Dequantize i8 to f32
pub fn dequantize(quantized: &[i8], params: QuantParams) -> Vec<f32> {
    quantized
        .iter()
        .map(|&q| dequantize_value(q, params))
        .collect()
}

/// Dequantize single value
#[inline]
pub fn dequantize_value(quantized: i8, params: QuantParams) -> f32 {
    params.scale * (f32::from(quantized) - f32::from(params.zero_point))
}
/// Quantized tensor representation
pub struct QuantizedTensor {
    pub data: Vec<i8>,
    pub params: QuantParams,
    pub shape: Vec<usize>,
}

impl QuantizedTensor {
    /// Quantize an f32 tensor with asymmetric (min/max-derived) parameters.
    pub fn from_f32(data: &[f32], shape: Vec<usize>) -> Self {
        let (data, params) = quantize_weights(data);
        Self {
            data,
            params,
            shape,
        }
    }

    /// Quantize an f32 tensor symmetrically (zero_point = 0).
    pub fn from_f32_symmetric(data: &[f32], shape: Vec<usize>) -> Self {
        let abs_max = data.iter().fold(0.0f32, |acc, x| acc.max(x.abs()));
        let params = QuantParams::symmetric(abs_max);
        Self {
            data: quantize_with_params(data, params),
            params,
            shape,
        }
    }

    /// Recover the (lossy) f32 representation.
    pub fn to_f32(&self) -> Vec<f32> {
        dequantize(&self.data, self.params)
    }

    /// Total size in bytes: i8 payload plus params plus the shape vector.
    pub fn size_bytes(&self) -> usize {
        let overhead =
            std::mem::size_of::<QuantParams>() + self.shape.len() * std::mem::size_of::<usize>();
        self.data.len() + overhead
    }

    /// How many times smaller this is than the equivalent f32 buffer.
    pub fn compression_ratio(&self) -> f32 {
        let f32_bytes = (self.data.len() * std::mem::size_of::<f32>()) as f32;
        f32_bytes / self.size_bytes() as f32
    }
}
/// Per-channel quantization for conv/linear layers
pub struct PerChannelQuant {
    pub data: Vec<i8>,
    // One QuantParams per output channel (indexed by shape[0]).
    pub params: Vec<QuantParams>,
    pub shape: Vec<usize>,
}

impl PerChannelQuant {
    /// Quantize with per-channel parameters.
    /// For a weight tensor of shape [out_channels, in_channels, ...],
    /// use separate params for each output channel.
    ///
    /// # Panics
    /// Panics if `shape` is empty, if `shape[0]` is zero, or if `data.len()`
    /// is not a multiple of `shape[0]`. (FIX: the latter two previously
    /// surfaced as an opaque divide-by-zero panic, or silently dropped
    /// trailing elements from the last channel.)
    pub fn from_f32(data: &[f32], shape: Vec<usize>) -> Self {
        if shape.is_empty() {
            panic!("Shape cannot be empty");
        }
        let out_channels = shape[0];
        assert!(out_channels > 0, "out_channels must be non-zero");
        assert_eq!(
            data.len() % out_channels,
            0,
            "data length {} is not divisible by out_channels {}",
            data.len(),
            out_channels
        );
        let channel_size = data.len() / out_channels;
        let mut all_quantized = Vec::with_capacity(data.len());
        let mut params = Vec::with_capacity(out_channels);
        for ch in 0..out_channels {
            let start = ch * channel_size;
            let channel_data = &data[start..start + channel_size];
            // Each output channel gets its own scale/zero-point.
            let ch_params = QuantParams::from_data(channel_data);
            all_quantized.extend(quantize_with_params(channel_data, ch_params));
            params.push(ch_params);
        }
        Self {
            data: all_quantized,
            params,
            shape,
        }
    }
    /// Dequantize to f32, channel by channel, in channel order.
    pub fn to_f32(&self) -> Vec<f32> {
        let out_channels = self.shape[0];
        let channel_size = self.data.len() / out_channels;
        let mut result = Vec::with_capacity(self.data.len());
        for ch in 0..out_channels {
            let start = ch * channel_size;
            let channel_data = &self.data[start..start + channel_size];
            result.extend(dequantize(channel_data, self.params[ch]));
        }
        result
    }
}
/// Dynamic quantization - quantize at runtime
///
/// Calibrates the quantization range from the data itself at call time,
/// clipping outliers beyond a configurable percentile.
pub struct DynamicQuantizer {
    /// Calibration percentile (e.g. 99.9); values beyond it are clipped.
    percentile: f32,
}
impl DynamicQuantizer {
    /// Create quantizer with calibration percentile
    /// percentile: clip values beyond this percentile (e.g., 99.9)
    pub fn new(percentile: f32) -> Self {
        Self { percentile }
    }

    /// Quantize with calibration.
    ///
    /// Sorts the data and clips the range to the order statistic at the
    /// configured percentile from the top and the mirrored order
    /// statistic from the bottom, so outliers on either tail cannot
    /// inflate the quantization scale.
    ///
    /// # Panics
    /// Panics if `data` is empty or contains NaN.
    pub fn quantize(&self, data: &[f32]) -> (Vec<i8>, QuantParams) {
        let mut sorted: Vec<f32> = data.to_vec();
        sorted.sort_by(|a, b| a.partial_cmp(b).expect("NaN in quantizer input"));
        let hi_idx =
            ((sorted.len() as f32 * self.percentile / 100.0) as usize).min(sorted.len() - 1);
        // Mirror the upper index onto the lower tail so the same fraction
        // of outliers is clipped at both ends. The lower bound must be an
        // actual low-order statistic; negating a value near the minimum
        // (as before) yields a meaningless bound for all-positive data.
        let lo_idx = sorted.len() - 1 - hi_idx;
        let params = QuantParams::from_range(sorted[lo_idx], sorted[hi_idx]);
        let quantized = quantize_with_params(data, params);
        (quantized, params)
    }
}
/// Calculate quantization error (MSE)
///
/// Mean squared error between `original` and the dequantized
/// reconstruction of `quantized`.
pub fn quantization_error(original: &[f32], quantized: &[i8], params: QuantParams) -> f32 {
    let restored = dequantize(quantized, params);
    let sum_sq = original
        .iter()
        .zip(restored.iter())
        .fold(0.0f32, |acc, (o, d)| acc + (o - d).powi(2));
    sum_sq / original.len() as f32
}
/// Calculate signal-to-quantization-noise ratio (SQNR) in dB
///
/// Ratio of mean signal power to mean quantization-noise power,
/// expressed in decibels (higher is better).
pub fn sqnr(original: &[f32], quantized: &[i8], params: QuantParams) -> f32 {
    let n = original.len() as f32;
    let restored = dequantize(quantized, params);
    let signal_power = original.iter().map(|x| x.powi(2)).sum::<f32>() / n;
    let noise_power = original
        .iter()
        .zip(restored.iter())
        .map(|(o, d)| (o - d).powi(2))
        .sum::<f32>()
        / n;
    10.0 * (signal_power / noise_power).log10()
}
#[cfg(test)]
mod tests {
    use super::*;
    // Round-trip: quantize then dequantize should stay within roughly
    // one quantization step of the original values.
    #[test]
    fn test_quantize_dequantize() {
        let weights = vec![0.0, 0.5, 1.0, -0.5, -1.0];
        let (quantized, params) = quantize_weights(&weights);
        let dequantized = dequantize(&quantized, params);
        // Check approximate equality
        for (orig, deq) in weights.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 0.01, "orig: {}, deq: {}", orig, deq);
        }
    }
    // Symmetric params: zero point must be 0 and scale abs_max / 127.
    #[test]
    fn test_symmetric_quantization() {
        let data = vec![-1.0, -0.5, 0.0, 0.5, 1.0];
        let params = QuantParams::symmetric(1.0);
        assert_eq!(params.zero_point, 0);
        assert!((params.scale - 1.0 / 127.0).abs() < 1e-6);
        let quantized = quantize_with_params(&data, params);
        assert_eq!(quantized[2], 0); // 0.0 should map to 0
    }
    // QuantizedTensor preserves shape/length and round-trips values.
    #[test]
    fn test_quantized_tensor() {
        let data = vec![1.0, 2.0, 3.0, 4.0];
        let tensor = QuantizedTensor::from_f32(&data, vec![2, 2]);
        assert_eq!(tensor.shape, vec![2, 2]);
        assert_eq!(tensor.data.len(), 4);
        let dequantized = tensor.to_f32();
        for (orig, deq) in data.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 0.1);
        }
    }
    // Channels with very different magnitudes each get their own params,
    // so both round-trip within a coarse tolerance.
    #[test]
    fn test_per_channel_quant() {
        // 2 channels, 3 values each
        let data = vec![
            1.0, 2.0, 3.0, // Channel 0
            10.0, 20.0, 30.0, // Channel 1
        ];
        let quant = PerChannelQuant::from_f32(&data, vec![2, 3]);
        assert_eq!(quant.params.len(), 2);
        let dequantized = quant.to_f32();
        for (orig, deq) in data.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 1.0);
        }
    }
    // MSE should be small and SQNR high for a smooth, small-range input.
    #[test]
    fn test_quantization_error() {
        let original = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let (quantized, params) = quantize_weights(&original);
        let error = quantization_error(&original, &quantized, params);
        assert!(error < 0.1); // Should be small for simple data
        let snr = sqnr(&original, &quantized, params);
        assert!(snr > 30.0); // Should have good SNR
    }
    // For a large tensor the params/shape overhead amortizes away and the
    // ratio approaches 4x (f32 -> i8).
    #[test]
    fn test_compression_ratio() {
        let data: Vec<f32> = (0..1000).map(|i| i as f32 / 1000.0).collect();
        let tensor = QuantizedTensor::from_f32(&data, vec![1000]);
        let ratio = tensor.compression_ratio();
        assert!(ratio > 3.5); // Should be ~4x compression
    }
    // Percentile calibration: an extreme outlier should not dominate the
    // quantization range.
    #[test]
    fn test_dynamic_quantizer() {
        let mut data: Vec<f32> = (0..100).map(|i| i as f32).collect();
        data.push(1000.0); // Outlier
        let quantizer = DynamicQuantizer::new(99.0);
        let (quantized, params) = quantizer.quantize(&data);
        assert_eq!(quantized.len(), 101);
        // The outlier should be clipped
        assert!(params.scale > 0.0);
    }
}

View File

@@ -0,0 +1,597 @@
//! SIMD-accelerated image processing operations
//!
//! Provides optimized implementations for common image operations using
//! AVX2, AVX-512, and ARM NEON intrinsics.
use super::{get_features, simd_enabled};
/// Convert RGBA image to grayscale using optimized SIMD operations
///
/// Runtime dispatcher: picks AVX2, SSE4.2, or NEON based on detected CPU
/// features, falling back to the scalar path when SIMD is disabled or
/// unavailable. `gray.len()` must equal `rgba.len() / 4` (enforced by
/// the scalar fallback's assert).
pub fn simd_grayscale(rgba: &[u8], gray: &mut [u8]) {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_grayscale(rgba, gray);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_grayscale(rgba, gray) }
        } else if features.sse4_2 {
            unsafe { sse_grayscale(rgba, gray) }
        } else {
            scalar_grayscale(rgba, gray)
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if features.neon {
            unsafe { neon_grayscale(rgba, gray) }
        } else {
            scalar_grayscale(rgba, gray)
        }
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        scalar_grayscale(rgba, gray)
    }
}
/// Scalar fallback for grayscale conversion.
///
/// Computes fixed-point ITU-R BT.601 luma (0.299 R + 0.587 G + 0.114 B)
/// per pixel; the alpha channel is ignored. Panics unless
/// `rgba.len() / 4 == gray.len()`.
fn scalar_grayscale(rgba: &[u8], gray: &mut [u8]) {
    assert_eq!(
        rgba.len() / 4,
        gray.len(),
        "RGBA length must be 4x grayscale length"
    );
    for (pixel, out) in rgba.chunks_exact(4).zip(gray.iter_mut()) {
        let (r, g, b) = (pixel[0] as u32, pixel[1] as u32, pixel[2] as u32);
        // Fixed-point weights: 77/256 ≈ 0.299, 150/256 ≈ 0.587, 29/256 ≈ 0.114
        *out = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
    }
}
/// AVX2 grayscale conversion, 8 pixels per iteration.
///
/// NOTE(review): the 256-bit load below is currently unused — the
/// per-pixel math is still scalar, so this path is only a scaffold for a
/// future shuffle-based vectorized implementation.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU and that
/// `rgba.len() >= gray.len() * 4`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_grayscale(rgba: &[u8], gray: &mut [u8]) {
    use std::arch::x86_64::*;
    let len = gray.len();
    let mut i = 0;
    // Process 8 pixels at a time (32 RGBA bytes)
    while i + 8 <= len {
        // Load 32 bytes (8 RGBA pixels)
        let rgba_ptr = rgba.as_ptr().add(i * 4);
        let _pixels = _mm256_loadu_si256(rgba_ptr as *const __m256i);
        // Separate RGBA channels (simplified - actual implementation would use shuffles)
        // For production, use proper channel extraction
        // Store grayscale result
        for j in 0..8 {
            let pixel_idx = (i + j) * 4;
            let r = *rgba.get_unchecked(pixel_idx) as u32;
            let g = *rgba.get_unchecked(pixel_idx + 1) as u32;
            let b = *rgba.get_unchecked(pixel_idx + 2) as u32;
            // Same BT.601 fixed-point weights as the scalar path.
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 8;
    }
    // Handle remaining pixels
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
/// SSE4.2 grayscale conversion, 4 pixels per iteration.
///
/// NOTE(review): no SSE intrinsics are actually used yet — the loop is
/// scalar math unrolled four pixels at a time; the wildcard import is
/// kept (and silenced) for a future vectorized version.
///
/// # Safety
/// Caller must ensure `rgba.len() >= gray.len() * 4`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn sse_grayscale(rgba: &[u8], gray: &mut [u8]) {
    #[allow(unused_imports)]
    use std::arch::x86_64::*;
    let len = gray.len();
    let mut i = 0;
    // Process 4 pixels at a time (16 RGBA bytes)
    while i + 4 <= len {
        for j in 0..4 {
            let pixel_idx = (i + j) * 4;
            let r = *rgba.get_unchecked(pixel_idx) as u32;
            let g = *rgba.get_unchecked(pixel_idx + 1) as u32;
            let b = *rgba.get_unchecked(pixel_idx + 2) as u32;
            // Same BT.601 fixed-point weights as the scalar path.
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 4;
    }
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
/// NEON grayscale conversion (aarch64), 8 pixels per iteration.
///
/// NOTE(review): despite the `std::arch::aarch64` import, no NEON
/// intrinsics are used yet — the loop is scalar math unrolled eight
/// pixels at a time.
///
/// # Safety
/// Caller must ensure `rgba.len() >= gray.len() * 4`.
#[cfg(target_arch = "aarch64")]
unsafe fn neon_grayscale(rgba: &[u8], gray: &mut [u8]) {
    use std::arch::aarch64::*;
    let len = gray.len();
    let mut i = 0;
    // Process 8 pixels at a time
    while i + 8 <= len {
        for j in 0..8 {
            let idx = (i + j) * 4;
            let r = *rgba.get_unchecked(idx) as u32;
            let g = *rgba.get_unchecked(idx + 1) as u32;
            let b = *rgba.get_unchecked(idx + 2) as u32;
            // Same BT.601 fixed-point weights as the scalar path.
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 8;
    }
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
/// Apply threshold to grayscale image using SIMD
///
/// Maps each pixel to 255 or 0 depending on its relation to `thresh`
/// (see the scalar/AVX2 implementations for the exact comparison).
/// Dispatches to AVX2 when available, scalar otherwise. `out` must be
/// at least as long as `gray`.
pub fn simd_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_threshold(gray, thresh, out);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_threshold(gray, thresh, out) }
        } else {
            scalar_threshold(gray, thresh, out)
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_threshold(gray, thresh, out)
    }
}
/// Scalar threshold fallback: 255 where the pixel is strictly greater
/// than `thresh`, 0 otherwise.
///
/// The comparison is strict (`>`) to agree with the AVX2 path
/// (`_mm256_cmpgt_epi8` is a strict compare) and with the unit tests,
/// which expect a pixel equal to the threshold to map to 0.
fn scalar_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    for (g, o) in gray.iter().zip(out.iter_mut()) {
        *o = if *g > thresh { 255 } else { 0 };
    }
}
/// AVX2 thresholding, 32 pixels per iteration; tail handled by the
/// scalar fallback. Output is 255 where `gray > thresh`, else 0.
///
/// `_mm256_cmpgt_epi8` performs a *signed* byte compare, but pixel
/// values are unsigned — without correction every pixel >= 128 compares
/// as negative and is wrongly mapped to 0. Flipping the sign bit of
/// both operands (`x ^ 0x80`) makes the signed compare order the values
/// as unsigned.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU and that
/// `out.len() >= gray.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    use std::arch::x86_64::*;
    let len = gray.len();
    let mut i = 0;
    let sign_flip = _mm256_set1_epi8(i8::MIN); // 0x80 in every lane
    // Pre-bias the threshold once; each pixel vector is biased per load.
    let thresh_vec = _mm256_xor_si256(_mm256_set1_epi8(thresh as i8), sign_flip);
    // Process 32 bytes at a time
    while i + 32 <= len {
        let gray_vec = _mm256_loadu_si256(gray.as_ptr().add(i) as *const __m256i);
        let biased = _mm256_xor_si256(gray_vec, sign_flip);
        // The compare mask is already 0xFF where gray > thresh and 0x00
        // elsewhere, so it can be stored directly.
        let result = _mm256_cmpgt_epi8(biased, thresh_vec);
        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut __m256i, result);
        i += 32;
    }
    // Handle remaining bytes
    scalar_threshold(&gray[i..], thresh, &mut out[i..]);
}
/// Normalize f32 tensor data using SIMD
///
/// In-place z-score normalization: subtracts the mean and divides by the
/// standard deviation (plus a small epsilon). Dispatches to AVX2 when
/// available, scalar otherwise.
pub fn simd_normalize(data: &mut [f32]) {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_normalize(data);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_normalize(data) }
        } else {
            scalar_normalize(data)
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_normalize(data)
    }
}
/// Scalar z-score normalization: shifts `data` to zero mean and unit
/// variance in place.
fn scalar_normalize(data: &mut [f32]) {
    let n = data.len() as f32;
    let mean = data.iter().copied().sum::<f32>() / n;
    let variance = data.iter().fold(0.0f32, |acc, x| acc + (x - mean).powi(2)) / n;
    // Epsilon keeps the divide finite for constant (zero-variance) input.
    let std_dev = variance.sqrt() + 1e-8;
    for value in data.iter_mut() {
        *value = (*value - mean) / std_dev;
    }
}
/// AVX2 z-score normalization (zero mean, unit variance) in place.
///
/// Three vectorized passes — sum for the mean, sum of squared deviations
/// for the variance, then the normalize itself — each finishing the
/// non-multiple-of-8 tail with scalar code.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_normalize(data: &mut [f32]) {
    use std::arch::x86_64::*;
    // Calculate mean using SIMD
    let len = data.len();
    let mut sum = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        sum = _mm256_add_ps(sum, vals);
        i += 8;
    }
    // Horizontal sum
    let sum_scalar = {
        // SAFETY: __m256 and [f32; 8] have the same size; the transmute
        // only reinterprets the 8 lanes for a scalar horizontal add.
        let sum_arr: [f32; 8] = std::mem::transmute(sum);
        // Add the vector lanes plus the scalar tail beyond index `i`.
        sum_arr.iter().sum::<f32>() + data[i..].iter().sum::<f32>()
    };
    let mean = sum_scalar / len as f32;
    let mean_vec = _mm256_set1_ps(mean);
    // Calculate variance
    let mut var_sum = _mm256_setzero_ps();
    i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        let diff = _mm256_sub_ps(vals, mean_vec);
        let sq = _mm256_mul_ps(diff, diff);
        var_sum = _mm256_add_ps(var_sum, sq);
        i += 8;
    }
    let var_scalar = {
        // SAFETY: same lane reinterpretation as for the mean above.
        let var_arr: [f32; 8] = std::mem::transmute(var_sum);
        var_arr.iter().sum::<f32>() + data[i..].iter().map(|x| (x - mean).powi(2)).sum::<f32>()
    };
    // Epsilon keeps the divide finite for constant (zero-variance) input.
    let std_dev = (var_scalar / len as f32).sqrt() + 1e-8;
    let std_vec = _mm256_set1_ps(std_dev);
    // Normalize
    i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        let centered = _mm256_sub_ps(vals, mean_vec);
        let normalized = _mm256_div_ps(centered, std_vec);
        _mm256_storeu_ps(data.as_mut_ptr().add(i), normalized);
        i += 8;
    }
    // Handle remaining elements
    for x in &mut data[i..] {
        *x = (*x - mean) / std_dev;
    }
}
/// Fast bilinear resize using SIMD - optimized for preprocessing
/// This is significantly faster than the image crate's resize for typical OCR sizes
///
/// Dispatches to the AVX2 implementation when available, otherwise the
/// scalar fallback. `src` is a single-channel (grayscale) image of
/// `src_width * src_height` bytes; returns a `dst_width * dst_height`
/// buffer.
pub fn simd_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_resize_bilinear(src, src_width, src_height, dst_width, dst_height) }
        } else {
            scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height)
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height)
    }
}
/// Scalar bilinear resize implementation
///
/// Maps every destination pixel back into the source grid and blends the
/// four surrounding source pixels by their fractional distances.
fn scalar_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;
    // Linear blend: t = 0 yields a, t = 1 yields b.
    let lerp = |a: f32, b: f32, t: f32| a * (1.0 - t) + b * t;
    let mut dst = vec![0u8; dst_width * dst_height];
    for y in 0..dst_height {
        let src_y = y as f32 * y_scale;
        let y0 = (src_y.floor() as usize).min(src_height - 1);
        let y1 = (y0 + 1).min(src_height - 1);
        let y_frac = src_y - src_y.floor();
        for x in 0..dst_width {
            let src_x = x as f32 * x_scale;
            let x0 = (src_x.floor() as usize).min(src_width - 1);
            let x1 = (x0 + 1).min(src_width - 1);
            let x_frac = src_x - src_x.floor();
            // Blend horizontally along the two source rows, then vertically.
            let top = lerp(
                src[y0 * src_width + x0] as f32,
                src[y0 * src_width + x1] as f32,
                x_frac,
            );
            let bottom = lerp(
                src[y1 * src_width + x0] as f32,
                src[y1 * src_width + x1] as f32,
                x_frac,
            );
            dst[y * dst_width + x] = lerp(top, bottom, y_frac).round() as u8;
        }
    }
    dst
}
/// AVX2-path bilinear resize, 8 destination pixels per inner iteration.
///
/// NOTE(review): the `_y_frac` / `_y_frac_inv` vectors are computed but
/// never used — the inner loops still perform scalar per-pixel math, so
/// this is a scaffold for a future fully vectorized gather/blend
/// implementation.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU and that
/// `src.len() >= src_width * src_height`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    use std::arch::x86_64::*;
    let mut dst = vec![0u8; dst_width * dst_height];
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;
    // Process 8 output pixels at a time for x dimension
    for y in 0..dst_height {
        let src_y = y as f32 * y_scale;
        let y0 = (src_y.floor() as usize).min(src_height - 1);
        let y1 = (y0 + 1).min(src_height - 1);
        let _y_frac = _mm256_set1_ps(src_y - src_y.floor());
        let _y_frac_inv = _mm256_set1_ps(1.0 - (src_y - src_y.floor()));
        let mut x = 0;
        while x + 8 <= dst_width {
            // Calculate source x coordinates for 8 destination pixels
            let src_xs: [f32; 8] = [
                (x) as f32 * x_scale,
                (x + 1) as f32 * x_scale,
                (x + 2) as f32 * x_scale,
                (x + 3) as f32 * x_scale,
                (x + 4) as f32 * x_scale,
                (x + 5) as f32 * x_scale,
                (x + 6) as f32 * x_scale,
                (x + 7) as f32 * x_scale,
            ];
            let mut results = [0u8; 8];
            for i in 0..8 {
                let src_x = src_xs[i];
                let x0 = (src_x.floor() as usize).min(src_width - 1);
                let x1 = (x0 + 1).min(src_width - 1);
                let x_frac = src_x - src_x.floor();
                // Four neighboring source pixels around the sample point.
                let p00 = *src.get_unchecked(y0 * src_width + x0) as f32;
                let p10 = *src.get_unchecked(y0 * src_width + x1) as f32;
                let p01 = *src.get_unchecked(y1 * src_width + x0) as f32;
                let p11 = *src.get_unchecked(y1 * src_width + x1) as f32;
                // Horizontal blend on both rows, then vertical blend.
                let top = p00 * (1.0 - x_frac) + p10 * x_frac;
                let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
                let value =
                    top * (1.0 - (src_y - src_y.floor())) + bottom * (src_y - src_y.floor());
                results[i] = value.round() as u8;
            }
            for i in 0..8 {
                *dst.get_unchecked_mut(y * dst_width + x + i) = results[i];
            }
            x += 8;
        }
        // Handle remaining pixels
        while x < dst_width {
            let src_x = x as f32 * x_scale;
            let x0 = (src_x.floor() as usize).min(src_width - 1);
            let x1 = (x0 + 1).min(src_width - 1);
            let x_frac = src_x - src_x.floor();
            let p00 = *src.get_unchecked(y0 * src_width + x0) as f32;
            let p10 = *src.get_unchecked(y0 * src_width + x1) as f32;
            let p01 = *src.get_unchecked(y1 * src_width + x0) as f32;
            let p11 = *src.get_unchecked(y1 * src_width + x1) as f32;
            let top = p00 * (1.0 - x_frac) + p10 * x_frac;
            let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
            let value = top * (1.0 - (src_y - src_y.floor())) + bottom * (src_y - src_y.floor());
            *dst.get_unchecked_mut(y * dst_width + x) = value.round() as u8;
            x += 1;
        }
    }
    dst
}
/// Parallel SIMD resize for large images - splits work across threads
///
/// Each destination row depends only on `src`, so rows are distributed
/// over rayon's thread pool. Small images skip the pool because the
/// scheduling overhead would dominate.
#[cfg(feature = "rayon")]
pub fn parallel_simd_resize(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    use rayon::prelude::*;
    // For small images, use single-threaded SIMD
    if dst_height < 64 || dst_width * dst_height < 100_000 {
        return simd_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }
    let mut dst = vec![0u8; dst_width * dst_height];
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;
    // Process rows in parallel
    dst.par_chunks_mut(dst_width)
        .enumerate()
        .for_each(|(y, row)| {
            let src_y = y as f32 * y_scale;
            let y0 = (src_y.floor() as usize).min(src_height - 1);
            let y1 = (y0 + 1).min(src_height - 1);
            let y_frac = src_y - src_y.floor();
            for x in 0..dst_width {
                let src_x = x as f32 * x_scale;
                let x0 = (src_x.floor() as usize).min(src_width - 1);
                let x1 = (x0 + 1).min(src_width - 1);
                let x_frac = src_x - src_x.floor();
                // Same bilinear blend as the scalar implementation.
                let p00 = src[y0 * src_width + x0] as f32;
                let p10 = src[y0 * src_width + x1] as f32;
                let p01 = src[y1 * src_width + x0] as f32;
                let p11 = src[y1 * src_width + x1] as f32;
                let top = p00 * (1.0 - x_frac) + p10 * x_frac;
                let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
                let value = top * (1.0 - y_frac) + bottom * y_frac;
                row[x] = value.round() as u8;
            }
        });
    dst
}
/// Ultra-fast area average downscaling for preprocessing
/// Best for large images being scaled down significantly
///
/// Averages every source pixel that falls inside each destination cell.
/// Upscales (in either dimension) fall back to bilinear, where area
/// averaging does not apply.
pub fn fast_area_resize(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    // Only use area averaging for downscaling
    if dst_width >= src_width || dst_height >= src_height {
        return simd_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }
    let mut dst = vec![0u8; dst_width * dst_height];
    let x_ratio = src_width as f32 / dst_width as f32;
    let y_ratio = src_height as f32 / dst_height as f32;
    for y in 0..dst_height {
        let y_start = (y as f32 * y_ratio) as usize;
        let y_end = (((y + 1) as f32 * y_ratio) as usize).min(src_height);
        for x in 0..dst_width {
            let x_start = (x as f32 * x_ratio) as usize;
            let x_end = (((x + 1) as f32 * x_ratio) as usize).min(src_width);
            // Calculate area average
            let mut sum: u32 = 0;
            let mut count: u32 = 0;
            for sy in y_start..y_end {
                let row = &src[sy * src_width..sy * src_width + src_width];
                for &p in &row[x_start..x_end] {
                    sum += p as u32;
                    count += 1;
                }
            }
            // Round to nearest instead of truncating: plain `sum / count`
            // biases the output darker by up to one intensity level,
            // while the bilinear path rounds.
            dst[y * dst_width + x] = if count > 0 {
                ((sum + count / 2) / count) as u8
            } else {
                0
            };
        }
    }
    dst
}
#[cfg(test)]
mod tests {
    use super::*;
    // Primary colors and white should land near their BT.601 luma values.
    #[test]
    fn test_grayscale_conversion() {
        let rgba = vec![
            255, 0, 0, 255, // Red
            0, 255, 0, 255, // Green
            0, 0, 255, 255, // Blue
            255, 255, 255, 255, // White
        ];
        let mut gray = vec![0u8; 4];
        simd_grayscale(&rgba, &mut gray);
        // Check approximately correct values
        assert!(gray[0] > 50 && gray[0] < 100); // Red
        assert!(gray[1] > 130 && gray[1] < 160); // Green
        assert!(gray[2] > 20 && gray[2] < 50); // Blue
        assert_eq!(gray[3], 255); // White
    }
    // Strict threshold: a pixel exactly at `thresh` is expected to map
    // to 0, values above it to 255.
    #[test]
    fn test_threshold() {
        let gray = vec![0, 50, 100, 150, 200, 255];
        let mut out = vec![0u8; 6];
        simd_threshold(&gray, 100, &mut out);
        assert_eq!(out, vec![0, 0, 0, 255, 255, 255]);
    }
    // Z-score normalization should center the data at (approximately)
    // zero mean.
    #[test]
    fn test_normalize() {
        let mut data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        simd_normalize(&mut data);
        // After normalization, mean should be ~0 and std dev ~1
        let mean: f32 = data.iter().sum::<f32>() / data.len() as f32;
        assert!(mean.abs() < 1e-6);
    }
    // The dispatched SIMD path must produce byte-identical output to the
    // scalar reference.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_simd_vs_scalar_grayscale() {
        let rgba: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
        let mut gray_simd = vec![0u8; 256];
        let mut gray_scalar = vec![0u8; 256];
        simd_grayscale(&rgba, &mut gray_simd);
        scalar_grayscale(&rgba, &mut gray_scalar);
        assert_eq!(gray_simd, gray_scalar);
    }
}