Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
396
vendor/ruvector/examples/scipix/src/optimize/batch.rs
vendored
Normal file
396
vendor/ruvector/examples/scipix/src/optimize/batch.rs
vendored
Normal file
@@ -0,0 +1,396 @@
|
||||
//! Dynamic batching for throughput optimization
|
||||
//!
|
||||
//! Provides intelligent batching to maximize GPU/CPU utilization while
|
||||
//! maintaining acceptable latency.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::sync::{oneshot, Mutex};
|
||||
use tokio::time::sleep;
|
||||
|
||||
/// Item in the batching queue
|
||||
pub struct BatchItem<T, R> {
|
||||
pub data: T,
|
||||
pub response: oneshot::Sender<BatchResult<R>>,
|
||||
pub enqueued_at: Instant,
|
||||
}
|
||||
|
||||
/// Result of batch processing
pub type BatchResult<T> = std::result::Result<T, BatchError>;

/// Errors that can occur while queueing or processing a batch item.
#[derive(Debug, Clone)]
pub enum BatchError {
    /// The response channel closed before a result arrived.
    Timeout,
    /// The batch processor reported a failure for this item.
    ProcessingFailed(String),
    /// The queue has reached its configured capacity.
    QueueFull,
}

impl std::fmt::Display for BatchError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Self::Timeout => write!(f, "Batch processing timeout"),
            Self::ProcessingFailed(msg) => write!(f, "Processing failed: {}", msg),
            Self::QueueFull => write!(f, "Queue is full"),
        }
    }
}

impl std::error::Error for BatchError {}
|
||||
|
||||
/// Dynamic batcher configuration
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// Maximum items in a batch
    pub max_batch_size: usize,
    /// Maximum time to wait before processing partial batch
    pub max_wait_ms: u64,
    /// Maximum queue size
    pub max_queue_size: usize,
    /// Minimum batch size to prefer
    pub preferred_batch_size: usize,
}

impl Default for BatchConfig {
    /// Defaults: batches up to 32 items, flushed after at most 50 ms,
    /// with a 1000-item queue bound and a preferred batch of 16.
    fn default() -> Self {
        Self {
            max_batch_size: 32,
            max_wait_ms: 50,
            max_queue_size: 1000,
            preferred_batch_size: 16,
        }
    }
}
|
||||
|
||||
/// Dynamic batcher for throughput optimization
|
||||
pub struct DynamicBatcher<T, R> {
|
||||
config: BatchConfig,
|
||||
queue: Arc<Mutex<VecDeque<BatchItem<T, R>>>>,
|
||||
processor: Arc<dyn Fn(Vec<T>) -> Vec<std::result::Result<R, String>> + Send + Sync>,
|
||||
shutdown: Arc<Mutex<bool>>,
|
||||
}
|
||||
|
||||
impl<T, R> DynamicBatcher<T, R>
|
||||
where
|
||||
T: Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
/// Create new dynamic batcher
|
||||
pub fn new<F>(config: BatchConfig, processor: F) -> Self
|
||||
where
|
||||
F: Fn(Vec<T>) -> Vec<std::result::Result<R, String>> + Send + Sync + 'static,
|
||||
{
|
||||
Self {
|
||||
config,
|
||||
queue: Arc::new(Mutex::new(VecDeque::new())),
|
||||
processor: Arc::new(processor),
|
||||
shutdown: Arc::new(Mutex::new(false)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add item to batch queue
|
||||
pub async fn add(&self, item: T) -> BatchResult<R> {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
let batch_item = BatchItem {
|
||||
data: item,
|
||||
response: tx,
|
||||
enqueued_at: Instant::now(),
|
||||
};
|
||||
|
||||
{
|
||||
let mut queue = self.queue.lock().await;
|
||||
if queue.len() >= self.config.max_queue_size {
|
||||
return Err(BatchError::QueueFull);
|
||||
}
|
||||
queue.push_back(batch_item);
|
||||
}
|
||||
|
||||
// Wait for response
|
||||
rx.await.map_err(|_| BatchError::Timeout)?
|
||||
}
|
||||
|
||||
/// Start batch processing loop
|
||||
pub async fn run(&self) {
|
||||
let mut last_process = Instant::now();
|
||||
|
||||
loop {
|
||||
// Check if shutdown requested
|
||||
{
|
||||
let shutdown = self.shutdown.lock().await;
|
||||
if *shutdown {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let should_process = {
|
||||
let queue = self.queue.lock().await;
|
||||
queue.len() >= self.config.max_batch_size
|
||||
|| (queue.len() >= self.config.preferred_batch_size
|
||||
&& last_process.elapsed().as_millis() >= self.config.max_wait_ms as u128)
|
||||
|| (queue.len() > 0
|
||||
&& last_process.elapsed().as_millis() >= self.config.max_wait_ms as u128)
|
||||
};
|
||||
|
||||
if should_process {
|
||||
self.process_batch().await;
|
||||
last_process = Instant::now();
|
||||
} else {
|
||||
// Sleep briefly to avoid busy waiting
|
||||
sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
}
|
||||
|
||||
// Process remaining items before shutdown
|
||||
self.process_batch().await;
|
||||
}
|
||||
|
||||
/// Process current batch
|
||||
async fn process_batch(&self) {
|
||||
let items = {
|
||||
let mut queue = self.queue.lock().await;
|
||||
let batch_size = self.config.max_batch_size.min(queue.len());
|
||||
if batch_size == 0 {
|
||||
return;
|
||||
}
|
||||
queue.drain(..batch_size).collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
if items.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract data and response channels
|
||||
let (data, responses): (Vec<_>, Vec<_>) = items
|
||||
.into_iter()
|
||||
.map(|item| (item.data, item.response))
|
||||
.unzip();
|
||||
|
||||
// Process batch
|
||||
let results = (self.processor)(data);
|
||||
|
||||
// Send responses
|
||||
for (response_tx, result) in responses.into_iter().zip(results.into_iter()) {
|
||||
let batch_result = result.map_err(|e| BatchError::ProcessingFailed(e));
|
||||
let _ = response_tx.send(batch_result);
|
||||
}
|
||||
}
|
||||
|
||||
/// Gracefully shutdown the batcher
|
||||
pub async fn shutdown(&self) {
|
||||
let mut shutdown = self.shutdown.lock().await;
|
||||
*shutdown = true;
|
||||
}
|
||||
|
||||
/// Get current queue size
|
||||
pub async fn queue_size(&self) -> usize {
|
||||
self.queue.lock().await.len()
|
||||
}
|
||||
|
||||
/// Get current queue statistics
|
||||
pub async fn stats(&self) -> BatchStats {
|
||||
let queue = self.queue.lock().await;
|
||||
let queue_size = queue.len();
|
||||
|
||||
let max_wait = queue
|
||||
.front()
|
||||
.map(|item| item.enqueued_at.elapsed())
|
||||
.unwrap_or(Duration::from_secs(0));
|
||||
|
||||
BatchStats {
|
||||
queue_size,
|
||||
max_wait_time: max_wait,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Batch statistics
///
/// Snapshot of queue state as returned by `DynamicBatcher::stats`.
#[derive(Debug, Clone)]
pub struct BatchStats {
    /// Number of items currently queued.
    pub queue_size: usize,
    /// Age of the oldest queued item (zero when the queue is empty).
    pub max_wait_time: Duration,
}
|
||||
|
||||
/// Adaptive batcher that adjusts batch size based on latency
|
||||
pub struct AdaptiveBatcher<T, R> {
|
||||
inner: DynamicBatcher<T, R>,
|
||||
config: Arc<Mutex<BatchConfig>>,
|
||||
latency_history: Arc<Mutex<VecDeque<Duration>>>,
|
||||
target_latency: Duration,
|
||||
}
|
||||
|
||||
impl<T, R> AdaptiveBatcher<T, R>
|
||||
where
|
||||
T: Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
/// Create adaptive batcher with target latency
|
||||
pub fn new<F>(initial_config: BatchConfig, target_latency: Duration, processor: F) -> Self
|
||||
where
|
||||
F: Fn(Vec<T>) -> Vec<Result<R, String>> + Send + Sync + 'static,
|
||||
{
|
||||
let config = Arc::new(Mutex::new(initial_config.clone()));
|
||||
let inner = DynamicBatcher::new(initial_config, processor);
|
||||
|
||||
Self {
|
||||
inner,
|
||||
config,
|
||||
latency_history: Arc::new(Mutex::new(VecDeque::with_capacity(100))),
|
||||
target_latency,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add item and adapt batch size
|
||||
pub async fn add(&self, item: T) -> Result<R, BatchError> {
|
||||
let start = Instant::now();
|
||||
let result = self.inner.add(item).await;
|
||||
let latency = start.elapsed();
|
||||
|
||||
// Record latency
|
||||
{
|
||||
let mut history = self.latency_history.lock().await;
|
||||
history.push_back(latency);
|
||||
if history.len() > 100 {
|
||||
history.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
// Adapt batch size every 10 requests
|
||||
{
|
||||
let history = self.latency_history.lock().await;
|
||||
if history.len() % 10 == 0 && history.len() >= 10 {
|
||||
let avg_latency: Duration = history.iter().sum::<Duration>() / history.len() as u32;
|
||||
|
||||
let mut config = self.config.lock().await;
|
||||
if avg_latency > self.target_latency {
|
||||
// Reduce batch size to lower latency
|
||||
config.max_batch_size = (config.max_batch_size * 9 / 10).max(1);
|
||||
} else if avg_latency < self.target_latency / 2 {
|
||||
// Increase batch size for better throughput
|
||||
config.max_batch_size = (config.max_batch_size * 11 / 10).min(128);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Run the batcher
|
||||
pub async fn run(&self) {
|
||||
self.inner.run().await;
|
||||
}
|
||||
|
||||
/// Get current configuration
|
||||
pub async fn current_config(&self) -> BatchConfig {
|
||||
self.config.lock().await.clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_dynamic_batcher() {
        let config = BatchConfig {
            max_batch_size: 4,
            max_wait_ms: 100,
            max_queue_size: 100,
            preferred_batch_size: 2,
        };

        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(|x| Ok(x * 2)).collect()
        }));

        // Start processing loop
        let batcher_clone = batcher.clone();
        tokio::spawn(async move {
            batcher_clone.run().await;
        });

        // Add items concurrently
        let mut handles = vec![];
        for i in 0..8 {
            let batcher = batcher.clone();
            handles.push(tokio::spawn(async move { batcher.add(i).await }));
        }

        // Wait for results
        for (i, handle) in handles.into_iter().enumerate() {
            let result = handle.await.unwrap().unwrap();
            assert_eq!(result, (i as i32) * 2);
        }

        batcher.shutdown().await;
    }

    #[tokio::test]
    async fn test_batch_stats() {
        let config = BatchConfig::default();
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(Ok).collect()
        }));

        // `add()` is an async fn and futures are lazy: the old
        // `let _ = batcher.add(1);` never polled the future, so nothing was
        // ever enqueued and `queue_size` stayed 0. Spawn the adds instead;
        // no run loop is started, so the items remain queued.
        for n in 1..=3 {
            let b = batcher.clone();
            tokio::spawn(async move {
                let _ = b.add(n).await;
            });
        }

        // Give the spawned tasks a chance to enqueue.
        sleep(Duration::from_millis(50)).await;

        let stats = batcher.stats().await;
        assert_eq!(stats.queue_size, 3);
    }

    #[tokio::test]
    async fn test_queue_full() {
        let config = BatchConfig {
            max_queue_size: 2,
            ..Default::default()
        };

        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(Ok).collect()
        }));

        // Fill the queue: spawn the adds (they park waiting for a response
        // that never comes, since no run loop is started). The old test never
        // polled its `add` futures, so the queue was actually empty.
        for n in 1..=2 {
            let b = batcher.clone();
            tokio::spawn(async move {
                let _ = b.add(n).await;
            });
        }
        sleep(Duration::from_millis(50)).await;

        // This should fail - queue is full
        let result = batcher.add(3).await;
        assert!(matches!(result, Err(BatchError::QueueFull)));
    }

    #[tokio::test]
    async fn test_adaptive_batcher() {
        let config = BatchConfig {
            max_batch_size: 8,
            max_wait_ms: 50,
            max_queue_size: 100,
            preferred_batch_size: 4,
        };

        let batcher = Arc::new(AdaptiveBatcher::new(
            config,
            Duration::from_millis(100),
            |items: Vec<i32>| items.into_iter().map(|x| Ok(x * 2)).collect(),
        ));

        let batcher_clone = batcher.clone();
        tokio::spawn(async move {
            batcher_clone.run().await;
        });

        // Process some requests
        for i in 0..20 {
            let result = batcher.add(i).await.unwrap();
            assert_eq!(result, i * 2);
        }

        // Configuration should remain valid after any adaptation
        let final_config = batcher.current_config().await;
        assert!(final_config.max_batch_size > 0);
    }
}
|
||||
409
vendor/ruvector/examples/scipix/src/optimize/memory.rs
vendored
Normal file
409
vendor/ruvector/examples/scipix/src/optimize/memory.rs
vendored
Normal file
@@ -0,0 +1,409 @@
|
||||
//! Memory optimization utilities
|
||||
//!
|
||||
//! Provides object pooling, memory-mapped file loading, and zero-copy operations.
|
||||
|
||||
use memmap2::{Mmap, MmapOptions};
|
||||
use std::collections::VecDeque;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use super::memory_opt_enabled;
|
||||
use crate::error::{Result, ScipixError};
|
||||
|
||||
/// Object pool for reusable buffers
|
||||
pub struct BufferPool<T> {
|
||||
pool: Arc<Mutex<VecDeque<T>>>,
|
||||
factory: Arc<dyn Fn() -> T + Send + Sync>,
|
||||
#[allow(dead_code)]
|
||||
max_size: usize,
|
||||
}
|
||||
|
||||
impl<T: Send + 'static> BufferPool<T> {
|
||||
/// Create a new buffer pool
|
||||
pub fn new<F>(factory: F, initial_size: usize, max_size: usize) -> Self
|
||||
where
|
||||
F: Fn() -> T + Send + Sync + 'static,
|
||||
{
|
||||
let factory = Arc::new(factory);
|
||||
let pool = Arc::new(Mutex::new(VecDeque::with_capacity(max_size)));
|
||||
|
||||
// Pre-allocate initial buffers
|
||||
if memory_opt_enabled() {
|
||||
let mut pool_lock = pool.lock().unwrap();
|
||||
for _ in 0..initial_size {
|
||||
pool_lock.push_back(factory());
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
pool,
|
||||
factory,
|
||||
max_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquire a buffer from the pool
|
||||
pub fn acquire(&self) -> PooledBuffer<T> {
|
||||
let buffer = if memory_opt_enabled() {
|
||||
self.pool
|
||||
.lock()
|
||||
.unwrap()
|
||||
.pop_front()
|
||||
.unwrap_or_else(|| (self.factory)())
|
||||
} else {
|
||||
(self.factory)()
|
||||
};
|
||||
|
||||
PooledBuffer {
|
||||
buffer: Some(buffer),
|
||||
pool: self.pool.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current pool size
|
||||
pub fn size(&self) -> usize {
|
||||
self.pool.lock().unwrap().len()
|
||||
}
|
||||
|
||||
/// Clear the pool
|
||||
pub fn clear(&self) {
|
||||
self.pool.lock().unwrap().clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII guard for pooled buffers
|
||||
pub struct PooledBuffer<T> {
|
||||
buffer: Option<T>,
|
||||
pool: Arc<Mutex<VecDeque<T>>>,
|
||||
}
|
||||
|
||||
impl<T> PooledBuffer<T> {
|
||||
/// Get mutable reference to buffer
|
||||
pub fn get_mut(&mut self) -> &mut T {
|
||||
self.buffer.as_mut().unwrap()
|
||||
}
|
||||
|
||||
/// Get immutable reference to buffer
|
||||
pub fn get(&self) -> &T {
|
||||
self.buffer.as_ref().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for PooledBuffer<T> {
|
||||
fn drop(&mut self) {
|
||||
if memory_opt_enabled() {
|
||||
if let Some(buffer) = self.buffer.take() {
|
||||
let mut pool = self.pool.lock().unwrap();
|
||||
pool.push_back(buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::ops::Deref for PooledBuffer<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.buffer.as_ref().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::ops::DerefMut for PooledBuffer<T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.buffer.as_mut().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
/// Memory-mapped model file
|
||||
pub struct MmapModel {
|
||||
_mmap: Mmap,
|
||||
data: *const u8,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
unsafe impl Send for MmapModel {}
|
||||
unsafe impl Sync for MmapModel {}
|
||||
|
||||
impl MmapModel {
|
||||
/// Load model from file using memory mapping
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let file = File::open(path.as_ref()).map_err(|e| ScipixError::Io(e))?;
|
||||
|
||||
let mmap = unsafe {
|
||||
MmapOptions::new()
|
||||
.map(&file)
|
||||
.map_err(|e| ScipixError::Io(e))?
|
||||
};
|
||||
|
||||
let data = mmap.as_ptr();
|
||||
let len = mmap.len();
|
||||
|
||||
Ok(Self {
|
||||
_mmap: mmap,
|
||||
data,
|
||||
len,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get slice of model data
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
unsafe { std::slice::from_raw_parts(self.data, self.len) }
|
||||
}
|
||||
|
||||
/// Get size of mapped region
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Check if empty
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len == 0
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-copy image view
|
||||
pub struct ImageView<'a> {
|
||||
data: &'a [u8],
|
||||
width: u32,
|
||||
height: u32,
|
||||
channels: u8,
|
||||
}
|
||||
|
||||
impl<'a> ImageView<'a> {
|
||||
/// Create new image view from raw data
|
||||
pub fn new(data: &'a [u8], width: u32, height: u32, channels: u8) -> Result<Self> {
|
||||
let expected_len = (width * height * channels as u32) as usize;
|
||||
if data.len() != expected_len {
|
||||
return Err(ScipixError::InvalidInput(format!(
|
||||
"Invalid data length: expected {}, got {}",
|
||||
expected_len,
|
||||
data.len()
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
data,
|
||||
width,
|
||||
height,
|
||||
channels,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get pixel at (x, y)
|
||||
pub fn pixel(&self, x: u32, y: u32) -> &[u8] {
|
||||
let offset = ((y * self.width + x) * self.channels as u32) as usize;
|
||||
&self.data[offset..offset + self.channels as usize]
|
||||
}
|
||||
|
||||
/// Get raw data slice
|
||||
pub fn data(&self) -> &[u8] {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Get dimensions
|
||||
pub fn dimensions(&self) -> (u32, u32) {
|
||||
(self.width, self.height)
|
||||
}
|
||||
|
||||
/// Get number of channels
|
||||
pub fn channels(&self) -> u8 {
|
||||
self.channels
|
||||
}
|
||||
|
||||
/// Create subview (region of interest)
|
||||
pub fn subview(&self, x: u32, y: u32, width: u32, height: u32) -> Result<Self> {
|
||||
if x + width > self.width || y + height > self.height {
|
||||
return Err(ScipixError::InvalidInput(
|
||||
"Subview out of bounds".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// For simplicity, this creates a copy. True zero-copy would need stride support
|
||||
let mut subview_data = Vec::new();
|
||||
for row in y..y + height {
|
||||
let start = ((row * self.width + x) * self.channels as u32) as usize;
|
||||
let end = start + (width * self.channels as u32) as usize;
|
||||
subview_data.extend_from_slice(&self.data[start..end]);
|
||||
}
|
||||
|
||||
// This temporarily leaks memory - in production, use arena allocator
|
||||
let leaked = Box::leak(subview_data.into_boxed_slice());
|
||||
|
||||
Ok(Self {
|
||||
data: leaked,
|
||||
width,
|
||||
height,
|
||||
channels: self.channels,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Arena allocator for temporary allocations
///
/// Bump-allocates byte slices out of a single growable buffer; `reset()`
/// reclaims everything at once while keeping the capacity.
pub struct Arena {
    buffer: Vec<u8>,
    offset: usize,
}

impl Arena {
    /// Create new arena with capacity
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            buffer: Vec::with_capacity(capacity),
            offset: 0,
        }
    }

    /// Allocate `size` bytes aligned to `align` (alignment is relative to
    /// the start of the internal buffer, not an absolute address guarantee).
    ///
    /// Returns a zero-initialized slice. The previous implementation called
    /// `set_len` over reserved-but-uninitialized memory, handing out
    /// uninitialized bytes (undefined behavior to read); `resize` zero-fills
    /// instead and needs no `unsafe`.
    ///
    /// # Panics
    /// Panics if `align` is zero.
    pub fn alloc(&mut self, size: usize, align: usize) -> &mut [u8] {
        assert!(align > 0, "alignment must be non-zero");

        // Round the current offset up to the requested alignment.
        let padding = (align - (self.offset % align)) % align;
        let start = self.offset + padding;
        let end = start + size;

        // Grow (and zero-fill) the buffer as needed.
        if end > self.buffer.len() {
            self.buffer.resize(end, 0);
        }

        self.offset = end;
        &mut self.buffer[start..end]
    }

    /// Reset arena (keeps capacity)
    pub fn reset(&mut self) {
        self.offset = 0;
        self.buffer.clear();
    }

    /// Get current usage
    pub fn usage(&self) -> usize {
        self.offset
    }

    /// Get capacity
    pub fn capacity(&self) -> usize {
        self.buffer.capacity()
    }
}
|
||||
|
||||
/// Global buffer pools for common sizes
|
||||
pub struct GlobalPools {
|
||||
small: BufferPool<Vec<u8>>, // 1KB buffers
|
||||
medium: BufferPool<Vec<u8>>, // 64KB buffers
|
||||
large: BufferPool<Vec<u8>>, // 1MB buffers
|
||||
}
|
||||
|
||||
impl GlobalPools {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
small: BufferPool::new(|| Vec::with_capacity(1024), 10, 100),
|
||||
medium: BufferPool::new(|| Vec::with_capacity(64 * 1024), 5, 50),
|
||||
large: BufferPool::new(|| Vec::with_capacity(1024 * 1024), 2, 20),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the global pools instance
|
||||
pub fn get() -> &'static Self {
|
||||
static POOLS: std::sync::OnceLock<GlobalPools> = std::sync::OnceLock::new();
|
||||
POOLS.get_or_init(GlobalPools::new)
|
||||
}
|
||||
|
||||
/// Acquire small buffer (1KB)
|
||||
pub fn acquire_small(&self) -> PooledBuffer<Vec<u8>> {
|
||||
self.small.acquire()
|
||||
}
|
||||
|
||||
/// Acquire medium buffer (64KB)
|
||||
pub fn acquire_medium(&self) -> PooledBuffer<Vec<u8>> {
|
||||
self.medium.acquire()
|
||||
}
|
||||
|
||||
/// Acquire large buffer (1MB)
|
||||
pub fn acquire_large(&self) -> PooledBuffer<Vec<u8>> {
|
||||
self.large.acquire()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    #[test]
    fn test_buffer_pool() {
        let pool = BufferPool::new(|| Vec::with_capacity(1024), 2, 10);

        assert_eq!(pool.size(), 2);

        let mut buf1 = pool.acquire();
        assert_eq!(buf1.capacity(), 1024);
        buf1.extend_from_slice(b"test");

        // acquire() took one buffer out (size 1); dropping returns it.
        // The old assertion expected 3, but nothing ever creates a third
        // pooled buffer: 2 pre-allocated - 1 acquired + 1 returned = 2.
        drop(buf1);
        assert_eq!(pool.size(), 2); // Returned to pool
    }

    #[test]
    fn test_mmap_model() {
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(b"test model data").unwrap();
        temp.flush().unwrap();

        let mmap = MmapModel::from_file(temp.path()).unwrap();
        assert_eq!(mmap.as_slice(), b"test model data");
        assert_eq!(mmap.len(), 15);
        assert!(!mmap.is_empty());
    }

    #[test]
    fn test_image_view() {
        let data = vec![
            255, 0, 0, 255, // Red pixel
            0, 255, 0, 255, // Green pixel
            0, 0, 255, 255, // Blue pixel
            255, 255, 255, 255, // White pixel
        ];

        let view = ImageView::new(&data, 2, 2, 4).unwrap();
        assert_eq!(view.dimensions(), (2, 2));
        assert_eq!(view.pixel(0, 0), &[255, 0, 0, 255]);
        assert_eq!(view.pixel(1, 1), &[255, 255, 255, 255]);
    }

    #[test]
    fn test_arena() {
        let mut arena = Arena::with_capacity(1024);

        let slice1 = arena.alloc(100, 8);
        assert_eq!(slice1.len(), 100);

        let slice2 = arena.alloc(200, 8);
        assert_eq!(slice2.len(), 200);

        assert!(arena.usage() >= 300);

        arena.reset();
        assert_eq!(arena.usage(), 0);
    }

    #[test]
    fn test_global_pools() {
        let pools = GlobalPools::get();

        let small = pools.acquire_small();
        assert!(small.capacity() >= 1024);

        let medium = pools.acquire_medium();
        assert!(medium.capacity() >= 64 * 1024);

        let large = pools.acquire_large();
        assert!(large.capacity() >= 1024 * 1024);
    }
}
|
||||
169
vendor/ruvector/examples/scipix/src/optimize/mod.rs
vendored
Normal file
169
vendor/ruvector/examples/scipix/src/optimize/mod.rs
vendored
Normal file
@@ -0,0 +1,169 @@
|
||||
//! Performance optimization utilities for scipix OCR
|
||||
//!
|
||||
//! This module provides runtime feature detection and optimized code paths
|
||||
//! for different CPU architectures and capabilities.
|
||||
|
||||
pub mod batch;
|
||||
pub mod memory;
|
||||
pub mod parallel;
|
||||
pub mod quantize;
|
||||
pub mod simd;
|
||||
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// CPU features detected at runtime
#[derive(Debug, Clone, Copy)]
pub struct CpuFeatures {
    /// x86-64 AVX2 available.
    pub avx2: bool,
    /// x86-64 AVX-512F available.
    pub avx512f: bool,
    /// AArch64 NEON available.
    pub neon: bool,
    /// x86-64 SSE4.2 available.
    pub sse4_2: bool,
}

/// Cached detection result; populated once on first use.
static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();

/// Detect CPU features at runtime (cached after the first call).
///
/// On unsupported architectures all flags are false.
pub fn detect_features() -> CpuFeatures {
    *CPU_FEATURES.get_or_init(|| {
        #[allow(unused_mut)]
        let mut features = CpuFeatures {
            avx2: false,
            avx512f: false,
            neon: false,
            sse4_2: false,
        };

        #[cfg(target_arch = "x86_64")]
        {
            features.avx2 = is_x86_feature_detected!("avx2");
            features.avx512f = is_x86_feature_detected!("avx512f");
            features.sse4_2 = is_x86_feature_detected!("sse4.2");
        }

        #[cfg(target_arch = "aarch64")]
        {
            features.neon = std::arch::is_aarch64_feature_detected!("neon");
        }

        features
    })
}

/// Get the detected CPU features
pub fn get_features() -> CpuFeatures {
    detect_features()
}
|
||||
|
||||
/// Runtime dispatch to optimized implementation
|
||||
pub trait OptimizedOp<T> {
|
||||
/// Execute the operation with the best available implementation
|
||||
fn execute(&self, input: T) -> T;
|
||||
|
||||
/// Execute with SIMD if available, fallback to scalar
|
||||
fn execute_auto(&self, input: T) -> T {
|
||||
let features = get_features();
|
||||
if features.avx2 || features.avx512f || features.neon {
|
||||
self.execute_simd(input)
|
||||
} else {
|
||||
self.execute_scalar(input)
|
||||
}
|
||||
}
|
||||
|
||||
/// SIMD implementation
|
||||
fn execute_simd(&self, input: T) -> T;
|
||||
|
||||
/// Scalar fallback implementation
|
||||
fn execute_scalar(&self, input: T) -> T;
|
||||
}
|
||||
|
||||
/// Optimization level configuration
///
/// Levels are cumulative: each variant implies the optimizations of the
/// ones above it. The hand-written `Default` impl was replaced with the
/// derive + `#[default]` attribute (stable since Rust 1.62; this file
/// already requires 1.70 for `OnceLock`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OptLevel {
    /// No optimizations, scalar code only
    None,
    /// Use SIMD when available
    Simd,
    /// Use SIMD + parallel processing
    Parallel,
    /// All optimizations including memory optimizations
    #[default]
    Full,
}
|
||||
|
||||
/// Global optimization configuration
|
||||
static OPT_LEVEL: OnceLock<OptLevel> = OnceLock::new();
|
||||
|
||||
/// Set the optimization level
|
||||
pub fn set_opt_level(level: OptLevel) {
|
||||
OPT_LEVEL.set(level).ok();
|
||||
}
|
||||
|
||||
/// Get the current optimization level
|
||||
pub fn get_opt_level() -> OptLevel {
|
||||
*OPT_LEVEL.get_or_init(OptLevel::default)
|
||||
}
|
||||
|
||||
/// Check if SIMD optimizations are enabled
|
||||
pub fn simd_enabled() -> bool {
|
||||
matches!(
|
||||
get_opt_level(),
|
||||
OptLevel::Simd | OptLevel::Parallel | OptLevel::Full
|
||||
)
|
||||
}
|
||||
|
||||
/// Check if parallel optimizations are enabled
|
||||
pub fn parallel_enabled() -> bool {
|
||||
matches!(get_opt_level(), OptLevel::Parallel | OptLevel::Full)
|
||||
}
|
||||
|
||||
/// Check if memory optimizations are enabled
|
||||
pub fn memory_opt_enabled() -> bool {
|
||||
matches!(get_opt_level(), OptLevel::Full)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_feature_detection() {
        let features = detect_features();
        println!("Detected features: {:?}", features);

        // The old assertion here was a tautology (`x || !x`) that could
        // never fail. Assert real invariants instead: x86 and ARM flags are
        // mutually exclusive, and detection is stable across calls
        // (OnceLock-cached).
        assert!(!(features.avx2 && features.neon));
        assert!(!(features.sse4_2 && features.neon));

        let again = detect_features();
        assert_eq!(features.avx2, again.avx2);
        assert_eq!(features.avx512f, again.avx512f);
        assert_eq!(features.neon, again.neon);
        assert_eq!(features.sse4_2, again.sse4_2);
    }

    #[test]
    fn test_opt_level() {
        assert_eq!(get_opt_level(), OptLevel::Full);

        set_opt_level(OptLevel::Simd);
        // Can't change after first init, should still be Full
        assert_eq!(get_opt_level(), OptLevel::Full);
    }

    #[test]
    fn test_optimization_checks() {
        assert!(simd_enabled());
        assert!(parallel_enabled());
        assert!(memory_opt_enabled());
    }
}
|
||||
335
vendor/ruvector/examples/scipix/src/optimize/parallel.rs
vendored
Normal file
335
vendor/ruvector/examples/scipix/src/optimize/parallel.rs
vendored
Normal file
@@ -0,0 +1,335 @@
|
||||
//! Parallel processing utilities for OCR pipeline
|
||||
//!
|
||||
//! Provides parallel image preprocessing, batch OCR, and pipelined execution.
|
||||
|
||||
use image::DynamicImage;
|
||||
use rayon::prelude::*;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use super::parallel_enabled;
|
||||
|
||||
/// Parallel preprocessing of multiple images
|
||||
pub fn parallel_preprocess<F>(images: Vec<DynamicImage>, preprocess_fn: F) -> Vec<DynamicImage>
|
||||
where
|
||||
F: Fn(DynamicImage) -> DynamicImage + Sync + Send,
|
||||
{
|
||||
if !parallel_enabled() {
|
||||
return images.into_iter().map(preprocess_fn).collect();
|
||||
}
|
||||
|
||||
images.into_par_iter().map(preprocess_fn).collect()
|
||||
}
|
||||
|
||||
/// Parallel processing with error handling
|
||||
pub fn parallel_preprocess_result<F, E>(
|
||||
images: Vec<DynamicImage>,
|
||||
preprocess_fn: F,
|
||||
) -> Vec<std::result::Result<DynamicImage, E>>
|
||||
where
|
||||
F: Fn(DynamicImage) -> std::result::Result<DynamicImage, E> + Sync + Send,
|
||||
E: Send,
|
||||
{
|
||||
if !parallel_enabled() {
|
||||
return images.into_iter().map(preprocess_fn).collect();
|
||||
}
|
||||
|
||||
images.into_par_iter().map(preprocess_fn).collect()
|
||||
}
|
||||
|
||||
/// Pipeline parallel execution for OCR workflow
|
||||
///
|
||||
/// Executes stages in a pipeline: preprocess | detect | recognize
|
||||
/// Each stage can start processing the next item while previous stages
|
||||
/// continue with subsequent items.
|
||||
pub struct PipelineExecutor<T, U, V> {
|
||||
stage1: Arc<dyn Fn(T) -> U + Send + Sync>,
|
||||
stage2: Arc<dyn Fn(U) -> V + Send + Sync>,
|
||||
}
|
||||
|
||||
impl<T, U, V> PipelineExecutor<T, U, V>
|
||||
where
|
||||
T: Send,
|
||||
U: Send,
|
||||
V: Send,
|
||||
{
|
||||
pub fn new<F1, F2>(stage1: F1, stage2: F2) -> Self
|
||||
where
|
||||
F1: Fn(T) -> U + Send + Sync + 'static,
|
||||
F2: Fn(U) -> V + Send + Sync + 'static,
|
||||
{
|
||||
Self {
|
||||
stage1: Arc::new(stage1),
|
||||
stage2: Arc::new(stage2),
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute pipeline on multiple inputs
|
||||
pub fn execute_batch(&self, inputs: Vec<T>) -> Vec<V> {
|
||||
if !parallel_enabled() {
|
||||
return inputs
|
||||
.into_iter()
|
||||
.map(|input| {
|
||||
let stage1_out = (self.stage1)(input);
|
||||
(self.stage2)(stage1_out)
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
|
||||
inputs
|
||||
.into_par_iter()
|
||||
.map(|input| {
|
||||
let stage1_out = (self.stage1)(input);
|
||||
(self.stage2)(stage1_out)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Three-stage pipeline executor
|
||||
pub struct Pipeline3<T, U, V, W> {
|
||||
stage1: Arc<dyn Fn(T) -> U + Send + Sync>,
|
||||
stage2: Arc<dyn Fn(U) -> V + Send + Sync>,
|
||||
stage3: Arc<dyn Fn(V) -> W + Send + Sync>,
|
||||
}
|
||||
|
||||
impl<T, U, V, W> Pipeline3<T, U, V, W>
|
||||
where
|
||||
T: Send,
|
||||
U: Send,
|
||||
V: Send,
|
||||
W: Send,
|
||||
{
|
||||
pub fn new<F1, F2, F3>(stage1: F1, stage2: F2, stage3: F3) -> Self
|
||||
where
|
||||
F1: Fn(T) -> U + Send + Sync + 'static,
|
||||
F2: Fn(U) -> V + Send + Sync + 'static,
|
||||
F3: Fn(V) -> W + Send + Sync + 'static,
|
||||
{
|
||||
Self {
|
||||
stage1: Arc::new(stage1),
|
||||
stage2: Arc::new(stage2),
|
||||
stage3: Arc::new(stage3),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute_batch(&self, inputs: Vec<T>) -> Vec<W> {
|
||||
if !parallel_enabled() {
|
||||
return inputs
|
||||
.into_iter()
|
||||
.map(|input| {
|
||||
let out1 = (self.stage1)(input);
|
||||
let out2 = (self.stage2)(out1);
|
||||
(self.stage3)(out2)
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
|
||||
inputs
|
||||
.into_par_iter()
|
||||
.map(|input| {
|
||||
let out1 = (self.stage1)(input);
|
||||
let out2 = (self.stage2)(out1);
|
||||
(self.stage3)(out2)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Parallel map with configurable chunk size
|
||||
pub fn parallel_map_chunked<T, U, F>(items: Vec<T>, chunk_size: usize, map_fn: F) -> Vec<U>
|
||||
where
|
||||
T: Send,
|
||||
U: Send,
|
||||
F: Fn(T) -> U + Sync + Send,
|
||||
{
|
||||
if !parallel_enabled() {
|
||||
return items.into_iter().map(map_fn).collect();
|
||||
}
|
||||
|
||||
items
|
||||
.into_par_iter()
|
||||
.with_min_len(chunk_size)
|
||||
.map(map_fn)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Async parallel executor with concurrency limit
pub struct AsyncParallelExecutor {
    /// Permits bound how many spawned tasks run simultaneously.
    semaphore: Arc<Semaphore>,
}
|
||||
|
||||
impl AsyncParallelExecutor {
    /// Create executor with maximum concurrency limit
    pub fn new(max_concurrent: usize) -> Self {
        Self {
            semaphore: Arc::new(Semaphore::new(max_concurrent)),
        }
    }

    /// Execute async tasks with concurrency limit
    ///
    /// Spawns one tokio task per input, but acquires a semaphore permit
    /// *before* each spawn, so the spawn loop itself is back-pressured and
    /// at most `max_concurrent` tasks run at once. Handles are awaited in
    /// spawn order, so results come back in input order — except that a
    /// task which panics is silently omitted, so the returned `Vec` may be
    /// shorter than `tasks`.
    pub async fn execute<T, F, Fut>(&self, tasks: Vec<T>, executor: F) -> Vec<Fut::Output>
    where
        T: Send + 'static,
        F: Fn(T) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future + Send + 'static,
        Fut::Output: Send + 'static,
    {
        let mut handles = Vec::new();

        for task in tasks {
            // NOTE(review): unwrap is sound only while the semaphore is
            // never closed; this struct never closes it.
            let permit = self.semaphore.clone().acquire_owned().await.unwrap();
            let executor = executor.clone();

            let handle = tokio::spawn(async move {
                let result = executor(task).await;
                drop(permit); // Release semaphore
                result
            });

            handles.push(handle);
        }

        // Wait for all tasks to complete
        let mut results = Vec::new();
        for handle in handles {
            // A join error means the task panicked; its result is dropped.
            if let Ok(result) = handle.await {
                results.push(result);
            }
        }

        results
    }

    /// Execute with error handling
    ///
    /// Same scheduling as `execute`, but for fallible tasks: each output
    /// element is the task's own `Result`. Panicked tasks are skipped.
    pub async fn execute_result<T, F, Fut, R, E>(
        &self,
        tasks: Vec<T>,
        executor: F,
    ) -> Vec<std::result::Result<R, E>>
    where
        T: Send + 'static,
        F: Fn(T) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future<Output = std::result::Result<R, E>> + Send + 'static,
        R: Send + 'static,
        E: Send + 'static,
    {
        let mut handles = Vec::new();

        for task in tasks {
            // Same pre-spawn permit acquisition as `execute`.
            let permit = self.semaphore.clone().acquire_owned().await.unwrap();
            let executor = executor.clone();

            let handle = tokio::spawn(async move {
                let result = executor(task).await;
                drop(permit);
                result
            });

            handles.push(handle);
        }

        let mut results = Vec::new();
        for handle in handles {
            match handle.await {
                Ok(result) => results.push(result),
                Err(_) => continue, // Task panicked
            }
        }

        results
    }
}
|
||||
|
||||
/// Work-stealing parallel iterator for unbalanced workloads
|
||||
pub fn parallel_unbalanced<T, U, F>(items: Vec<T>, map_fn: F) -> Vec<U>
|
||||
where
|
||||
T: Send,
|
||||
U: Send,
|
||||
F: Fn(T) -> U + Sync + Send,
|
||||
{
|
||||
if !parallel_enabled() {
|
||||
return items.into_iter().map(map_fn).collect();
|
||||
}
|
||||
|
||||
// Use adaptive strategy for unbalanced work
|
||||
items
|
||||
.into_par_iter()
|
||||
.with_min_len(1) // Allow fine-grained work stealing
|
||||
.map(map_fn)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Get optimal thread count for current system
///
/// Reports the size of rayon's current thread pool (the global pool unless
/// called from inside a scoped pool).
pub fn optimal_thread_count() -> usize {
    rayon::current_num_threads()
}
|
||||
|
||||
/// Set global thread pool size
///
/// Best-effort: `build_global` fails if rayon's global pool was already
/// initialized (e.g. by earlier parallel work), and the `.ok()` deliberately
/// ignores that error, leaving the existing pool in place.
pub fn set_thread_count(threads: usize) {
    rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build_global()
        .ok();
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Mapping must preserve both length and element order.
    #[test]
    fn test_parallel_map() {
        let data: Vec<i32> = (0..100).collect();
        let result = parallel_map_chunked(data, 10, |x| x * 2);

        assert_eq!(result.len(), 100);
        assert_eq!(result[0], 0);
        assert_eq!(result[50], 100);
        assert_eq!(result[99], 198);
    }

    // Stage order matters: (x + 1) * 2, not x * 2 + 1.
    #[test]
    fn test_pipeline_executor() {
        let pipeline = PipelineExecutor::new(|x: i32| x + 1, |x: i32| x * 2);

        let inputs = vec![1, 2, 3, 4, 5];
        let results = pipeline.execute_batch(inputs);

        assert_eq!(results, vec![4, 6, 8, 10, 12]);
    }

    #[test]
    fn test_pipeline3() {
        let pipeline = Pipeline3::new(|x: i32| x + 1, |x: i32| x * 2, |x: i32| x - 1);

        let inputs = vec![1, 2, 3];
        let results = pipeline.execute_batch(inputs);

        // (1+1)*2-1 = 3, (2+1)*2-1 = 5, (3+1)*2-1 = 7
        assert_eq!(results, vec![3, 5, 7]);
    }

    // With only 2 permits, all 5 tasks must still complete.
    #[tokio::test]
    async fn test_async_executor() {
        let executor = AsyncParallelExecutor::new(2);

        let tasks = vec![1, 2, 3, 4, 5];
        let results = executor
            .execute(tasks, |x| async move {
                tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
                x * 2
            })
            .await;

        assert_eq!(results.len(), 5);
        assert!(results.contains(&2));
        assert!(results.contains(&10));
    }

    #[test]
    fn test_optimal_threads() {
        let threads = optimal_thread_count();
        assert!(threads > 0);
        assert!(threads <= num_cpus::get());
    }
}
|
||||
339
vendor/ruvector/examples/scipix/src/optimize/quantize.rs
vendored
Normal file
339
vendor/ruvector/examples/scipix/src/optimize/quantize.rs
vendored
Normal file
@@ -0,0 +1,339 @@
|
||||
//! Model quantization utilities
|
||||
//!
|
||||
//! Provides INT8 quantization for model weights and activations to reduce
|
||||
//! memory usage and improve inference speed.
|
||||
|
||||
use std::f32;
|
||||
|
||||
/// Quantization parameters
///
/// Affine (scale / zero-point) mapping between f32 values and i8 codes:
/// `q = round(v / scale + zero_point)` and `v = (q - zero_point) * scale`.
#[derive(Debug, Clone, Copy)]
pub struct QuantParams {
    /// Step size between adjacent quantized levels (always > 0).
    pub scale: f32,
    /// i8 code that represents the real value 0.0.
    pub zero_point: i8,
}

impl QuantParams {
    /// Calculate quantization parameters from min/max values
    ///
    /// Maps `[min, max]` onto the full i8 range. A degenerate range
    /// (`max <= min`, e.g. constant data or the inf/-inf fold from an empty
    /// slice) previously produced `scale == 0`, poisoning later arithmetic
    /// with inf/NaN; it now falls back to a scale of 1.0.
    pub fn from_range(min: f32, max: f32) -> Self {
        let qmin = i8::MIN as f32;
        let qmax = i8::MAX as f32;

        let range = max - min;
        let scale = if range > 0.0 { range / (qmax - qmin) } else { 1.0 };
        // `as i8` saturates on overflow, so the zero point is clamped to i8.
        let zero_point = (qmin - min / scale).round() as i8;

        Self { scale, zero_point }
    }

    /// Calculate from data statistics
    pub fn from_data(data: &[f32]) -> Self {
        let min = data.iter().copied().fold(f32::INFINITY, f32::min);
        let max = data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        Self::from_range(min, max)
    }

    /// Symmetric quantization (zero_point = 0)
    ///
    /// Same zero-scale guard as `from_range` for `abs_max == 0`.
    pub fn symmetric(abs_max: f32) -> Self {
        let scale = if abs_max > 0.0 { abs_max / 127.0 } else { 1.0 };
        Self {
            scale,
            zero_point: 0,
        }
    }
}
|
||||
|
||||
/// Quantize f32 weights to i8
|
||||
pub fn quantize_weights(weights: &[f32]) -> (Vec<i8>, QuantParams) {
|
||||
let params = QuantParams::from_data(weights);
|
||||
let quantized = quantize_with_params(weights, params);
|
||||
(quantized, params)
|
||||
}
|
||||
|
||||
/// Quantize with given parameters
|
||||
pub fn quantize_with_params(weights: &[f32], params: QuantParams) -> Vec<i8> {
|
||||
weights.iter().map(|&w| quantize_value(w, params)).collect()
|
||||
}
|
||||
|
||||
/// Quantize single value
|
||||
#[inline]
|
||||
pub fn quantize_value(value: f32, params: QuantParams) -> i8 {
|
||||
let scaled = value / params.scale + params.zero_point as f32;
|
||||
scaled.round().clamp(i8::MIN as f32, i8::MAX as f32) as i8
|
||||
}
|
||||
|
||||
/// Dequantize i8 to f32
|
||||
pub fn dequantize(quantized: &[i8], params: QuantParams) -> Vec<f32> {
|
||||
quantized
|
||||
.iter()
|
||||
.map(|&q| dequantize_value(q, params))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Dequantize single value
|
||||
#[inline]
|
||||
pub fn dequantize_value(quantized: i8, params: QuantParams) -> f32 {
|
||||
(quantized as f32 - params.zero_point as f32) * params.scale
|
||||
}
|
||||
|
||||
/// Quantized tensor representation
pub struct QuantizedTensor {
    /// Quantized values, one i8 per original f32, flattened.
    pub data: Vec<i8>,
    /// Scale / zero-point shared by all elements.
    pub params: QuantParams,
    /// Logical tensor shape corresponding to `data`.
    pub shape: Vec<usize>,
}
|
||||
|
||||
impl QuantizedTensor {
|
||||
/// Create from f32 tensor
|
||||
pub fn from_f32(data: &[f32], shape: Vec<usize>) -> Self {
|
||||
let (quantized, params) = quantize_weights(data);
|
||||
Self {
|
||||
data: quantized,
|
||||
params,
|
||||
shape,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with symmetric quantization
|
||||
pub fn from_f32_symmetric(data: &[f32], shape: Vec<usize>) -> Self {
|
||||
let abs_max = data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
|
||||
let params = QuantParams::symmetric(abs_max);
|
||||
let quantized = quantize_with_params(data, params);
|
||||
|
||||
Self {
|
||||
data: quantized,
|
||||
params,
|
||||
shape,
|
||||
}
|
||||
}
|
||||
|
||||
/// Dequantize to f32
|
||||
pub fn to_f32(&self) -> Vec<f32> {
|
||||
dequantize(&self.data, self.params)
|
||||
}
|
||||
|
||||
/// Get size in bytes
|
||||
pub fn size_bytes(&self) -> usize {
|
||||
self.data.len()
|
||||
+ std::mem::size_of::<QuantParams>()
|
||||
+ self.shape.len() * std::mem::size_of::<usize>()
|
||||
}
|
||||
|
||||
/// Calculate memory savings vs f32
|
||||
pub fn compression_ratio(&self) -> f32 {
|
||||
let f32_size = self.data.len() * std::mem::size_of::<f32>();
|
||||
let quantized_size = self.size_bytes();
|
||||
f32_size as f32 / quantized_size as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-channel quantization for conv/linear layers
pub struct PerChannelQuant {
    /// Quantized values, flattened, grouped by output channel.
    pub data: Vec<i8>,
    /// One `QuantParams` per output channel (`shape[0]` entries).
    pub params: Vec<QuantParams>,
    /// Logical tensor shape; `shape[0]` is the channel count.
    pub shape: Vec<usize>,
}
|
||||
|
||||
impl PerChannelQuant {
|
||||
/// Quantize with per-channel parameters
|
||||
/// For a weight tensor of shape [out_channels, in_channels, ...],
|
||||
/// use separate params for each output channel
|
||||
pub fn from_f32(data: &[f32], shape: Vec<usize>) -> Self {
|
||||
if shape.is_empty() {
|
||||
panic!("Shape cannot be empty");
|
||||
}
|
||||
|
||||
let out_channels = shape[0];
|
||||
let channel_size = data.len() / out_channels;
|
||||
|
||||
let mut all_quantized = Vec::with_capacity(data.len());
|
||||
let mut params = Vec::with_capacity(out_channels);
|
||||
|
||||
for ch in 0..out_channels {
|
||||
let start = ch * channel_size;
|
||||
let end = start + channel_size;
|
||||
let channel_data = &data[start..end];
|
||||
|
||||
let ch_params = QuantParams::from_data(channel_data);
|
||||
let ch_quantized = quantize_with_params(channel_data, ch_params);
|
||||
|
||||
all_quantized.extend(ch_quantized);
|
||||
params.push(ch_params);
|
||||
}
|
||||
|
||||
Self {
|
||||
data: all_quantized,
|
||||
params,
|
||||
shape,
|
||||
}
|
||||
}
|
||||
|
||||
/// Dequantize to f32
|
||||
pub fn to_f32(&self) -> Vec<f32> {
|
||||
let out_channels = self.shape[0];
|
||||
let channel_size = self.data.len() / out_channels;
|
||||
|
||||
let mut result = Vec::with_capacity(self.data.len());
|
||||
|
||||
for ch in 0..out_channels {
|
||||
let start = ch * channel_size;
|
||||
let end = start + channel_size;
|
||||
let channel_data = &self.data[start..end];
|
||||
let ch_params = self.params[ch];
|
||||
|
||||
result.extend(dequantize(channel_data, ch_params));
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
/// Dynamic quantization - quantize at runtime
pub struct DynamicQuantizer {
    /// Calibration percentile (e.g. 99.9); values beyond it are clipped.
    percentile: f32,
}
|
||||
|
||||
impl DynamicQuantizer {
|
||||
/// Create quantizer with calibration percentile
|
||||
/// percentile: clip values beyond this percentile (e.g., 99.9)
|
||||
pub fn new(percentile: f32) -> Self {
|
||||
Self { percentile }
|
||||
}
|
||||
|
||||
/// Quantize with calibration
|
||||
pub fn quantize(&self, data: &[f32]) -> (Vec<i8>, QuantParams) {
|
||||
let mut sorted: Vec<f32> = data.iter().copied().collect();
|
||||
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
|
||||
let idx = ((sorted.len() as f32 * self.percentile / 100.0) as usize).min(sorted.len() - 1);
|
||||
|
||||
let min = -sorted[sorted.len() - idx];
|
||||
let max = sorted[idx];
|
||||
|
||||
let params = QuantParams::from_range(min, max);
|
||||
let quantized = quantize_with_params(data, params);
|
||||
|
||||
(quantized, params)
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate quantization error (MSE)
|
||||
pub fn quantization_error(original: &[f32], quantized: &[i8], params: QuantParams) -> f32 {
|
||||
let dequantized = dequantize(quantized, params);
|
||||
|
||||
let mse: f32 = original
|
||||
.iter()
|
||||
.zip(dequantized.iter())
|
||||
.map(|(o, d)| (o - d).powi(2))
|
||||
.sum::<f32>()
|
||||
/ original.len() as f32;
|
||||
|
||||
mse
|
||||
}
|
||||
|
||||
/// Calculate signal-to-quantization-noise ratio (SQNR) in dB
|
||||
pub fn sqnr(original: &[f32], quantized: &[i8], params: QuantParams) -> f32 {
|
||||
let dequantized = dequantize(quantized, params);
|
||||
|
||||
let signal_power: f32 = original.iter().map(|x| x.powi(2)).sum::<f32>() / original.len() as f32;
|
||||
let noise_power: f32 = original
|
||||
.iter()
|
||||
.zip(dequantized.iter())
|
||||
.map(|(o, d)| (o - d).powi(2))
|
||||
.sum::<f32>()
|
||||
/ original.len() as f32;
|
||||
|
||||
10.0 * (signal_power / noise_power).log10()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Round-trip through i8 must stay within one quantization step of the
    // original for a well-spread [-1, 1] range.
    #[test]
    fn test_quantize_dequantize() {
        let weights = vec![0.0, 0.5, 1.0, -0.5, -1.0];
        let (quantized, params) = quantize_weights(&weights);
        let dequantized = dequantize(&quantized, params);

        // Check approximate equality
        for (orig, deq) in weights.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 0.01, "orig: {}, deq: {}", orig, deq);
        }
    }

    #[test]
    fn test_symmetric_quantization() {
        let data = vec![-1.0, -0.5, 0.0, 0.5, 1.0];
        let params = QuantParams::symmetric(1.0);

        assert_eq!(params.zero_point, 0);
        assert!((params.scale - 1.0 / 127.0).abs() < 1e-6);

        let quantized = quantize_with_params(&data, params);
        assert_eq!(quantized[2], 0); // 0.0 should map to 0
    }

    #[test]
    fn test_quantized_tensor() {
        let data = vec![1.0, 2.0, 3.0, 4.0];
        let tensor = QuantizedTensor::from_f32(&data, vec![2, 2]);

        assert_eq!(tensor.shape, vec![2, 2]);
        assert_eq!(tensor.data.len(), 4);

        let dequantized = tensor.to_f32();
        for (orig, deq) in data.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 0.1);
        }
    }

    // Channels with very different magnitudes each get their own params, so
    // round-trip error stays bounded per channel.
    #[test]
    fn test_per_channel_quant() {
        // 2 channels, 3 values each
        let data = vec![
            1.0, 2.0, 3.0, // Channel 0
            10.0, 20.0, 30.0, // Channel 1
        ];

        let quant = PerChannelQuant::from_f32(&data, vec![2, 3]);
        assert_eq!(quant.params.len(), 2);

        let dequantized = quant.to_f32();
        for (orig, deq) in data.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 1.0);
        }
    }

    #[test]
    fn test_quantization_error() {
        let original = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let (quantized, params) = quantize_weights(&original);

        let error = quantization_error(&original, &quantized, params);
        assert!(error < 0.1); // Should be small for simple data

        let snr = sqnr(&original, &quantized, params);
        assert!(snr > 30.0); // Should have good SNR
    }

    #[test]
    fn test_compression_ratio() {
        let data: Vec<f32> = (0..1000).map(|i| i as f32 / 1000.0).collect();
        let tensor = QuantizedTensor::from_f32(&data, vec![1000]);

        let ratio = tensor.compression_ratio();
        assert!(ratio > 3.5); // Should be ~4x compression
    }

    // Only coarse properties are asserted here; the outlier's effect on the
    // calibrated range is implementation-defined.
    #[test]
    fn test_dynamic_quantizer() {
        let mut data: Vec<f32> = (0..100).map(|i| i as f32).collect();
        data.push(1000.0); // Outlier

        let quantizer = DynamicQuantizer::new(99.0);
        let (quantized, params) = quantizer.quantize(&data);

        assert_eq!(quantized.len(), 101);
        // The outlier should be clipped
        assert!(params.scale > 0.0);
    }
}
|
||||
597
vendor/ruvector/examples/scipix/src/optimize/simd.rs
vendored
Normal file
597
vendor/ruvector/examples/scipix/src/optimize/simd.rs
vendored
Normal file
@@ -0,0 +1,597 @@
|
||||
//! SIMD-accelerated image processing operations
|
||||
//!
|
||||
//! Provides optimized implementations for common image operations using
|
||||
//! AVX2, AVX-512, and ARM NEON intrinsics.
|
||||
|
||||
use super::{get_features, simd_enabled};
|
||||
|
||||
/// Convert RGBA image to grayscale using optimized SIMD operations
///
/// Dispatches to the best implementation for the detected CPU features and
/// falls back to the scalar path when SIMD is disabled or unsupported.
/// `gray.len()` must equal `rgba.len() / 4` (asserted in the scalar path).
pub fn simd_grayscale(rgba: &[u8], gray: &mut [u8]) {
    if !simd_enabled() {
        return scalar_grayscale(rgba, gray);
    }

    let features = get_features();

    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            // SAFETY: guarded by the runtime AVX2 feature flag above.
            unsafe { avx2_grayscale(rgba, gray) }
        } else if features.sse4_2 {
            // SAFETY: guarded by the runtime SSE4.2 feature flag above.
            unsafe { sse_grayscale(rgba, gray) }
        } else {
            scalar_grayscale(rgba, gray)
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        if features.neon {
            // SAFETY: guarded by the runtime NEON feature flag above.
            unsafe { neon_grayscale(rgba, gray) }
        } else {
            scalar_grayscale(rgba, gray)
        }
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        scalar_grayscale(rgba, gray)
    }
}
|
||||
|
||||
/// Scalar fallback for grayscale conversion
///
/// Uses fixed-point ITU-R BT.601 luma weights; the three coefficients sum
/// to 256, so `>> 8` keeps the result in range without a divide.
fn scalar_grayscale(rgba: &[u8], gray: &mut [u8]) {
    assert_eq!(
        rgba.len() / 4,
        gray.len(),
        "RGBA length must be 4x grayscale length"
    );

    for (dst, px) in gray.iter_mut().zip(rgba.chunks_exact(4)) {
        // 0.299 R + 0.587 G + 0.114 B, scaled by 256 (alpha ignored).
        let luma = u32::from(px[0]) * 77 + u32::from(px[1]) * 150 + u32::from(px[2]) * 29;
        *dst = (luma >> 8) as u8;
    }
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_grayscale(rgba: &[u8], gray: &mut [u8]) {
    use std::arch::x86_64::*;

    let len = gray.len();
    let mut i = 0;

    // Process 8 pixels at a time (32 RGBA bytes)
    while i + 8 <= len {
        // Load 32 bytes (8 RGBA pixels)
        let rgba_ptr = rgba.as_ptr().add(i * 4);
        let _pixels = _mm256_loadu_si256(rgba_ptr as *const __m256i);

        // NOTE(review): the loaded vector is never used — the per-pixel math
        // below is plain scalar code. Proper channel deinterleave/shuffle
        // work is still needed for this path to beat scalar_grayscale.

        // Store grayscale result
        for j in 0..8 {
            let pixel_idx = (i + j) * 4;
            // SAFETY: assumes rgba.len() >= 4 * gray.len(); the only visible
            // caller is simd_grayscale, whose scalar path asserts exactly
            // that ratio — TODO confirm no other caller exists.
            let r = *rgba.get_unchecked(pixel_idx) as u32;
            let g = *rgba.get_unchecked(pixel_idx + 1) as u32;
            let b = *rgba.get_unchecked(pixel_idx + 2) as u32;
            // ITU-R BT.601 fixed-point luma (77 + 150 + 29 == 256).
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }

        i += 8;
    }

    // Handle remaining pixels
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn sse_grayscale(rgba: &[u8], gray: &mut [u8]) {
    #[allow(unused_imports)]
    use std::arch::x86_64::*;

    let len = gray.len();
    let mut i = 0;

    // Process 4 pixels at a time (16 RGBA bytes)
    // NOTE(review): no SSE intrinsics are actually used; this is scalar code
    // unrolled by four, relying on the compiler to vectorize.
    while i + 4 <= len {
        for j in 0..4 {
            let pixel_idx = (i + j) * 4;
            // SAFETY: assumes rgba.len() >= 4 * gray.len() as in
            // avx2_grayscale — TODO confirm callers guarantee this.
            let r = *rgba.get_unchecked(pixel_idx) as u32;
            let g = *rgba.get_unchecked(pixel_idx + 1) as u32;
            let b = *rgba.get_unchecked(pixel_idx + 2) as u32;
            // ITU-R BT.601 fixed-point luma (77 + 150 + 29 == 256).
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 4;
    }

    // Scalar tail; its assert also validates the length relationship.
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
unsafe fn neon_grayscale(rgba: &[u8], gray: &mut [u8]) {
    use std::arch::aarch64::*;

    let len = gray.len();
    let mut i = 0;

    // Process 8 pixels at a time
    // NOTE(review): no NEON intrinsics are actually used; this is scalar
    // code unrolled by eight, relying on the compiler to vectorize.
    while i + 8 <= len {
        for j in 0..8 {
            let idx = (i + j) * 4;
            // SAFETY: assumes rgba.len() >= 4 * gray.len() as in the x86
            // paths — TODO confirm callers guarantee this.
            let r = *rgba.get_unchecked(idx) as u32;
            let g = *rgba.get_unchecked(idx + 1) as u32;
            let b = *rgba.get_unchecked(idx + 2) as u32;
            // ITU-R BT.601 fixed-point luma (77 + 150 + 29 == 256).
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 8;
    }

    // Scalar tail; its assert also validates the length relationship.
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
|
||||
|
||||
/// Apply threshold to grayscale image using SIMD
///
/// Scalar semantics: `out[i] = 255` when `gray[i] >= thresh` (unsigned),
/// else 0. The SIMD path must reproduce exactly that unsigned `>=`
/// comparison — see avx2_threshold.
pub fn simd_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    if !simd_enabled() {
        return scalar_threshold(gray, thresh, out);
    }

    let features = get_features();

    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            // SAFETY: guarded by the runtime AVX2 feature flag above.
            unsafe { avx2_threshold(gray, thresh, out) }
        } else {
            scalar_threshold(gray, thresh, out)
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_threshold(gray, thresh, out)
    }
}
|
||||
|
||||
/// Scalar threshold: out = 255 where gray >= thresh (unsigned), else 0.
fn scalar_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    for (g, o) in gray.iter().zip(out.iter_mut()) {
        *o = if *g >= thresh { 255 } else { 0 };
    }
}

/// AVX2 threshold matching scalar_threshold's unsigned `>=` semantics.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    use std::arch::x86_64::*;

    let len = gray.len();
    let mut i = 0;

    let thresh_vec = _mm256_set1_epi8(thresh as i8);

    // Process 32 bytes at a time
    while i + 32 <= len {
        // SAFETY: i + 32 <= gray.len() bounds the load; the caller must
        // provide out.len() >= gray.len() for the store (scalar tail below
        // would simply stop early otherwise).
        let gray_vec = _mm256_loadu_si256(gray.as_ptr().add(i) as *const __m256i);
        // BUG FIX: the previous _mm256_cmpgt_epi8 was a *signed*, *strict*
        // comparison, so bytes >= 128 compared as negative and equality was
        // excluded — both disagree with the scalar unsigned `>=`.
        // max(g, t) == g  <=>  g >= t  (unsigned), yielding 0xFF/0x00 lanes
        // directly, so no extra mask is needed.
        let result = _mm256_cmpeq_epi8(_mm256_max_epu8(gray_vec, thresh_vec), gray_vec);
        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut __m256i, result);
        i += 32;
    }

    // Handle remaining bytes
    scalar_threshold(&gray[i..], thresh, &mut out[i..]);
}
|
||||
|
||||
/// Normalize f32 tensor data using SIMD
///
/// Standardizes `data` in place to zero mean and (approximately) unit
/// variance, dispatching to AVX2 when available.
pub fn simd_normalize(data: &mut [f32]) {
    if !simd_enabled() {
        return scalar_normalize(data);
    }

    let features = get_features();

    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            // SAFETY: guarded by the runtime AVX2 feature flag above.
            unsafe { avx2_normalize(data) }
        } else {
            scalar_normalize(data)
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_normalize(data)
    }
}
|
||||
|
||||
/// Scalar standardization: subtract the mean, divide by the (population)
/// standard deviation. An epsilon keeps the division finite for constant
/// input.
fn scalar_normalize(data: &mut [f32]) {
    let n = data.len() as f32;
    let mean = data.iter().sum::<f32>() / n;
    let variance = data.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / n;
    let std_dev = variance.sqrt() + 1e-8;

    data.iter_mut().for_each(|x| *x = (*x - mean) / std_dev);
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_normalize(data: &mut [f32]) {
    use std::arch::x86_64::*;

    // Pass 1: vectorized sum for the mean.
    let len = data.len();
    let mut sum = _mm256_setzero_ps();
    let mut i = 0;

    while i + 8 <= len {
        // SAFETY: i + 8 <= len keeps the 8-float load in bounds.
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        sum = _mm256_add_ps(sum, vals);
        i += 8;
    }

    // Horizontal sum of the 8 lanes, plus the scalar tail (< 8 elements).
    let sum_scalar = {
        // SAFETY: __m256 and [f32; 8] have identical size and layout.
        let sum_arr: [f32; 8] = std::mem::transmute(sum);
        sum_arr.iter().sum::<f32>() + data[i..].iter().sum::<f32>()
    };

    let mean = sum_scalar / len as f32;
    let mean_vec = _mm256_set1_ps(mean);

    // Pass 2: vectorized sum of squared deviations for the variance.
    let mut var_sum = _mm256_setzero_ps();
    i = 0;

    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        let diff = _mm256_sub_ps(vals, mean_vec);
        let sq = _mm256_mul_ps(diff, diff);
        var_sum = _mm256_add_ps(var_sum, sq);
        i += 8;
    }

    let var_scalar = {
        // SAFETY: same layout-compatible reinterpretation as above.
        let var_arr: [f32; 8] = std::mem::transmute(var_sum);
        var_arr.iter().sum::<f32>() + data[i..].iter().map(|x| (x - mean).powi(2)).sum::<f32>()
    };

    // Epsilon keeps the division finite for constant input, matching the
    // scalar path.
    let std_dev = (var_scalar / len as f32).sqrt() + 1e-8;
    let std_vec = _mm256_set1_ps(std_dev);

    // Pass 3: normalize in place (vector body + scalar tail).
    i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        let centered = _mm256_sub_ps(vals, mean_vec);
        let normalized = _mm256_div_ps(centered, std_vec);
        _mm256_storeu_ps(data.as_mut_ptr().add(i), normalized);
        i += 8;
    }

    // Handle remaining elements
    for x in &mut data[i..] {
        *x = (*x - mean) / std_dev;
    }
}
|
||||
|
||||
/// Fast bilinear resize using SIMD - optimized for preprocessing
/// This is significantly faster than the image crate's resize for typical OCR sizes
///
/// `src` is a single-channel (one byte per pixel) image of
/// `src_width * src_height` bytes; returns a `dst_width * dst_height`
/// buffer. Dispatches to AVX2 when available, scalar otherwise.
pub fn simd_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    if !simd_enabled() {
        return scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }

    let features = get_features();

    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            // SAFETY: guarded by the runtime AVX2 feature flag above.
            unsafe { avx2_resize_bilinear(src, src_width, src_height, dst_width, dst_height) }
        } else {
            scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height)
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height)
    }
}
|
||||
|
||||
/// Scalar bilinear resize implementation
///
/// Samples the four neighbors around each mapped source coordinate and
/// blends them by the fractional offsets. Edge coordinates are clamped to
/// the last row/column.
fn scalar_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;

    // Map a destination coordinate to (low index, high index, fraction).
    let coords = |d: usize, scale: f32, limit: usize| {
        let pos = d as f32 * scale;
        let lo = (pos.floor() as usize).min(limit - 1);
        let hi = (lo + 1).min(limit - 1);
        (lo, hi, pos - pos.floor())
    };

    let mut dst = vec![0u8; dst_width * dst_height];

    for y in 0..dst_height {
        let (y0, y1, fy) = coords(y, y_scale, src_height);

        for x in 0..dst_width {
            let (x0, x1, fx) = coords(x, x_scale, src_width);

            // Four surrounding source samples.
            let p00 = src[y0 * src_width + x0] as f32;
            let p10 = src[y0 * src_width + x1] as f32;
            let p01 = src[y1 * src_width + x0] as f32;
            let p11 = src[y1 * src_width + x1] as f32;

            // Blend horizontally, then vertically.
            let top = p00 * (1.0 - fx) + p10 * fx;
            let bottom = p01 * (1.0 - fx) + p11 * fx;
            dst[y * dst_width + x] = (top * (1.0 - fy) + bottom * fy).round() as u8;
        }
    }

    dst
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    use std::arch::x86_64::*;

    let mut dst = vec![0u8; dst_width * dst_height];

    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;

    // Process 8 output pixels at a time for x dimension
    for y in 0..dst_height {
        let src_y = y as f32 * y_scale;
        let y0 = (src_y.floor() as usize).min(src_height - 1);
        let y1 = (y0 + 1).min(src_height - 1);
        // NOTE(review): these broadcast fractions are never used — the inner
        // loops recompute the interpolation in scalar form, so this path
        // currently gains nothing over scalar_resize_bilinear.
        let _y_frac = _mm256_set1_ps(src_y - src_y.floor());
        let _y_frac_inv = _mm256_set1_ps(1.0 - (src_y - src_y.floor()));

        let mut x = 0;
        while x + 8 <= dst_width {
            // Calculate source x coordinates for 8 destination pixels
            let src_xs: [f32; 8] = [
                (x) as f32 * x_scale,
                (x + 1) as f32 * x_scale,
                (x + 2) as f32 * x_scale,
                (x + 3) as f32 * x_scale,
                (x + 4) as f32 * x_scale,
                (x + 5) as f32 * x_scale,
                (x + 6) as f32 * x_scale,
                (x + 7) as f32 * x_scale,
            ];

            let mut results = [0u8; 8];
            for i in 0..8 {
                let src_x = src_xs[i];
                let x0 = (src_x.floor() as usize).min(src_width - 1);
                let x1 = (x0 + 1).min(src_width - 1);
                let x_frac = src_x - src_x.floor();

                // SAFETY: x0/x1 and y0/y1 are clamped against the source
                // dimensions above; in-bounds provided
                // src.len() == src_width * src_height — TODO confirm callers
                // guarantee that.
                let p00 = *src.get_unchecked(y0 * src_width + x0) as f32;
                let p10 = *src.get_unchecked(y0 * src_width + x1) as f32;
                let p01 = *src.get_unchecked(y1 * src_width + x0) as f32;
                let p11 = *src.get_unchecked(y1 * src_width + x1) as f32;

                // Horizontal then vertical blend, as in the scalar path.
                let top = p00 * (1.0 - x_frac) + p10 * x_frac;
                let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
                let value =
                    top * (1.0 - (src_y - src_y.floor())) + bottom * (src_y - src_y.floor());
                results[i] = value.round() as u8;
            }

            for i in 0..8 {
                // SAFETY: y < dst_height and x + i < dst_width index within
                // the freshly allocated dst buffer.
                *dst.get_unchecked_mut(y * dst_width + x + i) = results[i];
            }
            x += 8;
        }

        // Handle remaining pixels
        while x < dst_width {
            let src_x = x as f32 * x_scale;
            let x0 = (src_x.floor() as usize).min(src_width - 1);
            let x1 = (x0 + 1).min(src_width - 1);
            let x_frac = src_x - src_x.floor();

            // SAFETY: same clamped-index argument as above.
            let p00 = *src.get_unchecked(y0 * src_width + x0) as f32;
            let p10 = *src.get_unchecked(y0 * src_width + x1) as f32;
            let p01 = *src.get_unchecked(y1 * src_width + x0) as f32;
            let p11 = *src.get_unchecked(y1 * src_width + x1) as f32;

            let top = p00 * (1.0 - x_frac) + p10 * x_frac;
            let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
            let value = top * (1.0 - (src_y - src_y.floor())) + bottom * (src_y - src_y.floor());
            *dst.get_unchecked_mut(y * dst_width + x) = value.round() as u8;
            x += 1;
        }
    }

    dst
}
|
||||
|
||||
/// Parallel SIMD resize for large images - splits work across threads
///
/// Bilinear semantics identical to scalar_resize_bilinear; output rows are
/// computed independently, one per rayon work item.
#[cfg(feature = "rayon")]
pub fn parallel_simd_resize(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    use rayon::prelude::*;

    // For small images, use single-threaded SIMD
    // (heuristic: below this size thread fan-out likely costs more than it
    // saves — tune with profiling).
    if dst_height < 64 || dst_width * dst_height < 100_000 {
        return simd_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }

    let mut dst = vec![0u8; dst_width * dst_height];
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;

    // Process rows in parallel — each chunk is exactly one output row, so
    // every worker writes a disjoint slice with no shared mutable state.
    dst.par_chunks_mut(dst_width)
        .enumerate()
        .for_each(|(y, row)| {
            let src_y = y as f32 * y_scale;
            let y0 = (src_y.floor() as usize).min(src_height - 1);
            let y1 = (y0 + 1).min(src_height - 1);
            let y_frac = src_y - src_y.floor();

            for x in 0..dst_width {
                let src_x = x as f32 * x_scale;
                let x0 = (src_x.floor() as usize).min(src_width - 1);
                let x1 = (x0 + 1).min(src_width - 1);
                let x_frac = src_x - src_x.floor();

                let p00 = src[y0 * src_width + x0] as f32;
                let p10 = src[y0 * src_width + x1] as f32;
                let p01 = src[y1 * src_width + x0] as f32;
                let p11 = src[y1 * src_width + x1] as f32;

                let top = p00 * (1.0 - x_frac) + p10 * x_frac;
                let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
                let value = top * (1.0 - y_frac) + bottom * y_frac;

                row[x] = value.round() as u8;
            }
        });

    dst
}
|
||||
|
||||
/// Ultra-fast area average downscaling for preprocessing
|
||||
/// Best for large images being scaled down significantly
|
||||
pub fn fast_area_resize(
|
||||
src: &[u8],
|
||||
src_width: usize,
|
||||
src_height: usize,
|
||||
dst_width: usize,
|
||||
dst_height: usize,
|
||||
) -> Vec<u8> {
|
||||
// Only use area averaging for downscaling
|
||||
if dst_width >= src_width || dst_height >= src_height {
|
||||
return simd_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
|
||||
}
|
||||
|
||||
let mut dst = vec![0u8; dst_width * dst_height];
|
||||
|
||||
let x_ratio = src_width as f32 / dst_width as f32;
|
||||
let y_ratio = src_height as f32 / dst_height as f32;
|
||||
|
||||
for y in 0..dst_height {
|
||||
let y_start = (y as f32 * y_ratio) as usize;
|
||||
let y_end = (((y + 1) as f32 * y_ratio) as usize).min(src_height);
|
||||
|
||||
for x in 0..dst_width {
|
||||
let x_start = (x as f32 * x_ratio) as usize;
|
||||
let x_end = (((x + 1) as f32 * x_ratio) as usize).min(src_width);
|
||||
|
||||
// Calculate area average
|
||||
let mut sum: u32 = 0;
|
||||
let mut count: u32 = 0;
|
||||
|
||||
for sy in y_start..y_end {
|
||||
for sx in x_start..x_end {
|
||||
sum += src[sy * src_width + sx] as u32;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
dst[y * dst_width + x] = if count > 0 { (sum / count) as u8 } else { 0 };
|
||||
}
|
||||
}
|
||||
|
||||
dst
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_grayscale_conversion() {
        // One RGBA pixel each of red, green, blue, and white.
        let rgba = vec![
            255, 0, 0, 255, // Red
            0, 255, 0, 255, // Green
            0, 0, 255, 255, // Blue
            255, 255, 255, 255, // White
        ];
        let mut gray = vec![0u8; 4];

        simd_grayscale(&rgba, &mut gray);

        // Luma weighting puts green brightest and blue darkest; white is 255.
        assert!((51..=99).contains(&gray[0])); // Red
        assert!((131..=159).contains(&gray[1])); // Green
        assert!((21..=49).contains(&gray[2])); // Blue
        assert_eq!(gray[3], 255); // White
    }

    #[test]
    fn test_threshold() {
        let gray = vec![0, 50, 100, 150, 200, 255];
        let mut out = vec![0u8; gray.len()];

        simd_threshold(&gray, 100, &mut out);

        // Values at or below the threshold map to 0, those above to 255.
        assert_eq!(out, vec![0, 0, 0, 255, 255, 255]);
    }

    #[test]
    fn test_normalize() {
        let mut data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];

        simd_normalize(&mut data);

        // After normalization, mean should be ~0 and std dev ~1.
        let mean = data.iter().sum::<f32>() / data.len() as f32;
        assert!(mean.abs() < 1e-6);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_simd_vs_scalar_grayscale() {
        // 256 RGBA pixels filled with a repeating byte pattern.
        let rgba: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
        let mut gray_simd = vec![0u8; 256];
        let mut gray_scalar = vec![0u8; 256];

        simd_grayscale(&rgba, &mut gray_simd);
        scalar_grayscale(&rgba, &mut gray_scalar);

        // The SIMD path must agree with the scalar reference byte-for-byte.
        assert_eq!(gray_simd, gray_scalar);
    }
}
|
||||
Reference in New Issue
Block a user