Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'

This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,396 @@
//! Dynamic batching for throughput optimization
//!
//! Provides intelligent batching to maximize GPU/CPU utilization while
//! maintaining acceptable latency.
use std::collections::VecDeque;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::{oneshot, Mutex};
use tokio::time::sleep;
/// Item in the batching queue
///
/// Couples a payload with the oneshot channel over which the processing
/// loop delivers its result back to the caller that enqueued it.
pub struct BatchItem<T, R> {
    // Payload handed to the batch processor.
    pub data: T,
    // Completion channel; the batch loop sends exactly one result here.
    pub response: oneshot::Sender<BatchResult<R>>,
    // Enqueue timestamp, used by `stats()` to report the oldest wait time.
    pub enqueued_at: Instant,
}
/// Result of batch processing
pub type BatchResult<T> = std::result::Result<T, BatchError>;

/// Batch processing errors
#[derive(Debug, Clone)]
pub enum BatchError {
    /// The response channel was dropped before a result arrived.
    Timeout,
    /// The processor reported a per-item failure.
    ProcessingFailed(String),
    /// The queue is at `max_queue_size`; the item was rejected.
    QueueFull,
}

impl std::fmt::Display for BatchError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Self::Timeout => f.write_str("Batch processing timeout"),
            Self::QueueFull => f.write_str("Queue is full"),
            Self::ProcessingFailed(msg) => write!(f, "Processing failed: {}", msg),
        }
    }
}

impl std::error::Error for BatchError {}
/// Dynamic batcher configuration
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// Maximum items in a batch
    pub max_batch_size: usize,
    /// Maximum time to wait before processing partial batch
    pub max_wait_ms: u64,
    /// Maximum queue size
    pub max_queue_size: usize,
    /// Minimum batch size to prefer
    // NOTE(review): currently unused by the scheduling logic in
    // `DynamicBatcher::run` — any non-empty queue is flushed after
    // `max_wait_ms` regardless of this value.
    pub preferred_batch_size: usize,
}

impl Default for BatchConfig {
    // Defaults tuned for moderate latency: batches of up to 32 items,
    // flushed at least every 50 ms, back-pressure after 1000 queued items.
    fn default() -> Self {
        Self {
            max_batch_size: 32,
            max_wait_ms: 50,
            max_queue_size: 1000,
            preferred_batch_size: 16,
        }
    }
}
/// Dynamic batcher for throughput optimization
///
/// Collects items into a shared FIFO queue; a separate `run()` task drains
/// it in batches and fans results back out over per-item oneshot channels.
pub struct DynamicBatcher<T, R> {
    config: BatchConfig,
    // FIFO of pending items, shared between `add()` callers and `run()`.
    queue: Arc<Mutex<VecDeque<BatchItem<T, R>>>>,
    // User callback: must return one `Result` per input, in input order.
    processor: Arc<dyn Fn(Vec<T>) -> Vec<std::result::Result<R, String>> + Send + Sync>,
    // Cooperative stop flag polled by `run()`.
    shutdown: Arc<Mutex<bool>>,
}
impl<T, R> DynamicBatcher<T, R>
where
    T: Send + 'static,
    R: Send + 'static,
{
    /// Create new dynamic batcher.
    ///
    /// `processor` must return exactly one result per input item, in the
    /// same order; results are paired back to waiting callers by position.
    pub fn new<F>(config: BatchConfig, processor: F) -> Self
    where
        F: Fn(Vec<T>) -> Vec<std::result::Result<R, String>> + Send + Sync + 'static,
    {
        Self {
            config,
            queue: Arc::new(Mutex::new(VecDeque::new())),
            processor: Arc::new(processor),
            shutdown: Arc::new(Mutex::new(false)),
        }
    }

    /// Add item to batch queue and wait for its result.
    ///
    /// # Errors
    /// - `BatchError::QueueFull` if the queue is at capacity.
    /// - `BatchError::Timeout` if the response channel is dropped (e.g. the
    ///   batcher shut down before this item was processed).
    pub async fn add(&self, item: T) -> BatchResult<R> {
        let (tx, rx) = oneshot::channel();
        let batch_item = BatchItem {
            data: item,
            response: tx,
            enqueued_at: Instant::now(),
        };
        {
            let mut queue = self.queue.lock().await;
            if queue.len() >= self.config.max_queue_size {
                return Err(BatchError::QueueFull);
            }
            queue.push_back(batch_item);
        }
        // Wait for the processing loop to deliver this item's result.
        rx.await.map_err(|_| BatchError::Timeout)?
    }

    /// Start batch processing loop; runs until `shutdown()` is requested,
    /// then drains remaining items once before returning.
    pub async fn run(&self) {
        let mut last_process = Instant::now();
        loop {
            // Check if shutdown requested.
            {
                let shutdown = self.shutdown.lock().await;
                if *shutdown {
                    break;
                }
            }
            // Flush when the batch is full, or when anything has been
            // waiting longer than max_wait_ms since the last flush.
            //
            // FIX: the original also tested `len >= preferred_batch_size &&
            // elapsed >= max_wait_ms`, which is subsumed by the
            // `!is_empty() && elapsed >= max_wait_ms` clause and was dead
            // code; `preferred_batch_size` plays no role in scheduling.
            let should_process = {
                let queue = self.queue.lock().await;
                queue.len() >= self.config.max_batch_size
                    || (!queue.is_empty()
                        && last_process.elapsed().as_millis() >= self.config.max_wait_ms as u128)
            };
            if should_process {
                self.process_batch().await;
                last_process = Instant::now();
            } else {
                // Sleep briefly to avoid busy waiting.
                sleep(Duration::from_millis(1)).await;
            }
        }
        // Process remaining items before shutdown.
        self.process_batch().await;
    }

    /// Drain up to `max_batch_size` items and run them through the processor.
    async fn process_batch(&self) {
        let items = {
            let mut queue = self.queue.lock().await;
            let batch_size = self.config.max_batch_size.min(queue.len());
            if batch_size == 0 {
                return;
            }
            queue.drain(..batch_size).collect::<Vec<_>>()
        };
        // (The redundant `items.is_empty()` re-check was removed: the
        // batch_size == 0 early return above already guarantees non-empty.)
        // Split payloads from their response channels, preserving order.
        let (data, responses): (Vec<_>, Vec<_>) = items
            .into_iter()
            .map(|item| (item.data, item.response))
            .unzip();
        // Process batch.
        let results = (self.processor)(data);
        // Pair results back to callers by position; a dropped receiver
        // (caller gave up) is ignored.
        for (response_tx, result) in responses.into_iter().zip(results) {
            let _ = response_tx.send(result.map_err(BatchError::ProcessingFailed));
        }
    }

    /// Gracefully shutdown the batcher; `run()` exits after a final drain.
    pub async fn shutdown(&self) {
        let mut shutdown = self.shutdown.lock().await;
        *shutdown = true;
    }

    /// Get current queue size.
    pub async fn queue_size(&self) -> usize {
        self.queue.lock().await.len()
    }

    /// Get current queue statistics.
    pub async fn stats(&self) -> BatchStats {
        let queue = self.queue.lock().await;
        BatchStats {
            queue_size: queue.len(),
            // FIFO queue: front() is the oldest item, hence the longest wait.
            max_wait_time: queue
                .front()
                .map(|item| item.enqueued_at.elapsed())
                .unwrap_or(Duration::from_secs(0)),
        }
    }
}
/// Batch statistics
#[derive(Debug, Clone)]
pub struct BatchStats {
    // Number of items currently waiting in the queue.
    pub queue_size: usize,
    // Elapsed wait of the oldest queued item (zero when the queue is empty).
    pub max_wait_time: Duration,
}
/// Adaptive batcher that adjusts batch size based on latency
pub struct AdaptiveBatcher<T, R> {
    inner: DynamicBatcher<T, R>,
    // Adapted configuration. NOTE(review): this is a separate copy from the
    // config held inside `inner`; see `add()` for the implications.
    config: Arc<Mutex<BatchConfig>>,
    // Sliding window of the last 100 end-to-end request latencies.
    latency_history: Arc<Mutex<VecDeque<Duration>>>,
    // Latency target the adaptation steers toward.
    target_latency: Duration,
}
impl<T, R> AdaptiveBatcher<T, R>
where
    T: Send + 'static,
    R: Send + 'static,
{
    /// Create adaptive batcher with target latency
    pub fn new<F>(initial_config: BatchConfig, target_latency: Duration, processor: F) -> Self
    where
        F: Fn(Vec<T>) -> Vec<Result<R, String>> + Send + Sync + 'static,
    {
        // NOTE(review): `initial_config` is cloned into `config` AND moved
        // into the inner batcher; from here on the two copies are
        // independent. The adaptation below mutates only the `config` copy.
        let config = Arc::new(Mutex::new(initial_config.clone()));
        let inner = DynamicBatcher::new(initial_config, processor);
        Self {
            inner,
            config,
            latency_history: Arc::new(Mutex::new(VecDeque::with_capacity(100))),
            target_latency,
        }
    }
    /// Add item and adapt batch size
    ///
    /// Measures end-to-end latency of this request; when the rolling average
    /// exceeds the target, `max_batch_size` is nudged down (floor 1), and
    /// when it is under half the target, up (cap 128).
    ///
    /// NOTE(review): the adapted `max_batch_size` is written only to
    /// `self.config`, which the inner `DynamicBatcher` never reads — so the
    /// adaptation currently has no effect on actual batching. Confirm
    /// intent; sharing one config between the two would make it effective.
    pub async fn add(&self, item: T) -> Result<R, BatchError> {
        let start = Instant::now();
        let result = self.inner.add(item).await;
        let latency = start.elapsed();
        // Record latency in the bounded (100-entry) sliding window.
        {
            let mut history = self.latency_history.lock().await;
            history.push_back(latency);
            if history.len() > 100 {
                history.pop_front();
            }
        }
        // Adapt batch size every 10 requests.
        // (Once the window is pinned at 100 entries, this condition holds on
        // every request, so adaptation then runs each time.)
        {
            let history = self.latency_history.lock().await;
            if history.len() % 10 == 0 && history.len() >= 10 {
                let avg_latency: Duration = history.iter().sum::<Duration>() / history.len() as u32;
                let mut config = self.config.lock().await;
                if avg_latency > self.target_latency {
                    // Reduce batch size to lower latency
                    config.max_batch_size = (config.max_batch_size * 9 / 10).max(1);
                } else if avg_latency < self.target_latency / 2 {
                    // Increase batch size for better throughput
                    config.max_batch_size = (config.max_batch_size * 11 / 10).min(128);
                }
            }
        }
        result
    }
    /// Run the batcher
    pub async fn run(&self) {
        self.inner.run().await;
    }
    /// Get current configuration
    pub async fn current_config(&self) -> BatchConfig {
        self.config.lock().await.clone()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_dynamic_batcher() {
        let config = BatchConfig {
            max_batch_size: 4,
            max_wait_ms: 100,
            max_queue_size: 100,
            preferred_batch_size: 2,
        };
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(|x| Ok(x * 2)).collect()
        }));
        // Start processing loop.
        let batcher_clone = batcher.clone();
        tokio::spawn(async move {
            batcher_clone.run().await;
        });
        // Add items concurrently.
        let mut handles = vec![];
        for i in 0..8 {
            let batcher = batcher.clone();
            handles.push(tokio::spawn(async move { batcher.add(i).await }));
        }
        // Wait for results (order matches spawn order).
        for (i, handle) in handles.into_iter().enumerate() {
            let result = handle.await.unwrap().unwrap();
            assert_eq!(result, (i as i32) * 2);
        }
        batcher.shutdown().await;
    }

    #[tokio::test]
    async fn test_batch_stats() {
        let config = BatchConfig::default();
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(Ok).collect()
        }));
        // BUG FIX: the original wrote `let _ = batcher.add(n);` — futures are
        // lazy, so nothing was ever enqueued and the assertion could not
        // hold. Spawn the adds so they enqueue, then block waiting for a
        // response that never comes (no run loop is started).
        for i in 1..=3 {
            let b = batcher.clone();
            tokio::spawn(async move {
                let _ = b.add(i).await;
            });
        }
        // Give the spawned tasks time to reach the queue.
        tokio::time::sleep(Duration::from_millis(50)).await;
        let stats = batcher.stats().await;
        assert_eq!(stats.queue_size, 3);
    }

    #[tokio::test]
    async fn test_queue_full() {
        let config = BatchConfig {
            max_queue_size: 2,
            ..Default::default()
        };
        let batcher = Arc::new(DynamicBatcher::new(config, |items: Vec<i32>| {
            items.into_iter().map(Ok).collect()
        }));
        // BUG FIX: the original's unpolled `add()` futures never filled the
        // queue, and the final `add(3).await` then hung forever waiting on a
        // response with no run loop. Fill the queue via spawned tasks.
        for i in 0..2 {
            let b = batcher.clone();
            tokio::spawn(async move {
                let _ = b.add(i).await;
            });
        }
        tokio::time::sleep(Duration::from_millis(50)).await;
        // Queue now holds 2 items and nothing drains it, so this add must be
        // rejected immediately.
        let result = batcher.add(3).await;
        assert!(matches!(result, Err(BatchError::QueueFull)));
    }

    #[tokio::test]
    async fn test_adaptive_batcher() {
        let config = BatchConfig {
            max_batch_size: 8,
            max_wait_ms: 50,
            max_queue_size: 100,
            preferred_batch_size: 4,
        };
        let batcher = Arc::new(AdaptiveBatcher::new(
            config,
            Duration::from_millis(100),
            |items: Vec<i32>| items.into_iter().map(|x| Ok(x * 2)).collect(),
        ));
        let batcher_clone = batcher.clone();
        tokio::spawn(async move {
            batcher_clone.run().await;
        });
        // Process some requests.
        for i in 0..20 {
            let result = batcher.add(i).await.unwrap();
            assert_eq!(result, i * 2);
        }
        // Configuration should have adapted (and never hit an invalid size).
        let final_config = batcher.current_config().await;
        assert!(final_config.max_batch_size > 0);
    }
}

View File

@@ -0,0 +1,409 @@
//! Memory optimization utilities
//!
//! Provides object pooling, memory-mapped file loading, and zero-copy operations.
use memmap2::{Mmap, MmapOptions};
use std::collections::VecDeque;
use std::fs::File;
use std::path::Path;
use std::sync::{Arc, Mutex};
use super::memory_opt_enabled;
use crate::error::{Result, ScipixError};
/// Object pool for reusable buffers
///
/// Hands out `PooledBuffer` RAII guards; while memory optimizations are
/// enabled, dropped guards return their buffer to this pool for reuse.
pub struct BufferPool<T> {
    // Shared free list; also referenced by every outstanding PooledBuffer.
    pool: Arc<Mutex<VecDeque<T>>>,
    // Creates a fresh buffer when the pool is empty.
    factory: Arc<dyn Fn() -> T + Send + Sync>,
    // NOTE(review): never enforced — returned buffers are pushed back
    // unconditionally (see `Drop for PooledBuffer`), so the pool can grow
    // past this bound.
    #[allow(dead_code)]
    max_size: usize,
}
impl<T: Send + 'static> BufferPool<T> {
    /// Create a new buffer pool, pre-filling it with `initial_size` buffers
    /// when memory optimizations are enabled.
    pub fn new<F>(factory: F, initial_size: usize, max_size: usize) -> Self
    where
        F: Fn() -> T + Send + Sync + 'static,
    {
        let factory: Arc<dyn Fn() -> T + Send + Sync> = Arc::new(factory);
        let pool = Arc::new(Mutex::new(VecDeque::with_capacity(max_size)));
        if memory_opt_enabled() {
            // Warm the pool up front so early acquires skip the factory.
            let mut guard = pool.lock().unwrap();
            guard.extend((0..initial_size).map(|_| factory()));
        }
        Self {
            pool,
            factory,
            max_size,
        }
    }

    /// Acquire a buffer, reusing a pooled one when possible.
    pub fn acquire(&self) -> PooledBuffer<T> {
        let buffer = if memory_opt_enabled() {
            let recycled = self.pool.lock().unwrap().pop_front();
            recycled.unwrap_or_else(|| (self.factory)())
        } else {
            // Optimizations off: always construct a fresh buffer.
            (self.factory)()
        };
        PooledBuffer {
            buffer: Some(buffer),
            pool: Arc::clone(&self.pool),
        }
    }

    /// Number of idle buffers currently held by the pool.
    pub fn size(&self) -> usize {
        self.pool.lock().unwrap().len()
    }

    /// Drop all idle buffers.
    pub fn clear(&self) {
        self.pool.lock().unwrap().clear();
    }
}
/// RAII guard for pooled buffers
pub struct PooledBuffer<T> {
    // Always `Some` until Drop takes the buffer back.
    buffer: Option<T>,
    // Pool this buffer is returned to on drop.
    pool: Arc<Mutex<VecDeque<T>>>,
}

impl<T> PooledBuffer<T> {
    /// Get mutable reference to buffer
    pub fn get_mut(&mut self) -> &mut T {
        // Invariant: `buffer` is only None after Drop has run.
        self.buffer.as_mut().unwrap()
    }
    /// Get immutable reference to buffer
    pub fn get(&self) -> &T {
        self.buffer.as_ref().unwrap()
    }
}
// Return the buffer to the pool on drop (only while memory optimizations
// are enabled; otherwise the buffer is simply freed).
//
// NOTE(review): this pushes back unconditionally — `BufferPool::max_size`
// is never checked here, so a burst of concurrent acquires can grow the
// pool beyond its nominal cap permanently. Confirm whether the cap should
// be enforced on return.
impl<T> Drop for PooledBuffer<T> {
    fn drop(&mut self) {
        if memory_opt_enabled() {
            if let Some(buffer) = self.buffer.take() {
                let mut pool = self.pool.lock().unwrap();
                pool.push_back(buffer);
            }
        }
    }
}
// Deref/DerefMut let a PooledBuffer be used directly as its inner buffer
// (e.g. calling Vec methods on a pooled Vec<u8>).
impl<T> std::ops::Deref for PooledBuffer<T> {
    type Target = T;
    fn deref(&self) -> &Self::Target {
        self.buffer.as_ref().unwrap()
    }
}

impl<T> std::ops::DerefMut for PooledBuffer<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.buffer.as_mut().unwrap()
    }
}
/// Memory-mapped model file
pub struct MmapModel {
    // Keeps the mapping alive; `data`/`len` are a cached view into it.
    _mmap: Mmap,
    data: *const u8,
    len: usize,
}

// SAFETY: the mapping is created read-only via `MmapOptions::map` and never
// written through this type, and `_mmap` owns the region for the struct's
// entire lifetime, so concurrent reads from multiple threads cannot race.
// NOTE(review): as with any file-backed mmap, external truncation of the
// file can still fault reads — confirm callers control the file's lifetime.
unsafe impl Send for MmapModel {}
unsafe impl Sync for MmapModel {}
impl MmapModel {
    /// Load model from file using memory mapping.
    ///
    /// # Errors
    /// Returns `ScipixError::Io` if the file cannot be opened or mapped.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path.as_ref()).map_err(ScipixError::Io)?;
        // SAFETY: mapping a file we just opened; the resulting `Mmap` is
        // stored in the struct, so the region outlives all views of it.
        let mmap = unsafe { MmapOptions::new().map(&file).map_err(ScipixError::Io)? };
        let data = mmap.as_ptr();
        let len = mmap.len();
        Ok(Self {
            _mmap: mmap,
            data,
            len,
        })
    }

    /// Get slice of model data.
    // FIX: borrow through the owned `Mmap` (which derefs to `[u8]`) instead
    // of rebuilding the slice from the raw pointer — same bytes, no unsafe.
    pub fn as_slice(&self) -> &[u8] {
        &self._mmap[..]
    }

    /// Get size of mapped region.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Check if empty.
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }
}
/// Zero-copy image view
///
/// Borrowed view over interleaved 8-bit pixel data in row-major order.
pub struct ImageView<'a> {
    data: &'a [u8],
    width: u32,
    height: u32,
    // Bytes per pixel (e.g. 4 for RGBA); pixels are one byte per channel.
    channels: u8,
}
impl<'a> ImageView<'a> {
    /// Create new image view from raw data.
    ///
    /// # Errors
    /// Returns `ScipixError::InvalidInput` if `data.len()` does not equal
    /// `width * height * channels`, or if that product overflows.
    pub fn new(data: &'a [u8], width: u32, height: u32, channels: u8) -> Result<Self> {
        // FIX: compute the expected length with checked usize arithmetic —
        // the original multiplied in u32, which can overflow for large
        // dimensions (wrapping silently in release builds).
        let expected_len = (width as usize)
            .checked_mul(height as usize)
            .and_then(|n| n.checked_mul(channels as usize))
            .ok_or_else(|| ScipixError::InvalidInput("Image dimensions overflow".to_string()))?;
        if data.len() != expected_len {
            return Err(ScipixError::InvalidInput(format!(
                "Invalid data length: expected {}, got {}",
                expected_len,
                data.len()
            )));
        }
        Ok(Self {
            data,
            width,
            height,
            channels,
        })
    }

    /// Get pixel at (x, y).
    ///
    /// # Panics
    /// Panics (via slice indexing) when (x, y) lies outside the image.
    pub fn pixel(&self, x: u32, y: u32) -> &[u8] {
        let offset = ((y * self.width + x) * self.channels as u32) as usize;
        &self.data[offset..offset + self.channels as usize]
    }

    /// Get raw data slice.
    pub fn data(&self) -> &[u8] {
        self.data
    }

    /// Get dimensions.
    pub fn dimensions(&self) -> (u32, u32) {
        (self.width, self.height)
    }

    /// Get number of channels.
    pub fn channels(&self) -> u8 {
        self.channels
    }

    /// Create subview (region of interest).
    ///
    /// # Errors
    /// Returns `ScipixError::InvalidInput` when the region exceeds the image.
    pub fn subview(&self, x: u32, y: u32, width: u32, height: u32) -> Result<Self> {
        // FIX: check bounds in u64 so `x + width` cannot overflow u32 (which
        // panicked in debug builds and wrapped — passing the check — in
        // release builds).
        if x as u64 + width as u64 > self.width as u64
            || y as u64 + height as u64 > self.height as u64
        {
            return Err(ScipixError::InvalidInput(
                "Subview out of bounds".to_string(),
            ));
        }
        // For simplicity, this creates a copy. True zero-copy would need stride support.
        let mut subview_data = Vec::new();
        for row in y..y + height {
            let start = ((row * self.width + x) * self.channels as u32) as usize;
            let end = start + (width * self.channels as u32) as usize;
            subview_data.extend_from_slice(&self.data[start..end]);
        }
        // FIXME: Box::leak makes every subview a permanent allocation — this
        // leaks memory on each call. Replace with an arena allocator or an
        // owned image type in production.
        let leaked = Box::leak(subview_data.into_boxed_slice());
        Ok(Self {
            data: leaked,
            width,
            height,
            channels: self.channels,
        })
    }
}
/// Arena allocator for temporary allocations
///
/// Bump allocator over a single growable byte buffer: `alloc` hands out
/// zero-initialized, aligned slices; `reset` reclaims everything at once
/// while keeping the backing capacity.
pub struct Arena {
    buffer: Vec<u8>,
    // Bump pointer: next free byte within `buffer`.
    offset: usize,
}

impl Arena {
    /// Create new arena with capacity.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            buffer: Vec::with_capacity(capacity),
            offset: 0,
        }
    }

    /// Allocate an aligned, zero-initialized slice of `size` bytes.
    ///
    /// # Panics
    /// Panics if `align` is zero (modulo by zero, as before).
    pub fn alloc(&mut self, size: usize, align: usize) -> &mut [u8] {
        // Round the bump pointer up to the requested alignment.
        let padding = (align - (self.offset % align)) % align;
        let start = self.offset + padding;
        let end = start + size;
        if end > self.buffer.len() {
            // FIX: the original used `reserve` + unsafe `set_len`, which
            // handed out uninitialized memory — undefined behavior to read.
            // `resize` grows and zero-fills instead.
            self.buffer.resize(end, 0);
        }
        self.offset = end;
        &mut self.buffer[start..end]
    }

    /// Reset arena (keeps capacity).
    pub fn reset(&mut self) {
        self.offset = 0;
        self.buffer.clear();
    }

    /// Get current usage (bytes handed out, including alignment padding).
    pub fn usage(&self) -> usize {
        self.offset
    }

    /// Get capacity of the backing buffer.
    pub fn capacity(&self) -> usize {
        self.buffer.capacity()
    }
}
/// Global buffer pools for common sizes
pub struct GlobalPools {
    small: BufferPool<Vec<u8>>,  // 1KB buffers
    medium: BufferPool<Vec<u8>>, // 64KB buffers
    large: BufferPool<Vec<u8>>,  // 1MB buffers
}

impl GlobalPools {
    // Pool depths scale inversely with buffer size: small buffers are cheap
    // to keep around, large ones are capped tightly.
    fn new() -> Self {
        Self {
            small: BufferPool::new(|| Vec::with_capacity(1024), 10, 100),
            medium: BufferPool::new(|| Vec::with_capacity(64 * 1024), 5, 50),
            large: BufferPool::new(|| Vec::with_capacity(1024 * 1024), 2, 20),
        }
    }
    /// Get the global pools instance (lazily created, process-wide).
    pub fn get() -> &'static Self {
        static POOLS: std::sync::OnceLock<GlobalPools> = std::sync::OnceLock::new();
        POOLS.get_or_init(GlobalPools::new)
    }
    /// Acquire small buffer (1KB)
    pub fn acquire_small(&self) -> PooledBuffer<Vec<u8>> {
        self.small.acquire()
    }
    /// Acquire medium buffer (64KB)
    pub fn acquire_medium(&self) -> PooledBuffer<Vec<u8>> {
        self.medium.acquire()
    }
    /// Acquire large buffer (1MB)
    pub fn acquire_large(&self) -> PooledBuffer<Vec<u8>> {
        self.large.acquire()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    #[test]
    fn test_buffer_pool() {
        let pool = BufferPool::new(|| Vec::with_capacity(1024), 2, 10);
        assert_eq!(pool.size(), 2);
        let mut buf1 = pool.acquire();
        assert_eq!(buf1.capacity(), 1024);
        buf1.extend_from_slice(b"test");
        drop(buf1);
        // BUG FIX: the original asserted 3 here, but acquire() removed one of
        // the two pre-allocated buffers and drop() merely returned it, so the
        // pool is back at 2 — it cannot net-gain a buffer from a round trip.
        assert_eq!(pool.size(), 2); // returned to pool
    }

    #[test]
    fn test_mmap_model() {
        let mut temp = NamedTempFile::new().unwrap();
        temp.write_all(b"test model data").unwrap();
        temp.flush().unwrap();
        let mmap = MmapModel::from_file(temp.path()).unwrap();
        assert_eq!(mmap.as_slice(), b"test model data");
        assert_eq!(mmap.len(), 15);
    }

    #[test]
    fn test_image_view() {
        // 2x2 RGBA image, one pixel per row below.
        let data = vec![
            255, 0, 0, 255, // Red pixel
            0, 255, 0, 255, // Green pixel
            0, 0, 255, 255, // Blue pixel
            255, 255, 255, 255, // White pixel
        ];
        let view = ImageView::new(&data, 2, 2, 4).unwrap();
        assert_eq!(view.dimensions(), (2, 2));
        assert_eq!(view.pixel(0, 0), &[255, 0, 0, 255]);
        assert_eq!(view.pixel(1, 1), &[255, 255, 255, 255]);
    }

    #[test]
    fn test_arena() {
        let mut arena = Arena::with_capacity(1024);
        let slice1 = arena.alloc(100, 8);
        assert_eq!(slice1.len(), 100);
        let slice2 = arena.alloc(200, 8);
        assert_eq!(slice2.len(), 200);
        // Usage includes alignment padding, so >= the raw byte total.
        assert!(arena.usage() >= 300);
        arena.reset();
        assert_eq!(arena.usage(), 0);
    }

    #[test]
    fn test_global_pools() {
        let pools = GlobalPools::get();
        let small = pools.acquire_small();
        assert!(small.capacity() >= 1024);
        let medium = pools.acquire_medium();
        assert!(medium.capacity() >= 64 * 1024);
        let large = pools.acquire_large();
        assert!(large.capacity() >= 1024 * 1024);
    }
}

View File

@@ -0,0 +1,169 @@
//! Performance optimization utilities for scipix OCR
//!
//! This module provides runtime feature detection and optimized code paths
//! for different CPU architectures and capabilities.
pub mod batch;
pub mod memory;
pub mod parallel;
pub mod quantize;
pub mod simd;
use std::sync::OnceLock;
/// CPU features detected at runtime
///
/// Flags for features that do not exist on the current architecture are
/// always `false` (e.g. `neon` on x86_64).
#[derive(Debug, Clone, Copy)]
pub struct CpuFeatures {
    pub avx2: bool,
    pub avx512f: bool,
    pub neon: bool,
    pub sse4_2: bool,
}
// Detection result is cached here after the first call.
static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();

/// Detect CPU features at runtime
///
/// Probes once and caches the result; subsequent calls are a cheap read.
pub fn detect_features() -> CpuFeatures {
    *CPU_FEATURES.get_or_init(|| {
        #[cfg(target_arch = "x86_64")]
        {
            CpuFeatures {
                avx2: is_x86_feature_detected!("avx2"),
                avx512f: is_x86_feature_detected!("avx512f"),
                neon: false, // ARM-only feature
                sse4_2: is_x86_feature_detected!("sse4.2"),
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            CpuFeatures {
                avx2: false,
                avx512f: false,
                neon: std::arch::is_aarch64_feature_detected!("neon"),
                sse4_2: false,
            }
        }
        // Unknown architectures: report no SIMD capabilities at all.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            CpuFeatures {
                avx2: false,
                avx512f: false,
                neon: false,
                sse4_2: false,
            }
        }
    })
}

/// Get the detected CPU features
pub fn get_features() -> CpuFeatures {
    detect_features()
}
/// Runtime dispatch to optimized implementation
pub trait OptimizedOp<T> {
    /// Execute the operation with the best available implementation
    fn execute(&self, input: T) -> T;
    /// Execute with SIMD if available, fallback to scalar
    ///
    /// Default strategy: take the SIMD path when any SIMD extension (AVX2,
    /// AVX-512F, or NEON) was detected. Note this consults CPU detection
    /// only — not the global `OptLevel` / `simd_enabled()` setting.
    fn execute_auto(&self, input: T) -> T {
        let features = get_features();
        if features.avx2 || features.avx512f || features.neon {
            self.execute_simd(input)
        } else {
            self.execute_scalar(input)
        }
    }
    /// SIMD implementation
    fn execute_simd(&self, input: T) -> T;
    /// Scalar fallback implementation
    fn execute_scalar(&self, input: T) -> T;
}
/// Optimization level configuration
// MODERNIZED: `Default` is now derived via the `#[default]` variant
// attribute (stable since Rust 1.62; the file already uses OnceLock-era
// Rust) instead of a hand-written `impl Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OptLevel {
    /// No optimizations, scalar code only
    None,
    /// Use SIMD when available
    Simd,
    /// Use SIMD + parallel processing
    Parallel,
    /// All optimizations including memory optimizations
    #[default]
    Full,
}

/// Global optimization configuration (write-once).
static OPT_LEVEL: OnceLock<OptLevel> = OnceLock::new();

/// Set the optimization level.
///
/// Silently has no effect if the level was already set, or if any reader
/// already initialized it to the default via `get_opt_level`.
pub fn set_opt_level(level: OptLevel) {
    OPT_LEVEL.set(level).ok();
}

/// Get the current optimization level, initializing to `Full` on first read.
pub fn get_opt_level() -> OptLevel {
    *OPT_LEVEL.get_or_init(OptLevel::default)
}

/// Check if SIMD optimizations are enabled
pub fn simd_enabled() -> bool {
    matches!(
        get_opt_level(),
        OptLevel::Simd | OptLevel::Parallel | OptLevel::Full
    )
}

/// Check if parallel optimizations are enabled
pub fn parallel_enabled() -> bool {
    matches!(get_opt_level(), OptLevel::Parallel | OptLevel::Full)
}

/// Check if memory optimizations are enabled
pub fn memory_opt_enabled() -> bool {
    matches!(get_opt_level(), OptLevel::Full)
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_feature_detection() {
        let features = detect_features();
        println!("Detected features: {:?}", features);
        // Should always succeed on any platform
        // (the condition is a tautology: either some flag is set or none are;
        // the test's real value is that detection does not panic).
        assert!(
            features.avx2
                || features.avx512f
                || features.neon
                || features.sse4_2
                || (!features.avx2 && !features.avx512f && !features.neon && !features.sse4_2)
        );
    }
    #[test]
    fn test_opt_level() {
        // First read initializes the OnceLock to the default (Full).
        assert_eq!(get_opt_level(), OptLevel::Full);
        set_opt_level(OptLevel::Simd);
        // Can't change after first init, should still be Full
        assert_eq!(get_opt_level(), OptLevel::Full);
    }
    #[test]
    fn test_optimization_checks() {
        // NOTE(review): relies on OPT_LEVEL initializing to Full; tests
        // share the process-wide global, so a test elsewhere calling
        // set_opt_level before the first get would break these.
        assert!(simd_enabled());
        assert!(parallel_enabled());
        assert!(memory_opt_enabled());
    }
}

View File

@@ -0,0 +1,335 @@
//! Parallel processing utilities for OCR pipeline
//!
//! Provides parallel image preprocessing, batch OCR, and pipelined execution.
use image::DynamicImage;
use rayon::prelude::*;
use std::sync::Arc;
use tokio::sync::Semaphore;
use super::parallel_enabled;
/// Parallel preprocessing of multiple images
///
/// Maps `preprocess_fn` over every image — on rayon's work-stealing pool
/// when parallel optimizations are enabled, sequentially otherwise.
pub fn parallel_preprocess<F>(images: Vec<DynamicImage>, preprocess_fn: F) -> Vec<DynamicImage>
where
    F: Fn(DynamicImage) -> DynamicImage + Sync + Send,
{
    if parallel_enabled() {
        images.into_par_iter().map(preprocess_fn).collect()
    } else {
        images.into_iter().map(preprocess_fn).collect()
    }
}
/// Parallel processing with error handling
///
/// Like `parallel_preprocess`, but each image may fail independently; the
/// output carries one `Result` per input, in input order.
pub fn parallel_preprocess_result<F, E>(
    images: Vec<DynamicImage>,
    preprocess_fn: F,
) -> Vec<std::result::Result<DynamicImage, E>>
where
    F: Fn(DynamicImage) -> std::result::Result<DynamicImage, E> + Sync + Send,
    E: Send,
{
    if parallel_enabled() {
        images.into_par_iter().map(preprocess_fn).collect()
    } else {
        images.into_iter().map(preprocess_fn).collect()
    }
}
/// Pipeline parallel execution for OCR workflow
///
/// Executes stages in a pipeline: preprocess | detect | recognize
/// Each stage can start processing the next item while previous stages
/// continue with subsequent items.
pub struct PipelineExecutor<T, U, V> {
    stage1: Arc<dyn Fn(T) -> U + Send + Sync>,
    stage2: Arc<dyn Fn(U) -> V + Send + Sync>,
}

impl<T, U, V> PipelineExecutor<T, U, V>
where
    T: Send,
    U: Send,
    V: Send,
{
    /// Build a two-stage pipeline from the stage functions.
    pub fn new<F1, F2>(stage1: F1, stage2: F2) -> Self
    where
        F1: Fn(T) -> U + Send + Sync + 'static,
        F2: Fn(U) -> V + Send + Sync + 'static,
    {
        Self {
            stage1: Arc::new(stage1),
            stage2: Arc::new(stage2),
        }
    }

    /// Run every input through stage1 then stage2, in parallel when
    /// parallel optimizations are enabled.
    pub fn execute_batch(&self, inputs: Vec<T>) -> Vec<V> {
        // Compose both stages into a single per-item function.
        let run = |input: T| (self.stage2)((self.stage1)(input));
        if parallel_enabled() {
            inputs.into_par_iter().map(run).collect()
        } else {
            inputs.into_iter().map(run).collect()
        }
    }
}
/// Three-stage pipeline executor
pub struct Pipeline3<T, U, V, W> {
    stage1: Arc<dyn Fn(T) -> U + Send + Sync>,
    stage2: Arc<dyn Fn(U) -> V + Send + Sync>,
    stage3: Arc<dyn Fn(V) -> W + Send + Sync>,
}

impl<T, U, V, W> Pipeline3<T, U, V, W>
where
    T: Send,
    U: Send,
    V: Send,
    W: Send,
{
    /// Build a three-stage pipeline from the stage functions.
    pub fn new<F1, F2, F3>(stage1: F1, stage2: F2, stage3: F3) -> Self
    where
        F1: Fn(T) -> U + Send + Sync + 'static,
        F2: Fn(U) -> V + Send + Sync + 'static,
        F3: Fn(V) -> W + Send + Sync + 'static,
    {
        Self {
            stage1: Arc::new(stage1),
            stage2: Arc::new(stage2),
            stage3: Arc::new(stage3),
        }
    }

    /// Run every input through stage1 -> stage2 -> stage3, in parallel
    /// when parallel optimizations are enabled.
    pub fn execute_batch(&self, inputs: Vec<T>) -> Vec<W> {
        // Compose the three stages into a single per-item function.
        let run = |input: T| (self.stage3)((self.stage2)((self.stage1)(input)));
        if parallel_enabled() {
            inputs.into_par_iter().map(run).collect()
        } else {
            inputs.into_iter().map(run).collect()
        }
    }
}
/// Parallel map with configurable chunk size
///
/// `chunk_size` sets rayon's minimum number of items handled per work unit,
/// bounding scheduling overhead when per-item work is cheap.
pub fn parallel_map_chunked<T, U, F>(items: Vec<T>, chunk_size: usize, map_fn: F) -> Vec<U>
where
    T: Send,
    U: Send,
    F: Fn(T) -> U + Sync + Send,
{
    if parallel_enabled() {
        items
            .into_par_iter()
            .with_min_len(chunk_size)
            .map(map_fn)
            .collect()
    } else {
        items.into_iter().map(map_fn).collect()
    }
}
/// Async parallel executor with concurrency limit
///
/// Spawns one tokio task per input but gates task *start* behind a
/// semaphore, so at most `max_concurrent` tasks run at once.
pub struct AsyncParallelExecutor {
    semaphore: Arc<Semaphore>,
}

impl AsyncParallelExecutor {
    /// Create executor with maximum concurrency limit
    pub fn new(max_concurrent: usize) -> Self {
        Self {
            semaphore: Arc::new(Semaphore::new(max_concurrent)),
        }
    }
    /// Execute async tasks with concurrency limit
    ///
    /// Results are collected in input order. Tasks that panic are silently
    /// omitted, so the output can be shorter than `tasks`.
    pub async fn execute<T, F, Fut>(&self, tasks: Vec<T>, executor: F) -> Vec<Fut::Output>
    where
        T: Send + 'static,
        F: Fn(T) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future + Send + 'static,
        Fut::Output: Send + 'static,
    {
        let mut handles = Vec::new();
        for task in tasks {
            // Acquire before spawning: this loop itself stalls once
            // max_concurrent permits are out, bounding in-flight tasks.
            // unwrap(): the semaphore is never closed, so acquire can't fail.
            let permit = self.semaphore.clone().acquire_owned().await.unwrap();
            let executor = executor.clone();
            let handle = tokio::spawn(async move {
                let result = executor(task).await;
                drop(permit); // Release semaphore
                result
            });
            handles.push(handle);
        }
        // Wait for all tasks to complete
        let mut results = Vec::new();
        for handle in handles {
            if let Ok(result) = handle.await {
                results.push(result);
            }
        }
        results
    }
    /// Execute with error handling
    ///
    /// Same scheduling as `execute`, for fallible task functions: per-task
    /// `Err` values are kept in the output, but panicked tasks are dropped
    /// entirely.
    pub async fn execute_result<T, F, Fut, R, E>(
        &self,
        tasks: Vec<T>,
        executor: F,
    ) -> Vec<std::result::Result<R, E>>
    where
        T: Send + 'static,
        F: Fn(T) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future<Output = std::result::Result<R, E>> + Send + 'static,
        R: Send + 'static,
        E: Send + 'static,
    {
        let mut handles = Vec::new();
        for task in tasks {
            let permit = self.semaphore.clone().acquire_owned().await.unwrap();
            let executor = executor.clone();
            let handle = tokio::spawn(async move {
                let result = executor(task).await;
                drop(permit);
                result
            });
            handles.push(handle);
        }
        let mut results = Vec::new();
        for handle in handles {
            match handle.await {
                Ok(result) => results.push(result),
                Err(_) => continue, // Task panicked
            }
        }
        results
    }
}
/// Work-stealing parallel iterator for unbalanced workloads
///
/// Equivalent to `parallel_map_chunked` with a minimum chunk of 1: every
/// item is its own work unit, so idle threads can steal even a single
/// expensive item.
pub fn parallel_unbalanced<T, U, F>(items: Vec<T>, map_fn: F) -> Vec<U>
where
    T: Send,
    U: Send,
    F: Fn(T) -> U + Sync + Send,
{
    if parallel_enabled() {
        // Minimum chunk of 1 allows fine-grained work stealing.
        items.into_par_iter().with_min_len(1).map(map_fn).collect()
    } else {
        items.into_iter().map(map_fn).collect()
    }
}
/// Get optimal thread count for current system
///
/// NOTE(review): this reports the size of rayon's *current* pool (which
/// defaults to the logical CPU count), not an independent hardware query —
/// after `set_thread_count`, it reflects the configured value instead.
pub fn optimal_thread_count() -> usize {
    rayon::current_num_threads()
}
/// Set global thread pool size
///
/// Silently does nothing if the global rayon pool was already initialized:
/// `build_global` can succeed only once per process, and the error is
/// discarded with `.ok()`.
pub fn set_thread_count(threads: usize) {
    rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build_global()
        .ok();
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_parallel_map() {
        let data: Vec<i32> = (0..100).collect();
        let result = parallel_map_chunked(data, 10, |x| x * 2);
        // Output preserves input order regardless of parallel execution.
        assert_eq!(result.len(), 100);
        assert_eq!(result[0], 0);
        assert_eq!(result[50], 100);
        assert_eq!(result[99], 198);
    }
    #[test]
    fn test_pipeline_executor() {
        let pipeline = PipelineExecutor::new(|x: i32| x + 1, |x: i32| x * 2);
        let inputs = vec![1, 2, 3, 4, 5];
        let results = pipeline.execute_batch(inputs);
        // (x + 1) * 2 for each input.
        assert_eq!(results, vec![4, 6, 8, 10, 12]);
    }
    #[test]
    fn test_pipeline3() {
        let pipeline = Pipeline3::new(|x: i32| x + 1, |x: i32| x * 2, |x: i32| x - 1);
        let inputs = vec![1, 2, 3];
        let results = pipeline.execute_batch(inputs);
        // (1+1)*2-1 = 3, (2+1)*2-1 = 5, (3+1)*2-1 = 7
        assert_eq!(results, vec![3, 5, 7]);
    }
    #[tokio::test]
    async fn test_async_executor() {
        let executor = AsyncParallelExecutor::new(2);
        let tasks = vec![1, 2, 3, 4, 5];
        let results = executor
            .execute(tasks, |x| async move {
                tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
                x * 2
            })
            .await;
        // All five tasks complete despite the concurrency cap of 2.
        assert_eq!(results.len(), 5);
        assert!(results.contains(&2));
        assert!(results.contains(&10));
    }
    #[test]
    fn test_optimal_threads() {
        let threads = optimal_thread_count();
        // Rayon's default pool size never exceeds the logical CPU count.
        assert!(threads > 0);
        assert!(threads <= num_cpus::get());
    }
}

View File

@@ -0,0 +1,339 @@
//! Model quantization utilities
//!
//! Provides INT8 quantization for model weights and activations to reduce
//! memory usage and improve inference speed.
use std::f32;
/// Quantization parameters
///
/// Affine i8 quantization: `q = round(x / scale + zero_point)`,
/// reconstructed as `x ≈ (q - zero_point) * scale`.
#[derive(Debug, Clone, Copy)]
pub struct QuantParams {
    pub scale: f32,
    pub zero_point: i8,
}

impl QuantParams {
    /// Calculate quantization parameters from min/max values.
    ///
    /// FIX: guards the degenerate case `max == min` (and non-finite inputs,
    /// e.g. from `from_data` on an empty slice), where the original produced
    /// `scale == 0` and `min / scale` then yielded a NaN/inf zero point.
    pub fn from_range(min: f32, max: f32) -> Self {
        let qmin = i8::MIN as f32;
        let qmax = i8::MAX as f32;
        let mut scale = (max - min) / (qmax - qmin);
        if !(scale.is_finite() && scale > 0.0) {
            // Degenerate/invalid range: fall back to a unit scale so the
            // round-trip stays finite and exact for the single value.
            scale = 1.0;
        }
        // Clamp before casting so the zero point is always a valid i8 and
        // NaN can never reach the cast.
        let zero_point = (qmin - min / scale).round().clamp(qmin, qmax) as i8;
        Self { scale, zero_point }
    }
    /// Calculate from data statistics (min/max over the slice).
    pub fn from_data(data: &[f32]) -> Self {
        let min = data.iter().copied().fold(f32::INFINITY, f32::min);
        let max = data.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        Self::from_range(min, max)
    }
    /// Symmetric quantization (zero_point = 0)
    // NOTE(review): `abs_max == 0` still yields scale 0 here, as in the
    // original — confirm whether symmetric callers can pass all-zero data.
    pub fn symmetric(abs_max: f32) -> Self {
        let scale = abs_max / 127.0;
        Self {
            scale,
            zero_point: 0,
        }
    }
}
/// Quantize f32 weights to i8
///
/// Derives parameters from the data's min/max, then quantizes; returns the
/// quantized values together with the parameters needed to dequantize.
pub fn quantize_weights(weights: &[f32]) -> (Vec<i8>, QuantParams) {
    let params = QuantParams::from_data(weights);
    (quantize_with_params(weights, params), params)
}

/// Quantize with given parameters
pub fn quantize_with_params(weights: &[f32], params: QuantParams) -> Vec<i8> {
    weights
        .iter()
        .copied()
        .map(|w| quantize_value(w, params))
        .collect()
}

/// Quantize single value
#[inline]
pub fn quantize_value(value: f32, params: QuantParams) -> i8 {
    let shifted = value / params.scale + f32::from(params.zero_point);
    shifted.round().clamp(i8::MIN as f32, i8::MAX as f32) as i8
}

/// Dequantize i8 to f32
pub fn dequantize(quantized: &[i8], params: QuantParams) -> Vec<f32> {
    quantized
        .iter()
        .map(|&q| dequantize_value(q, params))
        .collect()
}

/// Dequantize single value
#[inline]
pub fn dequantize_value(quantized: i8, params: QuantParams) -> f32 {
    params.scale * (f32::from(quantized) - f32::from(params.zero_point))
}
/// Quantized tensor representation
pub struct QuantizedTensor {
    pub data: Vec<i8>,
    pub params: QuantParams,
    pub shape: Vec<usize>,
}

impl QuantizedTensor {
    /// Quantize an f32 tensor with asymmetric (min/max-derived) parameters.
    pub fn from_f32(data: &[f32], shape: Vec<usize>) -> Self {
        let (data, params) = quantize_weights(data);
        Self {
            data,
            params,
            shape,
        }
    }

    /// Quantize an f32 tensor symmetrically (zero_point = 0).
    pub fn from_f32_symmetric(data: &[f32], shape: Vec<usize>) -> Self {
        let abs_max = data.iter().fold(0.0f32, |acc, x| acc.max(x.abs()));
        let params = QuantParams::symmetric(abs_max);
        Self {
            data: quantize_with_params(data, params),
            params,
            shape,
        }
    }

    /// Recover the (lossy) f32 representation.
    pub fn to_f32(&self) -> Vec<f32> {
        dequantize(&self.data, self.params)
    }

    /// Total size in bytes: i8 payload plus params plus the shape vector.
    pub fn size_bytes(&self) -> usize {
        let overhead =
            std::mem::size_of::<QuantParams>() + self.shape.len() * std::mem::size_of::<usize>();
        self.data.len() + overhead
    }

    /// How many times smaller this is than the equivalent f32 buffer.
    pub fn compression_ratio(&self) -> f32 {
        let f32_bytes = (self.data.len() * std::mem::size_of::<f32>()) as f32;
        f32_bytes / self.size_bytes() as f32
    }
}
/// Per-channel quantization for conv/linear layers
pub struct PerChannelQuant {
    pub data: Vec<i8>,
    // One QuantParams per output channel (indexed by shape[0]).
    pub params: Vec<QuantParams>,
    pub shape: Vec<usize>,
}

impl PerChannelQuant {
    /// Quantize with per-channel parameters.
    /// For a weight tensor of shape [out_channels, in_channels, ...],
    /// use separate params for each output channel.
    ///
    /// # Panics
    /// Panics if `shape` is empty, if `shape[0]` is zero, or if `data.len()`
    /// is not a multiple of `shape[0]`. (FIX: the latter two previously
    /// surfaced as an opaque divide-by-zero panic, or silently dropped
    /// trailing elements from the last channel.)
    pub fn from_f32(data: &[f32], shape: Vec<usize>) -> Self {
        if shape.is_empty() {
            panic!("Shape cannot be empty");
        }
        let out_channels = shape[0];
        assert!(out_channels > 0, "out_channels must be non-zero");
        assert_eq!(
            data.len() % out_channels,
            0,
            "data length {} is not divisible by out_channels {}",
            data.len(),
            out_channels
        );
        let channel_size = data.len() / out_channels;
        let mut all_quantized = Vec::with_capacity(data.len());
        let mut params = Vec::with_capacity(out_channels);
        for ch in 0..out_channels {
            let start = ch * channel_size;
            let channel_data = &data[start..start + channel_size];
            // Each output channel gets its own scale/zero-point.
            let ch_params = QuantParams::from_data(channel_data);
            all_quantized.extend(quantize_with_params(channel_data, ch_params));
            params.push(ch_params);
        }
        Self {
            data: all_quantized,
            params,
            shape,
        }
    }
    /// Dequantize to f32, channel by channel, in channel order.
    pub fn to_f32(&self) -> Vec<f32> {
        let out_channels = self.shape[0];
        let channel_size = self.data.len() / out_channels;
        let mut result = Vec::with_capacity(self.data.len());
        for ch in 0..out_channels {
            let start = ch * channel_size;
            let channel_data = &self.data[start..start + channel_size];
            result.extend(dequantize(channel_data, self.params[ch]));
        }
        result
    }
}
/// Dynamic quantization - quantize at runtime
///
/// Calibrates the quantization range from the data itself at call time,
/// clipping outliers beyond a configurable percentile.
pub struct DynamicQuantizer {
    /// Calibration percentile (e.g. 99.9); values beyond it are clipped.
    percentile: f32,
}
impl DynamicQuantizer {
    /// Create quantizer with calibration percentile
    /// percentile: clip values beyond this percentile (e.g., 99.9)
    pub fn new(percentile: f32) -> Self {
        Self { percentile }
    }

    /// Quantize with calibration.
    ///
    /// Sorts the data and clips the range to the order statistic at the
    /// configured percentile from the top and the mirrored order
    /// statistic from the bottom, so outliers on either tail cannot
    /// inflate the quantization scale.
    ///
    /// # Panics
    /// Panics if `data` is empty or contains NaN.
    pub fn quantize(&self, data: &[f32]) -> (Vec<i8>, QuantParams) {
        let mut sorted: Vec<f32> = data.to_vec();
        sorted.sort_by(|a, b| a.partial_cmp(b).expect("NaN in quantizer input"));
        let hi_idx =
            ((sorted.len() as f32 * self.percentile / 100.0) as usize).min(sorted.len() - 1);
        // Mirror the upper index onto the lower tail so the same fraction
        // of outliers is clipped at both ends. The lower bound must be an
        // actual low-order statistic; negating a value near the minimum
        // (as before) yields a meaningless bound for all-positive data.
        let lo_idx = sorted.len() - 1 - hi_idx;
        let params = QuantParams::from_range(sorted[lo_idx], sorted[hi_idx]);
        let quantized = quantize_with_params(data, params);
        (quantized, params)
    }
}
/// Calculate quantization error (MSE)
///
/// Mean squared error between `original` and the dequantized
/// reconstruction of `quantized`.
pub fn quantization_error(original: &[f32], quantized: &[i8], params: QuantParams) -> f32 {
    let restored = dequantize(quantized, params);
    let sum_sq = original
        .iter()
        .zip(restored.iter())
        .fold(0.0f32, |acc, (o, d)| acc + (o - d).powi(2));
    sum_sq / original.len() as f32
}
/// Calculate signal-to-quantization-noise ratio (SQNR) in dB
///
/// Ratio of mean signal power to mean quantization-noise power,
/// expressed in decibels (higher is better).
pub fn sqnr(original: &[f32], quantized: &[i8], params: QuantParams) -> f32 {
    let n = original.len() as f32;
    let restored = dequantize(quantized, params);
    let signal_power = original.iter().map(|x| x.powi(2)).sum::<f32>() / n;
    let noise_power = original
        .iter()
        .zip(restored.iter())
        .map(|(o, d)| (o - d).powi(2))
        .sum::<f32>()
        / n;
    10.0 * (signal_power / noise_power).log10()
}
#[cfg(test)]
mod tests {
    use super::*;
    // Round-trip: quantize then dequantize should stay within roughly
    // one quantization step of the original values.
    #[test]
    fn test_quantize_dequantize() {
        let weights = vec![0.0, 0.5, 1.0, -0.5, -1.0];
        let (quantized, params) = quantize_weights(&weights);
        let dequantized = dequantize(&quantized, params);
        // Check approximate equality
        for (orig, deq) in weights.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 0.01, "orig: {}, deq: {}", orig, deq);
        }
    }
    // Symmetric params: zero point must be 0 and scale abs_max / 127.
    #[test]
    fn test_symmetric_quantization() {
        let data = vec![-1.0, -0.5, 0.0, 0.5, 1.0];
        let params = QuantParams::symmetric(1.0);
        assert_eq!(params.zero_point, 0);
        assert!((params.scale - 1.0 / 127.0).abs() < 1e-6);
        let quantized = quantize_with_params(&data, params);
        assert_eq!(quantized[2], 0); // 0.0 should map to 0
    }
    // QuantizedTensor preserves shape/length and round-trips values.
    #[test]
    fn test_quantized_tensor() {
        let data = vec![1.0, 2.0, 3.0, 4.0];
        let tensor = QuantizedTensor::from_f32(&data, vec![2, 2]);
        assert_eq!(tensor.shape, vec![2, 2]);
        assert_eq!(tensor.data.len(), 4);
        let dequantized = tensor.to_f32();
        for (orig, deq) in data.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 0.1);
        }
    }
    // Channels with very different magnitudes each get their own params,
    // so both round-trip within a coarse tolerance.
    #[test]
    fn test_per_channel_quant() {
        // 2 channels, 3 values each
        let data = vec![
            1.0, 2.0, 3.0, // Channel 0
            10.0, 20.0, 30.0, // Channel 1
        ];
        let quant = PerChannelQuant::from_f32(&data, vec![2, 3]);
        assert_eq!(quant.params.len(), 2);
        let dequantized = quant.to_f32();
        for (orig, deq) in data.iter().zip(dequantized.iter()) {
            assert!((orig - deq).abs() < 1.0);
        }
    }
    // MSE should be small and SQNR high for a smooth, small-range input.
    #[test]
    fn test_quantization_error() {
        let original = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let (quantized, params) = quantize_weights(&original);
        let error = quantization_error(&original, &quantized, params);
        assert!(error < 0.1); // Should be small for simple data
        let snr = sqnr(&original, &quantized, params);
        assert!(snr > 30.0); // Should have good SNR
    }
    // For a large tensor the params/shape overhead amortizes away and the
    // ratio approaches 4x (f32 -> i8).
    #[test]
    fn test_compression_ratio() {
        let data: Vec<f32> = (0..1000).map(|i| i as f32 / 1000.0).collect();
        let tensor = QuantizedTensor::from_f32(&data, vec![1000]);
        let ratio = tensor.compression_ratio();
        assert!(ratio > 3.5); // Should be ~4x compression
    }
    // Percentile calibration: an extreme outlier should not dominate the
    // quantization range.
    #[test]
    fn test_dynamic_quantizer() {
        let mut data: Vec<f32> = (0..100).map(|i| i as f32).collect();
        data.push(1000.0); // Outlier
        let quantizer = DynamicQuantizer::new(99.0);
        let (quantized, params) = quantizer.quantize(&data);
        assert_eq!(quantized.len(), 101);
        // The outlier should be clipped
        assert!(params.scale > 0.0);
    }
}

View File

@@ -0,0 +1,597 @@
//! SIMD-accelerated image processing operations
//!
//! Provides optimized implementations for common image operations using
//! AVX2, AVX-512, and ARM NEON intrinsics.
use super::{get_features, simd_enabled};
/// Convert RGBA image to grayscale using optimized SIMD operations
///
/// Runtime dispatcher: picks AVX2, SSE4.2, or NEON based on detected CPU
/// features, falling back to the scalar path when SIMD is disabled or
/// unavailable. `gray.len()` must equal `rgba.len() / 4` (enforced by
/// the scalar fallback's assert).
pub fn simd_grayscale(rgba: &[u8], gray: &mut [u8]) {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_grayscale(rgba, gray);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_grayscale(rgba, gray) }
        } else if features.sse4_2 {
            unsafe { sse_grayscale(rgba, gray) }
        } else {
            scalar_grayscale(rgba, gray)
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if features.neon {
            unsafe { neon_grayscale(rgba, gray) }
        } else {
            scalar_grayscale(rgba, gray)
        }
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        scalar_grayscale(rgba, gray)
    }
}
/// Scalar fallback for grayscale conversion.
///
/// Computes fixed-point ITU-R BT.601 luma (0.299 R + 0.587 G + 0.114 B)
/// per pixel; the alpha channel is ignored. Panics unless
/// `rgba.len() / 4 == gray.len()`.
fn scalar_grayscale(rgba: &[u8], gray: &mut [u8]) {
    assert_eq!(
        rgba.len() / 4,
        gray.len(),
        "RGBA length must be 4x grayscale length"
    );
    for (pixel, out) in rgba.chunks_exact(4).zip(gray.iter_mut()) {
        let (r, g, b) = (pixel[0] as u32, pixel[1] as u32, pixel[2] as u32);
        // Fixed-point weights: 77/256 ≈ 0.299, 150/256 ≈ 0.587, 29/256 ≈ 0.114
        *out = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
    }
}
/// AVX2 grayscale conversion, 8 pixels per iteration.
///
/// NOTE(review): the 256-bit load below is currently unused — the
/// per-pixel math is still scalar, so this path is only a scaffold for a
/// future shuffle-based vectorized implementation.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU and that
/// `rgba.len() >= gray.len() * 4`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_grayscale(rgba: &[u8], gray: &mut [u8]) {
    use std::arch::x86_64::*;
    let len = gray.len();
    let mut i = 0;
    // Process 8 pixels at a time (32 RGBA bytes)
    while i + 8 <= len {
        // Load 32 bytes (8 RGBA pixels)
        let rgba_ptr = rgba.as_ptr().add(i * 4);
        let _pixels = _mm256_loadu_si256(rgba_ptr as *const __m256i);
        // Separate RGBA channels (simplified - actual implementation would use shuffles)
        // For production, use proper channel extraction
        // Store grayscale result
        for j in 0..8 {
            let pixel_idx = (i + j) * 4;
            let r = *rgba.get_unchecked(pixel_idx) as u32;
            let g = *rgba.get_unchecked(pixel_idx + 1) as u32;
            let b = *rgba.get_unchecked(pixel_idx + 2) as u32;
            // Same BT.601 fixed-point weights as the scalar path.
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 8;
    }
    // Handle remaining pixels
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
/// SSE4.2 grayscale conversion, 4 pixels per iteration.
///
/// NOTE(review): no SSE intrinsics are actually used yet — the loop is
/// scalar math unrolled four pixels at a time; the wildcard import is
/// kept (and silenced) for a future vectorized version.
///
/// # Safety
/// Caller must ensure `rgba.len() >= gray.len() * 4`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn sse_grayscale(rgba: &[u8], gray: &mut [u8]) {
    #[allow(unused_imports)]
    use std::arch::x86_64::*;
    let len = gray.len();
    let mut i = 0;
    // Process 4 pixels at a time (16 RGBA bytes)
    while i + 4 <= len {
        for j in 0..4 {
            let pixel_idx = (i + j) * 4;
            let r = *rgba.get_unchecked(pixel_idx) as u32;
            let g = *rgba.get_unchecked(pixel_idx + 1) as u32;
            let b = *rgba.get_unchecked(pixel_idx + 2) as u32;
            // Same BT.601 fixed-point weights as the scalar path.
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 4;
    }
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
/// NEON grayscale conversion (aarch64), 8 pixels per iteration.
///
/// NOTE(review): despite the `std::arch::aarch64` import, no NEON
/// intrinsics are used yet — the loop is scalar math unrolled eight
/// pixels at a time.
///
/// # Safety
/// Caller must ensure `rgba.len() >= gray.len() * 4`.
#[cfg(target_arch = "aarch64")]
unsafe fn neon_grayscale(rgba: &[u8], gray: &mut [u8]) {
    use std::arch::aarch64::*;
    let len = gray.len();
    let mut i = 0;
    // Process 8 pixels at a time
    while i + 8 <= len {
        for j in 0..8 {
            let idx = (i + j) * 4;
            let r = *rgba.get_unchecked(idx) as u32;
            let g = *rgba.get_unchecked(idx + 1) as u32;
            let b = *rgba.get_unchecked(idx + 2) as u32;
            // Same BT.601 fixed-point weights as the scalar path.
            *gray.get_unchecked_mut(i + j) = ((r * 77 + g * 150 + b * 29) >> 8) as u8;
        }
        i += 8;
    }
    scalar_grayscale(&rgba[i * 4..], &mut gray[i..]);
}
/// Apply threshold to grayscale image using SIMD
///
/// Maps each pixel to 255 or 0 depending on its relation to `thresh`
/// (see the scalar/AVX2 implementations for the exact comparison).
/// Dispatches to AVX2 when available, scalar otherwise. `out` must be
/// at least as long as `gray`.
pub fn simd_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_threshold(gray, thresh, out);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_threshold(gray, thresh, out) }
        } else {
            scalar_threshold(gray, thresh, out)
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_threshold(gray, thresh, out)
    }
}
/// Scalar threshold fallback: 255 where the pixel is strictly greater
/// than `thresh`, 0 otherwise.
///
/// The comparison is strict (`>`) to agree with the AVX2 path
/// (`_mm256_cmpgt_epi8` is a strict compare) and with the unit tests,
/// which expect a pixel equal to the threshold to map to 0.
fn scalar_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    for (g, o) in gray.iter().zip(out.iter_mut()) {
        *o = if *g > thresh { 255 } else { 0 };
    }
}
/// AVX2 thresholding, 32 pixels per iteration; tail handled by the
/// scalar fallback. Output is 255 where `gray > thresh`, else 0.
///
/// `_mm256_cmpgt_epi8` performs a *signed* byte compare, but pixel
/// values are unsigned — without correction every pixel >= 128 compares
/// as negative and is wrongly mapped to 0. Flipping the sign bit of
/// both operands (`x ^ 0x80`) makes the signed compare order the values
/// as unsigned.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU and that
/// `out.len() >= gray.len()`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_threshold(gray: &[u8], thresh: u8, out: &mut [u8]) {
    use std::arch::x86_64::*;
    let len = gray.len();
    let mut i = 0;
    let sign_flip = _mm256_set1_epi8(i8::MIN); // 0x80 in every lane
    // Pre-bias the threshold once; each pixel vector is biased per load.
    let thresh_vec = _mm256_xor_si256(_mm256_set1_epi8(thresh as i8), sign_flip);
    // Process 32 bytes at a time
    while i + 32 <= len {
        let gray_vec = _mm256_loadu_si256(gray.as_ptr().add(i) as *const __m256i);
        let biased = _mm256_xor_si256(gray_vec, sign_flip);
        // The compare mask is already 0xFF where gray > thresh and 0x00
        // elsewhere, so it can be stored directly.
        let result = _mm256_cmpgt_epi8(biased, thresh_vec);
        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut __m256i, result);
        i += 32;
    }
    // Handle remaining bytes
    scalar_threshold(&gray[i..], thresh, &mut out[i..]);
}
/// Normalize f32 tensor data using SIMD
///
/// In-place z-score normalization: subtracts the mean and divides by the
/// standard deviation (plus a small epsilon). Dispatches to AVX2 when
/// available, scalar otherwise.
pub fn simd_normalize(data: &mut [f32]) {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_normalize(data);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_normalize(data) }
        } else {
            scalar_normalize(data)
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_normalize(data)
    }
}
/// Scalar z-score normalization: shifts `data` to zero mean and unit
/// variance in place.
fn scalar_normalize(data: &mut [f32]) {
    let n = data.len() as f32;
    let mean = data.iter().copied().sum::<f32>() / n;
    let variance = data.iter().fold(0.0f32, |acc, x| acc + (x - mean).powi(2)) / n;
    // Epsilon keeps the divide finite for constant (zero-variance) input.
    let std_dev = variance.sqrt() + 1e-8;
    for value in data.iter_mut() {
        *value = (*value - mean) / std_dev;
    }
}
/// AVX2 z-score normalization (zero mean, unit variance) in place.
///
/// Three vectorized passes — sum for the mean, sum of squared deviations
/// for the variance, then the normalize itself — each finishing the
/// non-multiple-of-8 tail with scalar code.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_normalize(data: &mut [f32]) {
    use std::arch::x86_64::*;
    // Calculate mean using SIMD
    let len = data.len();
    let mut sum = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        sum = _mm256_add_ps(sum, vals);
        i += 8;
    }
    // Horizontal sum
    let sum_scalar = {
        // SAFETY: __m256 and [f32; 8] have the same size; the transmute
        // only reinterprets the 8 lanes for a scalar horizontal add.
        let sum_arr: [f32; 8] = std::mem::transmute(sum);
        // Add the vector lanes plus the scalar tail beyond index `i`.
        sum_arr.iter().sum::<f32>() + data[i..].iter().sum::<f32>()
    };
    let mean = sum_scalar / len as f32;
    let mean_vec = _mm256_set1_ps(mean);
    // Calculate variance
    let mut var_sum = _mm256_setzero_ps();
    i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        let diff = _mm256_sub_ps(vals, mean_vec);
        let sq = _mm256_mul_ps(diff, diff);
        var_sum = _mm256_add_ps(var_sum, sq);
        i += 8;
    }
    let var_scalar = {
        // SAFETY: same lane reinterpretation as for the mean above.
        let var_arr: [f32; 8] = std::mem::transmute(var_sum);
        var_arr.iter().sum::<f32>() + data[i..].iter().map(|x| (x - mean).powi(2)).sum::<f32>()
    };
    // Epsilon keeps the divide finite for constant (zero-variance) input.
    let std_dev = (var_scalar / len as f32).sqrt() + 1e-8;
    let std_vec = _mm256_set1_ps(std_dev);
    // Normalize
    i = 0;
    while i + 8 <= len {
        let vals = _mm256_loadu_ps(data.as_ptr().add(i));
        let centered = _mm256_sub_ps(vals, mean_vec);
        let normalized = _mm256_div_ps(centered, std_vec);
        _mm256_storeu_ps(data.as_mut_ptr().add(i), normalized);
        i += 8;
    }
    // Handle remaining elements
    for x in &mut data[i..] {
        *x = (*x - mean) / std_dev;
    }
}
/// Fast bilinear resize using SIMD - optimized for preprocessing
/// This is significantly faster than the image crate's resize for typical OCR sizes
///
/// Dispatches to the AVX2 implementation when available, otherwise the
/// scalar fallback. `src` is a single-channel (grayscale) image of
/// `src_width * src_height` bytes; returns a `dst_width * dst_height`
/// buffer.
pub fn simd_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    // Honor the global runtime SIMD kill-switch.
    if !simd_enabled() {
        return scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }
    let features = get_features();
    #[cfg(target_arch = "x86_64")]
    {
        if features.avx2 {
            unsafe { avx2_resize_bilinear(src, src_width, src_height, dst_width, dst_height) }
        } else {
            scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height)
        }
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        scalar_resize_bilinear(src, src_width, src_height, dst_width, dst_height)
    }
}
/// Scalar bilinear resize implementation
///
/// Maps every destination pixel back into the source grid and blends the
/// four surrounding source pixels by their fractional distances.
fn scalar_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;
    // Linear blend: t = 0 yields a, t = 1 yields b.
    let lerp = |a: f32, b: f32, t: f32| a * (1.0 - t) + b * t;
    let mut dst = vec![0u8; dst_width * dst_height];
    for y in 0..dst_height {
        let src_y = y as f32 * y_scale;
        let y0 = (src_y.floor() as usize).min(src_height - 1);
        let y1 = (y0 + 1).min(src_height - 1);
        let y_frac = src_y - src_y.floor();
        for x in 0..dst_width {
            let src_x = x as f32 * x_scale;
            let x0 = (src_x.floor() as usize).min(src_width - 1);
            let x1 = (x0 + 1).min(src_width - 1);
            let x_frac = src_x - src_x.floor();
            // Blend horizontally along the two source rows, then vertically.
            let top = lerp(
                src[y0 * src_width + x0] as f32,
                src[y0 * src_width + x1] as f32,
                x_frac,
            );
            let bottom = lerp(
                src[y1 * src_width + x0] as f32,
                src[y1 * src_width + x1] as f32,
                x_frac,
            );
            dst[y * dst_width + x] = lerp(top, bottom, y_frac).round() as u8;
        }
    }
    dst
}
/// AVX2-path bilinear resize, 8 destination pixels per inner iteration.
///
/// NOTE(review): the `_y_frac` / `_y_frac_inv` vectors are computed but
/// never used — the inner loops still perform scalar per-pixel math, so
/// this is a scaffold for a future fully vectorized gather/blend
/// implementation.
///
/// # Safety
/// Caller must ensure AVX2 is available on the running CPU and that
/// `src.len() >= src_width * src_height`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_resize_bilinear(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    use std::arch::x86_64::*;
    let mut dst = vec![0u8; dst_width * dst_height];
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;
    // Process 8 output pixels at a time for x dimension
    for y in 0..dst_height {
        let src_y = y as f32 * y_scale;
        let y0 = (src_y.floor() as usize).min(src_height - 1);
        let y1 = (y0 + 1).min(src_height - 1);
        let _y_frac = _mm256_set1_ps(src_y - src_y.floor());
        let _y_frac_inv = _mm256_set1_ps(1.0 - (src_y - src_y.floor()));
        let mut x = 0;
        while x + 8 <= dst_width {
            // Calculate source x coordinates for 8 destination pixels
            let src_xs: [f32; 8] = [
                (x) as f32 * x_scale,
                (x + 1) as f32 * x_scale,
                (x + 2) as f32 * x_scale,
                (x + 3) as f32 * x_scale,
                (x + 4) as f32 * x_scale,
                (x + 5) as f32 * x_scale,
                (x + 6) as f32 * x_scale,
                (x + 7) as f32 * x_scale,
            ];
            let mut results = [0u8; 8];
            for i in 0..8 {
                let src_x = src_xs[i];
                let x0 = (src_x.floor() as usize).min(src_width - 1);
                let x1 = (x0 + 1).min(src_width - 1);
                let x_frac = src_x - src_x.floor();
                // Four neighboring source pixels around the sample point.
                let p00 = *src.get_unchecked(y0 * src_width + x0) as f32;
                let p10 = *src.get_unchecked(y0 * src_width + x1) as f32;
                let p01 = *src.get_unchecked(y1 * src_width + x0) as f32;
                let p11 = *src.get_unchecked(y1 * src_width + x1) as f32;
                // Horizontal blend on both rows, then vertical blend.
                let top = p00 * (1.0 - x_frac) + p10 * x_frac;
                let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
                let value =
                    top * (1.0 - (src_y - src_y.floor())) + bottom * (src_y - src_y.floor());
                results[i] = value.round() as u8;
            }
            for i in 0..8 {
                *dst.get_unchecked_mut(y * dst_width + x + i) = results[i];
            }
            x += 8;
        }
        // Handle remaining pixels
        while x < dst_width {
            let src_x = x as f32 * x_scale;
            let x0 = (src_x.floor() as usize).min(src_width - 1);
            let x1 = (x0 + 1).min(src_width - 1);
            let x_frac = src_x - src_x.floor();
            let p00 = *src.get_unchecked(y0 * src_width + x0) as f32;
            let p10 = *src.get_unchecked(y0 * src_width + x1) as f32;
            let p01 = *src.get_unchecked(y1 * src_width + x0) as f32;
            let p11 = *src.get_unchecked(y1 * src_width + x1) as f32;
            let top = p00 * (1.0 - x_frac) + p10 * x_frac;
            let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
            let value = top * (1.0 - (src_y - src_y.floor())) + bottom * (src_y - src_y.floor());
            *dst.get_unchecked_mut(y * dst_width + x) = value.round() as u8;
            x += 1;
        }
    }
    dst
}
/// Parallel SIMD resize for large images - splits work across threads
///
/// Each destination row depends only on `src`, so rows are distributed
/// over rayon's thread pool. Small images skip the pool because the
/// scheduling overhead would dominate.
#[cfg(feature = "rayon")]
pub fn parallel_simd_resize(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    use rayon::prelude::*;
    // For small images, use single-threaded SIMD
    if dst_height < 64 || dst_width * dst_height < 100_000 {
        return simd_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }
    let mut dst = vec![0u8; dst_width * dst_height];
    let x_scale = src_width as f32 / dst_width as f32;
    let y_scale = src_height as f32 / dst_height as f32;
    // Process rows in parallel
    dst.par_chunks_mut(dst_width)
        .enumerate()
        .for_each(|(y, row)| {
            let src_y = y as f32 * y_scale;
            let y0 = (src_y.floor() as usize).min(src_height - 1);
            let y1 = (y0 + 1).min(src_height - 1);
            let y_frac = src_y - src_y.floor();
            for x in 0..dst_width {
                let src_x = x as f32 * x_scale;
                let x0 = (src_x.floor() as usize).min(src_width - 1);
                let x1 = (x0 + 1).min(src_width - 1);
                let x_frac = src_x - src_x.floor();
                // Same bilinear blend as the scalar implementation.
                let p00 = src[y0 * src_width + x0] as f32;
                let p10 = src[y0 * src_width + x1] as f32;
                let p01 = src[y1 * src_width + x0] as f32;
                let p11 = src[y1 * src_width + x1] as f32;
                let top = p00 * (1.0 - x_frac) + p10 * x_frac;
                let bottom = p01 * (1.0 - x_frac) + p11 * x_frac;
                let value = top * (1.0 - y_frac) + bottom * y_frac;
                row[x] = value.round() as u8;
            }
        });
    dst
}
/// Ultra-fast area average downscaling for preprocessing
/// Best for large images being scaled down significantly
///
/// Averages every source pixel that falls inside each destination cell.
/// Upscales (in either dimension) fall back to bilinear, where area
/// averaging does not apply.
pub fn fast_area_resize(
    src: &[u8],
    src_width: usize,
    src_height: usize,
    dst_width: usize,
    dst_height: usize,
) -> Vec<u8> {
    // Only use area averaging for downscaling
    if dst_width >= src_width || dst_height >= src_height {
        return simd_resize_bilinear(src, src_width, src_height, dst_width, dst_height);
    }
    let mut dst = vec![0u8; dst_width * dst_height];
    let x_ratio = src_width as f32 / dst_width as f32;
    let y_ratio = src_height as f32 / dst_height as f32;
    for y in 0..dst_height {
        let y_start = (y as f32 * y_ratio) as usize;
        let y_end = (((y + 1) as f32 * y_ratio) as usize).min(src_height);
        for x in 0..dst_width {
            let x_start = (x as f32 * x_ratio) as usize;
            let x_end = (((x + 1) as f32 * x_ratio) as usize).min(src_width);
            // Calculate area average
            let mut sum: u32 = 0;
            let mut count: u32 = 0;
            for sy in y_start..y_end {
                let row = &src[sy * src_width..sy * src_width + src_width];
                for &p in &row[x_start..x_end] {
                    sum += p as u32;
                    count += 1;
                }
            }
            // Round to nearest instead of truncating: plain `sum / count`
            // biases the output darker by up to one intensity level,
            // while the bilinear path rounds.
            dst[y * dst_width + x] = if count > 0 {
                ((sum + count / 2) / count) as u8
            } else {
                0
            };
        }
    }
    dst
}
#[cfg(test)]
mod tests {
    use super::*;
    // Primary colors and white should land near their BT.601 luma values.
    #[test]
    fn test_grayscale_conversion() {
        let rgba = vec![
            255, 0, 0, 255, // Red
            0, 255, 0, 255, // Green
            0, 0, 255, 255, // Blue
            255, 255, 255, 255, // White
        ];
        let mut gray = vec![0u8; 4];
        simd_grayscale(&rgba, &mut gray);
        // Check approximately correct values
        assert!(gray[0] > 50 && gray[0] < 100); // Red
        assert!(gray[1] > 130 && gray[1] < 160); // Green
        assert!(gray[2] > 20 && gray[2] < 50); // Blue
        assert_eq!(gray[3], 255); // White
    }
    // Strict threshold: a pixel exactly at `thresh` is expected to map
    // to 0, values above it to 255.
    #[test]
    fn test_threshold() {
        let gray = vec![0, 50, 100, 150, 200, 255];
        let mut out = vec![0u8; 6];
        simd_threshold(&gray, 100, &mut out);
        assert_eq!(out, vec![0, 0, 0, 255, 255, 255]);
    }
    // Z-score normalization should center the data at (approximately)
    // zero mean.
    #[test]
    fn test_normalize() {
        let mut data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        simd_normalize(&mut data);
        // After normalization, mean should be ~0 and std dev ~1
        let mean: f32 = data.iter().sum::<f32>() / data.len() as f32;
        assert!(mean.abs() < 1e-6);
    }
    // The dispatched SIMD path must produce byte-identical output to the
    // scalar reference.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_simd_vs_scalar_grayscale() {
        let rgba: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();
        let mut gray_simd = vec![0u8; 256];
        let mut gray_scalar = vec![0u8; 256];
        simd_grayscale(&rgba, &mut gray_simd);
        scalar_grayscale(&rgba, &mut gray_scalar);
        assert_eq!(gray_simd, gray_scalar);
    }
}