Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,145 @@
// Image generation utilities for testing
//
// Provides functions to generate test images with equations
use ab_glyph::{FontRef, PxScale};
use image::{DynamicImage, Rgba, RgbaImage};
use imageproc::drawing::{draw_filled_rect_mut, draw_text_mut};
use imageproc::rect::Rect;
use rand::Rng;
// Embedded font data: the DejaVu Sans TrueType file is compiled into the
// binary via `include_bytes!`, so nothing is read from disk at run time.
const FONT_DATA: &[u8] = include_bytes!("../../assets/fonts/DejaVuSans.ttf");
/// Parses the embedded font bytes into a `FontRef` borrowing `FONT_DATA`.
///
/// # Panics
///
/// Panics with "Error loading embedded font" if the embedded bytes cannot be
/// parsed as a font.
fn get_font() -> FontRef<'static> {
FontRef::try_from_slice(FONT_DATA).expect("Error loading embedded font")
}
/// Renders `equation` as black text on a white 400x100 RGBA canvas.
///
/// The string is handed to `draw_text_mut` as-is, using the embedded font at a
/// 32 px scale with the text origin at pixel (20, 30).
pub fn generate_simple_equation(equation: &str) -> DynamicImage {
    const CANVAS_WIDTH: u32 = 400;
    const CANVAS_HEIGHT: u32 = 100;

    let paper = Rgba([255, 255, 255, 255]);
    let ink = Rgba([0, 0, 0, 255]);

    let mut canvas = RgbaImage::from_pixel(CANVAS_WIDTH, CANVAS_HEIGHT, paper);
    let typeface = get_font();
    draw_text_mut(
        &mut canvas,
        ink,
        20,
        30,
        PxScale::from(32.0),
        &typeface,
        equation,
    );

    DynamicImage::ImageRgba8(canvas)
}
/// Renders a stacked fraction on a white 200x150 RGBA canvas.
///
/// The numerator is drawn at (85, 30), a 60x2 px bar at (70, 65) and the
/// denominator at (80, 75), all in black with the embedded font at 28 px.
/// Positions are fixed; they do not adapt to the number of digits.
pub fn generate_fraction(numerator: i32, denominator: i32) -> DynamicImage {
    let ink = Rgba([0, 0, 0, 255]);
    let glyph_scale = PxScale::from(28.0);
    let typeface = get_font();
    let top_label = numerator.to_string();
    let bottom_label = denominator.to_string();

    let mut canvas = RgbaImage::from_pixel(200, 150, Rgba([255, 255, 255, 255]));

    // Numerator, above the bar.
    draw_text_mut(&mut canvas, ink, 85, 30, glyph_scale, &typeface, &top_label);

    // Fraction bar.
    let bar = Rect::at(70, 65).of_size(60, 2);
    draw_filled_rect_mut(&mut canvas, bar, ink);

    // Denominator, below the bar.
    draw_text_mut(
        &mut canvas,
        ink,
        80,
        75,
        glyph_scale,
        &typeface,
        &bottom_label,
    );

    DynamicImage::ImageRgba8(canvas)
}
/// Renders the text `\int <integrand>` through [`generate_simple_equation`].
pub fn generate_integral(integrand: &str) -> DynamicImage {
    let mut expression = String::from(r"\int ");
    expression.push_str(integrand);
    generate_simple_equation(&expression)
}
/// Generate a symbol image
///
/// Thin wrapper: `symbol` is rendered exactly as `generate_simple_equation`
/// would render it, so the result has the same canvas size and styling.
pub fn generate_symbol(symbol: &str) -> DynamicImage {
generate_simple_equation(symbol)
}
/// Creates an opaque white RGBA image of `width` x `height` pixels.
pub fn generate_blank(width: u32, height: u32) -> DynamicImage {
    let opaque_white = Rgba([255, 255, 255, 255]);
    DynamicImage::ImageRgba8(RgbaImage::from_pixel(width, height, opaque_white))
}
/// Renders a fixed sum-of-squares identity through [`generate_simple_equation`].
pub fn generate_complex_equation() -> DynamicImage {
    const SUM_OF_SQUARES: &str = r"\sum_{i=1}^{n} i^2 = \frac{n(n+1)(2n+1)}{6}";
    generate_simple_equation(SUM_OF_SQUARES)
}
/// Add noise to an image
pub fn add_noise(image: &mut DynamicImage, intensity: f32) {
let mut rng = rand::thread_rng();
let rgba = image.as_mut_rgba8().unwrap();
for pixel in rgba.pixels_mut() {
for channel in 0..3 {
let noise = rng.gen_range(-intensity..intensity) * 255.0;
let new_value = (pixel[channel] as f32 + noise).clamp(0.0, 255.0) as u8;
pixel[channel] = new_value;
}
}
}
/// Add slight variation to an image
pub fn add_slight_variation(image: &mut DynamicImage, amount: f32) {
let mut rng = rand::thread_rng();
let rgba = image.as_mut_rgba8().unwrap();
for pixel in rgba.pixels_mut() {
for channel in 0..3 {
let variation = rng.gen_range(-amount..amount) * 255.0;
let new_value = (pixel[channel] as f32 + variation).clamp(0.0, 255.0) as u8;
pixel[channel] = new_value;
}
}
}
/// Renders the source text of a `rows` x `cols` `bmatrix` through
/// [`generate_simple_equation`].
///
/// Cells are numbered row-major starting at 1. Every cell is followed by a
/// space, cells within a row are separated by `& `, and rows are separated by
/// ` \\ `.
pub fn generate_matrix(rows: usize, cols: usize) -> DynamicImage {
    let body = (0..rows)
        .map(|row| {
            (0..cols)
                .map(|col| format!("{} ", row * cols + col + 1))
                .collect::<Vec<_>>()
                .join("& ")
        })
        .collect::<Vec<_>>()
        .join(r" \\ ");

    let equation = format!(r"\begin{{bmatrix}} {} \end{{bmatrix}}", body);
    generate_simple_equation(&equation)
}

View File

@@ -0,0 +1,230 @@
// LaTeX comparison and manipulation utilities
//
// Provides functions to normalize, compare, and analyze LaTeX strings
use std::collections::HashSet;
/// Produces a canonical form of `latex` for comparison purposes.
///
/// All whitespace characters are dropped and the remainder is lower-cased, so
/// the result is insensitive to both spacing and letter case.
pub fn normalize(latex: &str) -> String {
    let mut compact = String::with_capacity(latex.len());
    for ch in latex.chars() {
        if ch.is_whitespace() {
            continue;
        }
        compact.push(ch);
    }
    compact.to_lowercase()
}
/// Reports whether two LaTeX expressions are equal after [`normalize`].
///
/// Alternative notations (e.g. `x^{2}` vs `x^2`, or `\frac{1}{2}` vs `0.5`)
/// are not recognised; only the normalized strings are compared.
pub fn expressions_match(a: &str, b: &str) -> bool {
    let left = normalize(a);
    let right = normalize(b);
    left == right
}
/// Calculate similarity between two LaTeX strings (0.0 to 1.0)
///
/// Both inputs are normalized first. The score is
/// `1 - levenshtein_distance / longest_length`, where lengths are measured in
/// characters so that they use the same unit as the edit distance (a byte
/// length would overstate the denominator for non-ASCII input). Two strings
/// that normalize to the empty string are considered identical (`1.0`).
pub fn calculate_similarity(a: &str, b: &str) -> f64 {
    let norm_a = normalize(a);
    let norm_b = normalize(b);

    let longest = norm_a.chars().count().max(norm_b.chars().count());
    if longest == 0 {
        return 1.0;
    }

    // Levenshtein distance ratio.
    let distance = levenshtein_distance(&norm_a, &norm_b);
    1.0 - (distance as f64 / longest as f64)
}
/// Computes the character-level Levenshtein (edit) distance between `a` and `b`.
///
/// Insertions, deletions and substitutions each cost 1. Only two rows of the
/// dynamic-programming table are kept at any time.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    if source.is_empty() {
        return target.len();
    }
    if target.is_empty() {
        return source.len();
    }

    // `previous[j]` is the distance between the source prefix handled so far
    // and the first `j` characters of `target`.
    let mut previous: Vec<usize> = (0..=target.len()).collect();
    let mut current = vec![0usize; target.len() + 1];

    for (row, &src_ch) in source.iter().enumerate() {
        current[0] = row + 1;
        for (col, &dst_ch) in target.iter().enumerate() {
            let substitution = previous[col] + usize::from(src_ch != dst_ch);
            let deletion = previous[col + 1] + 1;
            let insertion = current[col] + 1;
            current[col + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut previous, &mut current);
    }

    previous[target.len()]
}
/// Extract LaTeX commands from string
///
/// Returns the set of distinct control words found in `latex`: a backslash
/// followed by one or more alphabetic characters (e.g. `\frac`, `\sqrt`).
///
/// A backslash followed by a non-letter is a control symbol (`\\`, `\{`,
/// `\%`, ...). The escaped character is consumed together with the backslash,
/// so the second backslash of a `\\` line break is not mistaken for the start
/// of a command (`\\alpha` contains no `\alpha` command).
pub fn extract_commands(latex: &str) -> HashSet<String> {
    let mut commands = HashSet::new();
    let mut chars = latex.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            continue;
        }

        let mut command = String::from("\\");
        while let Some(&next_ch) = chars.peek() {
            if !next_ch.is_alphabetic() {
                break;
            }
            command.push(next_ch);
            chars.next();
        }

        if command.len() > 1 {
            commands.insert(command);
        } else {
            // Control symbol: skip the escaped character so it cannot start a
            // new command on the next iteration.
            chars.next();
        }
    }

    commands
}
/// Count LaTeX elements (fractions, superscripts, etc.)
///
/// Every field of the result is the number of non-overlapping occurrences of
/// the corresponding marker in `latex`. Matching is purely textual, so a
/// longer command that merely starts with a marker (for example one beginning
/// with `\int`) is counted as well.
///
/// `matrices` counts every `\begin{matrix}` and `\begin{bmatrix}` opener, so
/// input containing several matrices reports the real number rather than 1.
pub fn count_elements(latex: &str) -> ElementCounts {
    let occurrences = |needle: &str| latex.matches(needle).count();

    ElementCounts {
        fractions: occurrences(r"\frac"),
        integrals: occurrences(r"\int"),
        sums: occurrences(r"\sum"),
        superscripts: occurrences("^"),
        subscripts: occurrences("_"),
        matrices: occurrences(r"\begin{matrix}") + occurrences(r"\begin{bmatrix}"),
    }
}
/// Per-construct occurrence counts produced by [`count_elements`].
#[derive(Debug, Default, Clone, PartialEq)]
pub struct ElementCounts {
    /// Occurrences of `\frac`.
    pub fractions: usize,
    /// Occurrences of `\int`.
    pub integrals: usize,
    /// Occurrences of `\sum`.
    pub sums: usize,
    /// Occurrences of `^`.
    pub superscripts: usize,
    /// Occurrences of `_`.
    pub subscripts: usize,
    /// Occurrences of `\begin{matrix}` or `\begin{bmatrix}`.
    pub matrices: usize,
}
/// Validate LaTeX syntax (basic check)
///
/// Verifies that `{`/`}` and `[`/`]` are balanced and that no closer appears
/// before its opener. Braces and brackets are tracked independently, so
/// interleavings such as `{[}]` are not detected.
///
/// The character following a backslash is skipped: `\{`, `\}`, `\[` and `\]`
/// are control symbols, not grouping delimiters, so input like
/// `\left\{ x \right.` is accepted. `\\` is consumed as a unit, which means a
/// delimiter right after a line break is still counted.
///
/// # Errors
///
/// Returns a message naming the first unmatched closer, or the number of
/// delimiters left open at the end of the input.
pub fn validate_syntax(latex: &str) -> Result<(), String> {
    let mut brace_count = 0;
    let mut bracket_count = 0;

    let mut chars = latex.chars();
    while let Some(ch) = chars.next() {
        match ch {
            '\\' => {
                // Skip the escaped character (or first letter of a command).
                chars.next();
            }
            '{' => brace_count += 1,
            '}' => {
                brace_count -= 1;
                if brace_count < 0 {
                    return Err("Unmatched closing brace".to_string());
                }
            }
            '[' => bracket_count += 1,
            ']' => {
                bracket_count -= 1;
                if bracket_count < 0 {
                    return Err("Unmatched closing bracket".to_string());
                }
            }
            _ => {}
        }
    }

    if brace_count != 0 {
        return Err(format!("Unmatched braces: {} unclosed", brace_count));
    }
    if bracket_count != 0 {
        return Err(format!("Unmatched brackets: {} unclosed", bracket_count));
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    // Whitespace is stripped; already-compact input passes through unchanged.
    #[test]
    fn test_normalize() {
        let cases = [
            ("x + y", "x+y"),
            (" a b ", "ab"),
            (r"\frac{1}{2}", r"\frac{1}{2}"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize(input), *expected);
        }
    }

    // Matching ignores spacing but not differing operators.
    #[test]
    fn test_expressions_match() {
        let equal_pairs = [("x+y", "x + y"), (r"\frac{1}{2}", r"\frac{1}{2}")];
        for (left, right) in equal_pairs.iter() {
            assert!(expressions_match(left, right));
        }
        assert!(!expressions_match("x+y", "x-y"));
    }

    // Identical strings score 1.0; the score drops as more characters differ.
    #[test]
    fn test_calculate_similarity() {
        let identical = calculate_similarity("abc", "abc");
        assert!(identical == 1.0);

        let one_off = calculate_similarity("abc", "abd");
        assert!(one_off > 0.6);

        let disjoint = calculate_similarity("abc", "xyz");
        assert!(disjoint < 0.5);
    }

    // Every control word in the input is reported.
    #[test]
    fn test_extract_commands() {
        let commands = extract_commands(r"\frac{1}{2} + \sqrt{x}");
        for expected in [r"\frac", r"\sqrt"].iter() {
            assert!(commands.contains(*expected));
        }
    }

    // Balanced braces pass; a missing closer or opener fails.
    #[test]
    fn test_validate_syntax() {
        let balanced = validate_syntax(r"\frac{1}{2}");
        assert!(balanced.is_ok());

        let missing_close = validate_syntax(r"\frac{1}{2");
        assert!(missing_close.is_err());

        let missing_open = validate_syntax(r"\frac{1}2}");
        assert!(missing_open.is_err());
    }
}

View File

@@ -0,0 +1,244 @@
// Metric calculation utilities
//
// Provides functions to calculate CER, WER, BLEU, and other quality metrics
/// Computes the Character Error Rate of `hypothesis` against `reference`.
///
/// The rate is the character-level edit distance divided by the number of
/// characters in `reference`. With an empty reference the result is `0.0` when
/// the hypothesis is also empty and `1.0` otherwise.
pub fn calculate_cer(reference: &str, hypothesis: &str) -> f64 {
    let edits = levenshtein_distance(reference, hypothesis);
    match reference.chars().count() {
        0 if hypothesis.is_empty() => 0.0,
        0 => 1.0,
        total_chars => edits as f64 / total_chars as f64,
    }
}
/// Computes the Word Error Rate of `hypothesis` against `reference`.
///
/// Both inputs are split on whitespace. The rate is the word-level edit
/// distance divided by the number of reference words. With no reference words
/// the result is `0.0` when the hypothesis is also empty and `1.0` otherwise.
pub fn calculate_wer(reference: &str, hypothesis: &str) -> f64 {
    let expected: Vec<&str> = reference.split_whitespace().collect();
    let actual: Vec<&str> = hypothesis.split_whitespace().collect();
    let edits = word_levenshtein_distance(&expected, &actual);

    match expected.len() {
        0 if actual.is_empty() => 0.0,
        0 => 1.0,
        total_words => edits as f64 / total_words as f64,
    }
}
/// Calculate BLEU score
///
/// Inputs are tokenised on whitespace. The score is the geometric mean of the
/// n-gram precisions for `n = 1..=max_n`, multiplied by the brevity penalty
/// `exp(1 - ref_len / hyp_len)` when the hypothesis is shorter than the
/// reference, and scaled to a percentage (`0.0..=100.0`).
///
/// No smoothing is applied: the score is `0.0` as soon as any n-gram order has
/// zero precision. An empty hypothesis or `max_n == 0` also yields `0.0`
/// (the latter would otherwise divide zero by zero and return NaN).
pub fn calculate_bleu(reference: &str, hypothesis: &str, max_n: usize) -> f64 {
    let ref_words: Vec<&str> = reference.split_whitespace().collect();
    let hyp_words: Vec<&str> = hypothesis.split_whitespace().collect();

    if hyp_words.is_empty() || max_n == 0 {
        return 0.0;
    }

    // Accumulate log-precisions; their mean is the log of the geometric mean.
    let mut log_precision_sum = 0.0;
    for n in 1..=max_n {
        let precision = calculate_ngram_precision(&ref_words, &hyp_words, n);
        if precision == 0.0 {
            return 0.0; // BLEU is 0 if any n-gram precision is 0
        }
        log_precision_sum += precision.ln();
    }
    let geo_mean = (log_precision_sum / max_n as f64).exp();

    // Brevity penalty: only hypotheses shorter than the reference are penalised.
    let bp = if hyp_words.len() >= ref_words.len() {
        1.0
    } else {
        (1.0 - (ref_words.len() as f64 / hyp_words.len() as f64)).exp()
    };

    bp * geo_mean * 100.0 // Return as percentage
}
/// Calculate precision for n-grams
///
/// Returns the fraction of hypothesis n-grams that also occur in the
/// reference, using clipped counts: each reference n-gram can be matched at
/// most as many times as it appears in the reference, so repeating a correct
/// word in the hypothesis does not inflate the score. Returns `0.0` when the
/// hypothesis has no n-grams of order `n`.
fn calculate_ngram_precision(reference: &[&str], hypothesis: &[&str], n: usize) -> f64 {
    let hyp_ngrams = get_ngrams(hypothesis, n);
    if hyp_ngrams.is_empty() {
        return 0.0;
    }

    // How many times each reference n-gram may still be matched.
    let mut budget: std::collections::HashMap<Vec<String>, usize> =
        std::collections::HashMap::new();
    for ngram in get_ngrams(reference, n) {
        *budget.entry(ngram).or_insert(0) += 1;
    }

    let mut matches = 0usize;
    for ngram in &hyp_ngrams {
        if let Some(left) = budget.get_mut(ngram) {
            if *left > 0 {
                *left -= 1;
                matches += 1;
            }
        }
    }

    matches as f64 / hyp_ngrams.len() as f64
}
/// Get n-grams from a sequence of words
///
/// Returns every contiguous run of `n` words, in order, as owned strings.
/// The result is empty when `words` holds fewer than `n` items.
fn get_ngrams(words: &[&str], n: usize) -> Vec<Vec<String>> {
    let last_start = match words.len().checked_sub(n) {
        Some(start) => start,
        None => return Vec::new(),
    };

    (0..=last_start)
        .map(|start| {
            words[start..start + n]
                .iter()
                .map(|word| word.to_string())
                .collect()
        })
        .collect()
}
/// Computes the Levenshtein (edit) distance between `a` and `b`, comparing
/// them character by character.
///
/// Insertions, deletions and substitutions each cost 1.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let left: Vec<char> = a.chars().collect();
    let right: Vec<char> = b.chars().collect();
    let (rows, cols) = (left.len(), right.len());

    // If either side is empty the distance is the length of the other.
    if rows == 0 || cols == 0 {
        return rows.max(cols);
    }

    // table[i][j] = distance between the first i chars of `a` and first j of `b`.
    let mut table = vec![vec![0usize; cols + 1]; rows + 1];
    for (i, row) in table.iter_mut().enumerate() {
        row[0] = i;
    }
    for (j, cell) in table[0].iter_mut().enumerate() {
        *cell = j;
    }

    for i in 1..=rows {
        for j in 1..=cols {
            let substitution = table[i - 1][j - 1] + usize::from(left[i - 1] != right[j - 1]);
            let deletion = table[i - 1][j] + 1;
            let insertion = table[i][j - 1] + 1;
            table[i][j] = deletion.min(insertion).min(substitution);
        }
    }

    table[rows][cols]
}
/// Computes the Levenshtein (edit) distance between two word sequences.
///
/// Whole words are the unit of comparison; inserting, deleting or replacing a
/// word each costs 1. Only two rows of the dynamic-programming table are kept.
fn word_levenshtein_distance(a: &[&str], b: &[&str]) -> usize {
    if a.is_empty() {
        return b.len();
    }
    if b.is_empty() {
        return a.len();
    }

    // `previous[j]` is the distance between the prefix of `a` handled so far
    // and the first `j` words of `b`.
    let mut previous: Vec<usize> = (0..=b.len()).collect();
    let mut current = vec![0usize; b.len() + 1];

    for (row, word_a) in a.iter().enumerate() {
        current[0] = row + 1;
        for (col, word_b) in b.iter().enumerate() {
            let substitution = previous[col] + usize::from(word_a != word_b);
            let deletion = previous[col + 1] + 1;
            let insertion = current[col] + 1;
            current[col + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut previous, &mut current);
    }

    previous[b.len()]
}
/// Computes precision, `tp / (tp + fp)`.
///
/// Returns `0.0` when there are no positive predictions at all.
pub fn calculate_precision(tp: usize, fp: usize) -> f64 {
    match tp + fp {
        0 => 0.0,
        predicted_positive => tp as f64 / predicted_positive as f64,
    }
}
/// Computes recall, `tp / (tp + fn_count)`.
///
/// Returns `0.0` when there are no actual positives at all.
pub fn calculate_recall(tp: usize, fn_count: usize) -> f64 {
    match tp + fn_count {
        0 => 0.0,
        actual_positive => tp as f64 / actual_positive as f64,
    }
}
/// Computes the F1 score, the harmonic mean of `precision` and `recall`.
///
/// Returns `0.0` when both inputs sum to zero.
pub fn calculate_f1(precision: f64, recall: f64) -> f64 {
    let total = precision + recall;
    if total == 0.0 {
        0.0
    } else {
        2.0 * (precision * recall) / total
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // CER is the edit distance divided by the reference length.
    #[test]
    fn test_cer() {
        assert_eq!(calculate_cer("abc", "abc"), 0.0);
        assert_eq!(calculate_cer("abc", "abd"), 1.0 / 3.0);
        assert_eq!(calculate_cer("abc", ""), 1.0);
    }

    // WER counts whole-word edits relative to the reference word count.
    #[test]
    fn test_wer() {
        assert_eq!(calculate_wer("hello world", "hello world"), 0.0);
        assert_eq!(calculate_wer("hello world", "hello earth"), 0.5);
    }

    #[test]
    fn test_bleu() {
        // A perfect match scores (close to) 100.
        let bleu = calculate_bleu("the cat sat on the mat", "the cat sat on the mat", 4);
        assert!(bleu > 99.0);

        // Partial overlap at every n-gram order gives a score strictly
        // between 0 and 100. The hypothesis must share at least one bigram
        // with the reference, because no smoothing is applied.
        let bleu = calculate_bleu("the cat sat on the mat", "the cat sat on a mat", 2);
        assert!(bleu > 0.0 && bleu < 100.0);

        // No shared bigram: the unsmoothed score collapses to exactly zero.
        let bleu = calculate_bleu("the cat sat", "the dog sat", 2);
        assert_eq!(bleu, 0.0);
    }

    // Precision, recall and F1 agree on a small confusion-matrix example.
    #[test]
    fn test_precision_recall_f1() {
        let precision = calculate_precision(8, 2);
        assert_eq!(precision, 0.8);
        let recall = calculate_recall(8, 1);
        assert!((recall - 8.0 / 9.0).abs() < 0.001);
        let f1 = calculate_f1(precision, recall);
        assert!(f1 > 0.8);
    }
}

View File

@@ -0,0 +1,16 @@
// Common test utilities
//
// Provides shared functionality for integration tests
pub mod images;
pub mod latex;
pub mod metrics;
pub mod server;
pub mod types;
// Re-export commonly used types and functions
pub use images::{generate_fraction, generate_integral, generate_simple_equation, generate_symbol};
pub use latex::{calculate_similarity, expressions_match, normalize};
pub use metrics::{calculate_bleu, calculate_cer, calculate_wer};
pub use server::TestServer;
pub use types::{CacheStats, OutputFormat, ProcessingOptions, ProcessingResult};

View File

@@ -0,0 +1,206 @@
// Test server setup and teardown utilities
//
// Provides a test server instance for integration tests
use super::types::{CacheStats, OutputFormat, ProcessingOptions, ProcessingResult};
use std::sync::Arc;
use tokio::sync::RwLock;
/// Handle to the test server used by integration tests.
///
/// Cloning is cheap: every clone shares the same `TestServerInner` through an
/// `Arc`.
#[derive(Clone)]
pub struct TestServer {
// Shared state behind the handle.
inner: Arc<TestServerInner>,
}
/// State shared by all clones of a `TestServer`.
struct TestServerInner {
// URL built as `http://localhost:<port>` when the server is constructed.
base_url: String,
// Slot for a spawned server process; `with_config` always stores `None`,
// hence the `dead_code` allowance.
#[allow(dead_code)]
process: Option<RwLock<tokio::process::Child>>,
// Configuration the server was created with.
config: TestServerConfig,
}
/// Settings used to construct a `TestServer`.
///
/// NOTE(review): within this file only `port` (base URL) and `cache_size`
/// (reported as `max_size` by `cache_stats`) are read; the remaining fields
/// are stored but not consulted by the mock implementation.
#[derive(Debug, Clone)]
pub struct TestServerConfig {
// Port used to build the base URL.
pub port: u16,
// Whether caching is requested.
pub enable_cache: bool,
// Requested cache capacity; `cache_stats` falls back to 100 when unset.
pub cache_size: Option<usize>,
// Requested cache entry lifetime, in seconds.
pub cache_ttl_seconds: Option<u64>,
// Requested rate limit (unit not established in this file — TODO confirm).
pub rate_limit: Option<u64>,
// Requested timeout, in milliseconds.
pub timeout_ms: Option<u64>,
// Directory for a persistent cache, if any.
pub cache_dir: Option<String>,
}
impl Default for TestServerConfig {
fn default() -> Self {
Self {
port: 18080,
enable_cache: false,
cache_size: None,
cache_ttl_seconds: None,
rate_limit: None,
timeout_ms: None,
cache_dir: None,
}
}
}
impl TestServer {
    /// Creates a server with the default configuration.
    pub async fn start() -> Result<Self, Box<dyn std::error::Error>> {
        let config = TestServerConfig::default();
        Self::with_config(config).await
    }

    /// Creates a server with caching enabled and a cache size of 100.
    pub async fn with_cache() -> Result<Self, Box<dyn std::error::Error>> {
        Self::with_cache_size(100).await
    }

    /// Creates a server with caching enabled and the given cache size.
    pub async fn with_cache_size(size: usize) -> Result<Self, Box<dyn std::error::Error>> {
        let mut config = TestServerConfig::default();
        config.enable_cache = true;
        config.cache_size = Some(size);
        Self::with_config(config).await
    }

    /// Creates a server with caching enabled and the given TTL in seconds.
    pub async fn with_cache_ttl(ttl_seconds: u64) -> Result<Self, Box<dyn std::error::Error>> {
        let mut config = TestServerConfig::default();
        config.enable_cache = true;
        config.cache_ttl_seconds = Some(ttl_seconds);
        Self::with_config(config).await
    }

    /// Creates a server with caching enabled and a cache directory set.
    pub async fn with_persistent_cache(
        cache_dir: &str,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        let mut config = TestServerConfig::default();
        config.enable_cache = true;
        config.cache_dir = Some(cache_dir.to_string());
        Self::with_config(config).await
    }

    /// Creates a server with the given timeout in milliseconds.
    pub async fn with_timeout(timeout_ms: u64) -> Result<Self, Box<dyn std::error::Error>> {
        let mut config = TestServerConfig::default();
        config.timeout_ms = Some(timeout_ms);
        Self::with_config(config).await
    }

    /// Creates an API server; currently identical to the default server.
    pub async fn start_api() -> Result<Self, Box<dyn std::error::Error>> {
        Self::with_config(TestServerConfig::default()).await
    }

    /// Creates an API server with the given rate limit recorded in its config.
    pub async fn start_api_with_rate_limit(limit: u64) -> Result<Self, Box<dyn std::error::Error>> {
        let mut config = TestServerConfig::default();
        config.rate_limit = Some(limit);
        Self::with_config(config).await
    }

    /// Creates a server from an explicit configuration.
    ///
    /// This is mock test infrastructure: no process is spawned and nothing
    /// listens on the port. Real OCR processing requires ONNX models to be
    /// configured.
    pub async fn with_config(config: TestServerConfig) -> Result<Self, Box<dyn std::error::Error>> {
        let base_url = format!("http://localhost:{}", config.port);
        let shared = TestServerInner {
            base_url,
            process: None,
            config,
        };
        let server = TestServer {
            inner: Arc::new(shared),
        };

        // Fixed 100 ms start-up delay.
        let startup_delay = tokio::time::Duration::from_millis(100);
        tokio::time::sleep(startup_delay).await;

        Ok(server)
    }

    /// Returns the base URL, `http://localhost:<port>`.
    pub fn base_url(&self) -> &str {
        self.inner.base_url.as_str()
    }

    /// Processes a single image.
    ///
    /// Note: this is test infrastructure that returns fixed mock data after a
    /// 50 ms delay; both arguments are ignored. Real OCR requires ONNX models
    /// to be configured.
    pub async fn process_image(
        &self,
        _image_path: &str,
        _format: OutputFormat,
    ) -> Result<ProcessingResult, String> {
        let simulated_latency = tokio::time::Duration::from_millis(50);
        tokio::time::sleep(simulated_latency).await;

        let canned = ProcessingResult {
            latex: "x + y".to_string(),
            mathml: Some("<math><mrow><mi>x</mi><mo>+</mo><mi>y</mi></mrow></math>".to_string()),
            html: None,
            ascii: None,
            text: Some("x + y".to_string()),
            confidence: 0.95,
            processing_time_ms: 50,
        };
        Ok(canned)
    }

    /// Processes a single image; the options are currently ignored.
    pub async fn process_image_with_options(
        &self,
        image_path: &str,
        format: OutputFormat,
        _options: ProcessingOptions,
    ) -> Result<ProcessingResult, String> {
        let outcome = self.process_image(image_path, format).await;
        outcome
    }

    /// Processes the images one after another, stopping at the first error.
    pub async fn process_batch(
        &self,
        image_paths: &[&str],
        format: OutputFormat,
    ) -> Result<Vec<ProcessingResult>, String> {
        let mut collected = Vec::with_capacity(image_paths.len());
        for image_path in image_paths {
            let outcome = self.process_image(image_path, format.clone()).await?;
            collected.push(outcome);
        }
        Ok(collected)
    }

    /// Returns cache statistics.
    ///
    /// All counters are zero; `max_size` is the configured cache size, or 100
    /// when none was configured.
    pub async fn cache_stats(&self) -> Result<CacheStats, String> {
        let max_size = self.inner.config.cache_size.unwrap_or(100);
        Ok(CacheStats {
            hits: 0,
            misses: 0,
            evictions: 0,
            current_size: 0,
            max_size,
        })
    }

    /// Invalidates the cache; a no-op that always succeeds.
    pub async fn invalidate_cache(&self) -> Result<(), String> {
        Ok(())
    }

    /// Shuts the server down by consuming the handle.
    pub async fn shutdown(self) {
        // Test infrastructure - no actual server to shut down
    }
}

View File

@@ -0,0 +1,47 @@
// Common types shared across tests
//
// Defines output formats, processing results, and configuration types
/// Output format for OCR processing
///
/// Selects which representation a caller asks for. `All` presumably requests
/// every representation at once — TODO confirm against the real server.
#[derive(Debug, Clone)]
pub enum OutputFormat {
LaTeX,
MathML,
HTML,
ASCII,
All,
}
/// Processing options configuration
///
/// Every flag defaults to `false` through the derived `Default`.
/// NOTE(review): the per-flag notes below are inferred from the field names;
/// no code in view reads these flags — confirm against the real pipeline.
#[derive(Debug, Clone, Default)]
pub struct ProcessingOptions {
// Image clean-up toggles.
pub enable_preprocessing: bool,
pub enable_denoising: bool,
pub enable_deskew: bool,
// Output selection toggles.
pub include_latex: bool,
pub include_mathml: bool,
pub include_ascii: bool,
pub include_text: bool,
}
/// Processing result from OCR
///
/// `latex` is always present; the other representations are optional.
#[derive(Debug, Clone)]
pub struct ProcessingResult {
// Recognised expression as LaTeX source.
pub latex: String,
// Optional alternative representations of the same expression.
pub mathml: Option<String>,
pub html: Option<String>,
pub ascii: Option<String>,
pub text: Option<String>,
// Recognition confidence (presumably in 0.0..=1.0 — TODO confirm).
pub confidence: f32,
// Processing time in milliseconds.
pub processing_time_ms: u64,
}
/// Cache statistics
///
/// Snapshot of cache counters and sizes.
#[derive(Debug, Clone)]
pub struct CacheStats {
// Event counters.
pub hits: u64,
pub misses: u64,
pub evictions: u64,
// Current and maximum cache size.
pub current_size: usize,
pub max_size: usize,
}