Merge commit 'd803bfe2b1fe7f5e219e50ac20d6801a0a58ac75' as 'vendor/ruvector'
This commit is contained in:
298
vendor/ruvector/examples/scipix/src/output/docx.rs
vendored
Normal file
298
vendor/ruvector/examples/scipix/src/output/docx.rs
vendored
Normal file
@@ -0,0 +1,298 @@
|
||||
//! DOCX (Microsoft Word) formatter with Office Math ML support
|
||||
//!
|
||||
//! This is a stub implementation. Full DOCX generation requires:
|
||||
//! - ZIP file creation for .docx format
|
||||
//! - XML generation for document.xml, styles.xml, etc.
|
||||
//! - Office Math ML for equations
|
||||
//! - Image embedding support
|
||||
//!
|
||||
//! Consider using libraries like `docx-rs` for production implementation.
|
||||
|
||||
use super::{LineData, OcrResult};
|
||||
use std::io::Write;
|
||||
|
||||
/// DOCX formatter (stub implementation)
|
||||
#[allow(dead_code)]
|
||||
pub struct DocxFormatter {
|
||||
include_styles: bool,
|
||||
page_size: PageSize,
|
||||
margins: Margins,
|
||||
}
|
||||
|
||||
/// Page dimensions expressed in twips (1/1440 inch).
#[derive(Debug, Clone, Copy)]
pub struct PageSize {
    /// Page width in twips.
    pub width: u32,
    /// Page height in twips.
    pub height: u32,
}

impl PageSize {
    /// US Letter: 8.5 in x 11 in.
    pub fn letter() -> Self {
        let (width, height) = (12240, 15840);
        PageSize { width, height }
    }

    /// ISO A4: 210 mm x 297 mm.
    pub fn a4() -> Self {
        let (width, height) = (11906, 16838);
        PageSize { width, height }
    }
}
|
||||
|
||||
/// Page margins; `normal()` documents its 1440 value as one inch, i.e. the
/// unit is twips (1/1440 inch).
#[derive(Debug, Clone, Copy)]
pub struct Margins {
    /// Top margin.
    pub top: u32,
    /// Right margin.
    pub right: u32,
    /// Bottom margin.
    pub bottom: u32,
    /// Left margin.
    pub left: u32,
}

impl Margins {
    /// One inch (1440 twips) on every side.
    pub fn normal() -> Self {
        let one_inch: u32 = 1440;
        Margins {
            top: one_inch,
            right: one_inch,
            bottom: one_inch,
            left: one_inch,
        }
    }
}
|
||||
|
||||
impl DocxFormatter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
include_styles: true,
|
||||
page_size: PageSize::letter(),
|
||||
margins: Margins::normal(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_page_size(mut self, page_size: PageSize) -> Self {
|
||||
self.page_size = page_size;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_margins(mut self, margins: Margins) -> Self {
|
||||
self.margins = margins;
|
||||
self
|
||||
}
|
||||
|
||||
/// Generate Office Math ML from LaTeX
|
||||
/// This is a simplified placeholder - real implementation needs proper conversion
|
||||
pub fn latex_to_mathml(&self, latex: &str) -> String {
|
||||
// This is a very simplified stub
|
||||
// Real implementation would parse LaTeX and generate proper Office Math ML
|
||||
format!(
|
||||
r#"<m:oMathPara>
|
||||
<m:oMath>
|
||||
<m:r>
|
||||
<m:t>{}</m:t>
|
||||
</m:r>
|
||||
</m:oMath>
|
||||
</m:oMathPara>"#,
|
||||
self.escape_xml(latex)
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate document.xml content
|
||||
pub fn generate_document_xml(&self, lines: &[LineData]) -> String {
|
||||
let mut xml = String::from(
|
||||
r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">
|
||||
<w:body>
|
||||
"#,
|
||||
);
|
||||
|
||||
for line in lines {
|
||||
xml.push_str(&self.format_line(line));
|
||||
}
|
||||
|
||||
xml.push_str(" </w:body>\n</w:document>");
|
||||
xml
|
||||
}
|
||||
|
||||
fn format_line(&self, line: &LineData) -> String {
|
||||
match line.line_type.as_str() {
|
||||
"text" => self.format_paragraph(&line.text),
|
||||
"math" | "equation" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
self.format_math(latex)
|
||||
}
|
||||
"heading" => self.format_heading(&line.text, 1),
|
||||
_ => self.format_paragraph(&line.text),
|
||||
}
|
||||
}
|
||||
|
||||
fn format_paragraph(&self, text: &str) -> String {
|
||||
format!(
|
||||
r#" <w:p>
|
||||
<w:r>
|
||||
<w:t>{}</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
"#,
|
||||
self.escape_xml(text)
|
||||
)
|
||||
}
|
||||
|
||||
fn format_heading(&self, text: &str, level: u32) -> String {
|
||||
format!(
|
||||
r#" <w:p>
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Heading{}"/>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:t>{}</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
"#,
|
||||
level,
|
||||
self.escape_xml(text)
|
||||
)
|
||||
}
|
||||
|
||||
fn format_math(&self, latex: &str) -> String {
|
||||
let mathml = self.latex_to_mathml(latex);
|
||||
format!(
|
||||
r#" <w:p>
|
||||
<w:r>
|
||||
{}
|
||||
</w:r>
|
||||
</w:p>
|
||||
"#,
|
||||
mathml
|
||||
)
|
||||
}
|
||||
|
||||
fn escape_xml(&self, text: &str) -> String {
|
||||
text.replace('&', "&")
|
||||
.replace('<', "<")
|
||||
.replace('>', ">")
|
||||
.replace('"', """)
|
||||
.replace('\'', "'")
|
||||
}
|
||||
|
||||
/// Save DOCX to file (stub - needs ZIP implementation)
|
||||
pub fn save_to_file<W: Write>(
|
||||
&self,
|
||||
_writer: &mut W,
|
||||
_result: &OcrResult,
|
||||
) -> Result<(), String> {
|
||||
Err("DOCX binary format generation not implemented. Use docx-rs library for full implementation.".to_string())
|
||||
}
|
||||
|
||||
/// Generate styles.xml content
|
||||
pub fn generate_styles_xml(&self) -> String {
|
||||
r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:style w:type="paragraph" w:styleId="Normal">
|
||||
<w:name w:val="Normal"/>
|
||||
<w:qFormat/>
|
||||
</w:style>
|
||||
<w:style w:type="paragraph" w:styleId="Heading1">
|
||||
<w:name w:val="Heading 1"/>
|
||||
<w:basedOn w:val="Normal"/>
|
||||
<w:qFormat/>
|
||||
<w:pPr>
|
||||
<w:keepNext/>
|
||||
<w:keepLines/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="32"/>
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
</w:styles>"#
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DocxFormatter {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::output::BoundingBox;

    /// Letter is 12240 twips wide and wider than A4.
    #[test]
    fn test_page_sizes() {
        let letter = PageSize::letter();
        assert_eq!(letter.width, 12240);

        let a4 = PageSize::a4();
        assert!(a4.width < letter.width);
    }

    /// Markup characters must come out as entities, never verbatim.
    #[test]
    fn test_escape_xml() {
        let formatter = DocxFormatter::new();
        let result = formatter.escape_xml("Test <tag> & \"quote\"");

        assert!(result.contains("&lt;"));
        assert!(result.contains("&gt;"));
        assert!(result.contains("&amp;"));
        assert!(result.contains("&quot;"));
        assert!(!result.contains("<tag>"));
    }

    #[test]
    fn test_format_paragraph() {
        let formatter = DocxFormatter::new();
        let result = formatter.format_paragraph("Hello World");

        assert!(result.contains("<w:p>"));
        assert!(result.contains("<w:t>Hello World</w:t>"));
    }

    #[test]
    fn test_format_heading() {
        let formatter = DocxFormatter::new();
        let result = formatter.format_heading("Chapter 1", 1);

        assert!(result.contains("Heading1"));
        assert!(result.contains("Chapter 1"));
    }

    #[test]
    fn test_latex_to_mathml() {
        let formatter = DocxFormatter::new();
        let result = formatter.latex_to_mathml("E = mc^2");

        assert!(result.contains("<m:oMath>"));
        assert!(result.contains("mc^2"));
    }

    #[test]
    fn test_generate_document_xml() {
        let formatter = DocxFormatter::new();
        let lines = vec![LineData {
            line_type: "text".to_string(),
            text: "Hello".to_string(),
            latex: None,
            bbox: BoundingBox::new(0.0, 0.0, 100.0, 20.0),
            confidence: 0.95,
            words: None,
        }];

        let xml = formatter.generate_document_xml(&lines);
        assert!(xml.contains("<?xml"));
        assert!(xml.contains("<w:document"));
        assert!(xml.contains("Hello"));
    }

    #[test]
    fn test_generate_styles_xml() {
        let formatter = DocxFormatter::new();
        let xml = formatter.generate_styles_xml();

        assert!(xml.contains("<w:styles"));
        assert!(xml.contains("Normal"));
        assert!(xml.contains("Heading 1"));
    }
}
|
||||
412
vendor/ruvector/examples/scipix/src/output/formatter.rs
vendored
Normal file
412
vendor/ruvector/examples/scipix/src/output/formatter.rs
vendored
Normal file
@@ -0,0 +1,412 @@
|
||||
//! Multi-format output formatter with batch processing and streaming support
|
||||
|
||||
use super::*;
|
||||
use crate::output::{html, latex, mmd, smiles};
|
||||
use std::io::Write;
|
||||
|
||||
/// Configuration for output formatting
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FormatterConfig {
|
||||
/// Target output formats
|
||||
pub formats: Vec<OutputFormat>,
|
||||
|
||||
/// Enable pretty printing (where applicable)
|
||||
pub pretty: bool,
|
||||
|
||||
/// Include confidence scores in output
|
||||
pub include_confidence: bool,
|
||||
|
||||
/// Include bounding box data
|
||||
pub include_bbox: bool,
|
||||
|
||||
/// Math delimiter style for LaTeX/MMD
|
||||
pub math_delimiters: MathDelimiters,
|
||||
|
||||
/// HTML rendering engine
|
||||
pub html_engine: HtmlEngine,
|
||||
|
||||
/// Enable streaming for large documents
|
||||
pub streaming: bool,
|
||||
}
|
||||
|
||||
impl Default for FormatterConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
formats: vec![OutputFormat::Text],
|
||||
pretty: true,
|
||||
include_confidence: false,
|
||||
include_bbox: false,
|
||||
math_delimiters: MathDelimiters::default(),
|
||||
html_engine: HtmlEngine::MathJax,
|
||||
streaming: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Opening and closing markers for inline and display math.
#[derive(Debug, Clone)]
pub struct MathDelimiters {
    /// Marker that opens inline math.
    pub inline_start: String,
    /// Marker that closes inline math.
    pub inline_end: String,
    /// Marker that opens display math.
    pub display_start: String,
    /// Marker that closes display math.
    pub display_end: String,
}

/// TeX-style defaults: `$...$` inline, `$$...$$` display.
impl Default for MathDelimiters {
    fn default() -> Self {
        let inline = String::from("$");
        let display = String::from("$$");

        MathDelimiters {
            inline_start: inline.clone(),
            inline_end: inline,
            display_start: display.clone(),
            display_end: display,
        }
    }
}
|
||||
|
||||
/// Math rendering engine selected for HTML output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HtmlEngine {
    /// Render math with MathJax.
    MathJax,
    /// Render math with KaTeX.
    KaTeX,
    /// Leave math untouched.
    Raw,
}
|
||||
|
||||
/// Main output formatter
|
||||
pub struct OutputFormatter {
|
||||
config: FormatterConfig,
|
||||
}
|
||||
|
||||
impl OutputFormatter {
|
||||
/// Create a new formatter with default configuration
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
config: FormatterConfig::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a formatter with custom configuration
|
||||
pub fn with_config(config: FormatterConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Format a single OCR result
|
||||
pub fn format_result(&self, result: &OcrResult) -> Result<FormatsData, String> {
|
||||
let mut formats = FormatsData::default();
|
||||
|
||||
for format in &self.config.formats {
|
||||
let output = self.format_single(result, *format)?;
|
||||
self.set_format_output(&mut formats, *format, output);
|
||||
}
|
||||
|
||||
Ok(formats)
|
||||
}
|
||||
|
||||
/// Format multiple results in batch
|
||||
pub fn format_batch(&self, results: &[OcrResult]) -> Result<Vec<FormatsData>, String> {
|
||||
results
|
||||
.iter()
|
||||
.map(|result| self.format_result(result))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Stream format results to a writer
|
||||
pub fn format_stream<W: Write>(
|
||||
&self,
|
||||
results: &[OcrResult],
|
||||
writer: &mut W,
|
||||
format: OutputFormat,
|
||||
) -> Result<(), String> {
|
||||
for (i, result) in results.iter().enumerate() {
|
||||
let output = self.format_single(result, format)?;
|
||||
writer
|
||||
.write_all(output.as_bytes())
|
||||
.map_err(|e| format!("Write error: {}", e))?;
|
||||
|
||||
// Add separator between results
|
||||
if i < results.len() - 1 {
|
||||
writer
|
||||
.write_all(b"\n\n---\n\n")
|
||||
.map_err(|e| format!("Write error: {}", e))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Format a single result to a specific format
|
||||
fn format_single(&self, result: &OcrResult, format: OutputFormat) -> Result<String, String> {
|
||||
match format {
|
||||
OutputFormat::Text => self.format_text(result),
|
||||
OutputFormat::LaTeX => self.format_latex(result, false),
|
||||
OutputFormat::LaTeXStyled => self.format_latex(result, true),
|
||||
OutputFormat::Mmd => self.format_mmd(result),
|
||||
OutputFormat::Html => self.format_html(result),
|
||||
OutputFormat::Smiles => self.format_smiles(result),
|
||||
OutputFormat::Docx => self.format_docx(result),
|
||||
OutputFormat::MathML => self.format_mathml(result),
|
||||
OutputFormat::AsciiMath => self.format_asciimath(result),
|
||||
}
|
||||
}
|
||||
|
||||
fn format_text(&self, result: &OcrResult) -> Result<String, String> {
|
||||
if let Some(text) = &result.formats.text {
|
||||
return Ok(text.clone());
|
||||
}
|
||||
|
||||
// Fallback: extract text from line data
|
||||
if let Some(line_data) = &result.line_data {
|
||||
let text = line_data
|
||||
.iter()
|
||||
.map(|line| line.text.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n");
|
||||
return Ok(text);
|
||||
}
|
||||
|
||||
Err("No text content available".to_string())
|
||||
}
|
||||
|
||||
fn format_latex(&self, result: &OcrResult, styled: bool) -> Result<String, String> {
|
||||
let latex_content = if styled {
|
||||
result
|
||||
.formats
|
||||
.latex_styled
|
||||
.as_ref()
|
||||
.or(result.formats.latex_normal.as_ref())
|
||||
} else {
|
||||
result.formats.latex_normal.as_ref()
|
||||
};
|
||||
|
||||
if let Some(latex) = latex_content {
|
||||
if styled {
|
||||
// Wrap in document with packages
|
||||
Ok(latex::LaTeXFormatter::new()
|
||||
.with_packages(vec![
|
||||
"amsmath".to_string(),
|
||||
"amssymb".to_string(),
|
||||
"graphicx".to_string(),
|
||||
])
|
||||
.format_document(latex))
|
||||
} else {
|
||||
Ok(latex.clone())
|
||||
}
|
||||
} else {
|
||||
Err("No LaTeX content available".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
fn format_mmd(&self, result: &OcrResult) -> Result<String, String> {
|
||||
if let Some(mmd) = &result.formats.mmd {
|
||||
return Ok(mmd.clone());
|
||||
}
|
||||
|
||||
// Generate MMD from line data
|
||||
if let Some(line_data) = &result.line_data {
|
||||
let formatter = mmd::MmdFormatter::with_delimiters(self.config.math_delimiters.clone());
|
||||
return Ok(formatter.format(line_data));
|
||||
}
|
||||
|
||||
Err("No MMD content available".to_string())
|
||||
}
|
||||
|
||||
fn format_html(&self, result: &OcrResult) -> Result<String, String> {
|
||||
if let Some(html) = &result.formats.html {
|
||||
return Ok(html.clone());
|
||||
}
|
||||
|
||||
// Generate HTML with math rendering
|
||||
let content = self.format_text(result)?;
|
||||
let formatter = html::HtmlFormatter::new()
|
||||
.with_engine(self.config.html_engine)
|
||||
.with_styling(self.config.pretty);
|
||||
|
||||
Ok(formatter.format(&content, result.line_data.as_deref()))
|
||||
}
|
||||
|
||||
fn format_smiles(&self, result: &OcrResult) -> Result<String, String> {
|
||||
if let Some(smiles) = &result.formats.smiles {
|
||||
return Ok(smiles.clone());
|
||||
}
|
||||
|
||||
// Generate SMILES if we have chemical structure data
|
||||
let generator = smiles::SmilesGenerator::new();
|
||||
generator.generate_from_result(result)
|
||||
}
|
||||
|
||||
fn format_docx(&self, _result: &OcrResult) -> Result<String, String> {
|
||||
// DOCX requires binary format, return placeholder
|
||||
Err("DOCX format requires binary output - use save_docx() instead".to_string())
|
||||
}
|
||||
|
||||
fn format_mathml(&self, result: &OcrResult) -> Result<String, String> {
|
||||
if let Some(mathml) = &result.formats.mathml {
|
||||
return Ok(mathml.clone());
|
||||
}
|
||||
|
||||
Err("MathML generation not yet implemented".to_string())
|
||||
}
|
||||
|
||||
fn format_asciimath(&self, result: &OcrResult) -> Result<String, String> {
|
||||
if let Some(asciimath) = &result.formats.asciimath {
|
||||
return Ok(asciimath.clone());
|
||||
}
|
||||
|
||||
Err("AsciiMath conversion not yet implemented".to_string())
|
||||
}
|
||||
|
||||
fn set_format_output(&self, formats: &mut FormatsData, format: OutputFormat, output: String) {
|
||||
match format {
|
||||
OutputFormat::Text => formats.text = Some(output),
|
||||
OutputFormat::LaTeX => formats.latex_normal = Some(output),
|
||||
OutputFormat::LaTeXStyled => formats.latex_styled = Some(output),
|
||||
OutputFormat::Mmd => formats.mmd = Some(output),
|
||||
OutputFormat::Html => formats.html = Some(output),
|
||||
OutputFormat::Smiles => formats.smiles = Some(output),
|
||||
OutputFormat::MathML => formats.mathml = Some(output),
|
||||
OutputFormat::AsciiMath => formats.asciimath = Some(output),
|
||||
OutputFormat::Docx => {} // Binary format, handled separately
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for OutputFormatter {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for OutputFormatter configuration
|
||||
pub struct FormatterBuilder {
|
||||
config: FormatterConfig,
|
||||
}
|
||||
|
||||
impl FormatterBuilder {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
config: FormatterConfig::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn formats(mut self, formats: Vec<OutputFormat>) -> Self {
|
||||
self.config.formats = formats;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_format(mut self, format: OutputFormat) -> Self {
|
||||
self.config.formats.push(format);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn pretty(mut self, pretty: bool) -> Self {
|
||||
self.config.pretty = pretty;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn include_confidence(mut self, include: bool) -> Self {
|
||||
self.config.include_confidence = include;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn include_bbox(mut self, include: bool) -> Self {
|
||||
self.config.include_bbox = include;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn math_delimiters(mut self, delimiters: MathDelimiters) -> Self {
|
||||
self.config.math_delimiters = delimiters;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn html_engine(mut self, engine: HtmlEngine) -> Self {
|
||||
self.config.html_engine = engine;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn streaming(mut self, streaming: bool) -> Self {
|
||||
self.config.streaming = streaming;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> OutputFormatter {
|
||||
OutputFormatter::with_config(self.config)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for FormatterBuilder {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a result that carries only plain-text and normal-LaTeX content.
    fn create_test_result() -> OcrResult {
        OcrResult {
            request_id: "test_123".to_string(),
            version: "3.0".to_string(),
            image_width: 800,
            image_height: 600,
            is_printed: true,
            is_handwritten: false,
            auto_rotate_confidence: 0.95,
            auto_rotate_degrees: 0,
            confidence: 0.98,
            confidence_rate: 0.97,
            formats: FormatsData {
                text: Some("E = mc^2".to_string()),
                latex_normal: Some(r"E = mc^2".to_string()),
                ..Default::default()
            },
            line_data: None,
            error: None,
            metadata: HashMap::new(),
        }
    }

    #[test]
    fn test_format_text() {
        let formatter = OutputFormatter::new();
        let result = create_test_result();

        let output = formatter
            .format_single(&result, OutputFormat::Text)
            .unwrap();
        assert_eq!(output, "E = mc^2");
    }

    #[test]
    fn test_format_latex() {
        let formatter = OutputFormatter::new();
        let result = create_test_result();

        let output = formatter
            .format_single(&result, OutputFormat::LaTeX)
            .unwrap();
        assert!(output.contains("mc^2"));
    }

    #[test]
    fn test_builder() {
        // The default configuration already lists `OutputFormat::Text`, and
        // `add_format` appends. Clear the list first so that exactly the two
        // formats added here are counted.
        let formatter = FormatterBuilder::new()
            .formats(Vec::new())
            .add_format(OutputFormat::Text)
            .add_format(OutputFormat::LaTeX)
            .pretty(true)
            .include_confidence(true)
            .build();

        assert_eq!(formatter.config.formats.len(), 2);
        assert!(formatter.config.pretty);
        assert!(formatter.config.include_confidence);
    }

    #[test]
    fn test_batch_format() {
        let formatter = OutputFormatter::new();
        let results = vec![create_test_result(), create_test_result()];

        let outputs = formatter.format_batch(&results).unwrap();
        assert_eq!(outputs.len(), 2);
    }
}
|
||||
396
vendor/ruvector/examples/scipix/src/output/html.rs
vendored
Normal file
396
vendor/ruvector/examples/scipix/src/output/html.rs
vendored
Normal file
@@ -0,0 +1,396 @@
|
||||
//! HTML output formatter with math rendering support
|
||||
|
||||
use super::{HtmlEngine, LineData};
|
||||
|
||||
/// HTML formatter with math rendering
|
||||
pub struct HtmlFormatter {
|
||||
engine: HtmlEngine,
|
||||
css_styling: bool,
|
||||
accessibility: bool,
|
||||
responsive: bool,
|
||||
theme: HtmlTheme,
|
||||
}
|
||||
|
||||
/// Colour theme for generated HTML pages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HtmlTheme {
    /// Light colours.
    Light,
    /// Dark colours.
    Dark,
    /// Follow the viewer's colour-scheme preference.
    Auto,
}
|
||||
|
||||
impl HtmlFormatter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
engine: HtmlEngine::MathJax,
|
||||
css_styling: true,
|
||||
accessibility: true,
|
||||
responsive: true,
|
||||
theme: HtmlTheme::Light,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_engine(mut self, engine: HtmlEngine) -> Self {
|
||||
self.engine = engine;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_styling(mut self, styling: bool) -> Self {
|
||||
self.css_styling = styling;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn accessibility(mut self, enabled: bool) -> Self {
|
||||
self.accessibility = enabled;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn responsive(mut self, enabled: bool) -> Self {
|
||||
self.responsive = enabled;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn theme(mut self, theme: HtmlTheme) -> Self {
|
||||
self.theme = theme;
|
||||
self
|
||||
}
|
||||
|
||||
/// Format content to HTML
|
||||
pub fn format(&self, content: &str, lines: Option<&[LineData]>) -> String {
|
||||
let mut html = String::new();
|
||||
|
||||
// HTML header with math rendering scripts
|
||||
html.push_str(&self.html_header());
|
||||
|
||||
// Body start with theme class
|
||||
html.push_str("<body");
|
||||
if self.css_styling {
|
||||
html.push_str(&format!(r#" class="theme-{:?}""#, self.theme).to_lowercase());
|
||||
}
|
||||
html.push_str(">\n");
|
||||
|
||||
// Main content container
|
||||
html.push_str(r#"<div class="content">"#);
|
||||
html.push_str("\n");
|
||||
|
||||
// Format content
|
||||
if let Some(line_data) = lines {
|
||||
html.push_str(&self.format_lines(line_data));
|
||||
} else {
|
||||
html.push_str(&self.format_text(content));
|
||||
}
|
||||
|
||||
html.push_str("</div>\n");
|
||||
html.push_str("</body>\n</html>");
|
||||
|
||||
html
|
||||
}
|
||||
|
||||
/// Generate HTML header with scripts and styles
|
||||
fn html_header(&self) -> String {
|
||||
let mut header = String::from("<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n");
|
||||
header.push_str(r#" <meta charset="UTF-8">"#);
|
||||
header.push_str("\n");
|
||||
|
||||
if self.responsive {
|
||||
header.push_str(
|
||||
r#" <meta name="viewport" content="width=device-width, initial-scale=1.0">"#,
|
||||
);
|
||||
header.push_str("\n");
|
||||
}
|
||||
|
||||
header.push_str(" <title>Mathematical Content</title>\n");
|
||||
|
||||
// Math rendering scripts
|
||||
match self.engine {
|
||||
HtmlEngine::MathJax => {
|
||||
header.push_str(r#" <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>"#);
|
||||
header.push_str("\n");
|
||||
header.push_str(r#" <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>"#);
|
||||
header.push_str("\n");
|
||||
header.push_str(" <script>\n");
|
||||
header.push_str(" MathJax = {\n");
|
||||
header.push_str(" tex: {\n");
|
||||
header.push_str(r#" inlineMath: [['$', '$'], ['\\(', '\\)']],"#);
|
||||
header.push_str("\n");
|
||||
header.push_str(r#" displayMath: [['$$', '$$'], ['\\[', '\\]']]"#);
|
||||
header.push_str("\n");
|
||||
header.push_str(" }\n");
|
||||
header.push_str(" };\n");
|
||||
header.push_str(" </script>\n");
|
||||
}
|
||||
HtmlEngine::KaTeX => {
|
||||
header.push_str(r#" <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.css">"#);
|
||||
header.push_str("\n");
|
||||
header.push_str(r#" <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.js"></script>"#);
|
||||
header.push_str("\n");
|
||||
header.push_str(r#" <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/contrib/auto-render.min.js" onload="renderMathInElement(document.body);"></script>"#);
|
||||
header.push_str("\n");
|
||||
}
|
||||
HtmlEngine::Raw => {
|
||||
// No math rendering
|
||||
}
|
||||
}
|
||||
|
||||
// CSS styling
|
||||
if self.css_styling {
|
||||
header.push_str(" <style>\n");
|
||||
header.push_str(&self.generate_css());
|
||||
header.push_str(" </style>\n");
|
||||
}
|
||||
|
||||
header.push_str("</head>\n");
|
||||
header
|
||||
}
|
||||
|
||||
/// Generate CSS styles
|
||||
fn generate_css(&self) -> String {
|
||||
let mut css = String::new();
|
||||
|
||||
css.push_str(" body {\n");
|
||||
css.push_str(" font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n");
|
||||
css.push_str(" line-height: 1.6;\n");
|
||||
css.push_str(" max-width: 800px;\n");
|
||||
css.push_str(" margin: 0 auto;\n");
|
||||
css.push_str(" padding: 20px;\n");
|
||||
css.push_str(" }\n");
|
||||
|
||||
// Theme colors
|
||||
match self.theme {
|
||||
HtmlTheme::Light => {
|
||||
css.push_str(" body.theme-light {\n");
|
||||
css.push_str(" background-color: #ffffff;\n");
|
||||
css.push_str(" color: #333333;\n");
|
||||
css.push_str(" }\n");
|
||||
}
|
||||
HtmlTheme::Dark => {
|
||||
css.push_str(" body.theme-dark {\n");
|
||||
css.push_str(" background-color: #1e1e1e;\n");
|
||||
css.push_str(" color: #d4d4d4;\n");
|
||||
css.push_str(" }\n");
|
||||
}
|
||||
HtmlTheme::Auto => {
|
||||
css.push_str(" @media (prefers-color-scheme: dark) {\n");
|
||||
css.push_str(" body { background-color: #1e1e1e; color: #d4d4d4; }\n");
|
||||
css.push_str(" }\n");
|
||||
}
|
||||
}
|
||||
|
||||
css.push_str(" .content { padding: 20px; }\n");
|
||||
css.push_str(" .math-display { text-align: center; margin: 20px 0; }\n");
|
||||
css.push_str(" .math-inline { display: inline; }\n");
|
||||
css.push_str(" .equation-block { margin: 15px 0; padding: 10px; background: #f5f5f5; border-radius: 4px; }\n");
|
||||
css.push_str(" table { border-collapse: collapse; width: 100%; margin: 20px 0; }\n");
|
||||
css.push_str(
|
||||
" th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }\n",
|
||||
);
|
||||
css.push_str(" th { background-color: #f2f2f2; }\n");
|
||||
|
||||
if self.accessibility {
|
||||
css.push_str(" .sr-only { position: absolute; width: 1px; height: 1px; padding: 0; margin: -1px; overflow: hidden; clip: rect(0,0,0,0); border: 0; }\n");
|
||||
}
|
||||
|
||||
css
|
||||
}
|
||||
|
||||
/// Format plain text to HTML
|
||||
fn format_text(&self, text: &str) -> String {
|
||||
let escaped = self.escape_html(text);
|
||||
|
||||
// Convert math delimiters if present
|
||||
let mut html = escaped;
|
||||
|
||||
// Display math $$...$$
|
||||
html = html.replace("$$", "<div class=\"math-display\">$$");
|
||||
html = html.replace("$$", "$$</div>");
|
||||
|
||||
// Inline math $...$
|
||||
// This is simplistic - a real implementation would need proper parsing
|
||||
|
||||
format!("<p>{}</p>", html)
|
||||
}
|
||||
|
||||
/// Format line data to HTML
|
||||
fn format_lines(&self, lines: &[LineData]) -> String {
|
||||
let mut html = String::new();
|
||||
|
||||
for line in lines {
|
||||
match line.line_type.as_str() {
|
||||
"text" => {
|
||||
html.push_str("<p>");
|
||||
html.push_str(&self.escape_html(&line.text));
|
||||
html.push_str("</p>\n");
|
||||
}
|
||||
"math" | "equation" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
html.push_str(r#"<div class="math-display">"#);
|
||||
if self.accessibility {
|
||||
html.push_str(&format!(
|
||||
r#"<span class="sr-only">Equation: {}</span>"#,
|
||||
self.escape_html(&line.text)
|
||||
));
|
||||
}
|
||||
html.push_str(&format!("$${}$$", latex));
|
||||
html.push_str("</div>\n");
|
||||
}
|
||||
"inline_math" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
html.push_str(&format!(r#"<span class="math-inline">${}$</span>"#, latex));
|
||||
}
|
||||
"heading" => {
|
||||
html.push_str(&format!("<h2>{}</h2>\n", self.escape_html(&line.text)));
|
||||
}
|
||||
"table" => {
|
||||
html.push_str(&self.format_table(&line.text));
|
||||
}
|
||||
"image" => {
|
||||
html.push_str(&format!(
|
||||
r#"<img src="{}" alt="Image" loading="lazy">"#,
|
||||
self.escape_html(&line.text)
|
||||
));
|
||||
html.push_str("\n");
|
||||
}
|
||||
_ => {
|
||||
html.push_str("<p>");
|
||||
html.push_str(&self.escape_html(&line.text));
|
||||
html.push_str("</p>\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
html
|
||||
}
|
||||
|
||||
/// Format table to HTML
|
||||
fn format_table(&self, table: &str) -> String {
|
||||
let mut html = String::from("<table>\n");
|
||||
|
||||
let rows: Vec<&str> = table.lines().collect();
|
||||
for (i, row) in rows.iter().enumerate() {
|
||||
html.push_str(" <tr>\n");
|
||||
|
||||
let cells: Vec<&str> = row
|
||||
.split('|')
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
|
||||
let tag = if i == 0 { "th" } else { "td" };
|
||||
|
||||
for cell in cells {
|
||||
html.push_str(&format!(
|
||||
" <{}>{}</{}>\n",
|
||||
tag,
|
||||
self.escape_html(cell),
|
||||
tag
|
||||
));
|
||||
}
|
||||
|
||||
html.push_str(" </tr>\n");
|
||||
}
|
||||
|
||||
html.push_str("</table>\n");
|
||||
html
|
||||
}
|
||||
|
||||
/// Escape HTML special characters
|
||||
fn escape_html(&self, text: &str) -> String {
|
||||
text.replace('&', "&")
|
||||
.replace('<', "<")
|
||||
.replace('>', ">")
|
||||
.replace('"', """)
|
||||
.replace('\'', "'")
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for HtmlFormatter {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::output::BoundingBox;

    #[test]
    fn test_html_header() {
        let formatter = HtmlFormatter::new().with_engine(HtmlEngine::MathJax);
        let header = formatter.html_header();

        assert!(header.contains("<!DOCTYPE html>"));
        assert!(header.contains("MathJax"));
    }

    #[test]
    fn test_katex_header() {
        let formatter = HtmlFormatter::new().with_engine(HtmlEngine::KaTeX);
        let header = formatter.html_header();

        assert!(header.contains("katex"));
    }

    /// Markup characters must come out as entities, never as a live tag.
    #[test]
    fn test_escape_html() {
        let formatter = HtmlFormatter::new();
        let result = formatter.escape_html("<script>alert('test')</script>");

        assert!(result.contains("&lt;"));
        assert!(result.contains("&gt;"));
        assert!(!result.contains("<script>"));
    }

    #[test]
    fn test_format_lines() {
        let formatter = HtmlFormatter::new();
        let lines = vec![
            LineData {
                line_type: "text".to_string(),
                text: "Introduction".to_string(),
                latex: None,
                bbox: BoundingBox::new(0.0, 0.0, 100.0, 20.0),
                confidence: 0.95,
                words: None,
            },
            LineData {
                line_type: "equation".to_string(),
                text: "E = mc^2".to_string(),
                latex: Some(r"E = mc^2".to_string()),
                bbox: BoundingBox::new(0.0, 25.0, 100.0, 30.0),
                confidence: 0.98,
                words: None,
            },
        ];

        let result = formatter.format_lines(&lines);
        assert!(result.contains("<p>Introduction</p>"));
        assert!(result.contains("math-display"));
        assert!(result.contains("$$"));
    }

    #[test]
    fn test_dark_theme() {
        let formatter = HtmlFormatter::new().theme(HtmlTheme::Dark);
        let css = formatter.generate_css();

        assert!(css.contains("theme-dark"));
        assert!(css.contains("#1e1e1e"));
    }

    #[test]
    fn test_accessibility() {
        let formatter = HtmlFormatter::new().accessibility(true);
        let lines = vec![LineData {
            line_type: "equation".to_string(),
            text: "x squared".to_string(),
            latex: Some("x^2".to_string()),
            bbox: BoundingBox::new(0.0, 0.0, 100.0, 20.0),
            confidence: 0.98,
            words: None,
        }];

        let result = formatter.format_lines(&lines);
        assert!(result.contains("sr-only"));
        assert!(result.contains("Equation:"));
    }
}
|
||||
354
vendor/ruvector/examples/scipix/src/output/json.rs
vendored
Normal file
354
vendor/ruvector/examples/scipix/src/output/json.rs
vendored
Normal file
@@ -0,0 +1,354 @@
|
||||
//! JSON API response formatter matching Scipix API specification
|
||||
|
||||
use super::{FormatsData, LineData, OcrResult};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Complete API response matching Scipix format
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ApiResponse {
|
||||
/// Request identifier
|
||||
pub request_id: String,
|
||||
|
||||
/// API version
|
||||
pub version: String,
|
||||
|
||||
/// Image information
|
||||
pub image_width: u32,
|
||||
pub image_height: u32,
|
||||
|
||||
/// Detection metadata
|
||||
pub is_printed: bool,
|
||||
pub is_handwritten: bool,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub auto_rotate_confidence: Option<f32>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub auto_rotate_degrees: Option<i32>,
|
||||
|
||||
/// Confidence metrics
|
||||
pub confidence: f32,
|
||||
pub confidence_rate: f32,
|
||||
|
||||
/// Available output formats
|
||||
#[serde(flatten)]
|
||||
pub formats: FormatsData,
|
||||
|
||||
/// Detailed line data
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub line_data: Option<Vec<LineData>>,
|
||||
|
||||
/// Error information
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error_info: Option<ErrorInfo>,
|
||||
|
||||
/// Processing metadata
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub metadata: Option<HashMap<String, Value>>,
|
||||
}
|
||||
|
||||
/// Error information structure
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ErrorInfo {
|
||||
pub code: String,
|
||||
pub message: String,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub details: Option<Value>,
|
||||
}
|
||||
|
||||
impl ApiResponse {
|
||||
/// Create response from OCR result
|
||||
pub fn from_ocr_result(result: OcrResult) -> Self {
|
||||
Self {
|
||||
request_id: result.request_id,
|
||||
version: result.version,
|
||||
image_width: result.image_width,
|
||||
image_height: result.image_height,
|
||||
is_printed: result.is_printed,
|
||||
is_handwritten: result.is_handwritten,
|
||||
auto_rotate_confidence: Some(result.auto_rotate_confidence),
|
||||
auto_rotate_degrees: Some(result.auto_rotate_degrees),
|
||||
confidence: result.confidence,
|
||||
confidence_rate: result.confidence_rate,
|
||||
formats: result.formats,
|
||||
line_data: result.line_data,
|
||||
error: result.error,
|
||||
error_info: None,
|
||||
metadata: if result.metadata.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(result.metadata)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Create error response
|
||||
pub fn error(request_id: String, code: &str, message: &str) -> Self {
|
||||
Self {
|
||||
request_id,
|
||||
version: "3.0".to_string(),
|
||||
image_width: 0,
|
||||
image_height: 0,
|
||||
is_printed: false,
|
||||
is_handwritten: false,
|
||||
auto_rotate_confidence: None,
|
||||
auto_rotate_degrees: None,
|
||||
confidence: 0.0,
|
||||
confidence_rate: 0.0,
|
||||
formats: FormatsData::default(),
|
||||
line_data: None,
|
||||
error: Some(message.to_string()),
|
||||
error_info: Some(ErrorInfo {
|
||||
code: code.to_string(),
|
||||
message: message.to_string(),
|
||||
details: None,
|
||||
}),
|
||||
metadata: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert to JSON string
|
||||
pub fn to_json(&self) -> Result<String, String> {
|
||||
serde_json::to_string(self).map_err(|e| format!("JSON serialization error: {}", e))
|
||||
}
|
||||
|
||||
/// Convert to pretty JSON string
|
||||
pub fn to_json_pretty(&self) -> Result<String, String> {
|
||||
serde_json::to_string_pretty(self).map_err(|e| format!("JSON serialization error: {}", e))
|
||||
}
|
||||
|
||||
/// Parse from JSON string
|
||||
pub fn from_json(json: &str) -> Result<Self, String> {
|
||||
serde_json::from_str(json).map_err(|e| format!("JSON parsing error: {}", e))
|
||||
}
|
||||
}
|
||||
|
||||
/// Batch API response
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BatchApiResponse {
|
||||
pub batch_id: String,
|
||||
pub total: usize,
|
||||
pub completed: usize,
|
||||
pub results: Vec<ApiResponse>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub errors: Option<Vec<BatchError>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct BatchError {
|
||||
pub index: usize,
|
||||
pub error: ErrorInfo,
|
||||
}
|
||||
|
||||
impl BatchApiResponse {
|
||||
pub fn new(batch_id: String, results: Vec<ApiResponse>) -> Self {
|
||||
let total = results.len();
|
||||
let completed = results.iter().filter(|r| r.error.is_none()).count();
|
||||
|
||||
let errors: Vec<BatchError> = results
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, r)| {
|
||||
r.error_info.as_ref().map(|e| BatchError {
|
||||
index: i,
|
||||
error: e.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
batch_id,
|
||||
total,
|
||||
completed,
|
||||
results,
|
||||
errors: if errors.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(errors)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_json(&self) -> Result<String, String> {
|
||||
serde_json::to_string(self).map_err(|e| format!("JSON serialization error: {}", e))
|
||||
}
|
||||
|
||||
pub fn to_json_pretty(&self) -> Result<String, String> {
|
||||
serde_json::to_string_pretty(self).map_err(|e| format!("JSON serialization error: {}", e))
|
||||
}
|
||||
}
|
||||
|
||||
/// API request format
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ApiRequest {
|
||||
/// Image source (URL or base64)
|
||||
pub src: String,
|
||||
|
||||
/// Requested output formats
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub formats: Option<Vec<String>>,
|
||||
|
||||
/// OCR options
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub ocr: Option<OcrOptions>,
|
||||
|
||||
/// Additional metadata
|
||||
#[serde(flatten)]
|
||||
pub metadata: HashMap<String, Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OcrOptions {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub math_inline_delimiters: Option<Vec<String>>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub math_display_delimiters: Option<Vec<String>>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub rm_spaces: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub rm_fonts: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub numbers_default_to_math: Option<bool>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Successful OCR result shared by the tests below.
    fn create_test_result() -> OcrResult {
        let formats = FormatsData {
            text: Some("E = mc^2".to_string()),
            latex_normal: Some(r"E = mc^2".to_string()),
            ..Default::default()
        };

        OcrResult {
            request_id: "test_123".to_string(),
            version: "3.0".to_string(),
            image_width: 800,
            image_height: 600,
            is_printed: true,
            is_handwritten: false,
            auto_rotate_confidence: 0.95,
            auto_rotate_degrees: 0,
            confidence: 0.98,
            confidence_rate: 0.97,
            formats,
            line_data: None,
            error: None,
            metadata: HashMap::new(),
        }
    }

    /// Fields of the OCR result are carried over into the response.
    #[test]
    fn test_api_response_from_result() {
        let response = ApiResponse::from_ocr_result(create_test_result());

        assert_eq!(response.request_id, "test_123");
        assert_eq!(response.version, "3.0");
        assert_eq!(response.confidence, 0.98);
        assert!(response.formats.text.is_some());
    }

    /// The compact JSON contains the expected keys and values.
    #[test]
    fn test_api_response_to_json() {
        let json = ApiResponse::from_ocr_result(create_test_result())
            .to_json()
            .unwrap();

        assert!(json.contains("request_id"));
        assert!(json.contains("test_123"));
        assert!(json.contains("confidence"));
    }

    /// Serializing and parsing again preserves the identifying fields.
    #[test]
    fn test_api_response_round_trip() {
        let response = ApiResponse::from_ocr_result(create_test_result());
        let json = response.to_json().unwrap();
        let parsed = ApiResponse::from_json(&json).unwrap();

        assert_eq!(response.request_id, parsed.request_id);
        assert_eq!(response.confidence, parsed.confidence);
    }

    /// An error response sets both the message and the structured info.
    #[test]
    fn test_error_response() {
        let response = ApiResponse::error(
            "test_456".to_string(),
            "invalid_image",
            "Image format not supported",
        );

        assert_eq!(response.request_id, "test_456");
        assert!(response.error.is_some());
        assert!(response.error_info.is_some());

        let error_info = response.error_info.unwrap();
        assert_eq!(error_info.code, "invalid_image");
    }

    /// A batch of successes reports everything completed and no errors.
    #[test]
    fn test_batch_response() {
        let responses: Vec<ApiResponse> = (0..2)
            .map(|_| ApiResponse::from_ocr_result(create_test_result()))
            .collect();

        let batch = BatchApiResponse::new("batch_789".to_string(), responses);

        assert_eq!(batch.batch_id, "batch_789");
        assert_eq!(batch.total, 2);
        assert_eq!(batch.completed, 2);
        assert!(batch.errors.is_none());
    }

    /// A failing item lowers `completed` and shows up in `errors`.
    #[test]
    fn test_batch_with_errors() {
        let responses = vec![
            ApiResponse::from_ocr_result(create_test_result()),
            ApiResponse::error("fail_1".to_string(), "timeout", "Processing timeout"),
        ];

        let batch = BatchApiResponse::new("batch_error".to_string(), responses);

        assert_eq!(batch.total, 2);
        assert_eq!(batch.completed, 1);
        assert!(batch.errors.is_some());
        assert_eq!(batch.errors.unwrap().len(), 1);
    }

    /// A fully populated request serializes its `src` and `formats` keys.
    #[test]
    fn test_api_request() {
        let ocr = OcrOptions {
            math_inline_delimiters: Some(vec!["$".to_string(), "$".to_string()]),
            math_display_delimiters: Some(vec!["$$".to_string(), "$$".to_string()]),
            rm_spaces: Some(true),
            rm_fonts: None,
            numbers_default_to_math: Some(false),
        };
        let request = ApiRequest {
            src: "https://example.com/image.png".to_string(),
            formats: Some(vec!["text".to_string(), "latex_styled".to_string()]),
            ocr: Some(ocr),
            metadata: HashMap::new(),
        };

        let json = serde_json::to_string(&request).unwrap();

        assert!(json.contains("src"));
        assert!(json.contains("formats"));
    }
}
|
||||
430
vendor/ruvector/examples/scipix/src/output/latex.rs
vendored
Normal file
430
vendor/ruvector/examples/scipix/src/output/latex.rs
vendored
Normal file
@@ -0,0 +1,430 @@
|
||||
//! LaTeX output formatter with styling and package management
|
||||
|
||||
use super::LineData;
|
||||
|
||||
/// LaTeX document formatter
|
||||
#[derive(Clone)]
|
||||
pub struct LaTeXFormatter {
|
||||
packages: Vec<String>,
|
||||
document_class: String,
|
||||
preamble: String,
|
||||
numbered_equations: bool,
|
||||
custom_delimiters: Option<(String, String)>,
|
||||
}
|
||||
|
||||
impl LaTeXFormatter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
packages: vec!["amsmath".to_string(), "amssymb".to_string()],
|
||||
document_class: "article".to_string(),
|
||||
preamble: String::new(),
|
||||
numbered_equations: false,
|
||||
custom_delimiters: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_packages(mut self, packages: Vec<String>) -> Self {
|
||||
self.packages = packages;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_package(mut self, package: String) -> Self {
|
||||
if !self.packages.contains(&package) {
|
||||
self.packages.push(package);
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
pub fn document_class(mut self, class: String) -> Self {
|
||||
self.document_class = class;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn preamble(mut self, preamble: String) -> Self {
|
||||
self.preamble = preamble;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn numbered_equations(mut self, numbered: bool) -> Self {
|
||||
self.numbered_equations = numbered;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn custom_delimiters(mut self, start: String, end: String) -> Self {
|
||||
self.custom_delimiters = Some((start, end));
|
||||
self
|
||||
}
|
||||
|
||||
/// Format plain LaTeX content
|
||||
pub fn format(&self, latex: &str) -> String {
|
||||
// Clean up LaTeX if needed
|
||||
let cleaned = self.clean_latex(latex);
|
||||
|
||||
// Apply custom delimiters if specified
|
||||
if let Some((start, end)) = &self.custom_delimiters {
|
||||
format!("{}{}{}", start, cleaned, end)
|
||||
} else {
|
||||
cleaned
|
||||
}
|
||||
}
|
||||
|
||||
/// Format line data to LaTeX
|
||||
pub fn format_lines(&self, lines: &[LineData]) -> String {
|
||||
let mut output = String::new();
|
||||
let mut in_align = false;
|
||||
|
||||
for line in lines {
|
||||
match line.line_type.as_str() {
|
||||
"text" => {
|
||||
if in_align {
|
||||
output.push_str("\\end{align*}\n\n");
|
||||
in_align = false;
|
||||
}
|
||||
output.push_str(&self.escape_text(&line.text));
|
||||
output.push_str("\n\n");
|
||||
}
|
||||
"math" | "equation" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
|
||||
if self.numbered_equations {
|
||||
output.push_str("\\begin{equation}\n");
|
||||
output.push_str(latex.trim());
|
||||
output.push_str("\n\\end{equation}\n\n");
|
||||
} else {
|
||||
output.push_str("\\[\n");
|
||||
output.push_str(latex.trim());
|
||||
output.push_str("\n\\]\n\n");
|
||||
}
|
||||
}
|
||||
"inline_math" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
output.push_str(&format!("${}$", latex.trim()));
|
||||
}
|
||||
"align" => {
|
||||
if !in_align {
|
||||
output.push_str("\\begin{align*}\n");
|
||||
in_align = true;
|
||||
}
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
output.push_str(latex.trim());
|
||||
output.push_str(" \\\\\n");
|
||||
}
|
||||
"table" => {
|
||||
output.push_str(&self.format_table(&line.text));
|
||||
output.push_str("\n\n");
|
||||
}
|
||||
_ => {
|
||||
output.push_str(&line.text);
|
||||
output.push_str("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if in_align {
|
||||
output.push_str("\\end{align*}\n");
|
||||
}
|
||||
|
||||
output.trim().to_string()
|
||||
}
|
||||
|
||||
/// Format complete LaTeX document
|
||||
pub fn format_document(&self, content: &str) -> String {
|
||||
let mut doc = String::new();
|
||||
|
||||
// Document class
|
||||
doc.push_str(&format!("\\documentclass{{{}}}\n\n", self.document_class));
|
||||
|
||||
// Packages
|
||||
for package in &self.packages {
|
||||
doc.push_str(&format!("\\usepackage{{{}}}\n", package));
|
||||
}
|
||||
doc.push_str("\n");
|
||||
|
||||
// Custom preamble
|
||||
if !self.preamble.is_empty() {
|
||||
doc.push_str(&self.preamble);
|
||||
doc.push_str("\n\n");
|
||||
}
|
||||
|
||||
// Begin document
|
||||
doc.push_str("\\begin{document}\n\n");
|
||||
|
||||
// Content
|
||||
doc.push_str(content);
|
||||
doc.push_str("\n\n");
|
||||
|
||||
// End document
|
||||
doc.push_str("\\end{document}\n");
|
||||
|
||||
doc
|
||||
}
|
||||
|
||||
/// Clean and normalize LaTeX
|
||||
fn clean_latex(&self, latex: &str) -> String {
|
||||
let mut cleaned = latex.to_string();
|
||||
|
||||
// Remove excessive whitespace
|
||||
while cleaned.contains(" ") {
|
||||
cleaned = cleaned.replace(" ", " ");
|
||||
}
|
||||
|
||||
// Normalize line breaks
|
||||
cleaned = cleaned.replace("\r\n", "\n");
|
||||
|
||||
// Ensure proper spacing around operators
|
||||
for op in &["=", "+", "-", r"\times", r"\div"] {
|
||||
let spaced = format!(" {} ", op);
|
||||
cleaned = cleaned.replace(op, &spaced);
|
||||
}
|
||||
|
||||
// Remove duplicate spaces again
|
||||
while cleaned.contains(" ") {
|
||||
cleaned = cleaned.replace(" ", " ");
|
||||
}
|
||||
|
||||
cleaned.trim().to_string()
|
||||
}
|
||||
|
||||
/// Escape special LaTeX characters in text
|
||||
fn escape_text(&self, text: &str) -> String {
|
||||
text.replace('\\', r"\\")
|
||||
.replace('{', r"\{")
|
||||
.replace('}', r"\}")
|
||||
.replace('$', r"\$")
|
||||
.replace('%', r"\%")
|
||||
.replace('_', r"\_")
|
||||
.replace('&', r"\&")
|
||||
.replace('#', r"\#")
|
||||
.replace('^', r"\^")
|
||||
.replace('~', r"\~")
|
||||
}
|
||||
|
||||
/// Format table to LaTeX tabular environment
|
||||
fn format_table(&self, table: &str) -> String {
|
||||
let rows: Vec<&str> = table.lines().collect();
|
||||
if rows.is_empty() {
|
||||
return String::new();
|
||||
}
|
||||
|
||||
// Determine number of columns from first row
|
||||
let num_cols = rows[0].split('|').filter(|s| !s.is_empty()).count();
|
||||
let col_spec = "c".repeat(num_cols);
|
||||
|
||||
let mut output = format!("\\begin{{tabular}}{{{}}}\n", col_spec);
|
||||
output.push_str("\\hline\n");
|
||||
|
||||
for (i, row) in rows.iter().enumerate() {
|
||||
let cells: Vec<&str> = row
|
||||
.split('|')
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
|
||||
output.push_str(&cells.join(" & "));
|
||||
output.push_str(" \\\\\n");
|
||||
|
||||
if i == 0 {
|
||||
output.push_str("\\hline\n");
|
||||
}
|
||||
}
|
||||
|
||||
output.push_str("\\hline\n");
|
||||
output.push_str("\\end{tabular}");
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
/// Convert inline LaTeX to display math
|
||||
pub fn inline_to_display(&self, latex: &str) -> String {
|
||||
if self.numbered_equations {
|
||||
format!("\\begin{{equation}}\n{}\n\\end{{equation}}", latex.trim())
|
||||
} else {
|
||||
format!("\\[\n{}\n\\]", latex.trim())
|
||||
}
|
||||
}
|
||||
|
||||
/// Add equation label
|
||||
pub fn add_label(&self, latex: &str, label: &str) -> String {
|
||||
format!("{}\n\\label{{{}}}", latex.trim(), label)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LaTeXFormatter {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Styled LaTeX formatter with predefined templates
|
||||
#[allow(dead_code)]
|
||||
pub struct StyledLaTeXFormatter {
|
||||
base: LaTeXFormatter,
|
||||
style: LaTeXStyle,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum LaTeXStyle {
|
||||
Article,
|
||||
Report,
|
||||
Book,
|
||||
Beamer,
|
||||
Minimal,
|
||||
}
|
||||
|
||||
impl StyledLaTeXFormatter {
|
||||
pub fn new(style: LaTeXStyle) -> Self {
|
||||
let base = match style {
|
||||
LaTeXStyle::Article => LaTeXFormatter::new()
|
||||
.document_class("article".to_string())
|
||||
.with_packages(vec![
|
||||
"amsmath".to_string(),
|
||||
"amssymb".to_string(),
|
||||
"graphicx".to_string(),
|
||||
"hyperref".to_string(),
|
||||
]),
|
||||
LaTeXStyle::Report => LaTeXFormatter::new()
|
||||
.document_class("report".to_string())
|
||||
.with_packages(vec![
|
||||
"amsmath".to_string(),
|
||||
"amssymb".to_string(),
|
||||
"graphicx".to_string(),
|
||||
"hyperref".to_string(),
|
||||
"geometry".to_string(),
|
||||
]),
|
||||
LaTeXStyle::Book => LaTeXFormatter::new()
|
||||
.document_class("book".to_string())
|
||||
.with_packages(vec![
|
||||
"amsmath".to_string(),
|
||||
"amssymb".to_string(),
|
||||
"graphicx".to_string(),
|
||||
"hyperref".to_string(),
|
||||
"geometry".to_string(),
|
||||
"fancyhdr".to_string(),
|
||||
]),
|
||||
LaTeXStyle::Beamer => LaTeXFormatter::new()
|
||||
.document_class("beamer".to_string())
|
||||
.with_packages(vec![
|
||||
"amsmath".to_string(),
|
||||
"amssymb".to_string(),
|
||||
"graphicx".to_string(),
|
||||
]),
|
||||
LaTeXStyle::Minimal => LaTeXFormatter::new()
|
||||
.document_class("article".to_string())
|
||||
.with_packages(vec!["amsmath".to_string()]),
|
||||
};
|
||||
|
||||
Self { base, style }
|
||||
}
|
||||
|
||||
pub fn format_document(
|
||||
&self,
|
||||
content: &str,
|
||||
title: Option<&str>,
|
||||
author: Option<&str>,
|
||||
) -> String {
|
||||
let mut preamble = String::new();
|
||||
|
||||
if let Some(t) = title {
|
||||
preamble.push_str(&format!("\\title{{{}}}\n", t));
|
||||
}
|
||||
if let Some(a) = author {
|
||||
preamble.push_str(&format!("\\author{{{}}}\n", a));
|
||||
}
|
||||
if title.is_some() || author.is_some() {
|
||||
preamble.push_str("\\date{\\today}\n");
|
||||
}
|
||||
|
||||
let formatter = self.base.clone().preamble(preamble);
|
||||
let mut doc = formatter.format_document(content);
|
||||
|
||||
// Add maketitle after \begin{document} if we have title/author
|
||||
if title.is_some() || author.is_some() {
|
||||
doc = doc.replace(
|
||||
"\\begin{document}\n\n",
|
||||
"\\begin{document}\n\n\\maketitle\n\n",
|
||||
);
|
||||
}
|
||||
|
||||
doc
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::output::BoundingBox;

    /// Formatting a snippet keeps its content.
    #[test]
    fn test_format_simple() {
        let result = LaTeXFormatter::new().format("E = mc^2");
        assert!(result.contains("mc^2"));
    }

    /// A generated document has class, packages, body markers and content.
    #[test]
    fn test_format_document() {
        let doc = LaTeXFormatter::new().format_document("E = mc^2");

        for expected in [
            r"\documentclass{article}",
            r"\usepackage{amsmath}",
            r"\begin{document}",
            "mc^2",
            r"\end{document}",
        ]
        .iter()
        {
            assert!(doc.contains(*expected));
        }
    }

    /// Dollar, ampersand and percent signs are backslash-escaped.
    #[test]
    fn test_escape_text() {
        let result = LaTeXFormatter::new().escape_text("Price: $100 & 50%");

        assert!(result.contains(r"\$100"));
        assert!(result.contains(r"\&"));
        assert!(result.contains(r"\%"));
    }

    /// Unnumbered display math is wrapped in `\[ ... \]`.
    #[test]
    fn test_inline_to_display() {
        let result = LaTeXFormatter::new().inline_to_display("x^2 + y^2 = r^2");

        assert!(result.contains(r"\["));
        assert!(result.contains(r"\]"));
    }

    /// Title and author end up in the document together with `\maketitle`.
    #[test]
    fn test_styled_formatter() {
        let doc = StyledLaTeXFormatter::new(LaTeXStyle::Article).format_document(
            "Content",
            Some("My Title"),
            Some("Author Name"),
        );

        assert!(doc.contains(r"\title{My Title}"));
        assert!(doc.contains(r"\author{Author Name}"));
        assert!(doc.contains(r"\maketitle"));
    }

    /// A text line and an equation line both appear in the formatted output.
    #[test]
    fn test_format_lines() {
        let intro = LineData {
            line_type: "text".to_string(),
            text: "Introduction".to_string(),
            latex: None,
            bbox: BoundingBox::new(0.0, 0.0, 100.0, 20.0),
            confidence: 0.95,
            words: None,
        };
        let equation = LineData {
            line_type: "equation".to_string(),
            text: "E = mc^2".to_string(),
            latex: Some(r"E = mc^2".to_string()),
            bbox: BoundingBox::new(0.0, 25.0, 100.0, 30.0),
            confidence: 0.98,
            words: None,
        };

        let result = LaTeXFormatter::new().format_lines(&[intro, equation]);

        assert!(result.contains("Introduction"));
        assert!(result.contains(r"\[") || result.contains(r"\begin{equation}"));
        assert!(result.contains("mc^2"));
    }
}
|
||||
379
vendor/ruvector/examples/scipix/src/output/mmd.rs
vendored
Normal file
379
vendor/ruvector/examples/scipix/src/output/mmd.rs
vendored
Normal file
@@ -0,0 +1,379 @@
|
||||
//! Scipix Markdown (MMD) formatter
|
||||
//!
|
||||
//! MMD is an enhanced markdown format that supports:
|
||||
//! - Inline and display math with LaTeX
|
||||
//! - Tables with alignment
|
||||
//! - Chemistry notation (SMILES)
|
||||
//! - Image embedding
|
||||
//! - Structured documents
|
||||
|
||||
use super::{LineData, MathDelimiters};
|
||||
|
||||
/// Scipix Markdown formatter
|
||||
pub struct MmdFormatter {
|
||||
delimiters: MathDelimiters,
|
||||
include_metadata: bool,
|
||||
preserve_structure: bool,
|
||||
}
|
||||
|
||||
impl MmdFormatter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
delimiters: MathDelimiters::default(),
|
||||
include_metadata: false,
|
||||
preserve_structure: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_delimiters(delimiters: MathDelimiters) -> Self {
|
||||
Self {
|
||||
delimiters,
|
||||
include_metadata: false,
|
||||
preserve_structure: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn include_metadata(mut self, include: bool) -> Self {
|
||||
self.include_metadata = include;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn preserve_structure(mut self, preserve: bool) -> Self {
|
||||
self.preserve_structure = preserve;
|
||||
self
|
||||
}
|
||||
|
||||
/// Format line data to MMD
|
||||
pub fn format(&self, lines: &[LineData]) -> String {
|
||||
let mut output = String::new();
|
||||
let mut in_table = false;
|
||||
let mut in_list = false;
|
||||
|
||||
for line in lines {
|
||||
match line.line_type.as_str() {
|
||||
"text" => {
|
||||
if in_table {
|
||||
output.push_str("\n");
|
||||
in_table = false;
|
||||
}
|
||||
if in_list && !line.text.trim_start().starts_with(&['-', '*', '1']) {
|
||||
output.push_str("\n");
|
||||
in_list = false;
|
||||
}
|
||||
output.push_str(&line.text);
|
||||
output.push_str("\n");
|
||||
}
|
||||
"math" | "equation" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
let formatted = self.format_math(latex, true); // display mode
|
||||
output.push_str(&formatted);
|
||||
output.push_str("\n\n");
|
||||
}
|
||||
"inline_math" => {
|
||||
let latex = line.latex.as_ref().unwrap_or(&line.text);
|
||||
let formatted = self.format_math(latex, false); // inline mode
|
||||
output.push_str(&formatted);
|
||||
}
|
||||
"table_row" => {
|
||||
if !in_table {
|
||||
in_table = true;
|
||||
}
|
||||
output.push_str(&self.format_table_row(&line.text));
|
||||
output.push_str("\n");
|
||||
}
|
||||
"list_item" => {
|
||||
if !in_list {
|
||||
in_list = true;
|
||||
}
|
||||
output.push_str(&line.text);
|
||||
output.push_str("\n");
|
||||
}
|
||||
"heading" => {
|
||||
output.push_str(&format!("# {}\n\n", line.text));
|
||||
}
|
||||
"image" => {
|
||||
output.push_str(&self.format_image(&line.text));
|
||||
output.push_str("\n\n");
|
||||
}
|
||||
"chemistry" => {
|
||||
let smiles = line.text.trim();
|
||||
output.push_str(&format!("```smiles\n{}\n```\n\n", smiles));
|
||||
}
|
||||
_ => {
|
||||
// Unknown type, output as text
|
||||
output.push_str(&line.text);
|
||||
output.push_str("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output.trim().to_string()
|
||||
}
|
||||
|
||||
/// Format LaTeX math expression
|
||||
pub fn format_math(&self, latex: &str, display: bool) -> String {
|
||||
if display {
|
||||
format!(
|
||||
"{}\n{}\n{}",
|
||||
self.delimiters.display_start,
|
||||
latex.trim(),
|
||||
self.delimiters.display_end
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"{}{}{}",
|
||||
self.delimiters.inline_start,
|
||||
latex.trim(),
|
||||
self.delimiters.inline_end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Format table row
|
||||
fn format_table_row(&self, row: &str) -> String {
|
||||
// Basic table formatting - split by | and rejoin
|
||||
let cells: Vec<&str> = row.split('|').map(|s| s.trim()).collect();
|
||||
format!("| {} |", cells.join(" | "))
|
||||
}
|
||||
|
||||
/// Format image reference
|
||||
fn format_image(&self, path: &str) -> String {
|
||||
// Extract alt text and path if available
|
||||
if path.contains('[') && path.contains(']') {
|
||||
path.to_string()
|
||||
} else {
|
||||
format!("", path)
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert plain text with embedded LaTeX to MMD
|
||||
pub fn from_mixed_text(&self, text: &str) -> String {
|
||||
let mut output = String::new();
|
||||
let mut current = String::new();
|
||||
let mut in_math = false;
|
||||
let mut display_math = false;
|
||||
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
let mut i = 0;
|
||||
|
||||
while i < chars.len() {
|
||||
// Check for display math $$
|
||||
if i + 1 < chars.len() && chars[i] == '$' && chars[i + 1] == '$' {
|
||||
if in_math && display_math {
|
||||
// End display math
|
||||
output.push_str(&self.format_math(¤t, true));
|
||||
current.clear();
|
||||
in_math = false;
|
||||
display_math = false;
|
||||
} else if !in_math {
|
||||
// Start display math
|
||||
if !current.is_empty() {
|
||||
output.push_str(¤t);
|
||||
current.clear();
|
||||
}
|
||||
in_math = true;
|
||||
display_math = true;
|
||||
}
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for inline math $
|
||||
if chars[i] == '$' && !display_math {
|
||||
if in_math {
|
||||
// End inline math
|
||||
output.push_str(&self.format_math(¤t, false));
|
||||
current.clear();
|
||||
in_math = false;
|
||||
} else {
|
||||
// Start inline math
|
||||
if !current.is_empty() {
|
||||
output.push_str(¤t);
|
||||
current.clear();
|
||||
}
|
||||
in_math = true;
|
||||
}
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
current.push(chars[i]);
|
||||
i += 1;
|
||||
}
|
||||
|
||||
if !current.is_empty() {
|
||||
output.push_str(¤t);
|
||||
}
|
||||
|
||||
output
|
||||
}
|
||||
|
||||
/// Format a complete document with frontmatter
|
||||
pub fn format_document(&self, title: &str, content: &str, metadata: Option<&str>) -> String {
|
||||
let mut doc = String::new();
|
||||
|
||||
// Add frontmatter if metadata provided
|
||||
if let Some(meta) = metadata {
|
||||
doc.push_str("---\n");
|
||||
doc.push_str(meta);
|
||||
doc.push_str("\n---\n\n");
|
||||
}
|
||||
|
||||
// Add title
|
||||
doc.push_str(&format!("# {}\n\n", title));
|
||||
|
||||
// Add content
|
||||
doc.push_str(content);
|
||||
|
||||
doc
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MmdFormatter {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Parser that pulls LaTeX expressions back out of MMD text.
pub struct MmdParser;

impl MmdParser {
    /// Create a parser.
    pub fn new() -> Self {
        Self
    }

    /// Extract every `$...$` and `$$...$$` expression from `content`.
    ///
    /// Returns the expressions in document order as `(latex, is_display)`
    /// pairs, with the LaTeX trimmed. Text outside math is ignored, and an
    /// expression that is never closed is dropped. A `$` that closes an
    /// inline expression is recognized even when the next inline expression
    /// starts immediately after it (`$a$$b$` yields `a` and `b`).
    pub fn extract_latex(&self, content: &str) -> Vec<(String, bool)> {
        let mut expressions = Vec::new();
        let mut current = String::new();
        let mut in_math = false;
        let mut display_math = false;

        let chars: Vec<char> = content.chars().collect();
        let mut i = 0;

        while i < chars.len() {
            if chars[i] != '$' {
                if in_math {
                    current.push(chars[i]);
                }
                i += 1;
                continue;
            }

            let doubled = chars.get(i + 1) == Some(&'$');

            // `$$` is a display delimiter unless an inline expression is
            // open, in which case the first `$` closes that expression.
            if doubled && (!in_math || display_math) {
                if in_math {
                    expressions.push((current.trim().to_string(), true));
                    current.clear();
                    in_math = false;
                    display_math = false;
                } else {
                    in_math = true;
                    display_math = true;
                }
                i += 2;
            } else if !display_math {
                if in_math {
                    expressions.push((current.trim().to_string(), false));
                    current.clear();
                    in_math = false;
                } else {
                    in_math = true;
                }
                i += 1;
            } else {
                // A lone `$` inside display math is part of the expression.
                current.push('$');
                i += 1;
            }
        }

        expressions
    }
}

impl Default for MmdParser {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::output::BoundingBox;

    /// Build a `LineData` fixture 100 units wide, starting at x = 0, with no word data.
    fn line(
        kind: &str,
        text: &str,
        latex: Option<&str>,
        y: f32,
        height: f32,
        confidence: f32,
    ) -> LineData {
        LineData {
            line_type: kind.to_string(),
            text: text.to_string(),
            latex: latex.map(|l| l.to_string()),
            bbox: BoundingBox::new(0.0, y, 100.0, height),
            confidence,
            words: None,
        }
    }

    #[test]
    fn test_format_inline_math() {
        let inline = MmdFormatter::new().format_math("E = mc^2", false);
        assert_eq!(inline, "$E = mc^2$");
    }

    #[test]
    fn test_format_display_math() {
        let display = MmdFormatter::new().format_math(r"\int_0^1 x^2 dx", true);
        for needle in ["$$", r"\int_0^1 x^2 dx"].iter() {
            assert!(display.contains(*needle));
        }
    }

    #[test]
    fn test_format_lines() {
        let lines = vec![
            line("text", "The equation", None, 0.0, 20.0, 0.95),
            line("math", "E = mc^2", Some(r"E = mc^2"), 25.0, 30.0, 0.98),
        ];

        let rendered = MmdFormatter::new().format(&lines);
        for needle in ["The equation", "$$", "mc^2"].iter() {
            assert!(rendered.contains(*needle));
        }
    }

    #[test]
    fn test_from_mixed_text() {
        let rendered = MmdFormatter::new().from_mixed_text("The formula $E = mc^2$ is famous.");
        for needle in ["$E = mc^2$", "famous"].iter() {
            assert!(rendered.contains(*needle));
        }
    }

    #[test]
    fn test_extract_latex() {
        let found = MmdParser::new().extract_latex("Text with $inline$ and $$display$$ math.");

        assert_eq!(found.len(), 2);

        // First hit is the inline span, second the display span.
        let (inline_src, inline_is_display) = &found[0];
        assert_eq!(inline_src, "inline");
        assert!(!*inline_is_display);

        let (display_src, display_is_display) = &found[1];
        assert_eq!(display_src, "display");
        assert!(*display_is_display);
    }

    #[test]
    fn test_format_document() {
        let frontmatter = "author: Test\ndate: 2025-01-01";
        let doc = MmdFormatter::new().format_document("My Document", "Content here", Some(frontmatter));

        for needle in ["---", "author: Test", "# My Document", "Content here"].iter() {
            assert!(doc.contains(*needle));
        }
    }
}
|
||||
359
vendor/ruvector/examples/scipix/src/output/mod.rs
vendored
Normal file
359
vendor/ruvector/examples/scipix/src/output/mod.rs
vendored
Normal file
@@ -0,0 +1,359 @@
|
||||
//! Output formatting module for Scipix OCR results
|
||||
//!
|
||||
//! Supports multiple output formats:
|
||||
//! - Text: Plain text extraction
|
||||
//! - LaTeX: Mathematical notation
|
||||
//! - Scipix Markdown (mmd): Enhanced markdown with math
|
||||
//! - MathML: XML-based mathematical markup
|
||||
//! - HTML: Web-ready output with math rendering
|
||||
//! - SMILES: Chemical structure notation
|
||||
//! - DOCX: Microsoft Word format (Office Math ML)
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub mod docx;
|
||||
pub mod formatter;
|
||||
pub mod html;
|
||||
pub mod json;
|
||||
pub mod latex;
|
||||
pub mod mmd;
|
||||
pub mod smiles;
|
||||
|
||||
pub use formatter::{HtmlEngine, MathDelimiters, OutputFormatter};
|
||||
pub use json::ApiResponse;
|
||||
|
||||
/// Output format types supported by Scipix OCR
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum OutputFormat {
|
||||
/// Plain text output
|
||||
Text,
|
||||
/// LaTeX mathematical notation
|
||||
#[serde(rename = "latex_normal")]
|
||||
LaTeX,
|
||||
/// Styled LaTeX with custom packages
|
||||
#[serde(rename = "latex_styled")]
|
||||
LaTeXStyled,
|
||||
/// Mathematical Markup Language
|
||||
#[serde(rename = "mathml")]
|
||||
MathML,
|
||||
/// Scipix Markdown (enhanced markdown)
|
||||
#[serde(rename = "mmd")]
|
||||
Mmd,
|
||||
/// ASCII Math notation
|
||||
#[serde(rename = "asciimath")]
|
||||
AsciiMath,
|
||||
/// HTML with embedded math
|
||||
Html,
|
||||
/// Chemical structure notation
|
||||
#[serde(rename = "smiles")]
|
||||
Smiles,
|
||||
/// Microsoft Word format
|
||||
Docx,
|
||||
}
|
||||
|
||||
impl OutputFormat {
|
||||
/// Get the file extension for this format
|
||||
pub fn extension(&self) -> &'static str {
|
||||
match self {
|
||||
OutputFormat::Text => "txt",
|
||||
OutputFormat::LaTeX | OutputFormat::LaTeXStyled => "tex",
|
||||
OutputFormat::MathML => "xml",
|
||||
OutputFormat::Mmd => "mmd",
|
||||
OutputFormat::AsciiMath => "txt",
|
||||
OutputFormat::Html => "html",
|
||||
OutputFormat::Smiles => "smi",
|
||||
OutputFormat::Docx => "docx",
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the MIME type for this format
|
||||
pub fn mime_type(&self) -> &'static str {
|
||||
match self {
|
||||
OutputFormat::Text | OutputFormat::AsciiMath => "text/plain",
|
||||
OutputFormat::LaTeX | OutputFormat::LaTeXStyled => "application/x-latex",
|
||||
OutputFormat::MathML => "application/mathml+xml",
|
||||
OutputFormat::Mmd => "text/markdown",
|
||||
OutputFormat::Html => "text/html",
|
||||
OutputFormat::Smiles => "chemical/x-daylight-smiles",
|
||||
OutputFormat::Docx => {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Complete OCR result with all possible output formats
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OcrResult {
|
||||
/// Request identifier
|
||||
pub request_id: String,
|
||||
|
||||
/// Version of the OCR engine
|
||||
pub version: String,
|
||||
|
||||
/// Image dimensions
|
||||
pub image_width: u32,
|
||||
pub image_height: u32,
|
||||
|
||||
/// Processing status
|
||||
pub is_printed: bool,
|
||||
pub is_handwritten: bool,
|
||||
pub auto_rotate_confidence: f32,
|
||||
pub auto_rotate_degrees: i32,
|
||||
|
||||
/// Confidence scores
|
||||
pub confidence: f32,
|
||||
pub confidence_rate: f32,
|
||||
|
||||
/// Available output formats
|
||||
pub formats: FormatsData,
|
||||
|
||||
/// Detailed line and word data
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub line_data: Option<Vec<LineData>>,
|
||||
|
||||
/// Error information if processing failed
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub error: Option<String>,
|
||||
|
||||
/// Processing metadata
|
||||
#[serde(flatten)]
|
||||
pub metadata: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
/// All available output format data
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct FormatsData {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub text: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub latex_normal: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub latex_styled: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub latex_simplified: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub mathml: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub asciimath: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub mmd: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub html: Option<String>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub smiles: Option<String>,
|
||||
}
|
||||
|
||||
/// Line-level OCR data with positioning
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LineData {
|
||||
/// Line type: text, math, table, image, etc.
|
||||
#[serde(rename = "type")]
|
||||
pub line_type: String,
|
||||
|
||||
/// Content in various formats
|
||||
pub text: String,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub latex: Option<String>,
|
||||
|
||||
/// Bounding box coordinates
|
||||
pub bbox: BoundingBox,
|
||||
|
||||
/// Confidence score
|
||||
pub confidence: f32,
|
||||
|
||||
/// Word-level data
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub words: Option<Vec<WordData>>,
|
||||
}
|
||||
|
||||
/// Word-level OCR data
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WordData {
|
||||
pub text: String,
|
||||
pub bbox: BoundingBox,
|
||||
pub confidence: f32,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub latex: Option<String>,
|
||||
}
|
||||
|
||||
/// Bounding box coordinates (x, y, width, height)
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct BoundingBox {
|
||||
pub x: f32,
|
||||
pub y: f32,
|
||||
pub width: f32,
|
||||
pub height: f32,
|
||||
}
|
||||
|
||||
impl BoundingBox {
|
||||
pub fn new(x: f32, y: f32, width: f32, height: f32) -> Self {
|
||||
Self {
|
||||
x,
|
||||
y,
|
||||
width,
|
||||
height,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn area(&self) -> f32 {
|
||||
self.width * self.height
|
||||
}
|
||||
|
||||
pub fn center(&self) -> (f32, f32) {
|
||||
(self.x + self.width / 2.0, self.y + self.height / 2.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert between output formats
|
||||
pub fn convert_format(
|
||||
content: &str,
|
||||
from: OutputFormat,
|
||||
to: OutputFormat,
|
||||
) -> Result<String, String> {
|
||||
// Simple pass-through for same format
|
||||
if from == to {
|
||||
return Ok(content.to_string());
|
||||
}
|
||||
|
||||
// Format-specific conversions
|
||||
match (from, to) {
|
||||
(OutputFormat::LaTeX, OutputFormat::Text) => {
|
||||
// Strip LaTeX commands for plain text
|
||||
Ok(strip_latex(content))
|
||||
}
|
||||
(OutputFormat::Mmd, OutputFormat::LaTeX) => {
|
||||
// Extract LaTeX from markdown
|
||||
Ok(extract_latex_from_mmd(content))
|
||||
}
|
||||
(OutputFormat::LaTeX, OutputFormat::Html) => {
|
||||
// Wrap LaTeX in HTML with MathJax
|
||||
Ok(format!(
|
||||
r#"<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
||||
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<p>\({}\)</p>
|
||||
</body>
|
||||
</html>"#,
|
||||
content
|
||||
))
|
||||
}
|
||||
_ => Err(format!(
|
||||
"Conversion from {:?} to {:?} not supported",
|
||||
from, to
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduce LaTeX source to rough plain text.
///
/// Three passes are applied, in this order:
/// 1. the math delimiters `\(`, `\)`, `\[`, `\]` and `$$` are removed;
/// 2. `\text{…}`, `\mathrm{…}`, `\mathbf{…}` and `\mathit{…}` are unwrapped,
///    keeping their content;
/// 3. the spacing commands `\\`, `\,`, `\;`, `\:`, `\!`, `\quad` and `\qquad`
///    are each replaced by a single space.
///
/// Only the closing brace that matches an unwrapped command is dropped.
/// Braces belonging to anything else (for example `x^{2}`) and the escaped
/// braces `\{` / `\}` are left intact, so the output never contains a `{`
/// whose partner was deleted. The result is trimmed.
fn strip_latex(content: &str) -> String {
    const WRAPPERS: [&str; 4] = ["\\text{", "\\mathrm{", "\\mathbf{", "\\mathit{"];
    const SPACING: [&str; 7] = ["\\\\", "\\,", "\\;", "\\:", "\\!", "\\quad", "\\qquad"];

    // Remove math delimiters
    let without_delimiters = content
        .replace("\\(", "")
        .replace("\\)", "")
        .replace("\\[", "")
        .replace("\\]", "")
        .replace("$$", "");

    // Unwrap the known commands. Each open brace is recorded on a stack with
    // a flag saying whether it belongs to one of them.
    let mut unwrapped = String::with_capacity(without_delimiters.len());
    let mut open_braces: Vec<bool> = Vec::new();
    let mut rest = without_delimiters.as_str();

    while let Some(ch) = rest.chars().next() {
        if let Some(tail) = WRAPPERS.iter().find_map(|w| rest.strip_prefix(*w)) {
            open_braces.push(true);
            rest = tail;
            continue;
        }

        let mut consumed = ch.len_utf8();
        match ch {
            '\\' => {
                unwrapped.push('\\');
                // Keep `\\`, `\{` and `\}` together so the second character
                // is never interpreted as a grouping brace or a new escape.
                if let Some(next) = rest[consumed..].chars().next() {
                    if matches!(next, '\\' | '{' | '}') {
                        unwrapped.push(next);
                        consumed += next.len_utf8();
                    }
                }
            }
            '{' => {
                open_braces.push(false);
                unwrapped.push('{');
            }
            '}' => {
                // Drop the brace only when it closes an unwrapped command.
                if open_braces.pop() != Some(true) {
                    unwrapped.push('}');
                }
            }
            other => unwrapped.push(other),
        }
        rest = &rest[consumed..];
    }

    // Remove standalone commands
    let mut result = unwrapped;
    for cmd in SPACING.iter() {
        result = result.replace(*cmd, " ");
    }

    result.trim().to_string()
}
|
||||
|
||||
/// Collect the math spans of an MMD document, joined by blank lines.
///
/// Both inline `$…$` and display `$$…$$` spans are collected in document
/// order and joined with `"\n\n"`. Span contents are not trimmed.
///
/// Inline and display spans are tracked as separate states:
/// - a `$` always closes an open inline span;
/// - inside a display span only `$$` closes it, a lone `$` is kept as content;
/// - a span still open at the end of the input is discarded.
fn extract_latex_from_mmd(content: &str) -> String {
    /// Where the scanner currently is relative to math delimiters.
    #[derive(Clone, Copy, PartialEq)]
    enum Mode {
        Text,
        Inline,
        Display,
    }

    let mut latex_parts: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut mode = Mode::Text;
    let mut chars = content.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '$' {
            if mode != Mode::Text {
                current.push(c);
            }
            continue;
        }

        let doubled = chars.peek() == Some(&'$');
        match mode {
            Mode::Text => {
                if doubled {
                    chars.next();
                    mode = Mode::Display;
                } else {
                    mode = Mode::Inline;
                }
            }
            Mode::Inline => {
                // Closing an inline span emits its content.
                latex_parts.push(std::mem::take(&mut current));
                mode = Mode::Text;
            }
            Mode::Display => {
                if doubled {
                    chars.next();
                    latex_parts.push(std::mem::take(&mut current));
                    mode = Mode::Text;
                } else {
                    current.push('$');
                }
            }
        }
    }

    latex_parts.join("\n\n")
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_output_format_extension() {
        let expected = [
            (OutputFormat::Text, "txt"),
            (OutputFormat::LaTeX, "tex"),
            (OutputFormat::Html, "html"),
            (OutputFormat::Mmd, "mmd"),
        ];
        for (format, extension) in expected.iter() {
            assert_eq!(format.extension(), *extension);
        }
    }

    #[test]
    fn test_output_format_mime_type() {
        let expected = [
            (OutputFormat::Text, "text/plain"),
            (OutputFormat::LaTeX, "application/x-latex"),
            (OutputFormat::Html, "text/html"),
        ];
        for (format, mime) in expected.iter() {
            assert_eq!(format.mime_type(), *mime);
        }
    }

    #[test]
    fn test_bounding_box() {
        let bbox = BoundingBox::new(10.0, 20.0, 100.0, 50.0);
        let (area, center) = (bbox.area(), bbox.center());
        assert_eq!(area, 5000.0);
        assert_eq!(center, (60.0, 45.0));
    }

    #[test]
    fn test_strip_latex() {
        let stripped = strip_latex(r"\text{Hello } \mathbf{World}");
        for word in ["Hello", "World"].iter() {
            assert!(stripped.contains(*word));
        }
    }

    #[test]
    fn test_convert_same_format() {
        let content = "test content";
        let converted = convert_format(content, OutputFormat::Text, OutputFormat::Text).unwrap();
        assert_eq!(converted, content);
    }
}
|
||||
347
vendor/ruvector/examples/scipix/src/output/smiles.rs
vendored
Normal file
347
vendor/ruvector/examples/scipix/src/output/smiles.rs
vendored
Normal file
@@ -0,0 +1,347 @@
|
||||
//! SMILES (Simplified Molecular Input Line Entry System) generator
|
||||
//!
|
||||
//! Converts chemical structure representations to SMILES notation.
|
||||
//! This is a simplified implementation - full chemistry support requires
|
||||
//! dedicated chemistry libraries like RDKit or OpenBabel.
|
||||
|
||||
use super::OcrResult;
|
||||
|
||||
/// SMILES notation generator for chemical structures
|
||||
pub struct SmilesGenerator {
|
||||
canonical: bool,
|
||||
include_stereochemistry: bool,
|
||||
}
|
||||
|
||||
impl SmilesGenerator {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
canonical: true,
|
||||
include_stereochemistry: true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn canonical(mut self, canonical: bool) -> Self {
|
||||
self.canonical = canonical;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn stereochemistry(mut self, include: bool) -> Self {
|
||||
self.include_stereochemistry = include;
|
||||
self
|
||||
}
|
||||
|
||||
/// Generate SMILES from OCR result
|
||||
pub fn generate_from_result(&self, result: &OcrResult) -> Result<String, String> {
|
||||
// Check if SMILES already available
|
||||
if let Some(smiles) = &result.formats.smiles {
|
||||
return Ok(smiles.clone());
|
||||
}
|
||||
|
||||
// Check for chemistry-related content in line data
|
||||
if let Some(line_data) = &result.line_data {
|
||||
for line in line_data {
|
||||
if line.line_type == "chemistry" || line.line_type == "molecule" {
|
||||
return self.parse_chemical_notation(&line.text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err("No chemical structure data found".to_string())
|
||||
}
|
||||
|
||||
/// Parse chemical notation to SMILES
|
||||
/// This is a placeholder - real implementation needs chemistry parsing
|
||||
fn parse_chemical_notation(&self, notation: &str) -> Result<String, String> {
|
||||
// Check if already SMILES format
|
||||
if self.is_smiles(notation) {
|
||||
return Ok(notation.to_string());
|
||||
}
|
||||
|
||||
// Try to parse common chemical formulas
|
||||
if let Some(smiles) = self.simple_formula_to_smiles(notation) {
|
||||
return Ok(smiles);
|
||||
}
|
||||
|
||||
Err(format!("Cannot convert '{}' to SMILES", notation))
|
||||
}
|
||||
|
||||
/// Check if string is already SMILES notation
|
||||
fn is_smiles(&self, s: &str) -> bool {
|
||||
// Basic SMILES characters
|
||||
let smiles_chars = "CNOPSFClBrI[]()=#@+-0123456789cnops";
|
||||
s.chars().all(|c| smiles_chars.contains(c))
|
||||
}
|
||||
|
||||
/// Convert simple chemical formulas to SMILES
|
||||
fn simple_formula_to_smiles(&self, formula: &str) -> Option<String> {
|
||||
// Common chemical formulas
|
||||
match formula.trim() {
|
||||
"H2O" | "water" => Some("O".to_string()),
|
||||
"CO2" | "carbon dioxide" => Some("O=C=O".to_string()),
|
||||
"CH4" | "methane" => Some("C".to_string()),
|
||||
"C2H6" | "ethane" => Some("CC".to_string()),
|
||||
"C2H5OH" | "ethanol" => Some("CCO".to_string()),
|
||||
"CH3COOH" | "acetic acid" => Some("CC(=O)O".to_string()),
|
||||
"C6H6" | "benzene" => Some("c1ccccc1".to_string()),
|
||||
"C6H12O6" | "glucose" => Some("OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O".to_string()),
|
||||
"NH3" | "ammonia" => Some("N".to_string()),
|
||||
"H2SO4" | "sulfuric acid" => Some("OS(=O)(=O)O".to_string()),
|
||||
"NaCl" | "sodium chloride" => Some("[Na+].[Cl-]".to_string()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Validate SMILES notation
|
||||
pub fn validate(&self, smiles: &str) -> Result<(), String> {
|
||||
// Basic validation checks
|
||||
|
||||
// Check parentheses balance
|
||||
let mut depth = 0;
|
||||
for c in smiles.chars() {
|
||||
match c {
|
||||
'(' => depth += 1,
|
||||
')' => {
|
||||
depth -= 1;
|
||||
if depth < 0 {
|
||||
return Err("Unbalanced parentheses".to_string());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if depth != 0 {
|
||||
return Err("Unbalanced parentheses".to_string());
|
||||
}
|
||||
|
||||
// Check brackets balance
|
||||
let mut depth = 0;
|
||||
for c in smiles.chars() {
|
||||
match c {
|
||||
'[' => depth += 1,
|
||||
']' => {
|
||||
depth -= 1;
|
||||
if depth < 0 {
|
||||
return Err("Unbalanced brackets".to_string());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
if depth != 0 {
|
||||
return Err("Unbalanced brackets".to_string());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convert SMILES to molecular formula
|
||||
pub fn to_molecular_formula(&self, smiles: &str) -> Result<String, String> {
|
||||
self.validate(smiles)?;
|
||||
|
||||
// Simplified formula extraction
|
||||
// Real implementation would parse the SMILES properly
|
||||
let mut counts: std::collections::HashMap<char, usize> = std::collections::HashMap::new();
|
||||
|
||||
for c in smiles.chars() {
|
||||
if c.is_alphabetic() && c.is_uppercase() {
|
||||
*counts.entry(c).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let mut formula = String::new();
|
||||
// Only use single-character elements for simplicity
|
||||
for element in &['C', 'H', 'N', 'O', 'S', 'P', 'F'] {
|
||||
if let Some(&count) = counts.get(element) {
|
||||
formula.push(*element);
|
||||
if count > 1 {
|
||||
formula.push_str(&count.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if formula.is_empty() {
|
||||
Err("Could not determine molecular formula".to_string())
|
||||
} else {
|
||||
Ok(formula)
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate molecular weight (approximate)
|
||||
pub fn molecular_weight(&self, smiles: &str) -> Result<f32, String> {
|
||||
self.validate(smiles)?;
|
||||
|
||||
// Simplified atomic weights
|
||||
let weights: std::collections::HashMap<char, f32> = [
|
||||
('C', 12.01),
|
||||
('H', 1.008),
|
||||
('N', 14.01),
|
||||
('O', 16.00),
|
||||
('S', 32.07),
|
||||
('P', 30.97),
|
||||
('F', 19.00),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let mut total_weight = 0.0;
|
||||
|
||||
for c in smiles.chars() {
|
||||
if let Some(&weight) = weights.get(&c) {
|
||||
total_weight += weight;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(total_weight)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SmilesGenerator {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// SMILES parser for extracting structure information.
///
/// The parser holds no state; every call works only on the string passed in.
pub struct SmilesParser;

impl SmilesParser {
    /// Create a parser.
    pub fn new() -> Self {
        Self
    }

    /// Count atoms in SMILES notation, keyed by element symbol.
    ///
    /// Only atoms written in the string are counted; implicit hydrogens are
    /// not, and a bracket hydrogen count such as `H4` counts as one `H`.
    ///
    /// Tokenization rules:
    /// - outside brackets only `Cl` and `Br` are two-letter symbols, so `Cc`
    ///   is an aliphatic carbon followed by an aromatic one;
    /// - inside brackets an uppercase letter followed by a lowercase one is a
    ///   single symbol (`Na`, `Fe`);
    /// - aromatic atoms (`b`, `c`, `n`, `o`, `p`, `s`, and `se`/`as` inside
    ///   brackets) are counted under their uppercase element symbol.
    pub fn count_atoms(&self, smiles: &str) -> std::collections::HashMap<String, usize> {
        let mut counts = std::collections::HashMap::new();
        let chars: Vec<char> = smiles.chars().collect();
        let mut in_bracket = false;
        let mut i = 0;

        while i < chars.len() {
            let c = chars[i];
            let next = chars.get(i + 1).copied();
            let mut found: Option<String> = None;

            match c {
                '[' => in_bracket = true,
                ']' => in_bracket = false,
                _ if c.is_ascii_uppercase() => {
                    let mut atom = String::from(c);
                    // Check for two-letter atoms (Cl, Br, bracket elements)
                    if let Some(n) = next.filter(|n| n.is_ascii_lowercase()) {
                        if in_bracket || matches!((c, n), ('C', 'l') | ('B', 'r')) {
                            atom.push(n);
                            i += 1;
                        }
                    }
                    found = Some(atom);
                }
                's' if in_bracket && next == Some('e') => {
                    found = Some("Se".to_string());
                    i += 1;
                }
                'a' if in_bracket && next == Some('s') => {
                    found = Some("As".to_string());
                    i += 1;
                }
                'b' | 'c' | 'n' | 'o' | 'p' | 's' => {
                    found = Some(c.to_ascii_uppercase().to_string());
                }
                _ => {}
            }

            if let Some(atom) = found {
                *counts.entry(atom).or_insert(0) += 1;
            }
            i += 1;
        }

        counts
    }

    /// Extract ring-closure labels in the order they appear.
    ///
    /// Each label appears once where the ring opens and once where it closes,
    /// so benzene `c1ccccc1` yields `[1, 1]`. A `%` followed by up to two
    /// digits is read as one label (`%12` gives 12). Digits inside square
    /// brackets are isotopes, hydrogen counts or charges and are skipped.
    pub fn find_rings(&self, smiles: &str) -> Vec<usize> {
        let mut rings = Vec::new();
        let mut in_bracket = false;
        let mut chars = smiles.chars().peekable();

        while let Some(c) = chars.next() {
            match c {
                '[' => in_bracket = true,
                ']' => in_bracket = false,
                _ if in_bracket => {}
                '%' => {
                    let mut label = 0usize;
                    let mut digits = 0;
                    while digits < 2 {
                        match chars.peek().and_then(|d| d.to_digit(10)) {
                            Some(d) => {
                                label = label * 10 + d as usize;
                                chars.next();
                                digits += 1;
                            }
                            None => break,
                        }
                    }
                    if digits > 0 {
                        rings.push(label);
                    }
                }
                _ => {
                    if let Some(digit) = c.to_digit(10) {
                        rings.push(digit as usize);
                    }
                }
            }
        }

        rings
    }
}

impl Default for SmilesParser {
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_smiles() {
        let gen = SmilesGenerator::new();

        for candidate in ["CCO", "c1ccccc1", "CC(=O)O"].iter() {
            assert!(gen.is_smiles(candidate));
        }
        assert!(!gen.is_smiles("not smiles!"));
    }

    #[test]
    fn test_simple_formula_conversion() {
        let gen = SmilesGenerator::new();
        let expected = [
            ("H2O", "O"),
            ("CO2", "O=C=O"),
            ("CH4", "C"),
            ("benzene", "c1ccccc1"),
        ];

        for (formula, smiles) in expected.iter() {
            assert_eq!(gen.simple_formula_to_smiles(formula), Some(smiles.to_string()));
        }
    }

    #[test]
    fn test_validate_smiles() {
        let gen = SmilesGenerator::new();

        for balanced in ["CCO", "CC(O)C", "c1ccccc1"].iter() {
            assert!(gen.validate(balanced).is_ok());
        }

        // Unbalanced in either direction must be rejected.
        for unbalanced in ["CC(O", "CC)O"].iter() {
            assert!(gen.validate(unbalanced).is_err());
        }
    }

    #[test]
    fn test_molecular_formula() {
        let formula = SmilesGenerator::new().to_molecular_formula("CCO").unwrap();
        for element in ['C', 'O'].iter() {
            assert!(formula.contains(*element));
        }
    }

    #[test]
    fn test_molecular_weight() {
        let gen = SmilesGenerator::new();

        // Water: H2O (but SMILES is just "O", representing OH2)
        let water = gen.molecular_weight("O").unwrap();
        assert!(water > 0.0);

        // Ethanol: C2H6O
        let ethanol = gen.molecular_weight("CCO").unwrap();
        assert!(ethanol > 30.0); // Should be around 46
    }

    #[test]
    fn test_count_atoms() {
        let parser = SmilesParser::new();

        let ethanol = parser.count_atoms("CCO");
        assert_eq!(ethanol.get("C"), Some(&2));
        assert_eq!(ethanol.get("O"), Some(&1));

        let acetic_acid = parser.count_atoms("CC(=O)O");
        assert_eq!(acetic_acid.get("C"), Some(&2));
        assert_eq!(acetic_acid.get("O"), Some(&2));
    }

    #[test]
    fn test_find_rings() {
        let parser = SmilesParser::new();

        for ring in ["c1ccccc1", "C1CC1"].iter() {
            assert_eq!(parser.find_rings(ring), vec![1, 1]);
        }
    }
}
|
||||
Reference in New Issue
Block a user