Squashed 'vendor/ruvector/' content from commit b64c2172

git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
This commit is contained in:
ruv
2026-02-28 14:39:40 -05:00
commit d803bfe2b1
7854 changed files with 3522914 additions and 0 deletions

View File

@@ -0,0 +1,64 @@
{
"name": "@ruvector/scipix",
"version": "0.1.0",
"description": "OCR client for scientific documents - extract LaTeX and MathML from equations, research papers, and technical diagrams",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"exports": {
".": {
"import": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
},
"require": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
}
}
},
"scripts": {
"build": "tsc",
"prepublishOnly": "npm run build",
"test": "node --test test/*.test.js",
"typecheck": "tsc --noEmit",
"clean": "rm -rf dist"
},
"devDependencies": {
"@types/node": "^20.19.30",
"typescript": "^5.9.3"
},
"keywords": [
"ocr",
"latex",
"mathml",
"scientific-computing",
"image-recognition",
"math-ocr",
"equation-recognition",
"document-processing",
"ruvector",
"pdf-extraction"
],
"author": "rUv Team <team@ruv.io>",
"license": "MIT OR Apache-2.0",
"repository": {
"type": "git",
"url": "https://github.com/ruvnet/ruvector.git",
"directory": "npm/packages/scipix"
},
"homepage": "https://github.com/ruvnet/ruvector/tree/main/examples/scipix",
"bugs": {
"url": "https://github.com/ruvnet/ruvector/issues"
},
"engines": {
"node": ">= 18"
},
"publishConfig": {
"registry": "https://registry.npmjs.org/",
"access": "public"
},
"files": [
"dist",
"README.md"
]
}

View File

@@ -0,0 +1,272 @@
/**
* SciPix OCR Client
* Client for interacting with SciPix OCR API
*/
import { readFile } from 'node:fs/promises';
import { extname } from 'node:path';
import {
type SciPixConfig,
type OCROptions,
type OCRResult,
type BatchOCRRequest,
type BatchOCRResult,
type HealthStatus,
SciPixError,
SciPixErrorCode,
OutputFormat,
ImageType,
} from './types.js';
/**
 * Default configuration.
 *
 * Supplies a value for every `SciPixConfig` field except `apiKey`, which has
 * no default (hence the `Omit` in the type). Caller-supplied settings take
 * precedence over these values when a client is constructed.
 */
const DEFAULT_CONFIG: Required<Omit<SciPixConfig, 'apiKey'>> = {
// Targets a server on localhost port 8080 unless the caller overrides it.
baseUrl: 'http://localhost:8080',
// Request timeout in milliseconds (30 seconds).
timeout: 30000,
// Upper bound on automatic retries of a failed request.
maxRetries: 3,
// Options applied to each single-image OCR call before per-call options are merged in.
defaultOptions: {
formats: [OutputFormat.LaTeX, OutputFormat.Text],
detectEquations: true,
preprocess: true,
},
};
/**
 * Client for the SciPix OCR HTTP API.
 *
 * Wraps the `/api/v1/ocr`, `/api/v1/ocr/batch` and `/api/v1/health` endpoints.
 * Failures are reported as {@link SciPixError}; network-level failures are
 * retried up to `maxRetries` times with a linearly growing delay.
 */
export class SciPixClient {
  private config: Required<Omit<SciPixConfig, 'apiKey'>> & { apiKey?: string };

  /**
   * @param config - Optional settings. Any field that is omitted or explicitly
   *   `undefined` falls back to the built-in default, so `{ timeout: undefined }`
   *   cannot disable the timeout by accident. `defaultOptions`, when given,
   *   replaces the built-in default options as a whole.
   */
  constructor(config?: SciPixConfig) {
    this.config = {
      baseUrl: config?.baseUrl ?? DEFAULT_CONFIG.baseUrl,
      timeout: config?.timeout ?? DEFAULT_CONFIG.timeout,
      maxRetries: config?.maxRetries ?? DEFAULT_CONFIG.maxRetries,
      defaultOptions: config?.defaultOptions ?? DEFAULT_CONFIG.defaultOptions,
      apiKey: config?.apiKey,
    };
  }

  /**
   * Perform OCR on an image
   * @param image - Image data as a Buffer, a `data:` URL, a file path
   *   (absolute, or relative starting with `./` or `../`), or a raw base64 string
   * @param options - OCR options; merged over the client's `defaultOptions`
   * @returns The parsed response body of `POST /api/v1/ocr`
   * @throws SciPixError on an invalid data URL, HTTP error, timeout or network failure
   */
  async ocr(image: Buffer | string, options?: OCROptions): Promise<OCRResult> {
    const imageData = await this.prepareImage(image);
    const mergedOptions = { ...this.config.defaultOptions, ...options };
    const response = await this.request('/api/v1/ocr', {
      method: 'POST',
      body: JSON.stringify({
        image: imageData.base64,
        imageType: imageData.type,
        options: mergedOptions,
      }),
    });
    return response as OCRResult;
  }

  /**
   * Perform OCR on a file
   *
   * The file is read into memory and its type is detected from its leading
   * bytes, not from the file extension.
   * @param filePath - Path to the image file
   * @param options - OCR options
   */
  async ocrFile(filePath: string, options?: OCROptions): Promise<OCRResult> {
    const buffer = await readFile(filePath);
    return this.ocr(buffer, options);
  }

  /**
   * Perform batch OCR on multiple images
   *
   * The request is sent unchanged: the client's `defaultOptions` are not
   * merged in and image sources are not pre-processed.
   * @param request - Batch OCR request
   */
  async batchOcr(request: BatchOCRRequest): Promise<BatchOCRResult> {
    const response = await this.request('/api/v1/ocr/batch', {
      method: 'POST',
      body: JSON.stringify(request),
    });
    return response as BatchOCRResult;
  }

  /**
   * Extract LaTeX from an equation image
   * @param image - Image data (same forms as {@link SciPixClient.ocr})
   * @returns The combined LaTeX output, or the plain text output when the
   *   response carries no LaTeX
   */
  async extractLatex(image: Buffer | string): Promise<string> {
    const result = await this.ocr(image, {
      formats: [OutputFormat.LaTeX],
      detectEquations: true,
    });
    return result.latex ?? result.text;
  }

  /**
   * Extract MathML from an equation image
   * @param image - Image data (same forms as {@link SciPixClient.ocr})
   * @returns The combined MathML output, or an empty string when there is none
   */
  async extractMathML(image: Buffer | string): Promise<string> {
    const result = await this.ocr(image, {
      formats: [OutputFormat.MathML],
      detectEquations: true,
    });
    return result.mathml ?? '';
  }

  /**
   * Check API health status
   * @returns The parsed response body of `GET /api/v1/health`
   */
  async health(): Promise<HealthStatus> {
    const response = await this.request('/api/v1/health', {
      method: 'GET',
    });
    return response as HealthStatus;
  }

  /**
   * Prepare image for API request
   *
   * Normalises every accepted input form to a base64 payload plus an image type.
   * @throws SciPixError (`INVALID_IMAGE`) when a `data:` URL is malformed
   */
  private async prepareImage(
    image: Buffer | string,
  ): Promise<{ base64: string; type: ImageType }> {
    if (Buffer.isBuffer(image)) {
      return { base64: image.toString('base64'), type: this.detectImageType(image) };
    }

    if (image.startsWith('data:')) {
      // Base64 data URL. PDFs use the `application/pdf` media type, so accept
      // that in addition to `image/*`.
      const match = image.match(/^data:(image\/\w+|application\/pdf);base64,(.+)$/);
      if (!match) {
        throw SciPixError.invalidImage('Invalid data URL format');
      }
      const [, mediaType = '', payload = ''] = match;
      const subtype = mediaType.slice(mediaType.indexOf('/') + 1);
      return { base64: payload, type: this.parseImageType(subtype) };
    }

    if (this.isFilePath(image)) {
      const buffer = await readFile(image);
      return {
        base64: buffer.toString('base64'),
        type: this.getTypeFromExtension(extname(image)),
      };
    }

    // Raw base64 string: sniff the type from the first decoded bytes instead of
    // always claiming PNG (detectImageType still falls back to PNG when unsure).
    const head = Buffer.from(image.slice(0, 16), 'base64');
    return { base64: image, type: this.detectImageType(head) };
  }

  /**
   * Decide whether a string input names a file rather than carrying base64 data.
   *
   * Recognised as paths: strings starting with `/`, strings containing `:\`
   * (Windows drive paths), and relative paths starting with `./`, `../`, `.\`
   * or `..\`.
   */
  private isFilePath(value: string): boolean {
    // Every JPEG starts with bytes FF D8 FF, which encode to "/9j/" in base64.
    // Without this guard raw base64 JPEG data would be mistaken for an
    // absolute POSIX path and handed to readFile.
    if (value.startsWith('/9j/') && /^[A-Za-z0-9+/=\s]+$/.test(value)) {
      return false;
    }
    return value.startsWith('/') || value.includes(':\\') || /^\.{1,2}[\\/]/.test(value);
  }

  /**
   * Detect image type from buffer magic bytes
   *
   * Only the first two bytes are inspected; unknown or too-short input is
   * reported as PNG.
   */
  private detectImageType(buffer: Buffer): ImageType {
    if (buffer[0] === 0x89 && buffer[1] === 0x50) return ImageType.PNG; // "\x89P"
    if (buffer[0] === 0xff && buffer[1] === 0xd8) return ImageType.JPEG; // SOI marker
    if (buffer[0] === 0x52 && buffer[1] === 0x49) return ImageType.WebP; // "RI" (RIFF)
    if (buffer[0] === 0x25 && buffer[1] === 0x50) return ImageType.PDF; // "%P"
    if (buffer[0] === 0x49 && buffer[1] === 0x49) return ImageType.TIFF; // "II"
    if (buffer[0] === 0x4d && buffer[1] === 0x4d) return ImageType.TIFF; // "MM"
    if (buffer[0] === 0x42 && buffer[1] === 0x4d) return ImageType.BMP; // "BM"
    return ImageType.PNG; // Default
  }

  /**
   * Parse image type from a MIME subtype or bare extension (case-insensitive).
   * Unrecognised values map to PNG.
   */
  private parseImageType(mimeType: string): ImageType {
    switch (mimeType.toLowerCase()) {
      case 'png':
        return ImageType.PNG;
      case 'jpeg':
      case 'jpg':
        return ImageType.JPEG;
      case 'webp':
        return ImageType.WebP;
      case 'pdf':
        return ImageType.PDF;
      case 'tiff':
      case 'tif':
        return ImageType.TIFF;
      case 'bmp':
        return ImageType.BMP;
      default:
        return ImageType.PNG;
    }
  }

  /**
   * Get image type from file extension
   * @param ext - Extension including the leading dot, as returned by `extname`
   */
  private getTypeFromExtension(ext: string): ImageType {
    return this.parseImageType(ext.slice(1));
  }

  /**
   * Make HTTP request to API
   *
   * @param path - Endpoint path beginning with `/`, appended to `baseUrl`
   * @param options - Fetch options; `options.headers` override the defaults
   * @param retries - Number of attempts already consumed
   * @returns The parsed JSON response body
   * @throws SciPixError - `UNAUTHORIZED` (401), `RATE_LIMITED` (429), `SERVER`
   *   (any other non-2xx status, or a 2xx response whose body is not valid
   *   JSON), `TIMEOUT` (request or body read exceeded `timeout`), `NETWORK`
   *   (network failure after all retries)
   */
  private async request(
    path: string,
    options: RequestInit,
    retries = 0,
  ): Promise<unknown> {
    // Drop trailing slashes so a baseUrl such as "http://host/" does not yield "//api/...".
    const url = `${this.config.baseUrl.replace(/\/+$/, '')}${path}`;
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };
    if (this.config.apiKey) {
      headers['Authorization'] = `Bearer ${this.config.apiKey}`;
    }

    for (let attempt = retries; ; attempt += 1) {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), this.config.timeout);
      try {
        const response = await fetch(url, {
          ...options,
          headers: { ...headers, ...options.headers },
          signal: controller.signal,
        });
        if (!response.ok) {
          const errorBody = await response.text();
          if (response.status === 401) {
            throw new SciPixError('Unauthorized', SciPixErrorCode.Unauthorized, 401);
          }
          if (response.status === 429) {
            throw new SciPixError('Rate limited', SciPixErrorCode.RateLimited, 429);
          }
          throw SciPixError.serverError(errorBody, response.status);
        }
        try {
          return await response.json();
        } catch (parseError) {
          // A malformed body is a server-side problem. Resending the request
          // (possibly a POST) would not fix it, so do not let it reach the
          // network-retry path below.
          if (parseError instanceof SyntaxError) {
            throw SciPixError.serverError(
              `Invalid JSON in response: ${parseError.message}`,
              response.status,
            );
          }
          throw parseError;
        }
      } catch (error) {
        if (error instanceof SciPixError) {
          throw error;
        }
        if (controller.signal.aborted || (error instanceof Error && error.name === 'AbortError')) {
          throw SciPixError.timeout();
        }
        // Retry on network errors
        if (attempt >= this.config.maxRetries) {
          throw SciPixError.networkError(error instanceof Error ? error.message : String(error));
        }
      } finally {
        // Cleared only once the body has been consumed (or the attempt failed),
        // so the timeout also bounds a stalled response body.
        clearTimeout(timeoutId);
      }
      // Linear backoff: 1s, 2s, 3s, ...
      await new Promise((resolve) => setTimeout(resolve, 1000 * (attempt + 1)));
    }
  }
}
/**
 * Factory for {@link SciPixClient}.
 * @param config - Optional client settings, passed straight to the constructor
 * @returns A newly constructed client
 */
export function createClient(config?: SciPixConfig): SciPixClient {
  const client = new SciPixClient(config);
  return client;
}

View File

@@ -0,0 +1,64 @@
/**
* @ruvector/scipix - OCR Client for Scientific Documents
*
* A TypeScript client for the SciPix OCR API, enabling extraction of
* LaTeX, MathML, and text from scientific images, equations, and documents.
*
* @example
* ```typescript
* import { SciPixClient, OutputFormat } from '@ruvector/scipix';
*
* // Create client
* const client = new SciPixClient({
* baseUrl: 'http://localhost:8080',
* apiKey: 'your-api-key',
* });
*
* // OCR an image file
* const result = await client.ocrFile('./equation.png', {
* formats: [OutputFormat.LaTeX, OutputFormat.MathML],
* detectEquations: true,
* });
*
* console.log('LaTeX:', result.latex);
* console.log('Confidence:', result.confidence);
*
* // Quick LaTeX extraction
* const latex = await client.extractLatex('/absolute/path/to/math.png');
* console.log('Extracted LaTeX:', latex);
*
* // Batch processing
* const batchResult = await client.batchOcr({
* images: [
* { source: 'base64...', id: 'eq1' },
* { source: 'base64...', id: 'eq2' },
* ],
* defaultOptions: { formats: [OutputFormat.LaTeX] },
* });
*
* console.log(`Processed ${batchResult.successful}/${batchResult.totalImages} images`);
* ```
*
* @packageDocumentation
*/
// Runtime values: enums and the error class exist in the emitted JavaScript.
export {
  OutputFormat,
  ImageType,
  ConfidenceLevel,
  ContentType,
  SciPixError,
  SciPixErrorCode,
} from './types.js';
// Type-only re-exports: interfaces are erased at compile time, so they must be
// marked `type`. Otherwise single-file transpilers (and `isolatedModules`)
// would emit a runtime re-export of names that './types.js' does not provide.
export type {
  BoundingBox,
  OCRRegion,
  OCRResult,
  OCROptions,
  BatchOCRRequest,
  BatchOCRResult,
  SciPixConfig,
  HealthStatus,
} from './types.js';
// Client
export { SciPixClient, createClient } from './client.js';

View File

@@ -0,0 +1,230 @@
/**
* SciPix OCR Types
* Types for scientific document OCR and equation recognition
*/
/**
 * Supported output formats for OCR results.
 *
 * Used in `OCROptions.formats`; because this is a string enum, the lowercase
 * string values below are what appears in serialized request options.
 */
export enum OutputFormat {
/** LaTeX mathematical notation */
LaTeX = 'latex',
/** MathML markup language */
MathML = 'mathml',
/** ASCII math notation */
AsciiMath = 'asciimath',
/** Plain text */
Text = 'text',
/** Structured JSON with metadata */
JSON = 'json',
}
/**
 * Supported image input types.
 *
 * String enum: each member's value is the lowercase format name. Also used
 * for `OCRResult.metadata.imageType`.
 */
export enum ImageType {
PNG = 'png',
JPEG = 'jpeg',
WebP = 'webp',
PDF = 'pdf',
TIFF = 'tiff',
BMP = 'bmp',
}
/**
 * OCR confidence level: a coarse bucket reported alongside the numeric
 * confidence score of a region (see `OCRRegion.confidenceLevel`).
 * NOTE(review): the score thresholds separating the levels are not defined in
 * this file — confirm against the API.
 */
export enum ConfidenceLevel {
High = 'high',
Medium = 'medium',
Low = 'low',
}
/**
 * Type of content detected in the image.
 * Reported per region through `OCRRegion.contentType`.
 */
export enum ContentType {
/** Mathematical equation */
Equation = 'equation',
/** Text content */
Text = 'text',
/** Table structure */
Table = 'table',
/** Diagram or chart */
Diagram = 'diagram',
/** Mixed content */
Mixed = 'mixed',
}
/**
 * Bounding box for detected regions.
 * NOTE(review): units and origin are not specified in this file — presumably
 * pixels measured from the image's top-left corner; confirm against the API.
 */
export interface BoundingBox {
/** Horizontal position of the box */
x: number;
/** Vertical position of the box */
y: number;
/** Horizontal extent of the box */
width: number;
/** Vertical extent of the box */
height: number;
}
/**
 * Single OCR result region: one detected area of the image together with its
 * location, content classification and recognised content.
 */
export interface OCRRegion {
/** Unique identifier for this region */
id: string;
/** Bounding box of the detected region */
bbox: BoundingBox;
/** Type of content detected */
contentType: ContentType;
/** Raw text content */
text: string;
/** LaTeX representation (if applicable) */
latex?: string;
/** MathML representation (if applicable) */
mathml?: string;
/** Confidence score (0-1) */
confidence: number;
/** Confidence level (coarse bucket accompanying `confidence`) */
confidenceLevel: ConfidenceLevel;
}
/**
 * Complete OCR result for one image or document.
 *
 * `text` is always present, whereas `latex` and `mathml` are optional and may
 * be absent from a response.
 */
export interface OCRResult {
/** Unique result identifier */
id: string;
/** Original image dimensions */
imageDimensions: {
width: number;
height: number;
};
/** All detected regions */
regions: OCRRegion[];
/** Combined text output */
text: string;
/** Combined LaTeX output (if requested) */
latex?: string;
/** Combined MathML output (if requested) */
mathml?: string;
/** Processing time in milliseconds */
processingTime: number;
/** Model version used */
modelVersion: string;
/** Overall confidence */
confidence: number;
/** Metadata describing the processed input and what kinds of content were found */
metadata: {
imageType: ImageType;
hasEquations: boolean;
hasTables: boolean;
hasDiagrams: boolean;
/** Number of pages; optional — presumably only set for multi-page inputs such as PDFs (confirm) */
pageCount?: number;
};
}
/**
 * OCR request options.
 *
 * Every field is optional. The same shape is used for a client's
 * `defaultOptions`, for per-call options, and for per-image and default
 * options in a batch request.
 */
export interface OCROptions {
/** Desired output formats */
formats?: OutputFormat[];
/** Language hints for OCR (NOTE(review): expected code format is not specified here) */
languages?: string[];
/** Enable equation detection */
detectEquations?: boolean;
/** Enable table detection */
detectTables?: boolean;
/** Enable diagram detection */
detectDiagrams?: boolean;
/** Minimum confidence threshold (0-1) */
minConfidence?: number;
/** Enable preprocessing (deskew, denoise) */
preprocess?: boolean;
/** DPI hint for scanned documents */
dpi?: number;
/** Specific pages to process (for PDFs) (NOTE(review): 0- vs 1-based indexing is not specified here) */
pages?: number[];
}
/**
 * Batch OCR request: a list of image sources, each with an optional
 * identifier and per-image options, plus options shared by all images.
 */
export interface BatchOCRRequest {
/** Array of image URLs or base64 data */
images: Array<{
/** URL or base64 data */
source: string;
/** Optional identifier */
id?: string;
/** Per-image options */
options?: OCROptions;
}>;
/** Default options for all images */
defaultOptions?: OCROptions;
}
/**
 * Batch OCR result: aggregate counters plus one entry per processed image.
 */
export interface BatchOCRResult {
/** Total images processed */
totalImages: number;
/** Successful results */
successful: number;
/** Failed results */
failed: number;
/** Individual results; `result` and `error` are both optional (presumably set according to `success` — confirm) */
results: Array<{
id: string;
success: boolean;
result?: OCRResult;
error?: string;
}>;
/** Total processing time (presumably milliseconds, matching `OCRResult.processingTime` — confirm) */
totalProcessingTime: number;
}
/**
 * SciPix client configuration.
 * Every field is optional.
 */
export interface SciPixConfig {
/** API base URL */
baseUrl?: string;
/** API key for authentication */
apiKey?: string;
/** Request timeout in milliseconds */
timeout?: number;
/** Maximum retries for failed requests */
maxRetries?: number;
/** Default OCR options */
defaultOptions?: OCROptions;
}
/**
 * Health check response.
 */
export interface HealthStatus {
/** Overall service state */
status: 'healthy' | 'degraded' | 'unhealthy';
/** Version string (NOTE(review): presumably the server/API version — confirm) */
version: string;
/** One entry per model, with its name, whether it is loaded, and its version */
models: {
name: string;
loaded: boolean;
version: string;
}[];
/** Service uptime (NOTE(review): unit is not specified in this file — confirm) */
uptime: number;
}
/**
 * Error raised by the SciPix client.
 *
 * Carries a machine-readable {@link SciPixErrorCode} and, when one was
 * supplied, an HTTP status code. The static factories build the common variants.
 */
export class SciPixError extends Error {
  /** Failure category */
  public readonly code: SciPixErrorCode;
  /** HTTP status code, when one was supplied */
  public readonly statusCode?: number;

  constructor(message: string, code: SciPixErrorCode, statusCode?: number) {
    super(message);
    this.code = code;
    this.statusCode = statusCode;
    this.name = 'SciPixError';
  }

  /** Build a `NETWORK` error with the given message. */
  static networkError(message: string): SciPixError {
    const category = SciPixErrorCode.Network;
    return new SciPixError(message, category);
  }

  /** Build a `SERVER` error carrying the given status code. */
  static serverError(message: string, statusCode: number): SciPixError {
    const category = SciPixErrorCode.Server;
    return new SciPixError(message, category, statusCode);
  }

  /** Build an `INVALID_IMAGE` error with the given message. */
  static invalidImage(message: string): SciPixError {
    const category = SciPixErrorCode.InvalidImage;
    return new SciPixError(message, category);
  }

  /** Build a `TIMEOUT` error with a fixed message. */
  static timeout(): SciPixError {
    const category = SciPixErrorCode.Timeout;
    return new SciPixError('Request timed out', category);
  }
}

/** Failure categories attached to a {@link SciPixError} via its `code` field. */
export enum SciPixErrorCode {
  Network = 'NETWORK',
  Server = 'SERVER',
  InvalidImage = 'INVALID_IMAGE',
  Timeout = 'TIMEOUT',
  InvalidConfig = 'INVALID_CONFIG',
  Unauthorized = 'UNAUTHORIZED',
  RateLimited = 'RATE_LIMITED',
}

View File

@@ -0,0 +1,19 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist", "test"]
}