Files
wifi-densepose/vendor/ruvector/npm/packages/agentic-synth/training/dspy-multi-model-benchmark.d.ts

179 lines
4.5 KiB
TypeScript

/**
* DSPy.ts Multi-Model Benchmarking System v1.0.0
*
* Comprehensive benchmarking suite comparing multiple models across:
* - Quality metrics (f1Score, exactMatch, bleuScore, rougeScore)
* - Optimization strategies (BootstrapFewShot, MIPROv2)
* - Cost-effectiveness analysis
* - Performance characteristics
*
* Real-world implementation using actual dspy.ts v2.1.1 features:
* - ChainOfThought for reasoning
* - ReAct for iterative improvement
* - MultiChainComparison for ensemble decisions
* - BootstrapFewShot & MIPROv2 optimizers
*
* @requires dspy.ts@2.1.1
* @requires Environment: OPENAI_API_KEY, ANTHROPIC_API_KEY
*/
// Ambient value used as the base class of SyntheticDataModule in this file.
// It is typed `any`, so none of its members are type-checked through this declaration.
// NOTE(review): presumably the ChainOfThought module from dspy.ts named in the file header — confirm.
declare const ChainOfThought: any;
/**
 * Configuration record describing one model, as accepted by
 * `DSPyMultiModelBenchmark.addModel`.
 */
interface ModelConfig {
  /** Label for the model (presumably the name shown in results — confirm). */
  name: string;
  /** Backend the model is served from; restricted to these three literals. */
  provider:
    | 'openai'
    | 'anthropic'
    | 'openrouter';
  /** Provider-side identifier of the model. */
  modelId: string;
  /** Credential string for the provider. */
  apiKey: string;
  /**
   * Pricing split by token direction. Per the field name these are
   * per-1k-token figures; the currency is not specified in this declaration.
   */
  costPer1kTokens: {
    input: number;
    output: number;
  };
  /** Token limit for the model (exact semantics not visible here). */
  maxTokens: number;
}
/**
 * Numeric measurements collected for one benchmarked model, grouped into
 * four sections: quality, performance, cost and optimization.
 * Units and value ranges are not specified in this declaration.
 */
interface BenchmarkMetrics {
  /**
   * Quality scores.
   * NOTE(review): `overall` presumably aggregates the other four — confirm
   * against the implementation.
   */
  quality: {
    f1: number;
    exactMatch: number;
    bleu: number;
    rouge: number;
    overall: number;
  };

  /**
   * Runtime characteristics.
   * NOTE(review): p50/p95/p99 are presumably latency percentiles — confirm.
   */
  performance: {
    avgLatency: number;
    p50: number;
    p95: number;
    p99: number;
    throughput: number;
    successRate: number;
  };

  /** Spend and token accounting. */
  cost: {
    totalCost: number;
    costPerSample: number;
    costPerQualityPoint: number;
    inputTokens: number;
    outputTokens: number;
  };

  /**
   * Quality figures for the baseline, bootstrap and mipro variants, plus
   * the improvement figures for bootstrap and mipro.
   */
  optimization: {
    baselineQuality: number;
    bootstrapQuality: number;
    miproQuality: number;
    bootstrapImprovement: number;
    miproImprovement: number;
  };
}
/**
 * Outcome of benchmarking a single model: identification, the collected
 * metrics, and a per-round optimization log.
 */
interface BenchmarkResult {
  /** Name of the model this result belongs to. */
  modelName: string;
  /** When the result was produced, as a string (format not specified here). */
  timestamp: string;
  /** Measurements gathered for the model. */
  metrics: BenchmarkMetrics;
  /** One entry per recorded optimization step. */
  optimizationHistory: Array<{
    /** Which stage produced the entry. */
    method: 'baseline' | 'bootstrap' | 'mipro';
    round: number;
    quality: number;
    /** Duration of this step (unit not specified here). */
    duration: number;
  }>;
  /** Number of samples used for this run. */
  sampleSize: number;
  /** Duration of the whole run (unit not specified here). */
  duration: number;
}
/**
 * Aggregated comparison across benchmarked models, as returned by
 * `DSPyMultiModelBenchmark.runComparison`: a summary, the raw per-model
 * results, per-category rankings and recommendations.
 */
interface ComparisonReport {
  /** Headline figures for the whole comparison. */
  summary: {
    /** One string per category (presumably a model name — confirm). */
    winner: {
      quality: string;
      performance: string;
      cost: string;
      optimization: string;
      overall: string;
    };
    modelsCompared: number;
    totalSamples: number;
    /** Total duration (unit not specified here). */
    totalDuration: number;
  };

  /** Individual results, one per benchmarked model. */
  results: BenchmarkResult[];

  /**
   * Model/score pairs for each category.
   * NOTE(review): ordering (best-first?) is not visible in this declaration.
   */
  rankings: {
    quality: Array<{ model: string; score: number }>;
    performance: Array<{ model: string; score: number }>;
    cost: Array<{ model: string; score: number }>;
    optimization: Array<{ model: string; score: number }>;
  };

  /** One recommendation string per usage scenario. */
  recommendations: {
    production: string;
    research: string;
    costOptimized: string;
    balanced: string;
  };
}
/**
 * Synthetic data generator module built on Chain of Thought.
 *
 * Extends the ambient `ChainOfThought` value, which is typed `any` in this
 * file, so inherited members are not visible to the type checker. The only
 * member declared here is a zero-argument constructor.
 */
declare class SyntheticDataModule extends ChainOfThought {
  /** Creates the module; takes no arguments. */
  constructor();
}
/**
 * Benchmark driver: models are registered with `addModel`, compared with
 * `runComparison`, and the outcome can be turned into a markdown report with
 * `generateReport`.
 *
 * Private members are listed without types because this is a declaration
 * file; their signatures live in the implementation.
 */
export declare class DSPyMultiModelBenchmark {
  private models;
  private results;
  private outputDir;

  /**
   * @param outputDir - Optional output directory; the default used when it is
   *   omitted is not visible in this declaration.
   */
  constructor(outputDir?: string);

  /**
   * Registers a model so that it takes part in benchmarking.
   *
   * @param config - Description of the model to add.
   */
  addModel(config: ModelConfig): void;

  /**
   * Runs the comprehensive comparison across all models.
   *
   * @param sampleSize - Optional number of samples; the default is not
   *   visible in this declaration.
   * @returns A promise resolving to the comparison report.
   */
  runComparison(sampleSize?: number): Promise<ComparisonReport>;

  /** Benchmarks a single model. */
  private benchmarkModel;

  /**
   * Optimizes a module with BootstrapFewShot.
   *
   * @param module - Module to optimize.
   * @param schema - Untyped (`any`) schema value; its expected shape is not
   *   visible in this declaration.
   * @param sampleSize - Number of samples to use.
   * @returns A promise resolving to a `SyntheticDataModule`.
   */
  optimizeWithBootstrap(
    module: SyntheticDataModule,
    schema: any,
    sampleSize: number,
  ): Promise<SyntheticDataModule>;

  /**
   * Optimizes a module with MIPROv2.
   *
   * @param module - Module to optimize.
   * @param schema - Untyped (`any`) schema value; its expected shape is not
   *   visible in this declaration.
   * @param sampleSize - Number of samples to use.
   * @returns A promise resolving to a `SyntheticDataModule`.
   */
  optimizeWithMIPRO(
    module: SyntheticDataModule,
    schema: any,
    sampleSize: number,
  ): Promise<SyntheticDataModule>;

  /** Evaluates the quality of a module. */
  private evaluateModule;

  /** Measures performance metrics. */
  private measurePerformance;

  /** Generates the training dataset. */
  private generateTrainingSet;

  /** Generates sample synthetic data. */
  private generateSampleData;

  /** Calculates the quality score for synthetic data. */
  private calculateQualityScore;

  /** Calculates a percentile. */
  private percentile;

  /** Generates the comparison report. */
  private generateComparisonReport;

  /**
   * Generates and saves the markdown report.
   *
   * @param comparison - Comparison data to render.
   * @returns A promise resolving to a string; whether that is the report text
   *   or the path it was saved to is not visible in this declaration.
   */
  generateReport(comparison: ComparisonReport): Promise<string>;
}
// Public re-export of the interface types declared in this file.
export {
  ModelConfig,
  BenchmarkResult,
  ComparisonReport,
  BenchmarkMetrics,
};
//# sourceMappingURL=dspy-multi-model-benchmark.d.ts.map