// File viewer metadata: 179 lines, 4.5 KiB, TypeScript
/**
 * DSPy.ts Multi-Model Benchmarking System v1.0.0
 *
 * Comprehensive benchmarking suite comparing multiple models across:
 * - Quality metrics (f1Score, exactMatch, bleuScore, rougeScore)
 * - Optimization strategies (BootstrapFewShot, MIPROv2)
 * - Cost-effectiveness analysis
 * - Performance characteristics
 *
 * Real-world implementation using actual dspy.ts v2.1.1 features:
 * - ChainOfThought for reasoning
 * - ReAct for iterative improvement
 * - MultiChainComparison for ensemble decisions
 * - BootstrapFewShot & MIPROv2 optimizers
 *
 * @requires dspy.ts@2.1.1
 * @requires Environment: OPENAI_API_KEY, ANTHROPIC_API_KEY
 */
/**
 * Ambient stand-in for the `ChainOfThought` base that `SyntheticDataModule`
 * extends. It is typed as `any`, so nothing inherited from it is type-checked
 * in this declaration file.
 * NOTE(review): presumably supplied by dspy.ts at runtime; confirm against the
 * implementation file.
 */
declare const ChainOfThought: any;
/**
 * Configuration for a single model; this is the argument type of
 * `DSPyMultiModelBenchmark.addModel`.
 */
interface ModelConfig {
  /** Name of the model (presumably the label used in results; confirm). */
  name: string;
  /** Provider the model is served by; restricted to these three literals. */
  provider: 'openai' | 'anthropic' | 'openrouter';
  /** Model identifier. NOTE(review): presumably the provider-side model ID; confirm. */
  modelId: string;
  /** API key string for the provider. */
  apiKey: string;
  /** Cost per 1k tokens, split by direction. The currency is not stated in this declaration. */
  costPer1kTokens: {
    input: number;
    output: number;
  };
  /**
   * Token limit for the model.
   * NOTE(review): whether this caps output or total context is not visible here.
   */
  maxTokens: number;
}
/**
 * Metrics recorded for one benchmarked model, grouped into quality,
 * performance, cost and optimization sections. Every field is a plain number;
 * units and value ranges are not stated in this declaration.
 */
interface BenchmarkMetrics {
  /** Quality scores, plus an `overall` value whose derivation is not visible here. */
  quality: {
    f1: number;
    exactMatch: number;
    bleu: number;
    rouge: number;
    overall: number;
  };
  /**
   * Performance figures.
   * NOTE(review): `p50`/`p95`/`p99` are presumably latency percentiles; confirm
   * the units against the implementation.
   */
  performance: {
    avgLatency: number;
    p50: number;
    p95: number;
    p99: number;
    throughput: number;
    successRate: number;
  };
  /** Cost figures and token counts. */
  cost: {
    totalCost: number;
    costPerSample: number;
    costPerQualityPoint: number;
    inputTokens: number;
    outputTokens: number;
  };
  /**
   * Quality before and after each optimizer, with the corresponding improvements.
   * NOTE(review): whether improvements are absolute or relative is not visible here.
   */
  optimization: {
    baselineQuality: number;
    bootstrapQuality: number;
    miproQuality: number;
    bootstrapImprovement: number;
    miproImprovement: number;
  };
}
/**
 * Result of benchmarking one model: its metrics plus a per-round history of
 * the optimization methods that were run.
 */
interface BenchmarkResult {
  /** Name of the benchmarked model. */
  modelName: string;
  /** Timestamp as a string. The format is not stated in this declaration. */
  timestamp: string;
  /** Metrics collected for this model. */
  metrics: BenchmarkMetrics;
  /** One entry per recorded optimization round. */
  optimizationHistory: {
    /** Which method produced this entry. */
    method: 'baseline' | 'bootstrap' | 'mipro';
    round: number;
    quality: number;
    /** Duration of the round. Units are not stated here. */
    duration: number;
  }[];
  /** Number of samples used. */
  sampleSize: number;
  /** Duration of the whole benchmark. Units are not stated here. */
  duration: number;
}
/**
 * Cross-model comparison; this is the value `runComparison` resolves with and
 * the argument `generateReport` takes.
 */
interface ComparisonReport {
  /** Headline figures for the whole comparison. */
  summary: {
    /**
     * Winner per category, stored as strings.
     * NOTE(review): presumably model names; confirm against the implementation.
     */
    winner: {
      quality: string;
      performance: string;
      cost: string;
      optimization: string;
      overall: string;
    };
    modelsCompared: number;
    totalSamples: number;
    /** Total duration. Units are not stated here. */
    totalDuration: number;
  };
  /** Individual results for the compared models. */
  results: BenchmarkResult[];
  /**
   * Per-category lists of model/score pairs.
   * NOTE(review): the sort order is not visible in this declaration.
   */
  rankings: {
    quality: {
      model: string;
      score: number;
    }[];
    performance: {
      model: string;
      score: number;
    }[];
    cost: {
      model: string;
      score: number;
    }[];
    optimization: {
      model: string;
      score: number;
    }[];
  };
  /** Recommendation strings, one per usage scenario. */
  recommendations: {
    production: string;
    research: string;
    costOptimized: string;
    balanced: string;
  };
}
/**
 * Synthetic Data Generator using Chain of Thought.
 *
 * Extends `ChainOfThought` and declares a zero-argument constructor; no other
 * members are declared in this file.
 */
declare class SyntheticDataModule extends ChainOfThought {
  constructor();
}
/**
 * Benchmark driver: models are registered with `addModel`, compared with
 * `runComparison`, and the comparison is rendered with `generateReport`.
 *
 * Private members are listed without types, so only the public signatures
 * below are part of the typed contract.
 */
export declare class DSPyMultiModelBenchmark {
  private models;
  private results;
  private outputDir;
  /**
   * @param outputDir Optional output directory. The default used when it is
   *   omitted is not visible in this declaration.
   */
  constructor(outputDir?: string);
  /**
   * Register a model for benchmarking.
   *
   * @param config Configuration of the model to register.
   */
  addModel(config: ModelConfig): void;
  /**
   * Run comprehensive comparison across all models.
   *
   * @param sampleSize Optional sample size. The default is not visible in this
   *   declaration.
   * @returns A promise resolving with the comparison report.
   */
  runComparison(sampleSize?: number): Promise<ComparisonReport>;
  /**
   * Benchmark a single model.
   */
  private benchmarkModel;
  /**
   * Optimize with BootstrapFewShot.
   *
   * @param module Module to optimize.
   * @param schema Untyped (`any`) schema value; its shape is not visible here.
   * @param sampleSize Number of samples to use.
   * @returns A promise resolving with a `SyntheticDataModule`.
   *   NOTE(review): presumably the optimized module; whether it is a new
   *   instance or the input is not visible here.
   */
  optimizeWithBootstrap(module: SyntheticDataModule, schema: any, sampleSize: number): Promise<SyntheticDataModule>;
  /**
   * Optimize with MIPROv2.
   *
   * @param module Module to optimize.
   * @param schema Untyped (`any`) schema value; its shape is not visible here.
   * @param sampleSize Number of samples to use.
   * @returns A promise resolving with a `SyntheticDataModule`.
   *   NOTE(review): presumably the optimized module; whether it is a new
   *   instance or the input is not visible here.
   */
  optimizeWithMIPRO(module: SyntheticDataModule, schema: any, sampleSize: number): Promise<SyntheticDataModule>;
  /**
   * Evaluate module quality.
   */
  private evaluateModule;
  /**
   * Measure performance metrics.
   */
  private measurePerformance;
  /**
   * Generate training dataset.
   */
  private generateTrainingSet;
  /**
   * Generate sample synthetic data.
   */
  private generateSampleData;
  /**
   * Calculate quality score for synthetic data.
   */
  private calculateQualityScore;
  /**
   * Calculate percentile.
   */
  private percentile;
  /**
   * Generate comparison report.
   */
  private generateComparisonReport;
  /**
   * Generate and save markdown report.
   *
   * @param comparison Comparison to render.
   * @returns A promise resolving with a string.
   *   NOTE(review): whether this is the report text or the saved file path is
   *   not visible here; confirm.
   */
  generateReport(comparison: ComparisonReport): Promise<string>;
}
// Re-export the report and configuration interfaces declared in this file.
export { ModelConfig, BenchmarkResult, ComparisonReport, BenchmarkMetrics };
//# sourceMappingURL=dspy-multi-model-benchmark.d.ts.map