Files
wifi-densepose/npm/packages/agentic-synth-examples/tests/dspy/benchmark.test.ts
ruv d803bfe2b1 Squashed 'vendor/ruvector/' content from commit b64c2172
git-subtree-dir: vendor/ruvector
git-subtree-split: b64c21726f2bb37286d9ee36a7869fef60cc6900
2026-02-28 14:39:40 -05:00

377 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Tests for Multi-Model Benchmarking
*/
import { describe, it, expect, beforeEach } from 'vitest';
import { MultiModelBenchmark } from '../../src/dspy/benchmark.js';
import { ModelProvider } from '../../src/types/index.js';
import type { BenchmarkConfig } from '../../src/dspy/benchmark.js';
describe('MultiModelBenchmark', () => {
let config: BenchmarkConfig;
// Rebuilds the shared `config` fixture before every test, so tests that spread
// it into a modified copy (e.g. `{ ...config, iterations: 5 }`) always start
// from the same baseline.
beforeEach(() => {
config = {
// Two models, one per provider; the `test-key-*` API keys are dummy strings.
models: [
{
provider: ModelProvider.GEMINI,
model: 'gemini-2.0-flash-exp',
apiKey: 'test-key-1'
},
{
provider: ModelProvider.CLAUDE,
model: 'claude-sonnet-4',
apiKey: 'test-key-2'
}
],
// Baseline workload: 2 models × 2 tasks × 3 iterations = 12 expected results,
// which is the count the tests in this file assert against.
tasks: ['code-generation', 'text-summarization'],
iterations: 3
};
});
describe('Initialization', () => {
  // Constructing from the shared fixture must yield a defined instance.
  it('should create benchmark with valid config', () => {
    expect(new MultiModelBenchmark(config)).toBeDefined();
  });

  // A `timeout` value layered on top of the shared fixture is accepted as well.
  it('should accept timeout option', () => {
    const timedConfig = { ...config, timeout: 5000 };
    expect(new MultiModelBenchmark(timedConfig)).toBeDefined();
  });
});
describe('Benchmark Execution', () => {
  // Smoke test: a full run exposes every top-level field of the result object.
  it('should run complete benchmark and return results', async () => {
    const outcome = await new MultiModelBenchmark(config).run();

    expect(outcome.results).toBeDefined();
    expect(outcome.results.length).toBeGreaterThan(0);
    expect(outcome.bestModel).toBeDefined();
    expect(outcome.bestProvider).toBeDefined();
    expect(outcome.summary).toBeDefined();
  });

  // Every (model, task, iteration) combination must contribute one result.
  it('should test all model and task combinations', async () => {
    const { results } = await new MultiModelBenchmark(config).run();

    // 2 models × 2 tasks × 3 iterations = 12 results
    expect(results.length).toBe(12);

    // Verify all tasks are covered
    const distinctTasks = new Set();
    for (const entry of results) {
      distinctTasks.add(entry.task);
    }
    expect(distinctTasks.size).toBe(2);
    expect(distinctTasks.has('code-generation')).toBe(true);
    expect(distinctTasks.has('text-summarization')).toBe(true);

    // Verify all models are covered
    const distinctProviders = new Set();
    for (const entry of results) {
      distinctProviders.add(entry.provider);
    }
    expect(distinctProviders.size).toBe(2);
  });

  // Raising `iterations` scales the result count linearly.
  it('should run multiple iterations per task', async () => {
    const fiveIterationConfig = { ...config, iterations: 5 };
    const { results } = await new MultiModelBenchmark(fiveIterationConfig).run();

    // 2 models × 2 tasks × 5 iterations = 20 results
    expect(results.length).toBe(20);
  });
});
describe('Performance Metrics', () => {
  // Each result carries a positive latency below the 2000 upper bound.
  it('should track latency for each test', async () => {
    const { results } = await new MultiModelBenchmark(config).run();

    for (const { latency } of results) {
      expect(latency).toBeGreaterThan(0);
      expect(latency).toBeLessThan(2000); // Reasonable latency limit
    }
  });

  // Per-result cost is never negative and the aggregated total is positive.
  it('should track cost for each test', async () => {
    const { results, summary } = await new MultiModelBenchmark(config).run();

    for (const { cost } of results) {
      expect(cost).toBeGreaterThanOrEqual(0);
    }
    expect(summary.totalCost).toBeGreaterThan(0);
  });

  // Token usage is reported as a non-negative number on every result.
  it('should track tokens used', async () => {
    const { results } = await new MultiModelBenchmark(config).run();

    for (const { tokensUsed } of results) {
      expect(tokensUsed).toBeGreaterThanOrEqual(0);
    }
  });

  // Quality scores are normalised to the closed interval [0, 1].
  it('should calculate quality scores', async () => {
    const { results } = await new MultiModelBenchmark(config).run();

    for (const { score } of results) {
      expect(score).toBeGreaterThanOrEqual(0);
      expect(score).toBeLessThanOrEqual(1);
    }
  });
});
describe('Result Aggregation', () => {
  // The summary must report the exact test count plus positive aggregate
  // metrics, with the success rate expressed as a fraction in (0, 1].
  it('should generate summary statistics', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const result = await benchmark.run();
    // 2 models × 2 tasks × 3 iterations = 12 tests for the shared fixture.
    expect(result.summary.totalTests).toBe(12);
    expect(result.summary.avgScore).toBeGreaterThan(0);
    expect(result.summary.avgLatency).toBeGreaterThan(0);
    expect(result.summary.totalCost).toBeGreaterThan(0);
    expect(result.summary.successRate).toBeGreaterThan(0);
    expect(result.summary.successRate).toBeLessThanOrEqual(1);
  });

  // The summary exposes one comparison entry per configured model, each with
  // its model name and average/min/max score fields present.
  it('should include model comparison in summary', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const result = await benchmark.run();
    expect(result.summary.modelComparison).toBeDefined();
    expect(Array.isArray(result.summary.modelComparison)).toBe(true);
    expect(result.summary.modelComparison.length).toBe(2); // 2 models
    // NOTE(review): `any` is kept because the declared element type of
    // `modelComparison` is not visible from this file — replace it with the
    // exported type once confirmed.
    result.summary.modelComparison.forEach((comparison: any) => {
      expect(comparison.model).toBeDefined();
      expect(comparison.avgScore).toBeDefined();
      expect(comparison.minScore).toBeDefined();
      expect(comparison.maxScore).toBeDefined();
    });
  });

  // The reported winner must be one of the configured providers and must
  // actually appear in the results with a competitive average score.
  it('should identify best performing model', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const result = await benchmark.run();
    expect(result.bestModel).toBeDefined();
    expect(result.bestProvider).toBeDefined();
    expect([ModelProvider.GEMINI, ModelProvider.CLAUDE]).toContain(result.bestProvider);
    // Verify the best model actually performed best
    const bestModelResults = result.results.filter(
      r => r.model === result.bestModel && r.provider === result.bestProvider
    );
    // Guard the division below: with no matching results the average would be
    // NaN and the final assertion would fail without pointing at the real
    // problem (bestModel/bestProvider not present in the results).
    expect(bestModelResults.length).toBeGreaterThan(0);
    const avgBestScore = bestModelResults.reduce((sum, r) => sum + r.score, 0) / bestModelResults.length;
    // The winner's average must reach at least 90% of the overall average
    // score (i.e. a 10% tolerance below the mean is accepted).
    expect(avgBestScore).toBeGreaterThanOrEqual(result.summary.avgScore * 0.9);
  });
});
describe('Model Comparison', () => {
  // Head-to-head comparison of the two fixture models on a single task.
  it('should directly compare two models', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const [firstModel, secondModel] = config.models;
    const outcome = await benchmark.compare(firstModel, secondModel, 'code-generation');

    const candidateProviders = [ModelProvider.GEMINI, ModelProvider.CLAUDE];
    expect(outcome.winner).toBeDefined();
    expect(candidateProviders).toContain(outcome.winner);

    // 3 iterations per model
    expect(outcome.model1Results.length).toBe(3);
    expect(outcome.model2Results.length).toBe(3);

    const { comparison } = outcome;
    expect(comparison).toBeDefined();
    expect(comparison.scoreImprovement).toBeGreaterThanOrEqual(0);
  });

  // The comparison object reports both averages and a numeric improvement.
  it('should calculate score improvement in comparison', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const [firstModel, secondModel] = config.models;
    const { comparison } = await benchmark.compare(firstModel, secondModel, 'text-summarization');

    expect(comparison.model1Avg).toBeGreaterThan(0);
    expect(comparison.model2Avg).toBeGreaterThan(0);
    expect(typeof comparison.scoreImprovement).toBe('number');
  });
});
describe('Error Handling', () => {
  // Individual failures must not pull the aggregate success rate to 80% or below.
  it('should handle API failures gracefully', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const result = await benchmark.run();
    // Some tests might fail (the original note cites a simulated 5% failure
    // rate; that rate is not verifiable from this file).
    const successRate = result.summary.successRate;
    expect(successRate).toBeGreaterThan(0.8); // At least 80% success
    expect(successRate).toBeLessThanOrEqual(1.0);
  });

  // A failing test case must not abort the remaining combinations.
  it('should continue after individual test failures', async () => {
    const benchmark = new MultiModelBenchmark(config);
    const result = await benchmark.run();
    // Should complete all tests even if some fail
    // (2 models × 2 tasks × 3 iterations = 12 results).
    expect(result.results.length).toBe(12);
  });

  // With a very short timeout the run must still resolve instead of hanging.
  it('should handle timeout scenarios', async () => {
    const benchmark = new MultiModelBenchmark({
      ...config,
      timeout: 100 // Very short timeout
    });
    const result = await benchmark.run();
    expect(result.results).toBeDefined();
    // Tests should complete or fail, but not hang
  });
});
describe('Task Variations', () => {
  // Restricting the workload to one task halves the baseline result count.
  it('should handle single task benchmark', async () => {
    const benchmark = new MultiModelBenchmark({ ...config, tasks: ['code-generation'] });
    const { results } = await benchmark.run();

    // 2 models × 1 task × 3 iterations
    expect(results.length).toBe(6);

    const onlyCodeGeneration = !results.some(entry => entry.task !== 'code-generation');
    expect(onlyCodeGeneration).toBe(true);
  });

  // Four tasks double the baseline workload and must all appear in the output.
  it('should handle multiple task types', async () => {
    const benchmark = new MultiModelBenchmark({
      ...config,
      tasks: ['code-generation', 'text-summarization', 'data-analysis', 'creative-writing']
    });
    const { results } = await benchmark.run();

    // 2 models × 4 tasks × 3 iterations = 24 results
    expect(results.length).toBe(24);

    const distinctTasks = new Set();
    for (const entry of results) {
      distinctTasks.add(entry.task);
    }
    expect(distinctTasks.size).toBe(4);
  });
});
describe('Model Variations', () => {
  // Only the first fixture model (Gemini) is benchmarked.
  it('should handle single model benchmark', async () => {
    const benchmark = new MultiModelBenchmark({ ...config, models: [config.models[0]] });
    const { results } = await benchmark.run();

    // 1 model × 2 tasks × 3 iterations
    expect(results.length).toBe(6);

    const onlyGemini = !results.some(entry => entry.provider !== ModelProvider.GEMINI);
    expect(onlyGemini).toBe(true);
  });

  // A third provider is appended to the two fixture models.
  it('should handle three or more models', async () => {
    const benchmark = new MultiModelBenchmark({
      ...config,
      models: [
        ...config.models,
        { provider: ModelProvider.GPT4, model: 'gpt-4-turbo', apiKey: 'test-key-3' }
      ]
    });
    const { results } = await benchmark.run();

    // 3 models × 2 tasks × 3 iterations = 18 results
    expect(results.length).toBe(18);

    const distinctProviders = new Set();
    for (const entry of results) {
      distinctProviders.add(entry.provider);
    }
    expect(distinctProviders.size).toBe(3);
  });
});
describe('Performance Analysis', () => {
  // Scores within each provider/task group must not scatter too widely.
  it('should track consistency across iterations', async () => {
    // More iterations for consistency check
    const benchmark = new MultiModelBenchmark({ ...config, iterations: 10 });
    const { results } = await benchmark.run();

    // Group results by model and task
    const scoresByGroup: Record<string, number[]> = {};
    for (const { provider, task, score } of results) {
      const groupKey = `${provider}:${task}`;
      if (!scoresByGroup[groupKey]) {
        scoresByGroup[groupKey] = [];
      }
      scoresByGroup[groupKey].push(score);
    }

    // Check variance isn't too high (scores should be relatively consistent)
    for (const scores of Object.values(scoresByGroup)) {
      let total = 0;
      for (const value of scores) {
        total += value;
      }
      const mean = total / scores.length;

      let squaredDeviation = 0;
      for (const value of scores) {
        squaredDeviation += (value - mean) ** 2;
      }
      const stdDev = Math.sqrt(squaredDeviation / scores.length);

      // Standard deviation should be reasonable (not random)
      expect(stdDev).toBeLessThan(0.3);
    }
  });

  // Results can be broken down per task and per provider.
  it('should identify performance patterns', async () => {
    const { results } = await new MultiModelBenchmark(config).run();

    // Verify we can identify which model is better for which task
    const scoresByTask: Record<string, Record<string, number[]>> = {};
    for (const { task, provider, score } of results) {
      if (!scoresByTask[task]) {
        scoresByTask[task] = {};
      }
      if (!scoresByTask[task][provider]) {
        scoresByTask[task][provider] = [];
      }
      scoresByTask[task][provider].push(score);
    }

    // Each task should have results from both models
    for (const perProvider of Object.values(scoresByTask)) {
      expect(Object.keys(perProvider).length).toBe(2);
    }
  });
});
describe('Cost Analysis', () => {
  // The summary total must match a manual sum of per-result costs (2 decimals).
  it('should calculate total cost accurately', async () => {
    const { results, summary } = await new MultiModelBenchmark(config).run();

    let manualTotal = 0;
    for (const { cost } of results) {
      manualTotal += cost;
    }
    expect(summary.totalCost).toBeCloseTo(manualTotal, 2);
  });

  // Costs are attributable to each provider/model pair individually.
  it('should track cost per model', async () => {
    const { results } = await new MultiModelBenchmark(config).run();

    const costByModel: Record<string, number> = {};
    for (const { provider, model, cost } of results) {
      const modelKey = `${provider}:${model}`;
      costByModel[modelKey] = (costByModel[modelKey] || 0) + cost;
    }

    // Both models should have incurred costs
    expect(Object.keys(costByModel).length).toBe(2);
    for (const modelCost of Object.values(costByModel)) {
      expect(modelCost).toBeGreaterThan(0);
    }
  });
});
});