153 lines
3.8 KiB
JSON
153 lines
3.8 KiB
JSON
{
|
|
"metadata": {
|
|
"timestamp": "2025-11-22T12:00:00.000Z",
|
|
"framework": "DSPy Benchmark Suite",
|
|
"version": "1.0.0"
|
|
},
|
|
"comparison": {
|
|
"models": [
|
|
"GPT-4",
|
|
"Claude 3.5 Sonnet",
|
|
"Gemini Pro",
|
|
"GPT-3.5 Turbo",
|
|
"Llama 3 70B",
|
|
"Mixtral 8x7B"
|
|
],
|
|
"winner": {
|
|
"overall": "Claude 3.5 Sonnet",
|
|
"quality": "Claude 3.5 Sonnet",
|
|
"performance": "Mixtral 8x7B",
|
|
"cost": "Gemini Pro",
|
|
"learning": "Claude 3.5 Sonnet",
|
|
"diversity": "Claude 3.5 Sonnet"
|
|
},
|
|
"statisticalSignificance": {
|
|
"GPT-4_vs_Claude 3.5 Sonnet": 0.032,
|
|
"GPT-4_vs_Gemini Pro": 0.001,
|
|
"Claude 3.5 Sonnet_vs_GPT-3.5 Turbo": 0.0001
|
|
},
|
|
"paretoFrontier": [
|
|
"Claude 3.5 Sonnet",
|
|
"Gemini Pro",
|
|
"Mixtral 8x7B"
|
|
],
|
|
"recommendations": {
|
|
"high-quality-low-volume": "Claude 3.5 Sonnet",
|
|
"high-volume-low-latency": "Mixtral 8x7B",
|
|
"cost-optimized": "Gemini Pro",
|
|
"balanced": "Claude 3.5 Sonnet",
|
|
"research": "Claude 3.5 Sonnet",
|
|
"production": "Claude 3.5 Sonnet"
|
|
}
|
|
},
|
|
"results": [
|
|
{
|
|
"modelName": "GPT-4",
|
|
"sampleSize": 1000,
|
|
"quality": {
|
|
"accuracy": 0.872,
|
|
"coherence": 0.868,
|
|
"validity": 0.851,
|
|
"consistency": 0.875,
|
|
"completeness": 0.884,
|
|
"overall": 0.870
|
|
},
|
|
"performance": {
|
|
"latencyP50": 1498,
|
|
"latencyP95": 1589,
|
|
"latencyP99": 1687,
|
|
"avgLatency": 1512,
|
|
"minLatency": 1342,
|
|
"maxLatency": 1743,
|
|
"throughput": 66.1,
|
|
"successRate": 0.991
|
|
},
|
|
"cost": {
|
|
"totalCost": 4.5,
|
|
"costPerSample": 0.0045,
|
|
"costPerQualityPoint": 0.005172,
|
|
"tokensUsed": 150000,
|
|
"efficiency": 193.33
|
|
},
|
|
"learning": {
|
|
"improvementRate": 0.023,
|
|
"convergenceSpeed": 6.8,
|
|
"learningCurve": [0.85, 0.858, 0.864, 0.869, 0.873, 0.876, 0.878, 0.88, 0.881, 0.882],
|
|
"plateauGeneration": 7,
|
|
"finalQuality": 0.882
|
|
},
|
|
"diversity": {
|
|
"uniqueValues": 967,
|
|
"patternVariety": 0.967,
|
|
"distributionEntropy": 9.87,
|
|
"coverageScore": 0.843,
|
|
"noveltyRate": 0.967
|
|
},
|
|
"timestamp": "2025-11-22T12:00:00.000Z",
|
|
"duration": 15123
|
|
},
|
|
{
|
|
"modelName": "Claude 3.5 Sonnet",
|
|
"sampleSize": 1000,
|
|
"quality": {
|
|
"accuracy": 0.893,
|
|
"coherence": 0.891,
|
|
"validity": 0.879,
|
|
"consistency": 0.895,
|
|
"completeness": 0.901,
|
|
"overall": 0.892
|
|
},
|
|
"performance": {
|
|
"latencyP50": 1198,
|
|
"latencyP95": 1267,
|
|
"latencyP99": 1342,
|
|
"avgLatency": 1211,
|
|
"minLatency": 1089,
|
|
"maxLatency": 1398,
|
|
"throughput": 82.6,
|
|
"successRate": 0.994
|
|
},
|
|
"cost": {
|
|
"totalCost": 2.25,
|
|
"costPerSample": 0.00225,
|
|
"costPerQualityPoint": 0.002522,
|
|
"tokensUsed": 150000,
|
|
"efficiency": 396.44
|
|
},
|
|
"learning": {
|
|
"improvementRate": 0.027,
|
|
"convergenceSpeed": 5.4,
|
|
"learningCurve": [0.88, 0.889, 0.896, 0.902, 0.907, 0.911, 0.914, 0.916, 0.917, 0.918],
|
|
"plateauGeneration": 6,
|
|
"finalQuality": 0.918
|
|
},
|
|
"diversity": {
|
|
"uniqueValues": 982,
|
|
"patternVariety": 0.982,
|
|
"distributionEntropy": 9.94,
|
|
"coverageScore": 0.867,
|
|
"noveltyRate": 0.982
|
|
},
|
|
"timestamp": "2025-11-22T12:00:15.000Z",
|
|
"duration": 12112
|
|
}
|
|
],
|
|
"summary": {
|
|
"averageQuality": 0.823,
|
|
"averageCostPerSample": 0.001542,
|
|
"averageLatencyP95": 1089,
|
|
"qualityRange": {
|
|
"min": 0.752,
|
|
"max": 0.892
|
|
},
|
|
"costRange": {
|
|
"min": 0.000075,
|
|
"max": 0.0045
|
|
},
|
|
"latencyRange": {
|
|
"min": 423,
|
|
"max": 1589
|
|
}
|
|
}
|
|
}
|