A/B 测试
Prompt 变体测试、模型对比
5.1 Prompt 变体测试
javascript
/**
 * Collects per-variant scores for competing system prompts and summarises
 * them (descriptive statistics plus a simple two-sample significance check).
 *
 * Relies on two names that are not defined in this class and must be in
 * scope at call time: `llm.chat(...)` and `evaluateOutput(testCase, text)`.
 */
class PromptABTest {
  constructor() {
    // variant name -> system prompt text
    this.variants = {};
    // variant name -> { trials: [{ testCaseId, score }], scores: [score] }
    this.results = {};
  }

  /**
   * Registers (or re-registers) a prompt variant. Re-using a name replaces
   * the prompt and resets that variant's recorded results.
   *
   * @param {string} name - Key used to identify the variant.
   * @param {string} prompt - Text sent as the `system` message.
   */
  addVariant(name, prompt) {
    this.variants[name] = prompt;
    this.results[name] = { trials: [], scores: [] };
  }

  /**
   * Runs one test case against every registered variant, one after another,
   * and records each score.
   *
   * @param {{id: *, input: string}} testCase - `input` is sent as the user
   *   message; `id` is stored alongside the score.
   * @returns {Promise<Object>} Map of variant name -> score for this trial.
   */
  async runTrial(testCase) {
    const variantResults = {};
    for (const [name, prompt] of Object.entries(this.variants)) {
      const output = await llm.chat({
        messages: [
          { role: 'system', content: prompt },
          { role: 'user', content: testCase.input },
        ],
      });
      const score = await evaluateOutput(testCase, output.content);
      this.results[name].trials.push({ testCaseId: testCase.id, score });
      this.results[name].scores.push(score);
      variantResults[name] = score;
    }
    return variantResults;
  }

  /**
   * Arithmetic mean. Callers guard against empty input (0 / 0 is NaN).
   *
   * @param {number[]} values
   * @returns {number}
   */
  mean(values) {
    return values.reduce((acc, v) => acc + v, 0) / values.length;
  }

  /**
   * Sample variance (n - 1 denominator), the same estimator tTest uses.
   *
   * @param {number[]} values
   * @returns {number} 0 when fewer than two values are given.
   */
  sampleVariance(values) {
    if (values.length < 2) {
      return 0;
    }
    const centre = this.mean(values);
    const squaredDeviations = values.reduce((acc, v) => acc + (v - centre) ** 2, 0);
    return squaredDeviations / (values.length - 1);
  }

  /**
   * Sample standard deviation (square root of sampleVariance).
   *
   * @param {number[]} values
   * @returns {number} 0 when fewer than two values are given.
   */
  standardDeviation(values) {
    return Math.sqrt(this.sampleVariance(values));
  }

  /**
   * Returns the recorded scores of a variant, failing loudly on a typo
   * instead of with "cannot read properties of undefined".
   *
   * @param {string} name
   * @returns {number[]}
   * @throws {Error} If no variant with that name was added.
   */
  scoresFor(name) {
    if (!Object.prototype.hasOwnProperty.call(this.results, name)) {
      throw new Error(`Unknown variant: ${name}`);
    }
    return this.results[name].scores;
  }

  /**
   * Descriptive statistics for every variant.
   *
   * @returns {Object} Map of variant name -> { mean, std, min, max, n }.
   *   For a variant with no scores, mean/std/min/max are `null` and n is 0.
   */
  getStatistics() {
    const stats = {};
    for (const [name, { scores }] of Object.entries(this.results)) {
      const n = scores.length;
      if (n === 0) {
        stats[name] = { mean: null, std: null, min: null, max: null, n: 0 };
        continue;
      }
      stats[name] = {
        mean: this.mean(scores),
        std: this.standardDeviation(scores),
        // Folded rather than spread so very large samples cannot exceed the
        // engine's argument-count limit.
        min: scores.reduce((lo, v) => Math.min(lo, v), Infinity),
        max: scores.reduce((hi, v) => Math.max(hi, v), -Infinity),
        n,
      };
    }
    return stats;
  }

  /**
   * Statistical significance check between two variants using the
   * unequal-variance (Welch) t statistic.
   *
   * @param {string} variantA
   * @param {string} variantB
   * @param {number} [criticalValue=2] - |t| must exceed this to count as
   *   significant. The default is a simplified cut-off, roughly 95%
   *   confidence for reasonably large samples.
   * @returns {{tStatistic: number, significant: boolean, winner: string}}
   *   `winner` is the variant with the higher mean, or 'no_difference'.
   *   With fewer than two scores on either side, tStatistic is NaN and the
   *   result is 'no_difference'.
   * @throws {Error} If either variant name is unknown.
   */
  tTest(variantA, variantB, criticalValue = 2) {
    const a = this.scoresFor(variantA);
    const b = this.scoresFor(variantB);

    // A sample variance needs at least two observations per group.
    if (a.length < 2 || b.length < 2) {
      return { tStatistic: NaN, significant: false, winner: 'no_difference' };
    }

    const meanA = this.mean(a);
    const meanB = this.mean(b);
    const varA = this.sampleVariance(a);
    const varB = this.sampleVariance(b);
    const standardError = Math.sqrt(varA / a.length + varB / b.length);

    // Identical means give t = 0 even when both groups are constant
    // (standard error 0), avoiding a 0 / 0 NaN.
    const difference = meanA - meanB;
    const t = difference === 0 ? 0 : difference / standardError;

    const significant = Math.abs(t) > criticalValue;
    let winner = 'no_difference';
    if (significant) {
      winner = meanA > meanB ? variantA : variantB;
    }
    return { tStatistic: t, significant, winner };
  }
}

// 5.2 模型对比 (Model comparison)
javascript
/**
 * Runs every test case against every model, sequentially, and summarises
 * quality, latency and cost per model.
 *
 * Relies on names that are not defined in this function and must be in
 * scope at call time: `llm.chat`, `evaluateOutput`, `calculateCost`,
 * `average` and `sum`.
 *
 * @param {Array<{input: string}>} testCases - Each `input` is sent as the
 *   sole user message.
 * @param {string[]} models - Model identifiers passed through to `llm.chat`
 *   and used as result keys.
 * @returns {Promise<Array<{model: string, avgScore: number,
 *   avgLatency: number, totalCost: number, scorePerDollar: (number|null)}>>}
 *   One entry per model. `avgLatency` is in milliseconds and covers only the
 *   `llm.chat` call. `scorePerDollar` is `null` when the total cost is not
 *   positive, since the ratio is undefined there.
 */
async function compareModels(testCases, models) {
  const results = {};

  for (const model of models) {
    results[model] = {
      scores: [],
      latencies: [],
      costs: [],
    };

    for (const testCase of testCases) {
      const start = Date.now();
      const response = await llm.chat({
        model,
        messages: [{ role: 'user', content: testCase.input }],
      });
      // Stop the clock before scoring so evaluation time is not counted.
      const latency = Date.now() - start;

      const score = await evaluateOutput(testCase, response.content);
      const cost = calculateCost(model, response.usage);

      results[model].scores.push(score);
      results[model].latencies.push(latency);
      results[model].costs.push(cost);
    }
  }

  // Build the comparison report.
  return Object.entries(results).map(([model, data]) => {
    const avgScore = average(data.scores);
    const totalCost = sum(data.costs);
    return {
      model,
      avgScore,
      avgLatency: average(data.latencies),
      totalCost,
      // Guard the division: a free model or an empty test set has zero cost.
      scorePerDollar: totalCost > 0 ? avgScore / totalCost : null,
    };
  });
}