评估与优化:像工程师一样测试 AI 系统 📊
"如果你不能测量它,你就不能改进它。"
1. 评估体系概述
1.1 为什么需要评估
AI 系统的输出是概率性的,需要系统化评估:
- 质量保证: 确保输出符合预期
- 回归检测: 发现 Prompt 修改导致的问题
- 模型对比: 选择最适合的模型
- 成本优化: 在质量和成本间取得平衡
1.2 评估金字塔
┌─────────────────┐
│ 人工评审 │ ← 最准确,成本高
│ (Human Eval) │
└────────┬────────┘
│
┌────────┴────────┐
│ LLM 评审 │ ← 自动化,中等成本
│ (LLM-as-Judge) │
└────────┬────────┘
│
┌──────────────┴──────────────┐
│ 规则检查 │ ← 自动化,低成本
│ (格式、长度、关键词等) │
└─────────────────────────────┘
2. 规则检查 (Rule-Based Evaluation)
2.1 格式验证
javascript
/**
 * Rule-based format validators for AI output: JSON, TypeScript, Markdown.
 */
class FormatValidator {
  /**
   * Check that `output` is syntactically valid JSON.
   * @param {string} output - raw model output
   * @returns {{valid: boolean, error?: string}}
   */
  validateJSON(output) {
    try {
      JSON.parse(output);
      return { valid: true };
    } catch (e) {
      return { valid: false, error: e.message };
    }
  }

  /**
   * Transpile TypeScript with diagnostics enabled and report any errors.
   * NOTE: requires the `typescript` package to be installed.
   * @param {string} code
   * @returns {Promise<{valid: boolean, errors?: string[]}>}
   */
  async validateTypeScript(code) {
    const ts = require('typescript');
    const result = ts.transpileModule(code, {
      compilerOptions: {
        module: ts.ModuleKind.ESNext,
        target: ts.ScriptTarget.ESNext,
        strict: true
      },
      reportDiagnostics: true
    });
    if (result.diagnostics.length > 0) {
      return {
        valid: false,
        // FIX: messageText can be a DiagnosticMessageChain (an object), not
        // always a string — flatten it so callers always get string errors.
        errors: result.diagnostics.map(d =>
          ts.flattenDiagnosticMessageText(d.messageText, '\n')
        )
      };
    }
    return { valid: true };
  }

  /**
   * Verify a Markdown document contains every required section heading
   * (case-insensitive substring match against `#`-style headings).
   * @param {string} content
   * @param {string[]} requiredSections
   * @returns {{valid: boolean, missing: string[]}}
   */
  validateMarkdown(content, requiredSections) {
    const headings = content.match(/^#+\s+.+$/gm) || [];
    const missing = requiredSections.filter(
      section => !headings.some(h => h.toLowerCase().includes(section.toLowerCase()))
    );
    return {
      valid: missing.length === 0,
      missing
    };
  }
}
// 2.2 内容规则
javascript
/**
 * Declarative content validator: runs a list of simple rules against a
 * text output. Supported rule types: contains / not_contains / matches / length.
 */
class ContentValidator {
  /**
   * @param {Array<Object>} rules - rule descriptors; shape depends on type:
   *   contains / not_contains: { type, name, value }
   *   matches:                 { type, name, pattern }
   *   length:                  { type, name, min, max }
   */
  constructor(rules) {
    this.rules = rules;
  }

  /**
   * Apply every rule to `output`.
   * @param {string} output
   * @returns {{allPassed: boolean, results: Array<{rule: string, passed: boolean, message: string}>}}
   */
  validate(output) {
    const results = [];
    for (const rule of this.rules) {
      switch (rule.type) {
        case 'contains':
          results.push({
            rule: rule.name,
            passed: output.includes(rule.value),
            message: `应包含 "${rule.value}"`
          });
          break;
        case 'not_contains':
          results.push({
            rule: rule.name,
            passed: !output.includes(rule.value),
            message: `不应包含 "${rule.value}"`
          });
          break;
        case 'matches':
          results.push({
            rule: rule.name,
            passed: new RegExp(rule.pattern).test(output),
            message: `应匹配模式 ${rule.pattern}`
          });
          break;
        case 'length': {
          // FIX: braces give this `const` its own block scope — a lexical
          // declaration directly inside a case clause leaks into the whole
          // switch body (ESLint no-case-declarations).
          const len = output.length;
          results.push({
            rule: rule.name,
            passed: len >= rule.min && len <= rule.max,
            message: `长度应在 ${rule.min}-${rule.max} 之间,实际 ${len}`
          });
          break;
        }
      }
    }
    return {
      allPassed: results.every(r => r.passed),
      results
    };
  }
}
// 使用示例
const validator = new ContentValidator([
{ type: 'contains', name: 'has_export', value: 'export' },
{ type: 'not_contains', name: 'no_any', value: ': any' },
{ type: 'not_contains', name: 'no_console', value: 'console.log' },
{ type: 'matches', name: 'has_types', pattern: 'interface|type' }
]);2.3 代码质量检查
javascript
// Aggregates three code-quality checks: ESLint, TypeScript compilation,
// and a security-pattern scan. Returns an array of { name, passed, ... }.
// NOTE(review): `ESLint` is referenced as a free name (no import visible in
// this snippet) and `validateTypeScript` is called as a free function even
// though it appears above as a FormatValidator method — confirm both
// bindings in the real module.
async function validateCodeQuality(code) {
const checks = [];
// ESLint check
const eslint = new ESLint();
const lintResults = await eslint.lintText(code, { filePath: 'test.tsx' });
checks.push({
name: 'eslint',
passed: lintResults[0].errorCount === 0,
errors: lintResults[0].messages
});
// TypeScript type check
const tsResult = await validateTypeScript(code);
checks.push({
name: 'typescript',
passed: tsResult.valid,
errors: tsResult.errors
});
// Security pattern scan
const securityIssues = checkSecurityIssues(code);
checks.push({
name: 'security',
passed: securityIssues.length === 0,
issues: securityIssues
});
return checks;
}
/**
 * Scan generated code for obviously unsafe DOM / eval patterns.
 * @param {string} code
 * @returns {string[]} human-readable issue descriptions (empty when clean)
 */
function checkSecurityIssues(code) {
  const issues = [];
  // FIX: dropped the /g flags — a global regex's .test() is stateful via
  // lastIndex and silently skips matches if a pattern object is ever reused.
  const patterns = [
    { pattern: /dangerouslySetInnerHTML/, message: '使用了 dangerouslySetInnerHTML' },
    { pattern: /eval\(/, message: '使用了 eval' },
    { pattern: /innerHTML\s*=/, message: '直接设置 innerHTML' },
    { pattern: /document\.write/, message: '使用了 document.write' }
  ];
  for (const { pattern, message } of patterns) {
    if (pattern.test(code)) {
      issues.push(message);
    }
  }
  return issues;
}
// 3. LLM-as-Judge (LLM 评审)
3.1 基础评审
javascript
/**
 * LLM-as-Judge: score one AI output against a list of criteria.
 * Returns the judge's parsed JSON verdict (per-criterion scores + summary).
 */
async function llmJudge(task, output, criteria) {
  const criteriaList = criteria.map((c, i) => `${i + 1}. ${c}`).join('\n');
  const prompt = `评估以下 AI 输出的质量。
## 任务
${task}
## AI 输出
${output}
## 评估标准
${criteriaList}
## 评分要求
对每个标准打分 1-5 分,并给出简短理由。
输出 JSON:
{
"scores": [
{ "criterion": "...", "score": 1-5, "reason": "..." }
],
"overallScore": 1-5,
"summary": "总体评价"
}`;
  const response = await llm.chat({
    messages: [{ role: 'user', content: prompt }],
    response_format: { type: 'json_object' }
  });
  return JSON.parse(response.content);
}
// 使用示例
const evaluation = await llmJudge(
"生成一个 React 登录表单组件",
generatedCode,
[
"代码是否功能完整(包含表单验证)",
"代码是否类型安全(TypeScript)",
"代码是否遵循 React 最佳实践",
"代码是否有良好的错误处理"
]
);3.2 成对比较
javascript
async function pairwiseComparison(task, outputA, outputB) {
const response = await llm.chat({
messages: [{
role: 'user',
content: `比较两个 AI 输出,选择更好的一个。
## 任务
${task}
## 输出 A
${outputA}
## 输出 B
${outputB}
## 评估维度
1. 正确性
2. 完整性
3. 代码质量
4. 可读性
## 要求
对每个维度,说明哪个更好(A/B/平局),并给出理由。
最后给出总体胜者。
输出 JSON:
{
"dimensions": [
{ "name": "正确性", "winner": "A|B|tie", "reason": "..." }
],
"overallWinner": "A|B|tie",
"explanation": "..."
}`
}],
response_format: { type: 'json_object' }
});
return JSON.parse(response.content);
}3.3 Rubric 评审
javascript
// Scoring rubric injected verbatim into the judge prompt — four dimensions,
// each rated 1-5. NOTE: this is a runtime string; editing the wording
// changes evaluation behavior.
const CODE_REVIEW_RUBRIC = `
## 代码评审评分标准
### 功能正确性 (1-5)
1: 完全无法工作
2: 有严重 bug
3: 基本功能正确,有小问题
4: 功能完整正确
5: 功能完整,有额外的边界处理
### 代码质量 (1-5)
1: 代码混乱,无法维护
2: 代码结构差,命名不清
3: 基本可读,有改进空间
4: 清晰整洁,遵循规范
5: 优秀,可作为示例
### 类型安全 (1-5)
1: 大量 any,无类型
2: 类型不完整
3: 基本类型覆盖
4: 完整类型,少量类型断言
5: 完美类型推断,泛型使用得当
### 性能考量 (1-5)
1: 明显性能问题
2: 有潜在性能问题
3: 基本合理
4: 考虑了常见优化
5: 深度优化,最佳实践
`;
async function rubricEvaluation(code, rubric = CODE_REVIEW_RUBRIC) {
const response = await llm.chat({
messages: [{
role: 'user',
content: `使用以下评分标准评估代码:
${rubric}
## 待评估代码
\`\`\`typescript
${code}
\`\`\`
输出 JSON:
{
"scores": {
"functionality": { "score": 1-5, "details": "..." },
"quality": { "score": 1-5, "details": "..." },
"typeSafety": { "score": 1-5, "details": "..." },
"performance": { "score": 1-5, "details": "..." }
},
"totalScore": 平均分,
"strengths": ["..."],
"improvements": ["..."]
}`
}]
});
return JSON.parse(response.content);
}4. 构建测试套件
4.1 Golden Dataset (黄金数据集)
javascript
// test-cases.json
// Golden dataset: each case pairs a generation prompt with the behaviors the
// output must exhibit, substrings it must not contain, and a minimum
// LLM-judged quality score (1-5) required to pass.
const goldenDataset = [
{
id: "login-form-1",
category: "component",
input: "生成一个 React 登录表单,使用 React Hook Form 和 Zod 验证",
expectedBehaviors: [
"使用 useForm hook",
"使用 zodResolver",
"包含 email 和 password 字段",
"有提交按钮",
"显示验证错误"
],
forbiddenPatterns: [
"any",
"document.getElementById"
],
minQualityScore: 4
},
{
id: "fetch-hook-1",
category: "hook",
input: "创建一个 useFetch hook,支持加载状态和错误处理",
expectedBehaviors: [
"返回 data, loading, error",
"使用 useEffect",
"支持取消请求",
"处理竞态条件"
],
forbiddenPatterns: [],
minQualityScore: 4
}
];
class TestRunner {
async runAll(prompt, model) {
const results = [];
for (const testCase of goldenDataset) {
const result = await this.runTestCase(testCase, prompt, model);
results.push(result);
}
return {
total: results.length,
passed: results.filter(r => r.passed).length,
failed: results.filter(r => !r.passed).length,
passRate: results.filter(r => r.passed).length / results.length,
details: results
};
}
async runTestCase(testCase, systemPrompt, model) {
// 生成输出
const output = await llm.chat({
model,
messages: [
{ role: 'system', content: systemPrompt },
{ role: 'user', content: testCase.input }
]
});
// 检查预期行为
const behaviorChecks = testCase.expectedBehaviors.map(behavior => ({
behavior,
present: output.content.toLowerCase().includes(behavior.toLowerCase()) ||
await this.checkBehaviorWithLLM(output.content, behavior)
}));
// 检查禁止模式
const forbiddenChecks = testCase.forbiddenPatterns.map(pattern => ({
pattern,
found: output.content.includes(pattern)
}));
// 质量评分
const qualityScore = await this.getQualityScore(testCase.input, output.content);
const passed =
behaviorChecks.every(c => c.present) &&
forbiddenChecks.every(c => !c.found) &&
qualityScore >= testCase.minQualityScore;
return {
id: testCase.id,
passed,
behaviorChecks,
forbiddenChecks,
qualityScore,
output: output.content.slice(0, 500) + '...'
};
}
}4.2 回归测试
javascript
/**
 * Compares a new evaluation run against a stored baseline and reports
 * per-test regressions (passed -> failed) and improvements (failed -> passed).
 */
class RegressionTester {
  /**
   * @param {{passRate: number, details: Array}} baselineResults - a previous TestRunner.runAll result
   */
  constructor(baselineResults) {
    this.baseline = baselineResults;
  }

  /**
   * @param {{passRate: number, details: Array}} newResults
   * @returns {Promise<{hasRegressions: boolean, regressions: Array, improvements: Array, summary: Object}>}
   */
  async compare(newResults) {
    const regressions = [];
    const improvements = [];
    // Robustness: tolerate runs that carry no per-test details.
    const newDetails = newResults.details ?? [];
    const baselineDetails = this.baseline.details ?? [];
    for (const newResult of newDetails) {
      const baselineResult = baselineDetails.find(r => r.id === newResult.id);
      if (!baselineResult) continue; // case added since baseline — nothing to compare
      // Regression: previously passing, now failing
      if (baselineResult.passed && !newResult.passed) {
        regressions.push({
          testId: newResult.id,
          type: 'regression',
          oldScore: baselineResult.qualityScore,
          newScore: newResult.qualityScore
        });
      }
      // Improvement: previously failing, now passing
      if (!baselineResult.passed && newResult.passed) {
        improvements.push({
          testId: newResult.id,
          type: 'improvement',
          oldScore: baselineResult.qualityScore,
          newScore: newResult.qualityScore
        });
      }
    }
    return {
      hasRegressions: regressions.length > 0,
      regressions,
      improvements,
      summary: {
        oldPassRate: this.baseline.passRate,
        newPassRate: newResults.passRate,
        delta: newResults.passRate - this.baseline.passRate
      }
    };
  }
}
// 5. A/B 测试
5.1 Prompt 变体测试
javascript
/**
 * A/B test harness for prompt variants: collects per-variant scores and
 * provides summary statistics plus a rough t-test for significance.
 */
class PromptABTest {
  constructor() {
    this.variants = {};
    this.results = {};
  }

  /** Register a prompt variant under a name. */
  addVariant(name, prompt) {
    this.variants[name] = prompt;
    this.results[name] = { trials: [], scores: [] };
  }

  /** Run one test case through every registered variant and record scores. */
  async runTrial(testCase) {
    const variantResults = {};
    for (const [name, prompt] of Object.entries(this.variants)) {
      const output = await llm.chat({
        messages: [
          { role: 'system', content: prompt },
          { role: 'user', content: testCase.input }
        ]
      });
      const score = await evaluateOutput(testCase, output.content);
      this.results[name].trials.push({
        testCaseId: testCase.id,
        score
      });
      this.results[name].scores.push(score);
      variantResults[name] = score;
    }
    return variantResults;
  }

  /** Per-variant mean / std / min / max / sample size. */
  getStatistics() {
    const stats = {};
    for (const [name, data] of Object.entries(this.results)) {
      const scores = data.scores;
      stats[name] = {
        mean: scores.reduce((a, b) => a + b, 0) / scores.length,
        std: this.standardDeviation(scores),
        min: Math.min(...scores),
        max: Math.max(...scores),
        n: scores.length
      };
    }
    return stats;
  }

  /**
   * Sample standard deviation (n - 1 denominator); 0 for fewer than 2 points.
   * FIX: getStatistics called this method but it was never defined, so
   * getStatistics threw a TypeError at runtime.
   */
  standardDeviation(values) {
    if (values.length < 2) return 0;
    const mean = values.reduce((a, b) => a + b, 0) / values.length;
    const variance =
      values.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (values.length - 1);
    return Math.sqrt(variance);
  }

  /** Welch-style t statistic; |t| > 2 used as a rough 95% significance cut. */
  tTest(variantA, variantB) {
    const a = this.results[variantA].scores;
    const b = this.results[variantB].scores;
    // FIX: pass an initial value to reduce — the original threw on empty arrays.
    const meanA = a.reduce((x, y) => x + y, 0) / a.length;
    const meanB = b.reduce((x, y) => x + y, 0) / b.length;
    const varA = a.reduce((sum, x) => sum + (x - meanA) ** 2, 0) / (a.length - 1);
    const varB = b.reduce((sum, x) => sum + (x - meanB) ** 2, 0) / (b.length - 1);
    const pooledSE = Math.sqrt(varA / a.length + varB / b.length);
    const t = (meanA - meanB) / pooledSE;
    // Simplified significance check (~95% confidence)
    const significant = Math.abs(t) > 2;
    return {
      tStatistic: t,
      significant,
      winner: significant ? (meanA > meanB ? variantA : variantB) : 'no_difference'
    };
  }
}
// 5.2 模型对比
javascript
async function compareModels(testCases, models) {
const results = {};
for (const model of models) {
results[model] = {
scores: [],
latencies: [],
costs: []
};
for (const testCase of testCases) {
const start = Date.now();
const response = await llm.chat({
model,
messages: [
{ role: 'user', content: testCase.input }
]
});
const latency = Date.now() - start;
const score = await evaluateOutput(testCase, response.content);
const cost = calculateCost(model, response.usage);
results[model].scores.push(score);
results[model].latencies.push(latency);
results[model].costs.push(cost);
}
}
// 生成比较报告
return Object.entries(results).map(([model, data]) => ({
model,
avgScore: average(data.scores),
avgLatency: average(data.latencies),
totalCost: sum(data.costs),
scorePerDollar: average(data.scores) / sum(data.costs)
}));
}6. 成本优化
6.1 Token 使用分析
javascript
/**
 * Tracks per-request token usage and cost, and produces windowed reports.
 */
class CostAnalyzer {
  constructor() {
    this.records = [];
  }

  /** Record token usage and cost for one request/response pair. */
  record(request, response) {
    this.records.push({
      timestamp: Date.now(),
      model: request.model,
      inputTokens: response.usage.prompt_tokens,
      outputTokens: response.usage.completion_tokens,
      cost: this.calculateCost(request.model, response.usage)
    });
  }

  /** Cost in USD using per-1K-token prices; unknown models cost 0. */
  calculateCost(model, usage) {
    const pricing = {
      'claude-sonnet-4-20250514': { input: 0.003, output: 0.015 },
      'claude-3-5-haiku-20241022': { input: 0.0008, output: 0.004 },
      'gpt-4o': { input: 0.005, output: 0.015 },
      'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
    };
    const price = pricing[model] || { input: 0, output: 0 };
    return (
      (usage.prompt_tokens / 1000) * price.input +
      (usage.completion_tokens / 1000) * price.output
    );
  }

  /**
   * Parse a '30m' / '24h' / '7d' style window into milliseconds.
   * FIX: getReport called this method but it was never defined.
   * Unparsable input falls back to 24h.
   */
  parsePeriod(period) {
    const match = /^(\d+)\s*([smhd])$/i.exec(String(period).trim());
    if (!match) return 24 * 60 * 60 * 1000;
    const unitMs = { s: 1000, m: 60000, h: 3600000, d: 86400000 };
    return Number(match[1]) * unitMs[match[2].toLowerCase()];
  }

  /**
   * Aggregate cost / tokens / request count per model.
   * FIX: getReport called this method but it was never defined.
   */
  groupByModel(records) {
    const byModel = {};
    for (const r of records) {
      const entry = byModel[r.model] ||
        (byModel[r.model] = { cost: 0, inputTokens: 0, outputTokens: 0, requests: 0 });
      entry.cost += r.cost;
      entry.inputTokens += r.inputTokens;
      entry.outputTokens += r.outputTokens;
      entry.requests += 1;
    }
    return byModel;
  }

  /** Summarize spend within the given trailing window (default 24h). */
  getReport(period = '24h') {
    const since = Date.now() - this.parsePeriod(period);
    const filtered = this.records.filter(r => r.timestamp >= since);
    // FIX: the original relied on an undefined global `sum` helper.
    const total = xs => xs.reduce((a, b) => a + b, 0);
    const totalCost = total(filtered.map(r => r.cost));
    return {
      totalCost,
      totalInputTokens: total(filtered.map(r => r.inputTokens)),
      totalOutputTokens: total(filtered.map(r => r.outputTokens)),
      requestCount: filtered.length,
      // Guard against division by zero on an empty window
      avgCostPerRequest: filtered.length > 0 ? totalCost / filtered.length : 0,
      byModel: this.groupByModel(filtered)
    };
  }
}
// 6.2 优化策略
javascript
/**
 * Cost-reduction strategies: model routing, prompt compression, and caching.
 */
class CostOptimizer {
  constructor() {
    // In-memory cache for strategy 3
    this.cache = new Map();
  }

  /**
   * Strategy 1: route to a model tier by task complexity.
   * Pass 'auto' to assess complexity automatically; unknown tiers fall back
   * to the medium tier.
   */
  async routeToModel(task, complexity) {
    const modelTiers = {
      simple: 'gpt-4o-mini', // simple tasks
      medium: 'claude-3-5-haiku-20241022', // medium tasks
      complex: 'claude-sonnet-4-20250514' // complex tasks
    };
    if (complexity === 'auto') {
      complexity = await this.assessComplexity(task);
    }
    return modelTiers[complexity] || modelTiers.medium;
  }

  /**
   * FIX: routeToModel called this method but it was never defined.
   * Deterministic length heuristic; swap in a small-model classifier
   * if finer-grained assessment is needed.
   */
  async assessComplexity(task) {
    const length = String(task).length;
    if (length < 200) return 'simple';
    if (length < 1000) return 'medium';
    return 'complex';
  }

  /** Strategy 2: compress an over-budget prompt with a cheap model. */
  async compressPrompt(prompt, maxTokens) {
    const currentTokens = estimateTokens(prompt);
    if (currentTokens <= maxTokens) {
      return prompt;
    }
    const compressed = await llm.chat({
      model: 'gpt-4o-mini',
      messages: [{
        role: 'user',
        content: `压缩以下内容,保留关键信息,目标 ${maxTokens} tokens:\n\n${prompt}`
      }]
    });
    return compressed.content;
  }

  /** Strategy 3: memoize an async call by key. */
  async cachedCall(key, fn) {
    if (this.cache.has(key)) {
      return this.cache.get(key);
    }
    const result = await fn();
    this.cache.set(key, result);
    return result;
  }
}
// 7. 安全护栏
7.1 输入过滤
javascript
/**
 * Input-side guardrail: blocks likely prompt-injection attempts and
 * oversized inputs, and strips risky markup before forwarding.
 */
class InputGuard {
  constructor() {
    // FIX: the original patterns used `.`, which does not match newlines,
    // so multi-line injections like "ignore\nall the instructions"
    // slipped through. [\s\S] spans line breaks.
    this.blockedPatterns = [
      /ignore[\s\S]*instructions/i,
      /forget[\s\S]*rules/i,
      /pretend[\s\S]*you[\s\S]*are/i,
      /system[\s\S]*prompt/i
    ];
  }

  /**
   * @param {string} input - untrusted user input
   * @returns {{safe: boolean, reason?: string}}
   */
  check(input) {
    // Injection attempt?
    for (const pattern of this.blockedPatterns) {
      if (pattern.test(input)) {
        return {
          safe: false,
          reason: 'Potential prompt injection detected'
        };
      }
    }
    // Length cap
    if (input.length > 50000) {
      return {
        safe: false,
        reason: 'Input too long'
      };
    }
    return { safe: true };
  }

  /** Strip potentially dangerous content before forwarding. */
  sanitize(input) {
    return input
      .replace(/```[\s\S]*?```/g, '[CODE_BLOCK]') // collapse fenced code blocks
      .replace(/<script[\s\S]*?<\/script>/gi, '') // drop script tags
      .trim();
  }
}
// 7.2 输出过滤
javascript
/**
 * Output-side guardrail: detects and redacts secrets / PII in model output.
 */
class OutputGuard {
  constructor() {
    this.sensitivePatterns = [
      /api[_-]?key\s*[:=]\s*['"][^'"]+['"]/gi,
      /password\s*[:=]\s*['"][^'"]+['"]/gi,
      /secret\s*[:=]\s*['"][^'"]+['"]/gi,
      // FIX: the TLD class was [A-Z|a-z], which also accepts a literal "|"
      // character; [A-Za-z] is the intended alternation-free class.
      /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g // email addresses
    ];
  }

  /**
   * Scan output for sensitive data.
   * @returns {{safe: boolean, issues: Array<{type: string, matches: string[]}>}}
   */
  check(output) {
    const issues = [];
    for (const pattern of this.sensitivePatterns) {
      const matches = output.match(pattern);
      if (matches) {
        issues.push({
          type: 'sensitive_data',
          // only surface a truncated preview, never the full secret
          matches: matches.map(m => m.slice(0, 20) + '...')
        });
      }
    }
    return {
      safe: issues.length === 0,
      issues
    };
  }

  /** Replace every sensitive match with the literal "[REDACTED]". */
  redact(output) {
    let redacted = output;
    for (const pattern of this.sensitivePatterns) {
      redacted = redacted.replace(pattern, '[REDACTED]');
    }
    return redacted;
  }
}
// 7.3 行为边界
javascript
class BehaviorGuard {
constructor(config) {
this.maxTokensPerRequest = config.maxTokensPerRequest || 4096;
this.maxRequestsPerMinute = config.maxRequestsPerMinute || 60;
this.requestCounts = new Map();
}
async checkRateLimit(userId) {
const now = Date.now();
const minute = Math.floor(now / 60000);
const key = `${userId}:${minute}`;
const count = (this.requestCounts.get(key) || 0) + 1;
this.requestCounts.set(key, count);
// 清理旧记录
for (const [k] of this.requestCounts) {
if (!k.endsWith(`:${minute}`)) {
this.requestCounts.delete(k);
}
}
if (count > this.maxRequestsPerMinute) {
throw new Error('Rate limit exceeded');
}
return true;
}
validateRequest(request) {
const errors = [];
if (estimateTokens(request.messages) > this.maxTokensPerRequest) {
errors.push('Request exceeds max tokens');
}
if (request.temperature > 1.5) {
errors.push('Temperature too high');
}
return {
valid: errors.length === 0,
errors
};
}
}8. CI/CD 集成
8.1 GitHub Actions 工作流
yaml
# .github/workflows/prompt-eval.yml
# Runs the prompt evaluation suite whenever prompt files change,
# fails the job on detected regressions, and posts a summary on PRs.
name: Prompt Evaluation
# Trigger only when files under prompts/ change
on:
push:
paths:
- 'prompts/**'
pull_request:
paths:
- 'prompts/**'
jobs:
evaluate:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install dependencies
run: npm ci
# Provider API keys come from repository secrets
- name: Run prompt tests
run: npm run test:prompts
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# Exits non-zero when the run regresses against the stored baseline
- name: Check regression
run: npm run test:regression
- name: Upload results
uses: actions/upload-artifact@v4
with:
name: eval-results
path: eval-results/
# Post the summary table as a PR comment (PR events only)
- name: Comment on PR
if: github.event_name == 'pull_request'
uses: actions/github-script@v7
with:
script: |
const fs = require('fs');
const results = JSON.parse(fs.readFileSync('eval-results/summary.json'));
const body = `## Prompt Evaluation Results
| Metric | Value |
|--------|-------|
| Pass Rate | ${(results.passRate * 100).toFixed(1)}% |
| Avg Score | ${results.avgScore.toFixed(2)} |
| Regressions | ${results.regressions} |
${results.regressions > 0 ? '⚠️ **Regressions detected!**' : '✅ No regressions'}
`;
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body
});8.2 评估脚本
javascript
// scripts/eval-prompts.js
const { TestRunner } = require('./test-runner');
const { RegressionTester } = require('./regression');
async function main() {
const runner = new TestRunner();
// 运行测试
const results = await runner.runAll(
fs.readFileSync('prompts/code-gen.txt', 'utf-8'),
process.env.MODEL || 'claude-sonnet-4-20250514'
);
// 保存结果
fs.writeFileSync(
'eval-results/current.json',
JSON.stringify(results, null, 2)
);
// 检查回归
if (fs.existsSync('eval-results/baseline.json')) {
const baseline = JSON.parse(
fs.readFileSync('eval-results/baseline.json')
);
const regression = new RegressionTester(baseline);
const comparison = await regression.compare(results);
fs.writeFileSync(
'eval-results/summary.json',
JSON.stringify({
passRate: results.passRate,
avgScore: average(results.details.map(d => d.qualityScore)),
regressions: comparison.regressions.length
})
);
if (comparison.hasRegressions) {
console.error('Regressions detected!');
process.exit(1);
}
}
console.log(`Tests passed: ${results.passed}/${results.total}`);
}
main().catch(console.error);9. 关键要点
- 分层评估: 规则检查 → LLM 评审 → 人工审核
- 黄金数据集: 维护高质量的测试用例
- 回归测试: 每次 Prompt 修改都要验证
- A/B 测试: 用数据选择最佳方案
- 成本追踪: 监控并优化 Token 使用
- 安全护栏: 输入输出都要过滤
- CI/CD 集成: 自动化评估流程
延伸阅读
- Anthropic: Model Evaluation
- OpenAI Evals
- Promptfoo - Prompt 测试框架