CI/CD 集成
GitHub Actions 工作流、评估脚本
8.1 GitHub Actions 工作流
yaml
# .github/workflows/prompt-eval.yml
# Evaluates prompts whenever files under prompts/ change, uploads the results
# as an artifact, and posts a summary comment on pull requests.
name: Prompt Evaluation

on:
  push:
    paths:
      - 'prompts/**'
  pull_request:
    paths:
      - 'prompts/**'

# Least-privilege token: read the repository, write pull-request comments.
# NOTE(review): pull requests from forks get a read-only token and no secrets,
# so the evaluation and comment steps cannot succeed there — confirm whether
# fork PRs need to be supported.
permissions:
  contents: read
  pull-requests: write

jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install dependencies
        run: npm ci

      - name: Run prompt tests
        run: npm run test:prompts
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

      - name: Check regression
        run: npm run test:regression

      # Steps are skipped by default once an earlier step fails. The results
      # matter most when tests or the regression check fail, so keep running
      # unless the workflow was cancelled.
      - name: Upload results
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: eval-results/

      - name: Comment on PR
        if: ${{ !cancelled() && github.event_name == 'pull_request' }}
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summaryPath = 'eval-results/summary.json';

            // Nothing to report if the evaluation never produced a summary.
            if (!fs.existsSync(summaryPath)) {
              core.warning(`${summaryPath} not found; skipping PR comment.`);
              return;
            }

            const results = JSON.parse(fs.readFileSync(summaryPath, 'utf-8'));

            // Built line by line so YAML indentation cannot leak into the
            // markdown; the blank lines keep the table separate from the
            // heading and from the status line.
            const body = [
              '## Prompt Evaluation Results',
              '',
              '| Metric | Value |',
              '|--------|-------|',
              `| Pass Rate | ${(results.passRate * 100).toFixed(1)}% |`,
              `| Avg Score | ${results.avgScore.toFixed(2)} |`,
              `| Regressions | ${results.regressions} |`,
              '',
              results.regressions > 0 ? '⚠️ **Regressions detected!**' : '✅ No regressions',
            ].join('\n');

            // Await the request so an API failure fails this step.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body
            });
8.2 评估脚本
javascript
// scripts/eval-prompts.js
const fs = require('fs');

const { TestRunner } = require('./test-runner');
const { RegressionTester } = require('./regression');
/**
 * Evaluates the code-generation prompt, persists the results, and checks them
 * against the stored baseline.
 *
 * Reads the prompt from `prompts/code-gen.txt` and runs it with the model named
 * by the `MODEL` environment variable (default: 'claude-sonnet-4-20250514').
 *
 * Writes:
 *  - `eval-results/current.json`: the full results from the test runner.
 *  - `eval-results/summary.json`: pass rate, average quality score, and the
 *    number of regressions (0 when there is no baseline to compare against).
 *
 * When `eval-results/baseline.json` exists and the comparison reports
 * regressions, the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const runner = new TestRunner();

  // Run the tests.
  const results = await runner.runAll(
    fs.readFileSync('prompts/code-gen.txt', 'utf-8'),
    process.env.MODEL || 'claude-sonnet-4-20250514'
  );

  // The output directory does not exist on a fresh checkout; without this the
  // writes below fail with ENOENT.
  fs.mkdirSync('eval-results', { recursive: true });

  // Save the results.
  fs.writeFileSync(
    'eval-results/current.json',
    JSON.stringify(results, null, 2)
  );

  // Check for regressions when a baseline is available.
  let comparison = null;
  if (fs.existsSync('eval-results/baseline.json')) {
    const baseline = JSON.parse(
      fs.readFileSync('eval-results/baseline.json', 'utf-8')
    );
    const regression = new RegressionTester(baseline);
    comparison = await regression.compare(results);
  }

  // Mean quality score; 0 for an empty run so the summary never holds NaN
  // (which JSON would serialize as null).
  const scores = results.details.map((detail) => detail.qualityScore);
  const avgScore = scores.length > 0
    ? scores.reduce((sum, score) => sum + score, 0) / scores.length
    : 0;

  // Always write the summary: the PR-comment step of the workflow reads it
  // whether or not a baseline exists.
  fs.writeFileSync(
    'eval-results/summary.json',
    JSON.stringify({
      passRate: results.passRate,
      avgScore,
      regressions: comparison === null ? 0 : comparison.regressions.length
    })
  );

  if (comparison !== null && comparison.hasRegressions) {
    console.error('Regressions detected!');
    process.exit(1);
  }

  console.log(`Tests passed: ${results.passed}/${results.total}`);
}
// Script entry point. An unexpected failure must fail the CI step, so report
// the error and set a non-zero exit status instead of only logging it.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
关键要点
- 分层评估: 规则检查 → LLM 评审 → 人工审核
- 黄金数据集: 维护高质量的测试用例
- 回归测试: 每次 Prompt 修改都要验证
- A/B 测试: 用数据选择最佳方案
- 成本追踪: 监控并优化 Token 使用
- 安全护栏: 输入输出都要过滤
- CI/CD 集成: 自动化评估流程
延伸阅读
- Anthropic: Model Evaluation
- OpenAI Evals (https://github.com/openai/evals)
- Promptfoo (https://github.com/promptfoo/promptfoo) - Prompt 测试框架