Skip to content

RAG 与记忆系统:让 AI 拥有知识和记忆 📚

"LLM 的知识是静态的,RAG 让它动态获取信息。"

1. RAG 基础

RAG (Retrieval-Augmented Generation) 让 LLM 访问外部知识。

传统 LLM: 问题 → LLM → 回答
RAG: 问题 → 检索文档 → LLM (问题 + 文档) → 回答

工作流: 文档 → 分块 → 嵌入 → 向量数据库 → 查询检索 → LLM 生成


2. 文档分块策略

2.1 基础分块

javascript
/**
 * Split text into fixed-size character chunks with overlap between
 * neighbouring chunks (overlap preserves context across boundaries).
 *
 * Bug fix: the original kept looping after emitting the final chunk —
 * `start = end - overlap` stops advancing once `end` hits `text.length`,
 * so any text longer than one chunk spun forever re-reading the tail.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1000] - Maximum characters per chunk.
 * @param {number} [overlap=200] - Characters shared between adjacent chunks.
 * @returns {string[]} Chunks in document order.
 * @throws {Error} If overlap >= chunkSize (the window could never advance).
 */
function chunkBySize(text, chunkSize = 1000, overlap = 200) {
  if (overlap >= chunkSize) {
    throw new Error('overlap must be smaller than chunkSize');
  }
  const chunks = [];
  let start = 0;

  while (start < text.length) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push(text.slice(start, end));
    // Last chunk emitted — stepping back by `overlap` would loop forever.
    if (end === text.length) break;
    start = end - overlap;
  }
  return chunks;
}

/**
 * Split text on blank lines and greedily pack paragraphs into chunks of at
 * most `maxSize` characters (a single oversized paragraph still becomes its
 * own chunk).
 *
 * Fixes vs. the original: the size check now accounts for the "\n\n" joiner
 * (chunks no longer exceed maxSize by the separator length), and the first
 * paragraph no longer receives a spurious leading separator that was only
 * hidden by trim().
 *
 * @param {string} text - Text to split.
 * @param {number} [maxSize=1000] - Maximum characters per chunk.
 * @returns {string[]} Trimmed, non-empty chunks in document order.
 */
function chunkByParagraph(text, maxSize = 1000) {
  const paragraphs = text.split(/\n\n+/);
  const chunks = [];
  let current = '';

  for (const para of paragraphs) {
    // What the chunk would look like with this paragraph appended.
    const candidate = current ? `${current}\n\n${para}` : para;
    if (candidate.length > maxSize && current) {
      chunks.push(current.trim());
      current = para;
    } else {
      current = candidate;
    }
  }
  if (current.trim()) chunks.push(current.trim());
  return chunks;
}

2.2 语义分块

javascript
// Delegate chunking to the LLM: it inserts "---CHUNK---" markers at
// semantic boundaries, and we split on those markers.
async function semanticChunking(text) {
  const prompt = `将文本分割成语义完整的段落,用 "---CHUNK---" 分隔:\n${text}`;
  const { content } = await llm.chat({
    messages: [{ role: 'user', content: prompt }]
  });
  return content.split('---CHUNK---').map((piece) => piece.trim());
}

2.3 代码分块

javascript
// Split source code into one chunk per top-level function or class
// declaration, slicing the raw text by the AST node's byte offsets.
function chunkCode(code, language) {
  const DECLARATIONS = ['FunctionDeclaration', 'ClassDeclaration'];
  const chunks = [];
  for (const node of parseAST(code, language).body) {
    if (!DECLARATIONS.includes(node.type)) continue;
    chunks.push({
      type: node.type,
      name: node.id?.name || 'anonymous',
      content: code.slice(node.start, node.end)
    });
  }
  return chunks;
}

3. 向量嵌入

javascript
import OpenAI from 'openai';

// Shared OpenAI client used by the embedding helpers below.
const openai = new OpenAI();

// Embed a single string with OpenAI's small embedding model and return
// the raw embedding vector.
async function getEmbedding(text) {
  const { data } = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: text
  });
  return data[0].embedding;
}

// Batch variant: embed many strings in a single API call (the endpoint
// accepts an array input and returns one embedding per element).
async function getEmbeddings(texts) {
  const { data } = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: texts
  });
  return data.map(({ embedding }) => embedding);
}

模型选择:

  • text-embedding-3-small (1536维): 快速、便宜
  • text-embedding-3-large (3072维): 更精确
  • Cohere embed-v3 (1024维): 多语言优秀
  • BGE-M3 (1024维): 开源、本地部署

4. 向量数据库

4.1 Chroma 示例

javascript
import { ChromaClient } from 'chromadb';

// Chroma client plus a collection configured for cosine similarity
// (the 'hnsw:space' metadata key selects the distance metric).
const client = new ChromaClient();
const collection = await client.createCollection({
  name: 'codebase',
  metadata: { 'hnsw:space': 'cosine' }
});

// Add documents. ids/embeddings/documents/metadatas are parallel arrays.
// NOTE(review): embedding1/embedding2 are assumed to be precomputed
// vectors defined elsewhere in the surrounding example.
await collection.add({
  ids: ['doc1', 'doc2'],
  embeddings: [embedding1, embedding2],
  documents: ['内容1', '内容2'],
  metadatas: [
    { source: 'src/App.tsx', type: 'code' },
    { source: 'README.md', type: 'doc' }
  ]
});

// Query: top-5 nearest neighbours, restricted to code chunks by the
// metadata filter in `where`.
const results = await collection.query({
  queryEmbeddings: [queryEmbedding],
  nResults: 5,
  where: { type: 'code' }
});

4.2 PostgreSQL + pgvector

sql
-- Enable the pgvector extension (one-time per database).
CREATE EXTENSION vector;

CREATE TABLE documents (
  id SERIAL PRIMARY KEY,
  content TEXT,
  embedding vector(1536),  -- 1536 dims matches text-embedding-3-small
  metadata JSONB
);

-- Approximate-nearest-neighbour index; <=> is cosine distance with
-- vector_cosine_ops.
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops);

-- Retrieve the most similar documents ($1 = query embedding).
SELECT id, content, 1 - (embedding <=> $1) AS similarity
FROM documents
ORDER BY embedding <=> $1
LIMIT 5;

5. 检索策略

5.1 混合检索

javascript
/**
 * Hybrid retrieval: run vector search and BM25 keyword search, then merge
 * the two rankings with Reciprocal Rank Fusion (RRF).
 *
 * Improvement: the two searches are independent, so they now run in
 * parallel instead of being awaited sequentially; the RRF smoothing
 * constant is a parameter (default 60, the value from the RRF paper).
 *
 * @param {string} query - User query.
 * @param {number} [k=5] - Number of documents to return.
 * @param {number} [rrfK=60] - RRF smoothing constant.
 * @returns {Promise<Array>} Top-k fused documents (via getDocument).
 */
async function hybridSearch(query, k = 5, rrfK = 60) {
  const [vectorResults, keywordResults] = await Promise.all([
    vectorSearch(query, k * 2),
    bm25Search(query, k * 2),
  ]);

  // Each ranking contributes 1 / (rrfK + rank) per document; documents
  // appearing in both lists accumulate both contributions.
  const scores = new Map();
  for (const results of [vectorResults, keywordResults]) {
    results.forEach((doc, rank) => {
      scores.set(doc.id, (scores.get(doc.id) || 0) + 1 / (rrfK + rank));
    });
  }

  return [...scores.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, k)
    .map(([id]) => getDocument(id));
}

5.2 重排序

javascript
// Two-stage retrieval: over-fetch 3x candidates, then let Cohere's
// reranker score them and keep the k highest-scoring originals.
async function retrieveWithRerank(query, k = 5) {
  const candidates = await retrieve(query, k * 3);

  const reranked = await cohere.rerank({
    model: 'rerank-english-v3.0',
    query,
    documents: candidates.map((doc) => doc.content)
  });

  const byScore = [...reranked.results].sort(
    (a, b) => b.relevance_score - a.relevance_score
  );
  // `index` points back into the candidate list.
  return byScore.slice(0, k).map((result) => candidates[result.index]);
}

5.3 查询扩展

javascript
// Ask the LLM for paraphrased variants of the query; the original query
// always stays first in the returned list.
async function expandQuery(query) {
  const prompt = `生成 3 个与以下查询语义相似的问题:\n"${query}"\n每行一个。`;
  const response = await llm.chat({
    messages: [{ role: 'user', content: prompt }]
  });
  const variants = response.content.split('\n').filter(Boolean);
  return [query, ...variants];
}

// Retrieve with every query variant in parallel, then deduplicate by
// document id (keeping first-seen order) and cap the result at k.
async function retrieveWithExpansion(query, k = 5) {
  const queries = await expandQuery(query);
  const resultLists = await Promise.all(queries.map((q) => retrieve(q, k)));

  const seenIds = new Set();
  const unique = [];
  for (const doc of resultLists.flat()) {
    if (seenIds.has(doc.id)) continue;
    seenIds.add(doc.id);
    unique.push(doc);
  }
  return unique.slice(0, k);
}

6. RAG 应用

6.1 完整 RAG 系统

javascript
/**
 * Minimal RAG pipeline: embed the question, pull the nearest chunks from
 * the vector collection, and answer strictly from that retrieved context.
 */
class RAGSystem {
  constructor(collection, llm) {
    this.collection = collection;
    this.llm = llm;
  }

  // Answer a question grounded in retrieved documents; returns the answer
  // text plus the source identifiers of the documents used.
  async query(question) {
    const docs = await this.retrieve(question);
    const context = docs.map((doc) => doc.content).join('\n\n---\n\n');

    const systemPrompt = `根据上下文回答问题。如果上下文中没有相关信息,说"我没有找到相关信息"。\n\n上下文:\n${context}`;
    const response = await this.llm.chat({
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: question }
      ]
    });

    return {
      answer: response.content,
      sources: docs.map((doc) => doc.metadata.source)
    };
  }

  // Embed the question and fetch its k nearest chunks; pairs each returned
  // document with its metadata (Chroma returns parallel arrays).
  async retrieve(question, k = 5) {
    const embedding = await getEmbedding(question);
    const results = await this.collection.query({
      queryEmbeddings: [embedding],
      nResults: k
    });

    const [contents] = results.documents;
    const [metadatas] = results.metadatas;
    return contents.map((content, i) => ({ content, metadata: metadatas[i] }));
  }
}

6.2 代码库 RAG

javascript
/**
 * RAG over a source repository: indexes functions/classes per file and
 * finds code relevant to a natural-language question.
 *
 * NOTE(review): this class has no constructor — `this.collection`,
 * `this.retrieve`, and `this.llm` must be provided by surrounding setup
 * not shown in this excerpt; verify before use.
 */
class CodebaseRAG {
  // Walk the repo, chunk each JS/TS file by top-level declaration, and add
  // every chunk to the collection keyed as "<file>:<name>".
  // NOTE(review): no `embeddings` are passed here (unlike the earlier Chroma
  // example) — this relies on the collection embedding documents itself;
  // confirm the collection was created with an embedding function.
  async indexRepository(repoPath) {
    const files = await glob(`${repoPath}/**/*.{ts,tsx,js,jsx}`);

    for (const file of files) {
      const content = await fs.readFile(file, 'utf-8');
      const chunks = chunkCode(content, getLanguage(file));

      for (const chunk of chunks) {
        await this.collection.add({
          ids: [`${file}:${chunk.name}`],
          documents: [chunk.content],
          metadatas: [{ file, type: chunk.type, name: chunk.name }]
        });
      }
    }
  }

  // Over-fetch 10 candidates, then ask the LLM to keep only the most
  // relevant ones. Assumes the model replies with comma-separated indices;
  // a malformed reply produces NaN indices and undefined entries.
  async findRelatedCode(question) {
    const results = await this.retrieve(question, 10);

    const filtered = await this.llm.chat({
      messages: [{
        role: 'user',
        content: `用户问题: ${question}\n\n选择最相关的 3-5 个代码片段:\n${results.map((r, i) => `[${i}] ${r.metadata.file}\n${r.content}`).join('\n\n')}\n\n输出编号,逗号分隔。`
      }]
    });

    const indices = filtered.content.split(',').map(Number);
    return indices.map(i => results[i]);
  }
}

7. 记忆系统

7.1 对话记忆

javascript
/**
 * Sliding-window chat history: keeps at most `maxMessages` entries,
 * always retaining system messages and dropping the oldest non-system
 * messages first.
 */
class ConversationMemory {
  constructor(maxMessages = 20) {
    this.messages = [];
    this.maxMessages = maxMessages;
  }

  // Append a message; if the window overflows, keep all system messages
  // plus the newest others so the total stays at maxMessages.
  add(role, content) {
    this.messages.push({ role, content, timestamp: Date.now() });

    if (this.messages.length > this.maxMessages) {
      const system = [];
      const others = [];
      for (const msg of this.messages) {
        (msg.role === 'system' ? system : others).push(msg);
      }
      const keep = others.slice(-this.maxMessages + system.length);
      this.messages = [...system, ...keep];
    }
  }

  // Plain {role, content} pairs, suitable for a chat API (timestamps dropped).
  getContext() {
    return this.messages.map(({ role, content }) => ({ role, content }));
  }

  // Compress everything but the last 5 messages into a system summary.
  // No-op for short histories.
  async summarize() {
    if (this.messages.length < 10) return;

    const older = this.messages.slice(0, -5);
    const transcript = older.map((m) => `${m.role}: ${m.content}`).join('\n');
    const summary = await llm.chat({
      messages: [{ role: 'user', content: `总结对话关键信息:\n${transcript}` }]
    });

    this.messages = [
      { role: 'system', content: `对话历史摘要: ${summary.content}` },
      ...this.messages.slice(-5)
    ];
  }
}

7.2 长期记忆

javascript
class LongTermMemory {
  constructor(vectorDB) {
    this.collection = vectorDB.collection('memories');
  }

  async store(content, metadata = {}) {
    const id = crypto.randomUUID();
    const embedding = await getEmbedding(content);

    await this.collection.add({
      ids: [id],
      embeddings: [embedding],
      documents: [content],
      metadatas: [{ ...metadata, timestamp: Date.now() }]
    });
    return id;
  }

  async recall(query, k = 5) {
    const embedding = await getEmbedding(query);
    const results = await this.collection.query({
      queryEmbeddings: [embedding],
      nResults: k
    });

    return results.documents[0].map((content, i) => ({
      content,
      metadata: results.metadatas[0][i]
    }));
  }

  async processConversation(messages) {
    const response = await llm.chat({
      messages: [{
        role: 'user',
        content: `提取值得长期记忆的信息:\n${messages.map(m => `${m.role}: ${m.content}`).join('\n')}\n\n输出 JSON 数组,每项包含 content 和 type。无则输出 []。`
      }],
      response_format: { type: 'json_object' }
    });

    const memories = JSON.parse(response.content).memories || [];
    for (const memory of memories) {
      await this.store(memory.content, { type: memory.type });
    }
  }
}

7.3 工作记忆

javascript
/**
 * In-process working memory for a single agent run: a keyed scratchpad,
 * accumulated facts, and current goals, renderable as a prompt section.
 */
class WorkingMemory {
  constructor() {
    this.scratchpad = {};
    this.facts = [];
    this.goals = [];
  }

  // Store/overwrite a scratchpad value, timestamped.
  set(key, value) {
    this.scratchpad[key] = { value, timestamp: Date.now() };
  }

  // Read a scratchpad value; undefined if the key was never set.
  get(key) {
    return this.scratchpad[key]?.value;
  }

  addFact(fact) {
    this.facts.push({ content: fact, timestamp: Date.now() });
  }

  // Render the memory as a markdown-ish prompt block. Empty sections
  // collapse to blank lines, exactly as in the original template form.
  toSystemPrompt() {
    const pad = Object.entries(this.scratchpad).map(
      ([key, entry]) => `${key}: ${entry.value}`
    );
    const sections = [
      pad.length > 0 ? `临时数据:\n${pad.join('\n')}` : '',
      this.facts.length > 0 ? `已确认事实:\n${this.facts.map((f) => f.content).join('\n')}` : '',
      this.goals.length > 0 ? `当前目标:\n${this.goals.join('\n')}` : ''
    ];
    return ['## 当前工作记忆', ...sections].join('\n').trim();
  }
}

8. 上下文管理

8.1 优先级管理

javascript
/**
 * Assembles prompt context from typed components under a token budget,
 * including higher-priority components first.
 */
class ContextManager {
  constructor(maxTokens = 100000) {
    this.maxTokens = maxTokens;
    // Lower number = higher priority (included first).
    this.priorities = {
      system_prompt: 1,
      user_question: 2,
      relevant_code: 3,
      recent_messages: 4,
      retrieved_docs: 5,
      history_summary: 6
    };
  }

  /**
   * Select components in priority order until the budget runs out; the
   * first overflowing component is truncated if meaningful room remains.
   *
   * Bug fix: unknown component types used to produce NaN in the sort
   * comparator (`undefined - n`), making the order unpredictable; they now
   * deterministically rank last.
   *
   * @param {Object<string, string>} components - component type → content.
   * @returns {Array<{type: string, content: string, truncated?: boolean}>}
   */
  buildContext(components) {
    const rank = (type) => this.priorities[type] ?? Infinity;
    const sorted = Object.entries(components).sort(
      ([a], [b]) => rank(a) - rank(b)
    );

    let totalTokens = 0;
    const included = [];

    for (const [type, content] of sorted) {
      const tokens = estimateTokens(content);

      if (totalTokens + tokens <= this.maxTokens) {
        included.push({ type, content });
        totalTokens += tokens;
      } else {
        // Budget exhausted: truncate this component if a useful amount of
        // room (> 500 tokens) remains, then stop — everything after it is
        // lower priority.
        const remaining = this.maxTokens - totalTokens;
        if (remaining > 500) {
          included.push({
            type,
            content: truncateToTokens(content, remaining),
            truncated: true
          });
        }
        break;
      }
    }
    return included;
  }
}

8.2 动态上下文选择

javascript
// Show the LLM a numbered menu of candidate contexts and keep only the
// entries whose indices it returns (out-of-range picks are dropped).
async function selectContext(question, availableContext) {
  const menu = availableContext
    .map((ctx, i) => `[${i}] ${ctx.title}\n${ctx.preview}`)
    .join('\n\n');
  const response = await llm.chat({
    messages: [{
      role: 'user',
      content: `用户问题: ${question}\n\n选择最相关的上下文(最多 5 个):\n${menu}\n\n输出编号,逗号分隔。`
    }]
  });

  const chosen = response.content.split(',').map((token) => parseInt(token.trim()));
  return chosen.map((i) => availableContext[i]).filter(Boolean);
}

9. 实战:代码库问答系统

javascript
/**
 * Conversational Q&A over a codebase, combining code retrieval, a
 * sliding-window conversation memory, and vector-store long-term memory.
 *
 * NOTE(review): `indexRepository`, `retrieveCode`, and `buildContext` are
 * called below but not defined in this excerpt, and `llm` is a free
 * module-level name — confirm they exist before using this class.
 */
class CodebaseQA {
  constructor() {
    this.vectorDB = new ChromaClient();
    this.memory = new ConversationMemory();
    this.longTermMemory = new LongTermMemory(this.vectorDB);
  }

  // Create the collection and index the repository into it.
  async initialize(repoPath) {
    this.collection = await this.vectorDB.createCollection({ name: 'codebase' });
    await this.indexRepository(repoPath);
  }

  async chat(question) {
    // Gather grounding material: code chunks plus up to 3 long-term memories.
    const relevantCode = await this.retrieveCode(question);
    const memories = await this.longTermMemory.recall(question, 3);
    const context = this.buildContext(relevantCode, memories);

    // NOTE(review): the question is added to memory here AND appended again
    // after `...history` below, so it is sent to the model twice — confirm
    // this duplication is intended.
    this.memory.add('user', question);
    const history = this.memory.getContext();

    const response = await llm.chat({
      messages: [
        {
          role: 'system',
          content: `你是代码库专家。\n\n## 相关代码\n${context.code}\n\n## 相关记忆\n${context.memories}\n\n规则:\n- 引用代码时说明文件路径\n- 不确定时明确说明\n- 提供具体代码示例`
        },
        ...history,
        { role: 'user', content: question }
      ]
    });

    this.memory.add('assistant', response.content);

    // Persist anything worth remembering from this exchange.
    await this.longTermMemory.processConversation([
      { role: 'user', content: question },
      { role: 'assistant', content: response.content }
    ]);

    return {
      answer: response.content,
      sources: relevantCode.map(c => c.metadata.file)
    };
  }
}

10. 关键要点

  1. 分块策略: 语义完整的分块提高检索质量
  2. 混合检索: 结合向量和关键词检索
  3. 重排序: 用更精确的模型重排检索结果
  4. 分层记忆: 工作记忆 + 对话记忆 + 长期记忆
  5. 上下文管理: 在有限窗口内优化信息选择
  6. 持续优化: 监控检索质量,迭代改进

延伸阅读

前端面试知识库