[{"data":1,"prerenderedAt":241},["ShallowReactive",2],{"article-agent\u002Flangchainpdf":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"tags":11,"body":13,"_type":235,"_id":236,"_source":237,"_file":238,"_stem":239,"_extension":240},"\u002Farticles\u002Fagent\u002Flangchainpdf","agent",false,"","LangChain 实现 PDF 检索：从文档加载到向量查询的完整流程","基于 LangChain 框架，完整讲解 PDF 文档加载、文本分片、向量转换、Chroma 向量库存储及多种检索方式的实现方法。","2026-03-23",[12],"Agent开发",{"type":14,"children":15,"toc":220},"root",[16,25,31,41,47,52,60,66,71,79,85,90,98,104,109,116,121,129,135,140,148,154,159,167,173,178,186,191,204,215],{"type":17,"tag":18,"props":19,"children":21},"element","h2",{"id":20},"一文档加载读取-pdf-内容至上下文",[22],{"type":23,"value":24},"text","一、文档加载：读取 PDF 内容至上下文",{"type":17,"tag":26,"props":27,"children":28},"p",{},[29],{"type":23,"value":30},"通过 PyPDFLoader 加载目标 PDF 文件，将文档内容载入程序上下文，为后续处理奠定基础。加载后可查看文档页数及首页核心信息，快速验证加载效果。",{"type":17,"tag":32,"props":33,"children":35},"pre",{"code":34},"from langchain_community.document_loaders import PyPDFLoader\n\n# 定义 PDF 文件路径（请根据实际场景调整）\nfile_path = \".\u002Fstatic\u002F26考研考点排查表.pdf\"\n# 初始化 PDF 加载器\nloader = PyPDFLoader(file_path)\n\n# 加载文档内容至上下文\ndocs = loader.load()\n\n# 验证加载结果：输出文档页数、首页前200字符内容及元数据\nprint(f\"文档总页数：{len(docs)}\")\nprint(f\"首页内容预览（前200字符）：\\n{docs[0].page_content[:200]}\\n\")\nprint(f\"首页元数据：{docs[0].metadata}\")\n",[36],{"type":17,"tag":37,"props":38,"children":39},"code",{"__ignoreMap":7},[40],{"type":23,"value":34},{"type":17,"tag":18,"props":42,"children":44},{"id":43},"二文档分片合理切割文本保留上下文关联",[45],{"type":23,"value":46},"二、文档分片：合理切割文本，保留上下文关联",{"type":17,"tag":26,"props":48,"children":49},{},[50],{"type":23,"value":51},"为适配向量模型的输入长度限制，同时避免割裂关键信息，采用递归字符分割策略：将文档切割为 1000 字符\u002F块的文本片段，且相邻块保留 200 字符重叠。这种重叠设计能有效降低重要表述与关联上下文被拆分的风险，保障后续检索的准确性。",{"type":17,"tag":32,"props":53,"children":55},{"code":54},"from langchain_text_splitters import RecursiveCharacterTextSplitter\n\n# 初始化文本分割器，配置分片参数\ntext_splitter 
= RecursiveCharacterTextSplitter(\n    chunk_size=1000,  # 单个文本块的最大字符数\n    chunk_overlap=200,  # 相邻文本块的重叠字符数\n    add_start_index=True  # 为每个块添加原始文档中的起始索引，便于溯源\n)\n\n# 对加载的文档进行分片处理\nall_splits = text_splitter.split_documents(docs)\n\n# 预览分片结果（仅展示前3块，避免大量文本输出）\nprint(\"=== 文本分片结果预览（前3个块） ===\")\nfor i, chunk in enumerate(all_splits[:3]):\n    print(f\"块 #{i+1} | 字符长度: {len(chunk.page_content)} | 内容预览: {chunk.page_content[:50]}{'...' if len(chunk.page_content) > 50 else ''}\")\nprint(f\"\\n文档分片完成，总块数：{len(all_splits)}\")\n",[56],{"type":17,"tag":37,"props":57,"children":58},{"__ignoreMap":7},[59],{"type":23,"value":54},{"type":17,"tag":18,"props":61,"children":63},{"id":62},"三向量转换将文本片段转为数值向量",[64],{"type":23,"value":65},"三、向量转换：将文本片段转为数值向量",{"type":17,"tag":26,"props":67,"children":68},{},[69],{"type":23,"value":70},"文本属于非结构化数据，无法直接用于相似度匹配。通过 Hugging Face 提供的预训练嵌入模型，将分割后的文本块转换为固定长度的数值向量，实现非结构化文本的结构化表征。此处选用 sentence-transformers\u002Fall-mpnet-base-v2 模型，兼顾向量表征效果与通用性。",{"type":17,"tag":32,"props":72,"children":74},{"code":73},"from langchain_huggingface import HuggingFaceEmbeddings\n\n# 初始化嵌入模型（首次执行会自动从 Hugging Face Hub 下载至本地缓存，默认路径：~\u002F.cache\u002Fhuggingface\u002Fhub）\nembeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers\u002Fall-mpnet-base-v2\")\n\n# 测试向量生成：对前2个文本块进行嵌入转换\nvector_1 = embeddings.embed_query(all_splits[0].page_content)\nvector_2 = embeddings.embed_query(all_splits[1].page_content)\n\n# 验证向量一致性：同一模型生成的向量长度必须相同\nassert len(vector_1) == len(vector_2)\nprint(f\"向量生成完成，单个向量长度：{len(vector_1)}\\n\")\nprint(f\"第一个向量预览（前10个元素）：{vector_1[:10]}\")\n",[75],{"type":17,"tag":37,"props":76,"children":77},{"__ignoreMap":7},[78],{"type":23,"value":73},{"type":17,"tag":18,"props":80,"children":82},{"id":81},"四向量存储将向量与文本关联存入数据库",[83],{"type":23,"value":84},"四、向量存储：将向量与文本关联存入数据库",{"type":17,"tag":26,"props":86,"children":87},{},[88],{"type":23,"value":89},"将生成的文本向量与其对应的原始文本块关联存储，便于后续快速检索。此处选用轻量级向量数据库 
Chroma，支持本地持久化存储，无需复杂的服务部署，适合快速验证和小规模应用场景。",{"type":17,"tag":32,"props":91,"children":93},{"code":92},"from langchain_chroma import Chroma\n\n# 初始化 Chroma 向量数据库\nvector_store = Chroma(\n    collection_name=\"example_collection\",  # 向量集合名称，便于区分不同文档\n    embedding_function=embeddings,  # 关联嵌入模型\n    persist_directory=\".\u002Fchroma_langchain_db\"  # 本地持久化存储路径（可选，删除则仅存于内存）\n)\n\n# 将所有文本块及其向量存入数据库，并返回各块的唯一ID\nids = vector_store.add_documents(documents=all_splits)\nprint(f\"向量存储完成，共存入 {len(ids)} 个文本块的向量数据\")\n",[94],{"type":17,"tag":37,"props":95,"children":96},{"__ignoreMap":7},[97],{"type":23,"value":92},{"type":17,"tag":18,"props":99,"children":101},{"id":100},"五多样检索基于向量相似度的灵活查询",[102],{"type":23,"value":103},"五、多样检索：基于向量相似度的灵活查询",{"type":17,"tag":26,"props":105,"children":106},{},[107],{"type":23,"value":108},"向量数据库的核心价值在于通过向量相似度匹配，快速定位与查询需求相关的文本内容。以下提供 4 种常用检索方式，适配不同业务场景（如同步\u002F异步、是否需要相似度分数等）。",{"type":17,"tag":110,"props":111,"children":113},"h3",{"id":112},"_1-基础文本检索根据查询字符串返回相似文档",[114],{"type":23,"value":115},"1. 基础文本检索：根据查询字符串返回相似文档",{"type":17,"tag":26,"props":117,"children":118},{},[119],{"type":23,"value":120},"直接输入查询文本（如“数一”），数据库自动将其转为向量，再匹配相似度最高的文档并返回。",{"type":17,"tag":32,"props":122,"children":124},{"code":123},"# 输入查询文本，检索相似文档\nresults = vector_store.similarity_search(\"数一\")\n\n# 输出相似度最高的文档内容\nprint(\"=== 基础文本检索结果（相似度最高） ===\")\nprint(results[0])\n",[125],{"type":17,"tag":37,"props":126,"children":127},{"__ignoreMap":7},[128],{"type":23,"value":123},{"type":17,"tag":110,"props":130,"children":132},{"id":131},"_2-异步文本检索非阻塞查询支持并行任务",[133],{"type":23,"value":134},"2. 
异步文本检索：非阻塞查询，支持并行任务",{"type":17,"tag":26,"props":136,"children":137},{},[138],{"type":23,"value":139},"采用异步方式执行检索，不阻塞主线程，可在检索过程中并行处理其他任务，提升程序执行效率，适合高并发场景。",{"type":17,"tag":32,"props":141,"children":143},{"code":142},"# 异步检索（需在异步函数中执行，或使用异步环境）\nresults = await vector_store.asimilarity_search(\"数一\")\n\n# 输出相似度最高的文档内容\nprint(\"=== 异步文本检索结果（相似度最高） ===\")\nprint(results[0])\n",[144],{"type":17,"tag":37,"props":145,"children":146},{"__ignoreMap":7},[147],{"type":23,"value":142},{"type":17,"tag":110,"props":149,"children":151},{"id":150},"_3-带相似度分数的检索量化匹配程度",[152],{"type":23,"value":153},"3. 带相似度分数的检索：量化匹配程度",{"type":17,"tag":26,"props":155,"children":156},{},[157],{"type":23,"value":158},"检索时返回相似文档及对应的相似度分数（该分数为距离度量，值越小表示相似度越高），可通过分数筛选符合阈值的结果，提升检索精准度。",{"type":17,"tag":32,"props":160,"children":162},{"code":161},"# 检索相似文档并返回相似度分数\nresults = vector_store.similarity_search_with_score(\"数一\")\n\n# 提取并输出Top1结果的分数和文档内容\ndoc, score = results[0]\nprint(\"=== 带相似度分数的检索结果 ===\")\nprint(f\"相似度分数（值越小越相似）：{score:.4f}\\n\")\nprint(f\"匹配文档内容：\\n{doc}\")\n",[163],{"type":17,"tag":37,"props":164,"children":165},{"__ignoreMap":7},[166],{"type":23,"value":161},{"type":17,"tag":110,"props":168,"children":170},{"id":169},"_4-向量直接检索基于预生成向量的精准匹配",[171],{"type":23,"value":172},"4. 
向量直接检索：基于预生成向量的精准匹配",{"type":17,"tag":26,"props":174,"children":175},{},[176],{"type":23,"value":177},"若已提前生成查询文本的向量，可直接传入向量进行检索，跳过数据库内部的向量转换步骤，进一步提升检索速度。",{"type":17,"tag":32,"props":179,"children":181},{"code":180},"# 先手动生成查询文本的向量\nquery_embedding = embeddings.embed_query(\"数一\")\n\n# 直接传入向量进行相似检索\nresults = vector_store.similarity_search_by_vector(query_embedding)\n\n# 输出相似度最高的文档内容\nprint(\"=== 向量直接检索结果（相似度最高） ===\")\nprint(results[0])\n",[182],{"type":17,"tag":37,"props":183,"children":184},{"__ignoreMap":7},[185],{"type":23,"value":180},{"type":17,"tag":110,"props":187,"children":189},{"id":188},"代码仓库",[190],{"type":23,"value":188},{"type":17,"tag":26,"props":192,"children":193},{},[194,196],{"type":23,"value":195},"gitee：",{"type":17,"tag":197,"props":198,"children":202},"a",{"href":199,"rel":200},"https:\u002F\u002Fgitee.com\u002Fo_insist\u002Flangchain1.0_learn.git",[201],"nofollow",[203],{"type":23,"value":199},{"type":17,"tag":26,"props":205,"children":206},{},[207,209],{"type":23,"value":208},"github：",{"type":17,"tag":197,"props":210,"children":213},{"href":211,"rel":212},"https:\u002F\u002Fgithub.com\u002Fo-insist\u002Flangchain1.0_learn.git",[201],[214],{"type":23,"value":211},{"type":17,"tag":26,"props":216,"children":217},{},[218],{"type":23,"value":219},"✨ 学习之路，循序渐进，持续更新中...",{"title":7,"searchDepth":221,"depth":221,"links":222},2,[223,224,225,226,227],{"id":20,"depth":221,"text":24},{"id":43,"depth":221,"text":46},{"id":62,"depth":221,"text":65},{"id":81,"depth":221,"text":84},{"id":100,"depth":221,"text":103,"children":228},[229,231,232,233,234],{"id":112,"depth":230,"text":115},3,{"id":131,"depth":230,"text":134},{"id":150,"depth":230,"text":153},{"id":169,"depth":230,"text":172},{"id":188,"depth":230,"text":188},"markdown","content:articles:agent:langchain实现pdf检索.md","content","articles\u002Fagent\u002Flangchain实现pdf检索.md","articles\u002Fagent\u002Flangchain实现pdf检索","md",1779811689339]