LangChain + Ollama 实现本地文档搜索

from langchain.document_loaders import OnlinePDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
import sys
import os

class SuppressStdout:
    """Context manager that silences ``sys.stdout`` and ``sys.stderr``.

    On entry both streams are pointed at ``os.devnull``; on exit the
    original stream objects are put back and the devnull handle is closed.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        # One shared handle is enough for both streams and leaves exactly
        # one file object to close on exit.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys.stderr = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            # Restore first so the real streams come back even if closing
            # the devnull handle fails.
            sys.stdout = self._original_stdout
            sys.stderr = self._original_stderr
        finally:
            self._devnull.close()

# Fetch the PDF from its URL and load it into documents.
loader = OnlinePDFLoader(
    "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf"
)
data = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks (chunk_size=500, no overlap).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

# Build the Chroma index from the chunks; stdout/stderr are redirected to
# devnull while the embedding model and the store are being created.
with SuppressStdout():
    vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=GPT4AllEmbeddings(),
    )

# The prompt, the LLM and the QA chain do not depend on the individual
# query, so they are built once here instead of on every loop iteration.

# Prompt (continuation lines keep their four-space indent so the text sent
# to the model is unchanged).
template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# The streaming stdout callback is attached to the LLM, so the answer is
# written out through that handler rather than printed from `result`.
llm = Ollama(model="llama3", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Interactive loop: read a query, answer it, repeat until "exit" or EOF.
while True:
    try:
        query = input("\nQuery: ").strip()
    except EOFError:
        # stdin was closed (Ctrl-D / end of piped input): leave quietly.
        break
    if query == "exit":
        break
    if not query:
        continue

    result = qa_chain({"query": query})

版本2

from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain.chains import create_extraction_chain
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# System message: sets the extraction role and the empty-result behaviour.
_SYSTEM_MESSAGE = (
    "You are a top-tier algorithm for extracting information from text. "
    "Only extract information that is relevant to the provided text. "
    "If no information is relevant, use the schema and output "
    "an empty list where appropriate."
)

# User message: carries the text to extract from via the {text} placeholder.
_USER_MESSAGE = (
    "I need to extract information from "
    "the following text: ```\n{text}\n```\n"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_MESSAGE),
        ("user", _USER_MESSAGE),
    ]
)
# Schema
# Built bottom-up from named pieces; key order matches the original literal.

# One ingredient entry: name, unit and numeric amount.
_INGREDIENT_SCHEMA = {
    "type": "object",
    "required": ["name", "amount", "unit"],
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the ingredient.",
        },
        "unit": {
            "type": "string",
            "description": "The unit of the amount of the ingredient.",
        },
        "amount": {
            "type": "number",
            "description": "The numeric amount of the ingredient.",
        },
    },
}

# One recipe entry: a name plus an array of ingredient objects.
_RECIPE_SCHEMA = {
    "type": "object",
    "required": ["name", "ingredients"],
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the recipe.",
        },
        "ingredients": {
            "type": "array",
            "items": _INGREDIENT_SCHEMA,
        },
    },
}

# Top-level JSON schema: an object holding the array of recipes.
schema = {
    "type": "object",
    "title": "Recipe Information Extractor",
    "$schema": "http://json-schema.org/draft-07/schema#",
    "required": ["recipes"],
    "properties": {
        "recipes": {
            "type": "array",
            "items": _RECIPE_SCHEMA,
        },
    },
    "description": "Schema for extracting recipe information from text.",
}

# Load the sample recipe PDF. A GitHub ".../blob/..." link serves the HTML
# viewer page rather than the file itself, so the raw-content URL is used
# to make sure the loader receives actual PDF bytes.
loader = PyMuPDFLoader("https://raw.githubusercontent.com/slowmagic10/langchian-ollama-pdf-extractor/main/recipe.pdf")
docs = loader.load()

def split_docs(documents, chunk_size=int(128_000 * 0.8), chunk_overlap=20):
    """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: The documents to split; handed to ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents=documents)
documents = split_docs(documents=docs)

# Run chain
llm = OllamaFunctions(model="mistral:7b-instruct", temperature=0)
chain = prompt | create_extraction_chain(schema, llm)

# Invoke the chain once per chunk and collect every response before printing.
responses = [
    chain.invoke(
        {
            # Pass the chunk's raw text: formatting the Document object
            # itself would put its repr (including metadata) into the
            # prompt's {text} slot.
            "text": document.page_content,
            # NOTE(review): the prompt defined in this file only declares
            # the {text} placeholder, so these two keys do not appear to be
            # interpolated anywhere -- confirm whether they are still needed.
            "json_schema": schema,
            "instruction": (
                "recipe.each recipe has a name and list of ingredients.ingredients should have a name,numeric amount,and unit of amount"
            ),
        }
    )
    for document in documents
]

# Pretty-print the 'text' entry of each response as JSON.
for response in responses:
    result = response['text']
    print(json.dumps(result, indent=4))
posted @   朝阳1  阅读(745)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· AI 智能体引爆开源社区「GitHub 热点速览」
· 写一个简单的SQL生成工具
点击右上角即可分享
微信分享提示