LangChain + Ollama 实现本地文档搜索

from langchain.document_loaders import OnlinePDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
import sys
import os

class SuppressStdout:
    """Context manager that silences ``sys.stdout`` and ``sys.stderr``.

    While the ``with`` block runs, both streams are pointed at ``os.devnull``.
    On exit the stream objects that were active on entry are put back and the
    devnull handle is closed. Exceptions raised inside the block are not
    suppressed (``__exit__`` returns ``None``).
    """

    def __enter__(self):
        """Redirect both standard streams to ``os.devnull`` and return ``self``."""
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        # A single sink is shared by both streams, so there is exactly one
        # handle to release on exit and nothing can be left half-opened.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys.stderr = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the original streams, then close the devnull handle."""
        # Restore first: even if closing fails, output is no longer swallowed.
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr
        # Close the handle we opened ourselves rather than "whatever
        # sys.stdout currently is", so the stderr sink is released as well.
        self._devnull.close()

# Load the PDF from its URL, split it into chunks, and index the chunks in a
# vector store that the query loop further down retrieves from.
loader = OnlinePDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf")
data = loader.load()

# Split the loaded documents into pieces of at most 500 characters with no
# overlap between neighbouring pieces.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

# Build the Chroma store from the chunks using GPT4All embeddings.
# SuppressStdout sends stdout and stderr to os.devnull for the duration of
# the block, so anything the libraries print while indexing is hidden
# (presumably model-loading noise; note that error text written to stderr
# is hidden too).
with SuppressStdout():
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=GPT4AllEmbeddings())

# Prompt
# NOTE: the leading spaces on the continuation lines are part of the prompt
# text sent to the model; they are kept exactly as they were.
template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# The prompt, the LLM client and the retrieval chain do not depend on the
# user's query, so they are built once here instead of on every iteration.
# The streaming callback handler writes the model's answer to stdout as it
# is generated.
llm = Ollama(model="llama3", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Interactive loop: read a question, answer it from the indexed document,
# repeat until the user types "exit" or stdin is closed.
while True:
    try:
        query = input("\nQuery: ").strip()
    except EOFError:
        # stdin was closed (Ctrl-D / end of piped input): leave quietly
        # instead of crashing with a traceback.
        break
    if query == "exit":
        break
    if not query:
        continue

    # The answer is printed by the streaming callback while the chain runs,
    # so the returned dict is not needed here.
    qa_chain({"query": query})

版本 2：LangChain + Ollama 从 PDF 中提取结构化信息（菜谱）

from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain.chains import create_extraction_chain
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Text of the two chat messages that make up the extraction prompt.
_SYSTEM_MESSAGE = (
    "You are a top-tier algorithm for extracting information from text. "
    "Only extract information that is relevant to the provided text. "
    "If no information is relevant, use the schema and output an empty list "
    "where appropriate."
)
# `{text}` is the only placeholder; it is filled in when the prompt is invoked.
_USER_MESSAGE = "I need to extract information from the following text: ```\n{text}\n```\n"

# Chat prompt: a fixed system instruction followed by the user turn that
# carries the text to extract from.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_MESSAGE),
        ("user", _USER_MESSAGE),
    ]
)
# Schema
# JSON Schema (draft-07) for the data to extract, assembled bottom-up from
# its nested parts. Key order matches the order the schema is serialised in.

# A single ingredient: its name, the unit, and the numeric amount.
_INGREDIENT_SCHEMA = {
    "type": "object",
    "required": ["name", "amount", "unit"],
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the ingredient.",
        },
        "unit": {
            "type": "string",
            "description": "The unit of the amount of the ingredient.",
        },
        "amount": {
            "type": "number",
            "description": "The numeric amount of the ingredient.",
        },
    },
}

# A single recipe: its name plus the list of its ingredients.
_RECIPE_SCHEMA = {
    "type": "object",
    "required": ["name", "ingredients"],
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the recipe.",
        },
        "ingredients": {
            "type": "array",
            "items": _INGREDIENT_SCHEMA,
        },
    },
}

# Top-level object: a required `recipes` array of recipe objects.
schema = {
    "type": "object",
    "title": "Recipe Information Extractor",
    "$schema": "http://json-schema.org/draft-07/schema#",
    "required": ["recipes"],
    "properties": {
        "recipes": {
            "type": "array",
            "items": _RECIPE_SCHEMA,
        },
    },
    "description": "Schema for extracting recipe information from text.",
}

# Load the recipe PDF. The raw-content URL is used on purpose: a github.com
# ".../blob/..." address serves GitHub's HTML viewer page rather than the
# file's bytes, so the loader would be handed HTML instead of a PDF.
loader = PyMuPDFLoader("https://raw.githubusercontent.com/slowmagic10/langchian-ollama-pdf-extractor/main/recipe.pdf")
docs = loader.load()

def split_docs(documents, chunk_size=int(128_000 * 0.8), chunk_overlap=20):
    """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: The documents to split; handed to the splitter's
            ``split_documents`` unchanged.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            80% of 128,000 (i.e. 102,400).
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 20.

    Returns:
        Whatever ``split_documents`` returns for the given documents.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(
        documents=documents
    )
# Split the loaded PDF pages into chunks using split_docs' default sizes.
documents = split_docs(documents=docs)

# Run chain
# The chat prompt is piped into an extraction chain built from the JSON
# schema and a function-calling Ollama model (temperature 0).
llm = OllamaFunctions(model="mistral:7b-instruct", temperature=0)
chain = prompt | create_extraction_chain(schema, llm)

# Run the chain once per chunk and keep every response.
responses = []
for document in documents:
    input_data = {
        # Pass the chunk's text only. Formatting the Document object itself
        # into `{text}` would insert its string form (including metadata)
        # into the prompt instead of the plain page content.
        "text": document.page_content,
        # NOTE(review): the prompt template defined in this file only has a
        # `{text}` placeholder, so the two keys below do not appear to reach
        # the model. Kept as-is; confirm whether they were meant to be used.
        "json_schema": schema,
        "instruction": (
            "recipe.each recipe has a name and list of ingredients.ingredients should have a name,numeric amount,and unit of amount"
        ),
    }
    response = chain.invoke(input_data)
    responses.append(response)

# Print the extracted data of each response as indented JSON.
for response in responses:
    result = response['text']
    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese recipe names)
    # readable instead of emitting \uXXXX escapes.
    print(json.dumps(result, indent=4, ensure_ascii=False))