from langchain.document_loaders import OnlinePDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
import sys
import os
class SuppressStdout:
    """Context manager that silences ``sys.stdout`` and ``sys.stderr``.

    On entry both streams are redirected to ``os.devnull``. On exit the
    original stream objects are put back and the devnull handle is closed.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        """Redirect both streams to the null device and return ``self``."""
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        # One handle serves both streams, so there is exactly one file to
        # close on exit (the stderr handle used to be leaked).
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys.stderr = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the original streams, then release the devnull handle."""
        # Restore first so the real streams come back even if close() fails.
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr
        self._devnull.close()
from langchain.text_splitter import RecursiveCharacterTextSplitter

# PDF document that the question-answering script below is built on.
PDF_URL = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001813756/975b3e9b-268e-4798-a9e4-2a9a7c92dc10.pdf"

loader = OnlinePDFLoader(PDF_URL)
data = loader.load()

# Split the loaded documents with chunk_size=500 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)
all_splits = text_splitter.split_documents(data)

# Embed the chunks and build the vector store with stdout/stderr silenced.
with SuppressStdout():
    vectorstore = Chroma.from_documents(
        embedding=GPT4AllEmbeddings(),
        documents=all_splits,
    )
# Everything below up to the loop is independent of the user's query, so it
# is built once instead of on every loop iteration.

# Prompt text; {context} and {question} are the two input variables declared
# on the PromptTemplate below.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
# The streaming handler is attached so output is written to stdout by the
# callback rather than by an explicit print in the loop.
llm = Ollama(model="llama3", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Interactive loop: read a query, skip blank input, stop on "exit".
while True:
    try:
        query = input("\nQuery: ")
    except EOFError:
        # Closed stdin (Ctrl-D or piped input) ends the session cleanly.
        break
    if query == "exit":
        break
    if not query.strip():
        continue
    result = qa_chain({"query": query})
# Version 2 (版本2)
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain.chains import create_extraction_chain
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# System message: tells the model to act as an extraction algorithm and to
# output empty lists when nothing relevant is found.
_SYSTEM_MESSAGE = (
    "You are a top-tier algorithm for extracting information from text. "
    "Only extract information that is relevant to the provided text. "
    "If no information is relevant, use the schema and output "
    "an empty list where appropriate."
)

# User message: wraps the {text} placeholder in a fenced block.
_USER_MESSAGE = (
    "I need to extract information from "
    "the following text: ```\n{text}\n```\n"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_MESSAGE),
        ("user", _USER_MESSAGE),
    ]
)
# JSON Schema for a single ingredient: name, unit and numeric amount.
_INGREDIENT_SCHEMA = {
    "type": "object",
    "required": ["name", "amount", "unit"],
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the ingredient.",
        },
        "unit": {
            "type": "string",
            "description": "The unit of the amount of the ingredient.",
        },
        "amount": {
            "type": "number",
            "description": "The numeric amount of the ingredient.",
        },
    },
}

# JSON Schema for a single recipe: a name plus an array of ingredients.
_RECIPE_SCHEMA = {
    "type": "object",
    "required": ["name", "ingredients"],
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the recipe.",
        },
        "ingredients": {
            "type": "array",
            "items": _INGREDIENT_SCHEMA,
        },
    },
}

# Top-level extraction schema: an object with a required "recipes" array.
schema = {
    "type": "object",
    "title": "Recipe Information Extractor",
    "$schema": "http://json-schema.org/draft-07/schema#",
    "required": ["recipes"],
    "properties": {
        "recipes": {
            "type": "array",
            "items": _RECIPE_SCHEMA,
        },
    },
    "description": "Schema for extracting recipe information from text.",
}
# A github.com ".../blob/..." URL serves the HTML file-viewer page, not the
# PDF bytes, so the loader is pointed at the raw-content URL of the same file.
loader = PyMuPDFLoader("https://raw.githubusercontent.com/slowmagic10/langchian-ollama-pdf-extractor/main/recipe.pdf")
docs = loader.load()
def split_docs(documents, chunk_size=int(128_000 * 0.8), chunk_overlap=20):
    """Split *documents* into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; forwarded to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. The default is
            80% of 128,000.
        chunk_overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(documents=documents)
# Split the loaded PDF into chunks and build the extraction pipeline.
documents = split_docs(documents=docs)
llm = OllamaFunctions(model="mistral:7b-instruct", temperature=0)
chain = prompt | create_extraction_chain(schema, llm)

# Run the chain once per chunk and keep every response.
responses = []
for document in documents:
    input_data = {
        # Pass the chunk's text only. Passing the Document object itself
        # would format its string form (metadata included) into the prompt.
        "text": document.page_content,
        # NOTE(review): the prompt template in this file only declares a
        # {text} placeholder, so the two keys below look unused -- confirm.
        "json_schema": schema,
        "instruction": (
            "recipe.each recipe has a name and list of ingredients.ingredients should have a name,numeric amount,and unit of amount"
        )
    }
    response = chain.invoke(input_data)
    responses.append(response)

# Pretty-print the 'text' entry of each response as JSON.
for response in responses:
    result = response['text']
    print(json.dumps(result, indent=4))
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· AI 智能体引爆开源社区「GitHub 热点速览」
· 写一个简单的SQL生成工具