LangChain multimodal support
How to pass multimodal data directly to models
https://python.langchain.com/v0.2/docs/how_to/multimodal_inputs/
# Ask the model about an image that is referenced by URL.
# NOTE(review): `HumanMessage`, `model` and `image_url` are not defined in this
# snippet; presumably they come from earlier setup code — confirm.
content_parts = [
    {"type": "text", "text": "describe the weather in this image"},
    {"type": "image_url", "image_url": {"url": image_url}},
]
message = HumanMessage(content=content_parts)

response = model.invoke([message])
print(response.content)
# Ask the model about an image that is embedded inline as a data URL.
# NOTE(review): assumes `image_data` is a base64-encoded JPEG string and that
# `HumanMessage` / `model` are defined by earlier setup code — confirm.
inline_image_url = f"data:image/jpeg;base64,{image_data}"
message = HumanMessage(
    content=[
        {"type": "text", "text": "describe the weather in this image"},
        {"type": "image_url", "image_url": {"url": inline_image_url}},
    ],
)

response = model.invoke([message])
print(response.content)
from typing import Literal

from langchain_core.tools import tool


# The tool body is intentionally empty: the example only inspects the tool
# call the model produces, it never executes the tool.
@tool
def weather_tool(weather: Literal["sunny", "cloudy", "rainy"]) -> None:
    """Describe the weather"""


# Bind the tool to the model, then send the same text + image message.
# NOTE(review): `HumanMessage`, `model` and `image_url` are not defined in this
# snippet; presumably they come from earlier setup code — confirm.
model_with_tools = model.bind_tools([weather_tool])

content_parts = [
    {"type": "text", "text": "describe the weather in this image"},
    {"type": "image_url", "image_url": {"url": image_url}},
]
message = HumanMessage(content=content_parts)

response = model_with_tools.invoke([message])
print(response.tool_calls)
How to use ImagePromptTemplate
https://github.com/langchain-ai/langchain/discussions/20820
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.schema import SystemMessage
from langchain_community.tools import CopyFileTool
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_openai import ChatOpenAI

# Prompt: a fixed system instruction, a human turn whose only content part is
# an image whose URL is filled in from the `image_url` input variable, and the
# scratchpad placeholder the tools agent needs.
prompt = ChatPromptTemplate(
    messages=[
        SystemMessage(content="Describe the image provided"),
        HumanMessagePromptTemplate.from_template(
            template=[
                {"type": "image_url", "image_url": {"url": "{image_url}"}},
            ]
        ),
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

# Chat model used by the agent.
llm = ChatOpenAI(model="gpt-4o")

# A placeholder tool so the agent has something to bind.
tools = [CopyFileTool()]

# Build the tools agent and wrap it in an executor.
agent = create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools)

# Image URL supplied at runtime.
image_url = "https://fastly.picsum.photos/id/450/200/200.jpg?hmac=DluUYibC-zBoNHLOHsO6aHIuiA3pDhholFjiR5KcwR0"

# Run the agent; the scratchpad starts out empty.
agent_inputs = {"image_url": image_url, "agent_scratchpad": []}
response = agent_executor.invoke(agent_inputs)
Image-caption-with-LLM-and-Langchain
https://github.com/stemgene/Image-caption-with-LLM-and-Langchain/blob/main/main.py
import os
from tempfile import NamedTemporaryFile

import requests
import streamlit as st
import torch
from PIL import Image
from langchain.agents import initialize_agent
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chat_models import ChatOpenAI

from tools import ImageCaptionTool, ObjectDetectionTool

########################
### Initialize agent ###
########################
tools = [ImageCaptionTool(), ObjectDetectionTool()]

# Windowed chat memory (k=5), exposed to the agent as `chat_history`.
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True,
)

llm = ChatOpenAI(
    # For local usage, read the key from the environment instead:
    # openai_api_key=os.environ.get("OPENAI_API_KEY"),
    openai_api_key=st.secrets['auth_key'],
    temperature=0,
    model_name='gpt-3.5-turbo',
)

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    max_iterations=5,
    verbose=True,
    memory=conversational_memory,
    # Fixed: the keyword was misspelled `early_stoppy_method`, so the intended
    # early-stopping setting was never applied under its real name.
    early_stopping_method='generate',
)

##########
### UI ###
##########
st.title("Ask a question to an image")
st.header("Please upload an image")

# The image is supplied as a URL. An earlier variant of this script used
# st.file_uploader and wrote the upload to a NamedTemporaryFile so the agent
# could be given a local path; that path is no longer used here.
image_url = st.text_input('Please input the image URL')

if image_url:
    # A timeout keeps an unresponsive host from hanging the script forever.
    image_response = requests.get(image_url, stream=True, timeout=10)
    image_object = Image.open(image_response.raw).convert("RGB")

    # Show the fetched image.
    st.image(image_object, use_column_width=True)

    user_question = st.text_input('Ask a question your image:')

    ##############################
    ### Compute agent response ###
    ##############################
    if user_question:
        with st.spinner(text='In progress...'):
            # The tools receive the image location embedded in the prompt text.
            response = agent.run(
                '{}, this is the image path: {}'.format(user_question, image_url)
            )
            st.write(response)
how_to_use_crew_ai_for_image_understanding
https://github.com/mdwoicke/crewai_multi_agent/blob/main/01_how_to_use_crew_ai_for_image_understanding.py
"""
Author: Rajib Deb
Date: 02/10/2024
Description: Driver script that builds the MasterChef crew for one image URL
and prints the result of kicking it off.
"""
from crew_ai_crews.master_chef import MasterChefCrew

if __name__ == "__main__":
    # Alternative sample image:
    # url="https://rumkisgoldenspoon.com/wp-content/uploads/2022/02/Aar-macher-jhol.jpg"
    url = "https://m.media-amazon.com/images/I/51dFvTRE3iL.__AC_SX300_SY300_QL70_FMwebp_.jpg"
    print(MasterChefCrew(url=url).kickoff())
分析菜肴成分,并给出做法
"""
Author: Rajib Deb
Date: 02/10/2024
Description: Builds the crew object from the agents and tasks defined in the
crew_ai_crews package.
"""
import os

from crewai import Crew
from dotenv import load_dotenv

from crew_ai_crews.agents import MasterChef
from crew_ai_crews.tasks import ExtractIngredientsFromImage

# Load variables from a .env file before reading the API key.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')


class MasterChefCrew:
    """
    Crew made of two agents obtained from MasterChef: one from
    get_recipe_from_image() and one from instruct_to_cook_dish().
    """

    def __init__(self, url):
        """
        Create both agents for the given image URL.
        :param url: passed through to MasterChef(url=...)
        """
        chef = MasterChef(url=url)
        self.extract_agent = chef.get_recipe_from_image()
        self.instruct_agent = chef.instruct_to_cook_dish()

    def kickoff(self):
        """
        Assemble the crew from the two agents, with one task per agent,
        and run it.
        :return: whatever Crew.kickoff() returns
        """
        task_factory = ExtractIngredientsFromImage()
        crew_agents = [self.extract_agent, self.instruct_agent]
        crew_tasks = [
            task_factory.get_ingredients(self.extract_agent),
            task_factory.cooking_instruction(self.instruct_agent),
        ]
        crew = Crew(agents=crew_agents, tasks=crew_tasks, verbose=True)
        return crew.kickoff()
LangChain vs. CrewAI: Comparing AI Development Platforms
https://smythos.com/ai-agents/ai-agent-builders/langchain-vs-crewai/
crewai不支持多模态, 其本身是workflow定义
Feature Comparison Table
| Feature | LangChain | CrewAI | SmythOS |
| --- | --- | --- | --- |
| **CORE FEATURES** | | | |
| Hosted Agents (Dev, Production) | ✅ | ❌ | ✅ |
| Environments (Dev, Production) | ✅ | ❌ | ✅ |
| Visual Builder | ❌ | ❌ | ✅ |
| No-Code Options | ❌ | ❌ | ✅ |
| Explainability & Transparency | ✅ | ❌ | ✅ |
| Debug Tools | ✅ | ❌ | ✅ |
| Multimodal | ✅ | ❌ | ✅ |
| Audit Logs for Analytics | ✅ | ❌ | ✅ |
| Agent Work Scheduler | ❌ | ✅ | ✅ |
| **SECURITY** | | | |
| Constrained Alignment | ❌ | ❌ | ✅ |
| Data Encryption | ✅ | ❌ | ✅ |
| OAuth | ✅ | ❌ | ✅ |
| IP Control | ❌ | ❌ | ✅ |
| **COMPONENTS** | | | |
| Foundation AIs | ✅ | ❌ | ✅ |
| Huggingface AIs | ✅ | ❌ | ✅ |
| Zapier APIs | ❌ | ❌ | ✅ |
| All other APIs, RPA | ✅ | ❌ | ✅ |
| Classifiers | ✅ | ❌ | ✅ |
| Logic | ✅ | ❌ | ✅ |
| Data Lakes | ❌ | ❌ | ✅ |
| **DEPLOYMENT OPTIONS (EMBODIMENTS)** | | | |
| Deploy as API | ✅ | ❌ | ✅ |
| Deploy as Webhook | ❌ | ❌ | ✅ |
| Staging Domains | ❌ | ❌ | ✅ |
| Production Domains | ❌ | ❌ | ✅ |
| API Authentication (OAuth + Key) | ✅ | ❌ | ✅ |
| Deploy as Site Chat | ✅ | ❌ | ✅ |
| Deploy as Scheduled Agent | ❌ | ❌ | ✅ |
| Deploy as GPT | ✅ | ❌ | ✅ |
| **DATA LAKE SUPPORT** | | | |
| Hosted Vector Database | ❌ | ❌ | ✅ |
| Sitemap Crawler | ❌ | ❌ | ✅ |
| YouTube Transcript Crawler | ❌ | ❌ | ✅ |
| URL Crawler | ❌ | ❌ | ✅ |
| PDF Support | ✅ | ❌ | ✅ |
| Word File Support | ❌ | ❌ | ✅ |
| TXT File Support | ✅ | ❌ | ✅ |
Multi-modal outputs: Image & Text
https://python.langchain.com.cn/docs/use_cases/agents/multi_modal_output_agent
audio理解
https://js.langchain.com/v0.2/docs/how_to/tool_calls_multimodal/
https://js.langchain.ac.cn/v0.2/docs/how_to/tool_calls_multimodal/
// Fixed: the snippet used HumanMessage, ChatGoogleGenerativeAI, axios and z
// without importing them, so it could not run as pasted.
import { HumanMessage, SystemMessage } from "@langchain/core/messages";
import { tool } from "@langchain/core/tools";
import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
import axios from "axios";
import { z } from "zod";

// Tool that simply returns the summary it is given; the example only inspects
// the tool call the model emits.
const summaryTool = tool(
  (input) => {
    return input.summary;
  },
  {
    name: "summary_tool",
    description: "Log the summary of the content",
    schema: z.object({
      summary: z.string().describe("The summary of the content to log"),
    }),
  }
);

// Download the audio file as raw bytes and base64-encode it.
const audioUrl =
  "https://www.pacdv.com/sounds/people_sound_effects/applause-1.wav";

const axiosRes = await axios.get(audioUrl, { responseType: "arraybuffer" });

// Build a binary string one byte at a time, then encode it with btoa.
const base64 = btoa(
  new Uint8Array(axiosRes.data).reduce(
    (data, byte) => data + String.fromCharCode(byte),
    ""
  )
);

// Chat model with the summary tool bound.
const model = new ChatGoogleGenerativeAI({
  model: "gemini-1.5-pro-latest",
}).bindTools([summaryTool]);

// Send the audio as a "media" content part and ask for a tool-based summary.
const response = await model.invoke([
  new SystemMessage(
    "Summarize this content. always use the summary_tool in your response"
  ),
  new HumanMessage({
    content: [
      {
        type: "media",
        mimeType: "audio/wav",
        data: base64,
      },
    ],
  }),
]);

console.log(response.tool_calls);
How to use multimodal prompts
https://python.langchain.com/docs/how_to/multimodal_prompts/
https://blog.csdn.net/cheese_burger_/article/details/140351968
# Prompt with a system instruction and a user turn carrying two inline images.
# `image_data1` and `image_data2` are template variables filled in at format time.
# NOTE(review): `ChatPromptTemplate` is not imported in this snippet; presumably
# it comes from earlier setup code — confirm.
first_image = {
    "type": "image_url",
    "image_url": {"url": "data:image/jpeg;base64,{image_data1}"},
}
second_image = {
    "type": "image_url",
    "image_url": {"url": "data:image/jpeg;base64,{image_data2}"},
}

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "compare the two pictures provided"),
        ("user", [first_image, second_image]),
    ]
)
快速封装替换自定义LLM(基于自定义API或本地模型)
https://blog.csdn.net/HYY_2000/article/details/138339333
LangGraph: Multi-Agent Workflows
https://blog.langchain.dev/langgraph-multi-agent-workflows/
Other Frameworks
LangGraph is not the first framework to support multi-agent workflows. Most of the difference between these frameworks largely lies in the mental models and concepts they introduce.
Autogen
Autogen emerged as perhaps the first multi-agent framework. The biggest difference in mental model between LangGraph and Autogen is in construction of the agents. LangGraph prefers an approach where you explicitly define different agents and transition probabilities, preferring to represent it as a graph. Autogen frames it more as a "conversation". We believe that this "graph" framing makes it more intuitive and provides a better developer experience for constructing more complex and opinionated workflows where you really want to control the transition probabilities between nodes. It also supports workflows that aren't explicitly captured by "conversations."
Another key difference between Autogen and LangGraph is that LangGraph is fully integrated into the LangChain ecosystem, meaning you can take full advantage of all the LangChain integrations and LangSmith observability.
CrewAI
Another key framework we want to highlight is CrewAI. CrewAI has emerged recently as a popular way to create multi-agent "teams". Compared to LangGraph, CrewAI is a higher-level framework. In fact, we are actively working with the CrewAI team to integrate LangGraph into CrewAI! We think CrewAI has arrived at an awesome higher level DevEx and we want to support that.
CrewAI Unleashed: Future of AI Agent Teams
https://blog.langchain.dev/crewai-unleashed-future-of-ai-agent-teams/
CrewAI represents a shift in AI agents by offering a thin framework that leverages collaboration and roleplaying, based on versatility and efficiency. It stands as a tool for engineers and creatives alike, enabling the seamless assembly of AI agents into cohesive, high-performing teams.
Whether it's transforming a single thought into a fully-fledged landing page or conducting complex idea analysis, CrewAI is adept at handling a diverse range of tasks through its processes.
The real-world applications of CrewAI, from boosting social media presence to building interactive landing pages, underscore its practicality and adaptability. Looking forward, CrewAI is set to evolve further, introducing more intricate processes and continuing to redefine the landscape of AI teamwork. With its user-friendly integration and customization options, CrewAI is not just a concept but a tangible, powerful tool to harness the power of AI Agents.