RAG-Multi-Modal-Generative-AI-Agent
https://github.com/ganeshnehru/RAG-Multi-Modal-Generative-AI-Agent/tree/main
A router dispatches each request to the appropriate agent.
A multimodal RAG-based generative AI digital assistant that combines text generation, vision QA, and code generation.
Agent-Nesh is a Retrieval-Augmented Generation (RAG)-based multi-modal AI assistant that leverages advanced AI models to provide intelligent, context-aware responses to various types of input including text, images, code, and voice. This project uses the following models:
- Text Assistance: Handle general text-based queries.
- Code Assistance: Provide coding assistance and help with code-related queries.
- Image Analysis: Analyze and describe images.
- Voice Recognition: Convert spoken language into text.
Router assistant
import re
import logging

from chains.code_assistant import CodeAssistant
from chains.language_assistant import LanguageAssistant
from chains.vision_assistant import VisionAssistant

# Keywords that signal a coding-related query: general programming terms plus
# language names. Defined at module level (with the compiled pattern below) so
# the regex is built once, not on every is_code_related() call.
_CODE_KEYWORDS = [
    'function', 'class', 'def', 'import', 'print', 'variable',
    'loop', 'array', 'list', 'dictionary', 'exception', 'error', 'bug',
    'code', 'compile', 'execute', 'algorithm', 'data structure', 'java', 'python', 'javascript', 'c++',
    'c#', 'ruby', 'php', 'html', 'css', 'sql', 'swift', 'kotlin', 'go', 'rust', 'typescript', 'r', 'perl',
    'scala', 'shell', 'bash', 'powershell', 'objective-c', 'matlab', 'groovy', 'lua', 'dart', 'cobol',
    'fortran', 'haskell', 'lisp', 'pascal', 'prolog', 'scheme', 'smalltalk', 'verilog', 'vhdl',
    'assembly', 'coffeescript', 'f#', 'julia', 'racket', 'scratch', 'solidity', 'vba', 'abap', 'apex',
    'awk', 'clojure', 'd', 'elixir', 'erlang', 'forth', 'hack', 'idris', 'j', 'kdb+', 'labview',
    'logtalk', 'lolcode', 'mumps', 'nim', 'ocaml', 'pl/i', 'postscript', 'rpg', 'sas', 'sml',
    'tcl', 'turing', 'unicon', 'x10', 'xquery', 'zsh',
]
_CODE_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in _CODE_KEYWORDS) + r')\b',
    re.IGNORECASE,
)


class AssistantRouter:
    """Dispatch user input to the code, language, or vision assistant."""

    def __init__(self):
        self.code_assistant = CodeAssistant()
        self.language_assistant = LanguageAssistant()
        self.vision_assistant = VisionAssistant()

    def route_input(self, user_input='', image_path=None):
        """
        Route the input to the appropriate assistant based on the content
        of the user input.

        :param user_input: str, The input text from the user.
        :param image_path: str, Path to an image file if provided.
        :return: tuple, (response, assistant name). On failure the response
                 is ``{"content": "Error: ..."}`` and the name is ``'Error'``.
        """
        try:
            if image_path:
                # Any image goes to the vision assistant; its invoke() expects
                # the question and the base64 payload joined with '|'.
                image_b64 = self.vision_assistant.process_image(image_path)
                if image_b64 is None:
                    raise ValueError("Failed to process image.")
                input_string = f"{user_input}|{image_b64}"
                response = self.vision_assistant.invoke(input_string)
                return response, 'VisionAssistant'

            if self.is_code_related(user_input):
                response = self.code_assistant.invoke(user_input)
                return response, 'CodeAssistant'

            response = self.language_assistant.invoke(user_input)
            return response, 'LanguageAssistant'
        except Exception as e:
            logging.error(f"Error in AssistantRouter.route_input: {e}")
            return {"content": f"Error: {str(e)}"}, 'Error'

    def is_code_related(self, text):
        """
        Determine if the text input is related to coding.

        :param text: str, The input text.
        :return: bool, True if the text is code related, False otherwise.
        """
        # Basic keyword-based detection against the precompiled pattern.
        return bool(_CODE_PATTERN.search(text))
Vision assistant
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from chains.memory import central_memory
from PIL import Image, ImageOps
import base64
import io
from dotenv import load_dotenv
import logging

load_dotenv()


class VisionAssistant:
    """Assistant that analyzes, describes, and answers questions about images
    using an NVIDIA-hosted vision model."""

    def __init__(self, model_name="microsoft/phi-3-vision-128k-instruct"):
        self.chat_model = ChatNVIDIA(model=model_name)
        self.system_prompt = """You are an AI vision assistant specialized in analyzing, describing and answering questions about images. You are accurately able to describe the contents of an image, including objects, actions, and scenes."""
        self.human_prompt_template = ChatPromptTemplate.from_messages([
            ("system", self.system_prompt),
            ("user", "{input}")
        ])
        self.chain = self.human_prompt_template | self.chat_model | StrOutputParser()
        self.memory = central_memory

    def process_image(self, image_path, desired_size=256):
        """
        Load an image, fit it into a desired_size square (padded with white),
        and return it as a base64-encoded PNG string.

        :param image_path: str, Path to the image file on disk.
        :param desired_size: int, Side length in pixels of the square output.
        :return: str or None, Base64-encoded PNG, or None if processing failed.
        """
        try:
            with Image.open(image_path) as image:
                # Shrink in place preserving aspect ratio, then pad to a
                # square so the model always receives a fixed-size input.
                # (No need to convert to PNG first: resampling operates on
                # the decoded pixels regardless of the source format.)
                image.thumbnail((desired_size, desired_size), Image.Resampling.LANCZOS)
                padded = ImageOps.pad(image, (desired_size, desired_size), color="white")
                buffered = io.BytesIO()
                padded.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        except Exception as e:
            logging.error(f"Error in VisionAssistant.process_image: {e}")
            return None

    def invoke(self, input_string):
        """
        Answer a question about an image.

        :param input_string: str, In the format 'text|base64_image' — the
                             user's question and the base64-encoded PNG.
        :return: str, The model's answer; or dict {"error": ...} on failure.
        """
        try:
            if '|' not in input_string:
                raise ValueError("Input must be in the format 'text|base64_image'.")
            text_input, image_b64 = input_string.split('|', 1)
            input_message = [
                {"type": "text", "text": text_input},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
            ]
            result = self.chat_model.invoke([HumanMessage(content=input_message)])
            self.add_to_memory(text_input, result.content)  # Save the interaction to memory
            return result.content
        except Exception as e:
            logging.error(f"Error in VisionAssistant.invoke: {e}")
            return {"error": str(e)}

    def add_to_memory(self, text_input, response):
        """
        Add the interaction to the memory.

        :param text_input: str, The input text from the user.
        :param response: str, The response from the assistant.
        """
        self.memory.save_context({'input': text_input}, {'response': response})
出处:http://www.cnblogs.com/lightsong/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接。