RAG-Multi-Modal-Generative-AI-Agent
https://github.com/ganeshnehru/RAG-Multi-Modal-Generative-AI-Agent/tree/main
A router dispatches each request to the appropriate agent.
A multimodal RAG-based generative AI digital assistant that combines text generation, vision QA, and code generation.
Agent-Nesh is a Retrieval-Augmented Generation (RAG)-based multi-modal AI assistant that leverages advanced AI models to provide intelligent, context-aware responses to various types of input including text, images, code, and voice. This project uses the following models:
- Text Assistance: Handle general text-based queries.
- Code Assistance: Provide coding assistance and help with code-related queries.
- Image Analysis: Analyze and describe images.
- Voice Recognition: Convert spoken language into text.
Router assistant
import re
import logging

from chains.code_assistant import CodeAssistant
from chains.language_assistant import LanguageAssistant
from chains.vision_assistant import VisionAssistant

# Keywords that signal a coding-related query: general programming terms plus
# language names. Defined at module level (with the compiled pattern below) so
# the regex is built once, not on every is_code_related() call.
_CODE_KEYWORDS = [
    'function', 'class', 'def', 'import', 'print', 'variable',
    'loop', 'array', 'list', 'dictionary', 'exception', 'error', 'bug',
    'code', 'compile', 'execute', 'algorithm', 'data structure', 'java', 'python', 'javascript', 'c++',
    'c#', 'ruby', 'php', 'html', 'css', 'sql', 'swift', 'kotlin', 'go', 'rust', 'typescript', 'r', 'perl',
    'scala', 'shell', 'bash', 'powershell', 'objective-c', 'matlab', 'groovy', 'lua', 'dart', 'cobol',
    'fortran', 'haskell', 'lisp', 'pascal', 'prolog', 'scheme', 'smalltalk', 'verilog', 'vhdl',
    'assembly', 'coffeescript', 'f#', 'julia', 'racket', 'scratch', 'solidity', 'vba', 'abap', 'apex',
    'awk', 'clojure', 'd', 'elixir', 'erlang', 'forth', 'hack', 'idris', 'j', 'kdb+', 'labview',
    'logtalk', 'lolcode', 'mumps', 'nim', 'ocaml', 'pl/i', 'postscript', 'rpg', 'sas', 'sml',
    'tcl', 'turing', 'unicon', 'x10', 'xquery', 'zsh',
]
_CODE_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in _CODE_KEYWORDS) + r')\b',
    re.IGNORECASE,
)


class AssistantRouter:
    """Dispatch user input to the code, language, or vision assistant."""

    def __init__(self):
        self.code_assistant = CodeAssistant()
        self.language_assistant = LanguageAssistant()
        self.vision_assistant = VisionAssistant()

    def route_input(self, user_input='', image_path=None):
        """
        Route the input to the appropriate assistant based on the content
        of the user input.

        :param user_input: str, The input text from the user.
        :param image_path: str, Path to an image file if provided.
        :return: tuple, (response, assistant name). On failure the response
                 is ``{"content": "Error: ..."}`` and the name is ``'Error'``.
        """
        try:
            if image_path:
                # Any image goes to the vision assistant; its invoke() expects
                # the question and the base64 payload joined with '|'.
                image_b64 = self.vision_assistant.process_image(image_path)
                if image_b64 is None:
                    raise ValueError("Failed to process image.")
                input_string = f"{user_input}|{image_b64}"
                response = self.vision_assistant.invoke(input_string)
                return response, 'VisionAssistant'

            if self.is_code_related(user_input):
                response = self.code_assistant.invoke(user_input)
                return response, 'CodeAssistant'

            response = self.language_assistant.invoke(user_input)
            return response, 'LanguageAssistant'
        except Exception as e:
            logging.error(f"Error in AssistantRouter.route_input: {e}")
            return {"content": f"Error: {str(e)}"}, 'Error'

    def is_code_related(self, text):
        """
        Determine if the text input is related to coding.

        :param text: str, The input text.
        :return: bool, True if the text is code related, False otherwise.
        """
        # Basic keyword-based detection against the precompiled pattern.
        return bool(_CODE_PATTERN.search(text))
Vision assistant
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from chains.memory import central_memory
from PIL import Image, ImageOps
import base64
import io
from dotenv import load_dotenv
import logging

load_dotenv()


class VisionAssistant:
    """Assistant that analyzes, describes, and answers questions about images
    using an NVIDIA-hosted vision model."""

    def __init__(self, model_name="microsoft/phi-3-vision-128k-instruct"):
        self.chat_model = ChatNVIDIA(model=model_name)
        self.system_prompt = """You are an AI vision assistant specialized in analyzing, describing and answering questions about images. You are accurately able to describe the contents of an image, including objects, actions, and scenes."""
        self.human_prompt_template = ChatPromptTemplate.from_messages([
            ("system", self.system_prompt),
            ("user", "{input}")
        ])
        self.chain = self.human_prompt_template | self.chat_model | StrOutputParser()
        self.memory = central_memory

    def process_image(self, image_path, desired_size=256):
        """
        Load an image, fit it into a desired_size square (padded with white),
        and return it as a base64-encoded PNG string.

        :param image_path: str, Path to the image file on disk.
        :param desired_size: int, Side length in pixels of the square output.
        :return: str or None, Base64-encoded PNG, or None if processing failed.
        """
        try:
            with Image.open(image_path) as image:
                # Shrink in place preserving aspect ratio, then pad to a
                # square so the model always receives a fixed-size input.
                # (No need to convert to PNG first: resampling operates on
                # the decoded pixels regardless of the source format.)
                image.thumbnail((desired_size, desired_size), Image.Resampling.LANCZOS)
                padded = ImageOps.pad(image, (desired_size, desired_size), color="white")
                buffered = io.BytesIO()
                padded.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        except Exception as e:
            logging.error(f"Error in VisionAssistant.process_image: {e}")
            return None

    def invoke(self, input_string):
        """
        Answer a question about an image.

        :param input_string: str, In the format 'text|base64_image' — the
                             user's question and the base64-encoded PNG.
        :return: str, The model's answer; or dict {"error": ...} on failure.
        """
        try:
            if '|' not in input_string:
                raise ValueError("Input must be in the format 'text|base64_image'.")
            text_input, image_b64 = input_string.split('|', 1)
            input_message = [
                {"type": "text", "text": text_input},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
            ]
            result = self.chat_model.invoke([HumanMessage(content=input_message)])
            self.add_to_memory(text_input, result.content)  # Save the interaction to memory
            return result.content
        except Exception as e:
            logging.error(f"Error in VisionAssistant.invoke: {e}")
            return {"error": str(e)}

    def add_to_memory(self, text_input, response):
        """
        Add the interaction to the memory.

        :param text_input: str, The input text from the user.
        :param response: str, The response from the assistant.
        """
        self.memory.save_context({'input': text_input}, {'response': response})
出处:http://www.cnblogs.com/lightsong/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接。