From 5f1f641dba804231bfa999809427cd7c60d01183 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Tue, 1 Apr 2025 22:59:14 -0700
Subject: [PATCH] Make tools and RAG work together in chat responses

Route tool output and RAG context through the same chat request:
bracket retrieved documents with a configurable context tag instead of
the hard-coded [CONTEXT], pass the configured context window (num_ctx)
to every Ollama chat and embedding call, send the full LLM history
rather than only the last message, and only render tool metadata in
the UI when tools were actually queried.
---
 src/ketr-chat/src/App.tsx |  6 ++----
 src/server.py             | 40 ++++++++++++++++++++++-----------------
 src/utils/defines.py      |  7 +++++--
 src/utils/rag.py          |  2 +-
 4 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/ketr-chat/src/App.tsx b/src/ketr-chat/src/App.tsx
index 11bf9cb..3251c71 100644
--- a/src/ketr-chat/src/App.tsx
+++ b/src/ketr-chat/src/App.tsx
@@ -318,11 +318,9 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => {
         return <>
     }
-    console.log(JSON.stringify(metadata.tools[0].result, null, 2));
-
     return (<>
         {
-            metadata.tools !== undefined &&
+            metadata.tools !== undefined && metadata.tools.length !== 0 &&
             <div>
                 Tools queried:
 
                 {metadata.tools.map((tool: any, index: number) => <>
@@ -340,7 +338,7 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => {
 
         {
             metadata.rag.name !== undefined &&
-            <div>
-                RAG from '{metadata.rag.name}' collection matches against embedding vector of {metadata.rag.query_embedding.length} dimensions:
-            </div>
+            <div>
+                Top {metadata.rag.ids.length} RAG matches from '{metadata.rag.name}' collection against embedding vector of {metadata.rag.query_embedding.length} dimensions:
+            </div>
             {metadata.rag.ids.map((id: number, index: number) => <>
diff --git a/src/server.py b/src/server.py
index e07b66d..2d3c8cc 100644
--- a/src/server.py
+++ b/src/server.py
@@ -52,7 +52,10 @@ from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
 from fastapi.responses import JSONResponse, StreamingResponse, FileResponse, RedirectResponse
 from fastapi.middleware.cors import CORSMiddleware
 
-from utils import rag as Rag
+from utils import (
+    rag as Rag,
+    defines
+)
 
 from tools import (
     DateTime,
@@ -129,15 +132,14 @@ def system_info(model):
         "System RAM": get_installed_ram(),
         "Graphics Card": get_graphics_cards(),
         "CPU": get_cpu_info(),
-        "LLM Model": model
+        "LLM Model": model,
+        "Context length": defines.max_context
     }
 
 # %%
 # Defaults
-OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
-#MODEL_NAME = "deepseek-r1:7b"
-#MODEL_NAME = "llama3.2"
-MODEL_NAME = "qwen2.5:7b"
+OLLAMA_API_URL = defines.ollama_api_url
+MODEL_NAME = defines.model
 LOG_LEVEL="info"
 USE_TLS=False
 WEB_HOST="0.0.0.0"
@@ -145,19 +147,20 @@ WEB_PORT=5000
 
 # %%
 # Globals
+context_tag = "INFO"
 
 system_message = f"""
 Launched on {DateTime()}.
 
 When answering queries, follow these steps:
 1. First analyze the query to determine if real-time information might be helpful
-2. Even when [CONTEXT] is provided, consider whether the tools would provide more current or comprehensive information
+2. Even when [{context_tag}] is provided, consider whether the tools would provide more current or comprehensive information
 3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
-4. When both [CONTEXT] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
-5. Always prioritize the most up-to-date and relevant information, whether it comes from [CONTEXT] or tools
-6. If [CONTEXT] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
+4. When both [{context_tag}] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
+5. Always prioritize the most up-to-date and relevant information, whether it comes from [{context_tag}] or tools
+6. If [{context_tag}] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
 
-Always use tools and [CONTEXT] when possible. Be concise, and never make up information. If you do not know the answer, say so.
+Always use tools and [{context_tag}] when possible. Be concise, and never make up information. If you do not know the answer, say so.
 """.strip()
 
 tool_log = []
@@ -719,23 +722,26 @@ class WebServer:
                 metadata["rag"] = { "name": rag["name"], **chroma_results }
                 preamble = ""
                 if len(rag_docs):
-                    preamble = "In addition to real-time tools, use the following context to answer the question:\n[CONTEXT]:\n"
+                    preamble = f"""
+1. Respond to this query: {content}
+2. If there is information in this context to enhance the answer, do so:
+[{context_tag}]:\n"""
                     for doc in rag_docs:
                         preamble += doc
-                    preamble += "\n[/CONTEXT]\nHuman: "
+                    preamble += f"\n[/{context_tag}]\nUse all of that information to respond to: "
 
         # Figure
         llm_history.append({"role": "user", "content": preamble + content})
         user_history.append({"role": "user", "content": content})
 
-        messages = context["system"] + llm_history[-1:]
-
+        messages = context["system"] + llm_history
 
         try:
             yield {"status": "processing", "message": "Processing request..."}
 
             # Use the async generator in an async for loop
-            response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]))
+            response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]), options={ 'num_ctx': defines.max_context })
+
             tools_used = []
 
             yield {"status": "processing", "message": "Initial response received..."}
@@ -775,7 +781,7 @@ class WebServer:
             metadata["tools"] = tools_used
 
             yield {"status": "processing", "message": "Generating final response..."}
-            response = self.client.chat(model=self.model, messages=messages, stream=False)
+            response = self.client.chat(model=self.model, messages=messages, stream=False, options={ 'num_ctx': defines.max_context })
 
             reply = response['message']['content']
             final_message = {"role": "assistant", "content": reply }
diff --git a/src/utils/defines.py b/src/utils/defines.py
index 7ba8219..6066eff 100644
--- a/src/utils/defines.py
+++ b/src/utils/defines.py
@@ -1,4 +1,7 @@
 ollama_api_url="http://ollama:11434" # Default Ollama local endpoint
-model="qwen2.5:7b"
+#model = "deepseek-r1:7b"
+model = "llama3.2"
+#model = "qwen2.5:7b"
 encoding_model="mxbai-embed-large"
-persist_directory="./chromadb"
\ No newline at end of file
+persist_directory="./chromadb"
+max_context = 2048*8  # 16K-token num_ctx window passed to Ollama calls
\ No newline at end of file
diff --git a/src/utils/rag.py b/src/utils/rag.py
index f86340c..281113d 100644
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -59,7 +59,7 @@ def get_vector_collection(path=defines.persist_directory, name="documents"):
 
 # Function to generate embeddings using Ollama
 def get_embedding(llm, text):
-    response = llm.embeddings(model=defines.model, prompt=text)
+    response = llm.embeddings(model=defines.model, prompt=text, options={ 'num_ctx': defines.max_context })
     return response["embedding"]
 
 def add_embeddings_to_collection(llm, collection, chunks):
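
Post-patch note, not part of the diff: the core pattern this change applies
is (a) folding RAG matches into the user turn between configurable context
markers and (b) passing defines.max_context as num_ctx on every Ollama call.
Below is a minimal standalone sketch of that pattern using the ollama Python
client; the host, model name, and helper names are illustrative stand-ins,
not code from this repository.

    # Sketch of the RAG + num_ctx flow from server.py, reduced to its shape.
    # Assumes a reachable Ollama server; values mirror src/utils/defines.py.
    from ollama import Client

    MAX_CONTEXT = 2048 * 8   # mirrors defines.max_context
    CONTEXT_TAG = "INFO"     # mirrors context_tag in server.py

    client = Client(host="http://ollama:11434")

    def build_prompt(content: str, rag_docs: list[str]) -> str:
        # Fold retrieved documents into the user turn, bracketed by the
        # context tag, then restate the query after the context block.
        preamble = (
            f"1. Respond to this query: {content}\n"
            f"2. If there is information in this context to enhance the answer, do so:\n"
            f"[{CONTEXT_TAG}]:\n"
        )
        preamble += "".join(rag_docs)
        preamble += f"\n[/{CONTEXT_TAG}]\nUse all of that information to respond to: "
        return preamble + content

    def chat(content: str, rag_docs: list[str]) -> str:
        # Without an explicit num_ctx, Ollama falls back to the model's
        # default context window, which can silently truncate long RAG
        # preambles before the model ever sees the query.
        response = client.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": build_prompt(content, rag_docs)}],
            options={"num_ctx": MAX_CONTEXT},
        )
        return response["message"]["content"]

Restating the query after the context block, as the patch does, helps the
model treat the retrieved documents as supporting material rather than as
the question itself.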