From 5f1f641dba804231bfa999809427cd7c60d01183 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Tue, 1 Apr 2025 22:59:14 -0700
Subject: [PATCH] Make tools and RAG work together in chat responses

Route tool output and RAG context through the same chat request:
bracket retrieved documents with a configurable context tag instead of
the hard-coded [CONTEXT], pass the configured context window (num_ctx)
to every Ollama chat and embedding call, send the full LLM history
rather than only the last message, and only render tool metadata in
the UI when tools were actually queried.
---
 src/ketr-chat/src/App.tsx |  6 ++----
 src/server.py             | 40 ++++++++++++++++++++++-----------------
 src/utils/defines.py      |  7 +++++--
 src/utils/rag.py          |  2 +-
 4 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/ketr-chat/src/App.tsx b/src/ketr-chat/src/App.tsx
index 11bf9cb..3251c71 100644
--- a/src/ketr-chat/src/App.tsx
+++ b/src/ketr-chat/src/App.tsx
@@ -318,11 +318,9 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => {
         return <>
     }
-    console.log(JSON.stringify(metadata.tools[0].result, null, 2));
-
     return (<>
         {
-            metadata.tools !== undefined &&
+            metadata.tools !== undefined && metadata.tools.length !== 0 &&
             <div>
                 Tools queried:
 
                 {metadata.tools.map((tool: any, index: number) => <>
@@ -340,7 +338,7 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => {
 
         {
             metadata.rag.name !== undefined &&
-            <div>
-                RAG from '{metadata.rag.name}' collection matches against embedding vector of {metadata.rag.query_embedding.length} dimensions:
-            </div>
+            <div>
+                Top {metadata.rag.ids.length} RAG matches from '{metadata.rag.name}' collection against embedding vector of {metadata.rag.query_embedding.length} dimensions:
+            </div>
             {metadata.rag.ids.map((id: number, index: number) => <>
diff --git a/src/server.py b/src/server.py
index e07b66d..2d3c8cc 100644
--- a/src/server.py
+++ b/src/server.py
@@ -52,7 +52,10 @@ from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
 from fastapi.responses import JSONResponse, StreamingResponse, FileResponse, RedirectResponse
 from fastapi.middleware.cors import CORSMiddleware
 
-from utils import rag as Rag
+from utils import (
+    rag as Rag,
+    defines
+)
 
 from tools import (
     DateTime,
@@ -129,15 +132,14 @@ def system_info(model):
         "System RAM": get_installed_ram(),
         "Graphics Card": get_graphics_cards(),
         "CPU": get_cpu_info(),
-        "LLM Model": model
+        "LLM Model": model,
+        "Context length": defines.max_context
     }
 
 # %%
 # Defaults
-OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
-#MODEL_NAME = "deepseek-r1:7b"
-#MODEL_NAME = "llama3.2"
-MODEL_NAME = "qwen2.5:7b"
+OLLAMA_API_URL = defines.ollama_api_url
+MODEL_NAME = defines.model
 LOG_LEVEL="info"
 USE_TLS=False
 WEB_HOST="0.0.0.0"
@@ -145,19 +147,20 @@ WEB_PORT=5000
 
 # %%
 # Globals
+context_tag = "INFO"
 
 system_message = f"""
 Launched on {DateTime()}.
 
 When answering queries, follow these steps:
 1. First analyze the query to determine if real-time information might be helpful
-2. Even when [CONTEXT] is provided, consider whether the tools would provide more current or comprehensive information
+2. Even when [{context_tag}] is provided, consider whether the tools would provide more current or comprehensive information
 3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
-4. When both [CONTEXT] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
-5. Always prioritize the most up-to-date and relevant information, whether it comes from [CONTEXT] or tools
-6. If [CONTEXT] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
+4. When both [{context_tag}] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
+5. Always prioritize the most up-to-date and relevant information, whether it comes from [{context_tag}] or tools
+6. If [{context_tag}] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
 
-Always use tools and [CONTEXT] when possible. Be concise, and never make up information. If you do not know the answer, say so.
+Always use tools and [{context_tag}] when possible. Be concise, and never make up information. If you do not know the answer, say so.
 """.strip()
 
 tool_log = []
@@ -719,23 +722,26 @@ class WebServer:
                 metadata["rag"] = { "name": rag["name"], **chroma_results }
                 preamble = ""
                 if len(rag_docs):
-                    preamble = "In addition to real-time tools, use the following context to answer the question:\n[CONTEXT]:\n"
+                    preamble = f"""
+1. Respond to this query: {content}
+2. If there is information in this context to enhance the answer, do so:
+[{context_tag}]:\n"""
                     for doc in rag_docs:
                         preamble += doc
-                    preamble += "\n[/CONTEXT]\nHuman: "
+                    preamble += f"\n[/{context_tag}]\nUse all of that information to respond to: "
 
         # Figure
         llm_history.append({"role": "user", "content": preamble + content})
         user_history.append({"role": "user", "content": content})
 
-        messages = context["system"] + llm_history[-1:]
-
+        messages = context["system"] + llm_history
 
         try:
             yield {"status": "processing", "message": "Processing request..."}
 
             # Use the async generator in an async for loop
-            response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]))
+            response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]), options={ 'num_ctx': defines.max_context })
+
             tools_used = []
 
             yield {"status": "processing", "message": "Initial response received..."}
@@ -775,7 +781,7 @@ class WebServer:
             metadata["tools"] = tools_used
 
             yield {"status": "processing", "message": "Generating final response..."}
-            response = self.client.chat(model=self.model, messages=messages, stream=False)
+            response = self.client.chat(model=self.model, messages=messages, stream=False, options={ 'num_ctx': defines.max_context })
 
             reply = response['message']['content']
             final_message = {"role": "assistant", "content": reply }
diff --git a/src/utils/defines.py b/src/utils/defines.py
index 7ba8219..6066eff 100644
--- a/src/utils/defines.py
+++ b/src/utils/defines.py
@@ -1,4 +1,7 @@
 ollama_api_url="http://ollama:11434" # Default Ollama local endpoint
-model="qwen2.5:7b"
+#model = "deepseek-r1:7b"
+model = "llama3.2"
+#model = "qwen2.5:7b"
 encoding_model="mxbai-embed-large"
-persist_directory="./chromadb"
\ No newline at end of file
+persist_directory="./chromadb"
+max_context = 2048*8  # 16K-token num_ctx window passed to Ollama calls
\ No newline at end of file
diff --git a/src/utils/rag.py b/src/utils/rag.py
index f86340c..281113d 100644
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -59,7 +59,7 @@ def get_vector_collection(path=defines.persist_directory, name="documents"):
 
 # Function to generate embeddings using Ollama
 def get_embedding(llm, text):
-    response = llm.embeddings(model=defines.model, prompt=text)
+    response = llm.embeddings(model=defines.model, prompt=text, options={ 'num_ctx': defines.max_context })
     return response["embedding"]
 
 def add_embeddings_to_collection(llm, collection, chunks):
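
Post-patch note, not part of the diff: the core pattern this change applies
is (a) folding RAG matches into the user turn between configurable context
markers and (b) passing defines.max_context as num_ctx on every Ollama call.
Below is a minimal standalone sketch of that pattern using the ollama Python
client; the host, model name, and helper names are illustrative stand-ins,
not code from this repository.

    # Sketch of the RAG + num_ctx flow from server.py, reduced to its shape.
    # Assumes a reachable Ollama server; values mirror src/utils/defines.py.
    from ollama import Client

    MAX_CONTEXT = 2048 * 8   # mirrors defines.max_context
    CONTEXT_TAG = "INFO"     # mirrors context_tag in server.py

    client = Client(host="http://ollama:11434")

    def build_prompt(content: str, rag_docs: list[str]) -> str:
        # Fold retrieved documents into the user turn, bracketed by the
        # context tag, then restate the query after the context block.
        preamble = (
            f"1. Respond to this query: {content}\n"
            f"2. If there is information in this context to enhance the answer, do so:\n"
            f"[{CONTEXT_TAG}]:\n"
        )
        preamble += "".join(rag_docs)
        preamble += f"\n[/{CONTEXT_TAG}]\nUse all of that information to respond to: "
        return preamble + content

    def chat(content: str, rag_docs: list[str]) -> str:
        # Without an explicit num_ctx, Ollama falls back to the model's
        # default context window, which can silently truncate long RAG
        # preambles before the model ever sees the query.
        response = client.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": build_prompt(content, rag_docs)}],
            options={"num_ctx": MAX_CONTEXT},
        )
        return response["message"]["content"]

Restating the query after the context block, as the patch does, helps the
model treat the retrieved documents as supporting material rather than as
the question itself.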