Tools and RAG are working together!!
parent cf29c85449
commit 5f1f641dba
@@ -318,11 +318,9 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => {
return <></>
}

-console.log(JSON.stringify(metadata.tools[0].result, null, 2));
-
return (<>
{
-metadata.tools !== undefined &&
+metadata.tools !== undefined && metadata.tools.length !== 0 &&
<Typography sx={{ marginBottom: 2 }}>
<p>Tools queried:</p>
{metadata.tools.map((tool: any, index: number) => <>
@@ -340,7 +338,7 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => {
{
metadata.rag.name !== undefined &&
<Typography sx={{ marginBottom: 2 }}>
-<p>RAG from '{metadata.rag.name}' collection matches against embedding vector of {metadata.rag.query_embedding.length} dimensions:</p>
+<p>Top RAG {metadata.rag.ids.length} matches from '{metadata.rag.name}' collection against embedding vector of {metadata.rag.query_embedding.length} dimensions:</p>
{metadata.rag.ids.map((id: number, index: number) => <>
<Divider />
<Box sx={{ fontSize: "0.75rem", display: "flex", flexDirection: "row", mb: 0.5, mt: 0.5 }} key={index}>
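Read together with the server hunks below, the `metadata` prop this component renders is just the JSON the backend attaches to each reply. A hedged sketch of its shape, written as the Python dict the server appears to build (the field names come from the diff; the nesting of each tool entry is inferred):

```python
# Inferred shape only -- pieced together from the fields MessageMeta reads
# and the metadata["tools"] / metadata["rag"] assignments further down.
metadata = {
    "tools": [
        {"name": "DateTime", "result": {...}},  # one entry per tool call; "result" is what
                                                # metadata.tools[0].result displayed above
    ],
    "rag": {
        "name": "documents",        # rag["name"]: which Chroma collection was queried
        "ids": [...],               # match ids (Chroma also returns documents/distances)
        "query_embedding": [...],   # the embedding vector whose length is shown in the UI
    },
}
```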
@@ -52,7 +52,10 @@ from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse, RedirectResponse
from fastapi.middleware.cors import CORSMiddleware

-from utils import rag as Rag
+from utils import (
+rag as Rag,
+defines
+)

from tools import (
DateTime,
@@ -129,15 +132,14 @@ def system_info(model):
"System RAM": get_installed_ram(),
"Graphics Card": get_graphics_cards(),
"CPU": get_cpu_info(),
-"LLM Model": model
+"LLM Model": model,
+"Context length": defines.max_context
}

# %%
# Defaults
-OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
-#MODEL_NAME = "deepseek-r1:7b"
-#MODEL_NAME = "llama3.2"
-MODEL_NAME = "qwen2.5:7b"
+OLLAMA_API_URL = defines.ollama_api_url
+MODEL_NAME = defines.model
LOG_LEVEL="info"
USE_TLS=False
WEB_HOST="0.0.0.0"
@@ -145,19 +147,20 @@ WEB_PORT=5000

# %%
# Globals
+context_tag = "INFO"
system_message = f"""
Launched on {DateTime()}.

When answering queries, follow these steps:

1. First analyze the query to determine if real-time information might be helpful
-2. Even when [CONTEXT] is provided, consider whether the tools would provide more current or comprehensive information
+2. Even when [{context_tag}] is provided, consider whether the tools would provide more current or comprehensive information
3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
-4. When both [CONTEXT] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
-5. Always prioritize the most up-to-date and relevant information, whether it comes from [CONTEXT] or tools
-6. If [CONTEXT] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
+4. When both [{context_tag}] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
+5. Always prioritize the most up-to-date and relevant information, whether it comes from [{context_tag}] or tools
+6. If [{context_tag}] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data

-Always use tools and [CONTEXT] when possible. Be concise, and never make up information. If you do not know the answer, say so.
+Always use tools and [{context_tag}] when possible. Be concise, and never make up information. If you do not know the answer, say so.
""".strip()

tool_log = []
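A side note on the `context_tag` line added above: `system_message` is an f-string, so the tag (and the `DateTime()` launch stamp) are interpolated once, when the module loads, which is why the assignment has to precede it. A minimal standalone illustration, not the project's code:

```python
# f-strings evaluate at assignment time, so the tag value is frozen into the prompt.
context_tag = "INFO"
system_message = f"Even when [{context_tag}] is provided, consider whether the tools help."
context_tag = "SOMETHING_ELSE"  # rebinding later does not change system_message
print(system_message)           # Even when [INFO] is provided, consider whether the tools help.
```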
@@ -719,23 +722,26 @@ class WebServer:
metadata["rag"] = { "name": rag["name"], **chroma_results }
preamble = ""
if len(rag_docs):
-preamble = "In addition to real-time tools, use the following context to answer the question:\n[CONTEXT]:\n"
+preamble = f"""
+1. Respond to this query: {content}
+2. If there is information in this context to enhance the answer, do so:
+[{context_tag}]:\n"""
for doc in rag_docs:
preamble += doc
-preamble += "\n[/CONTEXT]\nHuman: "
+preamble += f"\n[/{context_tag}]\nUse all of that information to respond to: "

# Figure
llm_history.append({"role": "user", "content": preamble + content})
user_history.append({"role": "user", "content": content})

-messages = context["system"] + llm_history[-1:]
+messages = context["system"] + llm_history

try:
yield {"status": "processing", "message": "Processing request..."}

# Use the async generator in an async for loop
-response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]))
+response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]), options={ 'num_ctx': defines.max_context })

tools_used = []

yield {"status": "processing", "message": "Initial response received..."}
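The tool-dispatch code that runs between this first `chat()` call and the final one in the next hunk is not part of this diff. As orientation only, here is a rough sketch of the shape such a loop usually takes with the ollama client; `tools_used` and `messages` come from the diff, while `TOOL_REGISTRY` and the exact message fields are assumptions:

```python
# Rough sketch, not the project's dispatch code.
messages.append(response["message"])                    # keep the assistant turn with its tool calls
for call in response["message"].get("tool_calls", []):  # empty list when no tool was requested
    name = call["function"]["name"]
    args = call["function"]["arguments"]
    result = TOOL_REGISTRY[name](**args)                # hypothetical name -> callable mapping
    tools_used.append({"name": name, "args": args, "result": result})
    messages.append({"role": "tool", "content": str(result)})  # feed the result back to the model
```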
@@ -775,7 +781,7 @@ class WebServer:
metadata["tools"] = tools_used

yield {"status": "processing", "message": "Generating final response..."}
-response = self.client.chat(model=self.model, messages=messages, stream=False)
+response = self.client.chat(model=self.model, messages=messages, stream=False, options={ 'num_ctx': defines.max_context })

reply = response['message']['content']
final_message = {"role": "assistant", "content": reply }
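Both `chat()` calls, and the `embeddings()` call in the last hunk, now pass `options={ 'num_ctx': defines.max_context }`. With the ollama Python client, `options` carries Ollama runtime parameters, and `num_ctx` sets the context-window size for that request. A self-contained sketch; the host URL mirrors the defines below and the prompt is made up:

```python
import ollama

client = ollama.Client(host="http://ollama:11434")      # same endpoint as defines.ollama_api_url
response = client.chat(
    model="qwen2.5:7b",
    messages=[{"role": "user", "content": "Say hello."}],
    options={"num_ctx": 2048 * 8},                       # 16384-token context window
)
print(response["message"]["content"])
```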
@@ -1,4 +1,7 @@
ollama_api_url="http://ollama:11434" # Default Ollama local endpoint
-model="qwen2.5:7b"
+#model = "deepseek-r1:7b"
+model = "llama3.2"
+#model="qwen2.5:7b"
encoding_model="mxbai-embed-large"
-persist_directory="./chromadb"
+persist_directory="./chromadb"
+max_context = 2048*8
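Putting the kept and added lines together, the defines module after this commit presumably reads roughly as below (a reconstruction from this hunk, not a verbatim copy of the file):

```python
# Reconstruction of the defines module (likely utils/defines.py) from the hunk above.
ollama_api_url = "http://ollama:11434"  # Default Ollama local endpoint
#model = "deepseek-r1:7b"
model = "llama3.2"
#model = "qwen2.5:7b"
encoding_model = "mxbai-embed-large"
persist_directory = "./chromadb"
max_context = 2048 * 8                  # num_ctx handed to every chat()/embeddings() call
```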
@@ -59,7 +59,7 @@ def get_vector_collection(path=defines.persist_directory, name="documents"):

# Function to generate embeddings using Ollama
def get_embedding(llm, text):
-response = llm.embeddings(model=defines.model, prompt=text)
+response = llm.embeddings(model=defines.model, prompt=text, options={ 'num_ctx': defines.max_context })
return response["embedding"]

def add_embeddings_to_collection(llm, collection, chunks):
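The `rag` block that MessageMeta renders starts from an embedding produced by this `get_embedding()` and a Chroma query against the collection. A hedged sketch of that flow using the functions visible in this diff; the query text and `n_results` value are illustrative only:

```python
# Hedged sketch: embed a query with Ollama, then match it against the Chroma collection.
import ollama
from utils import rag as Rag, defines

llm = ollama.Client(host=defines.ollama_api_url)
collection = Rag.get_vector_collection(name="documents")
query_embedding = Rag.get_embedding(llm, "What tools do you have?")   # illustrative query
chroma_results = collection.query(query_embeddings=[query_embedding], n_results=5)
# chroma_results holds "ids", "documents", "distances", ... (one inner list per query);
# the server folds these into metadata["rag"] alongside "name" and the query embedding.
```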