diff --git a/src/ketr-chat/src/App.css b/src/ketr-chat/src/App.css index 723d482..25088e1 100644 --- a/src/ketr-chat/src/App.css +++ b/src/ketr-chat/src/App.css @@ -192,3 +192,12 @@ div { margin-bottom: 0; font-size: 0.9rem; } + +.PromptStats .MuiTableCell-root { + font-size: 0.8rem; +} + +#SystemPromptInput { + font-size: 0.9rem; + line-height: 1.25rem; +} \ No newline at end of file diff --git a/src/ketr-chat/src/App.tsx b/src/ketr-chat/src/App.tsx index cae062f..0f36883 100644 --- a/src/ketr-chat/src/App.tsx +++ b/src/ketr-chat/src/App.tsx @@ -1,7 +1,6 @@ import React, { useState, useEffect, useRef, useCallback, ReactElement } from 'react'; import FormGroup from '@mui/material/FormGroup'; import FormControlLabel from '@mui/material/FormControlLabel'; -import { useTheme } from '@mui/material'; import { styled } from '@mui/material/styles'; import Switch from '@mui/material/Switch'; import Divider from '@mui/material/Divider'; @@ -18,35 +17,43 @@ import Button from '@mui/material/Button'; import AppBar from '@mui/material/AppBar'; import Drawer from '@mui/material/Drawer'; import Toolbar from '@mui/material/Toolbar'; -import MenuIcon from '@mui/icons-material/Menu'; import SettingsIcon from '@mui/icons-material/Settings'; import IconButton, { IconButtonProps } from '@mui/material/IconButton'; import Box from '@mui/material/Box'; import CssBaseline from '@mui/material/CssBaseline'; -import AddIcon from '@mui/icons-material/AddCircle'; +import ResetIcon from '@mui/icons-material/History'; import SendIcon from '@mui/icons-material/Send'; import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; -import MoreVertIcon from '@mui/icons-material/MoreVert'; import Card from '@mui/material/Card'; -import CardHeader from '@mui/material/CardHeader'; -import CardMedia from '@mui/material/CardMedia'; import CardContent from '@mui/material/CardContent'; import CardActions from '@mui/material/CardActions'; import Collapse from '@mui/material/Collapse'; +import Table from '@mui/material/Table'; +import TableBody from '@mui/material/TableBody'; +import TableCell from '@mui/material/TableCell'; +import TableContainer from '@mui/material/TableContainer'; +import TableHead from '@mui/material/TableHead'; +import TableRow from '@mui/material/TableRow'; import PropagateLoader from "react-spinners/PropagateLoader"; -// import Markdown from 'react-markdown'; -import { MuiMarkdown as Markdown } from "mui-markdown"; -import './App.css'; + +import { MuiMarkdown } from "mui-markdown"; +import ReactMarkdown from 'react-markdown'; import rehypeKatex from 'rehype-katex' import remarkMath from 'remark-math' import 'katex/dist/katex.min.css' // `rehype-katex` does not import the CSS for you +import './App.css'; + import '@fontsource/roboto/300.css'; import '@fontsource/roboto/400.css'; import '@fontsource/roboto/500.css'; import '@fontsource/roboto/700.css'; +//const use_mui_markdown = true +const use_mui_markdown = true + + const welcomeMarkdown = ` # Welcome to Ketr-Chat. @@ -54,12 +61,12 @@ This LLM agent was built by James Ketrenos in order to provide answers to any qu In addition to being a RAG enabled expert system, the LLM is configured with real-time access to weather, stocks, the current time, and can answer questions about the contents of a website. -Ask things like: - * What are the headlines from CNBC? - * What is the weather in Portland, OR? - * What is James Ketrenos' work history? - * What are the stock value of the most traded companies? - * What programming languages has James used? 
+You can ask things like: (or just click the text to submit the query) + * + * + * + * + * `; const welcomeMessage = { @@ -90,7 +97,9 @@ interface ControlsParams { toggleTool: (tool: Tool) => void, toggleRag: (tool: Tool) => void, setSystemPrompt: (prompt: string) => void, - reset: (types: ("rags" | "tools" | "history" | "system-prompt")[], message: string) => Promise + reset: (types: ("rags" | "tools" | "history" | "system-prompt" | "message-history-length")[], message: string) => Promise + messageHistoryLength: number, + setMessageHistoryLength: (messageHistoryLength: number) => void, }; type GPUInfo = { @@ -106,7 +115,11 @@ type SystemInfo = { type MessageMetadata = { rag: any, - tools: any[] + tools: any[], + eval_count: number, + eval_duration: number, + prompt_eval_count: number, + prompt_eval_duration: number }; type MessageData = { @@ -167,7 +180,7 @@ const SystemInfoComponent: React.FC<{ systemInfo: SystemInfo }> = ({ systemInfo return
{systemElements}
; }; -const Controls = ({ tools, rags, systemPrompt, toggleTool, toggleRag, setSystemPrompt, reset, systemInfo }: ControlsParams) => { +const Controls = ({ tools, rags, systemPrompt, toggleTool, toggleRag, messageHistoryLength, setMessageHistoryLength, setSystemPrompt, reset, systemInfo }: ControlsParams) => { const [editSystemPrompt, setEditSystemPrompt] = useState(systemPrompt); useEffect(() => { @@ -222,6 +235,29 @@ const Controls = ({ tools, rags, systemPrompt, toggleTool, toggleRag, setSystemP + + }> + Tunables + + + setMessageHistoryLength(e.target.value)} + slotProps={{ + htmlInput: { + min: 0 + }, + inputLabel: { + shrink: true, + }, + }} + /> + + }> Tools @@ -236,7 +272,7 @@ const Controls = ({ tools, rags, systemPrompt, toggleTool, toggleRag, setSystemP } onChange={() => toggle("tool", index)} label={tool?.function?.name} /> - {tool?.function?.description} + {tool?.function?.description} ) } @@ -274,7 +310,7 @@ const Controls = ({ tools, rags, systemPrompt, toggleTool, toggleRag, setSystemP - + ); } @@ -307,7 +343,8 @@ const ExpandMore = styled((props: ExpandMoreProps) => { })); interface MessageInterface { - message: MessageData + message: MessageData, + submitQuery: (text: string) => void }; interface MessageMetaInterface { @@ -319,46 +356,101 @@ const MessageMeta = ({ metadata }: MessageMetaInterface) => { } return (<> + + Below is the LLM performance of this query. Note that if tools are called, the entire context is processed for each separate tool request by the LLM. This can dramatically increase the total time for a response. + + + + + + + Tokens + Time (s) + TPS + + + + + Prompt + {metadata.prompt_eval_count} + {Math.round(metadata.prompt_eval_duration / 10 ** 7) / 100} + {Math.round(metadata.prompt_eval_count * 10 ** 9 / metadata.prompt_eval_duration)} + + + Response + {metadata.eval_count} + {Math.round(metadata.eval_duration / 10 ** 7) / 100} + {Math.round(metadata.eval_count * 10 ** 9 / metadata.eval_duration)} + + + Total + {metadata.prompt_eval_count + metadata.eval_count} + {Math.round((metadata.prompt_eval_duration + metadata.eval_duration) / 10 ** 7) / 100} + {Math.round((metadata.prompt_eval_count + metadata.eval_count) * 10 ** 9 / (metadata.prompt_eval_duration + metadata.eval_duration))} + + +
+
{ metadata.tools !== undefined && metadata.tools.length !== 0 && - -

Tools queried:

- {metadata.tools.map((tool: any, index: number) => <> - - -
-
{tool.tool}
-
Result Len: {JSON.stringify(tool.result).length}
-
-
{JSON.stringify(tool.result, null, 2)}
+ + }> + + Tools queried - )} -
+ + + {metadata.tools.map((tool: any, index: number) => + {index !== 0 && } + +
+ {tool.tool} +
+
{JSON.stringify(tool.result, null, 2)}
+
+
)} +
+ } { metadata.rag.name !== undefined && - -

Top RAG {metadata.rag.ids.length} matches from '{metadata.rag.name}' collection against embedding vector of {metadata.rag.query_embedding.length} dimensions:

- {metadata.rag.ids.map((id: number, index: number) => <> - - -
-
Doc ID: {metadata.rag.ids[index]}
-
Similarity: {Math.round(metadata.rag.distances[index] * 100) / 100}
-
Type: {metadata.rag.metadatas[index].doc_type}
-
Chunk Len: {metadata.rag.documents[index].length}
-
-
{metadata.rag.documents[index]}
+ + }> + + Top RAG {metadata.rag.ids.length} matches from '{metadata.rag.name}' collection against embedding vector of {metadata.rag.query_embedding.length} dimensions - - )} -
+ + + {metadata.rag.ids.map((id: number, index: number) => + {index !== 0 && } + +
+
Doc ID: {metadata.rag.ids[index]}
+
Similarity: {Math.round(metadata.rag.distances[index] * 100) / 100}
+
Type: {metadata.rag.metadatas[index].doc_type}
+
Chunk Len: {metadata.rag.documents[index].length}
+
+
{metadata.rag.documents[index]}
+
+
+ )} +
+ } ); }; -const Message = ({ message }: MessageInterface) => { +interface ChatQueryInterface { + text: string, + submitQuery: (text: string) => void +} + +const ChatQuery = ({ text, submitQuery }: ChatQueryInterface) => { + return (); +} + +const Message = ({ message, submitQuery }: MessageInterface) => { const [expanded, setExpanded] = React.useState(false); const handleExpandClick = () => { @@ -371,7 +463,15 @@ const Message = ({ message }: MessageInterface) => { {message.role === 'assistant' ? - + use_mui_markdown ? : : {message.content} @@ -423,6 +523,37 @@ const App = () => { const [serverSystemPrompt, setServerSystemPrompt] = useState(""); const [systemInfo, setSystemInfo] = useState(undefined); const [contextStatus, setContextStatus] = useState({ context_used: 0, max_context: 0 }); + const [contextWarningShown, setContextWarningShown] = useState(false); + const [contextUsedPercentage, setContextUsedPercentage] = useState(0); + const [lastEvalTPS, setLastEvalTPS] = useState(35); + const [lastPromptTPS, setLastPromptTPS] = useState(430); + const [countdown, setCountdown] = useState(0); + const [messageHistoryLength, setMessageHistoryLength] = useState(0); + + const timerRef = useRef(null); + + const startCountdown = (seconds: number) => { + if (timerRef.current) clearInterval(timerRef.current); + setCountdown(seconds); + timerRef.current = setInterval(() => { + setCountdown((prev) => { + if (prev <= 1) { + clearInterval(timerRef.current); + timerRef.current = null; + return 0; + } + return prev - 1; + }); + }, 1000); + }; + + const stopCountdown = () => { + if (timerRef.current) { + clearInterval(timerRef.current); + timerRef.current = null; + setCountdown(0); + } + }; // Scroll to bottom of conversation when conversation updates useEffect(() => { @@ -460,6 +591,7 @@ const App = () => { }); }, [systemInfo, setSystemInfo, loc, setSnack, sessionId]) + // Update the context status const updateContextStatus = useCallback(() => { fetch(getConnectionBase(loc) + `/api/context-status/${sessionId}`, { method: 'GET', @@ -469,7 +601,6 @@ const App = () => { }) .then(response => response.json()) .then(data => { - console.log(`Session id: ${sessionId} -- history returned from server with ${data.length} entries`) setContextStatus(data); }) .catch(error => { @@ -538,9 +669,9 @@ const App = () => { if (serverSystemPrompt !== "" || sessionId === undefined) { return; } - const fetchSystemPrompt = async () => { + const fetchTunables = async () => { // Make the fetch request with proper headers - const response = await fetch(getConnectionBase(loc) + `/api/system-prompt/${sessionId}`, { + const response = await fetch(getConnectionBase(loc) + `/api/tunables/${sessionId}`, { method: 'GET', headers: { 'Content-Type': 'application/json', @@ -549,12 +680,12 @@ const App = () => { }); const data = await response.json(); const serverSystemPrompt = data["system-prompt"].trim(); - console.log("System prompt initialized to:", serverSystemPrompt); setServerSystemPrompt(serverSystemPrompt); setSystemPrompt(serverSystemPrompt); + setMessageHistoryLength(data["message-history-length"]); } - fetchSystemPrompt(); + fetchTunables(); }, [sessionId, serverSystemPrompt, setServerSystemPrompt, loc]); // If the tools have not been set, fetch them from the server @@ -615,6 +746,21 @@ const App = () => { fetchRags(); }, [sessionId, rags, setRags, setSnack, loc]); + // If context status changes, show a warning if necessary. 
If it drops + // back below the threshold, clear the warning trigger + useEffect(() => { + const context_used_percentage = Math.round(100 * contextStatus.context_used / contextStatus.max_context); + if (context_used_percentage >= 90 && !contextWarningShown) { + setSnack(`${context_used_percentage}% of context used. You may wish to start a new chat.`, "warning"); + setContextWarningShown(true); + } + if (context_used_percentage < 90 && contextWarningShown) { + setContextWarningShown(false); + } + setContextUsedPercentage(context_used_percentage) + }, [contextStatus, setContextWarningShown, contextWarningShown, setContextUsedPercentage, setSnack]); + + const toggleRag = async (tool: Tool) => { tool.enabled = !tool.enabled try { @@ -665,7 +811,7 @@ const App = () => { } const sendSystemPrompt = async (prompt: string) => { try { - const response = await fetch(getConnectionBase(loc) + `/api/system-prompt/${sessionId}`, { + const response = await fetch(getConnectionBase(loc) + `/api/tunables/${sessionId}`, { method: 'PUT', headers: { 'Content-Type': 'application/json', @@ -691,7 +837,38 @@ const App = () => { }, [systemPrompt, setServerSystemPrompt, serverSystemPrompt, loc, sessionId, setSnack]); - const reset = async (types: ("rags" | "tools" | "history" | "system-prompt")[], message: string = "Update successful.") => { + useEffect(() => { + if (sessionId === undefined) { + return; + } + const sendMessageHistoryLength = async (length: number) => { + try { + const response = await fetch(getConnectionBase(loc) + `/api/tunables/${sessionId}`, { + method: 'PUT', + headers: { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + }, + body: JSON.stringify({ "message-history-length": length }), + }); + + const data = await response.json(); + const newLength = data["message-history-length"]; + if (newLength !== messageHistoryLength) { + setMessageHistoryLength(newLength); + setSnack("Message history length updated", "success"); + } + } catch (error) { + console.error('Fetch error:', error); + setSnack("Message history length update failed", "error"); + } + }; + + sendMessageHistoryLength(messageHistoryLength); + + }, [messageHistoryLength, setMessageHistoryLength, loc, sessionId, setSnack]); + + const reset = async (types: ("rags" | "tools" | "history" | "system-prompt" | "message-history-length")[], message: string = "Update successful.") => { try { const response = await fetch(getConnectionBase(loc) + `/api/reset/${sessionId}`, { method: 'PUT', @@ -751,15 +928,20 @@ const App = () => { const drawer = ( <> - {sessionId !== undefined && systemInfo !== undefined && } + {sessionId !== undefined && systemInfo !== undefined && + } ); + const submitQuery = (text: string) => { + sendQuery(text); + } + const handleKeyPress = (event: any) => { if (event.key === 'Enter') { switch (event.target.id) { case 'QueryInput': - sendQuery(); + sendQuery(query); break; } } @@ -769,11 +951,9 @@ const App = () => { reset(["history"], "New chat started."); } - const sendQuery = async () => { + const sendQuery = async (query: string) => { if (!query.trim()) return; - setSnack("Query sent", "info"); - const userMessage = [{ role: 'user', content: query }]; // Add user message to conversation @@ -811,6 +991,12 @@ const App = () => { body: JSON.stringify({ role: 'user', content: query.trim() }), }); + // We'll guess that the response will be around 500 tokens... 
+ const token_guess = 500; + const estimate = Math.round(token_guess / lastEvalTPS + contextStatus.context_used / lastPromptTPS); + setSnack(`Query sent. Response estimated in ${estimate}s.`, "info"); + startCountdown(Math.round(estimate)); + if (!response.ok) { throw new Error(`Server responded with ${response.status}: ${response.statusText}`); } @@ -860,6 +1046,11 @@ const App = () => { ...prev.filter(msg => msg.id !== processingId), update.message ]); + const metadata = update.message.metadata; + const evalTPS = metadata.eval_count * 10 ** 9 / metadata.eval_duration; + const promptTPS = metadata.prompt_eval_count * 10 ** 9 / metadata.prompt_eval_duration; + setLastEvalTPS(evalTPS ? evalTPS : 35); + setLastPromptTPS(promptTPS ? promptTPS : 35); updateContextStatus(); } else if (update.status === 'error') { // Show error @@ -891,6 +1082,7 @@ const App = () => { } } + stopCountdown(); setProcessing(false); } catch (error) { console.error('Fetch error:', error); @@ -900,6 +1092,7 @@ const App = () => { { role: 'assistant', type: 'error', content: `Error: ${error}` } ]); setProcessing(false); + stopCountdown(); } }; @@ -946,7 +1139,7 @@ const App = () => { onClick={onNew} sx={{ mr: 2 }} > - + @@ -985,17 +1178,39 @@ const App = () => { - {conversation.map((message, index) => )} -
+ {conversation.map((message, index) => )} + -
+ {processing === true && countdown > 0 && ( + Estimated response time: {countdown}s + )} +
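For clarity, the countdown shown above is seeded from an estimate built out of Ollama's timing counters: durations arrive in nanoseconds, so tokens-per-second is `count * 10^9 / duration`, and the expected wait is a guessed 500-token reply at the last eval rate plus re-reading the current context at the last prompt rate. A minimal Python sketch of that arithmetic (the metadata values below are illustrative, not taken from a real response; 35 and 430 TPS are the client's fallback defaults):

```python
# Sketch of the ETA math behind the countdown; durations are nanoseconds,
# counts are tokens.

def tokens_per_second(count: int, duration_ns: int) -> float:
    """Ollama reports eval/prompt_eval durations in ns; convert to tokens/s."""
    return count * 10**9 / duration_ns if duration_ns else 0.0

def estimate_response_seconds(context_used: int, prompt_tps: float = 430.0,
                              eval_tps: float = 35.0, token_guess: int = 500) -> int:
    """Time to re-read the existing context plus time to generate ~500 new tokens."""
    return round(token_guess / eval_tps + context_used / prompt_tps)

# Illustrative metadata, shaped like the eval_* fields on an assistant message
metadata = {"prompt_eval_count": 1800, "prompt_eval_duration": 4_000_000_000,
            "eval_count": 420, "eval_duration": 12_000_000_000}

eval_tps = tokens_per_second(metadata["eval_count"], metadata["eval_duration"])
prompt_tps = tokens_per_second(metadata["prompt_eval_count"], metadata["prompt_eval_duration"])
print(estimate_response_seconds(context_used=1800, prompt_tps=prompt_tps, eval_tps=eval_tps))
```

The same count/duration conversion feeds the per-message performance table rendered by MessageMeta.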
+ + Context used: {contextUsedPercentage}% {contextStatus.context_used}/{contextStatus.max_context} + { + contextUsedPercentage >= 90 ? WARNING: Context almost exhausted. You should start a new chat. + : (contextUsedPercentage >= 50 ? NOTE: Context is getting long. Queries will be slower, and the LLM may stop issuing tool calls. + : <>) + } +
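The footer above and the one-shot snackbar warning both derive from the same percentage of the context window. A small sketch of that threshold logic (the 50% and 90% cut-offs come from the code above; the function name is only for illustration):

```python
# Context-window usage thresholds mirrored from the footer / snackbar logic.

def context_usage(context_used: int, max_context: int) -> tuple[int, str]:
    """Return (percentage used, severity) for the current context window."""
    pct = round(100 * context_used / max_context) if max_context else 0
    if pct >= 90:
        return pct, "warning"   # suggest starting a new chat
    if pct >= 50:
        return pct, "note"      # slower queries; tool calls may stop
    return pct, "ok"

print(context_usage(58_000, 64_000))   # -> (91, 'warning')
```

In the component the warning is additionally latched by `contextWarningShown`, so it fires once and only re-arms after usage drops back below 90%.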
- {/* Context used: {Math.round(100 * contextStatus.context_used / contextStatus.max_context)}% {contextStatus.context_used}/{contextStatus.max_context} */} { /> - + diff --git a/src/server.py b/src/server.py index ae250df..133d75d 100644 --- a/src/server.py +++ b/src/server.py @@ -161,6 +161,7 @@ When answering queries, follow these steps: 6. If [{context_tag}] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data Always use tools and [{context_tag}] when possible. Be concise, and never make up information. If you do not know the answer, say so. + """.strip() tool_log = [] @@ -387,6 +388,12 @@ class WebServer: context["llm_history"] = [] context["user_history"] = [] response["history"] = [] + context["context_tokens"] = round(len(str(context["system"])) * 3 / 4) # Estimate context usage + response["context_used"] = context["context_tokens"] + case "message-history-length": + context["message_history_length"] = 5 + response["message-history-length"] = 5 + if not response: return JSONResponse({ "error": "Usage: { reset: rags|tools|history|system-prompt}"}) else: @@ -396,25 +403,40 @@ class WebServer: except: return JSONResponse({ "error": "Usage: { reset: rags|tools|history|system-prompt}"}) - @self.app.put('/api/system-prompt/{context_id}') - async def put_system_prompt(context_id: str, request: Request): + @self.app.put('/api/tunables/{context_id}') + async def put_tunables(context_id: str, request: Request): if not is_valid_uuid(context_id): logging.warning(f"Invalid context_id: {context_id}") return JSONResponse({"error": "Invalid context_id"}, status_code=400) context = self.upsert_context(context_id) data = await request.json() - system_prompt = data["system-prompt"].strip() - if not system_prompt: - return JSONResponse({ "status": "error", "message": "System prompt can not be empty." }) - context["system"] = [{"role": "system", "content": system_prompt}] - self.save_context(context_id) - return JSONResponse({ "system-prompt": system_prompt }) + for k in data.keys(): + match k: + case "system-prompt": + system_prompt = data[k].strip() + if not system_prompt: + return JSONResponse({ "status": "error", "message": "System prompt can not be empty." 
}) + context["system"] = [{"role": "system", "content": system_prompt}] + self.save_context(context_id) + return JSONResponse({ "system-prompt": system_prompt }) + case "message-history-length": + value = max(0, int(data[k])) + context["message_history_length"] = value + self.save_context(context_id) + return JSONResponse({ "message-history-length": value }) + case _: + return JSONResponse({ "error": f"Unrecognized tunable {k}"}, 404) - @self.app.get('/api/system-prompt/{context_id}') - async def get_system_prompt(context_id: str): + @self.app.get('/api/tunables/{context_id}') + async def get_tunables(context_id: str): + if not is_valid_uuid(context_id): + logging.warning(f"Invalid context_id: {context_id}") + return JSONResponse({"error": "Invalid context_id"}, status_code=400) context = self.upsert_context(context_id) - system_prompt = context["system"][0]["content"]; - return JSONResponse({ "system-prompt": system_prompt }) + return JSONResponse({ + "system-prompt": context["system"][0]["content"], + "message-history-length": context["message_history_length"] + }) @self.app.get('/api/system-info/{context_id}') async def get_system_info(context_id: str): @@ -510,12 +532,8 @@ class WebServer: if not is_valid_uuid(context_id): logging.warning(f"Invalid context_id: {context_id}") return JSONResponse({"error": "Invalid context_id"}, status_code=400) - context_used = 0 context = self.upsert_context(context_id) - # TODO: Switch this to use the tokenizer values instead of 75% of character length - for message in context["llm_history"]: - context_used += round((len(message["role"]) + len(message["content"])) * 3 / 4) - return JSONResponse({"context_used": context_used, "max_context": defines.max_context}) + return JSONResponse({"context_used": context["context_tokens"], "max_context": defines.max_context}) @self.app.get('/api/health') async def health_check(): @@ -530,6 +548,8 @@ class WebServer: self.logging.info(f"Serve index.html for {path}") return FileResponse('/opt/airc/src/ketr-chat/build/index.html') + import requests + def save_context(self, session_id): """ Serialize a Python dictionary to a file in the sessions directory. @@ -581,18 +601,25 @@ class WebServer: def create_context(self, context_id = None): if not context_id: context_id = str(uuid.uuid4()) + system_context = [{"role": "system", "content": system_message}]; context = { "id": context_id, - "system": [{"role": "system", "content": system_message}], + "system": system_context, "llm_history": [], "user_history": [], "tools": default_tools(tools), - "rags": rags.copy() + "rags": rags.copy(), + "context_tokens": round(len(str(system_context)) * 3 / 4), # Estimate context usage + "message_history_length": 5 # Number of messages to supply in context } logging.info(f"{context_id} created and added to sessions.") self.contexts[context_id] = context return context + def get_optimal_ctx_size(self, context, messages, ctx_buffer = 4096): + ctx = round(context + len(str(messages)) * 3 / 4) + return max(defines.max_context, min(2048, ctx + ctx_buffer)) + def upsert_context(self, context_id): if not context_id: logging.warning("No context ID provided. 
Creating a new context.") @@ -619,7 +646,11 @@ class WebServer: user_history = context["user_history"] metadata = { "rag": {}, - "tools": [] + "tools": [], + "eval_count": 0, + "eval_duration": 0, + "prompt_eval_count": 0, + "prompt_eval_duration": 0, } rag_docs = [] for rag in context["rags"]: @@ -643,13 +674,24 @@ class WebServer: llm_history.append({"role": "user", "content": preamble + content}) user_history.append({"role": "user", "content": content}) - messages = context["system"] + llm_history + if context["message_history_length"]: + messages = context["system"] + llm_history[-context["message_history_length"]:] + else: + messages = context["system"] + llm_history try: yield {"status": "processing", "message": "Processing request..."} + # Estimate token length of new messages + ctx_size = self.get_optimal_ctx_size(context["context_tokens"], messages=llm_history[-1]["content"]) + # Use the async generator in an async for loop - response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]), options={ 'num_ctx': defines.max_context }) + response = self.client.chat(model=self.model, messages=messages, tools=llm_tools(context["tools"]), options={ 'num_ctx': ctx_size }) + metadata["eval_count"] += response['eval_count'] + metadata["eval_duration"] += response['eval_duration'] + metadata["prompt_eval_count"] += response['prompt_eval_count'] + metadata["prompt_eval_duration"] += response['prompt_eval_duration'] + context["context_tokens"] = response['prompt_eval_count'] + response['eval_count'] tools_used = [] @@ -680,6 +722,8 @@ class WebServer: {'function': {'name': tc['function']['name'], 'arguments': tc['function']['arguments']}} for tc in message['tool_calls'] ] + + pre_add_index = len(messages) messages.append(message_dict) if isinstance(tool_result, list): @@ -690,7 +734,15 @@ class WebServer: metadata["tools"] = tools_used yield {"status": "processing", "message": "Generating final response..."} - response = self.client.chat(model=self.model, messages=messages, stream=False, options={ 'num_ctx': defines.max_context }) + # Estimate token length of new messages + ctx_size = self.get_optimal_ctx_size(context["context_tokens"], messages=messages[pre_add_index:]) + # Decrease creativity when processing tool call requests + response = self.client.chat(model=self.model, messages=messages, stream=False, options={ 'num_ctx': ctx_size }) #, "temperature": 0.5 }) + metadata["eval_count"] += response['eval_count'] + metadata["eval_duration"] += response['eval_duration'] + metadata["prompt_eval_count"] += response['prompt_eval_count'] + metadata["prompt_eval_duration"] += response['prompt_eval_duration'] + context["context_tokens"] = response['prompt_eval_count'] + response['eval_count'] reply = response['message']['content'] final_message = {"role": "assistant", "content": reply }
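Pulling the last two server hunks together: the prompt sent to Ollama is the system message plus at most `message_history_length` recent turns (0 means send everything), running token usage is estimated at roughly 3/4 of the character count, and `num_ctx` is derived from that estimate plus a 4096-token buffer. A hedged sketch follows, assuming the clamp is intended as `min(defines.max_context, max(2048, ...))`; the hunk above has the `max`/`min` order reversed, which would always return `defines.max_context`:

```python
# Standalone sketch of prompt assembly and num_ctx sizing. MAX_CONTEXT stands
# in for defines.max_context; the min/max ordering reflects the assumed intent
# of clamping the estimate between a 2048 floor and the model's window.
MAX_CONTEXT = 8192

def estimate_tokens(obj) -> int:
    """Rough chars -> tokens heuristic used throughout the server (~3/4)."""
    return round(len(str(obj)) * 3 / 4)

def build_messages(system: list, llm_history: list, message_history_length: int) -> list:
    """System prompt plus the last N history turns; 0 disables trimming."""
    if message_history_length:
        return system + llm_history[-message_history_length:]
    return system + llm_history

def optimal_ctx_size(context_tokens: int, new_messages, ctx_buffer: int = 4096) -> int:
    """Running token estimate + new messages + buffer, clamped to [2048, MAX_CONTEXT]."""
    ctx = context_tokens + estimate_tokens(new_messages)
    return min(MAX_CONTEXT, max(2048, ctx + ctx_buffer))

system = [{"role": "system", "content": "You are a helpful assistant."}]
history = [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]
messages = build_messages(system, history, message_history_length=5)
print(optimal_ctx_size(estimate_tokens(system), messages))   # ~4250 with these toy messages
```

After each chat completion, `context_tokens` is replaced with the server-reported `prompt_eval_count + eval_count`, so the estimate self-corrects on the next turn.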
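Finally, the renamed `/api/tunables/{context_id}` routes accept partial updates keyed by tunable name, and `/api/reset/{context_id}` now recognizes `message-history-length` as a reset target. A hedged `requests` sketch of driving them from outside the UI (the base URL and session UUID are placeholders, and the `{"reset": [...]}` body shape is an assumption based on the handler's usage string):

```python
# Hedged sketch of exercising the tunables / reset endpoints directly.
import requests

BASE = "http://localhost:5000"                       # illustrative server address
SESSION = "9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d"     # any valid session UUID

# Read the current tunables for this session
tunables = requests.get(f"{BASE}/api/tunables/{SESSION}").json()
print(tunables["system-prompt"], tunables["message-history-length"])

# Update a single tunable; the handler dispatches on the key name
resp = requests.put(f"{BASE}/api/tunables/{SESSION}",
                    json={"message-history-length": 10})
print(resp.json())        # {"message-history-length": 10}

# Reset selected state to defaults (body shape assumed from the usage string)
resp = requests.put(f"{BASE}/api/reset/{SESSION}",
                    json={"reset": ["history", "message-history-length"]})
print(resp.json())
```

Because the PUT handler returns from inside its key loop, it applies one tunable per request; send separate requests rather than batching keys.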