James Ketr 2025-03-18 13:08:48 -07:00
parent 5f6971510a
commit 1130077c03
4 changed files with 307 additions and 584 deletions

View File

@@ -1,3 +1,75 @@
#
# Build Python 3.11 for use in later stages
#
FROM ubuntu:oracular AS python-build
SHELL [ "/bin/bash", "-c" ]
# Instructions adapted into this Dockerfile from:
#
# https://github.com/pytorch/pytorch
#
# and
#
# https://pytorch.org/docs/stable/notes/get_start_xpu.html
# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
#
#
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# ipex only supports python 3.11, so use 3.11 instead of latest oracular (3.12)
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
ca-certificates \
ccache \
cmake \
curl \
git \
gpg-agent \
less \
libbz2-dev \
libffi-dev \
libjpeg-dev \
libpng-dev \
libreadline-dev \
libssl-dev \
libsqlite3-dev \
llvm \
nano \
wget \
zlib1g-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
# Build Python in /opt/..., install it locally, then remove the build
# environment, all collapsed into a single docker layer.
WORKDIR /opt
ENV PYTHON_VERSION=3.11.9
RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
&& cd Python-${PYTHON_VERSION} \
&& ./configure --prefix=/opt/python --enable-optimizations \
&& make -j$(nproc) \
&& make install \
&& cd /opt \
&& rm -rf Python-${PYTHON_VERSION}
FROM ubuntu:oracular AS ze-monitor
# From https://github.com/jketreno/ze-monitor
RUN apt-get update \
@@ -29,19 +101,75 @@ RUN cmake ..
&& make \
&& cpack
#
# Build the ipex-llm wheel for use in later stages
#
FROM python-build AS ipex-llm-src
RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2
RUN git clone --branch main --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm \
&& cd /opt/ipex-llm \
&& git fetch --depth 1 origin cb3c4b26ad058c156591816aa37eec4acfcbf765 \
&& git checkout cb3c4b26ad058c156591816aa37eec4acfcbf765
WORKDIR /opt/ipex-llm
RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
RUN { \
echo '#!/bin/bash' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'source /opt/ipex-llm/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/ipex-llm/shell ; \
chmod +x /opt/ipex-llm/shell
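# With SHELL pointing at this wrapper, every subsequent RUN in this stage
# executes inside the ipex-llm venv with python3.11 selected, so the pip
# installs below land in the venv rather than the system site-packages.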
SHELL [ "/opt/ipex-llm/shell" ]
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
WORKDIR /opt/ipex-llm/python/llm
RUN pip install requests wheel
RUN python setup.py clean --all bdist_wheel --linux
#
# The main airc image:
# * python 3.11
# * pytorch xpu w/ ipex-llm
# * ollama-ipex-llm
# * src/server.py - model server supporting RAG and fine-tuned models
#
# Agents using server:
# * src/web-ui.py - React server (airc.ketrenos.com)
# * src/irc.py - IRC backend (irc.libera.chat #airc-test)
# * src/cli.py - Command line chat
#
# Utilities:
# * src/training-fine-tune.py - Perform fine-tuning on curated documents
FROM ubuntu:oracular AS airc
COPY --from=python-build /opt/python /opt/python
# Get a couple prerequisites
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
python3 \
python3-pip \
python3-venv \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# The client frontend is built using React Expo to allow
# easy creation of an Android app as well as a web app
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
nodejs \
npm \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install Intel graphics runtimes
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
@@ -58,27 +186,41 @@ RUN apt-get update \
WORKDIR /opt/airc
RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2
# Set up the airc python virtual environment
RUN python3 -m venv --system-site-packages /opt/airc/venv
# Set up the docker pip shell
RUN { \
echo '#!/bin/bash' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash; fi' ; \
} > /opt/airc/shell ; \
chmod +x /opt/airc/shell
# Activate the pip environment on all shell calls
SHELL [ "/opt/airc/shell" ]
# From https://pytorch-extension.intel.com/installation?platform=gpu&version=v2.6.10%2Bxpu&os=linux%2Fwsl2&package=pip
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
RUN pip install intel-extension-for-pytorch==2.6.10+xpu oneccl_bind_pt==2.6.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
# From https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=Intel+CPU+%2B+GPU#multi-backend
RUN pip install "transformers>=4.45.1"
RUN pip install 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
# Install ollama python module
RUN pip install ollama
# pydle does not work with newer asyncio releases, which deprecated and then
# removed generator-based coroutines. Patch it to work.
COPY /src/pydle.patch /opt/pydle.patch
RUN pip install pydle \
&& patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
-p1 < /opt/pydle.patch \
&& rm /opt/pydle.patch
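# The patch contents aren't shown in this diff. As a hypothetical sketch of
# the incompatibility it works around: Python 3.11 removed asyncio.coroutine,
# so generator-based coroutines such as
#     @asyncio.coroutine
#     def on_message(self, target, source, message): yield from ...
# have to be rewritten as native coroutines:
#     async def on_message(self, target, source, message): await ...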
@@ -87,9 +229,49 @@ RUN pip install setuptools --upgrade
RUN pip install ollama
RUN pip install feedparser bs4 chromadb
RUN pip install tiktoken
RUN pip install flask flask_cors
RUN pip install peft datasets
COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install $pkg; done
# mistral fails with cache_position errors with transformers>4.40 (or at least it fails with the latest),
# and the MistralSpda* and QwenSpda* classes are missing (needed when loading models with )
RUN pip install "sentence_transformers<3.4.1"
# "transformers==4.40.0" ""
#RUN pip install sentence_transformers "transformers==4.40.0" "trl<0.12.0"
#RUN pip install transformers==4.45.0 "trl<0.12.0"
# trl.core doesn't have what is needed with the default 'pip install trl' version
#RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
# To get xe_linear and other Xe methods
# NOTE: As of 2025-03-10, these are only available for Python 3.11, which is
# why we build python from source
RUN pip install 'bigdl-core-xe-all>=2.6.0b'
# NOTE: IPEX includes the oneAPI components... not sure if they still need to be installed separately with a oneAPI env
RUN pip install einops diffusers # Required for IPEX optimize(), which is required to convert from Params4bit
RUN pip install yfinance pytz geopy
SHELL [ "/bin/bash", "-c" ]
# Don't install the full oneapi essentials; just the ones that we seem to need
# RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
# | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
# && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
# | tee /etc/apt/sources.list.d/oneAPI.list \
# && apt-get update \
# && DEBIAN_FRONTEND=noninteractive apt-get install -y \
# intel-oneapi-mkl-sycl-2025.0 \
# intel-oneapi-dnnl-2025.0 \
# intel-oneapi-dpcpp-cpp-2025.0 \
# && apt-get clean \
# && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# dpcpp is needed for LoRA backend when
# libze-dev is needed for LoRA/triton backend in order to build stuff
# Unfortunately, that fails with:
# ImportError: /opt/airc/venv/lib/python3.11/site-packages/intel_extension_for_pytorch/lib/libintel-ext-pt-cpu.so: undefined symbol: _ZNK5torch8autograd4Node4nameEv
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
libncurses6 \
@@ -108,6 +290,8 @@ RUN { \
echo 'echo "Container: airc"'; \
echo 'set -e'; \
echo 'echo "Setting pip environment to /opt/airc"'; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate'; \
echo ''; \
echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/airc/)?shell$ ]]; then'; \
@@ -126,6 +310,11 @@ RUN { \
} > /entrypoint.sh \
&& chmod +x /entrypoint.sh
# From
ENV USE_XETLA=OFF
ENV SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
ENV SYCL_CACHE_PERSISTENT=1
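# The USE_XETLA / SYCL_* values above match the Intel GPU runtime settings
# recommended in the ipex-llm documentation.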
ENTRYPOINT [ "/entrypoint.sh" ]
FROM ubuntu:oracular AS ollama
@@ -185,7 +374,7 @@ RUN { \
SHELL [ "/opt/ollama/shell" ]
# Install ollama python module
RUN pip install ollama
SHELL [ "/bin/bash", "-c" ]
@@ -233,13 +422,14 @@ FROM airc AS jupyter
SHELL [ "/opt/airc/shell" ]
# BEGIN setup Jupyter
RUN pip install jupyter \
jupyterlab==4.3.0a0 \
jupyterhub==5.0.0 \
notebook==7.3.0a0 \
"jupyter-server-proxy>=4.1.2"
RUN pip install \
jupyterlab \
dash[jupyterlab] \
&& jupyter lab build --dev-build=False --minimize=False
# END setup Jupyter
RUN pip install -r /opt/airc/src/requirements.txt
SHELL [ "/bin/bash", "-c" ]
RUN { \
@@ -259,8 +449,8 @@ RUN { \
echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
echo 'while true; do' ; \
echo ' echo "Launching jupyter notebook"' ; \
echo ' jupyter notebook \' ; \
echo ' echo "Launching jupyter lab"' ; \
echo ' jupyter lab \' ; \
echo ' --notebook-dir=/opt/jupyter \' ; \
echo ' --port 8888 \' ; \
echo ' --ip 0.0.0.0 \' ; \
@@ -278,4 +468,67 @@ RUN { \
} > /entrypoint-jupyter.sh \
&& chmod +x /entrypoint-jupyter.sh
ENTRYPOINT [ "/entrypoint-jupyter.sh" ]
ENTRYPOINT [ "/entrypoint-jupyter.sh" ]
FROM ubuntu:oracular AS miniircd
COPY --from=python-build /opt/python /opt/python
# Get a couple prerequisites
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
wget \
nano \
irssi \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /opt/miniircd
RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2
# Set up the miniircd python virtual environment
RUN python3 -m venv --system-site-packages /opt/miniircd/venv
# Set up the docker pip shell
RUN { \
echo '#!/bin/bash' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'source /opt/miniircd/venv/bin/activate' ; \
echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash; fi' ; \
} > /opt/miniircd/shell ; \
chmod +x /opt/miniircd/shell
# Activate the pip environment on all shell calls
SHELL [ "/opt/miniircd/shell" ]
RUN pip install miniircd
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash'; \
echo 'echo "Container: miniircd"'; \
echo 'set -e'; \
echo 'echo "Setting pip environment to /opt/miniircd"'; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'source /opt/miniircd/venv/bin/activate'; \
echo ''; \
echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/miniircd/)?shell$ ]]; then'; \
echo ' echo "Dropping to shell"'; \
echo ' shift' ; \
echo ' echo "Running: ${@}"' ; \
echo ' if [[ "${1}" != "" ]]; then' ; \
echo ' exec ${@}'; \
echo ' else' ; \
echo ' exec /bin/bash'; \
echo ' fi' ; \
echo 'else'; \
echo ' echo "Launching IRC server..."'; \
echo ' miniircd --setuid root "${@}"' ; \
echo 'fi'; \
} > /entrypoint.sh \
&& chmod +x /entrypoint.sh
ENTRYPOINT [ "/entrypoint.sh" ]

View File

@@ -14,9 +14,13 @@ services:
- ollama
networks:
- internal
ports:
- 8911:8911
volumes:
- ./cache:/root/.cache
- ./src:/opt/airc/src:rw
- ./doc:/opt/airc/doc:ro
- ./results:/opt/airc/results:rw
cap_add: # used for running ze-monitor within airc container
- CAP_DAC_READ_SEARCH # Bypass all filesystem read access checks
- CAP_PERFMON # Access to perf_events (vs. overloaded CAP_SYS_ADMIN)
@@ -36,8 +40,8 @@ services:
- ONEAPI_DEVICE_SELECTOR=level_zero:0
devices:
- /dev/dri:/dev/dri
ports:
- 11434:11434 # ollama serve port
networks:
- internal
volumes:
@@ -61,13 +65,41 @@ services:
- /dev/dri:/dev/dri
depends_on:
- ollama
- miniircd
ports:
- 8888:8888 # Jupyter Notebook
- 60673:60673 # Gradio
networks:
- internal
volumes:
- ./jupyter:/opt/jupyter:rw
- ./cache:/root/.cache
deploy:
resources:
limits:
memory: "0" # No memory limit (Docker treats 0 as unlimited)
reservations:
memory: "0" # No reserved memory (optional)
ulimits:
memlock: -1 # Prevents memory from being locked
oom_kill_disable: true # Prevents OOM killer from killing the container
miniircd:
build:
context: .
dockerfile: Dockerfile
target: miniircd
image: miniircd
env_file:
- .env
devices:
- /dev/dri:/dev/dri
ports:
- 6667:6667 # IRC
networks:
- internal
volumes:
- ./cache:/root/.cache
networks:
internal:

View File

@@ -539,11 +539,11 @@ def create_ui():
outputs=[chat_history, tool_history]
)
timer.tick(check_message_queue, inputs=chatbot, outputs=chatbot).then(
update_log, # This new function updates the log after chatbot processing
inputs=chatbot,
outputs=[chat_history, tool_history]
)
clear.click(do_clear, inputs=None, outputs=[chatbot, chat_history, tool_history], queue=False)

View File

@@ -1,562 +0,0 @@
import requests
from typing import List, Dict, Any, Union
import tiktoken
import feedparser
import logging as log
import datetime
from bs4 import BeautifulSoup
import chromadb
import ollama
import re
import numpy as np
def normalize(vec):
return vec / np.linalg.norm(vec)
OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
MODEL_NAME = "deepseek-r1:7b"
EMBED_MODEL = "mxbai-embed-large"
PERSIST_DIRECTORY = "/root/.cache/chroma"
client = ollama.Client(host=OLLAMA_API_URL)
def extract_text_from_html_or_xml(content, is_xml=False):
# Parse the content
if is_xml:
soup = BeautifulSoup(content, 'xml') # Use 'xml' parser for XML content
else:
soup = BeautifulSoup(content, 'html.parser') # Default to 'html.parser' for HTML content
# Extract and return just the text
return soup.get_text()
class Feed():
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
self.name = name
self.url = url
self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
self.last_poll = None
self.articles = []
self.max_articles = max_articles
self.update()
def update(self):
now = datetime.datetime.now()
if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
log.info(f"Updating {self.name}")
feed = feedparser.parse(self.url)
self.articles = []
self.last_poll = now
if len(feed.entries) == 0:
return
for i, entry in enumerate(feed.entries[:self.max_articles]):
content = {}
content['source'] = self.name
content['id'] = f"{self.name}{i}"
title = entry.get("title")
if title:
content['title'] = title
link = entry.get("link")
if link:
content['link'] = link
text = entry.get("summary")
if text:
content['text'] = extract_text_from_html_or_xml(text, False)
else:
continue
published = entry.get("published")
if published:
content['published'] = published
self.articles.append(content)
else:
log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
return self.articles
# News RSS Feeds
rss_feeds = [
Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
Feed(name="Time", url="https://time.com/feed/"),
Feed(name="Euronews", url="https://www.euronews.com/rss"),
# Feed(name="FeedX", url="https://feedx.net/rss/ap.xml")
]
def get_encoding():
"""Get the tokenizer for counting tokens."""
try:
return tiktoken.get_encoding("cl100k_base") # Default encoding used by many embedding models
    except Exception:
return tiktoken.encoding_for_model(MODEL_NAME)
def count_tokens(text: str) -> int:
"""Count the number of tokens in a text string."""
encoding = get_encoding()
return len(encoding.encode(text))
def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
"""
Split a text into chunks based on token count with overlap between chunks.
Args:
text: The text to split into chunks
max_tokens: Maximum number of tokens per chunk
overlap: Number of tokens to overlap between chunks
Returns:
List of text chunks
"""
if not text or max_tokens <= 0:
return []
encoding = get_encoding()
tokens = encoding.encode(text)
chunks = []
i = 0
while i < len(tokens):
# Get the current chunk of tokens
chunk_end = min(i + max_tokens, len(tokens))
chunk_tokens = tokens[i:chunk_end]
chunks.append(encoding.decode(chunk_tokens))
# Move to the next position with overlap
if chunk_end == len(tokens):
break
i += max_tokens - overlap
return chunks
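# Illustrative usage of chunk_text (numbers are arbitrary), e.g. from a REPL:
#   long_text = " ".join(f"word{i}" for i in range(500))
#   chunks = chunk_text(long_text, max_tokens=64, overlap=8)
#   print(len(chunks), "chunks;", count_tokens(chunks[0]), "tokens in the first")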
def chunk_document(document: Dict[str, Any],
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50) -> List[Dict[str, Any]]:
"""
Chunk a document dictionary into multiple chunks.
Args:
document: Document dictionary with metadata and text
text_key: The key in the document that contains the text to chunk
max_tokens: Maximum number of tokens per chunk
overlap: Number of tokens to overlap between chunks
Returns:
List of document dictionaries, each with chunked text and preserved metadata
"""
if text_key not in document:
raise Exception(f"{text_key} not in document")
# Extract text and create chunks
if "title" in document:
text = f"{document["title"]}: {document[text_key]}"
else:
text = document[text_key]
chunks = chunk_text(text, max_tokens, overlap)
# Create document chunks with preserved metadata
chunked_docs = []
for i, chunk in enumerate(chunks):
# Create a new doc with all original fields
doc_chunk = document.copy()
# Replace text with the chunk
doc_chunk[text_key] = chunk
# Add chunk metadata
doc_chunk["chunk_id"] = i
doc_chunk["chunk_total"] = len(chunks)
chunked_docs.append(doc_chunk)
return chunked_docs
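# Illustrative usage of chunk_document: every chunk keeps the original metadata
# plus chunk_id/chunk_total bookkeeping (document values here are made up):
#   doc = {"id": "demo0", "source": "Demo", "title": "Example", "text": "..."}
#   for piece in chunk_document(doc, max_tokens=128, overlap=16):
#       print(piece["id"], piece["chunk_id"], "of", piece["chunk_total"])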
def init_chroma_client(persist_directory: str = PERSIST_DIRECTORY):
    """Initialize and return a ChromaDB client."""
    # NOTE: currently returns an in-memory client; persist_directory is unused.
    # return chromadb.PersistentClient(path=persist_directory)
    return chromadb.Client()
def create_or_get_collection(client, collection_name: str):
"""Create or get a ChromaDB collection."""
try:
return client.get_collection(
name=collection_name
)
    except Exception:
return client.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine"}
)
def process_documents_to_chroma(
documents: List[Dict[str, Any]],
collection_name: str = "document_collection",
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50,
model: str = EMBED_MODEL,
persist_directory: str = PERSIST_DIRECTORY
):
"""
Process documents, chunk them, compute embeddings, and store in ChromaDB.
Args:
documents: List of document dictionaries
collection_name: Name for the ChromaDB collection
text_key: The key containing text content
max_tokens: Maximum tokens per chunk
overlap: Token overlap between chunks
model: Ollama model for embeddings
persist_directory: Directory to store ChromaDB data
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
# Process each document
for doc in documents:
# Chunk the document
doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)
# Prepare data for ChromaDB
ids = []
texts = []
metadatas = []
embeddings = []
for chunk in doc_chunks:
# Create a unique ID for the chunk
chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"
# Extract text
text = chunk[text_key]
# Create metadata (excluding text and embedding to avoid duplication)
metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}
response = client.embed(model=model, input=text)
embedding = response["embeddings"][0]
ids.append(chunk_id)
texts.append(text)
metadatas.append(metadata)
embeddings.append(embedding)
# Add chunks to ChromaDB collection
collection.add(
ids=ids,
documents=texts,
embeddings=embeddings,
metadatas=metadatas
)
return collection
def query_chroma(
query_text: str,
collection_name: str = "document_collection",
n_results: int = 5,
model: str = EMBED_MODEL,
persist_directory: str = PERSIST_DIRECTORY
):
"""
Query ChromaDB for similar documents.
Args:
query_text: The text to search for
collection_name: Name of the ChromaDB collection
n_results: Number of results to return
model: Ollama model for embedding the query
persist_directory: Directory where ChromaDB data is stored
Returns:
Query results from ChromaDB
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
query_response = client.embed(model=model, input=query_text)
query_embeddings = query_response["embeddings"]
# Query the collection
results = collection.query(
query_embeddings=query_embeddings,
n_results=n_results
)
return results
def print_top_match(query_results, index=0, documents=None):
"""
Print detailed information about the top matching document,
including the full original document content.
Args:
query_results: Results from ChromaDB query
documents: Original documents dictionary to look up full content (optional)
"""
if not query_results or not query_results["ids"] or len(query_results["ids"][0]) == 0:
print("No matching documents found.")
return
# Get the top result
top_id = query_results["ids"][0][index]
top_document_chunk = query_results["documents"][0][index]
top_metadata = query_results["metadatas"][0][index]
top_distance = query_results["distances"][0][index]
print("="*50)
print("MATCHING DOCUMENT")
print("="*50)
print(f"Chunk ID: {top_id}")
print(f"Similarity Score: {top_distance:.4f}") # Convert distance to similarity
print("\nCHUNK METADATA:")
for key, value in top_metadata.items():
print(f" {key}: {value}")
print("\nMATCHING CHUNK CONTENT:")
print(top_document_chunk[:500].strip() + ("..." if len(top_document_chunk) > 500 else ""))
# Extract the original document ID from the chunk ID
# Chunk IDs are in format "doc_id_chunk_num"
original_doc_id = top_id.split('_')[0]
def get_top_match(query_results, index=0, documents=None):
top_id = query_results["ids"][index][0]
# Extract the original document ID from the chunk ID
# Chunk IDs are in format "doc_id_chunk_num"
original_doc_id = top_id.split('_')[0]
# Return the full document for further processing if needed
if documents is not None:
return next((doc for doc in documents if doc["id"] == original_doc_id), None)
return None
def show_documents(documents=None):
if not documents:
return
# Print the top matching document
for i, doc in enumerate(documents):
print(f"Document {i+1}:")
print(f" Title: {doc['title']}")
print(f" Text: {doc['text'][:100]}...")
print()
def show_headlines(documents=None):
if not documents:
return
# Print the top matching document
for doc in documents:
print(f"{doc['source']}: {doc['title']}")
def show_help():
print("""help>
docs Show RAG docs
full Show last full top match
headlines Show the RAG headlines
prompt Show the last prompt
response Show the last response
scores Show last RAG scores
why|think Show last response's <think>
context|match Show RAG match info to last prompt
""")
# Example usage
if __name__ == "__main__":
documents = []
for feed in rss_feeds:
documents.extend(feed.articles)
show_documents(documents=documents)
# Process documents and store in ChromaDB
collection = process_documents_to_chroma(
documents=documents,
collection_name="research_papers",
max_tokens=256,
overlap=25,
model=EMBED_MODEL,
persist_directory="/root/.cache/chroma"
)
last_results = None
last_prompt = None
last_system = None
last_response = None
last_why = None
last_messages = []
while True:
try:
search_query = input("> ").strip()
except KeyboardInterrupt as e:
print("\nExiting.")
break
if search_query == "exit" or search_query == "quit":
print("\nExiting.")
break
if search_query == "docs":
show_documents(documents)
continue
if search_query == "prompt":
if last_prompt:
print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
else:
print(f"No prompts yet")
continue
if search_query == "response":
if last_response:
print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
else:
print(f"No responses yet")
continue
if search_query == "" or search_query == "help":
show_help()
continue
if search_query == "headlines":
show_headlines(documents)
continue
if search_query == "match" or search_query == "context":
if last_results:
print_top_match(last_results, documents=documents)
else:
print("No match to give info on")
continue
if search_query == "why" or search_query == "think":
if last_why:
print(f"""
why>
{last_why}
""")
else:
print("No processed prompts")
continue
if search_query == "scores":
if last_results:
                for i in range(len(last_results["ids"][0])):
print_top_match(last_results, documents=documents, index=i)
else:
print("No match to give info on")
continue
if search_query == "full":
if last_results:
full = get_top_match(last_results, documents=documents)
if full:
print(f"""Context:
Source: {full["source"]}
Title: {full["title"]}
Link: {full["link"]}
Distance: {last_results.get("distances", [[0]])[0][0]}
Full text:
{full["text"]}""")
else:
print("No match to give info on")
continue
# Query ChromaDB
results = query_chroma(
query_text=search_query,
collection_name="research_papers",
n_results=10
)
last_results = results
full = get_top_match(results, documents=documents)
headlines = ""
for doc in documents:
headlines += f"{doc['source']}: {doc['title']}\n"
system="""
You are the assistant. Your name is airc.
Do not ask to help the user further.
Provide short (less than 100 character) responses.
Rules:
* If the user asks for information about the AI model, how, or who wrote it, provide information about the author from inside the <author></author> tags.
* If you think the user might be asking about the author, ask a follow up question to clarify.
* If there is news in between the <input></input> tags relevant to the prompt, use that. Always mention the source when information comes from an item. If asked for the link, provide it.
* Respond to the prompt in a single, direct response.
* Do not prefix it with a word like "Answer"
You must follow the rules.
"""
# * If a user asks for weather information, include in your response "{{weather_query("country", "city", "state")}}" where the description of the weather should go.
context = f"""<author>
author={[
{'info': 'James wrote the python application that is driving this RAG model on top of deepseek-r1:7b. You can find it at https://github.com/jketreno/airc'},
{'info': 'James Ketrenos wrote the program deploying this AI model with RAG.'},
{'info': 'James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.'},
{'info': 'James lives in Portland, Oregon and has three kids. Two are attending Oregon State University and one is attending Willamette University.'}
]}
</author>"""
context += "<input>additional information unrelated to James Ketrenos = ["
for doc in documents:
item = {'source':doc["source"],'article':{'title':doc["title"],'link':doc["link"],'text':doc["text"]}}
context += f"{item}"
context += """]
</input>
"""
prompt = f"{context}{search_query}"
last_prompt = prompt
last_system = system
if len(last_messages) != 0:
message_context = f"{last_messages}"
prompt = f"{message_context}{prompt}"
print(f"system len: {len(system)}")
print(f"prompt len: {len(prompt)}")
output = client.generate(
model=MODEL_NAME,
system=f"{system}{context}",
prompt=prompt,
stream=False,
options={ 'num_ctx': 100000 }
)
# Prune off the <think>...</think>
matches = re.match(r'^<think>(.*?)</think>(.*)$', output['response'], flags=re.DOTALL)
if matches:
last_why = matches[1].strip()
content = matches[2].strip()
        else:
            content = output['response'].strip()
            print(f"[garbled] response>\n{output['response']}")
print(f"Response>\n{content}")
last_response = content
last_messages.extend(({
'role': 'user',
'name': 'james',
'message': search_query
}, {
'role': 'assistant',
'message': content
}))
        last_messages = last_messages[-10:]  # keep only the most recent exchanges