Caching
parent 5f6971510a
commit 1130077c03

Dockerfile (283 changes)
@@ -1,3 +1,75 @@
#
# Build Python 3.11 for use in later stages
#
FROM ubuntu:oracular AS python-build

SHELL [ "/bin/bash", "-c" ]

# Instructions Dockerfied from:
#
# https://github.com/pytorch/pytorch
#
# and
#
# https://pytorch.org/docs/stable/notes/get_start_xpu.html
# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
#
#
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# ipex only supports python 3.11, so use 3.11 instead of latest oracular (3.12)

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        gpg-agent \
        less \
        libbz2-dev \
        libffi-dev \
        libjpeg-dev \
        libpng-dev \
        libreadline-dev \
        libssl-dev \
        libsqlite3-dev \
        llvm \
        nano \
        wget \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# python3 \
# python3-pip \
# python3-venv \
# python3-dev \

RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
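
# The two ccache lines above route compiler invocations through ccache and
# point its cache at /opt/ccache, so repeated builds of the Python layer below
# can reuse prior compiler output. A sketch of inspecting that cache from the
# built stage (hypothetical tag and host path, not from this commit):
#   docker build --target python-build -t airc-python-build .
#   docker run --rm -v "$(pwd)/ccache:/opt/ccache" airc-python-build ccache --show-stats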

# Build Python in /opt/..., install it locally, then remove the build environment
# collapsed to a single docker layer.
WORKDIR /opt
ENV PYTHON_VERSION=3.11.9

RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
    && cd Python-${PYTHON_VERSION} \
    && ./configure --prefix=/opt/python --enable-optimizations \
    && make -j$(nproc) \
    && make install \
    && cd /opt \
    && rm -rf Python-${PYTHON_VERSION}
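
# --enable-optimizations turns on profile-guided optimization, making this the
# slowest layer in the image; keeping it in its own stage means later stages
# can COPY --from=python-build /opt/python without ever rebuilding it.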

FROM ubuntu:oracular AS ze-monitor
# From https://github.com/jketreno/ze-monitor
RUN apt-get update \

@@ -29,19 +101,75 @@ RUN cmake .. \
    && make \
    && cpack

#
# Build the ipex-llm wheel for use in later stages
#
FROM python-build AS ipex-llm-src

RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2

RUN git clone --branch main --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm \
    && cd /opt/ipex-llm \
    && git fetch --depth 1 origin cb3c4b26ad058c156591816aa37eec4acfcbf765 \
    && git checkout cb3c4b26ad058c156591816aa37eec4acfcbf765

WORKDIR /opt/ipex-llm

RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
RUN { \
        echo '#!/bin/bash' ; \
        echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
        echo 'source /opt/ipex-llm/venv/bin/activate' ; \
        echo 'bash -c "${@}"' ; \
    } > /opt/ipex-llm/shell ; \
    chmod +x /opt/ipex-llm/shell

SHELL [ "/opt/ipex-llm/shell" ]

RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu

WORKDIR /opt/ipex-llm/python/llm
RUN pip install requests wheel
RUN python setup.py clean --all bdist_wheel --linux

#
# The main airc image:
# * python 3.11
# * pytorch xpu w/ ipex-llm
# * ollama-ipex-llm
# * src/server.py - model server supporting RAG and fine-tuned models
#
# Agents using server:
# * src/web-ui.py - React server (airc.ketrenos.com)
# * src/irc.py - IRC backend (irc.libera.chat #airc-test)
# * src/cli.py - Command line chat
#
# Utilities:
# * src/training-fine-tune.py - Perform fine-tuning on curated documents
FROM ubuntu:oracular AS airc

COPY --from=python-build /opt/python /opt/python

# Get a couple prerequisites
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        # python3 \
        # python3-pip \
        # python3-venv \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# The client frontend is built using React Expo to allow
# easy creation of an Android app as well as a web app
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        nodejs \
        npm \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# Install Intel graphics runtimes
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \

@@ -58,27 +186,41 @@ RUN apt-get update \
WORKDIR /opt/airc

RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2

# Setup the airc python virtual environment
RUN python3 -m venv --system-site-packages /opt/airc/venv

# Setup the docker pip shell
RUN { \
        echo '#!/bin/bash' ; \
        echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
        echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
        echo 'source /opt/airc/venv/bin/activate' ; \
        echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash; fi' ; \
    } > /opt/airc/shell ; \
    chmod +x /opt/airc/shell

# Activate the pip environment on all shell calls
SHELL [ "/opt/airc/shell" ]
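
# With the wrapper installed, each later `RUN pip install ...` in this stage
# is executed as `/opt/airc/shell 'pip install ...'`: python3 is repointed at
# 3.11, the oneAPI environment is sourced when present, the venv is activated,
# and the command then runs inside that environment.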

# From https://pytorch-extension.intel.com/installation?platform=gpu&version=v2.6.10%2Bxpu&os=linux%2Fwsl2&package=pip
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
RUN pip install intel-extension-for-pytorch==2.6.10+xpu oneccl_bind_pt==2.6.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

# From https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=Intel+CPU+%2B+GPU#multi-backend
RUN pip install "transformers>=4.45.1"
RUN pip install 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'

# Install ollama python module
RUN pip install ollama

# pydle does not work with newer asyncio due to coroutine
# being deprecated. Patch to work.
COPY /src/pydle.patch /opt/pydle.patch

RUN pip install pydle \
    && patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
        -p1 < /opt/pydle.patch \
    && rm /opt/pydle.patch

@@ -87,9 +229,49 @@ RUN pip install setuptools --upgrade
RUN pip install ollama
RUN pip install feedparser bs4 chromadb
RUN pip install tiktoken
RUN pip install flask flask_cors
RUN pip install peft datasets

COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install $pkg; done

# mistral fails with cache_position errors with transformers>4.40 (or at least it fails with the latest)
# as well as MistralSpda* and QwenSpda* things missing (needed when loading models with )
RUN pip install "sentence_transformers<3.4.1"
# "transformers==4.40.0" ""
#RUN pip install sentence_transformers "transformers==4.40.0" "trl<0.12.0"
#RUN pip install transformers==4.45.0 "trl<0.12.0"
# trl.core doesn't have what is needed with the default 'pip install trl' version
#RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c

# To get xe_linear and other Xe methods
# NOTE: As of 2025-03-10, these are only available for Python 3.11, hence
# why we build python from source
RUN pip3 install 'bigdl-core-xe-all>=2.6.0b'

# NOTE: IPEX includes the oneAPI components... not sure if they still need to be installed separately with a oneAPI env
RUN pip install einops diffusers # Required for IPEX optimize(), which is required to convert from Params4bit

RUN pip install yfinance pytz geopy

SHELL [ "/bin/bash", "-c" ]

# Don't install the full oneapi essentials; just the ones that we seem to need
# RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
#     | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
#     && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
#     | tee /etc/apt/sources.list.d/oneAPI.list \
#     && apt-get update \
#     && DEBIAN_FRONTEND=noninteractive apt-get install -y \
#         intel-oneapi-mkl-sycl-2025.0 \
#         intel-oneapi-dnnl-2025.0 \
#         intel-oneapi-dpcpp-cpp-2025.0 \
#     && apt-get clean \
#     && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# dpcpp is needed for LoRA backend when
# libze-dev is needed for LoRA/triton backend in order to build stuff
# Unfortunately, that fails with:
# ImportError: /opt/airc/venv/lib/python3.11/site-packages/intel_extension_for_pytorch/lib/libintel-ext-pt-cpu.so: undefined symbol: _ZNK5torch8autograd4Node4nameEv
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libncurses6 \

@@ -108,6 +290,8 @@ RUN { \
    echo 'echo "Container: airc"'; \
    echo 'set -e'; \
    echo 'echo "Setting pip environment to /opt/airc"'; \
    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
    echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
    echo 'source /opt/airc/venv/bin/activate'; \
    echo ''; \
    echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/airc/)?shell$ ]]; then'; \

@@ -126,6 +310,11 @@ RUN { \
    } > /entrypoint.sh \
    && chmod +x /entrypoint.sh

# From
ENV USE_XETLA=OFF
ENV SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
ENV SYCL_CACHE_PERSISTENT=1

ENTRYPOINT [ "/entrypoint.sh" ]

FROM ubuntu:oracular AS ollama

@@ -185,7 +374,7 @@ RUN { \
SHELL [ "/opt/ollama/shell" ]

# Install ollama python module
RUN pip install ollama

SHELL [ "/bin/bash", "-c" ]

@@ -233,13 +422,14 @@ FROM airc AS jupyter
SHELL [ "/opt/airc/shell" ]

# BEGIN setup Jupyter
RUN pip install \
        jupyterlab \
        dash[jupyterlab] \
    && jupyter lab build --dev-build=False --minimize=False
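# dash[jupyterlab] is expected to pull in Dash's JupyterLab integration;
# `jupyter lab build` then rebuilds the JupyterLab application bundle at image
# build time so it is not rebuilt on first launch.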
# END setup Jupyter

RUN pip install -r /opt/airc/src/requirements.txt

SHELL [ "/bin/bash", "-c" ]

RUN { \

@@ -259,8 +449,8 @@ RUN { \
    echo 'source /opt/airc/venv/bin/activate' ; \
    echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
    echo 'while true; do' ; \
    echo ' echo "Launching jupyter lab"' ; \
    echo ' jupyter lab \' ; \
    echo ' --notebook-dir=/opt/jupyter \' ; \
    echo ' --port 8888 \' ; \
    echo ' --ip 0.0.0.0 \' ; \

@@ -278,4 +468,67 @@ RUN { \
    } > /entrypoint-jupyter.sh \
    && chmod +x /entrypoint-jupyter.sh

ENTRYPOINT [ "/entrypoint-jupyter.sh" ]

FROM ubuntu:oracular AS miniircd

COPY --from=python-build /opt/python /opt/python

# Get a couple prerequisites
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        wget \
        nano \
        irssi \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

WORKDIR /opt/miniircd

RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2

# Setup the miniircd python virtual environment
RUN python3 -m venv --system-site-packages /opt/miniircd/venv

# Setup the docker pip shell
RUN { \
        echo '#!/bin/bash' ; \
        echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
        echo 'source /opt/miniircd/venv/bin/activate' ; \
        echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash; fi' ; \
    } > /opt/miniircd/shell ; \
    chmod +x /opt/miniircd/shell

# Activate the pip environment on all shell calls
SHELL [ "/opt/miniircd/shell" ]

RUN pip install miniircd

SHELL [ "/bin/bash", "-c" ]

RUN { \
        echo '#!/bin/bash'; \
        echo 'echo "Container: miniircd"'; \
        echo 'set -e'; \
        echo 'echo "Setting pip environment to /opt/miniircd"'; \
        echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
        echo 'source /opt/miniircd/venv/bin/activate'; \
        echo ''; \
        echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/miniircd/)?shell$ ]]; then'; \
        echo ' echo "Dropping to shell"'; \
        echo ' shift' ; \
        echo ' echo "Running: ${@}"' ; \
        echo ' if [[ "${1}" != "" ]]; then' ; \
        echo ' exec ${@}'; \
        echo ' else' ; \
        echo ' exec /bin/bash'; \
        echo ' fi' ; \
        echo 'else'; \
        echo ' echo "Launching IRC server..."'; \
        echo ' miniircd --setuid root "${@}"' ; \
        echo 'fi'; \
    } > /entrypoint.sh \
    && chmod +x /entrypoint.sh

ENTRYPOINT [ "/entrypoint.sh" ]
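
# Hypothetical invocations of this stage (not part of the commit):
#   docker run --rm -p 6667:6667 miniircd          launches the IRC server
#   docker run --rm -it miniircd shell             drops to an interactive shell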

@@ -14,9 +14,13 @@ services:
      - ollama
    networks:
      - internal
    ports:
      - 8911:8911
    volumes:
      - ./cache:/root/.cache
      - ./src:/opt/airc/src:rw
      - ./doc:/opt/airc/doc:ro
      - ./results:/opt/airc/results:rw
    cap_add: # used for running ze-monitor within airc container
      - CAP_DAC_READ_SEARCH # Bypass all filesystem read access checks
      - CAP_PERFMON # Access to perf_events (vs. overloaded CAP_SYS_ADMIN)

@@ -36,8 +40,8 @@ services:
      - ONEAPI_DEVICE_SELECTOR=level_zero:0
    devices:
      - /dev/dri:/dev/dri
    ports:
      - 11434:11434 # ollama serve port
    networks:
      - internal
    volumes:

@@ -61,13 +65,41 @@ services:
      - /dev/dri:/dev/dri
    depends_on:
      - ollama
      - miniircd
    ports:
      - 8888:8888 # Jupyter Notebook
      - 60673:60673 # Gradio
    networks:
      - internal
    volumes:
      - ./jupyter:/opt/jupyter:rw
      - ./cache:/root/.cache
    deploy:
      resources:
        limits:
          memory: "0" # No memory limit (Docker treats 0 as unlimited)
        reservations:
          memory: "0" # No reserved memory (optional)
    ulimits:
      memlock: -1 # No limit on the amount of memory the container may lock
    oom_kill_disable: true # Prevents OOM killer from killing the container

  miniircd:
    build:
      context: .
      dockerfile: Dockerfile
      target: miniircd
    image: miniircd
    env_file:
      - .env
    devices:
      - /dev/dri:/dev/dri
    ports:
      - 6667:6667 # IRC
    networks:
      - internal
    volumes:
      - ./cache:/root/.cache

networks:
  internal:

@@ -539,11 +539,11 @@ def create_ui():
            outputs=[chat_history, tool_history]
        )

        timer.tick(check_message_queue, inputs=chatbot, outputs=chatbot).then(
            update_log, # This new function updates the log after chatbot processing
            inputs=chatbot,
            outputs=[chat_history, tool_history]
        )

        clear.click(do_clear, inputs=None, outputs=[chatbot, chat_history, tool_history], queue=False)
src/chunk.py
562
src/chunk.py
@ -1,562 +0,0 @@
|
|||||||
import requests
from typing import List, Dict, Any, Union
import tiktoken
import feedparser
import logging as log
import datetime
from bs4 import BeautifulSoup
import chromadb
import ollama
import re
import numpy as np


def normalize(vec):
    return vec / np.linalg.norm(vec)


OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
MODEL_NAME = "deepseek-r1:7b"
EMBED_MODEL = "mxbai-embed-large"
PERSIST_DIRECTORY = "/root/.cache/chroma"

client = ollama.Client(host=OLLAMA_API_URL)


def extract_text_from_html_or_xml(content, is_xml=False):
    # Parse the content
    if is_xml:
        soup = BeautifulSoup(content, 'xml') # Use 'xml' parser for XML content
    else:
        soup = BeautifulSoup(content, 'html.parser') # Default to 'html.parser' for HTML content

    # Extract and return just the text
    return soup.get_text()


class Feed():
    def __init__(self, name, url, poll_limit_min=30, max_articles=5):
        self.name = name
        self.url = url
        self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
        self.last_poll = None
        self.articles = []
        self.max_articles = max_articles
        self.update()

    def update(self):
        now = datetime.datetime.now()
        if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
            log.info(f"Updating {self.name}")
            feed = feedparser.parse(self.url)
            self.articles = []
            self.last_poll = now

            if len(feed.entries) == 0:
                return

            for i, entry in enumerate(feed.entries[:self.max_articles]):
                content = {}
                content['source'] = self.name
                content['id'] = f"{self.name}{i}"
                title = entry.get("title")
                if title:
                    content['title'] = title
                link = entry.get("link")
                if link:
                    content['link'] = link
                text = entry.get("summary")
                if text:
                    content['text'] = extract_text_from_html_or_xml(text, False)
                else:
                    continue
                published = entry.get("published")
                if published:
                    content['published'] = published

                self.articles.append(content)
        else:
            log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
        return self.articles


# News RSS Feeds
rss_feeds = [
    Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
    Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
    Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
    Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
    Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
    Feed(name="Time", url="https://time.com/feed/"),
    Feed(name="Euronews", url="https://www.euronews.com/rss"),
    # Feed(name="FeedX", url="https://feedx.net/rss/ap.xml")
]


def get_encoding():
    """Get the tokenizer for counting tokens."""
    try:
        return tiktoken.get_encoding("cl100k_base") # Default encoding used by many embedding models
    except:
        return tiktoken.encoding_for_model(MODEL_NAME)


def count_tokens(text: str) -> int:
    """Count the number of tokens in a text string."""
    encoding = get_encoding()
    return len(encoding.encode(text))


def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    """
    Split a text into chunks based on token count with overlap between chunks.

    Args:
        text: The text to split into chunks
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks
    """
    if not text or max_tokens <= 0:
        return []

    encoding = get_encoding()
    tokens = encoding.encode(text)
    chunks = []

    i = 0
    while i < len(tokens):
        # Get the current chunk of tokens
        chunk_end = min(i + max_tokens, len(tokens))
        chunk_tokens = tokens[i:chunk_end]
        chunks.append(encoding.decode(chunk_tokens))

        # Move to the next position with overlap
        if chunk_end == len(tokens):
            break
        i += max_tokens - overlap

    return chunks
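
# Worked example (editor's sketch, not part of the original file): with
# max_tokens=512 and overlap=50 the window advances 462 tokens per step, so a
# 1000-token text yields chunks covering tokens [0, 512), [462, 974), and
# [924, 1000), with 50 tokens shared between consecutive chunks:
#
#   parts = chunk_text("lorem ipsum " * 500, max_tokens=512, overlap=50)
#   assert all(count_tokens(p) <= 512 for p in parts)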


def chunk_document(document: Dict[str, Any],
                   text_key: str = "text",
                   max_tokens: int = 512,
                   overlap: int = 50) -> List[Dict[str, Any]]:
    """
    Chunk a document dictionary into multiple chunks.

    Args:
        document: Document dictionary with metadata and text
        text_key: The key in the document that contains the text to chunk
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of document dictionaries, each with chunked text and preserved metadata
    """
    if text_key not in document:
        raise Exception(f"{text_key} not in document")

    # Extract text and create chunks
    if "title" in document:
        text = f"{document['title']}: {document[text_key]}"
    else:
        text = document[text_key]
    chunks = chunk_text(text, max_tokens, overlap)

    # Create document chunks with preserved metadata
    chunked_docs = []
    for i, chunk in enumerate(chunks):
        # Create a new doc with all original fields
        doc_chunk = document.copy()
        # Replace text with the chunk
        doc_chunk[text_key] = chunk
        # Add chunk metadata
        doc_chunk["chunk_id"] = i
        doc_chunk["chunk_total"] = len(chunks)
        chunked_docs.append(doc_chunk)

    return chunked_docs
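
# Editor's sketch of the metadata flow (not part of the original file): every
# chunk keeps the source document's fields and gains chunk bookkeeping:
#
#   doc = {"id": "BBC World0", "source": "BBC World", "title": "t", "text": "..."}
#   for c in chunk_document(doc, max_tokens=64, overlap=8):
#       print(c["id"], c["chunk_id"], "of", c["chunk_total"])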


def init_chroma_client(persist_directory: str = PERSIST_DIRECTORY):
    """Initialize and return a ChromaDB client."""
    # return chromadb.PersistentClient(path=persist_directory)
    return chromadb.Client()


def create_or_get_collection(client, collection_name: str):
    """Create or get a ChromaDB collection."""
    try:
        return client.get_collection(
            name=collection_name
        )
    except:
        return client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )


def process_documents_to_chroma(
    documents: List[Dict[str, Any]],
    collection_name: str = "document_collection",
    text_key: str = "text",
    max_tokens: int = 512,
    overlap: int = 50,
    model: str = EMBED_MODEL,
    persist_directory: str = PERSIST_DIRECTORY
):
    """
    Process documents, chunk them, compute embeddings, and store in ChromaDB.

    Args:
        documents: List of document dictionaries
        collection_name: Name for the ChromaDB collection
        text_key: The key containing text content
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        model: Ollama model for embeddings
        persist_directory: Directory to store ChromaDB data
    """
    # Initialize ChromaDB client and collection
    db = init_chroma_client(persist_directory)
    collection = create_or_get_collection(db, collection_name)

    # Process each document
    for doc in documents:
        # Chunk the document
        doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)

        # Prepare data for ChromaDB
        ids = []
        texts = []
        metadatas = []
        embeddings = []

        for chunk in doc_chunks:
            # Create a unique ID for the chunk
            chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"

            # Extract text
            text = chunk[text_key]

            # Create metadata (excluding text and embedding to avoid duplication)
            metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}

            response = client.embed(model=model, input=text)
            embedding = response["embeddings"][0]
            ids.append(chunk_id)
            texts.append(text)
            metadatas.append(metadata)
            embeddings.append(embedding)

        # Add chunks to ChromaDB collection
        collection.add(
            ids=ids,
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas
        )

    return collection


def query_chroma(
    query_text: str,
    collection_name: str = "document_collection",
    n_results: int = 5,
    model: str = EMBED_MODEL,
    persist_directory: str = PERSIST_DIRECTORY
):
    """
    Query ChromaDB for similar documents.

    Args:
        query_text: The text to search for
        collection_name: Name of the ChromaDB collection
        n_results: Number of results to return
        model: Ollama model for embedding the query
        persist_directory: Directory where ChromaDB data is stored

    Returns:
        Query results from ChromaDB
    """
    # Initialize ChromaDB client and collection
    db = init_chroma_client(persist_directory)
    collection = create_or_get_collection(db, collection_name)

    query_response = client.embed(model=model, input=query_text)
    query_embeddings = query_response["embeddings"]

    # Query the collection
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results
    )

    return results
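
# Editor's note (not part of the original file): with hnsw:space set to
# "cosine" above, query results carry cosine distances in [0, 2], so a
# similarity score can be recovered as 1 - distance:
#
#   res = query_chroma("intel gpu pytorch", n_results=3)
#   for cid, dist in zip(res["ids"][0], res["distances"][0]):
#       print(cid, "similarity:", 1 - dist)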


def print_top_match(query_results, index=0, documents=None):
    """
    Print detailed information about the top matching document,
    including the full original document content.

    Args:
        query_results: Results from ChromaDB query
        documents: Original documents dictionary to look up full content (optional)
    """
    if not query_results or not query_results["ids"] or len(query_results["ids"][0]) == 0:
        print("No matching documents found.")
        return

    # Get the top result
    top_id = query_results["ids"][0][index]
    top_document_chunk = query_results["documents"][0][index]
    top_metadata = query_results["metadatas"][0][index]
    top_distance = query_results["distances"][0][index]

    print("="*50)
    print("MATCHING DOCUMENT")
    print("="*50)
    print(f"Chunk ID: {top_id}")
    print(f"Similarity Score: {1 - top_distance:.4f}") # Convert distance to similarity

    print("\nCHUNK METADATA:")
    for key, value in top_metadata.items():
        print(f" {key}: {value}")

    print("\nMATCHING CHUNK CONTENT:")
    print(top_document_chunk[:500].strip() + ("..." if len(top_document_chunk) > 500 else ""))

    # Extract the original document ID from the chunk ID
    # Chunk IDs are in format "doc_id_chunk_num"
    original_doc_id = top_id.split('_')[0]


def get_top_match(query_results, index=0, documents=None):
    top_id = query_results["ids"][0][index]
    # Extract the original document ID from the chunk ID
    # Chunk IDs are in format "doc_id_chunk_num"
    original_doc_id = top_id.split('_')[0]

    # Return the full document for further processing if needed
    if documents is not None:
        return next((doc for doc in documents if doc["id"] == original_doc_id), None)

    return None


def show_documents(documents=None):
    if not documents:
        return

    # Print each document
    for i, doc in enumerate(documents):
        print(f"Document {i+1}:")
        print(f" Title: {doc['title']}")
        print(f" Text: {doc['text'][:100]}...")
        print()


def show_headlines(documents=None):
    if not documents:
        return

    # Print each headline
    for doc in documents:
        print(f"{doc['source']}: {doc['title']}")


def show_help():
    print("""help>
    docs           Show RAG docs
    full           Show last full top match
    headlines      Show the RAG headlines
    prompt         Show the last prompt
    response       Show the last response
    scores         Show last RAG scores
    why|think      Show last response's <think>
    context|match  Show RAG match info to last prompt
""")


# Example usage
if __name__ == "__main__":
    documents = []
    for feed in rss_feeds:
        documents.extend(feed.articles)

    show_documents(documents=documents)

    # Process documents and store in ChromaDB
    collection = process_documents_to_chroma(
        documents=documents,
        collection_name="research_papers",
        max_tokens=256,
        overlap=25,
        model=EMBED_MODEL,
        persist_directory="/root/.cache/chroma"
    )

    last_results = None
    last_prompt = None
    last_system = None
    last_response = None
    last_why = None
    last_messages = []
    while True:
        try:
            search_query = input("> ").strip()
        except KeyboardInterrupt as e:
            print("\nExiting.")
            break

        if search_query == "exit" or search_query == "quit":
            print("\nExiting.")
            break

        if search_query == "docs":
            show_documents(documents)
            continue

        if search_query == "prompt":
            if last_prompt:
                print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
            else:
                print("No prompts yet")
            continue

        if search_query == "response":
            if last_response:
                print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
            else:
                print("No responses yet")
            continue

        if search_query == "" or search_query == "help":
            show_help()
            continue

        if search_query == "headlines":
            show_headlines(documents)
            continue

        if search_query == "match" or search_query == "context":
            if last_results:
                print_top_match(last_results, documents=documents)
            else:
                print("No match to give info on")
            continue

        if search_query == "why" or search_query == "think":
            if last_why:
                print(f"""
why>
{last_why}
""")
            else:
                print("No processed prompts")
            continue

        if search_query == "scores":
            if last_results:
                for i, _ in enumerate(last_results["ids"][0]):
                    print_top_match(last_results, documents=documents, index=i)
            else:
                print("No match to give info on")
            continue

        if search_query == "full":
            if last_results:
                full = get_top_match(last_results, documents=documents)
                if full:
                    print(f"""Context:
Source: {full["source"]}
Title: {full["title"]}
Link: {full["link"]}
Distance: {last_results.get("distances", [[0]])[0][0]}
Full text:
{full["text"]}""")
            else:
                print("No match to give info on")
            continue

        # Query ChromaDB
        results = query_chroma(
            query_text=search_query,
            collection_name="research_papers",
            n_results=10
        )
        last_results = results

        full = get_top_match(results, documents=documents)

        headlines = ""
        for doc in documents:
            headlines += f"{doc['source']}: {doc['title']}\n"

        system = """
You are the assistant. Your name is airc.

Do not ask to help the user further.

Provide short (less than 100 character) responses.

Rules:
* If the user asks for information about the AI model, how it works, or who wrote it, provide information about the author from inside the <author></author> tags.
* If you think the user might be asking about the author, ask a follow up question to clarify.
* If there is news in between the <input></input> tags relevant to the prompt, use that. Always mention the source when information comes from an item. If asked for the link, provide it.
* Respond to the prompt in a single, direct response.
* Do not prefix it with a word like "Answer"

You must follow the rules.

"""

        # * If a user asks for weather information, include in your response "{{weather_query("country", "city", "state")}}" where the description of the weather should go.

        context = f"""<author>
author={[
            {'info': 'James wrote the python application that is driving this RAG model on top of deepseek-r1:7b. You can find it at https://github.com/jketreno/airc'},
            {'info': 'James Ketrenos wrote the program deploying this AI model with RAG.'},
            {'info': 'James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.'},
            {'info': 'James lives in Portland, Oregon and has three kids. Two are attending Oregon State University and one is attending Willamette University.'}
        ]}
</author>"""

        context += "<input>additional information unrelated to James Ketrenos = ["
        for doc in documents:
            item = {'source': doc["source"], 'article': {'title': doc["title"], 'link': doc["link"], 'text': doc["text"]}}
            context += f"{item}"
        context += """]
</input>
"""
        prompt = f"{context}{search_query}"
        last_prompt = prompt
        last_system = system
        if len(last_messages) != 0:
            message_context = f"{last_messages}"
            prompt = f"{message_context}{prompt}"

        print(f"system len: {len(system)}")
        print(f"prompt len: {len(prompt)}")
        output = client.generate(
            model=MODEL_NAME,
            system=f"{system}{context}",
            prompt=prompt,
            stream=False,
            options={ 'num_ctx': 100000 }
        )
        # Prune off the <think>...</think>
        matches = re.match(r'^<think>(.*?)</think>(.*)$', output['response'], flags=re.DOTALL)
        if matches:
            last_why = matches[1].strip()
            content = matches[2].strip()
        else:
            content = output['response'] # fall back to the raw response
            print(f"[garbled] response>\n{content}")
        print(f"Response>\n{content}")

        last_response = content
        last_messages.extend(({
            'role': 'user',
            'name': 'james',
            'message': search_query
        }, {
            'role': 'assistant',
            'message': content
        }))
        last_messages = last_messages[-10:] # keep only the most recent exchanges