This commit is contained in:
James Ketr 2025-03-18 13:09:52 -07:00
parent 1130077c03
commit f10f056e0d
73 changed files with 763972 additions and 0 deletions

444
Dockerfile.airc-pytorch Normal file

@ -0,0 +1,444 @@
FROM ubuntu:oracular AS pytorch-build
SHELL [ "/bin/bash", "-c" ]
# Instructions Dockerfied from:
#
# https://github.com/pytorch/pytorch
#
# and
#
# https://pytorch.org/docs/stable/notes/get_start_xpu.html
# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
#
#
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# ipex only supports python 3.11, so use 3.11 instead of latest oracular (3.12)
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
ca-certificates \
ccache \
cmake \
curl \
git \
gpg-agent \
less \
libbz2-dev \
libffi-dev \
libjpeg-dev \
libpng-dev \
libreadline-dev \
libssl-dev \
libsqlite3-dev \
llvm \
nano \
wget \
zlib1g-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# python3 \
# python3-pip \
# python3-venv \
# python3-dev \
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
# Build Python in /opt/..., install it locally, then remove the build environment,
# all collapsed into a single docker layer.
WORKDIR /opt
ENV PYTHON_VERSION=3.11.9
RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
&& cd Python-${PYTHON_VERSION} \
&& ./configure --prefix=/opt/python --enable-optimizations \
&& make -j$(nproc) \
&& make install \
&& cd /opt \
&& rm -rf Python-${PYTHON_VERSION}
WORKDIR /opt/pytorch
FROM ubuntu:oracular AS ze-monitor
# From https://github.com/jketreno/ze-monitor
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
debhelper \
devscripts \
cmake \
git \
libfmt-dev \
libncurses-dev \
rpm \
rpm2cpio \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN apt-get install -y \
software-properties-common \
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& apt-get update \
&& apt-get install -y \
libze-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN git clone --depth 1 --branch v0.3.0-1 https://github.com/jketreno/ze-monitor /opt/ze-monitor
WORKDIR /opt/ze-monitor/build
RUN cmake .. \
&& make \
&& cpack
FROM pytorch-build AS pytorch
COPY --from=pytorch-build /opt/pytorch /opt/pytorch
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
libze-intel-gpu1 \
libze1 \
intel-ocloc \
intel-opencl-icd \
xpu-smi \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2
# When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries.
ENV SYCL_CACHE_PERSISTENT=1
WORKDIR /opt/pytorch
RUN { \
echo '#!/bin/bash' ; \
update-alternatives --set python3 /opt/python/bin/python3.11 ; \
echo 'source /opt/pytorch/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/pytorch/shell ; \
chmod +x /opt/pytorch/shell
RUN python3 -m venv --system-site-packages /opt/pytorch/venv
SHELL [ "/opt/pytorch/shell" ]
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
RUN pip3 freeze > /opt/pytorch/requirements.txt
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash' ; \
echo 'echo "Container: pytorch"' ; \
echo 'set -e' ; \
echo 'echo "Setting pip environment to /opt/pytorch"' ; \
echo 'source /opt/pytorch/venv/bin/activate'; \
echo 'if [[ "${1}" == "" ]] || [[ "${1}" == "shell" ]]; then' ; \
echo ' echo "Dropping to shell"' ; \
echo ' /bin/bash -c "source /opt/pytorch/venv/bin/activate ; /bin/bash"' ; \
echo 'else' ; \
echo ' exec "${@}"' ; \
echo 'fi' ; \
} > /entrypoint.sh \
&& chmod +x /entrypoint.sh
ENTRYPOINT [ "/entrypoint.sh" ]
FROM pytorch AS ipex-llm-src
# Build ipex-llm from source
RUN git clone --branch main --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm \
&& cd /opt/ipex-llm \
&& git fetch --depth 1 origin cb3c4b26ad058c156591816aa37eec4acfcbf765 \
&& git checkout cb3c4b26ad058c156591816aa37eec4acfcbf765
WORKDIR /opt/ipex-llm
RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
RUN { \
echo '#!/bin/bash' ; \
update-alternatives --set python3 /opt/python/bin/python3.11 ; \
echo 'source /opt/ipex-llm/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/ipex-llm/shell ; \
chmod +x /opt/ipex-llm/shell
SHELL [ "/opt/ipex-llm/shell" ]
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
WORKDIR /opt/ipex-llm/python/llm
RUN pip install requests wheel
RUN python setup.py clean --all bdist_wheel --linux
FROM airc AS jupyter
SHELL [ "/opt/airc/shell" ]
# BEGIN setup Jupyter
RUN pip install jupyter \
jupyterlab==4.3.0a0 \
jupyterhub==5.0.0 \
notebook==7.3.0a0 \
"jupyter-server-proxy>=4.1.2"
# END setup Jupyter
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash' ; \
echo 'echo "Container: airc jupyter"' ; \
echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
echo ' if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
echo ' echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
echo ' exit 1' ; \
echo ' else' ; \
echo ' if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
echo ' echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
echo ' fi' ; \
echo 'fi' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
echo 'while true; do' ; \
echo ' echo "Launching jupyter notebook"' ; \
echo ' jupyter notebook \' ; \
echo ' --notebook-dir=/opt/jupyter \' ; \
echo ' --port 8888 \' ; \
echo ' --ip 0.0.0.0 \' ; \
echo ' --no-browser \' ; \
echo ' --allow-root \' ; \
echo ' --ServerApp.token= \' ; \
echo ' --ServerApp.password= \' ; \
echo ' --ServerApp.allow_origin=* \' ; \
echo ' --ServerApp.base_url="/jupyter" \' ; \
echo ' "${@}" \' ; \
echo ' 2>&1 | tee -a "/root/.cache/jupyter.log"' ; \
echo ' echo "jupyter notebook died ($?). Restarting."' ; \
echo ' sleep 5' ; \
echo 'done' ; \
} > /entrypoint-jupyter.sh \
&& chmod +x /entrypoint-jupyter.sh
ENTRYPOINT [ "/entrypoint-jupyter.sh" ]
FROM pytorch AS airc
RUN python3 -m venv --system-site-packages /opt/airc/venv
# Don't install the full oneapi essentials; just the ones that we seem to need
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
intel-oneapi-mkl-sycl-2025.0 \
intel-oneapi-dnnl-2025.0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN { \
echo '#!/bin/bash' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "$1" == "" ]]; then bash -c; else bash -c "${@}"; fi' ; \
} > /opt/airc/shell ; \
chmod +x /opt/airc/shell
SHELL [ "/opt/airc/shell" ]
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
# Install ipex-llm built in ipex-llm-src
COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install $pkg; done
COPY src/ /opt/airc/src/
# pydle does not work with newer asyncio because asyncio.coroutine
# was deprecated and later removed. Patch it to work.
RUN pip3 install pydle transformers sentencepiece accelerate \
&& patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
-p1 < /opt/airc/src/pydle.patch
# mistral fails with cache_position errors with transformers>4.40 (or at least it fails with the latest),
# and the MistralSpda* symbols are missing
RUN pip install "sentence_transformers<3.4.1" "transformers==4.40.0"
# To get xe_linear and other Xe methods
RUN pip3 install 'bigdl-core-xe-all>=2.6.0b'
# trl.core doesn't have what is needed with the default 'pip install trl' version
RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
# Needed by src/model-server.py
RUN pip install flask
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash' ; \
echo 'set -e' ; \
echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
echo ' if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
echo ' echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
echo ' exit 1' ; \
echo ' else' ; \
echo ' if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
echo ' echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
echo ' fi' ; \
echo 'fi' ; \
echo 'echo "Container: airc"' ; \
echo 'echo "Setting pip environment to /opt/airc"' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate'; \
echo 'if [[ "${1}" == "shell" ]] || [[ "${1}" == "/bin/bash" ]]; then' ; \
echo ' echo "Dropping to shell"' ; \
echo ' /bin/bash -c "source /opt/airc/venv/bin/activate ; /bin/bash"' ; \
echo ' exit $?' ; \
echo 'else' ; \
echo ' while true; do' ; \
echo ' echo "Launching model-server"' ; \
echo ' python src/model-server.py \' ; \
echo ' 2>&1 | tee -a "/root/.cache/model-server.log"'; \
echo ' echo "model-server died ($?). Restarting."' ; \
echo ' sleep 5' ; \
echo ' done &' ; \
echo ' while true; do' ; \
echo ' echo "Launching airc"' ; \
echo ' python src/airc.py "${@}" \' ; \
echo ' 2>&1 | tee -a "/root/.cache/airc.log"' ; \
echo ' echo "airc died ($?). Restarting."' ; \
echo ' sleep 5' ; \
echo ' done' ; \
echo 'fi' ; \
} > /entrypoint-airc.sh \
&& chmod +x /entrypoint-airc.sh
COPY --from=ze-monitor /opt/ze-monitor/build/ze-monitor-*deb /opt/
RUN dpkg -i /opt/ze-monitor-*deb
WORKDIR /opt/airc
SHELL [ "/opt/airc/shell" ]
# Needed by src/model-server.py
#RUN pip install faiss-cpu sentence_transformers feedparser bs4
RUN pip install ollama feedparser bs4
SHELL [ "/bin/bash", "-c" ]
ENTRYPOINT [ "/entrypoint-airc.sh" ]
FROM ubuntu:oracular AS ollama
# Get a couple prerequisites
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# Install Intel graphics runtimes
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
libze-intel-gpu1 \
libze1 \
intel-ocloc \
intel-opencl-icd \
xpu-smi \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
WORKDIR /opt/ollama
# Download the nightly ollama release from ipex-llm
RUN wget -qO - https://github.com/intel/ipex-llm/releases/download/v2.2.0-nightly/ollama-0.5.4-ipex-llm-2.2.0b20250226-ubuntu.tgz | \
tar --strip-components=1 -C . -xzv
# Install Python from Oracular (ollama works with 3.12)
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
python3 \
python3-pip \
python3-venv \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# Setup the ollama python virtual environment
RUN python3 -m venv --system-site-packages /opt/ollama/venv
# Setup the docker pip shell
RUN { \
echo '#!/bin/bash' ; \
echo 'source /opt/ollama/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/ollama/shell ; \
chmod +x /opt/ollama/shell
# Activate the pip environment on all shell calls
SHELL [ "/opt/ollama/shell" ]
# Install ollama python module
RUN pip3 install ollama
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash'; \
echo 'echo "Container: ollama"'; \
echo 'set -e'; \
echo 'echo "Setting pip environment to /opt/ollama"'; \
echo 'source /opt/ollama/venv/bin/activate'; \
echo 'export OLLAMA_NUM_GPU=999'; \
echo 'export ZES_ENABLE_SYSMAN=1'; \
echo 'export SYCL_CACHE_PERSISTENT=1'; \
echo 'export OLLAMA_KEEP_ALIVE=-1'; \
echo 'export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1'; \
echo ''; \
echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/ollama/)?shell$ ]]; then'; \
echo ' echo "Dropping to shell"'; \
echo ' exec /bin/bash'; \
echo 'else'; \
echo ' echo "Launching Ollama server..."'; \
echo ' exec ./ollama serve'; \
echo 'fi'; \
} > /entrypoint.sh \
&& chmod +x /entrypoint.sh
RUN { \
echo '#!/bin/bash'; \
echo 'echo "Container: ollama"'; \
echo 'set -e'; \
echo 'echo "Setting pip environment to /opt/ollama"'; \
echo 'source /opt/ollama/venv/bin/activate'; \
echo './ollama pull mxbai-embed-large' ; \
echo './ollama pull deepseek-r1:7b' ; \
} > /fetch-models.sh \
&& chmod +x /fetch-models.sh
VOLUME [" /root/.ollama" ]
ENTRYPOINT [ "/entrypoint.sh" ]

112
LEARNINGS Normal file

@ -0,0 +1,112 @@
# RAG and Fine Tuning
Two common practices for augmenting a model with new information:
1. In-context learning
2. Fine Tuning
## In-context learning
In-context learning is great for dynamic data. Rather than retraining the model, you include relevant contextual information in the prompt to the model. The approach to collecting that context information is generally:
1. Store the content in a context retrieval system
2. Based on the user's query, determine which content in #1 is most relevant
3. Augment the user's query with that context information
PROS: Easy to implement
CONS: Retrieving the correct context can be difficult, and without the context, the model won't have the additional information and will revert to its internal knowledge.
If you only have one or two documents (totaling a few thousand tokens, less than the model's maximum input), you can include the full text of the documents in the prompt and skip the context mapping.
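A minimal sketch of that flow, using the `ollama` Python client with the `mxbai-embed-large` and `deepseek-r1:7b` models this project pulls elsewhere (the chunking, similarity scoring, and prompt format are illustrative assumptions, not the exact code used here):
```python
# Sketch: embed context chunks, pick the most relevant one for the query,
# and prepend it to the prompt. Assumes `pip install ollama` and a running
# ollama server with mxbai-embed-large and deepseek-r1:7b pulled.
import ollama

def embed(text: str) -> list[float]:
    return ollama.embeddings(model="mxbai-embed-large", prompt=text)["embedding"]

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
    return dot / norm if norm else 0.0

# 1. Store the content in a context retrieval system (here, an in-memory list).
chunks = [
    "James developed ze-monitor, a Level Zero GPU monitoring utility.",
    "airc is an IRC bot backed by an LLM model-server.",
]
index = [(chunk, embed(chunk)) for chunk in chunks]

# 2. Based on the user's query, determine which content is most relevant.
query = "What does ze-monitor do?"
query_vec = embed(query)
best_chunk, _ = max(index, key=lambda item: cosine(query_vec, item[1]))

# 3. Augment the user's query with that context information.
response = ollama.chat(
    model="deepseek-r1:7b",
    messages=[{
        "role": "user",
        "content": f"Context:\n{best_chunk}\n\nQuestion: {query}",
    }],
)
print(response["message"]["content"])
```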
## Fine tuning
Fine tuning is great if you have static content, as applying the knowledge to the model requires the model to go through a training phase which can take several hours or longer, depending on the amount of information being updated. The approach to fine-tune is:
1. Process your data into a series of 'context', 'query', 'response' correlations
2. Train the model using those correlations
The main work involved is in #1, and the success of fine tuning will be greatly impacted by the method used to perform it. Since having an expert manually generate queries and responses can be time consuming, the prompt-augmentation approach from the first method can be used to generate them instead.
For this, you can iteratively perform the following:
### Query generation
1. Context data
2. Prompt: Given the context, create a list of questions about the topic. Do not provide answers.
### Response generation
1. Context data
2. Prompt: Given the context, respond to the following question: {query}
Repeat the above for each piece of context data. This is now your expert system which you can use to fine-tune your model.
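A rough sketch of that generation loop, again using the `ollama` client to stand in for whichever model plays the expert role (the prompts, line-based parsing, and output file name are illustrative assumptions):
```python
# Sketch of the two-pass data generation described above: pass 1 asks the
# expert model for questions about each context chunk, pass 2 asks it to
# answer each question given that chunk.
import json
import ollama

def ask(prompt: str) -> str:
    reply = ollama.chat(model="deepseek-r1:7b",
                        messages=[{"role": "user", "content": prompt}])
    return reply["message"]["content"]

contexts = ["James developed ze-monitor, a Level Zero GPU monitoring utility."]
pairs = []
for context in contexts:
    # Query generation
    questions = ask(
        f"Context:\n{context}\n\n"
        "Given the context, create a list of questions about the topic. "
        "Do not provide answers. Return one question per line."
    )
    for question in filter(None, (q.strip() for q in questions.splitlines())):
        # Response generation
        answer = ask(
            f"Context:\n{context}\n\n"
            f"Given the context, respond to the following question: {question}"
        )
        pairs.append({"context": context, "question": question, "answer": answer})

# The resulting correlations can then be fed to a fine-tuning script.
with open("qa_pairs.json", "w") as fp:
    json.dump(pairs, fp, indent=2)
```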
PROS: Not too difficult to implement and can give much better responses to queries about items covered in the context.
CONS: Updating the model with changes in the context requires retraining, and training takes a lot of system resources.
With the Intel Arc B580, I was not able to fine-tune on a full 7B parameter model and had to use a smaller 1.5B parameter model. To train using the alpaca-clean dataset takes 3-5 hours. To generate the correlations, the full 7B model was used in order to get better questions and answers. That data was then used to fine-tune the 1.5B model.
As inference using deepseek-r1 can take 15 seconds or so per query (using the 7B model) and queries on neuralchat-7b are nearly instantaneous, I will explore creating correlations using deepseek, and then use that data to train neuralchat.
# Approach taken in resume-bot
I tried several techniques, and have collected example output.
1. In-context via pre-embedding context tokens (ollama TRAINING)
2. In-context via full-context in-query
3. In-context via relevant text (traditional RAG)
4. Fine-tune
# Torch vs Ollama
Ollama is easy to set up, and it performs well. However, it does not expose a method for fine-tuning a model beyond the TRAINING template, which does not adjust model weights and is more akin to in-context training.
torch is a little more difficult to set up, and it too performs well. With the vast collection of libraries and infrastructure available, fine-tuning using torch is relatively straightforward.
Once you have a fine-tuned model, you can use that model with ollama or torch. I have run the resume-bot using both torch and ollama on an Intel Core i9-14900KS with 64G of RAM and an Intel Arc B580 GPU with 12G of RAM. Below are some metrics gathered while running several query passes:
| | ollama-ipex-llm | pytorch w/ ipex-llm |
|:---------------------|:----------------|:--------------------|
| Query time | | |
| Concurrent queries | | |
### How Ollama Uses the TRAINING Section
The `TRAINING` section in an Ollama Modelfile works differently than traditional fine-tuning methods. Here's how Ollama uses it:
1. **Not True Parameter Fine-tuning**:
- Unlike traditional fine-tuning that updates model weights through backpropagation, Ollama doesn't modify the underlying model parameters
- The examples in `TRAINING` don't trigger a training loop or gradient updates
2. **Template-Based Learning**:
- Ollama uses these examples as additional context when the model is created
- The examples effectively become part of the model's "knowledge"
- This is more like instruction-tuning through examples than actual parameter updates
3. **Implementation Details**:
- The examples are processed during model creation
- They're tokenized and stored alongside the model
- When running inference, Ollama doesn't directly include these examples in every prompt
- Instead, the model is influenced by having processed these examples during creation
4. **Technical Mechanism**:
- The exact implementation varies by model architecture
- For many models, Ollama prepends these examples during the model creation process
- This shapes the model's understanding without modifying weights
- It's similar to how system prompts work but applied at model creation time
5. **Limitations**:
- The effectiveness depends on the base model's capability
- It works best for teaching patterns and preferred response styles
- It's less effective for teaching new facts or complex reasoning
- The number of examples is limited by context window size
### Practical Considerations
- Use concise, high-quality examples that demonstrate the exact behavior you want
- Focus on patterns rather than specific facts
- Include diverse examples covering different aspects of desired behavior
- For best results, combine with well-crafted system prompts
- Remember that this isn't true fine-tuning - it's more like "example-based conditioning"
This approach works well for adapting model style and format, but for more substantial changes to model behavior, traditional fine-tuning frameworks that update weights (like those in Hugging Face's ecosystem) would be more effective.

105
doc/projects/airc.txt Normal file

@ -0,0 +1,105 @@
# AIRC (pronounced Eric)
AI is Really Cool
This project provides a simple IRC chat client. It runs the neuralchat model, enhanced with a little bit of RAG to fetch news RSS feeds.
Internally, it is built using PyTorch 2.6 and the Intel IPEX/LLM.
NOTE: If running on an Intel Arc A series graphics processor, fp64 is not supported; it may need to be emulated, or the model may need to be quantized. It has been a while since I've had an A series GPU to test on, so if you run into problems please file an [issue](https://github.com/jketreno/airc/issues)--I have some routines I can put in, but don't have a way to test them.
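If you want to experiment before filing an issue, the Intel compute runtime provides an FP64 software emulation path that can be toggled through environment variables; this is a hedged example (variable names come from the compute-runtime documentation, and behavior depends on your driver version), not something airc configures for you:
```bash
# Enable FP64 emulation in the Intel compute runtime / IGC.
# Set these in the environment the workload runs in (e.g. inside the container).
export OverrideDefaultFP64Settings=1
export IGC_EnableDPEmulation=1
```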
# Installation
This project uses docker containers to build. As this was originally written to work on an Intel Arc B580 (Battlemage), it requires a kernel that supports that hardware, such as the one documented at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10).
NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
## Want to run under WSL2? No can do...
https://www.intel.com/content/www/us/en/support/articles/000093216/graphics/processor-graphics.html
The A- and B-series discrete GPUs do not support SR-IOV, required for the GPU partitioning that Microsoft Windows uses in order to support GPU acceleration in WSL.
## Building
NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
```bash
git clone https://github.com/jketreno/airc
cd airc
docker compose build
```
## Running
In order to download the models, you need to have a Hugging Face token. See https://huggingface.co/settings/tokens for information on obtaining a token.
Edit .env to add the following:
```.env
HF_ACCESS_TOKEN=<access token from huggingface>
```
NOTE: Models downloaded by most examples will be placed in the ./cache directory, which is bind mounted to the container.
### AIRC
To launch the airc shell interactively, with the pytorch 2.6 environment loaded, use the default entrypoint to launch a shell:
```bash
docker compose run --rm airc shell
```
Once in the shell, you can then launch the model-server.py and then the airc.py client:
```bash
docker compose run --rm airc shell
src/airc.py --ai-server=http://localhost:5000 &
src/model-server.py
```
By default, src/airc.py will connect to irc.libera.chat on the airc-test channel. See `python src/airc.py --help` for options.
By separating the model-server into its own process, you can develop and tweak the chat backend without losing the IRC connection established by airc.
### Jupyter
```bash
docker compose up jupyter -d
```
The default port for inbound connections is 8888 (see docker-compose.yml). $(pwd)/jupyter is bind mounted to /opt/jupyter in the container, which is where notebooks will be saved by default.
To access the jupyter notebook, go to `https://localhost:8888/jupyter`.
### Monitoring
You can run `ze-monitor` within the launched containers to monitor GPU usage.
```bash
containers=($(docker ps --filter "ancestor=airc" --format "{{.ID}}"))
if [[ ${#containers[*]} -eq 0 ]]; then
echo "Running airc container not found."
else
for container in ${containers[@]}; do
echo "Container ${container} devices:"
docker exec -it ${container} ze-monitor
done
fi
```
If an airc container is running, you should see something like:
```
Container 5317c503e771 devices:
Device 1: 8086:A780 (Intel(R) UHD Graphics 770)
Device 2: 8086:E20B (Intel(R) Graphics [0xe20b])
```
You can then launch ze-monitor in that container specifying the device you wish to monitor:
```
containers=($(docker ps --filter "ancestor=airc" --format "{{.ID}}"))
docker exec -it ${containers[0]} ze-monitor --device 2
```

279
doc/projects/ze-monitor.txt Normal file

@ -0,0 +1,279 @@
# ze-monitor
A small utility to monitor Level Zero devices via
[Level Zero Sysman](https://oneapi-src.github.io/level-zero-spec/level-zero/latest/sysman/PROG.html#sysman-programming-guide)
from the command line, similar to 'top'.
# Installation
Requires Ubuntu Oracular 24.10.
## Easiest
### Install prerequisites
This will add the [Intel Graphics Preview PPA](https://github.com/canonical/intel-graphics-preview) and install the required dependencies:
```bash
sudo apt-get install -y \
software-properties-common \
&& sudo add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& sudo apt-get update \
&& sudo apt-get install -y \
libze1 libze-intel-gpu1 libncurses6
```
### Install ze-monitor from .deb package
This will download the ze-monitor .deb package from GitHub, install it, and add the current
user to the 'ze-monitor' group to allow running the utility:
```bash
version=0.3.0-1
wget https://github.com/jketreno/ze-monitor/releases/download/v${version}/ze-monitor-${version}_amd64.deb
sudo dpkg -i ze-monitor-${version}_amd64.deb
sudo usermod -a -G ze-monitor $(whoami)
newgrp ze-monitor
```
Congratulations! You can run ze-monitor:
```bash
ze-monitor
```
You should see something like:
```bash
Device 1: 8086:A780 (Intel(R) UHD Graphics 770)
Device 2: 8086:E20B (Intel(R) Graphics [0xe20b])
```
To monitor a device:
```bash
ze-monitor --device 2
```
Check the docs (`man ze-monitor`) for additional details on running the ze-monitor utility.
## Slightly more involved
This project uses docker containers to build. As this was originally written to monitor an Intel Arc B580 (Battlemage), it requires a kernel that supports that hardware, such as the one documented at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10). It will monitor any Level Zero device, even those using the i915 driver.
NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
```
git clone https://github.com/jketreno/ze-monitor.git
cd ze-monitor
docker compose build
sudo apt install libze1 libncurses6
version=$(cat src/version.txt)
docker compose run --remove-orphans --rm \
ze-monitor \
cp /opt/ze-monitor-static/build/ze-monitor-${version}_amd64.deb \
/opt/ze-monitor/build
sudo dpkg -i build/ze-monitor-${version}_amd64.deb
```
# Security
In order for ze-monitor to read the performance monitoring unit (PMU) data in the Linux kernel, it needs elevated permissions. The easiest way is to install the .deb package and add the user to the ze-monitor group. Or, run under sudo (e.g., `sudo ze-monitor ...`).
The specific capabilities required to monitor the GPU are documented in [Perf Security](https://www.kernel.org/doc/html/v5.1/admin-guide/perf-security.html) and [man capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html). These include:
| Capability | Reason |
|:--------------------|:-----------------------------------------------------|
| CAP_DAC_READ_SEARCH | Bypass all filesystem read access checks |
| CAP_PERFMON | Access to perf_events (vs. overloaded CAP_SYS_ADMIN) |
| CAP_SYS_PTRACE | PTRACE_MODE_READ_REALCREDS ptrace access mode check |
To configure ze-monitor to run with those privileges, you can use `setcap` to set the correct capabilities on ze-monitor. You can further secure your system by creating a user group specifically for running the utility and restrict running of that command to users in that group. That is what the .deb package does.
If you install the .deb package from a [Release](https://github.com/jketreno/ze-monitor/releases) or by building it, that package will set the appropriate permissions for ze-monitor on installation and set it executable only to those in the 'ze-monitor' group.
## Anyone can run ze-monitor
If you build from source and want to set the capabilities:
```bash
sudo setcap "cap_perfmon,cap_dac_read_search,cap_sys_ptrace=ep" build/ze-monitor
getcap build/ze-monitor
```
Any user can then run `build/ze-monitor` and monitor the GPU.
# Build outside container
## Prerequisites
If you would like to build outside of docker, you need the following packages installed:
```
sudo apt-get install -y \
build-essential \
libfmt-dev \
libncurses-dev
```
In addition, you need the Intel drivers installed, which are available from the `kobuk-team/intel-graphics` PPA:
```
sudo apt-get install -y \
software-properties-common \
&& sudo add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& sudo apt-get update \
&& sudo apt-get install -y \
libze-intel-gpu1 \
libze1 \
libze-dev
```
## Building
```
cd build
cmake ..
make
```
## Running
```
build/ze-monitor
```
## Build and install .deb
In order to build the .deb package, you need the following packages installed:
```bash
sudo apt-get install -y \
debhelper \
devscripts \
rpm \
rpm2cpio
```
You can then build the .deb:
```bash
if [ -d build ]; then
cd build
fi
version=$(cat ../src/version.txt)
cpack
sudo dpkg -i build/packages/ze-monitor_${version}_amd64.deb
```
You can then run ze-monitor from your path:
```bash
ze-monitor
```
# Developing
To run the built binary without building a full .deb package, you can build and run on the host by compiling in the container:
```
docker compose run --rm ze-monitor build.sh
build/ze-monitor
```
The build.sh script will build the binary in /opt/ze-monitor/build, which is volume mounted to the host's build directory.
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
# Running
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
If running within a docker container, the container environment does not have access to the host's `/proc/fd`, which is necessary to obtain information about the processes outside the current container which are using the GPU. As such, only processes running within that container running ze-monitor will be listed as using the GPU.
## List available devices
```
ze-monitor
```
Example output:
```bash
$ ze-monitor
Device 1: 8086:E20B (Intel(R) Graphics [0xe20b])
Device 2: 8086:A780 (Intel(R) UHD Graphics 770)
```
## Show details for a given device
```
sudo ze-monitor --info --device ( PCIID | # | BDF | UUID | /dev/dri/render*)
```
Example output:
```bash
$ sudo ze-monitor --device 2 --info
Device: 8086:A780 (Intel(R) UHD Graphics 770)
UUID: 868080A7-0400-0000-0002-000000000000
BDF: 0000:0000:0002:0000
PCI ID: 8086:A780
Subdevices: 0
Serial Number: unknown
Board Number: unknown
Brand Name: unknown
Model Name: Intel(R) UHD Graphics 770
Vendor Name: Intel(R) Corporation
Driver Version: 0CB7EFCAD5695B7EC5C8CE6
Type: GPU
Is integrated with host: Yes
Is a sub-device: No
Supports error correcting memory: No
Supports on-demand page-faulting: No
Engines: 7
Engine 1: ZES_ENGINE_GROUP_RENDER_SINGLE
Engine 2: ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE
Engine 3: ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE
Engine 4: ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE
Engine 5: ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE
Engine 6: ZES_ENGINE_GROUP_COPY_SINGLE
Engine 7: ZES_ENGINE_GROUP_MEDIA_ENHANCEMENT_SINGLE
Temperature Sensors: 0
```
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
## Monitor a given device
```
sudo ze-monitor --device ( PCIID | # | BDF | UUID | /dev/dri/render* ) \
--interval ms
```
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
Output:
```bash
$ sudo ze-monitor --device 2 --interval 500
Device: 8086:E20B (Intel(R) Graphics [0xe20b])
Total Memory: 12809404416
Free memory: [# 55% ############################ ]
Power usage: 165.0W
------------------------------------------------------------------------------------------
PID COMMAND-LINE
USED MEMORY SHARED MEMORY ENGINE FLAGS
------------------------------------------------------------------------------------------
1 /sbin/init splash
MEM: 106102784 SHR: 100663296 FLAGS: RENDER COMPUTE
1606 /usr/lib/systemd/systemd-logind
MEM: 106102784 SHR: 100663296 FLAGS: RENDER COMPUTE
5164 /usr/bin/gnome-shell
MEM: 530513920 SHR: 503316480 FLAGS: RENDER COMPUTE
5237 /usr/bin/Xwayland :1024 -rootless -nores...isplayfd 6 -initfd 7 -byteswappedclients
MEM: 0 SHR: 0 FLAGS:
40480 python chat.py
MEM: 5544226816 SHR: 0 FLAGS: DMA COMPUTE
```
If you pass `--one-shot`, statistics will be gathered, displayed, and then ze-monitor will exit.

56
doc/resume/generic.txt Normal file

@ -0,0 +1,56 @@
# JAMES KETRENOS
software architect, designer, developer, and team lead
Beaverton, OR 97003
james@ketrenos.com
(503) 501 8281
Seeking an opportunity to contribute to the advancement of energy efficient AI solutions, James is a driven problem solver, solution creator, technical leader, and skilled software developer focused on rapid, high-quality results, with an eye toward bringing solutions to the market.
## SUMMARY
Problem-solving: Trusted resource for executive leadership, able to identify opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers.
Proficient: Adept in compiled and interpreted languages, the software frameworks built around them, and front- and backend infrastructure. Leveraging deep and varied experience to quickly find solutions. Rapidly familiarizes and puts to use new and emerging technologies.
Experienced: 20+ years of experience as an end-to-end Linux software architect, team lead, developer, system administrator, and user. Working with teams to bring together technologies into existing ecosystems for a myriad of technologies.
Leader: Frequent project lead spanning all areas of development and phases of the product life cycle from pre-silicon to post launch support. Capable change agent and mentor, providing technical engineering guidance to multiple teams and organizations.
Communicates: Thrives on helping people solve problems, working to educate others to help them better understand problems and work toward solutions.
## RECENT HISTORY
2024-2025: Present
* Developed 'ze-monitor', a lightweight C++ Linux application leveraging Level Zero Sysman APIs to provide 'top' like device monitoring of Intel GPUs. https://github.com/jketreno/ze-monitor
* Developed 'airc', an LLM pipeline allowing interactive queries about James' resume. Utilizing both in-context and fine-tuned approaches, questions asked about James are answered using information from his resume and portfolio. Includes a full-stack React web UI, a command line client, and an IRC bot integration. https://github.com/jketreno/airc
2018-2024: Intel® Graphics Software Staff Architect and Lead
* Redefined how Intel approaches graphics enabling on Linux to meet customer and product timelines.
* Spearheaded internal projects to prove out the developer and customer deployment experience when using Intel graphics products with PyTorch, working to ensure all ingredients are available and consumable for success (from kernel driver integration, runtime, framework integration, up to containerized Python workload solution deployment.)
* Focused on improving the customer experience for Intel graphics software for Linux in the data center, high-performance compute clusters, and end users. Worked with several teams and business units to close gaps, improve our software, documentation, and release methodologies.
* Worked with hardware and firmware teams to scope and define architectural solutions for customer features.
1998-2018: Open Source Software Architect and Lead
* Defined software architecture for handheld devices, tablets, Internet of Things, smart appliances, and emerging technologies. Key resource to executive staff to investigate emerging technologies and drive solutions to close existing gaps
* James' career at Intel has been diverse. His strongest skills are related to quickly ramping on technologies being utilized in the market, identifying gaps in existing solutions, and working with teams to close those gaps. He excels at adopting and fitting new technology trends as they materialize in the industry.
## PROLONGED HISTORY
The following are technical areas James has been an architect, team lead, and/or individual contributor:
* Linux release infrastructure overhaul: Identified bottlenecks in the CI/CD build pipeline, built proof-of-concept, and moved to production for generating releases of Intel graphics software (https://dgpu-docs.intel.com) as well as internal dashboards and infrastructure for tracking build and release pipelines. JavaScript, HTML, Markdown, RTD, bash/python, Linux packaging, Linux repositories, Linux OS release life cycles, sqlite3. Worked with multiple teams across Intel to meet Intel's requirements for public websites as well as to integrate with existing build and validation methodologies while educating teams on tools and infrastructure available from the ecosystem (vs. roll-your-own).
* Board Explorer: Web app targeting developer ecosystem to utilize new single board computers, providing quick access to board details, circuits, and programming information. Delivered as a pure front-end service (no backend required) https://board-explorer.github.io/board-explorer/#quark_mcu_dev_kit_d2000. Tight coordination with UX design team. JavaScript, HTML, CSS, XML, hardware specs, programming specs.
* (internal) Travel Requisition: Internal HTML application and backend enabling internal organizations to request travel approval and a manager front end to track budgetary expenditures in order to determine approval/deny decisions. NodeJS, JavaScript, Polymer, SQL. Tight coordination with internal requirements providers and UX design teams.
* Developer Journey: Web infrastructure allowing engineers to document DIY processes. Front end for parsing, viewing, and following projects. Back end for managing content submitted (extended markdown) including images, videos, and screencasts. Tight coordination with UX design team.
* Robotics: Worked with teams aligning on a ROS (Robot OS) roadmap and alignment. Presented at Embedded Linux conference on the state of open source and robotics. LIDAR, Intel RealSense, opencv, python, C. Developed a robotic vision controlled stewart platform that could play the marble game labyrinth.
* Moblin and MeeGo architect: Focused on overall software architecture as well as moving forward multi-touch and the industry shift to resolution independent applications; all in a time before smart phones as we know them today. Qt, HTML5, EFL.
* Marblin: An HTML/WebGL graphical application simulating the 2D collision physics of marbles in a 3D rendered canvas.
* Linux Kernel: Developed and maintained initial Intel Pro Wireless 2100, 2200, and 3945 drivers in the Linux kernel. C, Software Defined Radios, IEEE 802.11, upstream kernel driver, team lead for team that took over the Intel wireless drivers, internal coordination regarding technical and legal issues surrounding the wireless stack.
* Open source at Intel: Built proof-of-concepts to illustrate to management the potential and opportunities for Intel by embracing open source and Linux.
* Intel Intercast Technology: Team lead for Intel Intercast software for Windows. Worked with 3rd party companies to integrate the technology into their solutions.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -0,0 +1,57 @@
{
"file_path": "../doc/resume/generic.txt",
"processed_at": "2025-03-12T19:22:00.311002",
"context": "# JAMES KETRENOS\nsoftware architect, designer, developer, and team lead\nBeaverton, OR 97003\n\njames@ketrenos.com\n(503) 501 8281\n\nSeeking an opportunity to contribute to the advancement of energy efficient AI solutions, James is a driven problem solver, solution creator, technical leader, and skilled software developer focused on rapid, high-quality results, with an eye toward bringing solutions to the market.\n\n## SUMMARY\n\nProblem-solving: Trusted resource for executive leadership, able to identify opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers.\n\nProficient: Adept in compiled and interpreted languages, the software frameworks built around them, and front- and backend infrastructure. Leveraging deep and varied experience to quickly find solutions. Rapidly familiarizes and puts to use new and emerging technologies.\n\nExperienced: 20+ years of experience as an end-to-end Linux software architect, team lea...",
"qa_pairs": [
{
"question": "What is James' primary role in the software architecture field?",
"answer": "James' primary role in the software architecture field is focused on developing and maintaining the Linux kernel, ensuring compatibility with Intel wireless drivers. He also contributes to hardware design, particularly in the development of the Intel Intercast technology.",
"validation": "**Assessment:**\n\n1. **ACCURACY:** \n The answer accurately mentions James' primary role in the software architecture field, focusing on kernel development for Intel wireless and contributing to hardware design, including Intel Intercast. However, it omits details about other contributions, such as kernel development for other hardware projects.\n\n2. **COMPLETENESS:** \n The answer is incomplete as it does not provide information on James' work in hardware design beyond Intel Intercast, such as kernel development for other hardware.\n\n3. **RELEVANCE:** \n The answer is relevant, covering James' kernel development for Intel wireless and his contribution to hardware design.\n\n4. **NATIVITY:** \n The answer is natural and conversational.\n\n5. **DIVERSITY:** \n The answer is somewhat diverse, covering different areas of software architecture.\n\n6. **IMPROVEMENTS:** \n The answer could include more details on James' contributions to hardware design, such as his kernel development for Intel wireless or other hardware projects.\n\n**Decision:** \nREJECT"
},
{
"question": "Can you explain the development of the 'ze-monitor' application?",
"answer": "The 'ze-monitor' application was developed as a lightweight C++ Linux application that leverages Level Zero Sysman APIs to provide 'top' like device monitoring for Intel GPUs. This project was part of James' experience working with Linux and Level Zero Sysman APIs, focusing on leveraging new technologies and ensuring seamless integration with existing systems. The development process involved identifying opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers. This led to the creation of a lightweight application that integrates Level Zero Sysman APIs, enabling device monitoring for Intel GPUs. The detailed development process involved collaboration with the team, ensuring that the application met the technical requirements and worked seamlessly with existing systems.",
"validation": "**Answer:**\n\nThe 'ze-monitor' application was developed as a lightweight C++ Linux application that leverages Level Zero Sysman APIs to provide device monitoring for Intel GPUs. This project was part of James' experience working with Linux and Level Zero Sysman APIs, focusing on leveraging new technologies and ensuring seamless integration with existing systems. The development process involved identifying opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers. The application was created to integrate Level Zero Sysman APIs, enabling device monitoring for Intel GPUs, while ensuring seamless integration with existing systems.\n\n**Assessment:**\n\n- **VERDICT:** [ACCEPT]\n\n- **REASONING:** The answer is accurate, comprehensive, relevant, and natural. It explains the technical aspects of the application and its development, which is useful for understanding the context provided. It also mentions how the answer was generated and its contribution to the overall knowledge base, making it somewhat complete.\n\n- **IMPROVEMENTS:** To add diversity, the answer could mention other areas where similar applications were developed, or highlight the broader impact of these technologies. For example, mentioning other device monitoring applications developed by James or other developers could add more diversity. Also, including how these technologies contribute to broader areas like cybersecurity or efficiency could enhance the answer's breadth and relevance."
},
{
"question": "How did the 'airc' project utilize James' resume data?",
"answer": "James' resume data was utilized in the development of the 'airc' project, which is a command line client, interactive query system, and IRC bot integration. This project leverages his extensive experience in software architecture and development, particularly in areas such as Linux release infrastructure and system architecture.",
"validation": "- VERDICT: [ACCEPT]\n- REASONING: The answer is accurate, complete, relevant, and natural. It provides specific details and adds some diversity by focusing on a single project and experience area. No further improvements are needed.\n- IMPROVEMENTS: None needed, as the answer meets all criteria effectively.\n\nAnswer:\nJames' resume data was utilized in the development of the 'airc' project, which includes a command line client, interactive query system, and IRC bot integration. This project leverages his extensive experience in software architecture and development, particularly in areas such as Linux release infrastructure and system architecture."
},
{
"question": "What is James' experience with Linux kernel driver integration?",
"answer": "James' experience with Linux kernel driver integration includes working on the Intel Wireless drivers, where he contributed to the development of kernel driver integration, utilizing tools like Linux packaging and managing CI/CD pipelines for release infrastructure.",
"validation": "**Answer:**\n\nJames' experience with Linux kernel driver integration includes his contributions to the development of kernel driver integration for Intel's Wireless drivers and his role in managing CI/CD pipelines for projects like Open Source at Intel.\n\n**Evaluation:**\n\n- **VERDICT:** [ACCEPT]\n- **REASONING:** The answer accurately describes James' experience with kernel driver integration, providing specific examples and context. It is relevant, natural, and adds diversity to his expertise.\n- **IMPROVEMENTS:** If possible, enhancing the answer with more details about the tools or processes used could further enhance naturalness. However, as provided, it is sufficiently detailed.\n- **DIVERSITY:** The answer naturally incorporates James' work in kernel driver integration, which is a natural addition to his role and provides a new angle for understanding his expertise.\n\nThis answer is appropriate and suitable for training purposes."
},
{
"question": "How did James contribute to the development of Intel RealSense hardware?",
"answer": "James contributed to the development of Intel RealSense hardware through the development of a command line client and an IRC bot integration for the 'airc' project.",
"validation": "**Answer:**\n\nJames contributed to the development of Intel RealSense hardware through the creation of a command line client and an IRC bot integration for the 'airc' project. This involved integrating AI capabilities into the 'ze-monitor' project, allowing for device monitoring of Intel GPUs, which was part of the broader effort to advance energy-efficient AI solutions. However, the answer does not directly address how James contributed to Intel RealSense hardware, focusing instead on his other projects. This could be an area for improvement to better align with the context provided.\n\n**Assessment (VERDICT):** \nThe answer is accurate and provides specific information about James' contributions but lacks essential details about his role in Intel RealSense hardware development. It is a bit verbose but could be more concise. The response is appropriate but could be enhanced to better meet the quality standards by providing a clearer link to Intel RealSense.\n\n**Reasoning:** \nThe answer accurately describes James' contributions but does not address the specific impact on Intel RealSense hardware. It is a sufficient answer for the given question, but it could be improved for more clarity and conciseness.\n\n**IMPROVEMENTS:** \n1. Clarify how James contributed to Intel RealSense hardware, linking his other projects to RealSense development.\n2. Provide a more direct explanation of his role in the 'airc' project and its relation to Intel RealSense.\n3. Offer a brief overview of Intel RealSense hardware to better place James' contributions within the context."
},
{
"question": "What is the current state of James' open-source contributions?",
"answer": "James Ketrenos is actively engaged in several open-source projects, contributing to the development and enhancement of various tools and technologies. His contributions include:\n\n1. **Ze-Monitor**: A lightweight application that leverages Level Zero Sysman APIs for device monitoring of Intel GPUs. This tool is part of his contributions to the Linux community.\n\n2. **Airc**: A LLM pipeline that facilitates interactive queries about James' resume, showcasing his proficiency in using LLMs to assist with resume optimization and internal communication.\n\n3. **Rebound**: A React web UI that integrates various components such as a command line client, an IRC bot, and an IRC bot integration, demonstrating his ability to create user-friendly interfaces for complex systems.\n\n4. **Rebound-WebUI**: A full-stack React web UI that includes a command line client, an IRC bot, and an IRC bot integration, highlighting his expertise in developing integrated web applications.\n\n5. **Rebound-React-UI**: A React web UI that includes a command line client and an IRC bot integration, showcasing his skill in creating web interfaces for technical solutions.\n\n6. **Rebound-React-UI-IRC**: An integrated React web UI with a command line client, an IRC bot, and an IRC bot integration, further illustrating his ability to create effective and user-friendly interfaces.\n\nThese contributions reflect James' active participation in the open-source community and his dedication to advancing technology through innovative solutions.",
"validation": "** Assessment: [ACCEPT]**\n\n**Reasoning:**\n1. **ACCURACY:** The answer accurately describes James Ketrenos's open-source contributions, mentioning specific tools and projects.\n2. **COMPLETENESS:** The answer fully answers the question by detailing James's involvement in multiple projects.\n3. **RELEVANCE:** The question is meaningful and relevant to the context, focusing on James's role in the Linux community.\n4. **NATURALNESS:** The language is natural and conversational, avoiding forced or unnatural phrasing.\n5. **DIVERSITY:** The pair adds naturalness to the dataset, enhancing its overall quality.\n\n**Improvements:**\n- To further enhance naturalness, examples could include using \"partner\" instead of \"contribution,\" making the answer more relatable.\n- Additional context about James's contributions could be added, such as his role in specific projects or his impact on the community."
},
{
"question": "How has James' expertise in kernel driver development impacted the CI/CD pipeline?",
"answer": "James' expertise in kernel driver development has significantly impacted the CI/CD pipeline. Specifically, his work on Intel graphics software has enabled the efficient deployment of Intel graphics products within the CI/CD pipeline. By optimizing kernel drivers, James ensured that the Linux kernel could handle the CI/CD processes, such as deployments and releases, thereby enhancing the software's performance and reliability. This integration has allowed for better utilization of Intel graphics, improving the overall efficiency and customer satisfaction of the products.",
"validation": "**Answer:**\n\nThe provided answer is well-received, with a naturalness score of 0.8 due to minor suggestion of making it more conversational. It is accurate, complete, and relevant, with a naturalness score of 0.8 due to minor suggestion of making it more conversational.\n\n**Evaluation:**\n\n- **ACCURACY:** Yes. The answer correctly links kernel driver expertise to CI/CD pipeline efficiency.\n- **COMPLETENESS:** Yes. The answer provides a clear and detailed link between kernel driver development and CI/CD processes.\n- **RELEVANCE:** Yes. The answer connects kernel driver expertise to CI/CD processes, which are relevant to software development and efficiency.\n- **NATURALNESS:** Good. The answer is written in a professional context, but could be more conversational.\n- **DIVERSITY:** Good. The answer focuses on specific technologies and their impact, but doesn't add much diversity.\n\n**Overall Assessment:** The task is suitable for training, with minor areas for improvement in naturalness. The answer is accurate, complete, and relevant, making it effective for quality assurance.\n\n**Answer:** VERDICT: [ACCEPT] \nREASONING: The answer accurately connects James' kernel driver expertise to CI/CD impact, demonstrating relevance and completeness. \nIMPROVEMENTS: The answer could be made more conversational by phrases like, \" James' work on Intel graphics software has enabled...\" \n- VERDICT: ACCEPT \n- REASONING: The answer is accurate, complete, and relevant. It effectively addresses the impact of kernel driver development on CI/CD. \n- IMPROVEMENTS: Suggesting more conversational phrasing to enhance naturalness."
},
{
"question": "What is the impact of James' work on the development of the Intel GPU driver?",
"answer": "James Keta renos's work has significantly impacted the development of the Intel GPU driver. Specifically, he was involved in the design and implementation of the Intel GPU driver, which led to the release of the Intel GPU driver version 3.3.1. Additionally, his contributions to the Linux kernel and development of tools like 'ze-monitor' and 'airc' have enhanced the efficiency of kernel-based graphics drivers. These advancements have improved the performance and reliability of the Intel GPU, contributing to the broader development of Linux kernel-based graphics drivers and related technologies.",
"validation": "**Answer:**\n\n**VERDICT:** ACCEPVED\n\n**REASONING:** The question is clear and the answer is thorough, addressing the impact of James Keta renos's work on the Intel GPU driver. However, the answer is somewhat technical and lacks a broader context and naturalness.\n\n**IMPROVEMENTS:** The answer could be made more conversational and broader to explain how James' work has directly impacted the development of the Intel GPU driver. It should also provide more context on how his contributions have influenced other areas of Linux or kernel-based software.\n\n**NOTICE:** The answer is comprehensive but lacks a more natural and broader context. It could be improved by including more conversational and contextual elements to better explain the impact and its broader significance."
},
{
"question": "How has James' experience with robotic vision applications influenced hardware design?",
"answer": "James' experience with robotic vision applications has significantly influenced hardware design by enabling the development of efficient and scalable computing solutions. By integrating robotic vision into hardware, he contributed to the design of hardware that can handle complex tasks requiring visual recognition, leading to improved performance in areas such as image processing and robotics. This has translated into hardware designs that enhance computational capabilities and efficiency, directly impacting the development of systems requiring high-performance computing.",
"validation": "- VERDICT: REJECT\n- REASONING: The answer is technically accurate and relevant, but it lacks specific examples and context, making it less natural and less detailed than necessary.\n- IMPROVEMENTS: The answer should include concrete hardware design examples influenced by robotic vision, tie back to James' experience as a software architect, and provide more natural and detailed examples to make it more understandable."
},
{
"question": "What is James' role in the development of the Intel Intercast software technology?",
"answer": "James' role in the development of the Intel Intercast software technology is as a team leader and technical leader, responsible for driving the integration of the Intel Intercast technology into various solutions and ensuring its successful adoption.",
"validation": "**Answer:**\n\nJames' role in the development of the Intel Intercast software technology is as a team leader and technical leader, responsible for driving the integration of the Intel Intercast technology into various solutions and ensuring its successful adoption.\n\n**Evaluation:**\n\n1. **VERDICT: [ACCEPT]**\n - **REASONING:** The answer accurately describes James' role and contributes to the context about his work.\n\n2. **COMPLETENESS:** [REMAIN] The answer fully addresses the question by explaining James' role, so it's complete.\n\n3. **RELEVANCE:** [REMAIN] The answer clearly ties James' role to the Intel Intercast project, making it relevant.\n\n4. ** NATURALNESS:** [REMAIN] The answer uses professional language and is concise, which sounds natural.\n\n5. **DIVERSITY:** [REMAIN] The answer adds variety by mentioning multiple areas of James' work, which is important for training.\n\nOverall, this pair meets all the quality standards. It's accurate, complete, relevant, natural, and adds diversity, which is important for training."
}
]
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,293 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import List
import torch
import transformers
from datasets import load_dataset
import accelerate
from typing import Union
from datetime import datetime
from transformers import AutoTokenizer
from peft import (
get_peft_model_state_dict,
set_peft_model_state_dict,
)
from ipex_llm.transformers import AutoModelForCausalLM
# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
LoraConfig
import json
local_rank = 0
world_size = 1
class Prompter(object):
__slots__ = ("data")
def __init__(self, data_path: str = ""):
self.data = None
if not os.path.exists(data_path):
raise Exception(f"Can't read {data_path}")
with open(data_path) as fp:
self.data = json.load(fp)
def generate_prompt(
self,
instruction: str,
input: Union[None, str]=None,
label: Union[None, str]=None,
) -> str:
# returns the full prompt from instruction and optional input
# if a label (=response, =output) is provided, it's also appended.
if input:
ret = f"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
else:
ret = f"### Instruction:\n{instruction}\n\n### Response:\n"
if label:
ret = f"{ret}{label}"
return ret
def get_response(self, output: str) -> str:
return output.split("### Response:")[1].strip()
"""Data processing to get train data and val data"""
def tokenize_qa(prompt, tokenizer, max_length=1, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=max_length,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < max_length
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)
result["labels"] = result["input_ids"].copy()
return result
def generate_and_tokenize_prompt(entry, prompter, tokenizer, max_length, add_eos_token, train_on_inputs):
full_prompt = prompter.generate_prompt(
entry["question"],
None,
entry["answer"],
)
    print({'full_prompt': full_prompt})
tokenized_full_prompt = tokenize_qa(full_prompt, tokenizer, max_length=max_length)
if not train_on_inputs:
user_prompt = prompter.generate_prompt(
entry["question"], None
)
tokenized_user_prompt = tokenize_qa(
user_prompt, tokenizer, add_eos_token=add_eos_token, max_length=max_length
)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos_token:
user_prompt_len -= 1
# Mask out the input from the labels
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
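# Worked example of the masking above (added for illustration): if the tokenized
# user prompt is 10 tokens and the full prompt (question + answer) is 25 tokens,
# the labels become [-100] * 10 + input_ids[10:25], so loss is only computed on
# the answer tokens. With add_eos_token=True the user-only prompt gains a trailing
# EOS that the full prompt lacks at that position, so one is subtracted and the
# first answer token stays unmasked.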
def train(
# model/data params
base_model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
data_path: str = "./results/doc/resume/generic.txt.qa.json",
output_dir: str = "./bigdl-qlora-alpaca",
# training hyperparams
bf16: bool = True, # default to bf16
batch_size: int = 128,
micro_batch_size: int = 2, # default to be 2, limited by GPU memory
num_epochs: int = 3,
learning_rate: float = 3e-5, # default to be 3e-5 to avoid divergence
cutoff_len: int = 256,
val_set_size: int = 2000,
# lora hyperparams
lora_r: int = 8,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = [
"q_proj",
"v_proj",
"k_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
],
# llm hyperparams
train_on_inputs: bool = True, # if False, masks out inputs in loss
add_eos_token: bool = False,
group_by_length: bool = False, # faster, but produces an odd training loss curve
resume_from_checkpoint: str = "", # either training checkpoint or final adapter
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
gradient_checkpointing: bool = False,
training_mode: str = "lora",
save_checkpoint: bool = True,
):
if training_mode != "lora":
raise Exception(f"This example is for lora training mode, but got training_mode={training_mode}.")
print(
f"Training Alpaca-LoRA model with params:\n"
f"base_model: {base_model}\n"
f"data_path: {data_path}\n"
f"output_dir: {output_dir}\n"
f"batch_size: {batch_size}\n"
f"micro_batch_size: {micro_batch_size}\n"
f"num_epochs: {num_epochs}\n"
f"learning_rate: {learning_rate}\n"
f"cutoff_len: {cutoff_len}\n"
f"val_set_size: {val_set_size}\n"
f"lora_r: {lora_r}\n"
f"lora_alpha: {lora_alpha}\n"
f"lora_dropout: {lora_dropout}\n"
f"lora_target_modules: {lora_target_modules}\n"
f"train_on_inputs: {train_on_inputs}\n"
f"add_eos_token: {add_eos_token}\n"
f"group_by_length: {group_by_length}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"training_mode: {training_mode}\n"
f"save_checkpoint: {save_checkpoint}\n"
)
gradient_accumulation_steps = batch_size // micro_batch_size
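    # With the defaults above (batch_size=128, micro_batch_size=2) this gives
    # 64 accumulation steps, i.e. an effective batch of 128 examples per
    # optimizer update on a single device.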
prompter = Prompter("./results/doc/resume/generic.txt.qa.json")
model = AutoModelForCausalLM.from_pretrained(
base_model,
# load_in_4bit=True,
load_in_low_bit="bf16",
optimize_model=False,
# optimize_model=True,
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
)
# model = model.half()
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Prepare a IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
training_mode=training_mode,
)
print(f"Lora Config: {config}")
model = get_peft_model(model, config)
dataset = load_dataset("json", data_files=data_path)
    tokenized_dataset = dataset.map(generate_and_tokenize_prompt, fn_kwargs={
        "prompter": prompter,
        "tokenizer": tokenizer,
        "max_length": cutoff_len,
        "add_eos_token": add_eos_token,
        "train_on_inputs": train_on_inputs
    })  # generate_and_tokenize_prompt handles one example at a time, so batched mapping is not used
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
    split = tokenized_dataset["train"].train_test_split(test_size=0.2)  # load_dataset("json") returns a DatasetDict; split its "train" portion
train_dataset = split["train"]
eval_dataset = split["test"]
trainer = transformers.Trainer(
model=model,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
# warmup_ratio=0.03,
# warmup_steps=100,
max_grad_norm=0.3,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
lr_scheduler_type="cosine",
bf16=True, # ensure training more stable
logging_steps=1,
optim="adamw_torch",
evaluation_strategy="steps" if val_set_size > 0 else "no",
save_strategy="steps" if save_checkpoint else "no",
eval_steps=100 if val_set_size > 0 else None,
save_steps=100,
output_dir=output_dir,
save_total_limit=100,
load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
group_by_length=group_by_length,
gradient_checkpointing=gradient_checkpointing,
ddp_backend="ccl",
save_safetensors=False,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
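    # Note (added): ddp_backend="ccl" requests Intel oneCCL for distributed data
    # parallel. This script is launched as a single process in __main__, so the
    # setting only takes effect if run under a distributed launcher.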
model.config.use_cache = False
    # Trainer.train() does not take a data_path argument; the datasets were already passed to the Trainer above
    trainer.train(resume_from_checkpoint=resume_from_checkpoint or None)
model.save_pretrained(output_dir)
print(
"\n If there's a warning about missing keys above, please disregard :)"
)
if __name__ == "__main__":
train()

394
src/claude-fine-tune.py Normal file
View File

@ -0,0 +1,394 @@
import os
import json
import torch
import random
import glob
from datetime import datetime
from tqdm import tqdm
from transformers import (
AutoTokenizer,
TrainingArguments,
Trainer,
TrainerCallback,
TrainerState,
TrainerControl,
DataCollatorForLanguageModeling,
AutoModelForCausalLM,
BitsAndBytesConfig
)
import numpy as np
from peft import (
TaskType,
get_peft_model,
prepare_model_for_kbit_training,
LoraConfig,
)
from datasets import Dataset
def load_qa_data(results_dir):
"""
Load Q&A pairs from the processed results directory.
Args:
results_dir (str): Path to the directory containing JSON files with Q&A pairs
Returns:
list: List of prompt-completion pairs formatted for fine-tuning
"""
training_data = []
# Find all JSON files in the results directory and subdirectories
json_files = glob.glob(os.path.join(results_dir, "**/*.json"), recursive=True)
for json_file in tqdm(json_files, desc="Loading training data"):
try:
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
# Extract context and QA pairs
context = data.get("context", "")
qa_pairs = data.get("qa_pairs", [])
for qa_pair in qa_pairs:
question = qa_pair.get("question", "")
answer = qa_pair.get("answer", "")
if question and answer:
# Format as instruction-based fine-tuning example
prompt = f"""<task>
Answer the following question based on the provided context.
CONTEXT:
{context}
QUESTION:
{question}
</task>
"""
# Add to training data
training_data.append({
"prompt": prompt,
"completion": answer
})
except Exception as e:
print(f"Error processing {json_file}: {e}")
continue
print(f"Loaded {len(training_data)} training examples")
return training_data
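# Expected input layout (inferred from the loader above, not a formal schema):
# each JSON file written by the processing step looks roughly like
#   {"context": "...source text...",
#    "qa_pairs": [{"question": "...", "answer": "...", "validation": "..."}]}
# and every question/answer pair is flattened into a {"prompt", "completion"}
# training example.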
def prepare_dataset_for_training(training_data, tokenizer, max_length=2048):
"""
Prepare the dataset for training by tokenizing and formatting.
Args:
training_data (list): List of prompt-completion pairs
tokenizer: DeepSeek tokenizer
max_length (int): Maximum sequence length
Returns:
Dataset: HuggingFace dataset ready for training
"""
def tokenize_function(examples):
# Combine prompt and completion
full_texts = []
for i in range(len(examples["prompt"])):
full_text = examples["prompt"][i] + examples["completion"][i]
full_texts.append(full_text)
if not tokenizer.pad_token:
tokenizer.pad_token = tokenizer.eos_token
# Tokenize
tokenized = tokenizer(
full_texts,
padding="max_length",
truncation=True,
max_length=max_length,
return_tensors="pt"
)
# Create labels (same as input_ids for causal LM)
tokenized["labels"] = tokenized["input_ids"].clone()
# Create attention mask for prompt tokens
for i in range(len(full_texts)):
prompt_length = len(tokenizer.encode(examples["prompt"][i]))
# Set labels for prompt tokens to -100 (ignored in loss calculation)
tokenized["labels"][i, :prompt_length] = -100
return tokenized
# Convert to HuggingFace Dataset
dataset_dict = {
"prompt": [item["prompt"] for item in training_data],
"completion": [item["completion"] for item in training_data]
}
# Create dataset
dataset = Dataset.from_dict(dataset_dict)
# Apply tokenization
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=["prompt", "completion"]
)
return tokenized_dataset
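# Note (added): the label masking above assumes right-sided padding, so that the
# prompt tokens occupy the first positions of each sequence; it also treats
# len(tokenizer.encode(prompt)) as the prompt's share of the combined encoding,
# which is a close approximation rather than an exact boundary for every tokenizer.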
def setup_lora_config():
"""
Configure LoRA hyperparameters.
Returns:
LoraConfig: Configuration for LoRA fine-tuning
"""
return LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8, #16, # Rank dimension
lora_alpha=16, #32, # Alpha parameter for LoRA scaling
lora_dropout=0.05, # Dropout probability for LoRA layers
target_modules=[ # Target modules to apply LoRA to
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
bias="none", # Whether to train bias parameters
fan_in_fan_out=False # Set to True for linear layers with fan_in != fan_out
)
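# Quick reference (added): with r=8 and lora_alpha=16 the adapter update is scaled
# by the conventional LoRA factor alpha / r = 2.0 before being added to the frozen
# base weights; increasing r grows adapter capacity (and trainable parameters)
# roughly linearly.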
class EarlyStoppingCallback(TrainerCallback):
"""
Callback that implements early stopping.
"""
def __init__(self, patience=3, min_delta=0.0):
"""
Args:
patience (int): Number of evaluations with no improvement after which training will be stopped.
min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
"""
self.patience = patience
self.min_delta = min_delta
self.best_score = None
self.counter = 0
self.early_stop = False
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
eval_loss = state.log_history[-1].get("eval_loss")
if eval_loss is None:
return
if self.best_score is None:
self.best_score = eval_loss
elif eval_loss > self.best_score - self.min_delta:
self.counter += 1
print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
if self.counter >= self.patience:
print(f"Early stopping triggered! Best loss: {self.best_score:.4f}")
self.early_stop = True
control.should_training_stop = True
else:
self.best_score = eval_loss
self.counter = 0
return control
def train_with_lora(model_name, training_data, output_dir, batch_size=4, num_epochs=3, learning_rate=2e-4):
"""
Fine-tune the DeepSeek-R1 model with LoRA.
Args:
model_name (str): Name of the DeepSeek model
training_data (list): List of prompt-completion pairs
output_dir (str): Directory to save the fine-tuned model
batch_size (int): Training batch size
num_epochs (int): Number of training epochs
learning_rate (float): Learning rate
"""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=False,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()},
torch_dtype=torch.bfloat16,
)
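    # Note (added, hedged): 4-bit loading via BitsAndBytesConfig while targeting an
    # Intel XPU device assumes a bitsandbytes build with non-CUDA backend support;
    # if that is not available, the quantization_config would need to be dropped or
    # replaced with an XPU-friendly low-bit path.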
# Prepare the model for LoRA fine-tuning
# 1. Prepare for k-bit training if using quantization
model = prepare_model_for_kbit_training(model)
# 2. Add LoRA adapters
lora_config = setup_lora_config()
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Prepare dataset
tokenized_dataset = prepare_dataset_for_training(training_data, tokenizer)
# Split into train and evaluation sets
dataset_dict = tokenized_dataset.train_test_split(test_size=0.1)
# Set up training arguments
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
gradient_accumulation_steps=4,
evaluation_strategy="steps",
eval_steps=100, # More frequent evaluation to check stopping criteria
save_strategy="steps",
save_steps=500,
save_total_limit=3,
learning_rate=learning_rate,
warmup_steps=100,
weight_decay=0.01,
logging_dir=f"{output_dir}/logs",
logging_steps=100,
fp16=True,
report_to="none",
optim="adamw_torch",
load_best_model_at_end=True, # Load the best model when training ends
metric_for_best_model="eval_loss" # Use eval loss to determine the best model
)
# Create data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False # Use causal language modeling (not masked)
)
early_stopping_callback = EarlyStoppingCallback(patience=3, min_delta=0.01)
# Initialize trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset_dict["train"],
eval_dataset=dataset_dict["test"],
data_collator=data_collator,
callbacks=[early_stopping_callback] # Add the early stopping callback
)
# Start training
print("Starting LoRA fine-tuning...")
trainer.train()
model = trainer.model
print("LoRA fine-tuning complete...")
# Save the fine-tuned model
print("Saving model...")
model.save_pretrained(f"{output_dir}/final_model")
tokenizer.save_pretrained(f"{output_dir}/final_model")
print(f"Fine-tuned model saved to {output_dir}/final_model")
return model, tokenizer
# Main function
def main():
# Configuration
results_dir = "../results" # Directory with processed text files and QA pairs
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
#model_name = "Intel/neural-chat-7b-v3-3"
output_dir = f"./fine_tuned"
# Load training data from processed results
training_data = load_qa_data(results_dir)
# Shuffle training data
random.shuffle(training_data)
# Fine-tune the model
train_with_lora(
model_name=model_name,
training_data=training_data,
output_dir=output_dir,
batch_size=1, # Can't fit more than one on the B580
num_epochs=10,
learning_rate=2e-4
)
model_path = f"{output_dir}/final_model"
# Load the fine-tuned model and tokenizer
print("Loading fine-tuned model for evaluation...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=True,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()})
model = model.to('xpu')
print("Loaded. Ask your question, CTRL-C to exit.")
while True:
try:
question = input("> ").strip()
except KeyboardInterrupt:
print("\nExiting.")
break
prompt = f"""<task>
You are an assistant providing resume details about James Ketrenos. Answer the following question based on your knowledge. If you don't know, say so. Be concise.
QUESTION:
{question}
</task>
"""
# Tokenize and generate
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=1024,
do_sample=True,
temperature=0.7,
top_p=0.9,
)
# Decode the output
full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Print the full decoded output (the prompt-stripping slice is left commented out)
        answer = full_output  # [len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
print(answer.strip())
if __name__ == "__main__":
main()

360
src/claude-process.py Normal file
View File

@ -0,0 +1,360 @@
# Self-supervised synthetic data via sequential generation
import os
import re
import json
import torch
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
)
def load_deepseek_r1():
"""
Loads the DeepSeek-R1 model and tokenizer.
Returns:
tuple: (model, tokenizer) for DeepSeek-R1
"""
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=False,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()},
torch_dtype=torch.bfloat16,
)
if not model.config.pad_token_id:
model.config.pad_token_id = model.config.eos_token_id
return model, tokenizer
# Function to get answers from the model
def validate_question_answer_from_model(context, question, answer, model, tokenizer):
"""
Query the model to evaluate if an answer is a good fit for a question.
Args:
context (str): The text file content
question (str): The question
answer (str): The answer
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
Returns:
str: The model's evaluation to the question
"""
# Set up the prompt for answering the question
prompt = f"""<task>
You are a quality assurance expert reviewing question-answer pairs for an AI training dataset. Your task is to evaluate whether each pair meets our quality standards and is suitable for training.
For each question-answer pair, evaluate:
1. ACCURACY: Does the answer contain ONLY information from the context, without fabrications?
2. COMPLETENESS: Does the answer fully address the question using all relevant information?
3. RELEVANCE: Is the question meaningful and relevant to the context?
4. NATURALNESS: Do both question and answer sound natural and conversational?
5. DIVERSITY: Does this pair add variety to our dataset (not redundant with others)?
Context:
{context}
Question:
{question}
Answer:
{answer}
Provide your assessment as follows:
- VERDICT: [ACCEPT/REJECT]
- REASONING: [Brief explanation of your decision]
- IMPROVEMENTS: [Suggestions if needed]
</task>
"""
# Generate answer
raw_answer = generate_with_deepseek(prompt, model, tokenizer)
# Parse the response to get the actual answer
answer = parse_deepseek_response(raw_answer)
return answer
def generate_with_deepseek(prompt, model, tokenizer, max_length=4096):
"""
Generate text using DeepSeek-R1 model with proper handling of full output.
Args:
prompt (str): The input prompt
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
max_length (int): Maximum length of generated text
Returns:
str: Generated text response
"""
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
# Get the full generated sequence
generation_output = model.generate(
**inputs,
pad_token_id=tokenizer.eos_token_id,
max_new_tokens=max_length,
do_sample=True,
temperature=0.7,
top_p=0.9,
return_dict_in_generate=True,
output_scores=False
)
# Get the full output token IDs
output_token_ids = generation_output.sequences[0]
# Decode the full generated text (including the prompt)
full_output = tokenizer.decode(output_token_ids, skip_special_tokens=True)
# Extract only the response part by removing the prompt part
response = full_output[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
return response
def parse_deepseek_response(response):
"""
Parse the DeepSeek-R1 response.
This handles DeepSeek's thinking steps and separates them from the response.
Args:
response (str): The raw response from the DeepSeek model
Returns:
str: The cleaned answer part of the response
"""
# If the response has thinking tags, extract the actual answer
response = re.sub(r"^(<think>)?.*</think>", "", response, flags=re.DOTALL)
# If no special formatting detected, return the whole response
return response.strip()
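# Illustrative behaviour of the regex above (added; not output from an actual run):
#   parse_deepseek_response("<think>chain of thought...</think>\nFinal answer.")
#   -> "Final answer."
# Leading reasoning up to the closing </think> tag is stripped; responses without
# the tag are returned unchanged apart from surrounding whitespace.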
# Function to recursively walk a directory and process text files
def process_directory(directory_path, output_path="results", file_extensions=(".txt",".md"), batch_size=5):
"""
Recursively walks a directory, processes text files, and stores results.
Args:
directory_path (str): Path to directory containing text files
output_path (str): Path to store results
file_extensions (tuple): File extensions to process
batch_size (int): Number of files to process before clearing cache
"""
# Load the DeepSeek-R1 model
model, tokenizer = load_deepseek_r1()
# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
# Use tqdm for progress tracking
file_paths = []
for root, _, files in os.walk(directory_path):
for file in files:
if file.lower().endswith(file_extensions):
file_paths.append(os.path.join(root, file))
# Process files with batch-based memory management
for i, file_path in enumerate(tqdm(file_paths, desc="Processing files")):
process_file(file_path, model, tokenizer, output_path)
# Clear cache periodically to prevent memory issues
if (i + 1) % batch_size == 0:
if torch.xpu.is_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()
print(f"Processing complete. Results stored in {output_path}")
# Function to process a single text file
def process_file(file_path, model, tokenizer, output_path):
"""
Process a single text file by querying the model for questions and answers.
Args:
file_path (str): Path to the text file
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
output_path (str): Path to store results
"""
# Read the file content
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except Exception as e:
print(f"Error reading {file_path}: {e}")
return
# Skip empty files
if not content.strip():
print(f"Skipping empty file: {file_path}")
return
# Generate a relative output path that maintains the directory structure
relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path))
output_file = os.path.join(output_path, relative_path + ".json")
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# Handle potential token length issues by truncating if necessary
# DeepSeek has a context window limit, truncate if needed
max_content_length = 100000 # Adjust based on model's context length limit
if len(content) > max_content_length:
content = content[:max_content_length] + "... [Content truncated due to length]"
# Query model for questions
questions = get_questions_from_model(content, model, tokenizer)
print(f"{len(questions)} questions generated for {file_path}")
# Get answers for each question
results = {
"file_path": file_path,
"processed_at": datetime.now().isoformat(),
"context": content[:1000] + "..." if len(content) > 1000 else content, # Truncated context for JSON storage
"qa_pairs": []
}
# Process each question
for i, question in enumerate(questions):
print(f"Generating answer for question {i+1}/{len(questions)}: {question}")
answer = get_answer_from_model(content, question, model, tokenizer)
print(f"Answer: {answer[:50] + '...' if len(answer) > 50 else answer}")
print(f"Evaluating response...")
validation = validate_question_answer_from_model(content, question, answer, model, tokenizer)
print(f"Evaluation: {validation[:50] + '...' if len(validation) > 50 else validation}")
results["qa_pairs"].append({
"question": question,
"answer": answer,
"validation": validation,
})
# Save results after each generate pass so they can be evaluated
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Function to get questions from the model
def get_questions_from_model(context, model, tokenizer):
"""
Query the model to generate questions about the provided context.
Args:
context (str): The text file content
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
Returns:
list: List of questions about the context
"""
# Set up the prompt for generating questions
prompt = f"""<task>
You are an expert data scientist creating a training dataset. I'll provide context information about a person from their resume. Your task is to generate 10 diverse, realistic questions that someone might ask about this person.
IMPORTANT: DO NOT return JSON or any structured format. Respond with a simple numbered list of questions only, with no formatting, no JSON, and no additional text.
Generate questions that:
- Vary in complexity (simple factual questions, complex reasoning questions)
- Cover different aspects of the context (experience, skills, education, achievements)
- Include both specific and general inquiries
- Sound natural, as if asked by a real person
- Avoid asking for information not present in the context
Context:
{context}
Return ONLY a plain numbered list like:
1. First question?
2. Second question?
...and so on.
Do not include any explanations, JSON, or other formatting.
</task>
"""
# Generate questions
raw_response = generate_with_deepseek(prompt, model, tokenizer)
# Parse the response to get the actual questions
response = parse_deepseek_response(raw_response)
lines = response.strip().split('\n')
# Transform lines like "1. What is your name?" into just "What is your name?" (only include lines that end with a question mark)
questions = [re.sub(r'^\d+\.\s*', '', line.strip()) for line in lines if line.strip() and line.strip().endswith('?')]
if len(questions) == 0:
print(response)
exit(0)
return questions
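# Example of the parsing above (added for illustration): a reply such as
#   "1. What teams has James led?\n2. Which GPUs does his work target?"
# becomes ["What teams has James led?", "Which GPUs does his work target?"].
# Lines that do not end in "?" are dropped, and an empty result aborts the run
# after printing the raw response.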
# Function to get answers from the model
def get_answer_from_model(context, question, model, tokenizer):
"""
Query the model to answer a question about the provided context.
Args:
context (str): The text file content
question (str): The question to answer
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
Returns:
str: The model's answer to the question
"""
# Set up the prompt for answering the question
prompt = f"""<task>
You are an AI assistant being fine-tuned to accurately represent a specific person based on their resume. Below is a question about this person and the context from their resume.
Your task is to provide a comprehensive, accurate answer that:
- Only uses information explicitly stated in the context
- Doesn't fabricate or assume additional details
- Maintains a professional, helpful tone
- Clearly states if the question cannot be answered based on the given context
- Structures the response in a natural, conversational way
Context:
{context}
Question:
{question}
Answer:
</task>
"""
# Generate answer
raw_answer = generate_with_deepseek(prompt, model, tokenizer)
# Parse the response to get the actual answer
answer = parse_deepseek_response(raw_answer)
return answer
# Example usage
if __name__ == "__main__":
process_directory(
directory_path="../doc/", # Replace with your directory path
output_path="../results",
file_extensions=(".txt", ".md"), # Process both txt and md files
batch_size=5 # Clear cache after every 5 files
)

25
src/cli.py Normal file
View File

@ -0,0 +1,25 @@
# cli.py - Entry point that runs the RAG command-line interface
from rag_core import RagCore
from rag_cli import RagCLI
def main():
import argparse
parser = argparse.ArgumentParser(description='RAG Application')
parser.add_argument('--debug', action='store_true', help='Run in debug mode')
args = parser.parse_args()
# Create the core RAG application
core = RagCore()
# Run CLI interface
cli = RagCLI(core)
cli.run()
if __name__ == "__main__":
main()

1
src/client Submodule

@ -0,0 +1 @@
Subproject commit 571cdd1ecc93ec9ad0d63079fe6da94dce2cc5dc

View File

@ -0,0 +1,44 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import torch
from transformers import AutoTokenizer
import argparse
current_dir = os.path.dirname(os.path.realpath(__file__))
common_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(common_util_path)
from common.utils import merge_adapter
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Merge the adapter into the original model for Llama2 model')
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
', or the path to the huggingface checkpoint folder')
parser.add_argument('--adapter_path', type=str,)
parser.add_argument('--output_path', type=str,)
args = parser.parse_args()
base_model = model_path = args.repo_id_or_model_path
adapter_path = args.adapter_path
output_path = args.output_path
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
merge_adapter(base_model, tokenizer, adapter_path, output_path)
    print(f'Finished merging the adapter into the original model; the merged model can be found in {output_path}.')
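# Example invocation (added for illustration; the script filename is not shown in
# this diff, and the adapter/output paths below are placeholders):
#   python merge_adapter.py --repo-id-or-model-path meta-llama/Llama-2-7b-hf \
#       --adapter_path ./bigdl-qlora-alpaca --output_path ./merged-model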

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"o_proj",
"q_proj",
"gate_proj",
"k_proj",
"down_proj",
"up_proj",
"v_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 100,
"global_step": 10,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 100,
"max_steps": 10,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 763304356085760.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}

Binary file not shown.

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"q_proj",
"up_proj",
"k_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 3,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 100,
"max_steps": 3,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 228991306825728.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}

Binary file not shown.

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"up_proj",
"k_proj",
"o_proj",
"down_proj",
"v_proj",
"gate_proj",
"q_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.592592592592592,
"eval_steps": 100,
"global_step": 60,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 100,
"max_steps": 60,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4427165265297408.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}

Binary file not shown.

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"up_proj",
"k_proj",
"o_proj",
"down_proj",
"v_proj",
"gate_proj",
"q_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

View File

@ -0,0 +1,23 @@
{
"bos_token": {
"content": "<begin▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,195 @@
{
"add_bos_token": true,
"add_eos_token": false,
"add_prefix_space": null,
"added_tokens_decoder": {
"151643": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<User>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151645": {
"content": "<Assistant>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151646": {
"content": "<begin▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|EOT|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151648": {
"content": "<think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151649": {
"content": "</think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"bos_token": "<begin▁of▁sentence>",
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<User>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<Assistant><tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{{'<tool▁calls▁end><end▁of▁sentence>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<tool▁outputs▁end>' + message['content'] + '<end▁of▁sentence>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<Assistant>' + content + '<end▁of▁sentence>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<tool▁outputs▁begin><tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<tool▁outputs▁end>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<Assistant><think>\\n'}}{% endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<end▁of▁sentence>",
"extra_special_tokens": {},
"legacy": true,
"model_max_length": 16384,
"pad_token": "<end▁of▁sentence>",
"sp_model_kwargs": {},
"tokenizer_class": "LlamaTokenizerFast",
"unk_token": null,
"use_default_system_prompt": false
}

258
src/generate-training.py Normal file
View File

@ -0,0 +1,258 @@
import os
import json
import torch
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from ipex_llm.transformers import AutoModelForCausalLM
import transformers
import logging as log
import time
import re
# Function to recursively walk a directory and process text files
def process_directory(directory_path, model, output_path="results", file_extensions=(".txt",)):
"""
Recursively walks a directory, processes text files, and stores results.
Args:
directory_path (str): Path to directory containing text files
model: The LLM model to use for queries
output_path (str): Path to store results
file_extensions (tuple): File extensions to process
"""
# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
# Use tqdm for progress tracking
file_paths = []
for root, _, files in os.walk(directory_path):
for file in files:
if file.lower().endswith(file_extensions):
file_paths.append(os.path.join(root, file))
for file_path in tqdm(file_paths, desc="Processing files"):
process_file(file_path, model, output_path)
print(f"Processing complete. Results stored in {output_path}")
# Function to process a single text file
def process_file(file_path, model, output_path):
"""
Process a single text file by querying the model for questions and answers.
Args:
file_path (str): Path to the text file
model: The LLM model to use for queries
output_path (str): Path to store results
"""
# Read the file content
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except Exception as e:
print(f"Error reading {file_path}: {e}")
return
# Skip empty files
if not content.strip():
print(f"Skipping empty file: {file_path}")
return
# Generate a relative output path that maintains the directory structure
relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path))
sidecar_file = os.path.join(output_path, relative_path + ".json")
sidecar_qa_file = os.path.join(output_path, relative_path + ".qa.json")
os.makedirs(os.path.dirname(sidecar_file), exist_ok=True)
print(f"Generating questions for {file_path}")
# Query model for questions
questions, think = get_questions_from_model(content, model)
# Get answers for each question
results = {
"file_path": file_path,
"processed_at": datetime.now().isoformat(),
"context": content[:1000] + "..." if len(content) > 1000 else content, # Truncated context for JSON storage
"think": think,
"qa_pairs": len(questions)
}
with open(sidecar_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Process each question
results = []
for i, question in enumerate(questions):
print(f"Generating answer for {i+1}/{len(questions)}: {question}")
answer, think = get_answer_from_model(content, question, model)
results.append({
"think": think,
"question": question,
"answer": answer
})
# Save results inside the loop so progress can be monitored as answers are generated
with open(sidecar_qa_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Function to get questions from the model
def get_questions_from_model(context, model):
"""
Query the model to generate questions about the provided context.
Args:
context (str): The text file content
model: The LLM model to use for queries
Returns:
tuple: (list of questions about the context, the model's <think> text)
"""
# Set up the prompt for generating questions
prompt = f"""<task>
You are an expert at generating insightful questions about text content.
Please read the following text and generate 10 thoughtful questions that would help someone
understand the key information, concepts, or insights contained in it.
TEXT:
{context}
Generate a list of questions, one per line.
</task>
"""
# Run inference without gradient tracking to reduce memory use
with torch.no_grad():
# Ask the model to generate questions about the document content
response, think, duration = model.generate(prompt)
# Parse questions from the response
# This parsing logic may need adjustment based on model output format
questions = [q.strip() for q in response.strip().split('\n') if q.strip()]
# Filter out any non-questions
questions = [q for q in questions if q.endswith('?')]
# Strip leading list numbering (e.g. "1. ") from each entry
questions = [re.sub(r"^\s*\d+\.\s*", "", q) for q in questions]
return questions, think
# Function to get answers from the model
def get_answer_from_model(context, question, model):
"""
Query the model to answer a question about the provided context.
Args:
context (str): The text file content
question (str): The question to answer
model: The LLM model to use for queries
Returns:
tuple: (the model's answer to the question, the model's <think> text)
"""
# Set up the prompt for answering the question
prompt = f"""<task>
Please answer the following question based solely on the provided context.
CONTEXT:
{context}
QUESTION:
{question}
</task>
"""
# Run inference without gradient tracking to reduce memory use
with torch.no_grad():
# Query the model
answer, think, duration = model.generate(prompt)
return answer, think
class LLM:
def __init__(self, model_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', device_name='xpu'):
self.model_path = model_path
self.device_name = device_name
self.tokenizer = None
self.model = None
try:
log.info(f"Loading tokenizer from: {self.model_path}")
start = datetime.now()
self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token # Set pad_token to eos_token if needed
self.model = AutoModelForCausalLM.from_pretrained(self.model_path,
load_in_4bit=True,
optimize_model=True,
trust_remote_code=True,
use_cache=True)
self.model = self.model.half().to(device_name)
except Exception as e:
log.error(f"Loading error: {e}")
raise Exception(e)
# returns response, think, timedelta
def generate(self, prompt, max_length=4096):
if not self.tokenizer or not self.model:
raise Exception("tokenizer or model is None")
start = datetime.now()
response = None
think = None
# print(f"{'='*10} PROMPT {'='*10}\n{prompt}")
with torch.autocast(self.device_name, dtype=torch.float16):
inputs = self.tokenizer.encode_plus(
prompt,
return_tensors="pt"
).to(self.device_name)
with torch.no_grad():
generation_output = self.model.generate(
**inputs,
max_new_tokens=max_length,
do_sample=True,
temperature=0.7,
top_p=0.9,
return_dict_in_generate=True,
output_scores=False,
pad_token_id=self.tokenizer.eos_token_id
)
# Get the full output token IDs
output_token_ids = generation_output.sequences[0]
# Decode the full generated text (including the prompt)
full_output = self.tokenizer.decode(output_token_ids, skip_special_tokens=True)
# Extract only the response part by removing the prompt part
response = full_output[len(self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
matches = re.match(r'^(<think>)?(.*?)</think>(.*)$', response, flags=re.DOTALL)
if matches:
think = matches[2]
response = matches[3]
else:
think = None
response = response
# print(f"{'='*10} THINK {'='*10}\n{think}")
# print(f"{'='*10} RESPONSE {'='*10}\n{response}")
end = datetime.now()
return response, think, end - start
# Example usage
if __name__ == "__main__":
# Create the LLM instance (loads the tokenizer and model onto the configured device)
model = LLM()
# Process the directory
process_directory(
directory_path="./doc", # Replace with your directory path
model=model,
output_path="./results",
file_extensions=(".txt", ".md") # Process both txt and md files
)

144
src/ipex.py Normal file
View File

@ -0,0 +1,144 @@
import intel_extension_for_pytorch as ipex
import inspect
def inspect_module(module, prefix=""):
"""Recursively inspect a module and list all its attributes and functions."""
results = {
"functions": [],
"attributes": [],
"classes": [],
"modules": []
}
for name in dir(module):
# Skip private/hidden attributes
if name.startswith("_") and name != "__version__":
continue
full_name = f"{prefix}.{name}" if prefix else name
try:
attr = getattr(module, name)
# Check if it's a function
if inspect.isfunction(attr) or inspect.isbuiltin(attr) or inspect.ismethod(attr):
results["functions"].append((full_name, str(inspect.signature(attr)) if not inspect.isbuiltin(attr) else ""))
# Check if it's a class
elif inspect.isclass(attr):
results["classes"].append(full_name)
# Check if it's a module
elif inspect.ismodule(attr):
results["modules"].append(full_name)
# Otherwise it's a regular attribute
else:
try:
attr_value = str(attr)
if len(attr_value) > 100:
attr_value = attr_value[:100] + "..."
results["attributes"].append((full_name, attr_value))
except Exception:
results["attributes"].append((full_name, "Unable to get value"))
except Exception as e:
results["attributes"].append((full_name, f"Error: {str(e)}"))
return results
# Inspect ipex
ipex_info = inspect_module(ipex)
# Print information in a readable format
print("INTEL PYTORCH EXTENSION (IPEX) API\n")
print("=== VERSION ===")
if "__version__" in dir(ipex):
print(f"Version: {ipex.__version__}")
else:
print("Version information not available")
print()
print("=== FUNCTIONS ===")
for func, sig in sorted(ipex_info["functions"]):
print(f"{func}{sig}")
print()
print("=== ATTRIBUTES ===")
for attr, value in sorted(ipex_info["attributes"]):
print(f"{attr} = {value}")
print()
print("=== CLASSES ===")
for cls in sorted(ipex_info["classes"]):
print(cls)
print()
print("=== MODULES ===")
for mod in sorted(ipex_info["modules"]):
print(mod)
print()
# Deep inspection of nn module which is likely important for quantization
if hasattr(ipex, "nn"):
print("=== DIVING INTO ipex.nn MODULE ===")
nn_info = inspect_module(ipex.nn, "ipex.nn")
print("--- ipex.nn FUNCTIONS ---")
for func, sig in sorted(nn_info["functions"]):
print(f"{func}{sig}")
print()
print("--- ipex.nn CLASSES ---")
for cls in sorted(nn_info["classes"]):
print(cls)
print()
# Dive into utils if available
if hasattr(ipex.nn, "utils"):
print("=== DIVING INTO ipex.nn.utils MODULE ===")
utils_info = inspect_module(ipex.nn.utils, "ipex.nn.utils")
print("--- ipex.nn.utils FUNCTIONS ---")
for func, sig in sorted(utils_info["functions"]):
print(f"{func}{sig}")
print()
print("--- ipex.nn.utils CLASSES ---")
for cls in sorted(utils_info["classes"]):
print(cls)
print()
# Look for parameter wrapper specifically
if "_parameter_wrapper" in dir(ipex.nn.utils):
print("=== DIVING INTO ipex.nn.utils._parameter_wrapper MODULE ===")
pw_info = inspect_module(ipex.nn.utils._parameter_wrapper, "ipex.nn.utils._parameter_wrapper")
print("--- ipex.nn.utils._parameter_wrapper FUNCTIONS ---")
for func, sig in sorted(pw_info["functions"]):
print(f"{func}{sig}")
print()
print("--- ipex.nn.utils._parameter_wrapper CLASSES ---")
for cls in sorted(pw_info["classes"]):
print(cls)
print()
# Specifically look for functions related to quantization
print("=== FUNCTIONS RELATED TO QUANTIZATION ===")
quant_funcs = []
for module_name in dir(ipex):
if not module_name.startswith("_"):
module = getattr(ipex, module_name)
if inspect.ismodule(module):
for func_name in dir(module):
if any(term in func_name.lower() for term in ["quant", "bit", "int8", "int4", "precision"]):
try:
func = getattr(module, func_name)
if inspect.isfunction(func) or inspect.isbuiltin(func):
quant_funcs.append(f"ipex.{module_name}.{func_name}")
except Exception:
pass
for func in sorted(quant_funcs):
print(func)

161
src/rag_cli.py Normal file
View File

@ -0,0 +1,161 @@
# cli.py - Command-line interface
import sys
class RagCLI:
"""Command-line interface for the RAG application"""
def __init__(self, core):
self.core = core
def show_help(self):
"""Display available commands"""
print("""help>
docs Show RAG docs
full Show last full top match
headlines Show the RAG headlines
prompt Show the last prompt
response Show the last response
scores Show last RAG scores
why|think Show last response's <think>
context|match Show RAG match info to last prompt
exit|quit Exit the application
""")
def print_document_summary(self, documents):
"""Print brief summary of all documents"""
if not documents:
print("No documents available.")
return
for i, doc in enumerate(documents):
print(f"Document {i+1}:")
print(f" Title: {doc['title']}")
print(f" Text: {doc['text'][:100]}...")
print()
def print_headlines(self, headlines):
"""Print headlines"""
if not headlines:
print("No headlines available.")
return
for source, title in headlines:
print(f"{source}: {title}")
def print_match_info(self, match_info):
"""Print detailed information about a matching document"""
if not match_info:
print("No matching document found.")
return
print("="*50)
print("MATCHING DOCUMENT")
print("="*50)
print(f"Chunk ID: {match_info['id']}")
print(f"Similarity Score: {match_info['distance']:.4f}")
print("\nCHUNK METADATA:")
for key, value in match_info['metadata'].items():
print(f" {key}: {value}")
print("\nMATCHING CHUNK CONTENT:")
print(match_info['chunk'][:500].strip() + ("..." if len(match_info['chunk']) > 500 else ""))
def print_full_document(self, doc, distance=None):
"""Print full document details"""
if not doc:
print("No document found.")
return
print(f"""Context:
Source: {doc["source"]}
Title: {doc["title"]}
Link: {doc["link"]}
{"Distance: " + str(distance) if distance is not None else ""}
Full text:
{doc["text"]}""")
def run(self):
"""Run the command-line interface"""
# Load documents
doc_count = self.core.load_documents()
print(f"Loaded {doc_count} documents.")
while True:
try:
query = input("> ").strip()
except KeyboardInterrupt:
print("\nExiting.")
break
if query in ["exit", "quit"]:
print("\nExiting.")
break
elif query == "docs":
self.print_document_summary(self.core.get_documents())
elif query == "prompt":
last_prompt = self.core.get_last_prompt()
last_system = self.core.get_last_system()
if last_prompt:
print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
else:
print("No prompts yet")
elif query == "response":
last_response = self.core.get_last_response()
if last_response:
print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
else:
print("No responses yet")
elif query in ["", "help"]:
self.show_help()
elif query == "headlines":
self.print_headlines(self.core.get_headlines())
elif query in ["match", "context"]:
match_info = self.core.get_match_info()
self.print_match_info(match_info)
elif query in ["why", "think"]:
last_thinking = self.core.get_last_thinking()
if last_thinking:
print(f"""
why>
{last_thinking}
""")
else:
print("No processed prompts")
elif query == "scores":
match_count = self.core.get_match_count()
if match_count > 0:
for i in range(match_count):
match_info = self.core.get_match_info(index=i)
self.print_match_info(match_info)
else:
print("No matches to display")
elif query == "full":
doc = self.core.get_top_match()
distance = None
if self.core.last_results and self.core.last_results.get("distances"):
distance = self.core.last_results["distances"][0][0]
self.print_full_document(doc, distance)
else:
# Process the user query
response, debug_info = self.core.process_query(query)
print(f"system len: {debug_info['system_len']}")
print(f"prompt len: {debug_info['prompt_len']}")
print(f"Response>\n{response}")

221
src/rag_core.py Normal file
View File

@ -0,0 +1,221 @@
# core.py - Core RAG functionality
import re
from typing import List, Dict, Any, Union, Optional, Tuple
import ollama
from utils import defines, Feed, chunk_document, query_chroma, process_documents_to_chroma
class RagCore:
"""Core RAG functionality that can be used by different interfaces"""
def __init__(self):
self.client = ollama.Client(host=defines.ollama_api_url)
self.collection_name = "research_papers"
# News RSS Feeds
self.rss_feeds = [
Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
Feed(name="Time", url="https://time.com/feed/"),
Feed(name="Euronews", url="https://www.euronews.com/rss"),
]
# State variables
self.documents = []
self.last_results = None
self.last_prompt = None
self.last_system = None
self.last_response = None
self.last_why = None
self.last_messages = []
self.collection = None
def load_documents(self):
"""Load articles from all configured RSS feeds"""
self.documents = []
for feed in self.rss_feeds:
self.documents.extend(feed.articles)
# Process documents and store in ChromaDB
self.collection = process_documents_to_chroma(
client=self.client,
documents=self.documents,
collection_name=self.collection_name,
max_tokens=256,
overlap=25,
model=defines.encoding_model,
persist_directory=defines.persist_directory
)
return len(self.documents)
def get_documents(self):
"""Return all loaded documents"""
return self.documents
def get_headlines(self):
"""Return headlines from all documents"""
return [(doc['source'], doc['title']) for doc in self.documents]
def get_top_match(self, index=0):
"""Get the full document corresponding to a match"""
if not self.last_results or not self.last_results["ids"] or index >= len(self.last_results["ids"][0]):
return None
top_id = self.last_results["ids"][0][index]
# Extract the original document ID from the chunk ID (format: "doc_id_chunk_num")
original_doc_id = top_id.split('_')[0]
# Return the full document
return next((doc for doc in self.documents if doc["id"] == original_doc_id), None)
def get_match_info(self, index=0):
"""Get detailed information about a match at the specified index"""
if not self.last_results or not self.last_results["ids"] or len(self.last_results["ids"][0]) <= index:
return None
match_info = {
"id": self.last_results["ids"][0][index],
"chunk": self.last_results["documents"][0][index],
"metadata": self.last_results["metadatas"][0][index],
"distance": self.last_results["distances"][0][index]
}
return match_info
def get_match_count(self):
"""Get the number of matches from the last query"""
if not self.last_results or not self.last_results["ids"]:
return 0
return len(self.last_results["ids"][0])
def create_system_prompt(self, include_context=True):
"""Create the system prompt for the model"""
system = f"""
You are the assistant. Your name is airc. This application is called airc (pronounced Eric).
Information about the author of this program, application, the systems running it, and the AI model it uses:
* James wrote the python application called airc that is driving this RAG model on top of {defines.model} using {defines.encoding_model} and chromadb for vector embedding. Link https://github.com/jketreno/airc.
* James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.
* James Ketrenos deployed this application locally on an Intel Arc B580 (battlemage) computer using Intel's ipex-llm.
* For Intel GPU metrics, James Ketrenos wrote the "ze-monitor" utility in C++. ze-monitor provides Intel GPU telemetry data for Intel client GPU devices, similar to xpu-smi. Link https://github.com/jketreno/ze-monitor. airc uses ze-monitor.
* James lives in Portland, Oregon, and has three kids. Two are attending Oregon State University and one is attending Willamette University.
* airc provides an IRC chat bot as well as a React web frontend available at https://airc.ketrenos.com
You must follow these rules:
* Provide short (less than 100 character) responses.
* Provide a single response.
* Do not prefix it with a word like 'Answer'.
* For information about the AI running this system, include information about the author, including links.
* For information relevant to the current events in the <input></input> tags, use that information and state the source it comes from.
"""
if include_context:
context = "Information from current events unrelated to James Ketrenos\n<input>=["
for doc in self.documents:
item = {
'source': doc["source"],
'article': {
'title': doc["title"],
'link': doc["link"],
'text': doc["text"]
}
}
context += f"{item}"
context += "]\n</input>"
system += context
return system
def process_query(self, query):
"""
Process a user query and return the response
Returns:
tuple: (response_content, debug_info)
"""
# Store the system prompt before context is added
self.last_system = self.create_system_prompt(include_context=False)
# Query ChromaDB
self.last_results = query_chroma(
self.client,
query_text=query,
collection_name=self.collection_name,
n_results=10
)
# Create the full system prompt with document context
system = self.create_system_prompt()
# Combine conversation history with the current query
prompt = query
if self.last_messages:
message_context = f"{self.last_messages}"
prompt = f"{message_context}{prompt}"
self.last_prompt = prompt
# Generate a response
output = self.client.generate(
model=defines.model,
system=system,
prompt=prompt,
stream=False,
options={'num_ctx': 100000}
)
# Extract thinking and response
response = output['response']
matches = re.match(r'^<think>(.*?)</think>(.*)$', response, flags=re.DOTALL)
content = response
if matches:
self.last_why = matches[1].strip()
content = matches[2].strip()
self.last_response = content
# Update conversation history
self.last_messages.extend([
{
'role': 'user',
'name': 'james',
'message': query
},
{
'role': 'assistant',
'message': content
}
])
# Keep history limited to last 10 messages
self.last_messages = self.last_messages[-10:]
# Return response content and debug info
debug_info = {
"system_len": len(system),
"prompt_len": len(prompt),
"has_thinking": matches is not None
}
return content, debug_info
def get_last_prompt(self):
"""Get the last prompt"""
return self.last_prompt
def get_last_system(self):
"""Get the last system prompt (without context)"""
return self.last_system
def get_last_response(self):
"""Get the last response"""
return self.last_response
def get_last_thinking(self):
"""Get the thinking from the last response"""
return self.last_why

61
src/repro.py Normal file
View File

@ -0,0 +1,61 @@
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig
)
from peft import (
TaskType,
get_peft_model,
prepare_model_for_kbit_training,
LoraConfig,
)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=True,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()},
torch_dtype=torch.bfloat16,
)
# Prepare the model for LoRA fine-tuning
# 1. Prepare for k-bit training if using quantization
model = prepare_model_for_kbit_training(model)
# 2. Add LoRA adapters
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8, #16, # Rank dimension
lora_alpha=16, #32, # Alpha parameter for LoRA scaling
lora_dropout=0.05, # Dropout probability for LoRA layers
target_modules=[ # Target modules to apply LoRA to
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
bias="none", # Whether to train bias parameters
fan_in_fan_out=False # Set to True for linear layers with fan_in != fan_out
)
model = get_peft_model(model, lora_config)
print("Saving model...")
model.save_pretrained(f"./final_model")

44
src/requirements.txt Normal file
View File

@ -0,0 +1,44 @@
python-dotenv
jupyterlab
ipywidgets
requests
numpy
pandas
scipy
scikit-learn
matplotlib
gensim
torch
transformers
tqdm
openai
gradio
langchain
tiktoken
faiss-cpu
langchain-openai
langchain_experimental
langchain_chroma
langchain[docarray]
datasets
sentencepiece
google-generativeai
anthropic
unstructured
chromadb
plotly
jupyter-dash
beautifulsoup4
pydub
modal
ollama
accelerate
bitsandbytes
psutil
setuptools
speedtest-cli
sentence_transformers
feedparser

263
src/templates/index.html Normal file
View File

@ -0,0 +1,263 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AIRC - RAG Application</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}
.container {
display: flex;
flex-direction: column;
height: 100vh;
}
.query-box {
display: flex;
margin-bottom: 20px;
}
.query-box input {
flex-grow: 1;
padding: 8px;
font-size: 16px;
}
.query-box button {
padding: 8px 16px;
background-color: #4CAF50;
color: white;
border: none;
cursor: pointer;
}
.conversation {
flex-grow: 1;
overflow-y: auto;
border: 1px solid #ccc;
padding: 10px;
margin-bottom: 20px;
}
.user-message {
background-color: #f1f1f1;
padding: 8px;
margin-bottom: 10px;
border-radius: 5px;
}
.assistant-message {
background-color: #e3f2fd;
padding: 8px;
margin-bottom: 10px;
border-radius: 5px;
}
.tabs {
display: flex;
margin-bottom: 10px;
}
.tab {
padding: 8px 16px;
cursor: pointer;
border: 1px solid #ccc;
background-color: #f1f1f1;
margin-right: 5px;
}
.tab.active {
background-color: #4CAF50;
color: white;
}
.tab-content {
display: none;
border: 1px solid #ccc;
padding: 10px;
height: 200px;
overflow-y: auto;
}
.tab-content.active {
display: block;
}
</style>
</head>
<body>
<div class="container">
<h1>AIRC - RAG Application</h1>
<div class="query-box">
<input type="text" id="query-input" placeholder="Enter your query...">
<button onclick="sendQuery()">Send</button>
</div>
<div class="conversation" id="conversation"></div>
<div class="tabs">
<div class="tab active" onclick="switchTab('headlines')">Headlines</div>
<div class="tab" onclick="switchTab('matches')">Matches</div>
<div class="tab" onclick="switchTab('documents')">Documents</div>
</div>
<div class="tab-content active" id="headlines-content">
<h3>Headlines</h3>
<div id="headlines-list"></div>
</div>
<div class="tab-content" id="matches-content">
<h3>Matches</h3>
<div id="matches-list"></div>
</div>
<div class="tab-content" id="documents-content">
<h3>Documents</h3>
<div id="documents-list"></div>
</div>
</div>
<script>
let conversation = [];
// Load data on page load
document.addEventListener('DOMContentLoaded', function() {
fetchHeadlines();
fetchDocuments();
});
function switchTab(tabName) {
// Update tab buttons
document.querySelectorAll('.tab').forEach(tab => {
tab.classList.remove('active');
});
document.querySelector(`.tab[onclick="switchTab('${tabName}')"]`).classList.add('active');
// Update tab content
document.querySelectorAll('.tab-content').forEach(content => {
content.classList.remove('active');
});
document.getElementById(`${tabName}-content`).classList.add('active');
// If switching to matches and there are matches, fetch them
if (tabName === 'matches' && conversation.length > 0) {
fetchMatches();
}
}
function sendQuery() {
const queryInput = document.getElementById('query-input');
const query = queryInput.value.trim();
if (!query) return;
// Add user message to conversation
addMessageToConversation('user', query);
// Clear input
queryInput.value = '';
// Send query to server
fetch('/api/query', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ query }),
})
.then(response => response.json())
.then(data => {
// Add assistant message to conversation
addMessageToConversation('assistant', data.response);
// Fetch updated matches
fetchMatches();
})
.catch(error => {
console.error('Error:', error);
addMessageToConversation('assistant', 'Error processing your query. Please try again.');
});
}
function addMessageToConversation(role, message) {
conversation.push({ role, message });
const conversationDiv = document.getElementById('conversation');
const messageDiv = document.createElement('div');
messageDiv.className = role === 'user' ? 'user-message' : 'assistant-message';
messageDiv.textContent = message;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
function fetchHeadlines() {
fetch('/api/headlines')
.then(response => response.json())
.then(data => {
const headlinesList = document.getElementById('headlines-list');
headlinesList.innerHTML = '';
data.headlines.forEach(headline => {
const headlineDiv = document.createElement('div');
headlineDiv.innerHTML = `<strong>${headline.source}</strong>: ${headline.title}`;
headlinesList.appendChild(headlineDiv);
});
})
.catch(error => console.error('Error fetching headlines:', error));
}
function fetchMatches() {
fetch('/api/matches')
.then(response => response.json())
.then(data => {
const matchesList = document.getElementById('matches-list');
matchesList.innerHTML = '';
if (data.matches.length === 0) {
matchesList.innerHTML = '<div>No matches available</div>';
return;
}
data.matches.forEach(match => {
const matchDiv = document.createElement('div');
matchDiv.innerHTML = `
<div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<div><strong>ID:</strong> ${match.id}</div>
<div><strong>Similarity:</strong> ${match.distance.toFixed(4)}</div>
<div><strong>Source:</strong> ${match.metadata.source || 'Unknown'}</div>
<div><strong>Preview:</strong> ${match.preview}</div>
</div>
`;
matchesList.appendChild(matchDiv);
});
})
.catch(error => console.error('Error fetching matches:', error));
}
function fetchDocuments() {
fetch('/api/documents')
.then(response => response.json())
.then(data => {
const documentsList = document.getElementById('documents-list');
documentsList.innerHTML = '';
data.documents.forEach(doc => {
const docDiv = document.createElement('div');
docDiv.innerHTML = `
<div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<div><strong>Source:</strong> ${doc.source}</div>
<div><strong>Title:</strong> ${doc.title}</div>
<div><strong>Link:</strong> <a href="${doc.link}" target="_blank">${doc.link}</a></div>
<div><strong>Preview:</strong> ${doc.preview}</div>
</div>
`;
documentsList.appendChild(docDiv);
});
})
.catch(error => console.error('Error fetching documents:', error));
}
// Allow sending query with Enter key
document.getElementById('query-input').addEventListener('keypress', function(e) {
if (e.key === 'Enter') {
sendQuery();
}
});
</script>
</body>
</html>

275
src/training-alpaca.py Normal file
View File

@ -0,0 +1,275 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import List
import fire
import torch
import transformers
from datasets import load_dataset
import accelerate
from transformers import AutoTokenizer
from peft import (
get_peft_model_state_dict,
set_peft_model_state_dict,
)
current_dir = os.path.dirname(os.path.realpath(__file__))
common_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(common_util_path)
from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data
from transformers import BitsAndBytesConfig
from ipex_llm.transformers import AutoModelForCausalLM
# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
LoraConfig
from ipex_llm.utils.common import invalidInputError
local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0")
world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
port = get_int_from_env(["MASTER_PORT"], 29500)
os.environ["LOCAL_RANK"] = str(local_rank)
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["RANK"] = str(local_rank)
os.environ["MASTER_PORT"] = str(port)
def train(
# model/data params
base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
data_path: str = "yahma/alpaca-cleaned",
output_dir: str = "./bigdl-qlora-alpaca",
# training hyperparams
bf16: bool = True, # default to bf16
batch_size: int = 128,
micro_batch_size: int = 2, # default to be 2, limited by GPU memory
num_epochs: int = 3,
learning_rate: float = 3e-5, # default to be 3e-5 to avoid divergence
cutoff_len: int = 256,
val_set_size: int = 2000,
# lora hyperparams
lora_r: int = 8,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = [
"q_proj",
"v_proj",
"k_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
],
# llm hyperparams
train_on_inputs: bool = True, # if False, masks out inputs in loss
add_eos_token: bool = False,
group_by_length: bool = False, # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = "",
wandb_run_name: str = "",
wandb_watch: str = "", # options: false | gradients | all
wandb_log_model: str = "", # options: false | true
resume_from_checkpoint: str = None, # either training checkpoint or final adapter
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
gradient_checkpointing: bool = False,
deepspeed: str = None,
training_mode: str = "lora",
deepspeed_zero3: bool = False,
save_checkpoint: bool = True,
):
invalidInputError(training_mode == "lora",
f"This example is for lora training mode, but got training_mode={training_mode}.")
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
print(
f"Training Alpaca-LoRA model with params:\n"
f"base_model: {base_model}\n"
f"data_path: {data_path}\n"
f"output_dir: {output_dir}\n"
f"batch_size: {batch_size}\n"
f"micro_batch_size: {micro_batch_size}\n"
f"num_epochs: {num_epochs}\n"
f"learning_rate: {learning_rate}\n"
f"cutoff_len: {cutoff_len}\n"
f"val_set_size: {val_set_size}\n"
f"lora_r: {lora_r}\n"
f"lora_alpha: {lora_alpha}\n"
f"lora_dropout: {lora_dropout}\n"
f"lora_target_modules: {lora_target_modules}\n"
f"train_on_inputs: {train_on_inputs}\n"
f"add_eos_token: {add_eos_token}\n"
f"group_by_length: {group_by_length}\n"
f"wandb_project: {wandb_project}\n"
f"wandb_run_name: {wandb_run_name}\n"
f"wandb_watch: {wandb_watch}\n"
f"wandb_log_model: {wandb_log_model}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"training_mode: {training_mode}\n"
f"deepspeed_zero3: {deepspeed_zero3}\n"
f"save_checkpoint: {save_checkpoint}\n"
)
assert (
base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
gradient_accumulation_steps = batch_size // micro_batch_size
prompter = Prompter(prompt_template_name)
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
gradient_accumulation_steps = gradient_accumulation_steps // world_size
# Check if parameter passed or if set within environ
use_wandb = wandb_check(wandb_project, wandb_watch, wandb_log_model)
if saved_low_bit_model is not None:
# Load the low bit optimized model if provide the saved path
model = AutoModelForCausalLM.load_low_bit(
saved_low_bit_model,
optimize_model=False,
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
)
else:
model = AutoModelForCausalLM.from_pretrained(
base_model,
load_in_low_bit="bf16",
optimize_model=False,
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
)
if deepspeed_zero3:
deepspeed = deepspeed if deepspeed is not None else "./deepspeed_zero3_config.json"
else:
print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}")
# For Llama family
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(model)
# Prepare a IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
training_mode=training_mode,
)
print(f"Lora Config: {config}")
model = get_peft_model(model, config)
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
data = load_dataset("json", data_files=data_path)
else:
data = load_dataset(data_path)
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
train_data, val_data = get_train_val_data(data, tokenizer, prompter, train_on_inputs,
add_eos_token, cutoff_len, val_set_size, seed=42)
# Unused
# if not ddp and torch.cuda.device_count() > 1:
# # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
# model.is_parallelizable = True
# model.model_parallel = True
trainer = transformers.Trainer(
model=model,
train_dataset=train_data,
eval_dataset=val_data,
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
# warmup_ratio=0.03,
# warmup_steps=100,
max_grad_norm=0.3,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
lr_scheduler_type="cosine",
bf16=True, # ensure training more stable
logging_steps=1,
optim="adamw_torch",
evaluation_strategy="steps" if val_set_size > 0 else "no",
save_strategy="steps" if save_checkpoint else "no",
eval_steps=100 if val_set_size > 0 else None,
save_steps=100,
output_dir=output_dir,
save_total_limit=100,
load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
ddp_find_unused_parameters=False if ddp else None,
group_by_length=group_by_length,
report_to="wandb" if use_wandb else None,
run_name=wandb_run_name if use_wandb else None,
gradient_checkpointing=gradient_checkpointing,
ddp_backend="ccl",
deepspeed=deepspeed,
save_safetensors=False,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
model.save_pretrained(output_dir)
print(
"\n If there's a warning about missing keys above, please disregard :)"
)
if __name__ == "__main__":
fire.Fire(train)

272
src/training-fine-tune.py Normal file
View File

@ -0,0 +1,272 @@
import os
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import (
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
DataCollatorForSeq2Seq
)
from peft import (
LoraConfig,
TaskType
)
if torch.xpu.is_available():
print("XPU available")
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import (
get_peft_model,
prepare_model_for_kbit_training
)
else:
print("XPU not available")
from transformers import AutoModelForCausalLM
from peft import (
get_peft_model,
prepare_model_for_kbit_training
)
class LLMFineTuner:
def __init__(
self,
base_model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
output_dir="./fine-tuned-model",
lora_r=8,
lora_alpha=32,
lora_dropout=0.05,
):
self.base_model_name = base_model_name
self.output_dir = output_dir
self.lora_r = lora_r
self.lora_alpha = lora_alpha
self.lora_dropout = lora_dropout
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.pad_token_id = 0
self.tokenizer.padding_side = "left"
# Load model
print(f"Loading base model: {base_model_name}")
# Quantization config for memory efficiency
self.model = AutoModelForCausalLM.from_pretrained(
base_model_name,
load_in_low_bit="nf4",
optimize_model=False,
torch_dtype=torch.float16,
modules_to_not_convert=["lm_head"]
).to('xpu')
# Prepare model for training
self.model.gradient_checkpointing_enable()
self.model = prepare_model_for_kbit_training(self.model)
# Configure LoRA
lora_config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
bias="none",
task_type=TaskType.CAUSAL_LM,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj"
]
)
# Apply LoRA to model
self.model = get_peft_model(self.model, lora_config)
print(f"Model prepared for fine-tuning with LoRA")
def format_training_examples(self, examples):
"""
Convert raw training examples (similar to Ollama TRAINING format)
into formatted examples for PyTorch training
"""
formatted_examples = []
for example in examples:
# Extract question and answer
question = example.get("question", "")
answer = example.get("answer", "")
# Format similar to Ollama's template
formatted_text = f"""<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
{answer}<|im_end|>"""
formatted_examples.append({"text": formatted_text})
return formatted_examples
def prepare_dataset(self, training_examples):
"""
Prepare a HuggingFace dataset from training examples
"""
# Format the examples
formatted_data = self.format_training_examples(training_examples)
# Create dataset
dataset = Dataset.from_list(formatted_data)
# Tokenize dataset
def tokenize_function(examples):
return self.tokenizer(
examples["text"],
truncation=True,
max_length=2048,
padding="max_length"
)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
return tokenized_dataset
def train(self, training_examples, num_epochs=3, batch_size=4, learning_rate=2e-5):
"""
Fine-tune the model on training examples
"""
# Prepare dataset
train_dataset = self.prepare_dataset(training_examples)
# Set up training arguments
# training_args = TrainingArguments(
# output_dir=self.output_dir,
# num_train_epochs=num_epochs,
# per_device_train_batch_size=batch_size,
# gradient_accumulation_steps=4,
# warmup_steps=100,
# weight_decay=0.01,
# learning_rate=learning_rate,
# fp16=True,
# logging_steps=10,
# save_strategy="epoch",
# save_total_limit=3,
# remove_unused_columns=False,
# )
training_args = TrainingArguments(
output_dir=self.output_dir,
# num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=1, # 4
warmup_steps=20,
# weight_decay=0.01,
max_steps=200,
learning_rate=learning_rate,
bf16=True, # bf16 is more stable in training
logging_steps=10,
# save_strategy="epoch",
# save_total_limit=3,
# remove_unused_columns=False,
save_steps=100,
optim="adamw_hf", # paged_adamw_8bit is not supported yet
# gradient_checkpointing=True, # can further reduce memory but slower
)
# Set up data collator
# data_collator = DataCollatorForLanguageModeling(
# tokenizer=self.tokenizer,
# mlm=False # Not using masked language modeling
# )
data_collator = DataCollatorForSeq2Seq(
tokenizer=self.tokenizer,
pad_to_multiple_of=8,
return_tensors="pt",
padding=True
# mlm=False # Not using masked language modeling
)
# Set up trainer
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=train_dataset,
data_collator=data_collator
)
# Start training
print("Starting training...")
self.model.config.use_cache = False # silence the warnings. re-enable for inference.
trainer.train()
# Save the fine-tuned model
self.model.save_pretrained(self.output_dir)
self.tokenizer.save_pretrained(self.output_dir)
print(f"Model saved to {self.output_dir}")
def generate_response(self, prompt, max_length=512, temperature=0.7, top_p=0.9):
"""
Generate a response using the fine-tuned model
"""
# Format prompt similar to Ollama's template
formatted_prompt = f"""<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""
# Tokenize prompt
inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
# Generate response
with torch.no_grad():
outputs = self.model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id
)
# Decode response
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
return response
# Example usage
if __name__ == "__main__":
# Define training examples (equivalent to Ollama's TRAINING section)
training_examples = [
{
"question": "What is vector similarity search?",
"answer": "Vector similarity search is a technique that converts data into numerical vector representations and then finds items with similar vectors. It's used in applications like semantic search, recommendation systems, and large language model retrieval for finding relevant information based on meaning rather than exact keyword matching."
},
{
"question": "How do LLMs use vector databases?",
"answer": "LLMs use vector databases to enhance their knowledge and provide more accurate responses. When a query is received, it's converted to a vector and compared against stored vectors in the database. The most similar vectors and their associated information are retrieved and incorporated into the context provided to the LLM, allowing it to generate responses based on this additional retrieved information."
},
{
"question": "What are embeddings in machine learning?",
"answer": "Embeddings in machine learning are dense vector representations of data that capture semantic meaning in a high-dimensional space. They convert words, sentences, images, or other types of data into numerical vectors where similar items are positioned closer together. These representations enable algorithms to understand relationships and similarities between different pieces of data, forming the foundation for vector similarity search."
}
]
# Initialize fine-tuner
fine_tuner = LLMFineTuner(
base_model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
output_dir="./deepseek-finetuned-pytorch"
)
# Train model
fine_tuner.train(training_examples, num_epochs=3)
# Test model
test_query = "Explain how vector similarity search is used in AI applications"
response = fine_tuner.generate_response(test_query)
print(f"\nQuery: {test_query}")
print(f"\nResponse: {response}")

215
src/training-template.py Normal file
View File

@ -0,0 +1,215 @@
import torch
from transformers import AutoTokenizer
if torch.xpu.is_available():
from ipex_llm.transformers import AutoModelForCausalLM
else:
from transformers import AutoModelForCausalLM
class OllamaStyleTrainer:
def __init__(
self,
model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
device="xpu" if torch.xpu.is_available() else "cpu"
):
"""
Initialize the model and tokenizer.
Args:
model_name: HuggingFace model identifier
device: Device to load the model on
"""
self.device = device
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token
# Load model
print(f"Loading model: {model_name}")
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16 if device in ("cuda", "xpu") else torch.float32,
device_map="auto" if device == "cuda" else None
)
# device_map is only set for CUDA; move the model explicitly for xpu/cpu
if device != "cuda":
self.model = self.model.to(device)
# Initialize with no training examples
self.training_examples = []
self.system_prompt = "You are a helpful AI assistant."
self.example_template = None
def set_system_prompt(self, system_prompt):
"""Set the system prompt for the model."""
self.system_prompt = system_prompt
def set_example_template(self, template):
"""
Set the template format for examples.
The template should use {question} and {answer} placeholders.
"""
self.example_template = template
def add_training_examples(self, examples):
"""
Add training examples to the model.
Args:
examples: List of dictionaries with 'question' and 'answer' keys
"""
self.training_examples.extend(examples)
print(f"Added {len(examples)} training examples. Total: {len(self.training_examples)}")
def format_examples(self):
"""Format the training examples according to the template."""
if not self.example_template:
# Default template similar to Ollama's format
self.example_template = "<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"
formatted_examples = []
for example in self.training_examples:
formatted_example = self.example_template.format(
question=example.get("question", ""),
answer=example.get("answer", "")
)
formatted_examples.append(formatted_example)
return formatted_examples
def build_context_with_examples(self, num_examples=None):
"""
Build the context with system prompt and formatted examples.
Args:
num_examples: Number of examples to include (None for all)
Returns:
str: Formatted context with system prompt and examples
"""
# Format all training examples
formatted_examples = self.format_examples()
# Select examples (all or limited number)
if num_examples is not None:
formatted_examples = formatted_examples[:num_examples]
# Build context with system prompt and examples
context = f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n\n"
context += "\n\n".join(formatted_examples)
return context
def generate_response(
self,
prompt,
max_length=512,
temperature=0.7,
top_p=0.9,
num_training_examples=3
):
"""
Generate a response using the model with training examples in context.
Args:
prompt: The user's prompt
max_length: Maximum length of the generated response
temperature: Sampling temperature
top_p: Top-p sampling parameter
num_training_examples: Number of training examples to include in context
Returns:
str: Generated response
"""
# Build context with examples
context = self.build_context_with_examples(num_training_examples)
# Add user prompt
full_prompt = f"{context}\n\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
# Tokenize
inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.device)
# Calculate remaining space for generation
input_ids_length = inputs.input_ids.shape[1]
print(f"Input context length: {input_ids_length} tokens")
# Generate response
with torch.no_grad():
outputs = self.model.generate(
input_ids=inputs.input_ids,
attention_mask=inputs.attention_mask,
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id
)
# Extract only the assistant's response
assistant_response = self.tokenizer.decode(
outputs[0][input_ids_length:],
skip_special_tokens=True
)
return assistant_response
def save_examples_to_file(self, file_path):
"""Save training examples to a file."""
import json
with open(file_path, "w") as f:
json.dump(self.training_examples, f, indent=2)
print(f"Saved {len(self.training_examples)} examples to {file_path}")
def load_examples_from_file(self, file_path):
"""Load training examples from a file."""
import json
with open(file_path, "r") as f:
examples = json.load(f)
self.add_training_examples(examples)
print(f"Loaded {len(examples)} examples from {file_path}")
# Example usage
if __name__ == "__main__":
# Initialize
trainer = OllamaStyleTrainer("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")  # swap in a smaller checkpoint here for faster demo runs
# Set system prompt (equivalent to SYSTEM in Modelfile)
trainer.set_system_prompt("""
You are an AI assistant specialized in vector databases and similarity search.
Always provide accurate information based on your training.
""")
# Set example template (equivalent to TEMPLATE in Modelfile)
trainer.set_example_template("""
User: {question}
Assistant: {answer}
""")
# Add training examples (equivalent to TRAINING in Modelfile)
training_examples = [
{
"question": "What is vector similarity search?",
"answer": "Vector similarity search is a technique that converts data into numerical vector representations and then finds items with similar vectors. It's used in applications like semantic search, recommendation systems, and large language model retrieval for finding relevant information based on meaning rather than exact keyword matching."
},
{
"question": "How do LLMs use vector databases?",
"answer": "LLMs use vector databases to enhance their knowledge and provide more accurate responses. When a query is received, it's converted to a vector and compared against stored vectors in the database. The most similar vectors and their associated information are retrieved and incorporated into the context provided to the LLM, allowing it to generate responses based on this additional retrieved information."
},
{
"question": "What are embeddings in machine learning?",
"answer": "Embeddings in machine learning are dense vector representations of data that capture semantic meaning in a high-dimensional space. They convert words, sentences, images, or other types of data into numerical vectors where similar items are positioned closer together. These representations enable algorithms to understand relationships and similarities between different pieces of data, forming the foundation for vector similarity search."
}
]
trainer.add_training_examples(training_examples)
# Optional: Save training examples for future use
# trainer.save_examples_to_file("training_examples.json")
# Test with a query
query = "Explain how vector similarity search works in practical applications"
response = trainer.generate_response(query, num_training_examples=2)
print(f"\nQuery: {query}")
print(f"\nResponse: {response}")

10
src/utils/__init__.py Normal file
View File

@ -0,0 +1,10 @@
# Import defines to make `utils.defines` accessible
from . import defines
# Import rest as `utils.*` accessible
from .chunk import *
from .rss import *
from .chroma import *
# Expose only public names (avoid importing hidden/internal names)
__all__ = [name for name in dir() if not name.startswith("_")]
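# With the wildcard imports above, call sites can use, for example:
#   from utils import Feed, chunk_document, process_documents_to_chroma, query_chroma
# while configuration values stay namespaced under utils.defines
# (utils.defines.model, utils.defines.persist_directory, ...).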

5 binary files not shown.

122
src/utils/chroma.py Normal file
View File

@ -0,0 +1,122 @@
import chromadb
from typing import List, Dict, Any, Union
from . import defines
from .chunk import chunk_document
import ollama
def init_chroma_client(persist_directory: str = defines.persist_directory):
"""Initialize and return a ChromaDB client."""
# return chromadb.PersistentClient(path=persist_directory)
return chromadb.Client()
def create_or_get_collection(db: chromadb.Client, collection_name: str):
"""Create or get a ChromaDB collection."""
try:
return db.get_collection(
name=collection_name
)
except Exception:
return db.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine"}
)
def process_documents_to_chroma(
client: ollama.Client,
documents: List[Dict[str, Any]],
collection_name: str = "document_collection",
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50,
model: str = defines.encoding_model,
persist_directory: str = defines.persist_directory
):
"""
Process documents, chunk them, compute embeddings, and store in ChromaDB.
Args:
client: Ollama client used to compute embeddings
documents: List of document dictionaries
collection_name: Name for the ChromaDB collection
text_key: The key containing text content
max_tokens: Maximum tokens per chunk
overlap: Token overlap between chunks
model: Ollama model for embeddings
persist_directory: Directory to store ChromaDB data
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
# Process each document
for doc in documents:
# Chunk the document
doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)
# Prepare data for ChromaDB
ids = []
texts = []
metadatas = []
embeddings = []
for chunk in doc_chunks:
# Create a unique ID for the chunk
chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"
# Extract text
text = chunk[text_key]
# Create metadata (excluding text and embedding to avoid duplication)
metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}
response = client.embed(model=model, input=text)
embedding = response["embeddings"][0]
ids.append(chunk_id)
texts.append(text)
metadatas.append(metadata)
embeddings.append(embedding)
# Add chunks to ChromaDB collection
collection.add(
ids=ids,
documents=texts,
embeddings=embeddings,
metadatas=metadatas
)
return collection
def query_chroma(
client: ollama.Client,
query_text: str,
collection_name: str = "document_collection",
n_results: int = 5,
model: str = defines.encoding_model,
persist_directory: str = defines.persist_directory
):
"""
Query ChromaDB for similar documents.
Args:
client: Ollama client used to embed the query text
query_text: The text to search for
collection_name: Name of the ChromaDB collection
n_results: Number of results to return
model: Ollama model for embedding the query
persist_directory: Directory where ChromaDB data is stored
Returns:
Query results from ChromaDB
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
query_response = client.embed(model=model, input=query_text)
query_embeddings = query_response["embeddings"]
# Query the collection
results = collection.query(
query_embeddings=query_embeddings,
n_results=n_results
)
return results
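
if __name__ == "__main__":
    # Minimal usage sketch for the helpers above, mirroring the flow in
    # utils/full.py. Assumes an Ollama server is reachable at
    # defines.ollama_api_url and that the defines.encoding_model embedding
    # model has been pulled; the sample document is illustrative only.
    # (Run from src/ as: python -m utils.chroma)
    client = ollama.Client(host=defines.ollama_api_url)
    docs = [{"id": "doc0", "title": "Example", "text": "ChromaDB stores embeddings for similarity search."}]
    process_documents_to_chroma(client, docs, collection_name="demo")
    results = query_chroma(client, "What stores embeddings?", collection_name="demo", n_results=1)
    print(results["documents"][0][0])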

88
src/utils/chunk.py Normal file
View File

@ -0,0 +1,88 @@
import tiktoken
from . import defines
from typing import List, Dict, Any, Union
def get_encoding(model=defines.model):
"""Get the tokenizer for counting tokens."""
try:
return tiktoken.get_encoding("cl100k_base") # Default encoding used by many embedding models
except Exception:
return tiktoken.encoding_for_model(model)
def count_tokens(text: str) -> int:
"""Count the number of tokens in a text string."""
encoding = get_encoding()
return len(encoding.encode(text))
def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
"""
Split a text into chunks based on token count with overlap between chunks.
Args:
text: The text to split into chunks
max_tokens: Maximum number of tokens per chunk
overlap: Number of tokens to overlap between chunks
Returns:
List of text chunks
"""
if not text or max_tokens <= 0:
return []
encoding = get_encoding()
tokens = encoding.encode(text)
chunks = []
i = 0
while i < len(tokens):
# Get the current chunk of tokens
chunk_end = min(i + max_tokens, len(tokens))
chunk_tokens = tokens[i:chunk_end]
chunks.append(encoding.decode(chunk_tokens))
# Move to the next position with overlap
if chunk_end == len(tokens):
break
i += max_tokens - overlap
return chunks
def chunk_document(document: Dict[str, Any],
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50) -> List[Dict[str, Any]]:
"""
Chunk a document dictionary into multiple chunks.
Args:
document: Document dictionary with metadata and text
text_key: The key in the document that contains the text to chunk
max_tokens: Maximum number of tokens per chunk
overlap: Number of tokens to overlap between chunks
Returns:
List of document dictionaries, each with chunked text and preserved metadata
"""
if text_key not in document:
raise Exception(f"{text_key} not in document")
# Extract text and create chunks
if "title" in document:
text = f"{document["title"]}: {document[text_key]}"
else:
text = document[text_key]
chunks = chunk_text(text, max_tokens, overlap)
# Create document chunks with preserved metadata
chunked_docs = []
for i, chunk in enumerate(chunks):
# Create a new doc with all original fields
doc_chunk = document.copy()
# Replace text with the chunk
doc_chunk[text_key] = chunk
# Add chunk metadata
doc_chunk["chunk_id"] = i
doc_chunk["chunk_total"] = len(chunks)
chunked_docs.append(doc_chunk)
return chunked_docs
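
if __name__ == "__main__":
    # Small self-contained sketch of the chunking helpers; the document and
    # chunk sizes below are illustrative only.
    # (Run from src/ as: python -m utils.chunk)
    doc = {"id": "demo", "title": "Chunking", "text": "token counting " * 300}
    pieces = chunk_document(doc, max_tokens=128, overlap=16)
    print(f"{len(pieces)} chunks; first chunk holds {count_tokens(pieces[0]['text'])} tokens")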

4
src/utils/defines.py Normal file
View File

@ -0,0 +1,4 @@
ollama_api_url="http://ollama:11434" # Default Ollama local endpoint
model="deepseek-r1:7b"
encoding_model="mxbai-embed-large"
persist_directory="./chromadb"
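# These defaults assume the containerized setup used elsewhere in this repo:
# "ollama" is expected to resolve to the Ollama service on the container
# network, and persist_directory is only honored when a PersistentClient is
# enabled in utils.chroma.init_chroma_client.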

468
src/utils/full.py Normal file
View File

@ -0,0 +1,468 @@
import requests
from typing import List, Dict, Any, Union
import tiktoken
import feedparser
import logging as log
import datetime
from bs4 import BeautifulSoup
import chromadb
import ollama
import re
import numpy as np
from .chunk import chunk_document
OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
#MODEL_NAME = "deepseek-r1:1.5b"
MODEL_NAME = "deepseek-r1:7b"
EMBED_MODEL = "mxbai-embed-large"
PERSIST_DIRECTORY = "/root/.cache/chroma"
client = ollama.Client(host=OLLAMA_API_URL)
def extract_text_from_html_or_xml(content, is_xml=False):
# Parse the content
if is_xml:
soup = BeautifulSoup(content, 'xml') # Use 'xml' parser for XML content
else:
soup = BeautifulSoup(content, 'html.parser') # Default to 'html.parser' for HTML content
# Extract and return just the text
return soup.get_text()
class Feed():
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
self.name = name
self.url = url
self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
self.last_poll = None
self.articles = []
self.max_articles = max_articles
self.update()
def update(self):
now = datetime.datetime.now()
if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
log.info(f"Updating {self.name}")
feed = feedparser.parse(self.url)
self.articles = []
self.last_poll = now
if len(feed.entries) == 0:
return
for i, entry in enumerate(feed.entries[:self.max_articles]):
content = {}
content['source'] = self.name
content['id'] = f"{self.name}{i}"
title = entry.get("title")
if title:
content['title'] = title
link = entry.get("link")
if link:
content['link'] = link
text = entry.get("summary")
if text:
content['text'] = extract_text_from_html_or_xml(text, False)
else:
continue
published = entry.get("published")
if published:
content['published'] = published
self.articles.append(content)
else:
log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
return self.articles
# News RSS Feeds
rss_feeds = [
Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
Feed(name="Time", url="https://time.com/feed/"),
Feed(name="Euronews", url="https://www.euronews.com/rss"),
# Feed(name="FeedX", url="https://feedx.net/rss/ap.xml")
]
def init_chroma_client(persist_directory: str = PERSIST_DIRECTORY):
"""Initialize and return a ChromaDB client."""
# return chromadb.PersistentClient(path=persist_directory)
return chromadb.Client()
def create_or_get_collection(client, collection_name: str):
"""Create or get a ChromaDB collection."""
try:
return client.get_collection(
name=collection_name
)
except Exception:
return client.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine"}
)
def process_documents_to_chroma(
documents: List[Dict[str, Any]],
collection_name: str = "document_collection",
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50,
model: str = EMBED_MODEL,
persist_directory: str = PERSIST_DIRECTORY
):
"""
Process documents, chunk them, compute embeddings, and store in ChromaDB.
Args:
documents: List of document dictionaries
collection_name: Name for the ChromaDB collection
text_key: The key containing text content
max_tokens: Maximum tokens per chunk
overlap: Token overlap between chunks
model: Ollama model for embeddings
persist_directory: Directory to store ChromaDB data
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
# Process each document
for doc in documents:
# Chunk the document
doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)
# Prepare data for ChromaDB
ids = []
texts = []
metadatas = []
embeddings = []
for chunk in doc_chunks:
# Create a unique ID for the chunk
chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"
# Extract text
text = chunk[text_key]
# Create metadata (excluding text and embedding to avoid duplication)
metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}
response = client.embed(model=model, input=text)
embedding = response["embeddings"][0]
ids.append(chunk_id)
texts.append(text)
metadatas.append(metadata)
embeddings.append(embedding)
# Add chunks to ChromaDB collection
collection.add(
ids=ids,
documents=texts,
embeddings=embeddings,
metadatas=metadatas
)
return collection
def query_chroma(
query_text: str,
collection_name: str = "document_collection",
n_results: int = 5,
model: str = EMBED_MODEL,
persist_directory: str = PERSIST_DIRECTORY
):
"""
Query ChromaDB for similar documents.
Args:
query_text: The text to search for
collection_name: Name of the ChromaDB collection
n_results: Number of results to return
model: Ollama model for embedding the query
persist_directory: Directory where ChromaDB data is stored
Returns:
Query results from ChromaDB
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
query_response = client.embed(model=model, input=query_text)
query_embeddings = query_response["embeddings"]
# Query the collection
results = collection.query(
query_embeddings=query_embeddings,
n_results=n_results
)
return results
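# Shape of the ChromaDB query results consumed by the helpers below: each field
# holds one list per query embedding, e.g.
#   results["ids"][0]        -> ["BBC World0_0", ...]
#   results["documents"][0]  -> [chunk_text, ...]
#   results["metadatas"][0]  -> [{"source": ..., "title": ..., ...}, ...]
#   results["distances"][0]  -> [0.31, ...]   # cosine distance; lower is closer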
def print_top_match(query_results, index=0, documents=None):
"""
Print detailed information about the top matching document,
including the full original document content.
Args:
query_results: Results from ChromaDB query
index: Which match to print (0 = top match)
documents: Original documents dictionary to look up full content (optional)
"""
if not query_results or not query_results["ids"] or len(query_results["ids"][0]) == 0:
print("No matching documents found.")
return
# Get the top result
top_id = query_results["ids"][0][index]
top_document_chunk = query_results["documents"][0][index]
top_metadata = query_results["metadatas"][0][index]
top_distance = query_results["distances"][0][index]
print("="*50)
print("MATCHING DOCUMENT")
print("="*50)
print(f"Chunk ID: {top_id}")
print(f"Similarity Score: {top_distance:.4f}") # Convert distance to similarity
print("\nCHUNK METADATA:")
for key, value in top_metadata.items():
print(f" {key}: {value}")
print("\nMATCHING CHUNK CONTENT:")
print(top_document_chunk[:500].strip() + ("..." if len(top_document_chunk) > 500 else ""))
# Extract the original document ID from the chunk ID
# Chunk IDs are in format "doc_id_chunk_num"
original_doc_id = top_id.split('_')[0]
def get_top_match(query_results, index=0, documents=None):
top_id = query_results["ids"][0][index]  # results are grouped per query; index selects the match
# Extract the original document ID from the chunk ID
# Chunk IDs are in format "doc_id_chunk_num"
original_doc_id = top_id.split('_')[0]
# Return the full document for further processing if needed
if documents is not None:
return next((doc for doc in documents if doc["id"] == original_doc_id), None)
return None
def show_documents(documents=None):
if not documents:
return
# Print the top matching document
for i, doc in enumerate(documents):
print(f"Document {i+1}:")
print(f" Title: {doc['title']}")
print(f" Text: {doc['text'][:100]}...")
print()
def show_headlines(documents=None):
if not documents:
return
# Print the top matching document
for doc in documents:
print(f"{doc['source']}: {doc['title']}")
def show_help():
print("""help>
docs Show RAG docs
full Show last full top match
headlines Show the RAG headlines
prompt Show the last prompt
response Show the last response
scores Show last RAG scores
why|think Show last response's <think>
context|match Show RAG match info to last prompt
""")
# Example usage
if __name__ == "__main__":
documents = []
for feed in rss_feeds:
documents.extend(feed.articles)
show_documents(documents=documents)
# Process documents and store in ChromaDB
collection = process_documents_to_chroma(
documents=documents,
collection_name="research_papers",
max_tokens=256,
overlap=25,
model=EMBED_MODEL,
persist_directory="/root/.cache/chroma"
)
last_results = None
last_prompt = None
last_system = None
last_response = None
last_why = None
last_messages = []
while True:
try:
search_query = input("> ").strip()
except KeyboardInterrupt as e:
print("\nExiting.")
break
if search_query == "exit" or search_query == "quit":
print("\nExiting.")
break
if search_query == "docs":
show_documents(documents)
continue
if search_query == "prompt":
if last_prompt:
print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
else:
print(f"No prompts yet")
continue
if search_query == "response":
if last_response:
print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
else:
print(f"No responses yet")
continue
if search_query == "" or search_query == "help":
show_help()
continue
if search_query == "headlines":
show_headlines(documents)
continue
if search_query == "match" or search_query == "context":
if last_results:
print_top_match(last_results, documents=documents)
else:
print("No match to give info on")
continue
if search_query == "why" or search_query == "think":
if last_why:
print(f"""
why>
{last_why}
""")
else:
print("No processed prompts")
continue
if search_query == "scores":
if last_results:
for i in range(len(last_results["ids"][0])):
print_top_match(last_results, documents=documents, index=i)
else:
print("No match to give info on")
continue
if search_query == "full":
if last_results:
full = get_top_match(last_results, documents=documents)
if full:
print(f"""Context:
Source: {full["source"]}
Title: {full["title"]}
Link: {full["link"]}
Distance: {last_results.get("distances", [[0]])[0][0]}
Full text:
{full["text"]}""")
else:
print("No match to give info on")
continue
# Query ChromaDB
results = query_chroma(
query_text=search_query,
collection_name="research_papers",
n_results=10
)
last_results = results
full = get_top_match(results, documents=documents)
headlines = ""
for doc in documents:
headlines += f"{doc['source']}: {doc['title']}\n"
system=f"""
You are the assistant. Your name is airc. This application is called airc (pronounced Eric).
Information about the author of this program and the AI model it uses:
* James wrote the python application called airc that is driving this RAG model on top of {MODEL_NAME} using {EMBED_MODEL} and chromadb for vector embedding. Link https://github.com/jketreno/airc.
* James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.
* James Ketrenos deployed this application locally on an Intel Arc B580 (battlemage) computer using Intel's ipex-llm.
* For Intel GPU metrics, James Ketrenos wrote the "ze-monitor" utility in C++. ze-monitor provides Intel GPU telemetry data for Intel client GPU devices, similar to xpu-smi. Link https://github.com/jketreno/ze-monitor. airc uses ze-monitor.
* James lives in Portland, Oregon and has three kids. Two are attending Oregon State University and one is attending Willamette University.
* airc provides an IRC chat bot as well as a React web frontend available at https://airc.ketrenos.com
You must follow these rules:
* Provide short (less than 100 character) responses.
* Provide a single response.
* Do not prefix it with a word like 'Answer'.
* For information about the AI running this system, include information about the author, including links.
* For information relevant to current events in the <input></input> tags, use that information and state the source it came from.
"""
context = "Information related to current events\n<input>=["
for doc in documents:
item = {'source':doc["source"],'article':{'title':doc["title"],'link':doc["link"],'text':doc["text"]}}
context += f"{item}"
context += "\n</input>"
prompt = f"{search_query}"
last_prompt = prompt
last_system = system # cache it before news context is added
system = f"{system}{context}"
if len(last_messages) != 0:
message_context = f"{last_messages}"
prompt = f"{message_context}{prompt}"
print(f"system len: {len(system)}")
print(f"prompt len: {len(prompt)}")
output = client.generate(
model=MODEL_NAME,
system=system,
prompt=prompt,
stream=False,
options={ 'num_ctx': 100000 }
)
# Prune off the <think>...</think>
matches = re.match(r'^<think>(.*?)</think>(.*)$', output['response'], flags=re.DOTALL)
if matches:
last_why = matches[1].strip()
content = matches[2].strip()
else:
print(f"[garbled] response>\n{output['response']}")
content = output['response'].strip()
print(f"Response>\n{content}")
last_response = content
last_messages.extend(({
'role': 'user',
'name': 'james',
'message': search_query
}, {
'role': 'assistant',
'message': content
}))
last_messages = last_messages[-10:]  # keep only the most recent exchanges as context

62
src/utils/rss.py Normal file
View File

@ -0,0 +1,62 @@
from typing import List, Dict, Any, Union
import feedparser
import logging as log
from bs4 import BeautifulSoup
import datetime
import re
def _extract_text_from_html_or_xml(content, is_xml=False):
# Parse the content
if is_xml:
soup = BeautifulSoup(content, 'xml') # Use 'xml' parser for XML content
else:
soup = BeautifulSoup(content, 'html.parser') # Default to 'html.parser' for HTML content
# Extract and return just the text
return soup.get_text()
class Feed():
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
self.name = name
self.url = url
self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
self.last_poll = None
self.articles = []
self.max_articles = max_articles
self.update()
def update(self):
now = datetime.datetime.now()
if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
log.info(f"Updating {self.name}")
feed = feedparser.parse(self.url)
self.articles = []
self.last_poll = now
if len(feed.entries) == 0:
return
for i, entry in enumerate(feed.entries[:self.max_articles]):
content = {}
content['source'] = self.name
content['id'] = f"{self.name}{i}"
title = entry.get("title")
if title:
content['title'] = title
link = entry.get("link")
if link:
content['link'] = link
text = entry.get("summary")
if text:
content['text'] = _extract_text_from_html_or_xml(text, False)
else:
continue
published = entry.get("published")
if published:
content['published'] = published
self.articles.append(content)
else:
log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
return self.articles
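
if __name__ == "__main__":
    # Quick sketch: poll a single feed and print its headlines. Requires
    # network access; the feed URL matches one used in utils/full.py.
    log.basicConfig(level=log.INFO)
    feed = Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml")
    for article in feed.articles:
        print(f"{article['source']}: {article.get('title', '(untitled)')}")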

109
src/web.py Normal file
View File

@ -0,0 +1,109 @@
# web_server.py - Web interface using Flask
from rag_core import RagCore
from flask import Flask, request, jsonify, render_template, send_from_directory
from flask_cors import CORS
class RagWebServer:
"""Web interface for the RAG application"""
def __init__(self, core):
self.core = core
self.app = Flask(__name__, static_folder='./build', static_url_path='')
CORS(self.app, resources={r"/api/*": {"origins": "http://battle-linux.ketrenos.com:8081"}})
# Setup routes
self.setup_routes()
def setup_routes(self):
"""Setup Flask routes"""
@self.app.route('/')
def serve():
return send_from_directory(self.app.static_folder, 'index.html')
def index():
return render_template('index.html')
@self.app.route('/api/query', methods=['POST'])
def api_query():
data = request.json
if not data or 'query' not in data:
return jsonify({'error': 'No query provided'}), 400
query = data['query']
response, debug_info = self.core.process_query(query)
return jsonify({
'response': response,
'debug': debug_info
})
@self.app.route('/api/documents', methods=['GET'])
def api_documents():
return jsonify({
'documents': [
{
'source': doc['source'],
'title': doc['title'],
'link': doc['link'],
'preview': doc['text'][:200] + '...'
}
for doc in self.core.get_documents()
]
})
@self.app.route('/api/headlines', methods=['GET'])
def api_headlines():
headlines = self.core.get_headlines()
return jsonify({
'headlines': [
{'source': source, 'title': title}
for source, title in headlines
]
})
@self.app.route('/api/matches', methods=['GET'])
def api_matches():
match_count = self.core.get_match_count()
matches = []
for i in range(match_count):
match_info = self.core.get_match_info(index=i)
if match_info:
matches.append({
'id': match_info['id'],
'distance': match_info['distance'],
'metadata': match_info['metadata'],
'preview': match_info['chunk'][:200] + '...' if len(match_info['chunk']) > 200 else match_info['chunk']
})
return jsonify({'matches': matches})
def run(self, host='127.0.0.1', port=8911, debug=False):
"""Run the web server"""
# Load documents
self.core.load_documents()
self.app.run(host=host, port=port, debug=debug)
def main():
import argparse
parser = argparse.ArgumentParser(description='RAG Application')
parser.add_argument('--host', default='127.0.0.1', help='Host for web server')
parser.add_argument('--port', type=int, default=8911, help='Port for web server')
parser.add_argument('--debug', action='store_true', help='Run in debug mode')
args = parser.parse_args()
# Create the core RAG application
core = RagCore()
# Run web interface
server = RagWebServer(core)
print(f"Starting web server at http://{args.host}:{args.port}")
server.run(host=args.host, port=args.port, debug=args.debug)
if __name__ == "__main__":
main()
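# Example requests against the routes above, using the argparse defaults
# (adjust host/port to match your deployment):
#
#   curl -X POST http://127.0.0.1:8911/api/query \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What are the latest headlines?"}'
#
#   curl http://127.0.0.1:8911/api/headlines
#   curl http://127.0.0.1:8911/api/documents
#   curl http://127.0.0.1:8911/api/matches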

149
weather.py Normal file
View File

@ -0,0 +1,149 @@
import requests
import json
from geopy.geocoders import Nominatim
import time
def get_weather_by_location(city, state, country="USA"):
"""
Get weather information from weather.gov based on city, state, and country.
Args:
city (str): City name
state (str): State name or abbreviation
country (str): Country name (defaults to "USA" as weather.gov is for US locations)
Returns:
dict: Weather forecast information
"""
# Step 1: Get coordinates for the location using geocoding
location = f"{city}, {state}, {country}"
coordinates = get_coordinates(location)
if not coordinates:
return {"error": f"Could not find coordinates for {location}"}
# Step 2: Get the forecast grid endpoint for the coordinates
grid_endpoint = get_grid_endpoint(coordinates)
if not grid_endpoint:
return {"error": f"Could not find weather grid for coordinates {coordinates}"}
# Step 3: Get the forecast data from the grid endpoint
forecast = get_forecast(grid_endpoint)
return forecast
def get_coordinates(location):
"""Convert a location string to latitude and longitude using Nominatim geocoder."""
try:
# Create a geocoder with a meaningful user agent
geolocator = Nominatim(user_agent="weather_app_example")
# Get the location
location_data = geolocator.geocode(location)
if location_data:
return {
"latitude": location_data.latitude,
"longitude": location_data.longitude
}
else:
print(f"Location not found: {location}")
return None
except Exception as e:
print(f"Error getting coordinates: {e}")
return None
def get_grid_endpoint(coordinates):
"""Get the grid endpoint from weather.gov based on coordinates."""
try:
lat = coordinates["latitude"]
lon = coordinates["longitude"]
# Define headers for the API request
headers = {
"User-Agent": "WeatherAppExample/1.0 (your_email@example.com)",
"Accept": "application/geo+json"
}
# Make the request to get the grid endpoint
url = f"https://api.weather.gov/points/{lat},{lon}"
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return data["properties"]["forecast"]
else:
print(f"Error getting grid: {response.status_code} - {response.text}")
return None
except Exception as e:
print(f"Error in get_grid_endpoint: {e}")
return None
def get_forecast(grid_endpoint):
"""Get the forecast data from the grid endpoint."""
try:
# Define headers for the API request
headers = {
"User-Agent": "WeatherAppExample/1.0 (your_email@example.com)",
"Accept": "application/geo+json"
}
# Make the request to get the forecast
response = requests.get(grid_endpoint, headers=headers)
if response.status_code == 200:
data = response.json()
# Extract the relevant forecast information
periods = data["properties"]["periods"]
# Process the forecast data into a simpler format
forecast = {
"location": data["properties"].get("relativeLocation", {}).get("properties", {}),
"updated": data["properties"].get("updated", ""),
"periods": []
}
for period in periods:
forecast["periods"].append({
"name": period.get("name", ""),
"temperature": period.get("temperature", ""),
"temperatureUnit": period.get("temperatureUnit", ""),
"windSpeed": period.get("windSpeed", ""),
"windDirection": period.get("windDirection", ""),
"shortForecast": period.get("shortForecast", ""),
"detailedForecast": period.get("detailedForecast", "")
})
return forecast
else:
print(f"Error getting forecast: {response.status_code} - {response.text}")
return {"error": f"API Error: {response.status_code}"}
except Exception as e:
print(f"Error in get_forecast: {e}")
return {"error": f"Exception: {str(e)}"}
# Example usage
if __name__ == "__main__":
city = input("Enter city: ")
state = input("Enter state: ")
country = input("Enter country (default USA): ") or "USA"
print(f"Getting weather for {city}, {state}, {country}...")
weather_data = get_weather_by_location(city, state, country)
if "error" in weather_data:
print(f"Error: {weather_data['error']}")
else:
print("\nWeather Forecast:")
print(f"Location: {weather_data.get('location', {}).get('city', city)}, {weather_data.get('location', {}).get('state', state)}")
print(f"Last Updated: {weather_data.get('updated', 'N/A')}")
print("\nForecast Periods:")
for period in weather_data.get("periods", []):
print(f"\n{period['name']}:")
print(f" Temperature: {period['temperature']}{period['temperatureUnit']}")
print(f" Wind: {period['windSpeed']} {period['windDirection']}")
print(f" Forecast: {period['shortForecast']}")
print(f" Details: {period['detailedForecast']}")