Caching

This commit is contained in:
  parent 1130077c03
  commit f10f056e0d

Dockerfile.airc-pytorch (new file, 444 lines)
@@ -0,0 +1,444 @@
FROM ubuntu:oracular AS pytorch-build

SHELL [ "/bin/bash", "-c" ]

# Instructions Dockerfied from:
#
# https://github.com/pytorch/pytorch
#
# and
#
# https://pytorch.org/docs/stable/notes/get_start_xpu.html
# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
#
#
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# ipex only supports python 3.11, so use 3.11 instead of latest oracular (3.12)

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        gpg-agent \
        less \
        libbz2-dev \
        libffi-dev \
        libjpeg-dev \
        libpng-dev \
        libreadline-dev \
        libssl-dev \
        libsqlite3-dev \
        llvm \
        nano \
        wget \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

#        python3 \
#        python3-pip \
#        python3-venv \
#        python3-dev \

RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

# Build Python in /opt/..., install it locally, then remove the build environment
# collapsed to a single docker layer.
WORKDIR /opt
ENV PYTHON_VERSION=3.11.9

RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
    && cd Python-${PYTHON_VERSION} \
    && ./configure --prefix=/opt/python --enable-optimizations \
    && make -j$(nproc) \
    && make install \
    && cd /opt \
    && rm -rf Python-${PYTHON_VERSION}

WORKDIR /opt/pytorch

FROM ubuntu:oracular AS ze-monitor
# From https://github.com/jketreno/ze-monitor
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        debhelper \
        devscripts \
        cmake \
        git \
        libfmt-dev \
        libncurses-dev \
        rpm \
        rpm2cpio \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

RUN apt-get install -y \
        software-properties-common \
    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
    && apt-get update \
    && apt-get install -y \
        libze-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

RUN git clone --depth 1 --branch v0.3.0-1 https://github.com/jketreno/ze-monitor /opt/ze-monitor
WORKDIR /opt/ze-monitor/build
RUN cmake .. \
    && make \
    && cpack
FROM pytorch-build AS pytorch

COPY --from=pytorch-build /opt/pytorch /opt/pytorch

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libze-intel-gpu1 \
        libze1 \
        intel-ocloc \
        intel-opencl-icd \
        xpu-smi \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2

# When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries.
ENV SYCL_CACHE_PERSISTENT=1

WORKDIR /opt/pytorch

RUN { \
        echo '#!/bin/bash' ; \
        update-alternatives --set python3 /opt/python/bin/python3.11 ; \
        echo 'source /opt/pytorch/venv/bin/activate' ; \
        echo 'bash -c "${@}"' ; \
    } > /opt/pytorch/shell ; \
    chmod +x /opt/pytorch/shell

RUN python3 -m venv --system-site-packages /opt/pytorch/venv

SHELL [ "/opt/pytorch/shell" ]

RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
RUN pip3 freeze > /opt/pytorch/requirements.txt

SHELL [ "/bin/bash", "-c" ]

RUN { \
        echo '#!/bin/bash' ; \
        echo 'echo "Container: pytorch"' ; \
        echo 'set -e' ; \
        echo 'echo "Setting pip environment to /opt/pytorch"' ; \
        echo 'source /opt/pytorch/venv/bin/activate'; \
        echo 'if [[ "${1}" == "" ]] || [[ "${1}" == "shell" ]]; then' ; \
        echo ' echo "Dropping to shell"' ; \
        echo ' /bin/bash -c "source /opt/pytorch/venv/bin/activate ; /bin/bash"' ; \
        echo 'else' ; \
        echo ' exec "${@}"' ; \
        echo 'fi' ; \
    } > /entrypoint.sh \
    && chmod +x /entrypoint.sh

ENTRYPOINT [ "/entrypoint.sh" ]

FROM pytorch AS ipex-llm-src

# Build ipex-llm from source

RUN git clone --branch main --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm \
    && cd /opt/ipex-llm \
    && git fetch --depth 1 origin cb3c4b26ad058c156591816aa37eec4acfcbf765 \
    && git checkout cb3c4b26ad058c156591816aa37eec4acfcbf765

WORKDIR /opt/ipex-llm

RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
RUN { \
        echo '#!/bin/bash' ; \
        update-alternatives --set python3 /opt/python/bin/python3.11 ; \
        echo 'source /opt/ipex-llm/venv/bin/activate' ; \
        echo 'bash -c "${@}"' ; \
    } > /opt/ipex-llm/shell ; \
    chmod +x /opt/ipex-llm/shell

SHELL [ "/opt/ipex-llm/shell" ]

RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu

WORKDIR /opt/ipex-llm/python/llm
RUN pip install requests wheel
RUN python setup.py clean --all bdist_wheel --linux

FROM airc AS jupyter

SHELL [ "/opt/airc/shell" ]

# BEGIN setup Jupyter
RUN pip install jupyter \
        jupyterlab==4.3.0a0 \
        jupyterhub==5.0.0 \
        notebook==7.3.0a0 \
        "jupyter-server-proxy>=4.1.2"
# END setup Jupyter

SHELL [ "/bin/bash", "-c" ]

RUN { \
        echo '#!/bin/bash' ; \
        echo 'echo "Container: airc jupyter"' ; \
        echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
        echo ' if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
        echo ' echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
        echo ' exit 1' ; \
        echo ' else' ; \
        echo ' if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
        echo ' echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
        echo ' fi' ; \
        echo 'fi' ; \
        echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
        echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
        echo 'source /opt/airc/venv/bin/activate' ; \
        echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
        echo 'while true; do' ; \
        echo ' echo "Launching jupyter notebook"' ; \
        echo ' jupyter notebook \' ; \
        echo ' --notebook-dir=/opt/jupyter \' ; \
        echo ' --port 8888 \' ; \
        echo ' --ip 0.0.0.0 \' ; \
        echo ' --no-browser \' ; \
        echo ' --allow-root \' ; \
        echo ' --ServerApp.token= \' ; \
        echo ' --ServerApp.password= \' ; \
        echo ' --ServerApp.allow_origin=* \' ; \
        echo ' --ServerApp.base_url="/jupyter" \' ; \
        echo ' "${@}" \' ; \
        echo ' 2>&1 | tee -a "/root/.cache/jupyter.log"' ; \
        echo ' echo "jupyter notebook died ($?). Restarting."' ; \
        echo ' sleep 5' ; \
        echo 'done' ; \
    } > /entrypoint-jupyter.sh \
    && chmod +x /entrypoint-jupyter.sh

ENTRYPOINT [ "/entrypoint-jupyter.sh" ]

FROM pytorch AS airc

RUN python3 -m venv --system-site-packages /opt/airc/venv

# Don't install the full oneapi essentials; just the ones that we seem to need
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
    | tee /etc/apt/sources.list.d/oneAPI.list \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        intel-oneapi-mkl-sycl-2025.0 \
        intel-oneapi-dnnl-2025.0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

RUN { \
        echo '#!/bin/bash' ; \
        echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
        echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
        echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "$1" == "" ]]; then bash -c; else bash -c "${@}"; fi' ; \
    } > /opt/airc/shell ; \
    chmod +x /opt/airc/shell

SHELL [ "/opt/airc/shell" ]

RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
# Install ipex-llm built in ipex-llm-src
COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install $pkg; done

COPY src/ /opt/airc/src/

# pydle does not work with newer asyncio due to coroutine
# being deprecated. Patch to work.
RUN pip3 install pydle transformers sentencepiece accelerate \
    && patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
        -p1 < /opt/airc/src/pydle.patch

# mistral fails with cache_position errors with transformers>4.40 (or at least it fails with the latest)
# as well as MistralSpda* things missing
RUN pip install "sentence_transformers<3.4.1" "transformers==4.40.0"

# To get xe_linear and other Xe methods
RUN pip3 install 'bigdl-core-xe-all>=2.6.0b'

# trl.core doesn't have what is needed with the default 'pip install trl' version
RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c

# Needed by src/model-server.py
RUN pip install flask

SHELL [ "/bin/bash", "-c" ]

RUN { \
        echo '#!/bin/bash' ; \
        echo 'set -e' ; \
        echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
        echo ' if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
        echo ' echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
        echo ' exit 1' ; \
        echo ' else' ; \
        echo ' if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
        echo ' echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
        echo ' fi' ; \
        echo 'fi' ; \
        echo 'echo "Container: airc"' ; \
        echo 'echo "Setting pip environment to /opt/airc"' ; \
        echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
        echo 'source /opt/airc/venv/bin/activate'; \
        echo 'if [[ "${1}" == "shell" ]] || [[ "${1}" == "/bin/bash" ]]; then' ; \
        echo ' echo "Dropping to shell"' ; \
        echo ' /bin/bash -c "source /opt/airc/venv/bin/activate ; /bin/bash"' ; \
        echo ' exit $?' ; \
        echo 'else' ; \
        echo ' while true; do' ; \
        echo ' echo "Launching model-server"' ; \
        echo ' python src/model-server.py \' ; \
        echo ' 2>&1 | tee -a "/root/.cache/model-server.log"'; \
        echo ' echo "model-server died ($?). Restarting."' ; \
        echo ' sleep 5' ; \
        echo ' done &' ; \
        echo ' while true; do' ; \
        echo ' echo "Launching airc"' ; \
        echo ' python src/airc.py "${@}" \' ; \
        echo ' 2>&1 | tee -a "/root/.cache/airc.log"' ; \
        echo ' echo "airc died ($?). Restarting."' ; \
        echo ' sleep 5' ; \
        echo ' done' ; \
        echo 'fi' ; \
    } > /entrypoint-airc.sh \
    && chmod +x /entrypoint-airc.sh

COPY --from=ze-monitor /opt/ze-monitor/build/ze-monitor-*deb /opt/
RUN dpkg -i /opt/ze-monitor-*deb

WORKDIR /opt/airc

SHELL [ "/opt/airc/shell" ]

# Needed by src/model-server.py
#RUN pip install faiss-cpu sentence_transformers feedparser bs4
RUN pip install ollama feedparser bs4

SHELL [ "/bin/bash", "-c" ]

ENTRYPOINT [ "/entrypoint-airc.sh" ]

FROM ubuntu:oracular AS ollama

# Get a couple prerequisites
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# Install Intel graphics runtimes
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libze-intel-gpu1 \
        libze1 \
        intel-ocloc \
        intel-opencl-icd \
        xpu-smi \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

WORKDIR /opt/ollama

# Download the nightly ollama release from ipex-llm
RUN wget -qO - https://github.com/intel/ipex-llm/releases/download/v2.2.0-nightly/ollama-0.5.4-ipex-llm-2.2.0b20250226-ubuntu.tgz | \
    tar --strip-components=1 -C . -xzv

# Install Python from Oracular (ollama works with 3.12)
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        python3 \
        python3-pip \
        python3-venv \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}

# Setup the ollama python virtual environment
RUN python3 -m venv --system-site-packages /opt/ollama/venv

# Setup the docker pip shell
RUN { \
        echo '#!/bin/bash' ; \
        update-alternatives --set python3 /opt/python/bin/python3.11 ; \
        echo 'source /opt/ollama/venv/bin/activate' ; \
        echo 'bash -c "${@}"' ; \
    } > /opt/ollama/shell ; \
    chmod +x /opt/ollama/shell

# Activate the pip environment on all shell calls
SHELL [ "/opt/ollama/shell" ]

# Install ollama python module
RUN pip3 install ollama

SHELL [ "/bin/bash", "-c" ]

RUN { \
        echo '#!/bin/bash'; \
        echo 'echo "Container: ollama"'; \
        echo 'set -e'; \
        echo 'echo "Setting pip environment to /opt/ollama"'; \
        echo 'source /opt/ollama/venv/bin/activate'; \
        echo 'export OLLAMA_NUM_GPU=999'; \
        echo 'export ZES_ENABLE_SYSMAN=1'; \
        echo 'export SYCL_CACHE_PERSISTENT=1'; \
        echo 'export OLLAMA_KEEP_ALIVE=-1'; \
        echo 'export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1'; \
        echo ''; \
        echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/ollama/)?shell$ ]]; then'; \
        echo ' echo "Dropping to shell"'; \
        echo ' exec /bin/bash'; \
        echo 'else'; \
        echo ' echo "Launching Ollama server..."'; \
        echo ' exec ./ollama serve'; \
        echo 'fi'; \
    } > /entrypoint.sh \
    && chmod +x /entrypoint.sh

RUN { \
        echo '#!/bin/bash'; \
        echo 'echo "Container: ollama"'; \
        echo 'set -e'; \
        echo 'echo "Setting pip environment to /opt/ollama"'; \
        echo 'source /opt/ollama/venv/bin/activate'; \
        echo './ollama pull mxbai-embed-large' ; \
        echo './ollama pull deepseek-r1:7b' ; \
    } > /fetch-models.sh \
    && chmod +x /fetch-models.sh

VOLUME [" /root/.ollama" ]
ENTRYPOINT [ "/entrypoint.sh" ]

LEARNINGS (new file, 112 lines)
@@ -0,0 +1,112 @@
# RAG and Fine Tuning

Two common practices for augmenting a model with new information:

1. In-context learning
2. Fine Tuning

## In-context learning

In-context learning is great for dynamic data. Rather than retrain the model, you include relevant contextual information in the prompt to the model. The approach to collect that context information is generally:

1. Store the content in a context retrieval system
2. Based on the user's query, determine which content in #1 is most relevant
3. Augment the user's query with that context information

PROS: Easy to implement
CONS: Retrieving the correct context can be difficult, and without the context, the model won't have the additional information and will revert to its internal knowledge.

If you only have one or two documents (totalling fewer tokens than the model's maximum input), you can include the full text of the documents in the prompt and skip the context retrieval step.
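As a rough sketch of steps 1-3 (not the retrieval pipeline this repo actually ships; the chunks, embedding model, and prompt format below are made up for illustration, and sentence_transformers is assumed to be installed as it is in the airc image):

```python
# Minimal in-context (RAG-style) sketch: embed a handful of text chunks, pick the
# ones closest to the query, and prepend them to the prompt.
import numpy as np
from sentence_transformers import SentenceTransformer

chunks = [
    "AIRC is an IRC chat client backed by an LLM.",
    "ze-monitor is a 'top'-like monitor for Level Zero devices.",
]

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice
chunk_vecs = embedder.encode(chunks, normalize_embeddings=True)

def retrieve(query: str, k: int = 2) -> list[str]:
    """Return the k chunks most similar to the query (cosine similarity)."""
    q_vec = embedder.encode([query], normalize_embeddings=True)[0]
    scores = chunk_vecs @ q_vec
    return [chunks[i] for i in np.argsort(scores)[::-1][:k]]

def build_prompt(query: str) -> str:
    """Augment the user's query with the retrieved context (step 3)."""
    context = "\n".join(retrieve(query))
    return f"Use the following context to answer.\n\nContext:\n{context}\n\nQuestion: {query}"

print(build_prompt("What does ze-monitor do?"))
```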
## Fine tuning

Fine tuning is great if you have static content, as applying the knowledge to the model requires the model to go through a training phase which can take several hours or longer, depending on the amount of information being updated. The approach to fine-tune is:

1. Process your data into a series of 'context', 'query', 'response' correlations
2. Train the model using those correlations

The main work involved is in #1, and the success of fine tuning will be greatly impacted by the method used to perform it. Because having a human expert manually generate queries and responses can be time consuming, the "prompt adjustment" of the first method can be used to generate them instead.

For this, you can iteratively perform the following:

### Query generation

1. Context data
2. Prompt: Given the context, create a list of questions about the topic. Do not provide answers.

### Response generation

1. Context data
2. Prompt: Given the context, respond to the following question: {query}

Repeat the above for each piece of context data. This is now your expert system which you can use to fine-tune your model.
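A sketch of that two-pass loop using the ollama Python client and the deepseek-r1:7b model pulled by fetch-models.sh; the prompts and the line-based parsing are illustrative, and the actual generation scripts in this repo may differ:

```python
# Two-pass correlation generation as described above. Assumes an ollama server is
# running with deepseek-r1:7b pulled; question parsing is deliberately naive.
import ollama

MODEL = "deepseek-r1:7b"

def generate_questions(context: str) -> list[str]:
    prompt = (
        "Given the context, create a list of questions about the topic. "
        f"Do not provide answers.\n\nContext:\n{context}"
    )
    reply = ollama.chat(model=MODEL, messages=[{"role": "user", "content": prompt}])
    lines = reply["message"]["content"].splitlines()
    return [line.strip("-*0123456789. ") for line in lines if line.strip()]

def generate_response(context: str, query: str) -> str:
    prompt = f"Given the context, respond to the following question: {query}\n\nContext:\n{context}"
    reply = ollama.chat(model=MODEL, messages=[{"role": "user", "content": prompt}])
    return reply["message"]["content"]

def build_correlations(context: str) -> list[dict]:
    # One ('context', 'query', 'response') record per generated question.
    return [
        {"context": context, "query": q, "response": generate_response(context, q)}
        for q in generate_questions(context)
    ]
```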
PROS: Not too difficult to implement and can give much better responses to queries about items covered in the context.
CONS: Updating the model with changes in the context requires retraining, and training takes a lot of system resources.

With the Intel Arc B580, I was not able to fine-tune a full 7B parameter model and had to use a smaller 1.5B parameter model. Training on the alpaca-clean dataset takes 3-5 hours. To generate the correlations, the full 7B model was used in order to get better questions and answers. That data was then used to fine-tune the 1.5B model.

As inference using deepseek-r1 can take 15 seconds or so per query (using the 7B model) and queries on neuralchat-7b are nearly instantaneous, I will explore creating correlations using deepseek and then using that data to train neuralchat.
# Approach taken in resume-bot

I tried several techniques, and have collected example output:

1. In-context via pre-embedding context tokens (ollama TRAINING)
2. In-context via full-context in-query
3. In-context via relevant text (traditional RAG)
4. Fine-tune
# Torch vs Ollama

Ollama is easy to set up, and it performs well. However, it does not expose a method for fine-tuning a model beyond the TRAINING template, which does not adjust model weights and is more akin to in-context training.

torch is a little more difficult to set up, and it too performs well. With the vast collection of libraries and infrastructure available, fine-tuning using torch is relatively straightforward.

Once you have a fine-tuned model, you can use that model with ollama or torch. I have run the resume-bot using both torch and ollama on an Intel Core i9-14900KS with 64G of RAM and an Intel Arc B580 GPU with 12G of RAM. Below are some metrics gathered while running several query passes:
|                       | ollama-ipex-llm  | pytorch w/ ipex-llm  |
|:----------------------|:-----------------|:---------------------|
| Query time            |                  |                      |
| Concurrent queries    |                  |                      |
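For the torch path, loading a fine-tuned checkpoint and running it on the Arc GPU looks roughly like the sketch below; the checkpoint path and prompt are placeholders, and the ipex-llm specific optimizations used elsewhere in this repo are omitted:

```python
# Rough sketch: load a fine-tuned checkpoint with transformers and run it on the
# Intel GPU ("xpu" device in recent PyTorch XPU builds). Not the exact inference
# code used by resume-bot.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "./finetuned-1.5b"  # placeholder path to the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16)

device = "xpu" if torch.xpu.is_available() else "cpu"
model = model.to(device)

inputs = tokenizer("What is James' experience with Linux?", return_tensors="pt").to(device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```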
### How Ollama Uses the TRAINING Section

The `TRAINING` section in an Ollama Modelfile works differently than traditional fine-tuning methods. Here's how Ollama uses it:

1. **Not True Parameter Fine-tuning**:
   - Unlike traditional fine-tuning that updates model weights through backpropagation, Ollama doesn't modify the underlying model parameters
   - The examples in `TRAINING` don't trigger a training loop or gradient updates

2. **Template-Based Learning**:
   - Ollama uses these examples as additional context when the model is created
   - The examples effectively become part of the model's "knowledge"
   - This is more like instruction-tuning through examples than actual parameter updates

3. **Implementation Details**:
   - The examples are processed during model creation
   - They're tokenized and stored alongside the model
   - When running inference, Ollama doesn't directly include these examples in every prompt
   - Instead, the model is influenced by having processed these examples during creation

4. **Technical Mechanism**:
   - The exact implementation varies by model architecture
   - For many models, Ollama prepends these examples during the model creation process
   - This shapes the model's understanding without modifying weights
   - It's similar to how system prompts work but applied at model creation time

5. **Limitations**:
   - The effectiveness depends on the base model's capability
   - It works best for teaching patterns and preferred response styles
   - It's less effective for teaching new facts or complex reasoning
   - The number of examples is limited by context window size

### Practical Considerations

- Use concise, high-quality examples that demonstrate the exact behavior you want
- Focus on patterns rather than specific facts
- Include diverse examples covering different aspects of desired behavior
- For best results, combine with well-crafted system prompts
- Remember that this isn't true fine-tuning - it's more like "example-based conditioning"

This approach works well for adapting model style and format, but for more substantial changes to model behavior, traditional fine-tuning frameworks that update weights (like those in Hugging Face's ecosystem) would be more effective.
doc/projects/airc.txt (new file, 105 lines)
@@ -0,0 +1,105 @@
# AIRC (pronounced Eric)

AI is Really Cool

This project provides a simple IRC chat client. It runs the neuralchat model, enhanced with a little bit of RAG to fetch news RSS feeds.

Internally, it is built using PyTorch 2.6 and the Intel IPEX/LLM.

NOTE: If running on an Intel Arc A series graphics processor, fp64 is not supported and may need to either be emulated or have the model quantized. It has been a while since I've had an A series GPU to test on, so if you run into problems please file an [issue](https://github.com/jketreno/airc/issues)--I have some routines I can put in, but don't have a way to test them.

# Installation

This project uses docker containers to build. As this was originally written to work on an Intel Arc B580 (Battlemage), it requires a kernel that supports that hardware, such as the one documented at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10).

NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)

## Want to run under WSL2? No can do...

https://www.intel.com/content/www/us/en/support/articles/000093216/graphics/processor-graphics.html

The A- and B-series discrete GPUs do not support SR-IOV, required for the GPU partitioning that Microsoft Windows uses in order to support GPU acceleration in WSL.

## Building

NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)

```bash
git clone https://github.com/jketreno/airc
cd airc
docker compose build
```

## Running

In order to download the models, you need to have a Hugging Face token. See https://huggingface.co/settings/tokens for information on obtaining a token.

Edit .env to add the following:

```.env
HF_ACCESS_TOKEN=<access token from huggingface>
```

NOTE: Models downloaded by most examples will be placed in the ./cache directory, which is bind mounted to the container.

### AIRC

To launch the airc shell interactively, with the pytorch 2.6 environment loaded, use the default entrypoint to launch a shell:

```bash
docker compose run --rm airc shell
```

Once in the shell, you can then launch model-server.py and then the airc.py client:

```bash
docker compose run --rm airc shell
src/airc.py --ai-server=http://localhost:5000 &
src/model-server.py
```

By default, src/airc.py will connect to irc.libera.chat on the airc-test channel. See `python src/airc.py --help` for options.

By separating the model-server into its own process, you can develop and tweak the chat backend without losing the IRC connection established by airc.
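A hypothetical sketch of that model-server/client split is shown below; the actual routes, payloads, and model wiring live in src/model-server.py and src/airc.py (not shown in this commit) and may differ:

```python
# Hypothetical sketch only: a minimal Flask backend of the kind airc.py could talk
# to via --ai-server=http://localhost:5000. The /chat route and payload shape are
# illustrative; see src/model-server.py for the real implementation.
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json(silent=True) or {}
    query = data.get("query", "")
    # ... run the model here and build a real reply ...
    return jsonify({"response": f"echo: {query}"})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
```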
### Jupyter

```bash
docker compose up jupyter -d
```

The default port for inbound connections is 8888 (see docker-compose.yml). $(pwd)/jupyter is bind mounted to /opt/jupyter in the container, which is where notebooks will be saved by default.

To access the jupyter notebook, go to `https://localhost:8888/jupyter`.

### Monitoring

You can run `ze-monitor` within the launched containers to monitor GPU usage.

```bash
containers=($(docker ps --filter "ancestor=airc" --format "{{.ID}}"))
if [[ ${#containers[*]} -eq 0 ]]; then
    echo "Running airc container not found."
else
    for container in ${containers[@]}; do
        echo "Container ${container} devices:"
        docker exec -it ${container} ze-monitor
    done
fi
```

If an airc container is running, you should see something like:

```
Container 5317c503e771 devices:
Device 1: 8086:A780 (Intel(R) UHD Graphics 770)
Device 2: 8086:E20B (Intel(R) Graphics [0xe20b])
```

You can then launch ze-monitor in that container specifying the device you wish to monitor:

```
containers=($(docker ps --filter "ancestor=airc" --format "{{.ID}}"))
docker exec -it ${containers[0]} ze-monitor --device 2
```
doc/projects/ze-monitor.txt (new file, 279 lines)
@@ -0,0 +1,279 @@
# ze-monitor

A small utility to monitor Level Zero devices via
[Level Zero Sysman](https://oneapi-src.github.io/level-zero-spec/level-zero/latest/sysman/PROG.html#sysman-programming-guide)
from the command line, similar to 'top'.

# Installation

Requires Ubuntu Oracular 24.10.

## Easiest

### Install prerequisites

This will add the [Intel Graphics Preview PPA](https://github.com/canonical/intel-graphics-preview) and install the required dependencies:

```bash
sudo apt-get install -y \
    software-properties-common \
  && sudo add-apt-repository -y ppa:kobuk-team/intel-graphics \
  && sudo apt-get update \
  && sudo apt-get install -y \
    libze1 libze-intel-gpu1 libncurses6
```

### Install ze-monitor from .deb package

This will download the ze-monitor release from GitHub, install it, and add the current
user to the 'ze-monitor' group to allow running the utility:

```bash
version=0.3.0-1
wget https://github.com/jketreno/ze-monitor/releases/download/v${version}/ze-monitor-${version}_amd64.deb
sudo dpkg -i ze-monitor-${version}_amd64.deb
sudo usermod -a -G ze-monitor $(whoami)
newgrp ze-monitor
```

Congratulations! You can run ze-monitor:

```bash
ze-monitor
```

You should see something like:

```bash
Device 1: 8086:A780 (Intel(R) UHD Graphics 770)
Device 2: 8086:E20B (Intel(R) Graphics [0xe20b])
```

To monitor a device:

```bash
ze-monitor --device 2
```

Check the docs (`man ze-monitor`) for additional details on running the ze-monitor utility.

## Slightly more involved

This project uses docker containers to build. As this was originally written to monitor an Intel Arc B580 (Battlemage), it requires a kernel that supports that hardware, such as the one documented at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10). It will monitor any Level Zero device, even those using the i915 driver.

NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)

```
git clone https://github.com/jketreno/ze-monitor.git
cd ze-monitor
docker compose build
sudo apt install libze1 libncurses6
version=$(cat src/version.txt)
docker compose run --remove-orphans --rm \
    ze-monitor \
    cp /opt/ze-monitor-static/build/ze-monitor-${version}_amd64.deb \
        /opt/ze-monitor/build
sudo dpkg -i build/ze-monitor-${version}_amd64.deb
```

# Security

In order for ze-monitor to read the performance metric units (PMU) in the Linux kernel, it needs elevated permissions. The easiest way is to install the .deb package and add the user to the ze-monitor group. Or, run under sudo (e.g., `sudo ze-monitor ...`).

The specific capabilities required to monitor the GPU are documented in [Perf Security](https://www.kernel.org/doc/html/v5.1/admin-guide/perf-security.html) and [man capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html). These include:

| Capability           | Reason                                                |
|:---------------------|:------------------------------------------------------|
| CAP_DAC_READ_SEARCH  | Bypass all filesystem read access checks              |
| CAP_PERFMON          | Access to perf_events (vs. overloaded CAP_SYS_ADMIN)  |
| CAP_SYS_PTRACE       | PTRACE_MODE_READ_REALCREDS ptrace access mode check   |

To configure ze-monitor to run with those privileges, you can use `setcap` to set the correct capabilities on ze-monitor. You can further secure your system by creating a user group specifically for running the utility and restricting running of that command to users in that group. That is what the .deb package does.

If you install the .deb package from a [Release](https://github.com/jketreno/ze-monitor/releases) or by building it, that package will set the appropriate permissions for ze-monitor on installation and set it executable only to those in the 'ze-monitor' group.

## Anyone can run ze-monitor

If you build from source and want to set the capabilities:

```bash
sudo setcap "cap_perfmon,cap_dac_read_search,cap_sys_ptrace=ep" build/ze-monitor
getcap build/ze-monitor
```

Any user can then run `build/ze-monitor` and monitor the GPU.

# Build outside container

## Prerequisites

If you would like to build outside of docker, you need the following packages installed:

```
sudo apt-get install -y \
    build-essential \
    libfmt-dev \
    libncurses-dev
```

In addition, you need the Intel drivers installed, which are available from the `kobuk-team/intel-graphics` PPA:

```
sudo apt-get install -y \
    software-properties-common \
  && sudo add-apt-repository -y ppa:kobuk-team/intel-graphics \
  && sudo apt-get update \
  && sudo apt-get install -y \
    libze-intel-gpu1 \
    libze1 \
    libze-dev
```

## Building

```
cd build
cmake ..
make
```

## Running

```
build/ze-monitor
```

## Build and install .deb

In order to build the .deb package, you need the following packages installed:

```bash
sudo apt-get install -y \
    debhelper \
    devscripts \
    rpm \
    rpm2cpio
```

You can then build the .deb:

```bash
if [ -d build ]; then
    cd build
fi
version=$(cat ../src/version.txt)
cpack
sudo dpkg -i build/packages/ze-monitor_${version}_amd64.deb
```

You can then run ze-monitor from your path:

```bash
ze-monitor
```

# Developing

To run the built binary without building a full .deb package, you can build and run on the host by compiling in the container:

```
docker compose run --rm ze-monitor build.sh
build/ze-monitor
```

The build.sh script will build the binary in /opt/ze-monitor/build, which is volume mounted to the host's build directory.

NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.

# Running

NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.

If running within a docker container, the container environment does not have access to the host's `/proc/fd`, which is necessary to obtain information about processes outside the current container which are using the GPU. As such, only processes running within the container running ze-monitor will be listed as using the GPU.

## List available devices

```
ze-monitor
```

Example output:

```bash
$ ze-monitor
Device 1: 8086:E20B (Intel(R) Graphics [0xe20b])
Device 2: 8086:A780 (Intel(R) UHD Graphics 770)
```

## Show details for a given device

```
sudo ze-monitor --info --device ( PCIID | # | BDF | UUID | /dev/dri/render* )
```

Example output:

```bash
$ sudo ze-monitor --device 2 --info
Device: 8086:A780 (Intel(R) UHD Graphics 770)
 UUID: 868080A7-0400-0000-0002-000000000000
 BDF: 0000:0000:0002:0000
 PCI ID: 8086:A780
 Subdevices: 0
 Serial Number: unknown
 Board Number: unknown
 Brand Name: unknown
 Model Name: Intel(R) UHD Graphics 770
 Vendor Name: Intel(R) Corporation
 Driver Version: 0CB7EFCAD5695B7EC5C8CE6
 Type: GPU
 Is integrated with host: Yes
 Is a sub-device: No
 Supports error correcting memory: No
 Supports on-demand page-faulting: No
 Engines: 7
  Engine 1: ZES_ENGINE_GROUP_RENDER_SINGLE
  Engine 2: ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE
  Engine 3: ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE
  Engine 4: ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE
  Engine 5: ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE
  Engine 6: ZES_ENGINE_GROUP_COPY_SINGLE
  Engine 7: ZES_ENGINE_GROUP_MEDIA_ENHANCEMENT_SINGLE
 Temperature Sensors: 0
```

NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.

## Monitor a given device

```
sudo ze-monitor --device ( PCIID | # | BDF | UUID | /dev/dri/render* ) \
    --interval ms
```

NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.

Output:

```bash
$ sudo ze-monitor --device 2 --interval 500
Device: 8086:E20B (Intel(R) Graphics [0xe20b])
Total Memory: 12809404416
Free memory: [# 55% ############################ ]
Power usage: 165.0W
------------------------------------------------------------------------------------------
   PID COMMAND-LINE
       USED MEMORY     SHARED MEMORY   ENGINE FLAGS
------------------------------------------------------------------------------------------
     1 /sbin/init splash
       MEM: 106102784     SHR: 100663296     FLAGS: RENDER COMPUTE
  1606 /usr/lib/systemd/systemd-logind
       MEM: 106102784     SHR: 100663296     FLAGS: RENDER COMPUTE
  5164 /usr/bin/gnome-shell
       MEM: 530513920     SHR: 503316480     FLAGS: RENDER COMPUTE
  5237 /usr/bin/Xwayland :1024 -rootless -nores...isplayfd 6 -initfd 7 -byteswappedclients
       MEM: 0             SHR: 0             FLAGS:
 40480 python chat.py
       MEM: 5544226816    SHR: 0             FLAGS: DMA COMPUTE
```

If you pass `--one-shot`, statistics will be gathered, displayed, and then ze-monitor will exit.
doc/resume/generic.txt (new file, 56 lines)
@@ -0,0 +1,56 @@
# JAMES KETRENOS
software architect, designer, developer, and team lead
Beaverton, OR 97003

james@ketrenos.com
(503) 501 8281

Seeking an opportunity to contribute to the advancement of energy efficient AI solutions, James is a driven problem solver, solution creator, technical leader, and skilled software developer focused on rapid, high-quality results, with an eye toward bringing solutions to the market.

## SUMMARY

Problem-solving: Trusted resource for executive leadership, able to identify opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers.

Proficient: Adept in compiled and interpreted languages, the software frameworks built around them, and front- and backend infrastructure. Leveraging deep and varied experience to quickly find solutions. Rapidly familiarizes and puts to use new and emerging technologies.

Experienced: 20+ years of experience as an end-to-end Linux software architect, team lead, developer, system administrator, and user. Working with teams to bring technologies together into existing ecosystems across a myriad of domains.

Leader: Frequent project lead spanning all areas of development and phases of the product life cycle from pre-silicon to post launch support. Capable change agent and mentor, providing technical engineering guidance to multiple teams and organizations.

Communicates: Thrives on helping people solve problems, working to educate others to help them better understand problems and work toward solutions.

## RECENT HISTORY

2024-2025: Present

* Developed 'ze-monitor', a lightweight C++ Linux application leveraging Level Zero Sysman APIs to provide 'top' like device monitoring of Intel GPUs. https://github.com/jketreno/ze-monitor
* Developed 'airc', an LLM pipeline allowing interactive queries about James' resume. Utilizing both in-context and fine-tuned approaches, questions asked about James will use information from his resume and portfolio for answers. Includes a full-stack React web UI, a command line client, and an IRC bot integration. https://github.com/jketreno/airc

2018-2024: Intel® Graphics Software Staff Architect and Lead

* Redefined how Intel approaches graphics enabling on Linux to meet customer and product timelines.
* Spearheaded internal projects to prove out the developer and customer deployment experience when using Intel graphics products with PyTorch, working to ensure all ingredients are available and consumable for success (from kernel driver integration, runtime, framework integration, up to containerized Python workload solution deployment.)
* Focused on improving the customer experience for Intel graphics software for Linux in the data center, high-performance compute clusters, and end users. Worked with several teams and business units to close gaps, improve our software, documentation, and release methodologies.
* Worked with hardware and firmware teams to scope and define architectural solutions for customer features.

1998-2018: Open Source Software Architect and Lead

* Defined software architecture for handheld devices, tablets, Internet of Things, smart appliances, and emerging technologies. Key resource to executive staff to investigate emerging technologies and drive solutions to close existing gaps.
* James' career at Intel has been diverse. His strongest skills are related to quickly ramping on technologies being utilized in the market, identifying gaps in existing solutions, and working with teams to close those gaps. He excels at adopting and fitting new technology trends as they materialize in the industry.

## PROLONGED HISTORY

The following are technical areas in which James has been an architect, team lead, and/or individual contributor:

* Linux release infrastructure overhaul: Identified bottlenecks in the CI/CD build pipeline, built proof-of-concept, and moved to production for generating releases of Intel graphics software (https://dgpu-docs.intel.com) as well as internal dashboards and infrastructure for tracking build and release pipelines. JavaScript, HTML, Markdown, RTD, bash/python, Linux packaging, Linux repositories, Linux OS release life cycles, sqlite3. Worked with multiple teams across Intel to meet Intel's requirements for public websites as well as to integrate with existing build and validation methodologies while educating teams on tools and infrastructure available from the ecosystem (vs. roll-your-own).
* Board Explorer: Web app targeting the developer ecosystem to utilize new single board computers, providing quick access to board details, circuits, and programming information. Delivered as a pure front-end service (no backend required) https://board-explorer.github.io/board-explorer/#quark_mcu_dev_kit_d2000. Tight coordination with UX design team. JavaScript, HTML, CSS, XML, hardware specs, programming specs.
* (internal) Travel Requisition: Internal HTML application and backend enabling internal organizations to request travel approval and a manager front end to track budgetary expenditures in order to determine approval/deny decisions. NodeJS, JavaScript, Polymer, SQL. Tight coordination with internal requirements providers and UX design teams.
* Developer Journey: Web infrastructure allowing engineers to document DIY processes. Front end for parsing, viewing, and following projects. Back end for managing content submitted (extended markdown) including images, videos, and screencasts. Tight coordination with UX design team.
* Robotics: Worked with teams aligning on a ROS (Robot OS) roadmap. Presented at Embedded Linux Conference on the state of open source and robotics. LIDAR, Intel RealSense, opencv, python, C. Developed a robotic vision controlled Stewart platform that could play the marble game Labyrinth.
* Moblin and MeeGo architect: Focused on overall software architecture as well as moving forward multi-touch and the industry shift to resolution independent applications; all in a time before smart phones as we know them today. Qt, HTML5, EFL.
* Marblin: An HTML/WebGL graphical application simulating the 2D collision physics of marbles in a 3D rendered canvas.
* Linux Kernel: Developed and maintained the initial Intel PRO/Wireless 2100, 2200, and 3945 drivers in the Linux kernel. C, software defined radios, IEEE 802.11, upstream kernel driver, team lead for the team that took over the Intel wireless drivers, internal coordination regarding technical and legal issues surrounding the wireless stack.
* Open source at Intel: Built proof-of-concepts to illustrate to management the potential and opportunities for Intel by embracing open source and Linux.
* Intel Intercast Technology: Team lead for Intel Intercast software for Windows. Worked with 3rd party companies to integrate the technology into their solutions.
results/doc/projects/airc.txt.json (new file, 57 lines)
File diff suppressed because one or more lines are too long

results/doc/projects/ze-monitor.txt.json (new file, 57 lines)
File diff suppressed because one or more lines are too long

results/doc/resume/generic.txt.json (new file, 57 lines)
@@ -0,0 +1,57 @@
{
"file_path": "../doc/resume/generic.txt",
"processed_at": "2025-03-12T19:22:00.311002",
"context": "# JAMES KETRENOS\nsoftware architect, designer, developer, and team lead\nBeaverton, OR 97003\n\njames@ketrenos.com\n(503) 501 8281\n\nSeeking an opportunity to contribute to the advancement of energy efficient AI solutions, James is a driven problem solver, solution creator, technical leader, and skilled software developer focused on rapid, high-quality results, with an eye toward bringing solutions to the market.\n\n## SUMMARY\n\nProblem-solving: Trusted resource for executive leadership, able to identify opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers.\n\nProficient: Adept in compiled and interpreted languages, the software frameworks built around them, and front- and backend infrastructure. Leveraging deep and varied experience to quickly find solutions. Rapidly familiarizes and puts to use new and emerging technologies.\n\nExperienced: 20+ years of experience as an end-to-end Linux software architect, team lea...",
"qa_pairs": [
{
"question": "What is James' primary role in the software architecture field?",
"answer": "James' primary role in the software architecture field is focused on developing and maintaining the Linux kernel, ensuring compatibility with Intel wireless drivers. He also contributes to hardware design, particularly in the development of the Intel Intercast technology.",
"validation": "**Assessment:**\n\n1. **ACCURACY:** \n The answer accurately mentions James' primary role in the software architecture field, focusing on kernel development for Intel wireless and contributing to hardware design, including Intel Intercast. However, it omits details about other contributions, such as kernel development for other hardware projects.\n\n2. **COMPLETENESS:** \n The answer is incomplete as it does not provide information on James' work in hardware design beyond Intel Intercast, such as kernel development for other hardware.\n\n3. **RELEVANCE:** \n The answer is relevant, covering James' kernel development for Intel wireless and his contribution to hardware design.\n\n4. **NATIVITY:** \n The answer is natural and conversational.\n\n5. **DIVERSITY:** \n The answer is somewhat diverse, covering different areas of software architecture.\n\n6. **IMPROVEMENTS:** \n The answer could include more details on James' contributions to hardware design, such as his kernel development for Intel wireless or other hardware projects.\n\n**Decision:** \nREJECT"
},
{
"question": "Can you explain the development of the 'ze-monitor' application?",
"answer": "The 'ze-monitor' application was developed as a lightweight C++ Linux application that leverages Level Zero Sysman APIs to provide 'top' like device monitoring for Intel GPUs. This project was part of James' experience working with Linux and Level Zero Sysman APIs, focusing on leveraging new technologies and ensuring seamless integration with existing systems. The development process involved identifying opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers. This led to the creation of a lightweight application that integrates Level Zero Sysman APIs, enabling device monitoring for Intel GPUs. The detailed development process involved collaboration with the team, ensuring that the application met the technical requirements and worked seamlessly with existing systems.",
"validation": "**Answer:**\n\nThe 'ze-monitor' application was developed as a lightweight C++ Linux application that leverages Level Zero Sysman APIs to provide device monitoring for Intel GPUs. This project was part of James' experience working with Linux and Level Zero Sysman APIs, focusing on leveraging new technologies and ensuring seamless integration with existing systems. The development process involved identifying opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers. The application was created to integrate Level Zero Sysman APIs, enabling device monitoring for Intel GPUs, while ensuring seamless integration with existing systems.\n\n**Assessment:**\n\n- **VERDICT:** [ACCEPT]\n\n- **REASONING:** The answer is accurate, comprehensive, relevant, and natural. It explains the technical aspects of the application and its development, which is useful for understanding the context provided. It also mentions how the answer was generated and its contribution to the overall knowledge base, making it somewhat complete.\n\n- **IMPROVEMENTS:** To add diversity, the answer could mention other areas where similar applications were developed, or highlight the broader impact of these technologies. For example, mentioning other device monitoring applications developed by James or other developers could add more diversity. Also, including how these technologies contribute to broader areas like cybersecurity or efficiency could enhance the answer's breadth and relevance."
},
{
"question": "How did the 'airc' project utilize James' resume data?",
"answer": "James' resume data was utilized in the development of the 'airc' project, which is a command line client, interactive query system, and IRC bot integration. This project leverages his extensive experience in software architecture and development, particularly in areas such as Linux release infrastructure and system architecture.",
"validation": "- VERDICT: [ACCEPT]\n- REASONING: The answer is accurate, complete, relevant, and natural. It provides specific details and adds some diversity by focusing on a single project and experience area. No further improvements are needed.\n- IMPROVEMENTS: None needed, as the answer meets all criteria effectively.\n\nAnswer:\nJames' resume data was utilized in the development of the 'airc' project, which includes a command line client, interactive query system, and IRC bot integration. This project leverages his extensive experience in software architecture and development, particularly in areas such as Linux release infrastructure and system architecture."
},
{
"question": "What is James' experience with Linux kernel driver integration?",
|
||||||
|
"answer": "James' experience with Linux kernel driver integration includes working on the Intel Wireless drivers, where he contributed to the development of kernel driver integration, utilizing tools like Linux packaging and managing CI/CD pipelines for release infrastructure.",
|
||||||
|
"validation": "**Answer:**\n\nJames' experience with Linux kernel driver integration includes his contributions to the development of kernel driver integration for Intel's Wireless drivers and his role in managing CI/CD pipelines for projects like Open Source at Intel.\n\n**Evaluation:**\n\n- **VERDICT:** [ACCEPT]\n- **REASONING:** The answer accurately describes James' experience with kernel driver integration, providing specific examples and context. It is relevant, natural, and adds diversity to his expertise.\n- **IMPROVEMENTS:** If possible, enhancing the answer with more details about the tools or processes used could further enhance naturalness. However, as provided, it is sufficiently detailed.\n- **DIVERSITY:** The answer naturally incorporates James' work in kernel driver integration, which is a natural addition to his role and provides a new angle for understanding his expertise.\n\nThis answer is appropriate and suitable for training purposes."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "How did James contribute to the development of Intel RealSense hardware?",
|
||||||
|
"answer": "James contributed to the development of Intel RealSense hardware through the development of a command line client and an IRC bot integration for the 'airc' project.",
|
||||||
|
"validation": "**Answer:**\n\nJames contributed to the development of Intel RealSense hardware through the creation of a command line client and an IRC bot integration for the 'airc' project. This involved integrating AI capabilities into the 'ze-monitor' project, allowing for device monitoring of Intel GPUs, which was part of the broader effort to advance energy-efficient AI solutions. However, the answer does not directly address how James contributed to Intel RealSense hardware, focusing instead on his other projects. This could be an area for improvement to better align with the context provided.\n\n**Assessment (VERDICT):** \nThe answer is accurate and provides specific information about James' contributions but lacks essential details about his role in Intel RealSense hardware development. It is a bit verbose but could be more concise. The response is appropriate but could be enhanced to better meet the quality standards by providing a clearer link to Intel RealSense.\n\n**Reasoning:** \nThe answer accurately describes James' contributions but does not address the specific impact on Intel RealSense hardware. It is a sufficient answer for the given question, but it could be improved for more clarity and conciseness.\n\n**IMPROVEMENTS:** \n1. Clarify how James contributed to Intel RealSense hardware, linking his other projects to RealSense development.\n2. Provide a more direct explanation of his role in the 'airc' project and its relation to Intel RealSense.\n3. Offer a brief overview of Intel RealSense hardware to better place James' contributions within the context."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "What is the current state of James' open-source contributions?",
|
||||||
|
"answer": "James Ketrenos is actively engaged in several open-source projects, contributing to the development and enhancement of various tools and technologies. His contributions include:\n\n1. **Ze-Monitor**: A lightweight application that leverages Level Zero Sysman APIs for device monitoring of Intel GPUs. This tool is part of his contributions to the Linux community.\n\n2. **Airc**: A LLM pipeline that facilitates interactive queries about James' resume, showcasing his proficiency in using LLMs to assist with resume optimization and internal communication.\n\n3. **Rebound**: A React web UI that integrates various components such as a command line client, an IRC bot, and an IRC bot integration, demonstrating his ability to create user-friendly interfaces for complex systems.\n\n4. **Rebound-WebUI**: A full-stack React web UI that includes a command line client, an IRC bot, and an IRC bot integration, highlighting his expertise in developing integrated web applications.\n\n5. **Rebound-React-UI**: A React web UI that includes a command line client and an IRC bot integration, showcasing his skill in creating web interfaces for technical solutions.\n\n6. **Rebound-React-UI-IRC**: An integrated React web UI with a command line client, an IRC bot, and an IRC bot integration, further illustrating his ability to create effective and user-friendly interfaces.\n\nThese contributions reflect James' active participation in the open-source community and his dedication to advancing technology through innovative solutions.",
|
||||||
|
"validation": "** Assessment: [ACCEPT]**\n\n**Reasoning:**\n1. **ACCURACY:** The answer accurately describes James Ketrenos's open-source contributions, mentioning specific tools and projects.\n2. **COMPLETENESS:** The answer fully answers the question by detailing James's involvement in multiple projects.\n3. **RELEVANCE:** The question is meaningful and relevant to the context, focusing on James's role in the Linux community.\n4. **NATURALNESS:** The language is natural and conversational, avoiding forced or unnatural phrasing.\n5. **DIVERSITY:** The pair adds naturalness to the dataset, enhancing its overall quality.\n\n**Improvements:**\n- To further enhance naturalness, examples could include using \"partner\" instead of \"contribution,\" making the answer more relatable.\n- Additional context about James's contributions could be added, such as his role in specific projects or his impact on the community."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "How has James' expertise in kernel driver development impacted the CI/CD pipeline?",
|
||||||
|
"answer": "James' expertise in kernel driver development has significantly impacted the CI/CD pipeline. Specifically, his work on Intel graphics software has enabled the efficient deployment of Intel graphics products within the CI/CD pipeline. By optimizing kernel drivers, James ensured that the Linux kernel could handle the CI/CD processes, such as deployments and releases, thereby enhancing the software's performance and reliability. This integration has allowed for better utilization of Intel graphics, improving the overall efficiency and customer satisfaction of the products.",
|
||||||
|
"validation": "**Answer:**\n\nThe provided answer is well-received, with a naturalness score of 0.8 due to minor suggestion of making it more conversational. It is accurate, complete, and relevant, with a naturalness score of 0.8 due to minor suggestion of making it more conversational.\n\n**Evaluation:**\n\n- **ACCURACY:** Yes. The answer correctly links kernel driver expertise to CI/CD pipeline efficiency.\n- **COMPLETENESS:** Yes. The answer provides a clear and detailed link between kernel driver development and CI/CD processes.\n- **RELEVANCE:** Yes. The answer connects kernel driver expertise to CI/CD processes, which are relevant to software development and efficiency.\n- **NATURALNESS:** Good. The answer is written in a professional context, but could be more conversational.\n- **DIVERSITY:** Good. The answer focuses on specific technologies and their impact, but doesn't add much diversity.\n\n**Overall Assessment:** The task is suitable for training, with minor areas for improvement in naturalness. The answer is accurate, complete, and relevant, making it effective for quality assurance.\n\n**Answer:** VERDICT: [ACCEPT] \nREASONING: The answer accurately connects James' kernel driver expertise to CI/CD impact, demonstrating relevance and completeness. \nIMPROVEMENTS: The answer could be made more conversational by phrases like, \" James' work on Intel graphics software has enabled...\" \n- VERDICT: ACCEPT \n- REASONING: The answer is accurate, complete, and relevant. It effectively addresses the impact of kernel driver development on CI/CD. \n- IMPROVEMENTS: Suggesting more conversational phrasing to enhance naturalness."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "What is the impact of James' work on the development of the Intel GPU driver?",
|
||||||
|
"answer": "James Keta renos's work has significantly impacted the development of the Intel GPU driver. Specifically, he was involved in the design and implementation of the Intel GPU driver, which led to the release of the Intel GPU driver version 3.3.1. Additionally, his contributions to the Linux kernel and development of tools like 'ze-monitor' and 'airc' have enhanced the efficiency of kernel-based graphics drivers. These advancements have improved the performance and reliability of the Intel GPU, contributing to the broader development of Linux kernel-based graphics drivers and related technologies.",
|
||||||
|
"validation": "**Answer:**\n\n**VERDICT:** ACCEPVED\n\n**REASONING:** The question is clear and the answer is thorough, addressing the impact of James Keta renos's work on the Intel GPU driver. However, the answer is somewhat technical and lacks a broader context and naturalness.\n\n**IMPROVEMENTS:** The answer could be made more conversational and broader to explain how James' work has directly impacted the development of the Intel GPU driver. It should also provide more context on how his contributions have influenced other areas of Linux or kernel-based software.\n\n**NOTICE:** The answer is comprehensive but lacks a more natural and broader context. It could be improved by including more conversational and contextual elements to better explain the impact and its broader significance."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "How has James' experience with robotic vision applications influenced hardware design?",
|
||||||
|
"answer": "James' experience with robotic vision applications has significantly influenced hardware design by enabling the development of efficient and scalable computing solutions. By integrating robotic vision into hardware, he contributed to the design of hardware that can handle complex tasks requiring visual recognition, leading to improved performance in areas such as image processing and robotics. This has translated into hardware designs that enhance computational capabilities and efficiency, directly impacting the development of systems requiring high-performance computing.",
|
||||||
|
"validation": "- VERDICT: REJECT\n- REASONING: The answer is technically accurate and relevant, but it lacks specific examples and context, making it less natural and less detailed than necessary.\n- IMPROVEMENTS: The answer should include concrete hardware design examples influenced by robotic vision, tie back to James' experience as a software architect, and provide more natural and detailed examples to make it more understandable."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question": "What is James' role in the development of the Intel Intercast software technology?",
|
||||||
|
"answer": "James' role in the development of the Intel Intercast software technology is as a team leader and technical leader, responsible for driving the integration of the Intel Intercast technology into various solutions and ensuring its successful adoption.",
|
||||||
|
"validation": "**Answer:**\n\nJames' role in the development of the Intel Intercast software technology is as a team leader and technical leader, responsible for driving the integration of the Intel Intercast technology into various solutions and ensuring its successful adoption.\n\n**Evaluation:**\n\n1. **VERDICT: [ACCEPT]**\n - **REASONING:** The answer accurately describes James' role and contributes to the context about his work.\n\n2. **COMPLETENESS:** [REMAIN] The answer fully addresses the question by explaining James' role, so it's complete.\n\n3. **RELEVANCE:** [REMAIN] The answer clearly ties James' role to the Intel Intercast project, making it relevant.\n\n4. ** NATURALNESS:** [REMAIN] The answer uses professional language and is concise, which sounds natural.\n\n5. **DIVERSITY:** [REMAIN] The answer adds variety by mentioning multiple areas of James' work, which is important for training.\n\nOverall, this pair meets all the quality standards. It's accurate, complete, relevant, natural, and adds diversity, which is important for training."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
BIN
src/__pycache__/cli.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/__pycache__/core.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/__pycache__/rag_core.cpython-312.pyc
Normal file
Binary file not shown.
293
src/alpaca_lora_finetuning.py
Normal file
@ -0,0 +1,293 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2016 The BigDL Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Some parts of this file is adapted from
|
||||||
|
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
|
||||||
|
#
|
||||||
|
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import transformers
|
||||||
|
from datasets import load_dataset
|
||||||
|
import accelerate
|
||||||
|
from typing import Union
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from peft import (
|
||||||
|
get_peft_model_state_dict,
|
||||||
|
set_peft_model_state_dict,
|
||||||
|
)
|
||||||
|
|
||||||
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
|
# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
|
||||||
|
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
|
||||||
|
LoraConfig
|
||||||
|
import json
|
||||||
|
|
||||||
|
local_rank = 0
|
||||||
|
world_size = 1
|
||||||
|
|
||||||
|
class Prompter(object):
|
||||||
|
__slots__ = ("data",)
|
||||||
|
|
||||||
|
def __init__(self, data_path: str = ""):
|
||||||
|
self.data = None
|
||||||
|
if not os.path.exists(data_path):
|
||||||
|
raise Exception(f"Can't read {data_path}")
|
||||||
|
with open(data_path) as fp:
|
||||||
|
self.data = json.load(fp)
|
||||||
|
|
||||||
|
def generate_prompt(
|
||||||
|
self,
|
||||||
|
instruction: str,
|
||||||
|
input: Union[None, str]=None,
|
||||||
|
label: Union[None, str]=None,
|
||||||
|
) -> str:
|
||||||
|
# returns the full prompt from instruction and optional input
|
||||||
|
# if a label (=response, =output) is provided, it's also appended.
|
||||||
|
if input:
|
||||||
|
ret = f"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
|
||||||
|
else:
|
||||||
|
ret = f"### Instruction:\n{instruction}\n\n### Response:\n"
|
||||||
|
if label:
|
||||||
|
ret = f"{ret}{label}"
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def get_response(self, output: str) -> str:
|
||||||
|
return output.split("### Response:")[1].strip()
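# Rough usage sketch (the data file is loaded but only the prompt helpers are used downstream):
#   prompter = Prompter("qa.json")
#   prompter.generate_prompt("What is X?", label="X is ...")
#   -> "### Instruction:\nWhat is X?\n\n### Response:\nX is ..."
# get_response() recovers whatever follows "### Response:" at inference time.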
|
||||||
|
|
||||||
|
"""Data processing to get train data and val data"""
|
||||||
|
def tokenize_qa(prompt, tokenizer, max_length=1, add_eos_token=True):
|
||||||
|
# there's probably a way to do this with the tokenizer settings
|
||||||
|
# but again, gotta move fast
|
||||||
|
result = tokenizer(
|
||||||
|
prompt,
|
||||||
|
truncation=True,
|
||||||
|
max_length=max_length,
|
||||||
|
padding=False,
|
||||||
|
return_tensors=None,
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
result["input_ids"][-1] != tokenizer.eos_token_id
|
||||||
|
and len(result["input_ids"]) < max_length
|
||||||
|
and add_eos_token
|
||||||
|
):
|
||||||
|
result["input_ids"].append(tokenizer.eos_token_id)
|
||||||
|
result["attention_mask"].append(1)
|
||||||
|
result["labels"] = result["input_ids"].copy()
|
||||||
|
return result
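# Note: labels start out identical to input_ids here; the prompt portion is only
# masked out later in generate_and_tokenize_prompt when train_on_inputs is False.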
|
||||||
|
|
||||||
|
def generate_and_tokenize_prompt(entry, prompter, tokenizer, max_length, add_eos_token, train_on_inputs):
|
||||||
|
full_prompt = prompter.generate_prompt(
|
||||||
|
entry["question"],
|
||||||
|
None,
|
||||||
|
entry["answer"],
|
||||||
|
)
|
||||||
|
print({'full_prompt': full_prompt})
|
||||||
|
tokenized_full_prompt = tokenize_qa(full_prompt, tokenizer, max_length=max_length)
|
||||||
|
if not train_on_inputs:
|
||||||
|
user_prompt = prompter.generate_prompt(
|
||||||
|
entry["question"], None
|
||||||
|
)
|
||||||
|
tokenized_user_prompt = tokenize_qa(
|
||||||
|
user_prompt, tokenizer, add_eos_token=add_eos_token, max_length=max_length
|
||||||
|
)
|
||||||
|
user_prompt_len = len(tokenized_user_prompt["input_ids"])
|
||||||
|
if add_eos_token:
|
||||||
|
user_prompt_len -= 1
|
||||||
|
# Mask out the input from the labels
|
||||||
|
tokenized_full_prompt["labels"] = [
|
||||||
|
-100
|
||||||
|
] * user_prompt_len + tokenized_full_prompt["labels"][
|
||||||
|
user_prompt_len:
|
||||||
|
] # could be sped up, probably
|
||||||
|
return tokenized_full_prompt
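# Illustrative example with train_on_inputs=False (token ids are made up): if the
# user prompt tokenizes to 5 tokens and the full prompt to 8, the labels become
#   [-100, -100, -100, -100, -100, t6, t7, t8]
# so the loss is computed only on the response tokens.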
|
||||||
|
|
||||||
|
|
||||||
|
def train(
|
||||||
|
# model/data params
|
||||||
|
base_model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
|
data_path: str = "./results/doc/resume/generic.txt.qa.json",
|
||||||
|
output_dir: str = "./bigdl-qlora-alpaca",
|
||||||
|
# training hyperparams
|
||||||
|
bf16: bool = True, # default to bf16
|
||||||
|
batch_size: int = 128,
|
||||||
|
micro_batch_size: int = 2, # default to be 2, limited by GPU memory
|
||||||
|
num_epochs: int = 3,
|
||||||
|
learning_rate: float = 3e-5, # default to be 3e-5 to avoid divergence
|
||||||
|
cutoff_len: int = 256,
|
||||||
|
val_set_size: int = 2000,
|
||||||
|
# lora hyperparams
|
||||||
|
lora_r: int = 8,
|
||||||
|
lora_alpha: int = 16,
|
||||||
|
lora_dropout: float = 0.05,
|
||||||
|
lora_target_modules: List[str] = [
|
||||||
|
"q_proj",
|
||||||
|
"v_proj",
|
||||||
|
"k_proj",
|
||||||
|
"o_proj",
|
||||||
|
"up_proj",
|
||||||
|
"down_proj",
|
||||||
|
"gate_proj"
|
||||||
|
],
|
||||||
|
# llm hyperparams
|
||||||
|
train_on_inputs: bool = True, # if False, masks out inputs in loss
|
||||||
|
add_eos_token: bool = False,
|
||||||
|
group_by_length: bool = False, # faster, but produces an odd training loss curve
|
||||||
|
resume_from_checkpoint: str = "", # either training checkpoint or final adapter
|
||||||
|
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
|
||||||
|
gradient_checkpointing: bool = False,
|
||||||
|
training_mode: str = "lora",
|
||||||
|
save_checkpoint: bool = True,
|
||||||
|
):
|
||||||
|
if training_mode != "lora":
|
||||||
|
raise Exception(f"This example is for lora training mode, but got training_mode={training_mode}.")
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"Training Alpaca-LoRA model with params:\n"
|
||||||
|
f"base_model: {base_model}\n"
|
||||||
|
f"data_path: {data_path}\n"
|
||||||
|
f"output_dir: {output_dir}\n"
|
||||||
|
f"batch_size: {batch_size}\n"
|
||||||
|
f"micro_batch_size: {micro_batch_size}\n"
|
||||||
|
f"num_epochs: {num_epochs}\n"
|
||||||
|
f"learning_rate: {learning_rate}\n"
|
||||||
|
f"cutoff_len: {cutoff_len}\n"
|
||||||
|
f"val_set_size: {val_set_size}\n"
|
||||||
|
f"lora_r: {lora_r}\n"
|
||||||
|
f"lora_alpha: {lora_alpha}\n"
|
||||||
|
f"lora_dropout: {lora_dropout}\n"
|
||||||
|
f"lora_target_modules: {lora_target_modules}\n"
|
||||||
|
f"train_on_inputs: {train_on_inputs}\n"
|
||||||
|
f"add_eos_token: {add_eos_token}\n"
|
||||||
|
f"group_by_length: {group_by_length}\n"
|
||||||
|
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
|
||||||
|
f"prompt template: {prompt_template_name}\n"
|
||||||
|
f"training_mode: {training_mode}\n"
|
||||||
|
f"save_checkpoint: {save_checkpoint}\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
gradient_accumulation_steps = batch_size // micro_batch_size
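# With the defaults (batch_size=128, micro_batch_size=2) this accumulates gradients
# over 64 micro-batches per optimizer step.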
|
||||||
|
|
||||||
|
prompter = Prompter(data_path)  # use the data_path argument rather than a hard-coded copy of its default
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
base_model,
|
||||||
|
# load_in_4bit=True,
|
||||||
|
load_in_low_bit="bf16",
|
||||||
|
optimize_model=False,
|
||||||
|
# optimize_model=True,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
modules_to_not_convert=["lm_head"],
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# model = model.half()
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
|
||||||
|
|
||||||
|
if tokenizer.pad_token is None:
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
|
||||||
|
# Prepare a IPEX-LLM compatible Peft model
|
||||||
|
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
|
||||||
|
|
||||||
|
config = LoraConfig(
|
||||||
|
r=lora_r,
|
||||||
|
lora_alpha=lora_alpha,
|
||||||
|
target_modules=lora_target_modules,
|
||||||
|
lora_dropout=lora_dropout,
|
||||||
|
bias="none",
|
||||||
|
task_type="CAUSAL_LM",
|
||||||
|
training_mode=training_mode,
|
||||||
|
)
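# In standard LoRA the adapter update is scaled by lora_alpha / r, so r=8 with
# lora_alpha=16 gives a scaling factor of 2.0.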
|
||||||
|
print(f"Lora Config: {config}")
|
||||||
|
model = get_peft_model(model, config)
|
||||||
|
|
||||||
|
dataset = load_dataset("json", data_files=data_path)
|
||||||
|
tokenized_dataset = dataset.map(generate_and_tokenize_prompt, fn_kwargs={
|
||||||
|
"prompter": prompter,
|
||||||
|
"tokenizer": tokenizer,
|
||||||
|
"max_length": cutoff_len,
|
||||||
|
"add_eos_token": add_eos_token,
|
||||||
|
"train_on_inputs": train_on_inputs
|
||||||
|
})  # generate_and_tokenize_prompt expects a single example at a time, so map it unbatched
|
||||||
|
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
|
||||||
|
|
||||||
|
split = tokenized_dataset["train"].train_test_split(test_size=0.2)  # load_dataset("json") returns a DatasetDict keyed by "train"
|
||||||
|
train_dataset = split["train"]
|
||||||
|
eval_dataset = split["test"]
|
||||||
|
|
||||||
|
trainer = transformers.Trainer(
|
||||||
|
model=model,
|
||||||
|
train_dataset=train_dataset,
|
||||||
|
eval_dataset=eval_dataset,
|
||||||
|
args=transformers.TrainingArguments(
|
||||||
|
per_device_train_batch_size=micro_batch_size,
|
||||||
|
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||||
|
# warmup_ratio=0.03,
|
||||||
|
# warmup_steps=100,
|
||||||
|
max_grad_norm=0.3,
|
||||||
|
num_train_epochs=num_epochs,
|
||||||
|
learning_rate=learning_rate,
|
||||||
|
lr_scheduler_type="cosine",
|
||||||
|
bf16=True, # ensure training more stable
|
||||||
|
logging_steps=1,
|
||||||
|
optim="adamw_torch",
|
||||||
|
evaluation_strategy="steps" if val_set_size > 0 else "no",
|
||||||
|
save_strategy="steps" if save_checkpoint else "no",
|
||||||
|
eval_steps=100 if val_set_size > 0 else None,
|
||||||
|
save_steps=100,
|
||||||
|
output_dir=output_dir,
|
||||||
|
save_total_limit=100,
|
||||||
|
load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
|
||||||
|
group_by_length=group_by_length,
|
||||||
|
gradient_checkpointing=gradient_checkpointing,
|
||||||
|
ddp_backend="ccl",
|
||||||
|
save_safetensors=False,
|
||||||
|
),
|
||||||
|
data_collator=transformers.DataCollatorForSeq2Seq(
|
||||||
|
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
|
||||||
|
),
|
||||||
|
)
|
||||||
|
model.config.use_cache = False
|
||||||
|
|
||||||
|
trainer.train(resume_from_checkpoint=resume_from_checkpoint or None)  # Trainer.train() has no data_path argument; the datasets were already passed to the Trainer
|
||||||
|
|
||||||
|
model.save_pretrained(output_dir)
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\n If there's a warning about missing keys above, please disregard :)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
train()
|
394
src/claude-fine-tune.py
Normal file
@ -0,0 +1,394 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import random
|
||||||
|
import glob
|
||||||
|
from datetime import datetime
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
TrainingArguments,
|
||||||
|
Trainer,
|
||||||
|
TrainerCallback,
|
||||||
|
|
||||||
|
TrainerState,
|
||||||
|
TrainerControl,
|
||||||
|
DataCollatorForLanguageModeling,
|
||||||
|
AutoModelForCausalLM,
|
||||||
|
BitsAndBytesConfig
|
||||||
|
)
|
||||||
|
import numpy as np
|
||||||
|
from peft import (
|
||||||
|
TaskType,
|
||||||
|
get_peft_model,
|
||||||
|
prepare_model_for_kbit_training,
|
||||||
|
LoraConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
def load_qa_data(results_dir):
|
||||||
|
"""
|
||||||
|
Load Q&A pairs from the processed results directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
results_dir (str): Path to the directory containing JSON files with Q&A pairs
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: List of prompt-completion pairs formatted for fine-tuning
|
||||||
|
"""
|
||||||
|
training_data = []
|
||||||
|
|
||||||
|
# Find all JSON files in the results directory and subdirectories
|
||||||
|
json_files = glob.glob(os.path.join(results_dir, "**/*.json"), recursive=True)
|
||||||
|
|
||||||
|
for json_file in tqdm(json_files, desc="Loading training data"):
|
||||||
|
try:
|
||||||
|
with open(json_file, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# Extract context and QA pairs
|
||||||
|
context = data.get("context", "")
|
||||||
|
qa_pairs = data.get("qa_pairs", [])
|
||||||
|
|
||||||
|
for qa_pair in qa_pairs:
|
||||||
|
question = qa_pair.get("question", "")
|
||||||
|
answer = qa_pair.get("answer", "")
|
||||||
|
|
||||||
|
if question and answer:
|
||||||
|
# Format as instruction-based fine-tuning example
|
||||||
|
prompt = f"""<task>
|
||||||
|
Answer the following question based on the provided context.
|
||||||
|
|
||||||
|
CONTEXT:
|
||||||
|
{context}
|
||||||
|
|
||||||
|
QUESTION:
|
||||||
|
{question}
|
||||||
|
</task>
|
||||||
|
"""
|
||||||
|
# Add to training data
|
||||||
|
training_data.append({
|
||||||
|
"prompt": prompt,
|
||||||
|
"completion": answer
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error processing {json_file}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f"Loaded {len(training_data)} training examples")
|
||||||
|
return training_data
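# Each returned example has the shape:
#   {"prompt": "<task>...CONTEXT...QUESTION...</task>\n", "completion": "<answer text>"}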
|
||||||
|
|
||||||
|
def prepare_dataset_for_training(training_data, tokenizer, max_length=2048):
|
||||||
|
"""
|
||||||
|
Prepare the dataset for training by tokenizing and formatting.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
training_data (list): List of prompt-completion pairs
|
||||||
|
tokenizer: DeepSeek tokenizer
|
||||||
|
max_length (int): Maximum sequence length
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dataset: HuggingFace dataset ready for training
|
||||||
|
"""
|
||||||
|
def tokenize_function(examples):
|
||||||
|
# Combine prompt and completion
|
||||||
|
full_texts = []
|
||||||
|
for i in range(len(examples["prompt"])):
|
||||||
|
full_text = examples["prompt"][i] + examples["completion"][i]
|
||||||
|
full_texts.append(full_text)
|
||||||
|
|
||||||
|
if not tokenizer.pad_token:
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
# Tokenize
|
||||||
|
tokenized = tokenizer(
|
||||||
|
full_texts,
|
||||||
|
padding="max_length",
|
||||||
|
truncation=True,
|
||||||
|
max_length=max_length,
|
||||||
|
return_tensors="pt"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create labels (same as input_ids for causal LM)
|
||||||
|
tokenized["labels"] = tokenized["input_ids"].clone()
|
||||||
|
|
||||||
|
# Create attention mask for prompt tokens
|
||||||
|
for i in range(len(full_texts)):
|
||||||
|
prompt_length = len(tokenizer.encode(examples["prompt"][i]))
|
||||||
|
# Set labels for prompt tokens to -100 (ignored in loss calculation)
|
||||||
|
tokenized["labels"][i, :prompt_length] = -100
|
||||||
|
|
||||||
|
return tokenized
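# -100 is the ignore_index used by the cross-entropy loss in transformers, so the
# prompt tokens contribute nothing to the training loss.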
|
||||||
|
|
||||||
|
# Convert to HuggingFace Dataset
|
||||||
|
dataset_dict = {
|
||||||
|
"prompt": [item["prompt"] for item in training_data],
|
||||||
|
"completion": [item["completion"] for item in training_data]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create dataset
|
||||||
|
dataset = Dataset.from_dict(dataset_dict)
|
||||||
|
|
||||||
|
# Apply tokenization
|
||||||
|
tokenized_dataset = dataset.map(
|
||||||
|
tokenize_function,
|
||||||
|
batched=True,
|
||||||
|
remove_columns=["prompt", "completion"]
|
||||||
|
)
|
||||||
|
|
||||||
|
return tokenized_dataset
|
||||||
|
|
||||||
|
def setup_lora_config():
|
||||||
|
"""
|
||||||
|
Configure LoRA hyperparameters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
LoraConfig: Configuration for LoRA fine-tuning
|
||||||
|
"""
|
||||||
|
return LoraConfig(
|
||||||
|
task_type=TaskType.CAUSAL_LM,
|
||||||
|
r=8, #16, # Rank dimension
|
||||||
|
lora_alpha=16, #32, # Alpha parameter for LoRA scaling
|
||||||
|
lora_dropout=0.05, # Dropout probability for LoRA layers
|
||||||
|
target_modules=[ # Target modules to apply LoRA to
|
||||||
|
"q_proj",
|
||||||
|
"k_proj",
|
||||||
|
"v_proj",
|
||||||
|
"o_proj",
|
||||||
|
"gate_proj",
|
||||||
|
"up_proj",
|
||||||
|
"down_proj"
|
||||||
|
],
|
||||||
|
bias="none", # Whether to train bias parameters
|
||||||
|
fan_in_fan_out=False # Set to True for linear layers with fan_in != fan_out
|
||||||
|
)
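# For LLaMA/Qwen-style architectures (including the DeepSeek-R1 distill used here),
# these module names cover the attention and MLP projections.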
|
||||||
|
|
||||||
|
class EarlyStoppingCallback(TrainerCallback):
|
||||||
|
"""
|
||||||
|
Callback that implements early stopping.
|
||||||
|
"""
|
||||||
|
def __init__(self, patience=3, min_delta=0.0):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
patience (int): Number of evaluations with no improvement after which training will be stopped.
|
||||||
|
min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
|
||||||
|
"""
|
||||||
|
self.patience = patience
|
||||||
|
self.min_delta = min_delta
|
||||||
|
self.best_score = None
|
||||||
|
self.counter = 0
|
||||||
|
self.early_stop = False
|
||||||
|
|
||||||
|
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
|
||||||
|
eval_loss = state.log_history[-1].get("eval_loss")
|
||||||
|
if eval_loss is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
if self.best_score is None:
|
||||||
|
self.best_score = eval_loss
|
||||||
|
elif eval_loss > self.best_score - self.min_delta:
|
||||||
|
self.counter += 1
|
||||||
|
print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
|
||||||
|
if self.counter >= self.patience:
|
||||||
|
print(f"Early stopping triggered! Best loss: {self.best_score:.4f}")
|
||||||
|
self.early_stop = True
|
||||||
|
control.should_training_stop = True
|
||||||
|
else:
|
||||||
|
self.best_score = eval_loss
|
||||||
|
self.counter = 0
|
||||||
|
|
||||||
|
return control
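# Example: EarlyStoppingCallback(patience=3, min_delta=0.01) stops training after
# three consecutive evaluations whose eval_loss fails to beat the best loss by 0.01.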
|
||||||
|
|
||||||
|
def train_with_lora(model_name, training_data, output_dir, batch_size=4, num_epochs=3, learning_rate=2e-4):
|
||||||
|
"""
|
||||||
|
Fine-tune the DeepSeek-R1 model with LoRA.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model_name (str): Name of the DeepSeek model
|
||||||
|
training_data (list): List of prompt-completion pairs
|
||||||
|
output_dir (str): Directory to save the fine-tuned model
|
||||||
|
batch_size (int): Training batch size
|
||||||
|
num_epochs (int): Number of training epochs
|
||||||
|
learning_rate (float): Learning rate
|
||||||
|
"""
|
||||||
|
# Create output directory
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Load model and tokenizer
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||||
|
bnb_config = BitsAndBytesConfig(
|
||||||
|
load_in_4bit=True,
|
||||||
|
bnb_4bit_quant_type="nf4",
|
||||||
|
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||||
|
)
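# NF4 4-bit quantization keeps the frozen base weights small, while matmuls run in
# bfloat16; only the LoRA adapter weights added below are trained.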
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=True,
|
||||||
|
use_cache=False,
|
||||||
|
quantization_config=bnb_config,
|
||||||
|
device_map={"": torch.xpu.current_device()},
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare the model for LoRA fine-tuning
|
||||||
|
# 1. Prepare for k-bit training if using quantization
|
||||||
|
model = prepare_model_for_kbit_training(model)
|
||||||
|
|
||||||
|
# 2. Add LoRA adapters
|
||||||
|
lora_config = setup_lora_config()
|
||||||
|
model = get_peft_model(model, lora_config)
|
||||||
|
model.print_trainable_parameters()
|
||||||
|
|
||||||
|
# Prepare dataset
|
||||||
|
tokenized_dataset = prepare_dataset_for_training(training_data, tokenizer)
|
||||||
|
|
||||||
|
# Split into train and evaluation sets
|
||||||
|
dataset_dict = tokenized_dataset.train_test_split(test_size=0.1)
|
||||||
|
|
||||||
|
# Set up training arguments
|
||||||
|
training_args = TrainingArguments(
|
||||||
|
output_dir=output_dir,
|
||||||
|
num_train_epochs=num_epochs,
|
||||||
|
per_device_train_batch_size=batch_size,
|
||||||
|
per_device_eval_batch_size=batch_size,
|
||||||
|
gradient_accumulation_steps=4,
|
||||||
|
evaluation_strategy="steps",
|
||||||
|
eval_steps=100, # More frequent evaluation to check stopping criteria
|
||||||
|
save_strategy="steps",
|
||||||
|
save_steps=500,
|
||||||
|
save_total_limit=3,
|
||||||
|
learning_rate=learning_rate,
|
||||||
|
warmup_steps=100,
|
||||||
|
weight_decay=0.01,
|
||||||
|
logging_dir=f"{output_dir}/logs",
|
||||||
|
logging_steps=100,
|
||||||
|
fp16=True,
|
||||||
|
report_to="none",
|
||||||
|
optim="adamw_torch",
|
||||||
|
load_best_model_at_end=True, # Load the best model when training ends
|
||||||
|
metric_for_best_model="eval_loss" # Use eval loss to determine the best model
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create data collator
|
||||||
|
data_collator = DataCollatorForLanguageModeling(
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
mlm=False # Use causal language modeling (not masked)
|
||||||
|
)
|
||||||
|
|
||||||
|
early_stopping_callback = EarlyStoppingCallback(patience=3, min_delta=0.01)
|
||||||
|
|
||||||
|
# Initialize trainer
|
||||||
|
trainer = Trainer(
|
||||||
|
model=model,
|
||||||
|
args=training_args,
|
||||||
|
train_dataset=dataset_dict["train"],
|
||||||
|
eval_dataset=dataset_dict["test"],
|
||||||
|
data_collator=data_collator,
|
||||||
|
callbacks=[early_stopping_callback] # Add the early stopping callback
|
||||||
|
)
|
||||||
|
|
||||||
|
# Start training
|
||||||
|
print("Starting LoRA fine-tuning...")
|
||||||
|
trainer.train()
|
||||||
|
|
||||||
|
model = trainer.model
|
||||||
|
|
||||||
|
print("LoRA fine-tuning complete...")
|
||||||
|
# Save the fine-tuned model
|
||||||
|
|
||||||
|
print("Saving model...")
|
||||||
|
model.save_pretrained(f"{output_dir}/final_model")
|
||||||
|
tokenizer.save_pretrained(f"{output_dir}/final_model")
|
||||||
|
|
||||||
|
print(f"Fine-tuned model saved to {output_dir}/final_model")
|
||||||
|
|
||||||
|
return model, tokenizer
|
||||||
|
|
||||||
|
# Main function
|
||||||
|
def main():
|
||||||
|
# Configuration
|
||||||
|
results_dir = "../results" # Directory with processed text files and QA pairs
|
||||||
|
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
|
||||||
|
#model_name = "Intel/neural-chat-7b-v3-3"
|
||||||
|
output_dir = "./fine_tuned"
|
||||||
|
|
||||||
|
# Load training data from processed results
|
||||||
|
training_data = load_qa_data(results_dir)
|
||||||
|
|
||||||
|
# Shuffle training data
|
||||||
|
random.shuffle(training_data)
|
||||||
|
|
||||||
|
# Fine-tune the model
|
||||||
|
train_with_lora(
|
||||||
|
model_name=model_name,
|
||||||
|
training_data=training_data,
|
||||||
|
output_dir=output_dir,
|
||||||
|
batch_size=1, # Can't fit more than one on the B580
|
||||||
|
num_epochs=10,
|
||||||
|
learning_rate=2e-4
|
||||||
|
)
|
||||||
|
|
||||||
|
model_path = f"{output_dir}/final_model"
|
||||||
|
# Load the fine-tuned model and tokenizer
|
||||||
|
print("Loading fine-tuned model for evaluation...")
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||||
|
if tokenizer.pad_token is None:
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
tokenizer.pad_token_id = tokenizer.eos_token_id
|
||||||
|
|
||||||
|
bnb_config = BitsAndBytesConfig(
|
||||||
|
load_in_4bit=True,
|
||||||
|
bnb_4bit_quant_type="nf4",
|
||||||
|
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_path,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=True,
|
||||||
|
use_cache=True,
|
||||||
|
quantization_config=bnb_config,
|
||||||
|
device_map={"": torch.xpu.current_device()})
|
||||||
|
model = model.to('xpu')
|
||||||
|
|
||||||
|
print("Loaded. Ask your question, CTRL-C to exit.")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
question = input("> ").strip()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\nExiting.")
|
||||||
|
break
|
||||||
|
|
||||||
|
prompt = f"""<task>
|
||||||
|
You are an assistant providing resume details about James Ketrenos. Answer the following question based on your knowledge. If you don't know, say so. Be concise.
|
||||||
|
|
||||||
|
QUESTION:
|
||||||
|
{question}
|
||||||
|
</task>
|
||||||
|
"""
|
||||||
|
# Tokenize and generate
|
||||||
|
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model.generate(
|
||||||
|
**inputs,
|
||||||
|
max_new_tokens=1024,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.7,
|
||||||
|
top_p=0.9,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Decode the output
|
||||||
|
full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||||
|
|
||||||
|
# Keep the full decoded output (the slice that would strip the prompt is left disabled)
|
||||||
|
answer = full_output#[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
|
||||||
|
|
||||||
|
print(answer.strip())
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
360
src/claude-process.py
Normal file
@ -0,0 +1,360 @@
|
|||||||
|
# Self-supervised synthetic data via sequential generation
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForCausalLM,
|
||||||
|
BitsAndBytesConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_deepseek_r1():
|
||||||
|
"""
|
||||||
|
Loads the DeepSeek-R1 model and tokenizer.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (model, tokenizer) for DeepSeek-R1
|
||||||
|
"""
|
||||||
|
# Load model and tokenizer
|
||||||
|
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
|
||||||
|
|
||||||
|
# Load model and tokenizer
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||||
|
|
||||||
|
bnb_config = BitsAndBytesConfig(
|
||||||
|
load_in_4bit=True,
|
||||||
|
bnb_4bit_quant_type="nf4",
|
||||||
|
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=True,
|
||||||
|
use_cache=False,
|
||||||
|
quantization_config=bnb_config,
|
||||||
|
device_map={"": torch.xpu.current_device()},
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
if not model.config.pad_token_id:
|
||||||
|
model.config.pad_token_id = model.config.eos_token_id
|
||||||
|
|
||||||
|
return model, tokenizer
|
||||||
|
|
||||||
|
# Function to get answers from the model
|
||||||
|
def validate_question_answer_from_model(context, question, answer, model, tokenizer):
|
||||||
|
"""
|
||||||
|
Query the model to evaluate if an answer is a good fit for a question.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
context (str): The text file content
|
||||||
|
question (str): The question
|
||||||
|
answer (str): The answer
|
||||||
|
model: The DeepSeek model
|
||||||
|
tokenizer: The DeepSeek tokenizer
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The model's evaluation of the question-answer pair
|
||||||
|
"""
|
||||||
|
# Set up the prompt for answering the question
|
||||||
|
prompt = f"""<task>
|
||||||
|
You are a quality assurance expert reviewing question-answer pairs for an AI training dataset. Your task is to evaluate whether each pair meets our quality standards and is suitable for training.
|
||||||
|
|
||||||
|
For each question-answer pair, evaluate:
|
||||||
|
|
||||||
|
1. ACCURACY: Does the answer contain ONLY information from the context, without fabrications?
|
||||||
|
2. COMPLETENESS: Does the answer fully address the question using all relevant information?
|
||||||
|
3. RELEVANCE: Is the question meaningful and relevant to the context?
|
||||||
|
4. NATURALNESS: Do both question and answer sound natural and conversational?
|
||||||
|
5. DIVERSITY: Does this pair add variety to our dataset (not redundant with others)?
|
||||||
|
|
||||||
|
Context:
|
||||||
|
{context}
|
||||||
|
|
||||||
|
Question:
|
||||||
|
{question}
|
||||||
|
|
||||||
|
Answer:
|
||||||
|
{answer}
|
||||||
|
|
||||||
|
Provide your assessment as follows:
|
||||||
|
- VERDICT: [ACCEPT/REJECT]
|
||||||
|
- REASONING: [Brief explanation of your decision]
|
||||||
|
- IMPROVEMENTS: [Suggestions if needed]
|
||||||
|
</task>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Generate answer
|
||||||
|
raw_answer = generate_with_deepseek(prompt, model, tokenizer)
|
||||||
|
|
||||||
|
# Parse the response to get the actual answer
|
||||||
|
answer = parse_deepseek_response(raw_answer)
|
||||||
|
|
||||||
|
return answer
|
||||||
|
|
||||||
|
|
||||||
|
def generate_with_deepseek(prompt, model, tokenizer, max_length=4096):
|
||||||
|
"""
|
||||||
|
Generate text using DeepSeek-R1 model with proper handling of full output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
prompt (str): The input prompt
|
||||||
|
model: The DeepSeek model
|
||||||
|
tokenizer: The DeepSeek tokenizer
|
||||||
|
max_length (int): Maximum length of generated text
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Generated text response
|
||||||
|
"""
|
||||||
|
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
# Get the full generated sequence
|
||||||
|
generation_output = model.generate(
|
||||||
|
**inputs,
|
||||||
|
pad_token_id=tokenizer.eos_token_id,
|
||||||
|
max_new_tokens=max_length,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.7,
|
||||||
|
top_p=0.9,
|
||||||
|
return_dict_in_generate=True,
|
||||||
|
output_scores=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the full output token IDs
|
||||||
|
output_token_ids = generation_output.sequences[0]
|
||||||
|
|
||||||
|
# Decode the full generated text (including the prompt)
|
||||||
|
full_output = tokenizer.decode(output_token_ids, skip_special_tokens=True)
|
||||||
|
|
||||||
|
# Extract only the response part by removing the prompt part
|
||||||
|
response = full_output[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
def parse_deepseek_response(response):
|
||||||
|
"""
|
||||||
|
Parse the DeepSeek-R1 response.
|
||||||
|
This handles DeepSeek's thinking steps and separates them from the response.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
response (str): The raw response from the DeepSeek model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The cleaned answer part of the response
|
||||||
|
"""
|
||||||
|
# If the response has thinking tags, extract the actual answer
|
||||||
|
response = re.sub(r"^(<think>)?.*</think>", "", response, flags=re.DOTALL)
|
||||||
|
|
||||||
|
# If no special formatting detected, return the whole response
|
||||||
|
return response.strip()
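# Example: "<think>step by step...</think>\nFinal answer." -> "Final answer."
# If no closing </think> tag is present, the response is returned unchanged.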
|
||||||
|
|
||||||
|
# Function to recursively walk a directory and process text files
|
||||||
|
def process_directory(directory_path, output_path="results", file_extensions=(".txt",".md"), batch_size=5):
|
||||||
|
"""
|
||||||
|
Recursively walks a directory, processes text files, and stores results.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
directory_path (str): Path to directory containing text files
|
||||||
|
output_path (str): Path to store results
|
||||||
|
file_extensions (tuple): File extensions to process
|
||||||
|
batch_size (int): Number of files to process before clearing cache
|
||||||
|
"""
|
||||||
|
# Load the DeepSeek-R1 model
|
||||||
|
model, tokenizer = load_deepseek_r1()
|
||||||
|
|
||||||
|
# Create output directory if it doesn't exist
|
||||||
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
|
||||||
|
# Use tqdm for progress tracking
|
||||||
|
file_paths = []
|
||||||
|
for root, _, files in os.walk(directory_path):
|
||||||
|
for file in files:
|
||||||
|
if file.lower().endswith(file_extensions):
|
||||||
|
file_paths.append(os.path.join(root, file))
|
||||||
|
|
||||||
|
# Process files with batch-based memory management
|
||||||
|
for i, file_path in enumerate(tqdm(file_paths, desc="Processing files")):
|
||||||
|
process_file(file_path, model, tokenizer, output_path)
|
||||||
|
|
||||||
|
# Clear cache periodically to prevent memory issues
|
||||||
|
if (i + 1) % batch_size == 0:
|
||||||
|
if torch.xpu.is_available():
|
||||||
|
torch.xpu.empty_cache()
|
||||||
|
else:
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
print(f"Processing complete. Results stored in {output_path}")
|
||||||
|
|
||||||
|
# Function to process a single text file
|
||||||
|
def process_file(file_path, model, tokenizer, output_path):
|
||||||
|
"""
|
||||||
|
Process a single text file by querying the model for questions and answers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path (str): Path to the text file
|
||||||
|
model: The DeepSeek model
|
||||||
|
tokenizer: The DeepSeek tokenizer
|
||||||
|
output_path (str): Path to store results
|
||||||
|
"""
|
||||||
|
# Read the file content
|
||||||
|
try:
|
||||||
|
with open(file_path, 'r', encoding='utf-8') as file:
|
||||||
|
content = file.read()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error reading {file_path}: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Skip empty files
|
||||||
|
if not content.strip():
|
||||||
|
print(f"Skipping empty file: {file_path}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Generate a relative output path that maintains the directory structure
|
||||||
|
relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path))
|
||||||
|
output_file = os.path.join(output_path, relative_path + ".json")
|
||||||
|
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
||||||
|
|
||||||
|
# Handle potential token length issues by truncating if necessary
|
||||||
|
# DeepSeek has a context window limit, truncate if needed
|
||||||
|
max_content_length = 100000 # Adjust based on model's context length limit
|
||||||
|
if len(content) > max_content_length:
|
||||||
|
content = content[:max_content_length] + "... [Content truncated due to length]"
|
||||||
|
|
||||||
|
# Query model for questions
|
||||||
|
questions = get_questions_from_model(content, model, tokenizer)
|
||||||
|
print(f"{len(questions)} questions generated for {file_path}")
|
||||||
|
|
||||||
|
# Get answers for each question
|
||||||
|
results = {
|
||||||
|
"file_path": file_path,
|
||||||
|
"processed_at": datetime.now().isoformat(),
|
||||||
|
"context": content[:1000] + "..." if len(content) > 1000 else content, # Truncated context for JSON storage
|
||||||
|
"qa_pairs": []
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process each question
|
||||||
|
for i, question in enumerate(questions):
|
||||||
|
print(f"Generating answer for question {i+1}/{len(questions)}: {question}")
|
||||||
|
answer = get_answer_from_model(content, question, model, tokenizer)
|
||||||
|
print(f"Answer: {answer[:50] + '...' if len(answer) > 50 else answer}")
|
||||||
|
print(f"Evaluating response...")
|
||||||
|
validation = validate_question_answer_from_model(content, question, answer, model, tokenizer)
|
||||||
|
print(f"Evaluation: {validation[:50] + '...' if len(validation) > 50 else validation}")
|
||||||
|
results["qa_pairs"].append({
|
||||||
|
"question": question,
|
||||||
|
"answer": answer,
|
||||||
|
"validation": validation,
|
||||||
|
})
|
||||||
|
# Save results after each generate pass so they can be evaluated
|
||||||
|
with open(output_file, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(results, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
# Function to get questions from the model
|
||||||
|
def get_questions_from_model(context, model, tokenizer):
|
||||||
|
"""
|
||||||
|
Query the model to generate questions about the provided context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
context (str): The text file content
|
||||||
|
model: The DeepSeek model
|
||||||
|
tokenizer: The DeepSeek tokenizer
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: List of questions about the context
|
||||||
|
"""
|
||||||
|
# Set up the prompt for generating questions
|
||||||
|
prompt = f"""<task>
|
||||||
|
You are an expert data scientist creating a training dataset. I'll provide context information about a person from their resume. Your task is to generate 10 diverse, realistic questions that someone might ask about this person.
|
||||||
|
|
||||||
|
IMPORTANT: DO NOT return JSON or any structured format. Respond with a simple numbered list of questions only, with no formatting, no JSON, and no additional text.
|
||||||
|
|
||||||
|
Generate questions that:
|
||||||
|
- Vary in complexity (simple factual questions, complex reasoning questions)
|
||||||
|
- Cover different aspects of the context (experience, skills, education, achievements)
|
||||||
|
- Include both specific and general inquiries
|
||||||
|
- Sound natural, as if asked by a real person
|
||||||
|
- Avoid asking for information not present in the context
|
||||||
|
|
||||||
|
Context:
|
||||||
|
{context}
|
||||||
|
|
||||||
|
Return ONLY a plain numbered list like:
|
||||||
|
1. First question?
|
||||||
|
2. Second question?
|
||||||
|
...and so on.
|
||||||
|
|
||||||
|
Do not include any explanations, JSON, or other formatting.
|
||||||
|
</task>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Generate questions
|
||||||
|
raw_response = generate_with_deepseek(prompt, model, tokenizer)
|
||||||
|
|
||||||
|
# Parse the response to get the actual questions
|
||||||
|
response = parse_deepseek_response(raw_response)
|
||||||
|
|
||||||
|
lines = response.strip().split('\n')
|
||||||
|
|
||||||
|
# Transform lines like "1. What is your name?" into just "What is your name?" (only include lines that end with a question mark)
|
||||||
|
questions = [re.sub(r'^\d+\.\s*', '', line.strip()) for line in lines if line.strip() and line.strip().endswith('?')]
|
||||||
|
if len(questions) == 0:
|
||||||
|
print(response)
|
||||||
|
exit(0)
|
||||||
|
return questions
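# Example: a line like "3. What languages does James use?" becomes
# "What languages does James use?"; lines that don't end with "?" are discarded.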
|
||||||
|
|
||||||
|
# Function to get answers from the model
|
||||||
|
def get_answer_from_model(context, question, model, tokenizer):
|
||||||
|
"""
|
||||||
|
Query the model to answer a question about the provided context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
context (str): The text file content
|
||||||
|
question (str): The question to answer
|
||||||
|
model: The DeepSeek model
|
||||||
|
tokenizer: The DeepSeek tokenizer
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The model's answer to the question
|
||||||
|
"""
|
||||||
|
# Set up the prompt for answering the question
|
||||||
|
prompt = f"""<task>
|
||||||
|
You are an AI assistant being fine-tuned to accurately represent a specific person based on their resume. Below is a question about this person and the context from their resume.
|
||||||
|
|
||||||
|
Your task is to provide a comprehensive, accurate answer that:
|
||||||
|
- Only uses information explicitly stated in the context
|
||||||
|
- Doesn't fabricate or assume additional details
|
||||||
|
- Maintains a professional, helpful tone
|
||||||
|
- Clearly states if the question cannot be answered based on the given context
|
||||||
|
- Structures the response in a natural, conversational way
|
||||||
|
|
||||||
|
Context:
|
||||||
|
{context}
|
||||||
|
|
||||||
|
Question:
|
||||||
|
{question}
|
||||||
|
|
||||||
|
Answer:
|
||||||
|
</task>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Generate answer
|
||||||
|
raw_answer = generate_with_deepseek(prompt, model, tokenizer)
|
||||||
|
|
||||||
|
# Parse the response to get the actual answer
|
||||||
|
answer = parse_deepseek_response(raw_answer)
|
||||||
|
|
||||||
|
return answer
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
if __name__ == "__main__":
|
||||||
|
process_directory(
|
||||||
|
directory_path="../doc/", # Replace with your directory path
|
||||||
|
output_path="../results",
|
||||||
|
file_extensions=(".txt", ".md"), # Process both txt and md files
|
||||||
|
batch_size=5 # Clear cache after every 5 files
|
||||||
|
)
|
25
src/cli.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
# cli.py - Command line interface for the RAG application
|
||||||
|
from rag_core import RagCore
|
||||||
|
from rag_cli import RagCLI
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point: parse arguments and run the CLI interface
|
||||||
|
def main():
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='RAG Application')
|
||||||
|
parser.add_argument('--debug', action='store_true', help='Run in debug mode')
|
||||||
|
|
||||||
|
args = parser.parse_args()
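# NOTE: args.debug is parsed but not currently forwarded to RagCore or RagCLI.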
|
||||||
|
|
||||||
|
# Create the core RAG application
|
||||||
|
core = RagCore()
|
||||||
|
|
||||||
|
# Run CLI interface
|
||||||
|
cli = RagCLI(core)
|
||||||
|
cli.run()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
1
src/client
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 571cdd1ecc93ec9ad0d63079fe6da94dce2cc5dc
|
44
src/export_merged_model.py
Normal file
@ -0,0 +1,44 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os

import torch
from transformers import AutoTokenizer
import argparse

current_dir = os.path.dirname(os.path.realpath(__file__))
common_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(common_util_path)
from common.utils import merge_adapter

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='Merge the adapter into the original model for a Llama2 model')
    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
                        help='The huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-hf` or `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                             ', or the path to the huggingface checkpoint folder')
    parser.add_argument('--adapter_path', type=str)
    parser.add_argument('--output_path', type=str)

    args = parser.parse_args()
    base_model = model_path = args.repo_id_or_model_path
    adapter_path = args.adapter_path
    output_path = args.output_path

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    merge_adapter(base_model, tokenizer, adapter_path, output_path)
    print(f'Finished merging the adapter into the original model; the merged model is available in {output_path}.')
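The `merge_adapter` helper imported from `common.utils` is not part of this commit. As a rough sketch of what such a helper usually does with PEFT (load the base model, attach the LoRA adapter, fold the weights in, and save the result), assuming a standard PEFT-format adapter directory — this is an illustration, not the actual `common.utils` implementation:

```python
# Hypothetical sketch of a merge_adapter helper; not the actual common.utils implementation.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel


def merge_adapter(base_model, tokenizer, adapter_path, output_path):
    # Load the base model, attach the LoRA adapter, and fold its weights into the base layers.
    model = AutoModelForCausalLM.from_pretrained(
        base_model, torch_dtype=torch.float16, trust_remote_code=True
    )
    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()

    # Persist the merged model and tokenizer so the result can be used standalone.
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
```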
202
src/fine_tuned/checkpoint-10/README.md
Normal file
202
src/fine_tuned/checkpoint-10/README.md
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
---
|
||||||
|
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||||
|
library_name: peft
|
||||||
|
---
|
||||||
|
|
||||||
|
# Model Card for Model ID
|
||||||
|
|
||||||
|
<!-- Provide a quick summary of what the model is/does. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
### Model Description
|
||||||
|
|
||||||
|
<!-- Provide a longer summary of what this model is. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- **Developed by:** [More Information Needed]
|
||||||
|
- **Funded by [optional]:** [More Information Needed]
|
||||||
|
- **Shared by [optional]:** [More Information Needed]
|
||||||
|
- **Model type:** [More Information Needed]
|
||||||
|
- **Language(s) (NLP):** [More Information Needed]
|
||||||
|
- **License:** [More Information Needed]
|
||||||
|
- **Finetuned from model [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
### Model Sources [optional]
|
||||||
|
|
||||||
|
<!-- Provide the basic links for the model. -->
|
||||||
|
|
||||||
|
- **Repository:** [More Information Needed]
|
||||||
|
- **Paper [optional]:** [More Information Needed]
|
||||||
|
- **Demo [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
## Uses
|
||||||
|
|
||||||
|
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
||||||
|
|
||||||
|
### Direct Use
|
||||||
|
|
||||||
|
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Downstream Use [optional]
|
||||||
|
|
||||||
|
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Out-of-Scope Use
|
||||||
|
|
||||||
|
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
||||||
|
|
||||||
|
## How to Get Started with the Model
|
||||||
|
|
||||||
|
Use the code below to get started with the model.
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Training Details
|
||||||
|
|
||||||
|
### Training Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Training Procedure
|
||||||
|
|
||||||
|
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
||||||
|
|
||||||
|
#### Preprocessing [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
|
||||||
|
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
||||||
|
|
||||||
|
#### Speeds, Sizes, Times [optional]
|
||||||
|
|
||||||
|
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
<!-- This section describes the evaluation protocols and provides the results. -->
|
||||||
|
|
||||||
|
### Testing Data, Factors & Metrics
|
||||||
|
|
||||||
|
#### Testing Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card if possible. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Factors
|
||||||
|
|
||||||
|
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Metrics
|
||||||
|
|
||||||
|
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Results
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Summary
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Examination [optional]
|
||||||
|
|
||||||
|
<!-- Relevant interpretability work for the model goes here -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Environmental Impact
|
||||||
|
|
||||||
|
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
||||||
|
|
||||||
|
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
||||||
|
|
||||||
|
- **Hardware Type:** [More Information Needed]
|
||||||
|
- **Hours used:** [More Information Needed]
|
||||||
|
- **Cloud Provider:** [More Information Needed]
|
||||||
|
- **Compute Region:** [More Information Needed]
|
||||||
|
- **Carbon Emitted:** [More Information Needed]
|
||||||
|
|
||||||
|
## Technical Specifications [optional]
|
||||||
|
|
||||||
|
### Model Architecture and Objective
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Compute Infrastructure
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
||||||
|
|
||||||
|
**BibTeX:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
**APA:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Glossary [optional]
|
||||||
|
|
||||||
|
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## More Information [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Authors [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Contact
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- PEFT 0.14.0
|
37
src/fine_tuned/checkpoint-10/adapter_config.json
Normal file
37
src/fine_tuned/checkpoint-10/adapter_config.json
Normal file
@ -0,0 +1,37 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "bias": "none",
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_bias": false,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 8,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "o_proj",
    "q_proj",
    "gate_proj",
    "k_proj",
    "down_proj",
    "up_proj",
    "v_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
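This adapter_config.json describes a rank-8 LoRA adapter (alpha 16, dropout 0.05) over the attention and MLP projections of DeepSeek-R1-Distill-Qwen-1.5B. A minimal sketch of loading such a checkpoint for inference with PEFT — the local adapter path below is an assumption based on this repository's layout:

```python
# Minimal sketch: load the base model and attach the LoRA adapter from this checkpoint.
# The checkpoint path is assumed from this repository's directory layout.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
adapter = "src/fine_tuned/checkpoint-10"

tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(base, trust_remote_code=True)
model = PeftModel.from_pretrained(model, adapter)  # applies the rank-8 LoRA weights
model.eval()
```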
BIN
src/fine_tuned/checkpoint-10/adapter_model.safetensors
Normal file
BIN
src/fine_tuned/checkpoint-10/adapter_model.safetensors
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-10/optimizer.pt
Normal file
BIN
src/fine_tuned/checkpoint-10/optimizer.pt
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-10/rng_state.pth
Normal file
BIN
src/fine_tuned/checkpoint-10/rng_state.pth
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-10/scaler.pt
Normal file
BIN
src/fine_tuned/checkpoint-10/scaler.pt
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-10/scheduler.pt
Normal file
BIN
src/fine_tuned/checkpoint-10/scheduler.pt
Normal file
Binary file not shown.
32
src/fine_tuned/checkpoint-10/trainer_state.json
Normal file
32
src/fine_tuned/checkpoint-10/trainer_state.json
Normal file
@ -0,0 +1,32 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 100,
  "global_step": 10,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [],
  "logging_steps": 100,
  "max_steps": 10,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 763304356085760.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
BIN
src/fine_tuned/checkpoint-10/training_args.bin
Normal file
BIN
src/fine_tuned/checkpoint-10/training_args.bin
Normal file
Binary file not shown.
202
src/fine_tuned/checkpoint-3/README.md
Normal file
202
src/fine_tuned/checkpoint-3/README.md
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
---
|
||||||
|
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||||
|
library_name: peft
|
||||||
|
---
|
||||||
|
|
||||||
|
# Model Card for Model ID
|
||||||
|
|
||||||
|
<!-- Provide a quick summary of what the model is/does. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
### Model Description
|
||||||
|
|
||||||
|
<!-- Provide a longer summary of what this model is. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- **Developed by:** [More Information Needed]
|
||||||
|
- **Funded by [optional]:** [More Information Needed]
|
||||||
|
- **Shared by [optional]:** [More Information Needed]
|
||||||
|
- **Model type:** [More Information Needed]
|
||||||
|
- **Language(s) (NLP):** [More Information Needed]
|
||||||
|
- **License:** [More Information Needed]
|
||||||
|
- **Finetuned from model [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
### Model Sources [optional]
|
||||||
|
|
||||||
|
<!-- Provide the basic links for the model. -->
|
||||||
|
|
||||||
|
- **Repository:** [More Information Needed]
|
||||||
|
- **Paper [optional]:** [More Information Needed]
|
||||||
|
- **Demo [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
## Uses
|
||||||
|
|
||||||
|
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
||||||
|
|
||||||
|
### Direct Use
|
||||||
|
|
||||||
|
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Downstream Use [optional]
|
||||||
|
|
||||||
|
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Out-of-Scope Use
|
||||||
|
|
||||||
|
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
||||||
|
|
||||||
|
## How to Get Started with the Model
|
||||||
|
|
||||||
|
Use the code below to get started with the model.
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Training Details
|
||||||
|
|
||||||
|
### Training Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Training Procedure
|
||||||
|
|
||||||
|
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
||||||
|
|
||||||
|
#### Preprocessing [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
|
||||||
|
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
||||||
|
|
||||||
|
#### Speeds, Sizes, Times [optional]
|
||||||
|
|
||||||
|
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
<!-- This section describes the evaluation protocols and provides the results. -->
|
||||||
|
|
||||||
|
### Testing Data, Factors & Metrics
|
||||||
|
|
||||||
|
#### Testing Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card if possible. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Factors
|
||||||
|
|
||||||
|
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Metrics
|
||||||
|
|
||||||
|
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Results
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Summary
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Examination [optional]
|
||||||
|
|
||||||
|
<!-- Relevant interpretability work for the model goes here -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Environmental Impact
|
||||||
|
|
||||||
|
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
||||||
|
|
||||||
|
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
||||||
|
|
||||||
|
- **Hardware Type:** [More Information Needed]
|
||||||
|
- **Hours used:** [More Information Needed]
|
||||||
|
- **Cloud Provider:** [More Information Needed]
|
||||||
|
- **Compute Region:** [More Information Needed]
|
||||||
|
- **Carbon Emitted:** [More Information Needed]
|
||||||
|
|
||||||
|
## Technical Specifications [optional]
|
||||||
|
|
||||||
|
### Model Architecture and Objective
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Compute Infrastructure
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
||||||
|
|
||||||
|
**BibTeX:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
**APA:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Glossary [optional]
|
||||||
|
|
||||||
|
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## More Information [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Authors [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Contact
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- PEFT 0.14.0
|
37
src/fine_tuned/checkpoint-3/adapter_config.json
Normal file
37
src/fine_tuned/checkpoint-3/adapter_config.json
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
{
|
||||||
|
"alpha_pattern": {},
|
||||||
|
"auto_mapping": null,
|
||||||
|
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
|
"bias": "none",
|
||||||
|
"eva_config": null,
|
||||||
|
"exclude_modules": null,
|
||||||
|
"fan_in_fan_out": false,
|
||||||
|
"inference_mode": true,
|
||||||
|
"init_lora_weights": true,
|
||||||
|
"layer_replication": null,
|
||||||
|
"layers_pattern": null,
|
||||||
|
"layers_to_transform": null,
|
||||||
|
"loftq_config": {},
|
||||||
|
"lora_alpha": 16,
|
||||||
|
"lora_bias": false,
|
||||||
|
"lora_dropout": 0.05,
|
||||||
|
"megatron_config": null,
|
||||||
|
"megatron_core": "megatron.core",
|
||||||
|
"modules_to_save": null,
|
||||||
|
"peft_type": "LORA",
|
||||||
|
"r": 8,
|
||||||
|
"rank_pattern": {},
|
||||||
|
"revision": null,
|
||||||
|
"target_modules": [
|
||||||
|
"v_proj",
|
||||||
|
"o_proj",
|
||||||
|
"gate_proj",
|
||||||
|
"down_proj",
|
||||||
|
"q_proj",
|
||||||
|
"up_proj",
|
||||||
|
"k_proj"
|
||||||
|
],
|
||||||
|
"task_type": "CAUSAL_LM",
|
||||||
|
"use_dora": false,
|
||||||
|
"use_rslora": false
|
||||||
|
}
|
BIN
src/fine_tuned/checkpoint-3/adapter_model.safetensors
Normal file
BIN
src/fine_tuned/checkpoint-3/adapter_model.safetensors
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-3/optimizer.pt
Normal file
BIN
src/fine_tuned/checkpoint-3/optimizer.pt
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-3/rng_state.pth
Normal file
BIN
src/fine_tuned/checkpoint-3/rng_state.pth
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-3/scaler.pt
Normal file
BIN
src/fine_tuned/checkpoint-3/scaler.pt
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-3/scheduler.pt
Normal file
BIN
src/fine_tuned/checkpoint-3/scheduler.pt
Normal file
Binary file not shown.
32
src/fine_tuned/checkpoint-3/trainer_state.json
Normal file
32
src/fine_tuned/checkpoint-3/trainer_state.json
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
{
|
||||||
|
"best_metric": null,
|
||||||
|
"best_model_checkpoint": null,
|
||||||
|
"epoch": 3.0,
|
||||||
|
"eval_steps": 100,
|
||||||
|
"global_step": 3,
|
||||||
|
"is_hyper_param_search": false,
|
||||||
|
"is_local_process_zero": true,
|
||||||
|
"is_world_process_zero": true,
|
||||||
|
"log_history": [],
|
||||||
|
"logging_steps": 100,
|
||||||
|
"max_steps": 3,
|
||||||
|
"num_input_tokens_seen": 0,
|
||||||
|
"num_train_epochs": 3,
|
||||||
|
"save_steps": 500,
|
||||||
|
"stateful_callbacks": {
|
||||||
|
"TrainerControl": {
|
||||||
|
"args": {
|
||||||
|
"should_epoch_stop": false,
|
||||||
|
"should_evaluate": false,
|
||||||
|
"should_log": false,
|
||||||
|
"should_save": true,
|
||||||
|
"should_training_stop": true
|
||||||
|
},
|
||||||
|
"attributes": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_flos": 228991306825728.0,
|
||||||
|
"train_batch_size": 1,
|
||||||
|
"trial_name": null,
|
||||||
|
"trial_params": null
|
||||||
|
}
|
BIN
src/fine_tuned/checkpoint-3/training_args.bin
Normal file
BIN
src/fine_tuned/checkpoint-3/training_args.bin
Normal file
Binary file not shown.
202
src/fine_tuned/checkpoint-60/README.md
Normal file
202
src/fine_tuned/checkpoint-60/README.md
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
---
|
||||||
|
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||||
|
library_name: peft
|
||||||
|
---
|
||||||
|
|
||||||
|
# Model Card for Model ID
|
||||||
|
|
||||||
|
<!-- Provide a quick summary of what the model is/does. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
### Model Description
|
||||||
|
|
||||||
|
<!-- Provide a longer summary of what this model is. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- **Developed by:** [More Information Needed]
|
||||||
|
- **Funded by [optional]:** [More Information Needed]
|
||||||
|
- **Shared by [optional]:** [More Information Needed]
|
||||||
|
- **Model type:** [More Information Needed]
|
||||||
|
- **Language(s) (NLP):** [More Information Needed]
|
||||||
|
- **License:** [More Information Needed]
|
||||||
|
- **Finetuned from model [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
### Model Sources [optional]
|
||||||
|
|
||||||
|
<!-- Provide the basic links for the model. -->
|
||||||
|
|
||||||
|
- **Repository:** [More Information Needed]
|
||||||
|
- **Paper [optional]:** [More Information Needed]
|
||||||
|
- **Demo [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
## Uses
|
||||||
|
|
||||||
|
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
||||||
|
|
||||||
|
### Direct Use
|
||||||
|
|
||||||
|
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Downstream Use [optional]
|
||||||
|
|
||||||
|
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Out-of-Scope Use
|
||||||
|
|
||||||
|
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
||||||
|
|
||||||
|
## How to Get Started with the Model
|
||||||
|
|
||||||
|
Use the code below to get started with the model.
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Training Details
|
||||||
|
|
||||||
|
### Training Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Training Procedure
|
||||||
|
|
||||||
|
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
||||||
|
|
||||||
|
#### Preprocessing [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
|
||||||
|
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
||||||
|
|
||||||
|
#### Speeds, Sizes, Times [optional]
|
||||||
|
|
||||||
|
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
<!-- This section describes the evaluation protocols and provides the results. -->
|
||||||
|
|
||||||
|
### Testing Data, Factors & Metrics
|
||||||
|
|
||||||
|
#### Testing Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card if possible. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Factors
|
||||||
|
|
||||||
|
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Metrics
|
||||||
|
|
||||||
|
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Results
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Summary
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Examination [optional]
|
||||||
|
|
||||||
|
<!-- Relevant interpretability work for the model goes here -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Environmental Impact
|
||||||
|
|
||||||
|
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
||||||
|
|
||||||
|
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
||||||
|
|
||||||
|
- **Hardware Type:** [More Information Needed]
|
||||||
|
- **Hours used:** [More Information Needed]
|
||||||
|
- **Cloud Provider:** [More Information Needed]
|
||||||
|
- **Compute Region:** [More Information Needed]
|
||||||
|
- **Carbon Emitted:** [More Information Needed]
|
||||||
|
|
||||||
|
## Technical Specifications [optional]
|
||||||
|
|
||||||
|
### Model Architecture and Objective
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Compute Infrastructure
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
||||||
|
|
||||||
|
**BibTeX:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
**APA:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Glossary [optional]
|
||||||
|
|
||||||
|
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## More Information [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Authors [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Contact
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- PEFT 0.14.0
|
37
src/fine_tuned/checkpoint-60/adapter_config.json
Normal file
37
src/fine_tuned/checkpoint-60/adapter_config.json
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
{
|
||||||
|
"alpha_pattern": {},
|
||||||
|
"auto_mapping": null,
|
||||||
|
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
|
"bias": "none",
|
||||||
|
"eva_config": null,
|
||||||
|
"exclude_modules": null,
|
||||||
|
"fan_in_fan_out": false,
|
||||||
|
"inference_mode": true,
|
||||||
|
"init_lora_weights": true,
|
||||||
|
"layer_replication": null,
|
||||||
|
"layers_pattern": null,
|
||||||
|
"layers_to_transform": null,
|
||||||
|
"loftq_config": {},
|
||||||
|
"lora_alpha": 16,
|
||||||
|
"lora_bias": false,
|
||||||
|
"lora_dropout": 0.05,
|
||||||
|
"megatron_config": null,
|
||||||
|
"megatron_core": "megatron.core",
|
||||||
|
"modules_to_save": null,
|
||||||
|
"peft_type": "LORA",
|
||||||
|
"r": 8,
|
||||||
|
"rank_pattern": {},
|
||||||
|
"revision": null,
|
||||||
|
"target_modules": [
|
||||||
|
"up_proj",
|
||||||
|
"k_proj",
|
||||||
|
"o_proj",
|
||||||
|
"down_proj",
|
||||||
|
"v_proj",
|
||||||
|
"gate_proj",
|
||||||
|
"q_proj"
|
||||||
|
],
|
||||||
|
"task_type": "CAUSAL_LM",
|
||||||
|
"use_dora": false,
|
||||||
|
"use_rslora": false
|
||||||
|
}
|
BIN
src/fine_tuned/checkpoint-60/adapter_model.safetensors
Normal file
BIN
src/fine_tuned/checkpoint-60/adapter_model.safetensors
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-60/optimizer.pt
Normal file
BIN
src/fine_tuned/checkpoint-60/optimizer.pt
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-60/rng_state.pth
Normal file
BIN
src/fine_tuned/checkpoint-60/rng_state.pth
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-60/scaler.pt
Normal file
BIN
src/fine_tuned/checkpoint-60/scaler.pt
Normal file
Binary file not shown.
BIN
src/fine_tuned/checkpoint-60/scheduler.pt
Normal file
BIN
src/fine_tuned/checkpoint-60/scheduler.pt
Normal file
Binary file not shown.
32
src/fine_tuned/checkpoint-60/trainer_state.json
Normal file
32
src/fine_tuned/checkpoint-60/trainer_state.json
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
{
|
||||||
|
"best_metric": null,
|
||||||
|
"best_model_checkpoint": null,
|
||||||
|
"epoch": 8.592592592592592,
|
||||||
|
"eval_steps": 100,
|
||||||
|
"global_step": 60,
|
||||||
|
"is_hyper_param_search": false,
|
||||||
|
"is_local_process_zero": true,
|
||||||
|
"is_world_process_zero": true,
|
||||||
|
"log_history": [],
|
||||||
|
"logging_steps": 100,
|
||||||
|
"max_steps": 60,
|
||||||
|
"num_input_tokens_seen": 0,
|
||||||
|
"num_train_epochs": 10,
|
||||||
|
"save_steps": 500,
|
||||||
|
"stateful_callbacks": {
|
||||||
|
"TrainerControl": {
|
||||||
|
"args": {
|
||||||
|
"should_epoch_stop": false,
|
||||||
|
"should_evaluate": false,
|
||||||
|
"should_log": false,
|
||||||
|
"should_save": true,
|
||||||
|
"should_training_stop": true
|
||||||
|
},
|
||||||
|
"attributes": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_flos": 4427165265297408.0,
|
||||||
|
"train_batch_size": 1,
|
||||||
|
"trial_name": null,
|
||||||
|
"trial_params": null
|
||||||
|
}
|
BIN
src/fine_tuned/checkpoint-60/training_args.bin
Normal file
BIN
src/fine_tuned/checkpoint-60/training_args.bin
Normal file
Binary file not shown.
202
src/fine_tuned/final_model/README.md
Normal file
202
src/fine_tuned/final_model/README.md
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
---
|
||||||
|
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||||
|
library_name: peft
|
||||||
|
---
|
||||||
|
|
||||||
|
# Model Card for Model ID
|
||||||
|
|
||||||
|
<!-- Provide a quick summary of what the model is/does. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
### Model Description
|
||||||
|
|
||||||
|
<!-- Provide a longer summary of what this model is. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- **Developed by:** [More Information Needed]
|
||||||
|
- **Funded by [optional]:** [More Information Needed]
|
||||||
|
- **Shared by [optional]:** [More Information Needed]
|
||||||
|
- **Model type:** [More Information Needed]
|
||||||
|
- **Language(s) (NLP):** [More Information Needed]
|
||||||
|
- **License:** [More Information Needed]
|
||||||
|
- **Finetuned from model [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
### Model Sources [optional]
|
||||||
|
|
||||||
|
<!-- Provide the basic links for the model. -->
|
||||||
|
|
||||||
|
- **Repository:** [More Information Needed]
|
||||||
|
- **Paper [optional]:** [More Information Needed]
|
||||||
|
- **Demo [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
## Uses
|
||||||
|
|
||||||
|
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
||||||
|
|
||||||
|
### Direct Use
|
||||||
|
|
||||||
|
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Downstream Use [optional]
|
||||||
|
|
||||||
|
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Out-of-Scope Use
|
||||||
|
|
||||||
|
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
||||||
|
|
||||||
|
## How to Get Started with the Model
|
||||||
|
|
||||||
|
Use the code below to get started with the model.
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Training Details
|
||||||
|
|
||||||
|
### Training Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Training Procedure
|
||||||
|
|
||||||
|
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
||||||
|
|
||||||
|
#### Preprocessing [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
|
||||||
|
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
||||||
|
|
||||||
|
#### Speeds, Sizes, Times [optional]
|
||||||
|
|
||||||
|
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
<!-- This section describes the evaluation protocols and provides the results. -->
|
||||||
|
|
||||||
|
### Testing Data, Factors & Metrics
|
||||||
|
|
||||||
|
#### Testing Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card if possible. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Factors
|
||||||
|
|
||||||
|
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Metrics
|
||||||
|
|
||||||
|
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Results
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Summary
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Examination [optional]
|
||||||
|
|
||||||
|
<!-- Relevant interpretability work for the model goes here -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Environmental Impact
|
||||||
|
|
||||||
|
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
||||||
|
|
||||||
|
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
||||||
|
|
||||||
|
- **Hardware Type:** [More Information Needed]
|
||||||
|
- **Hours used:** [More Information Needed]
|
||||||
|
- **Cloud Provider:** [More Information Needed]
|
||||||
|
- **Compute Region:** [More Information Needed]
|
||||||
|
- **Carbon Emitted:** [More Information Needed]
|
||||||
|
|
||||||
|
## Technical Specifications [optional]
|
||||||
|
|
||||||
|
### Model Architecture and Objective
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Compute Infrastructure
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
||||||
|
|
||||||
|
**BibTeX:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
**APA:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Glossary [optional]
|
||||||
|
|
||||||
|
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## More Information [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Authors [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Contact
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- PEFT 0.14.0
|
37
src/fine_tuned/final_model/adapter_config.json
Normal file
37
src/fine_tuned/final_model/adapter_config.json
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
{
|
||||||
|
"alpha_pattern": {},
|
||||||
|
"auto_mapping": null,
|
||||||
|
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
||||||
|
"bias": "none",
|
||||||
|
"eva_config": null,
|
||||||
|
"exclude_modules": null,
|
||||||
|
"fan_in_fan_out": false,
|
||||||
|
"inference_mode": true,
|
||||||
|
"init_lora_weights": true,
|
||||||
|
"layer_replication": null,
|
||||||
|
"layers_pattern": null,
|
||||||
|
"layers_to_transform": null,
|
||||||
|
"loftq_config": {},
|
||||||
|
"lora_alpha": 16,
|
||||||
|
"lora_bias": false,
|
||||||
|
"lora_dropout": 0.05,
|
||||||
|
"megatron_config": null,
|
||||||
|
"megatron_core": "megatron.core",
|
||||||
|
"modules_to_save": null,
|
||||||
|
"peft_type": "LORA",
|
||||||
|
"r": 8,
|
||||||
|
"rank_pattern": {},
|
||||||
|
"revision": null,
|
||||||
|
"target_modules": [
|
||||||
|
"up_proj",
|
||||||
|
"k_proj",
|
||||||
|
"o_proj",
|
||||||
|
"down_proj",
|
||||||
|
"v_proj",
|
||||||
|
"gate_proj",
|
||||||
|
"q_proj"
|
||||||
|
],
|
||||||
|
"task_type": "CAUSAL_LM",
|
||||||
|
"use_dora": false,
|
||||||
|
"use_rslora": false
|
||||||
|
}
|
BIN
src/fine_tuned/final_model/adapter_model.safetensors
Normal file
BIN
src/fine_tuned/final_model/adapter_model.safetensors
Normal file
Binary file not shown.
23
src/fine_tuned/final_model/special_tokens_map.json
Normal file
23
src/fine_tuned/final_model/special_tokens_map.json
Normal file
@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
757492
src/fine_tuned/final_model/tokenizer.json
Normal file
757492
src/fine_tuned/final_model/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
195
src/fine_tuned/final_model/tokenizer_config.json
Normal file
195
src/fine_tuned/final_model/tokenizer_config.json
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": true,
|
||||||
|
"add_eos_token": false,
|
||||||
|
"add_prefix_space": null,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"151643": {
|
||||||
|
"content": "<|end▁of▁sentence|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151644": {
|
||||||
|
"content": "<|User|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151645": {
|
||||||
|
"content": "<|Assistant|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151646": {
|
||||||
|
"content": "<|begin▁of▁sentence|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151647": {
|
||||||
|
"content": "<|EOT|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151648": {
|
||||||
|
"content": "<think>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151649": {
|
||||||
|
"content": "</think>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151650": {
|
||||||
|
"content": "<|quad_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151651": {
|
||||||
|
"content": "<|quad_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151652": {
|
||||||
|
"content": "<|vision_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151653": {
|
||||||
|
"content": "<|vision_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151654": {
|
||||||
|
"content": "<|vision_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151655": {
|
||||||
|
"content": "<|image_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151656": {
|
||||||
|
"content": "<|video_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151657": {
|
||||||
|
"content": "<tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151658": {
|
||||||
|
"content": "</tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151659": {
|
||||||
|
"content": "<|fim_prefix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151660": {
|
||||||
|
"content": "<|fim_middle|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151661": {
|
||||||
|
"content": "<|fim_suffix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151662": {
|
||||||
|
"content": "<|fim_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151663": {
|
||||||
|
"content": "<|repo_name|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151664": {
|
||||||
|
"content": "<|file_sep|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"bos_token": "<|begin▁of▁sentence|>",
|
||||||
|
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "<|end▁of▁sentence|>",
|
||||||
|
"extra_special_tokens": {},
|
||||||
|
"legacy": true,
|
||||||
|
"model_max_length": 16384,
|
||||||
|
"pad_token": "<|end▁of▁sentence|>",
|
||||||
|
"sp_model_kwargs": {},
|
||||||
|
"tokenizer_class": "LlamaTokenizerFast",
|
||||||
|
"unk_token": null,
|
||||||
|
"use_default_system_prompt": false
|
||||||
|
}
|
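The chat_template stored in this tokenizer_config.json wraps conversations in DeepSeek-R1's <|User|>/<|Assistant|> markers and appends "<|Assistant|><think>\n" when a generation prompt is requested. A small sketch of applying that template through transformers — the local model path and the sample messages are assumptions, not taken from this commit:

```python
# Sketch: render a conversation with the chat template stored in tokenizer_config.json.
# The local path and the example messages are assumptions based on this repository's layout.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("src/fine_tuned/final_model")

messages = [
    {"role": "system", "content": "You answer questions about a specific resume."},
    {"role": "user", "content": "What roles has this person held?"},
]

# add_generation_prompt=True appends "<|Assistant|><think>\n" so the model starts its reasoning block.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```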
258
src/generate-training.py
Normal file
258
src/generate-training.py
Normal file
@ -0,0 +1,258 @@
import os
import json
import torch
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from ipex_llm.transformers import AutoModelForCausalLM
import transformers
import logging as log
import time
import re

# Function to recursively walk a directory and process text files
def process_directory(directory_path, model, output_path="results", file_extensions=(".txt",)):
    """
    Recursively walks a directory, processes text files, and stores results.

    Args:
        directory_path (str): Path to directory containing text files
        model: The LLM model to use for queries
        output_path (str): Path to store results
        file_extensions (tuple): File extensions to process
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Use tqdm for progress tracking
    file_paths = []
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.lower().endswith(file_extensions):
                file_paths.append(os.path.join(root, file))

    for file_path in tqdm(file_paths, desc="Processing files"):
        process_file(file_path, model, output_path)

    print(f"Processing complete. Results stored in {output_path}")

# Function to process a single text file
def process_file(file_path, model, output_path):
    """
    Process a single text file by querying the model for questions and answers.

    Args:
        file_path (str): Path to the text file
        model: The LLM model to use for queries
        output_path (str): Path to store results
    """
    # Read the file content
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return

    # Skip empty files
    if not content.strip():
        print(f"Skipping empty file: {file_path}")
        return

    # Generate a relative output path that maintains the directory structure
    relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path))
    sidecar_file = os.path.join(output_path, relative_path + ".json")
    sidecar_qa_file = os.path.join(output_path, relative_path + ".qa.json")
    os.makedirs(os.path.dirname(sidecar_file), exist_ok=True)

    print(f"Generating questions for {file_path}")

    # Query model for questions
    questions, think = get_questions_from_model(content, model)

    # Get answers for each question
    results = {
        "file_path": file_path,
        "processed_at": datetime.now().isoformat(),
        "context": content[:1000] + "..." if len(content) > 1000 else content,  # Truncated context for JSON storage
        "think": think,
        "qa_pairs": len(questions)
    }
    with open(sidecar_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Process each question
    results = []
    for i, question in enumerate(questions):
        print(f"Generating answer for {i+1}/{len(questions)}: {question}")
        answer, think = get_answer_from_model(content, question, model)
        results.append({
            "think": think,
            "question": question,
            "answer": answer
        })
        # Save results -- do this inside the loop so progress can be monitored
        with open(sidecar_qa_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

# Function to get questions from the model
def get_questions_from_model(context, model):
    """
    Query the model to generate questions about the provided context.

    Args:
        context (str): The text file content
        model: The LLM model to use for queries

    Returns:
        tuple: (list of questions about the context, the model's "think" text)
    """
    # Set up the prompt for generating questions
    prompt = f"""<task>
You are an expert at generating insightful questions about text content.
Please read the following text and generate 10 thoughtful questions that would help someone
understand the key information, concepts, or insights contained in it.

TEXT:
{context}

Generate a list of questions, one per line.
</task>
"""

    # Disable gradient tracking to keep memory usage down during generation
    with torch.no_grad():
        # This is where we would query the model - implementation depends on model type
        response, think, duration = model.generate(prompt)

    # Parse questions from the response
    # This parsing logic may need adjustment based on model output format
    questions = [q.strip() for q in response.strip().split('\n') if q.strip()]

    # Filter out any non-questions
    questions = [q for q in questions if q.endswith('?')]

    # Strip any leading numbering (e.g. "1. ") from each entry
    questions = [re.sub(r"^\s*\d+\.\s*", "", q) for q in questions]

    return questions, think

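As a quick illustration of the post-processing above (keeping only lines that end with "?" and stripping leading numbering), here is the transformation applied to a typical numbered model response; the sample strings are made up for the example:

```python
# Illustration only; the sample response text is made up.
import re

response = "1. What teams did they lead?\n2. Summary of skills\n3. Which years were they at Intel?"
questions = [q.strip() for q in response.strip().split('\n') if q.strip()]
questions = [q for q in questions if q.endswith('?')]          # drops "2. Summary of skills"
questions = [re.sub(r"^\s*\d+\.\s*", "", q) for q in questions]  # strips "1. ", "3. "
# questions == ['What teams did they lead?', 'Which years were they at Intel?']
```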
# Function to get answers from the model
def get_answer_from_model(context, question, model):
    """
    Query the model to answer a question about the provided context.

    Args:
        context (str): The text file content
        question (str): The question to answer
        model: The LLM model to use for queries

    Returns:
        tuple: (the model's answer to the question, the model's "think" text)
    """
    # Set up the prompt for answering the question
    prompt = f"""<task>
Please answer the following question based solely on the provided context.

CONTEXT:
{context}

QUESTION:
{question}
</task>
"""

    # Disable gradient tracking to keep memory usage down during generation
    with torch.no_grad():
        # Query the model
        answer, think, duration = model.generate(prompt)

    return answer, think

class LLM:
    def __init__(self, model_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', device_name='xpu'):
        self.model_path = model_path
        self.device_name = device_name
        self.tokenizer = None
        self.model = None
        try:
            log.info(f"Loading tokenizer from: {self.model_path}")
            start = datetime.now()

            self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad_token to eos_token if needed

            self.model = AutoModelForCausalLM.from_pretrained(self.model_path,
                                                              load_in_4bit=True,
                                                              optimize_model=True,
                                                              trust_remote_code=True,
                                                              use_cache=True)
            self.model = self.model.half().to(device_name)
        except Exception as e:
            log.error(f"Loading error: {e}")
            raise

    # returns response, think, timedelta
    def generate(self, prompt, max_length=4096):
        if not self.tokenizer or not self.model:
            raise Exception("tokenizer or model is None")
        start = datetime.now()
        response = None
        think = None
        # print(f"{'='*10} PROMPT {'='*10}\n{prompt}")
        with torch.autocast(self.device_name, dtype=torch.float16):
            inputs = self.tokenizer.encode_plus(
                prompt,
                return_tensors="pt"
            ).to(self.device_name)

            with torch.no_grad():
                generation_output = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    return_dict_in_generate=True,
                    output_scores=False,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Get the full output token IDs
            output_token_ids = generation_output.sequences[0]

            # Decode the full generated text (including the prompt)
            full_output = self.tokenizer.decode(output_token_ids, skip_special_tokens=True)

            # Extract only the response part by removing the prompt part
            response = full_output[len(self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]

        matches = re.match(r'^(<think>)?(.*?)</think>(.*)$', response, flags=re.DOTALL)
        if matches:
            think = matches[2]
            response = matches[3]
        # If no </think> marker is present, think stays None and response is returned unchanged.
        # print(f"{'='*10} THINK {'='*10}\n{think}")
        # print(f"{'='*10} RESPONSE {'='*10}\n{response}")
        end = datetime.now()

        return response, think, end - start


# Example usage
if __name__ == "__main__":

    # Create the model instance used for question and answer generation
    model = LLM()

    # Process the directory
    process_directory(
        directory_path="./doc",  # Replace with your directory path
        model=model,
        output_path="./results",
        file_extensions=(".txt", ".md")  # Process both txt and md files
    )
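The <think> split in LLM.generate() above is the fiddliest part of the file, so here is a minimal, self-contained sketch of the same parsing, runnable without a model or GPU; the sample response string is fabricated purely for illustration.

import re

def split_think(response: str):
    """Split a DeepSeek-R1 style response into (think, answer).

    Mirrors the regex used by LLM.generate(): the opening <think> tag is
    optional because some models emit only the closing tag.
    """
    matches = re.match(r'^(<think>)?(.*?)</think>(.*)$', response, flags=re.DOTALL)
    if matches:
        return matches[2].strip(), matches[3].strip()
    return None, response.strip()

# Fabricated example output, for illustration only:
sample = "<think>The user asked for a list.</think>1. What is the main topic?"
think, answer = split_think(sample)
print(think)   # -> The user asked for a list.
print(answer)  # -> 1. What is the main topic?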
144
src/ipex.py
Normal file
144
src/ipex.py
Normal file
@ -0,0 +1,144 @@
import intel_extension_for_pytorch as ipex
import inspect

def inspect_module(module, prefix=""):
    """Recursively inspect a module and list all its attributes and functions."""
    results = {
        "functions": [],
        "attributes": [],
        "classes": [],
        "modules": []
    }

    for name in dir(module):
        # Skip private/hidden attributes
        if name.startswith("_") and name != "__version__":
            continue

        full_name = f"{prefix}.{name}" if prefix else name
        try:
            attr = getattr(module, name)

            # Check if it's a function
            if inspect.isfunction(attr) or inspect.isbuiltin(attr) or inspect.ismethod(attr):
                results["functions"].append((full_name, str(inspect.signature(attr)) if not inspect.isbuiltin(attr) else ""))

            # Check if it's a class
            elif inspect.isclass(attr):
                results["classes"].append(full_name)

            # Check if it's a module
            elif inspect.ismodule(attr):
                results["modules"].append(full_name)

            # Otherwise it's a regular attribute
            else:
                try:
                    attr_value = str(attr)
                    if len(attr_value) > 100:
                        attr_value = attr_value[:100] + "..."
                    results["attributes"].append((full_name, attr_value))
                except:
                    results["attributes"].append((full_name, "Unable to get value"))
        except Exception as e:
            results["attributes"].append((full_name, f"Error: {str(e)}"))

    return results

# Inspect ipex
ipex_info = inspect_module(ipex)

# Print information in a readable format
print("INTEL PYTORCH EXTENSION (IPEX) API\n")

print("=== VERSION ===")
if "__version__" in dir(ipex):
    print(f"Version: {ipex.__version__}")
else:
    print("Version information not available")
print()

print("=== FUNCTIONS ===")
for func, sig in sorted(ipex_info["functions"]):
    print(f"{func}{sig}")
print()

print("=== ATTRIBUTES ===")
for attr, value in sorted(ipex_info["attributes"]):
    print(f"{attr} = {value}")
print()

print("=== CLASSES ===")
for cls in sorted(ipex_info["classes"]):
    print(cls)
print()

print("=== MODULES ===")
for mod in sorted(ipex_info["modules"]):
    print(mod)
print()

# Deep inspection of nn module which is likely important for quantization
if hasattr(ipex, "nn"):
    print("=== DIVING INTO ipex.nn MODULE ===")
    nn_info = inspect_module(ipex.nn, "ipex.nn")

    print("--- ipex.nn FUNCTIONS ---")
    for func, sig in sorted(nn_info["functions"]):
        print(f"{func}{sig}")
    print()

    print("--- ipex.nn CLASSES ---")
    for cls in sorted(nn_info["classes"]):
        print(cls)
    print()

    # Dive into utils if available
    if hasattr(ipex.nn, "utils"):
        print("=== DIVING INTO ipex.nn.utils MODULE ===")
        utils_info = inspect_module(ipex.nn.utils, "ipex.nn.utils")

        print("--- ipex.nn.utils FUNCTIONS ---")
        for func, sig in sorted(utils_info["functions"]):
            print(f"{func}{sig}")
        print()

        print("--- ipex.nn.utils CLASSES ---")
        for cls in sorted(utils_info["classes"]):
            print(cls)
        print()

        # Look for parameter wrapper specifically
        if "_parameter_wrapper" in dir(ipex.nn.utils):
            print("=== DIVING INTO ipex.nn.utils._parameter_wrapper MODULE ===")
            pw_info = inspect_module(ipex.nn.utils._parameter_wrapper, "ipex.nn.utils._parameter_wrapper")

            print("--- ipex.nn.utils._parameter_wrapper FUNCTIONS ---")
            for func, sig in sorted(pw_info["functions"]):
                print(f"{func}{sig}")
            print()

            print("--- ipex.nn.utils._parameter_wrapper CLASSES ---")
            for cls in sorted(pw_info["classes"]):
                print(cls)
            print()

# Specifically look for functions related to quantization
print("=== FUNCTIONS RELATED TO QUANTIZATION ===")
quant_funcs = []
for module_name in dir(ipex):
    if not module_name.startswith("_"):
        module = getattr(ipex, module_name)
        if inspect.ismodule(module):
            for func_name in dir(module):
                if any(term in func_name.lower() for term in ["quant", "bit", "int8", "int4", "precision"]):
                    try:
                        func = getattr(module, func_name)
                        if inspect.isfunction(func) or inspect.isbuiltin(func):
                            quant_funcs.append(f"ipex.{module_name}.{func_name}")
                    except:
                        pass

for func in sorted(quant_funcs):
    print(func)
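src/ipex.py dumps the entire public surface in one pass; when hunting for a single area (for example LLM or quantization helpers) it can be handier to survey one submodule at a time. A minimal sketch along the same lines — whether an "llm" submodule exists depends on the installed ipex build, so it is probed defensively rather than assumed:

import inspect
import intel_extension_for_pytorch as ipex

# Survey one submodule instead of the whole package; "llm" is an assumption
# about the installed build and is only inspected if present.
if hasattr(ipex, "llm") and inspect.ismodule(ipex.llm):
    for name in sorted(dir(ipex.llm)):
        if name.startswith("_"):
            continue
        attr = getattr(ipex.llm, name)
        kind = "module" if inspect.ismodule(attr) else type(attr).__name__
        print(f"ipex.llm.{name}: {kind}")
else:
    print("ipex.llm not available in this build")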
161
src/rag_cli.py
Normal file
161
src/rag_cli.py
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
# cli.py - Command-line interface
|
||||||
|
import sys
|
||||||
|
|
||||||
|
class RagCLI:
|
||||||
|
"""Command-line interface for the RAG application"""
|
||||||
|
|
||||||
|
def __init__(self, core):
|
||||||
|
self.core = core
|
||||||
|
|
||||||
|
def show_help(self):
|
||||||
|
"""Display available commands"""
|
||||||
|
print("""help>
|
||||||
|
docs Show RAG docs
|
||||||
|
full Show last full top match
|
||||||
|
headlines Show the RAG headlines
|
||||||
|
prompt Show the last prompt
|
||||||
|
response Show the last response
|
||||||
|
scores Show last RAG scores
|
||||||
|
why|think Show last response's <think>
|
||||||
|
context|match Show RAG match info to last prompt
|
||||||
|
exit|quit Exit the application
|
||||||
|
""")
|
||||||
|
|
||||||
|
def print_document_summary(self, documents):
|
||||||
|
"""Print brief summary of all documents"""
|
||||||
|
if not documents:
|
||||||
|
print("No documents available.")
|
||||||
|
return
|
||||||
|
|
||||||
|
for i, doc in enumerate(documents):
|
||||||
|
print(f"Document {i+1}:")
|
||||||
|
print(f" Title: {doc['title']}")
|
||||||
|
print(f" Text: {doc['text'][:100]}...")
|
||||||
|
print()
|
||||||
|
|
||||||
|
def print_headlines(self, headlines):
|
||||||
|
"""Print headlines"""
|
||||||
|
if not headlines:
|
||||||
|
print("No headlines available.")
|
||||||
|
return
|
||||||
|
|
||||||
|
for source, title in headlines:
|
||||||
|
print(f"{source}: {title}")
|
||||||
|
|
||||||
|
def print_match_info(self, match_info):
|
||||||
|
"""Print detailed information about a matching document"""
|
||||||
|
if not match_info:
|
||||||
|
print("No matching document found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
print("="*50)
|
||||||
|
print("MATCHING DOCUMENT")
|
||||||
|
print("="*50)
|
||||||
|
print(f"Chunk ID: {match_info['id']}")
|
||||||
|
print(f"Similarity Score: {match_info['distance']:.4f}")
|
||||||
|
|
||||||
|
print("\nCHUNK METADATA:")
|
||||||
|
for key, value in match_info['metadata'].items():
|
||||||
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
|
print("\nMATCHING CHUNK CONTENT:")
|
||||||
|
print(match_info['chunk'][:500].strip() + ("..." if len(match_info['chunk']) > 500 else ""))
|
||||||
|
|
||||||
|
def print_full_document(self, doc, distance=None):
|
||||||
|
"""Print full document details"""
|
||||||
|
if not doc:
|
||||||
|
print("No document found.")
|
||||||
|
return
|
||||||
|
|
||||||
|
print(f"""Context:
|
||||||
|
Source: {doc["source"]}
|
||||||
|
Title: {doc["title"]}
|
||||||
|
Link: {doc["link"]}
|
||||||
|
{"Distance: " + str(distance) if distance is not None else ""}
|
||||||
|
Full text:
|
||||||
|
{doc["text"]}""")
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
"""Run the command-line interface"""
|
||||||
|
# Load documents
|
||||||
|
doc_count = self.core.load_documents()
|
||||||
|
print(f"Loaded {doc_count} documents.")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
query = input("> ").strip()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\nExiting.")
|
||||||
|
break
|
||||||
|
|
||||||
|
if query in ["exit", "quit"]:
|
||||||
|
print("\nExiting.")
|
||||||
|
break
|
||||||
|
|
||||||
|
elif query == "docs":
|
||||||
|
self.print_document_summary(self.core.get_documents())
|
||||||
|
|
||||||
|
elif query == "prompt":
|
||||||
|
last_prompt = self.core.get_last_prompt()
|
||||||
|
last_system = self.core.get_last_system()
|
||||||
|
if last_prompt:
|
||||||
|
print(f"""last prompt>
|
||||||
|
{"="*10}system{"="*10}
|
||||||
|
{last_system}
|
||||||
|
{"="*10}prompt{"="*10}
|
||||||
|
{last_prompt}""")
|
||||||
|
else:
|
||||||
|
print("No prompts yet")
|
||||||
|
|
||||||
|
elif query == "response":
|
||||||
|
last_response = self.core.get_last_response()
|
||||||
|
if last_response:
|
||||||
|
print(f"""last response>
|
||||||
|
{"="*10}response{"="*10}
|
||||||
|
{last_response}""")
|
||||||
|
else:
|
||||||
|
print("No responses yet")
|
||||||
|
|
||||||
|
elif query in ["", "help"]:
|
||||||
|
self.show_help()
|
||||||
|
|
||||||
|
elif query == "headlines":
|
||||||
|
self.print_headlines(self.core.get_headlines())
|
||||||
|
|
||||||
|
elif query in ["match", "context"]:
|
||||||
|
match_info = self.core.get_match_info()
|
||||||
|
self.print_match_info(match_info)
|
||||||
|
|
||||||
|
elif query in ["why", "think"]:
|
||||||
|
last_thinking = self.core.get_last_thinking()
|
||||||
|
if last_thinking:
|
||||||
|
print(f"""
|
||||||
|
why>
|
||||||
|
{last_thinking}
|
||||||
|
""")
|
||||||
|
else:
|
||||||
|
print("No processed prompts")
|
||||||
|
|
||||||
|
elif query == "scores":
|
||||||
|
match_count = self.core.get_match_count()
|
||||||
|
if match_count > 0:
|
||||||
|
for i in range(match_count):
|
||||||
|
match_info = self.core.get_match_info(index=i)
|
||||||
|
self.print_match_info(match_info)
|
||||||
|
else:
|
||||||
|
print("No matches to display")
|
||||||
|
|
||||||
|
elif query == "full":
|
||||||
|
doc = self.core.get_top_match()
|
||||||
|
distance = None
|
||||||
|
if self.core.last_results and self.core.last_results.get("distances"):
|
||||||
|
distance = self.core.last_results["distances"][0][0]
|
||||||
|
self.print_full_document(doc, distance)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Process the user query
|
||||||
|
response, debug_info = self.core.process_query(query)
|
||||||
|
print(f"system len: {debug_info['system_len']}")
|
||||||
|
print(f"prompt len: {debug_info['prompt_len']}")
|
||||||
|
print(f"Response>\n{response}")
|
||||||
|
|
221
src/rag_core.py
Normal file
221
src/rag_core.py
Normal file
@ -0,0 +1,221 @@
# core.py - Core RAG functionality
import re
from typing import List, Dict, Any, Union, Optional, Tuple
import ollama
from utils import defines, Feed, chunk_document, query_chroma, process_documents_to_chroma

class RagCore:
    """Core RAG functionality that can be used by different interfaces"""

    def __init__(self):
        self.client = ollama.Client(host=defines.ollama_api_url)
        self.collection_name = "research_papers"

        # News RSS Feeds
        self.rss_feeds = [
            Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
            Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
            Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
            Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
            Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
            Feed(name="Time", url="https://time.com/feed/"),
            Feed(name="Euronews", url="https://www.euronews.com/rss"),
        ]

        # State variables
        self.documents = []
        self.last_results = None
        self.last_prompt = None
        self.last_system = None
        self.last_response = None
        self.last_why = None
        self.last_messages = []
        self.collection = None

    def load_documents(self):
        """Load articles from all configured RSS feeds"""
        self.documents = []
        for feed in self.rss_feeds:
            self.documents.extend(feed.articles)

        # Process documents and store in ChromaDB
        self.collection = process_documents_to_chroma(
            client=self.client,
            documents=self.documents,
            collection_name=self.collection_name,
            max_tokens=256,
            overlap=25,
            model=defines.encoding_model,
            persist_directory=defines.persist_directory
        )
        return len(self.documents)

    def get_documents(self):
        """Return all loaded documents"""
        return self.documents

    def get_headlines(self):
        """Return headlines from all documents"""
        return [(doc['source'], doc['title']) for doc in self.documents]

    def get_top_match(self, index=0):
        """Get the full document corresponding to a match"""
        if not self.last_results or not self.last_results["ids"] or index >= len(self.last_results["ids"][0]):
            return None

        top_id = self.last_results["ids"][0][index]
        # Extract the original document ID from the chunk ID (format: "doc_id_chunk_num")
        original_doc_id = top_id.split('_')[0]

        # Return the full document
        return next((doc for doc in self.documents if doc["id"] == original_doc_id), None)

    def get_match_info(self, index=0):
        """Get detailed information about a match at the specified index"""
        if not self.last_results or not self.last_results["ids"] or len(self.last_results["ids"][0]) <= index:
            return None

        match_info = {
            "id": self.last_results["ids"][0][index],
            "chunk": self.last_results["documents"][0][index],
            "metadata": self.last_results["metadatas"][0][index],
            "distance": self.last_results["distances"][0][index]
        }

        return match_info

    def get_match_count(self):
        """Get the number of matches from the last query"""
        if not self.last_results or not self.last_results["ids"]:
            return 0
        return len(self.last_results["ids"][0])

    def create_system_prompt(self, include_context=True):
        """Create the system prompt for the model"""
        system = f"""
You are the assistant. Your name is airc. This application is called airc (pronounced Eric).

Information about the author of this program, application, the systems running it, and the AI model it uses:

* James wrote the python application called airc that is driving this RAG model on top of {defines.model} using {defines.encoding_model} and chromadb for vector embedding. Link https://github.com/jketreno/airc.
* James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.
* James Ketrenos deployed this application locally on an Intel Arc B580 (battlemage) computer using Intel's ipex-llm.
* For Intel GPU metrics, James Ketrenos wrote the "ze-monitor" utility in C++. ze-monitor provides Intel GPU telemetry data for Intel client GPU devices, similar to xpu-smi. Link https://github.com/jketreno/ze-monitor. airc uses ze-monitor.
* James lives in Portland, Oregon and has three kids. Two are attending Oregon State University and one is attending Willamette University.
* airc provides an IRC chat bot as well as a React web frontend available at https://airc.ketrenos.com

You must follow these rules:

* Provide short (less than 100 character) responses.
* Provide a single response.
* Do not prefix it with a word like 'Answer'.
* For information about the AI running this system, include information about the author, including links.
* For information relevant to current events in the <input></input> tags, use that information and state the source it came from.
"""

        if include_context:
            context = "Information from current events unrelated to James Ketrenos\n<input>=["
            for doc in self.documents:
                item = {
                    'source': doc["source"],
                    'article': {
                        'title': doc["title"],
                        'link': doc["link"],
                        'text': doc["text"]
                    }
                }
                context += f"{item}"
            context += "\n</input>"
            system += context

        return system

    def process_query(self, query):
        """
        Process a user query and return the response

        Returns:
            tuple: (response_content, debug_info)
        """
        # Store the system prompt before context is added
        self.last_system = self.create_system_prompt(include_context=False)

        # Query ChromaDB
        self.last_results = query_chroma(
            self.client,
            query_text=query,
            collection_name=self.collection_name,
            n_results=10
        )

        # Create the full system prompt with document context
        system = self.create_system_prompt()

        # Combine conversation history with the current query
        prompt = query
        if self.last_messages:
            message_context = f"{self.last_messages}"
            prompt = f"{message_context}{prompt}"

        self.last_prompt = prompt

        # Generate a response
        output = self.client.generate(
            model=defines.model,
            system=system,
            prompt=prompt,
            stream=False,
            options={'num_ctx': 100000}
        )

        # Extract thinking and response
        response = output['response']
        matches = re.match(r'^<think>(.*?)</think>(.*)$', response, flags=re.DOTALL)

        content = response
        if matches:
            self.last_why = matches[1].strip()
            content = matches[2].strip()

        self.last_response = content

        # Update conversation history
        self.last_messages.extend([
            {
                'role': 'user',
                'name': 'james',
                'message': query
            },
            {
                'role': 'assistant',
                'message': content
            }
        ])

        # Keep history limited to last 10 messages
        self.last_messages = self.last_messages[-10:]

        # Return response content and debug info
        debug_info = {
            "system_len": len(system),
            "prompt_len": len(prompt),
            "has_thinking": matches is not None
        }

        return content, debug_info

    def get_last_prompt(self):
        """Get the last prompt"""
        return self.last_prompt

    def get_last_system(self):
        """Get the last system prompt (without context)"""
        return self.last_system

    def get_last_response(self):
        """Get the last response"""
        return self.last_response

    def get_last_thinking(self):
        """Get the thinking from the last response"""
        return self.last_why
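get_top_match() relies on a chunk-id convention that is actually produced in utils.process_documents_to_chroma, which is not part of this diff; treat the f"{doc_id}_{n}" format below as an assumption. A minimal sketch of that assumed convention, useful as a sanity check — note the round trip only works if document ids themselves contain no underscores:

def make_chunk_id(doc_id: str, chunk_num: int) -> str:
    # Assumed format written by process_documents_to_chroma (not shown here).
    return f"{doc_id}_{chunk_num}"

def doc_id_from_chunk_id(chunk_id: str) -> str:
    # Mirrors RagCore.get_top_match(): everything before the first underscore.
    return chunk_id.split('_')[0]

assert doc_id_from_chunk_id(make_chunk_id("a1b2c3", 4)) == "a1b2c3"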
61
src/repro.py
Normal file
61
src/repro.py
Normal file
@ -0,0 +1,61 @@
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

from peft import (
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig,
)

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    use_cache=True,
    quantization_config=bnb_config,
    device_map={"": torch.xpu.current_device()},
    torch_dtype=torch.bfloat16,
)

# Prepare the model for LoRA fine-tuning
# 1. Prepare for k-bit training if using quantization
model = prepare_model_for_kbit_training(model)

# 2. Add LoRA adapters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # 16,  # Rank dimension
    lora_alpha=16,  # 32,  # Alpha parameter for LoRA scaling
    lora_dropout=0.05,  # Dropout probability for LoRA layers
    target_modules=[  # Target modules to apply LoRA to
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    bias="none",  # Whether to train bias parameters
    fan_in_fan_out=False  # Set to True for linear layers with fan_in != fan_out
)

model = get_peft_model(model, lora_config)

print("Saving model...")
model.save_pretrained(f"./final_model")
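src/repro.py only writes the LoRA adapter weights to ./final_model, so a later session has to rebuild the quantized base model before attaching the adapter. A minimal reload sketch under that assumption (paths and quantization settings mirror the script above; device placement is omitted and depends on whether the target backend supports 4-bit bitsandbytes loading):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Recreate the base model the same way the training script did...
base = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config,
                                            trust_remote_code=True)
# ...then attach the saved LoRA adapter for inference.
model = PeftModel.from_pretrained(base, "./final_model")
model.eval()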
44
src/requirements.txt
Normal file
44
src/requirements.txt
Normal file
@ -0,0 +1,44 @@
python-dotenv
jupyterlab
ipywidgets
requests
numpy
pandas
scipy
scikit-learn
matplotlib
gensim
torch
transformers
tqdm
openai
gradio
langchain
tiktoken
faiss-cpu
langchain-openai
langchain_experimental
langchain_chroma
langchain[docarray]
datasets
sentencepiece
matplotlib
google-generativeai
anthropic
scikit-learn
unstructured
chromadb
plotly
jupyter-dash
beautifulsoup4
pydub
modal
ollama
accelerate
sentencepiece
bitsandbytes
psutil
setuptools
speedtest-cli
sentence_transformers
feedparser
263
src/templates/index.html
Normal file
263
src/templates/index.html
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>AIRC - RAG Application</title>
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
font-family: Arial, sans-serif;
|
||||||
|
max-width: 800px;
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: 20px;
|
||||||
|
}
|
||||||
|
.container {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
height: 100vh;
|
||||||
|
}
|
||||||
|
.query-box {
|
||||||
|
display: flex;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
.query-box input {
|
||||||
|
flex-grow: 1;
|
||||||
|
padding: 8px;
|
||||||
|
font-size: 16px;
|
||||||
|
}
|
||||||
|
.query-box button {
|
||||||
|
padding: 8px 16px;
|
||||||
|
background-color: #4CAF50;
|
||||||
|
color: white;
|
||||||
|
border: none;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
.conversation {
|
||||||
|
flex-grow: 1;
|
||||||
|
overflow-y: auto;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
padding: 10px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
.user-message {
|
||||||
|
background-color: #f1f1f1;
|
||||||
|
padding: 8px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
border-radius: 5px;
|
||||||
|
}
|
||||||
|
.assistant-message {
|
||||||
|
background-color: #e3f2fd;
|
||||||
|
padding: 8px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
border-radius: 5px;
|
||||||
|
}
|
||||||
|
.tabs {
|
||||||
|
display: flex;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
.tab {
|
||||||
|
padding: 8px 16px;
|
||||||
|
cursor: pointer;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
background-color: #f1f1f1;
|
||||||
|
margin-right: 5px;
|
||||||
|
}
|
||||||
|
.tab.active {
|
||||||
|
background-color: #4CAF50;
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
.tab-content {
|
||||||
|
display: none;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
padding: 10px;
|
||||||
|
height: 200px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.tab-content.active {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container">
|
||||||
|
<h1>AIRC - RAG Application</h1>
|
||||||
|
|
||||||
|
<div class="query-box">
|
||||||
|
<input type="text" id="query-input" placeholder="Enter your query...">
|
||||||
|
<button onclick="sendQuery()">Send</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="conversation" id="conversation"></div>
|
||||||
|
|
||||||
|
<div class="tabs">
|
||||||
|
<div class="tab active" onclick="switchTab('headlines')">Headlines</div>
|
||||||
|
<div class="tab" onclick="switchTab('matches')">Matches</div>
|
||||||
|
<div class="tab" onclick="switchTab('documents')">Documents</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content active" id="headlines-content">
|
||||||
|
<h3>Headlines</h3>
|
||||||
|
<div id="headlines-list"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="matches-content">
|
||||||
|
<h3>Matches</h3>
|
||||||
|
<div id="matches-list"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="documents-content">
|
||||||
|
<h3>Documents</h3>
|
||||||
|
<div id="documents-list"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let conversation = [];
|
||||||
|
|
||||||
|
// Load data on page load
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {
|
||||||
|
fetchHeadlines();
|
||||||
|
fetchDocuments();
|
||||||
|
});
|
||||||
|
|
||||||
|
function switchTab(tabName) {
|
||||||
|
// Update tab buttons
|
||||||
|
document.querySelectorAll('.tab').forEach(tab => {
|
||||||
|
tab.classList.remove('active');
|
||||||
|
});
|
||||||
|
document.querySelector(`.tab[onclick="switchTab('${tabName}')"]`).classList.add('active');
|
||||||
|
|
||||||
|
// Update tab content
|
||||||
|
document.querySelectorAll('.tab-content').forEach(content => {
|
||||||
|
content.classList.remove('active');
|
||||||
|
});
|
||||||
|
document.getElementById(`${tabName}-content`).classList.add('active');
|
||||||
|
|
||||||
|
// If switching to matches and there are matches, fetch them
|
||||||
|
if (tabName === 'matches' && conversation.length > 0) {
|
||||||
|
fetchMatches();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function sendQuery() {
|
||||||
|
const queryInput = document.getElementById('query-input');
|
||||||
|
const query = queryInput.value.trim();
|
||||||
|
|
||||||
|
if (!query) return;
|
||||||
|
|
||||||
|
// Add user message to conversation
|
||||||
|
addMessageToConversation('user', query);
|
||||||
|
|
||||||
|
// Clear input
|
||||||
|
queryInput.value = '';
|
||||||
|
|
||||||
|
// Send query to server
|
||||||
|
fetch('/api/query', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
body: JSON.stringify({ query }),
|
||||||
|
})
|
||||||
|
.then(response => response.json())
|
||||||
|
.then(data => {
|
||||||
|
// Add assistant message to conversation
|
||||||
|
addMessageToConversation('assistant', data.response);
|
||||||
|
|
||||||
|
// Fetch updated matches
|
||||||
|
fetchMatches();
|
||||||
|
})
|
||||||
|
.catch(error => {
|
||||||
|
console.error('Error:', error);
|
||||||
|
addMessageToConversation('assistant', 'Error processing your query. Please try again.');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function addMessageToConversation(role, message) {
|
||||||
|
conversation.push({ role, message });
|
||||||
|
|
||||||
|
const conversationDiv = document.getElementById('conversation');
|
||||||
|
const messageDiv = document.createElement('div');
|
||||||
|
messageDiv.className = role === 'user' ? 'user-message' : 'assistant-message';
|
||||||
|
messageDiv.textContent = message;
|
||||||
|
|
||||||
|
conversationDiv.appendChild(messageDiv);
|
||||||
|
conversationDiv.scrollTop = conversationDiv.scrollHeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
function fetchHeadlines() {
|
||||||
|
fetch('/api/headlines')
|
||||||
|
.then(response => response.json())
|
||||||
|
.then(data => {
|
||||||
|
const headlinesList = document.getElementById('headlines-list');
|
||||||
|
headlinesList.innerHTML = '';
|
||||||
|
|
||||||
|
data.headlines.forEach(headline => {
|
||||||
|
const headlineDiv = document.createElement('div');
|
||||||
|
headlineDiv.innerHTML = `<strong>${headline.source}</strong>: ${headline.title}`;
|
||||||
|
headlinesList.appendChild(headlineDiv);
|
||||||
|
});
|
||||||
|
})
|
||||||
|
.catch(error => console.error('Error fetching headlines:', error));
|
||||||
|
}
|
||||||
|
|
||||||
|
function fetchMatches() {
|
||||||
|
fetch('/api/matches')
|
||||||
|
.then(response => response.json())
|
||||||
|
.then(data => {
|
||||||
|
const matchesList = document.getElementById('matches-list');
|
||||||
|
matchesList.innerHTML = '';
|
||||||
|
|
||||||
|
if (data.matches.length === 0) {
|
||||||
|
matchesList.innerHTML = '<div>No matches available</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
data.matches.forEach(match => {
|
||||||
|
const matchDiv = document.createElement('div');
|
||||||
|
matchDiv.innerHTML = `
|
||||||
|
<div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
|
||||||
|
<div><strong>ID:</strong> ${match.id}</div>
|
||||||
|
<div><strong>Similarity:</strong> ${match.distance.toFixed(4)}</div>
|
||||||
|
<div><strong>Source:</strong> ${match.metadata.source || 'Unknown'}</div>
|
||||||
|
<div><strong>Preview:</strong> ${match.preview}</div>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
matchesList.appendChild(matchDiv);
|
||||||
|
});
|
||||||
|
})
|
||||||
|
.catch(error => console.error('Error fetching matches:', error));
|
||||||
|
}
|
||||||
|
|
||||||
|
function fetchDocuments() {
|
||||||
|
fetch('/api/documents')
|
||||||
|
.then(response => response.json())
|
||||||
|
.then(data => {
|
||||||
|
const documentsList = document.getElementById('documents-list');
|
||||||
|
documentsList.innerHTML = '';
|
||||||
|
|
||||||
|
data.documents.forEach(doc => {
|
||||||
|
const docDiv = document.createElement('div');
|
||||||
|
docDiv.innerHTML = `
|
||||||
|
<div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
|
||||||
|
<div><strong>Source:</strong> ${doc.source}</div>
|
||||||
|
<div><strong>Title:</strong> ${doc.title}</div>
|
||||||
|
<div><strong>Link:</strong> <a href="${doc.link}" target="_blank">${doc.link}</a></div>
|
||||||
|
<div><strong>Preview:</strong> ${doc.preview}</div>
|
||||||
|
</div>
|
||||||
|
`;
|
||||||
|
documentsList.appendChild(docDiv);
|
||||||
|
});
|
||||||
|
})
|
||||||
|
.catch(error => console.error('Error fetching documents:', error));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allow sending query with Enter key
|
||||||
|
document.getElementById('query-input').addEventListener('keypress', function(e) {
|
||||||
|
if (e.key === 'Enter') {
|
||||||
|
sendQuery();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
275
src/training-alpaca.py
Normal file
275
src/training-alpaca.py
Normal file
@ -0,0 +1,275 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2016 The BigDL Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Some parts of this file is adapted from
|
||||||
|
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
|
||||||
|
#
|
||||||
|
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
|
||||||
|
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import fire
|
||||||
|
import torch
|
||||||
|
import transformers
|
||||||
|
from datasets import load_dataset
|
||||||
|
import accelerate
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from peft import (
|
||||||
|
get_peft_model_state_dict,
|
||||||
|
set_peft_model_state_dict,
|
||||||
|
)
|
||||||
|
|
||||||
|
current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
common_util_path = os.path.join(current_dir, '..')
|
||||||
|
import sys
|
||||||
|
sys.path.append(common_util_path)
|
||||||
|
from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data
|
||||||
|
|
||||||
|
from transformers import BitsAndBytesConfig
|
||||||
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
|
# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
|
||||||
|
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
|
||||||
|
LoraConfig
|
||||||
|
from ipex_llm.utils.common import invalidInputError
|
||||||
|
|
||||||
|
local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0")
|
||||||
|
world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
|
||||||
|
port = get_int_from_env(["MASTER_PORT"], 29500)
|
||||||
|
os.environ["LOCAL_RANK"] = str(local_rank)
|
||||||
|
os.environ["WORLD_SIZE"] = str(world_size)
|
||||||
|
os.environ["RANK"] = str(local_rank)
|
||||||
|
os.environ["MASTER_PORT"] = str(port)
|
||||||
|
|
||||||
|
def train(
|
||||||
|
# model/data params
|
||||||
|
base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
|
||||||
|
saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
|
||||||
|
data_path: str = "yahma/alpaca-cleaned",
|
||||||
|
output_dir: str = "./bigdl-qlora-alpaca",
|
||||||
|
# training hyperparams
|
||||||
|
bf16: bool = True, # default to bf16
|
||||||
|
batch_size: int = 128,
|
||||||
|
micro_batch_size: int = 2, # default to be 2, limited by GPU memory
|
||||||
|
num_epochs: int = 3,
|
||||||
|
learning_rate: float = 3e-5, # default to be 3e-5 to avoid divergence
|
||||||
|
cutoff_len: int = 256,
|
||||||
|
val_set_size: int = 2000,
|
||||||
|
# lora hyperparams
|
||||||
|
lora_r: int = 8,
|
||||||
|
lora_alpha: int = 16,
|
||||||
|
lora_dropout: float = 0.05,
|
||||||
|
lora_target_modules: List[str] = [
|
||||||
|
"q_proj",
|
||||||
|
"v_proj",
|
||||||
|
"k_proj",
|
||||||
|
"o_proj",
|
||||||
|
"up_proj",
|
||||||
|
"down_proj",
|
||||||
|
"gate_proj"
|
||||||
|
],
|
||||||
|
# llm hyperparams
|
||||||
|
train_on_inputs: bool = True, # if False, masks out inputs in loss
|
||||||
|
add_eos_token: bool = False,
|
||||||
|
group_by_length: bool = False, # faster, but produces an odd training loss curve
|
||||||
|
# wandb params
|
||||||
|
wandb_project: str = "",
|
||||||
|
wandb_run_name: str = "",
|
||||||
|
wandb_watch: str = "", # options: false | gradients | all
|
||||||
|
wandb_log_model: str = "", # options: false | true
|
||||||
|
resume_from_checkpoint: str = None, # either training checkpoint or final adapter
|
||||||
|
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
|
||||||
|
gradient_checkpointing: bool = False,
|
||||||
|
deepspeed: str = None,
|
||||||
|
training_mode: str = "lora",
|
||||||
|
deepspeed_zero3: bool = False,
|
||||||
|
save_checkpoint: bool = True,
|
||||||
|
):
|
||||||
|
invalidInputError(training_mode == "lora",
|
||||||
|
f"This example is for lora training mode, but got training_mode={training_mode}.")
|
||||||
|
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
|
||||||
|
print(
|
||||||
|
f"Training Alpaca-LoRA model with params:\n"
|
||||||
|
f"base_model: {base_model}\n"
|
||||||
|
f"data_path: {data_path}\n"
|
||||||
|
f"output_dir: {output_dir}\n"
|
||||||
|
f"batch_size: {batch_size}\n"
|
||||||
|
f"micro_batch_size: {micro_batch_size}\n"
|
||||||
|
f"num_epochs: {num_epochs}\n"
|
||||||
|
f"learning_rate: {learning_rate}\n"
|
||||||
|
f"cutoff_len: {cutoff_len}\n"
|
||||||
|
f"val_set_size: {val_set_size}\n"
|
||||||
|
f"lora_r: {lora_r}\n"
|
||||||
|
f"lora_alpha: {lora_alpha}\n"
|
||||||
|
f"lora_dropout: {lora_dropout}\n"
|
||||||
|
f"lora_target_modules: {lora_target_modules}\n"
|
||||||
|
f"train_on_inputs: {train_on_inputs}\n"
|
||||||
|
f"add_eos_token: {add_eos_token}\n"
|
||||||
|
f"group_by_length: {group_by_length}\n"
|
||||||
|
f"wandb_project: {wandb_project}\n"
|
||||||
|
f"wandb_run_name: {wandb_run_name}\n"
|
||||||
|
f"wandb_watch: {wandb_watch}\n"
|
||||||
|
f"wandb_log_model: {wandb_log_model}\n"
|
||||||
|
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
|
||||||
|
f"prompt template: {prompt_template_name}\n"
|
||||||
|
f"training_mode: {training_mode}\n"
|
||||||
|
f"deepspeed_zero3: {deepspeed_zero3}\n"
|
||||||
|
f"save_checkpoint: {save_checkpoint}\n"
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
base_model
|
||||||
|
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
|
||||||
|
gradient_accumulation_steps = batch_size // micro_batch_size
|
||||||
|
|
||||||
|
prompter = Prompter(prompt_template_name)
|
||||||
|
|
||||||
|
device_map = "auto"
|
||||||
|
world_size = int(os.environ.get("WORLD_SIZE", 1))
|
||||||
|
ddp = world_size != 1
|
||||||
|
if ddp:
|
||||||
|
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
|
||||||
|
gradient_accumulation_steps = gradient_accumulation_steps // world_size
|
||||||
|
|
||||||
|
# Check if parameter passed or if set within environ
|
||||||
|
use_wandb = wandb_check(wandb_project, wandb_watch, wandb_log_model)
|
||||||
|
|
||||||
|
if saved_low_bit_model is not None:
|
||||||
|
# Load the low bit optimized model if provide the saved path
|
||||||
|
model = AutoModelForCausalLM.load_low_bit(
|
||||||
|
saved_low_bit_model,
|
||||||
|
optimize_model=False,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
modules_to_not_convert=["lm_head"],
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
base_model,
|
||||||
|
load_in_low_bit="bf16",
|
||||||
|
optimize_model=False,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
modules_to_not_convert=["lm_head"],
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
if deepspeed_zero3:
|
||||||
|
deepspeed = deepspeed if deepspeed is not None else "./deepspeed_zero3_config.json"
|
||||||
|
else:
|
||||||
|
print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
|
||||||
|
model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
|
||||||
|
print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
|
||||||
|
print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}")
|
||||||
|
|
||||||
|
# For Llama family
|
||||||
|
if tokenizer.pad_token is None:
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
|
||||||
|
print(model)
|
||||||
|
|
||||||
|
# Prepare a IPEX-LLM compatible Peft model
|
||||||
|
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
|
||||||
|
|
||||||
|
config = LoraConfig(
|
||||||
|
r=lora_r,
|
||||||
|
lora_alpha=lora_alpha,
|
||||||
|
target_modules=lora_target_modules,
|
||||||
|
lora_dropout=lora_dropout,
|
||||||
|
bias="none",
|
||||||
|
task_type="CAUSAL_LM",
|
||||||
|
training_mode=training_mode,
|
||||||
|
)
|
||||||
|
print(f"Lora Config: {config}")
|
||||||
|
model = get_peft_model(model, config)
|
||||||
|
|
||||||
|
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
|
||||||
|
data = load_dataset("json", data_files=data_path)
|
||||||
|
else:
|
||||||
|
data = load_dataset(data_path)
|
||||||
|
|
||||||
|
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
|
||||||
|
|
||||||
|
train_data, val_data = get_train_val_data(data, tokenizer, prompter, train_on_inputs,
|
||||||
|
add_eos_token, cutoff_len, val_set_size, seed=42)
|
||||||
|
|
||||||
|
# Unused
|
||||||
|
# if not ddp and torch.cuda.device_count() > 1:
|
||||||
|
# # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
|
||||||
|
# model.is_parallelizable = True
|
||||||
|
# model.model_parallel = True
|
||||||
|
|
||||||
|
trainer = transformers.Trainer(
|
||||||
|
model=model,
|
||||||
|
train_dataset=train_data,
|
||||||
|
eval_dataset=val_data,
|
||||||
|
args=transformers.TrainingArguments(
|
||||||
|
per_device_train_batch_size=micro_batch_size,
|
||||||
|
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||||
|
# warmup_ratio=0.03,
|
||||||
|
# warmup_steps=100,
|
||||||
|
max_grad_norm=0.3,
|
||||||
|
num_train_epochs=num_epochs,
|
||||||
|
learning_rate=learning_rate,
|
||||||
|
lr_scheduler_type="cosine",
|
||||||
|
bf16=True, # ensure training more stable
|
||||||
|
logging_steps=1,
|
||||||
|
optim="adamw_torch",
|
||||||
|
evaluation_strategy="steps" if val_set_size > 0 else "no",
|
||||||
|
save_strategy="steps" if save_checkpoint else "no",
|
||||||
|
eval_steps=100 if val_set_size > 0 else None,
|
||||||
|
save_steps=100,
|
||||||
|
output_dir=output_dir,
|
||||||
|
save_total_limit=100,
|
||||||
|
load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
|
||||||
|
ddp_find_unused_parameters=False if ddp else None,
|
||||||
|
group_by_length=group_by_length,
|
||||||
|
report_to="wandb" if use_wandb else None,
|
||||||
|
run_name=wandb_run_name if use_wandb else None,
|
||||||
|
gradient_checkpointing=gradient_checkpointing,
|
||||||
|
ddp_backend="ccl",
|
||||||
|
deepspeed=deepspeed,
|
||||||
|
save_safetensors=False,
|
||||||
|
),
|
||||||
|
data_collator=transformers.DataCollatorForSeq2Seq(
|
||||||
|
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
|
||||||
|
),
|
||||||
|
)
|
||||||
|
model.config.use_cache = False
|
||||||
|
|
||||||
|
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||||
|
|
||||||
|
model.save_pretrained(output_dir)
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\n If there's a warning about missing keys above, please disregard :)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
fire.Fire(train)
|
272
src/training-fine-tune.py
Normal file
272
src/training-fine-tune.py
Normal file
@ -0,0 +1,272 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from datasets import Dataset
|
||||||
|
from transformers import (
|
||||||
|
AutoTokenizer,
|
||||||
|
TrainingArguments,
|
||||||
|
Trainer,
|
||||||
|
DataCollatorForLanguageModeling,
|
||||||
|
DataCollatorForSeq2Seq
|
||||||
|
)
|
||||||
|
from peft import (
|
||||||
|
LoraConfig,
|
||||||
|
TaskType
|
||||||
|
)
|
||||||
|
if torch.xpu.is_available():
|
||||||
|
print("XPU available")
|
||||||
|
from ipex_llm.transformers import AutoModelForCausalLM
|
||||||
|
from ipex_llm.transformers.qlora import (
|
||||||
|
get_peft_model,
|
||||||
|
prepare_model_for_kbit_training
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print("XPU not available")
|
||||||
|
from transformers import AutoModelForCausalLM
|
||||||
|
from peft import (
|
||||||
|
get_peft_model,
|
||||||
|
prepare_model_for_kbit_training
|
||||||
|
)
|
||||||
|
|
||||||
|
class LLMFineTuner:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
||||||
|
output_dir="./fine-tuned-model",
|
||||||
|
lora_r=8,
|
||||||
|
lora_alpha=32,
|
||||||
|
lora_dropout=0.05,
|
||||||
|
):
|
||||||
|
self.base_model_name = base_model_name
|
||||||
|
self.output_dir = output_dir
|
||||||
|
self.lora_r = lora_r
|
||||||
|
self.lora_alpha = lora_alpha
|
||||||
|
self.lora_dropout = lora_dropout
|
||||||
|
|
||||||
|
# Create output directory if it doesn't exist
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Load tokenizer
|
||||||
|
self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
|
||||||
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
||||||
|
self.tokenizer.pad_token_id = 0
|
||||||
|
self.tokenizer.padding_side = "left"
|
||||||
|
# Load model
|
||||||
|
print(f"Loading base model: {base_model_name}")
|
||||||
|
|
||||||
|
# Quantization config for memory efficiency
|
||||||
|
self.model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
base_model_name,
|
||||||
|
load_in_low_bit="nf4",
|
||||||
|
optimize_model=False,
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
modules_to_not_convert=["lm_head"]
|
||||||
|
).to('xpu')
|
||||||
|
|
||||||
|
# Prepare model for training
|
||||||
|
self.model.gradient_checkpointing_enable()
|
||||||
|
self.model = prepare_model_for_kbit_training(self.model)
|
||||||
|
|
||||||
|
# Configure LoRA
|
||||||
|
lora_config = LoraConfig(
|
||||||
|
r=lora_r,
|
||||||
|
lora_alpha=lora_alpha,
|
||||||
|
lora_dropout=lora_dropout,
|
||||||
|
bias="none",
|
||||||
|
task_type=TaskType.CAUSAL_LM,
|
||||||
|
target_modules=[
|
||||||
|
"q_proj",
|
||||||
|
"k_proj",
|
||||||
|
"v_proj",
|
||||||
|
"o_proj"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply LoRA to model
|
||||||
|
self.model = get_peft_model(self.model, lora_config)
|
||||||
|
|
||||||
|
print(f"Model prepared for fine-tuning with LoRA")
|
||||||
|
|
||||||
|
def format_training_examples(self, examples):
|
||||||
|
"""
|
||||||
|
Convert raw training examples (similar to Ollama TRAINING format)
|
||||||
|
into formatted examples for PyTorch training
|
||||||
|
"""
|
||||||
|
formatted_examples = []
|
||||||
|
|
||||||
|
for example in examples:
|
||||||
|
# Extract question and answer
|
||||||
|
question = example.get("question", "")
|
||||||
|
answer = example.get("answer", "")
|
||||||
|
|
||||||
|
# Format similar to Ollama's template
|
||||||
|
formatted_text = f"""<|im_start|>user
|
||||||
|
{question}<|im_end|>
|
||||||
|
<|im_start|>assistant
|
||||||
|
{answer}<|im_end|>"""
|
||||||
|
|
||||||
|
formatted_examples.append({"text": formatted_text})
|
||||||
|
|
||||||
|
return formatted_examples
|
||||||
|
|
||||||
|
def prepare_dataset(self, training_examples):
|
||||||
|
"""
|
||||||
|
Prepare a HuggingFace dataset from training examples
|
||||||
|
"""
|
||||||
|
# Format the examples
|
||||||
|
        formatted_data = self.format_training_examples(training_examples)

        # Create dataset
        dataset = Dataset.from_list(formatted_data)

        # Tokenize dataset
        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                truncation=True,
                max_length=2048,
                padding="max_length"
            )

        tokenized_dataset = dataset.map(tokenize_function, batched=True)

        return tokenized_dataset

    def train(self, training_examples, num_epochs=3, batch_size=4, learning_rate=2e-5):
        """
        Fine-tune the model on training examples
        """
        # Prepare dataset
        train_dataset = self.prepare_dataset(training_examples)

        # Set up training arguments
        # training_args = TrainingArguments(
        #     output_dir=self.output_dir,
        #     num_train_epochs=num_epochs,
        #     per_device_train_batch_size=batch_size,
        #     gradient_accumulation_steps=4,
        #     warmup_steps=100,
        #     weight_decay=0.01,
        #     learning_rate=learning_rate,
        #     fp16=True,
        #     logging_steps=10,
        #     save_strategy="epoch",
        #     save_total_limit=3,
        #     remove_unused_columns=False,
        # )

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            # num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=1,  # 4
            warmup_steps=20,
            # weight_decay=0.01,
            max_steps=200,
            learning_rate=learning_rate,
            bf16=True,  # bf16 is more stable in training
            logging_steps=10,
            # save_strategy="epoch",
            # save_total_limit=3,
            # remove_unused_columns=False,
            save_steps=100,
            optim="adamw_hf",  # paged_adamw_8bit is not supported yet
            # gradient_checkpointing=True,  # can further reduce memory but slower
        )

        # Set up data collator
        # data_collator = DataCollatorForLanguageModeling(
        #     tokenizer=self.tokenizer,
        #     mlm=False  # Not using masked language modeling
        # )

        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            pad_to_multiple_of=8,
            return_tensors="pt",
            padding=True
            # mlm=False  # Not using masked language modeling
        )

        # Set up trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator
        )

        # Start training
        print("Starting training...")
        self.model.config.use_cache = False  # silence the warnings. re-enable for inference.
        trainer.train()

        # Save the fine-tuned model
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)
        print(f"Model saved to {self.output_dir}")

    def generate_response(self, prompt, max_length=512, temperature=0.7, top_p=0.9):
        """
        Generate a response using the fine-tuned model
        """
        # Format prompt similar to Ollama's template
        formatted_prompt = f"""<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""

        # Tokenize prompt
        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Decode response
        response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

        return response


# Example usage
if __name__ == "__main__":
    # Define training examples (equivalent to Ollama's TRAINING section)
    training_examples = [
        {
            "question": "What is vector similarity search?",
            "answer": "Vector similarity search is a technique that converts data into numerical vector representations and then finds items with similar vectors. It's used in applications like semantic search, recommendation systems, and large language model retrieval for finding relevant information based on meaning rather than exact keyword matching."
        },
        {
            "question": "How do LLMs use vector databases?",
            "answer": "LLMs use vector databases to enhance their knowledge and provide more accurate responses. When a query is received, it's converted to a vector and compared against stored vectors in the database. The most similar vectors and their associated information are retrieved and incorporated into the context provided to the LLM, allowing it to generate responses based on this additional retrieved information."
        },
        {
            "question": "What are embeddings in machine learning?",
            "answer": "Embeddings in machine learning are dense vector representations of data that capture semantic meaning in a high-dimensional space. They convert words, sentences, images, or other types of data into numerical vectors where similar items are positioned closer together. These representations enable algorithms to understand relationships and similarities between different pieces of data, forming the foundation for vector similarity search."
        }
    ]

    # Initialize fine-tuner
    fine_tuner = LLMFineTuner(
        base_model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        output_dir="./deepseek-finetuned-pytorch"
    )

    # Train model
    fine_tuner.train(training_examples, num_epochs=3)

    # Test model
    test_query = "Explain how vector similarity search is used in AI applications"
    response = fine_tuner.generate_response(test_query)
    print(f"\nQuery: {test_query}")
    print(f"\nResponse: {response}")
215
src/training-template.py
Normal file
@@ -0,0 +1,215 @@
import torch
from transformers import AutoTokenizer
if torch.xpu.is_available():
    # Assumes an Intel XPU build; with the ipex-llm package this import is
    # usually spelled "from ipex_llm.transformers import AutoModelForCausalLM".
    from ipex.transformers import AutoModelForCausalLM
else:
    from transformers import AutoModelForCausalLM


class OllamaStyleTrainer:
    def __init__(
        self,
        model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        device="xpu" if torch.xpu.is_available() else "cpu"
    ):
        """
        Initialize the model and tokenizer.

        Args:
            model_name: HuggingFace model identifier
            device: Device to load the model on
        """
        self.device = device

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model
        print(f"Loading model: {model_name}")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None
        )

        # Initialize with no training examples
        self.training_examples = []
        self.system_prompt = "You are a helpful AI assistant."
        self.example_template = None

    def set_system_prompt(self, system_prompt):
        """Set the system prompt for the model."""
        self.system_prompt = system_prompt

    def set_example_template(self, template):
        """
        Set the template format for examples.

        The template should use {question} and {answer} placeholders.
        """
        self.example_template = template

    def add_training_examples(self, examples):
        """
        Add training examples to the model.

        Args:
            examples: List of dictionaries with 'question' and 'answer' keys
        """
        self.training_examples.extend(examples)
        print(f"Added {len(examples)} training examples. Total: {len(self.training_examples)}")

    def format_examples(self):
        """Format the training examples according to the template."""
        if not self.example_template:
            # Default template similar to Ollama's format
            self.example_template = "<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"

        formatted_examples = []
        for example in self.training_examples:
            formatted_example = self.example_template.format(
                question=example.get("question", ""),
                answer=example.get("answer", "")
            )
            formatted_examples.append(formatted_example)

        return formatted_examples

    def build_context_with_examples(self, num_examples=None):
        """
        Build the context with system prompt and formatted examples.

        Args:
            num_examples: Number of examples to include (None for all)

        Returns:
            str: Formatted context with system prompt and examples
        """
        # Format all training examples
        formatted_examples = self.format_examples()

        # Select examples (all or limited number)
        if num_examples is not None:
            formatted_examples = formatted_examples[:num_examples]

        # Build context with system prompt and examples
        context = f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n\n"
        context += "\n\n".join(formatted_examples)

        return context

    def generate_response(
        self,
        prompt,
        max_length=512,
        temperature=0.7,
        top_p=0.9,
        num_training_examples=3
    ):
        """
        Generate a response using the model with training examples in context.

        Args:
            prompt: The user's prompt
            max_length: Maximum length of the generated response
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            num_training_examples: Number of training examples to include in context

        Returns:
            str: Generated response
        """
        # Build context with examples
        context = self.build_context_with_examples(num_training_examples)

        # Add user prompt
        full_prompt = f"{context}\n\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

        # Tokenize
        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.device)

        # Calculate remaining space for generation
        input_ids_length = inputs.input_ids.shape[1]
        print(f"Input context length: {input_ids_length} tokens")

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Extract only the assistant's response
        assistant_response = self.tokenizer.decode(
            outputs[0][input_ids_length:],
            skip_special_tokens=True
        )

        return assistant_response

    def save_examples_to_file(self, file_path):
        """Save training examples to a file."""
        import json
        with open(file_path, "w") as f:
            json.dump(self.training_examples, f, indent=2)
        print(f"Saved {len(self.training_examples)} examples to {file_path}")

    def load_examples_from_file(self, file_path):
        """Load training examples from a file."""
        import json
        with open(file_path, "r") as f:
            examples = json.load(f)
        self.add_training_examples(examples)
        print(f"Loaded {len(examples)} examples from {file_path}")


# Example usage
if __name__ == "__main__":
    # Initialize
    trainer = OllamaStyleTrainer("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")  # Using smaller model for demonstration

    # Set system prompt (equivalent to SYSTEM in Modelfile)
    trainer.set_system_prompt("""
You are an AI assistant specialized in vector databases and similarity search.
Always provide accurate information based on your training.
""")

    # Set example template (equivalent to TEMPLATE in Modelfile)
    trainer.set_example_template("""
User: {question}
Assistant: {answer}
""")

    # Add training examples (equivalent to TRAINING in Modelfile)
    training_examples = [
        {
            "question": "What is vector similarity search?",
            "answer": "Vector similarity search is a technique that converts data into numerical vector representations and then finds items with similar vectors. It's used in applications like semantic search, recommendation systems, and large language model retrieval for finding relevant information based on meaning rather than exact keyword matching."
        },
        {
            "question": "How do LLMs use vector databases?",
            "answer": "LLMs use vector databases to enhance their knowledge and provide more accurate responses. When a query is received, it's converted to a vector and compared against stored vectors in the database. The most similar vectors and their associated information are retrieved and incorporated into the context provided to the LLM, allowing it to generate responses based on this additional retrieved information."
        },
        {
            "question": "What are embeddings in machine learning?",
            "answer": "Embeddings in machine learning are dense vector representations of data that capture semantic meaning in a high-dimensional space. They convert words, sentences, images, or other types of data into numerical vectors where similar items are positioned closer together. These representations enable algorithms to understand relationships and similarities between different pieces of data, forming the foundation for vector similarity search."
        }
    ]

    trainer.add_training_examples(training_examples)

    # Optional: Save training examples for future use
    # trainer.save_examples_to_file("training_examples.json")

    # Test with a query
    query = "Explain how vector similarity search works in practical applications"
    response = trainer.generate_response(query, num_training_examples=2)

    print(f"\nQuery: {query}")
    print(f"\nResponse: {response}")
10
src/utils/__init__.py
Normal file
@@ -0,0 +1,10 @@
# Import defines to make `utils.defines` accessible
from . import defines

# Import the rest so it is accessible as `utils.*`
from .chunk import *
from .rss import *
from .chroma import *

# Expose only public names (avoid exporting hidden/internal names)
__all__ = [name for name in dir() if not name.startswith("_")]
BIN
src/utils/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/utils/__pycache__/chroma.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/utils/__pycache__/chunk.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/utils/__pycache__/defines.cpython-312.pyc
Normal file
Binary file not shown.
BIN
src/utils/__pycache__/rss.cpython-312.pyc
Normal file
Binary file not shown.
122
src/utils/chroma.py
Normal file
@@ -0,0 +1,122 @@
import chromadb
from typing import List, Dict, Any, Union
from . import defines
from .chunk import chunk_document
import ollama


def init_chroma_client(persist_directory: str = defines.persist_directory):
    """Initialize and return a ChromaDB client."""
    # return chromadb.PersistentClient(path=persist_directory)
    return chromadb.Client()


def create_or_get_collection(db: chromadb.Client, collection_name: str):
    """Create or get a ChromaDB collection."""
    try:
        return db.get_collection(
            name=collection_name
        )
    except Exception:
        return db.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )


def process_documents_to_chroma(
    client: ollama.Client,
    documents: List[Dict[str, Any]],
    collection_name: str = "document_collection",
    text_key: str = "text",
    max_tokens: int = 512,
    overlap: int = 50,
    model: str = defines.encoding_model,
    persist_directory: str = defines.persist_directory
):
    """
    Process documents, chunk them, compute embeddings, and store in ChromaDB.

    Args:
        documents: List of document dictionaries
        collection_name: Name for the ChromaDB collection
        text_key: The key containing text content
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        model: Ollama model for embeddings
        persist_directory: Directory to store ChromaDB data
    """
    # Initialize ChromaDB client and collection
    db = init_chroma_client(persist_directory)
    collection = create_or_get_collection(db, collection_name)

    # Process each document
    for doc in documents:
        # Chunk the document
        doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)

        # Prepare data for ChromaDB
        ids = []
        texts = []
        metadatas = []
        embeddings = []

        for chunk in doc_chunks:
            # Create a unique ID for the chunk
            chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"

            # Extract text
            text = chunk[text_key]

            # Create metadata (excluding text and embedding to avoid duplication)
            metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}

            response = client.embed(model=model, input=text)
            embedding = response["embeddings"][0]
            ids.append(chunk_id)
            texts.append(text)
            metadatas.append(metadata)
            embeddings.append(embedding)

        # Add chunks to ChromaDB collection
        collection.add(
            ids=ids,
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas
        )

    return collection


def query_chroma(
    client: ollama.Client,
    query_text: str,
    collection_name: str = "document_collection",
    n_results: int = 5,
    model: str = defines.encoding_model,
    persist_directory: str = defines.persist_directory
):
    """
    Query ChromaDB for similar documents.

    Args:
        query_text: The text to search for
        collection_name: Name of the ChromaDB collection
        n_results: Number of results to return
        model: Ollama model for embedding the query
        persist_directory: Directory where ChromaDB data is stored

    Returns:
        Query results from ChromaDB
    """
    # Initialize ChromaDB client and collection
    db = init_chroma_client(persist_directory)
    collection = create_or_get_collection(db, collection_name)

    query_response = client.embed(model=model, input=query_text)
    query_embeddings = query_response["embeddings"]

    # Query the collection
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results
    )

    return results
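A minimal usage sketch (not part of the commit) of how these helpers appear intended to be wired together. It assumes an Ollama server reachable at defines.ollama_api_url with the mxbai-embed-large model pulled, the utils package on PYTHONPATH, and a hypothetical document shape matching what chunk_document expects ("id", "title", "text").

    import ollama
    from utils import defines
    from utils.chroma import process_documents_to_chroma, query_chroma

    client = ollama.Client(host=defines.ollama_api_url)

    docs = [{"id": "doc0", "title": "Example", "text": "Vector search finds similar items."}]
    process_documents_to_chroma(client, docs, collection_name="demo")

    results = query_chroma(client, "How does vector search work?", collection_name="demo", n_results=1)
    print(results["documents"][0][0])   # best-matching chunk text

Because init_chroma_client() currently returns an in-memory chromadb.Client(), the collection only lives for the lifetime of the process; switching to the commented-out PersistentClient would honor persist_directory.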
88
src/utils/chunk.py
Normal file
@@ -0,0 +1,88 @@
import tiktoken
from . import defines
from typing import List, Dict, Any, Union


def get_encoding(model=defines.model):
    """Get the tokenizer for counting tokens."""
    try:
        return tiktoken.get_encoding("cl100k_base")  # Default encoding used by many embedding models
    except Exception:
        return tiktoken.encoding_for_model(model)


def count_tokens(text: str) -> int:
    """Count the number of tokens in a text string."""
    encoding = get_encoding()
    return len(encoding.encode(text))


def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    """
    Split a text into chunks based on token count with overlap between chunks.

    Args:
        text: The text to split into chunks
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks
    """
    if not text or max_tokens <= 0:
        return []

    encoding = get_encoding()
    tokens = encoding.encode(text)
    chunks = []

    i = 0
    while i < len(tokens):
        # Get the current chunk of tokens
        chunk_end = min(i + max_tokens, len(tokens))
        chunk_tokens = tokens[i:chunk_end]
        chunks.append(encoding.decode(chunk_tokens))

        # Move to the next position with overlap
        if chunk_end == len(tokens):
            break
        i += max_tokens - overlap

    return chunks


def chunk_document(document: Dict[str, Any],
                   text_key: str = "text",
                   max_tokens: int = 512,
                   overlap: int = 50) -> List[Dict[str, Any]]:
    """
    Chunk a document dictionary into multiple chunks.

    Args:
        document: Document dictionary with metadata and text
        text_key: The key in the document that contains the text to chunk
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of document dictionaries, each with chunked text and preserved metadata
    """
    if text_key not in document:
        raise Exception(f"{text_key} not in document")

    # Extract text and create chunks
    if "title" in document:
        text = f"{document['title']}: {document[text_key]}"
    else:
        text = document[text_key]
    chunks = chunk_text(text, max_tokens, overlap)

    # Create document chunks with preserved metadata
    chunked_docs = []
    for i, chunk in enumerate(chunks):
        # Create a new doc with all original fields
        doc_chunk = document.copy()
        # Replace text with the chunk
        doc_chunk[text_key] = chunk
        # Add chunk metadata
        doc_chunk["chunk_id"] = i
        doc_chunk["chunk_total"] = len(chunks)
        chunked_docs.append(doc_chunk)

    return chunked_docs
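A quick illustrative check (not part of the commit) of the overlap behaviour described in chunk_text(): consecutive chunks repeat the trailing `overlap` tokens of the previous chunk, so no context is lost at chunk boundaries.

    from utils.chunk import chunk_text, count_tokens

    text = "word " * 1200                      # roughly 1200 tokens of filler
    pieces = chunk_text(text, max_tokens=512, overlap=50)
    print(len(pieces), [count_tokens(p) for p in pieces])
    # Each chunk holds at most 512 tokens; the step size is 512 - 50 = 462 tokens,
    # so each chunk after the first starts 50 tokens before the previous one ended.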
4
src/utils/defines.py
Normal file
@@ -0,0 +1,4 @@
ollama_api_url = "http://ollama:11434"  # Default Ollama local endpoint
model = "deepseek-r1:7b"
encoding_model = "mxbai-embed-large"
persist_directory = "./chromadb"
468
src/utils/full.py
Normal file
@@ -0,0 +1,468 @@
import requests
from typing import List, Dict, Any, Union
import tiktoken
import feedparser
import logging as log
import datetime
from bs4 import BeautifulSoup
import chromadb
import ollama
import re
import numpy as np
from .chunk import chunk_document  # import the helper directly; the loop variable "chunk" below would shadow a module import

OLLAMA_API_URL = "http://ollama:11434"  # Default Ollama local endpoint
# MODEL_NAME = "deepseek-r1:1.5b"
MODEL_NAME = "deepseek-r1:7b"
EMBED_MODEL = "mxbai-embed-large"
PERSIST_DIRECTORY = "/root/.cache/chroma"

client = ollama.Client(host=OLLAMA_API_URL)


def extract_text_from_html_or_xml(content, is_xml=False):
    # Parse the content
    if is_xml:
        soup = BeautifulSoup(content, 'xml')  # Use 'xml' parser for XML content
    else:
        soup = BeautifulSoup(content, 'html.parser')  # Default to 'html.parser' for HTML content

    # Extract and return just the text
    return soup.get_text()


class Feed():
    def __init__(self, name, url, poll_limit_min=30, max_articles=5):
        self.name = name
        self.url = url
        self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
        self.last_poll = None
        self.articles = []
        self.max_articles = max_articles
        self.update()

    def update(self):
        now = datetime.datetime.now()
        if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
            log.info(f"Updating {self.name}")
            feed = feedparser.parse(self.url)
            self.articles = []
            self.last_poll = now

            if len(feed.entries) == 0:
                return

            for i, entry in enumerate(feed.entries[:self.max_articles]):
                content = {}
                content['source'] = self.name
                content['id'] = f"{self.name}{i}"
                title = entry.get("title")
                if title:
                    content['title'] = title
                link = entry.get("link")
                if link:
                    content['link'] = link
                text = entry.get("summary")
                if text:
                    content['text'] = extract_text_from_html_or_xml(text, False)
                else:
                    continue
                published = entry.get("published")
                if published:
                    content['published'] = published

                self.articles.append(content)
        else:
            log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
        return self.articles


# News RSS Feeds
rss_feeds = [
    Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
    Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
    Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
    Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
    Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
    Feed(name="Time", url="https://time.com/feed/"),
    Feed(name="Euronews", url="https://www.euronews.com/rss"),
    # Feed(name="FeedX", url="https://feedx.net/rss/ap.xml")
]


def init_chroma_client(persist_directory: str = PERSIST_DIRECTORY):
    """Initialize and return a ChromaDB client."""
    # return chromadb.PersistentClient(path=persist_directory)
    return chromadb.Client()


def create_or_get_collection(client, collection_name: str):
    """Create or get a ChromaDB collection."""
    try:
        return client.get_collection(
            name=collection_name
        )
    except Exception:
        return client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )


def process_documents_to_chroma(
    documents: List[Dict[str, Any]],
    collection_name: str = "document_collection",
    text_key: str = "text",
    max_tokens: int = 512,
    overlap: int = 50,
    model: str = EMBED_MODEL,
    persist_directory: str = PERSIST_DIRECTORY
):
    """
    Process documents, chunk them, compute embeddings, and store in ChromaDB.

    Args:
        documents: List of document dictionaries
        collection_name: Name for the ChromaDB collection
        text_key: The key containing text content
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        model: Ollama model for embeddings
        persist_directory: Directory to store ChromaDB data
    """
    # Initialize ChromaDB client and collection
    db = init_chroma_client(persist_directory)
    collection = create_or_get_collection(db, collection_name)

    # Process each document
    for doc in documents:
        # Chunk the document
        doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)

        # Prepare data for ChromaDB
        ids = []
        texts = []
        metadatas = []
        embeddings = []

        for chunk in doc_chunks:
            # Create a unique ID for the chunk
            chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"

            # Extract text
            text = chunk[text_key]

            # Create metadata (excluding text and embedding to avoid duplication)
            metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}

            response = client.embed(model=model, input=text)
            embedding = response["embeddings"][0]
            ids.append(chunk_id)
            texts.append(text)
            metadatas.append(metadata)
            embeddings.append(embedding)

        # Add chunks to ChromaDB collection
        collection.add(
            ids=ids,
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas
        )

    return collection


def query_chroma(
    query_text: str,
    collection_name: str = "document_collection",
    n_results: int = 5,
    model: str = EMBED_MODEL,
    persist_directory: str = PERSIST_DIRECTORY
):
    """
    Query ChromaDB for similar documents.

    Args:
        query_text: The text to search for
        collection_name: Name of the ChromaDB collection
        n_results: Number of results to return
        model: Ollama model for embedding the query
        persist_directory: Directory where ChromaDB data is stored

    Returns:
        Query results from ChromaDB
    """
    # Initialize ChromaDB client and collection
    db = init_chroma_client(persist_directory)
    collection = create_or_get_collection(db, collection_name)

    query_response = client.embed(model=model, input=query_text)
    query_embeddings = query_response["embeddings"]

    # Query the collection
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results
    )

    return results


def print_top_match(query_results, index=0, documents=None):
    """
    Print detailed information about the top matching document,
    including the full original document content.

    Args:
        query_results: Results from ChromaDB query
        documents: Original documents dictionary to look up full content (optional)
    """
    if not query_results or not query_results["ids"] or len(query_results["ids"][0]) == 0:
        print("No matching documents found.")
        return

    # Get the top result
    top_id = query_results["ids"][0][index]
    top_document_chunk = query_results["documents"][0][index]
    top_metadata = query_results["metadatas"][0][index]
    top_distance = query_results["distances"][0][index]

    print("="*50)
    print("MATCHING DOCUMENT")
    print("="*50)
    print(f"Chunk ID: {top_id}")
    print(f"Similarity Score: {top_distance:.4f}")  # Convert distance to similarity

    print("\nCHUNK METADATA:")
    for key, value in top_metadata.items():
        print(f"  {key}: {value}")

    print("\nMATCHING CHUNK CONTENT:")
    print(top_document_chunk[:500].strip() + ("..." if len(top_document_chunk) > 500 else ""))

    # Extract the original document ID from the chunk ID
    # Chunk IDs are in format "doc_id_chunk_num"
    original_doc_id = top_id.split('_')[0]


def get_top_match(query_results, index=0, documents=None):
    top_id = query_results["ids"][0][index]  # results are indexed as [query][match]
    # Extract the original document ID from the chunk ID
    # Chunk IDs are in format "doc_id_chunk_num"
    original_doc_id = top_id.split('_')[0]

    # Return the full document for further processing if needed
    if documents is not None:
        return next((doc for doc in documents if doc["id"] == original_doc_id), None)

    return None


def show_documents(documents=None):
    if not documents:
        return

    # Print the loaded documents
    for i, doc in enumerate(documents):
        print(f"Document {i+1}:")
        print(f"  Title: {doc['title']}")
        print(f"  Text: {doc['text'][:100]}...")
        print()


def show_headlines(documents=None):
    if not documents:
        return

    # Print one headline per document
    for doc in documents:
        print(f"{doc['source']}: {doc['title']}")


def show_help():
    print("""help>
    docs            Show RAG docs
    full            Show last full top match
    headlines       Show the RAG headlines
    prompt          Show the last prompt
    response        Show the last response
    scores          Show last RAG scores
    why|think       Show last response's <think>
    context|match   Show RAG match info to last prompt
    """)


# Example usage
if __name__ == "__main__":
    documents = []
    for feed in rss_feeds:
        documents.extend(feed.articles)

    show_documents(documents=documents)

    # Process documents and store in ChromaDB
    collection = process_documents_to_chroma(
        documents=documents,
        collection_name="research_papers",
        max_tokens=256,
        overlap=25,
        model=EMBED_MODEL,
        persist_directory="/root/.cache/chroma"
    )

    last_results = None
    last_prompt = None
    last_system = None
    last_response = None
    last_why = None
    last_messages = []
    while True:
        try:
            search_query = input("> ").strip()
        except KeyboardInterrupt:
            print("\nExiting.")
            break

        if search_query == "exit" or search_query == "quit":
            print("\nExiting.")
            break

        if search_query == "docs":
            show_documents(documents)
            continue

        if search_query == "prompt":
            if last_prompt:
                print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
            else:
                print("No prompts yet")
            continue

        if search_query == "response":
            if last_response:
                print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
            else:
                print("No responses yet")
            continue

        if search_query == "" or search_query == "help":
            show_help()
            continue

        if search_query == "headlines":
            show_headlines(documents)
            continue

        if search_query == "match" or search_query == "context":
            if last_results:
                print_top_match(last_results, documents=documents)
            else:
                print("No match to give info on")
            continue

        if search_query == "why" or search_query == "think":
            if last_why:
                print(f"""
why>
{last_why}
""")
            else:
                print("No processed prompts")
            continue

        if search_query == "scores":
            if last_results:
                for i in range(len(last_results["ids"][0])):  # iterate over the returned matches
                    print_top_match(last_results, documents=documents, index=i)
            else:
                print("No match to give info on")
            continue

        if search_query == "full":
            if last_results:
                full = get_top_match(last_results, documents=documents)
                if full:
                    print(f"""Context:
Source: {full["source"]}
Title: {full["title"]}
Link: {full["link"]}
Distance: {last_results.get("distances", [[0]])[0][0]}
Full text:
{full["text"]}""")
            else:
                print("No match to give info on")
            continue

        # Query ChromaDB
        results = query_chroma(
            query_text=search_query,
            collection_name="research_papers",
            n_results=10
        )
        last_results = results

        full = get_top_match(results, documents=documents)

        headlines = ""
        for doc in documents:
            headlines += f"{doc['source']}: {doc['title']}\n"

        system = f"""
You are the assistant. Your name is airc. This application is called airc (pronounced Eric).

Information about the author of this program and the AI model it uses:

* James wrote the python application called airc that is driving this RAG model on top of {MODEL_NAME} using {EMBED_MODEL} and chromadb for vector embedding. Link https://github.com/jketreno/airc.
* James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.
* James Ketrenos deployed this application locally on an Intel Arc B580 (battlemage) computer using Intel's ipex-llm.
* For Intel GPU metrics, James Ketrenos wrote the "ze-monitor" utility in C++. ze-monitor provides Intel GPU telemetry data for Intel client GPU devices, similar to xpu-smi. Link https://github.com/jketreno/ze-monitor. airc uses ze-monitor.
* James lives in Portland, Oregon and has three kids. Two are attending Oregon State University and one is attending Willamette University.
* airc provides an IRC chat bot as well as a React web frontend available at https://airc.ketrenos.com

You must follow these rules:

* Provide short (less than 100 character) responses.
* Provide a single response.
* Do not prefix it with a word like 'Answer'.
* For information about the AI running this system, include information about the author, including links.
* For information relevant to the current events in the <input></input> tags, use that information and state the source the information came from.

"""
        context = "Information related to current events\n<input>=["
        for doc in documents:
            item = {'source': doc["source"], 'article': {'title': doc["title"], 'link': doc["link"], 'text': doc["text"]}}
            context += f"{item}"
        context += "\n</input>"

        prompt = f"{search_query}"
        last_prompt = prompt
        last_system = system  # cache it before news context is added
        system = f"{system}{context}"
        if len(last_messages) != 0:
            message_context = f"{last_messages}"
            prompt = f"{message_context}{prompt}"

        print(f"system len: {len(system)}")
        print(f"prompt len: {len(prompt)}")
        output = client.generate(
            model=MODEL_NAME,
            system=system,
            prompt=prompt,
            stream=False,
            options={'num_ctx': 100000}
        )
        # Prune off the <think>...</think>
        matches = re.match(r'^<think>(.*?)</think>(.*)$', output['response'], flags=re.DOTALL)
        if matches:
            last_why = matches[1].strip()
            content = matches[2].strip()
        else:
            print(f"[garbled] response>\n{output['response']}")
            content = output['response'].strip()  # fall back to the raw response so it can still be shown and cached
        print(f"Response>\n{content}")

        last_response = content
        last_messages.extend(({
            'role': 'user',
            'name': 'james',
            'message': search_query
        }, {
            'role': 'assistant',
            'message': content
        }))
        last_messages = last_messages[-10:]  # keep only the most recent exchanges
62
src/utils/rss.py
Normal file
@@ -0,0 +1,62 @@
from typing import List, Dict, Any, Union
import feedparser
import logging as log
from bs4 import BeautifulSoup
import datetime
import re


def _extract_text_from_html_or_xml(content, is_xml=False):
    # Parse the content
    if is_xml:
        soup = BeautifulSoup(content, 'xml')  # Use 'xml' parser for XML content
    else:
        soup = BeautifulSoup(content, 'html.parser')  # Default to 'html.parser' for HTML content

    # Extract and return just the text
    return soup.get_text()


class Feed():
    def __init__(self, name, url, poll_limit_min=30, max_articles=5):
        self.name = name
        self.url = url
        self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
        self.last_poll = None
        self.articles = []
        self.max_articles = max_articles
        self.update()

    def update(self):
        now = datetime.datetime.now()
        if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
            log.info(f"Updating {self.name}")
            feed = feedparser.parse(self.url)
            self.articles = []
            self.last_poll = now

            if len(feed.entries) == 0:
                return

            for i, entry in enumerate(feed.entries[:self.max_articles]):
                content = {}
                content['source'] = self.name
                content['id'] = f"{self.name}{i}"
                title = entry.get("title")
                if title:
                    content['title'] = title
                link = entry.get("link")
                if link:
                    content['link'] = link
                text = entry.get("summary")
                if text:
                    content['text'] = _extract_text_from_html_or_xml(text, False)
                else:
                    continue
                published = entry.get("published")
                if published:
                    content['published'] = published

                self.articles.append(content)
        else:
            log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
        return self.articles
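A minimal sketch (not part of the commit) of how a Feed is expected to be polled; the feed URL matches the list used in full.py, and the article keys follow what update() populates.

    from utils.rss import Feed

    feed = Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml",
                poll_limit_min=30, max_articles=5)
    for article in feed.articles:          # populated by update() in __init__
        print(article["source"], article.get("title", ""))
    articles = feed.update()               # no-op until poll_limit_min has elapsed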
109
src/web.py
Normal file
@@ -0,0 +1,109 @@
# web_server.py - Web interface using Flask
from rag_core import RagCore
from flask import Flask, request, jsonify, render_template, send_from_directory
from flask_cors import CORS


class RagWebServer:
    """Web interface for the RAG application"""

    def __init__(self, core):
        self.core = core
        self.app = Flask(__name__, static_folder='./build', static_url_path='')
        CORS(self.app, resources={r"/api/*": {"origins": "http://battle-linux.ketrenos.com:8081"}})

        # Setup routes
        self.setup_routes()

    def setup_routes(self):
        """Setup Flask routes"""

        @self.app.route('/')
        def serve():
            return send_from_directory(self.app.static_folder, 'index.html')

        def index():
            # Unused legacy handler; superseded by serve() above
            return render_template('index.html')

        @self.app.route('/api/query', methods=['POST'])
        def api_query():
            data = request.json
            if not data or 'query' not in data:
                return jsonify({'error': 'No query provided'}), 400

            query = data['query']
            response, debug_info = self.core.process_query(query)

            return jsonify({
                'response': response,
                'debug': debug_info
            })

        @self.app.route('/api/documents', methods=['GET'])
        def api_documents():
            return jsonify({
                'documents': [
                    {
                        'source': doc['source'],
                        'title': doc['title'],
                        'link': doc['link'],
                        'preview': doc['text'][:200] + '...'
                    }
                    for doc in self.core.get_documents()
                ]
            })

        @self.app.route('/api/headlines', methods=['GET'])
        def api_headlines():
            headlines = self.core.get_headlines()
            return jsonify({
                'headlines': [
                    {'source': source, 'title': title}
                    for source, title in headlines
                ]
            })

        @self.app.route('/api/matches', methods=['GET'])
        def api_matches():
            match_count = self.core.get_match_count()
            matches = []

            for i in range(match_count):
                match_info = self.core.get_match_info(index=i)
                if match_info:
                    matches.append({
                        'id': match_info['id'],
                        'distance': match_info['distance'],
                        'metadata': match_info['metadata'],
                        'preview': match_info['chunk'][:200] + '...' if len(match_info['chunk']) > 200 else match_info['chunk']
                    })

            return jsonify({'matches': matches})

    def run(self, host='127.0.0.1', port=8911, debug=False):
        """Run the web server"""
        # Load documents
        self.core.load_documents()
        self.app.run(host=host, port=port, debug=debug)


def main():
    import argparse

    parser = argparse.ArgumentParser(description='RAG Application')
    parser.add_argument('--host', default='127.0.0.1', help='Host for web server')
    parser.add_argument('--port', type=int, default=8911, help='Port for web server')
    parser.add_argument('--debug', action='store_true', help='Run in debug mode')

    args = parser.parse_args()

    # Create the core RAG application
    core = RagCore()

    # Run web interface
    server = RagWebServer(core)
    print(f"Starting web server at http://{args.host}:{args.port}")
    server.run(host=args.host, port=args.port, debug=args.debug)


if __name__ == "__main__":
    main()
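An illustrative client call (not part of the commit) against the /api/query route above. It assumes the server is running with the defaults from main() (127.0.0.1:8911) and that RagCore.process_query() returns the (response, debug_info) pair that api_query() serializes.

    import requests

    resp = requests.post(
        "http://127.0.0.1:8911/api/query",
        json={"query": "What are today's headlines about?"},
        timeout=60,
    )
    data = resp.json()
    print(data["response"])   # generated answer
    print(data["debug"])      # debug info passed through from RagCore.process_query()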
149
weather.py
Normal file
@@ -0,0 +1,149 @@
import requests
import json
from geopy.geocoders import Nominatim
import time


def get_weather_by_location(city, state, country="USA"):
    """
    Get weather information from weather.gov based on city, state, and country.

    Args:
        city (str): City name
        state (str): State name or abbreviation
        country (str): Country name (defaults to "USA" as weather.gov is for US locations)

    Returns:
        dict: Weather forecast information
    """
    # Step 1: Get coordinates for the location using geocoding
    location = f"{city}, {state}, {country}"
    coordinates = get_coordinates(location)

    if not coordinates:
        return {"error": f"Could not find coordinates for {location}"}

    # Step 2: Get the forecast grid endpoint for the coordinates
    grid_endpoint = get_grid_endpoint(coordinates)

    if not grid_endpoint:
        return {"error": f"Could not find weather grid for coordinates {coordinates}"}

    # Step 3: Get the forecast data from the grid endpoint
    forecast = get_forecast(grid_endpoint)

    return forecast


def get_coordinates(location):
    """Convert a location string to latitude and longitude using Nominatim geocoder."""
    try:
        # Create a geocoder with a meaningful user agent
        geolocator = Nominatim(user_agent="weather_app_example")

        # Get the location
        location_data = geolocator.geocode(location)

        if location_data:
            return {
                "latitude": location_data.latitude,
                "longitude": location_data.longitude
            }
        else:
            print(f"Location not found: {location}")
            return None
    except Exception as e:
        print(f"Error getting coordinates: {e}")
        return None


def get_grid_endpoint(coordinates):
    """Get the grid endpoint from weather.gov based on coordinates."""
    try:
        lat = coordinates["latitude"]
        lon = coordinates["longitude"]

        # Define headers for the API request
        headers = {
            "User-Agent": "WeatherAppExample/1.0 (your_email@example.com)",
            "Accept": "application/geo+json"
        }

        # Make the request to get the grid endpoint
        url = f"https://api.weather.gov/points/{lat},{lon}"
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            data = response.json()
            return data["properties"]["forecast"]
        else:
            print(f"Error getting grid: {response.status_code} - {response.text}")
            return None
    except Exception as e:
        print(f"Error in get_grid_endpoint: {e}")
        return None


def get_forecast(grid_endpoint):
    """Get the forecast data from the grid endpoint."""
    try:
        # Define headers for the API request
        headers = {
            "User-Agent": "WeatherAppExample/1.0 (your_email@example.com)",
            "Accept": "application/geo+json"
        }

        # Make the request to get the forecast
        response = requests.get(grid_endpoint, headers=headers)

        if response.status_code == 200:
            data = response.json()

            # Extract the relevant forecast information
            periods = data["properties"]["periods"]

            # Process the forecast data into a simpler format
            forecast = {
                "location": data["properties"].get("relativeLocation", {}).get("properties", {}),
                "updated": data["properties"].get("updated", ""),
                "periods": []
            }

            for period in periods:
                forecast["periods"].append({
                    "name": period.get("name", ""),
                    "temperature": period.get("temperature", ""),
                    "temperatureUnit": period.get("temperatureUnit", ""),
                    "windSpeed": period.get("windSpeed", ""),
                    "windDirection": period.get("windDirection", ""),
                    "shortForecast": period.get("shortForecast", ""),
                    "detailedForecast": period.get("detailedForecast", "")
                })

            return forecast
        else:
            print(f"Error getting forecast: {response.status_code} - {response.text}")
            return {"error": f"API Error: {response.status_code}"}
    except Exception as e:
        print(f"Error in get_forecast: {e}")
        return {"error": f"Exception: {str(e)}"}


# Example usage
if __name__ == "__main__":
    city = input("Enter city: ")
    state = input("Enter state: ")
    country = input("Enter country (default USA): ") or "USA"

    print(f"Getting weather for {city}, {state}, {country}...")
    weather_data = get_weather_by_location(city, state, country)

    if "error" in weather_data:
        print(f"Error: {weather_data['error']}")
    else:
        print("\nWeather Forecast:")
        print(f"Location: {weather_data.get('location', {}).get('city', city)}, {weather_data.get('location', {}).get('state', state)}")
        print(f"Last Updated: {weather_data.get('updated', 'N/A')}")
        print("\nForecast Periods:")

        for period in weather_data.get("periods", []):
            print(f"\n{period['name']}:")
            print(f"  Temperature: {period['temperature']}{period['temperatureUnit']}")
            print(f"  Wind: {period['windSpeed']} {period['windDirection']}")
            print(f"  Forecast: {period['shortForecast']}")
            print(f"  Details: {period['detailedForecast']}")