This commit is contained in:
James Ketr 2025-03-18 13:09:52 -07:00
parent 1130077c03
commit f10f056e0d
73 changed files with 763972 additions and 0 deletions

444
Dockerfile.airc-pytorch Normal file

@ -0,0 +1,444 @@
FROM ubuntu:oracular AS pytorch-build
SHELL [ "/bin/bash", "-c" ]
# Instructions Dockerfied from:
#
# https://github.com/pytorch/pytorch
#
# and
#
# https://pytorch.org/docs/stable/notes/get_start_xpu.html
# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
#
#
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# ipex only supports python 3.11, so use 3.11 instead of latest oracular (3.12)
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
ca-certificates \
ccache \
cmake \
curl \
git \
gpg-agent \
less \
libbz2-dev \
libffi-dev \
libjpeg-dev \
libpng-dev \
libreadline-dev \
libssl-dev \
libsqlite3-dev \
llvm \
nano \
wget \
zlib1g-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# python3 \
# python3-pip \
# python3-venv \
# python3-dev \
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
# Build Python in /opt/..., install it locally, then remove the build environment,
# all collapsed into a single docker layer.
WORKDIR /opt
ENV PYTHON_VERSION=3.11.9
RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
&& cd Python-${PYTHON_VERSION} \
&& ./configure --prefix=/opt/python --enable-optimizations \
&& make -j$(nproc) \
&& make install \
&& cd /opt \
&& rm -rf Python-${PYTHON_VERSION}
WORKDIR /opt/pytorch
FROM ubuntu:oracular AS ze-monitor
# From https://github.com/jketreno/ze-monitor
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
build-essential \
debhelper \
devscripts \
cmake \
git \
libfmt-dev \
libncurses-dev \
rpm \
rpm2cpio \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN apt-get install -y \
software-properties-common \
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& apt-get update \
&& apt-get install -y \
libze-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN git clone --depth 1 --branch v0.3.0-1 https://github.com/jketreno/ze-monitor /opt/ze-monitor
WORKDIR /opt/ze-monitor/build
RUN cmake .. \
&& make \
&& cpack
FROM pytorch-build AS pytorch
COPY --from=pytorch-build /opt/pytorch /opt/pytorch
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
libze-intel-gpu1 \
libze1 \
intel-ocloc \
intel-opencl-icd \
xpu-smi \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2
# When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries.
ENV SYCL_CACHE_PERSISTENT=1
WORKDIR /opt/pytorch
RUN { \
echo '#!/bin/bash' ; \
update-alternatives --set python3 /opt/python/bin/python3.11 ; \
echo 'source /opt/pytorch/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/pytorch/shell ; \
chmod +x /opt/pytorch/shell
RUN python3 -m venv --system-site-packages /opt/pytorch/venv
SHELL [ "/opt/pytorch/shell" ]
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
RUN pip3 freeze > /opt/pytorch/requirements.txt
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash' ; \
echo 'echo "Container: pytorch"' ; \
echo 'set -e' ; \
echo 'echo "Setting pip environment to /opt/pytorch"' ; \
echo 'source /opt/pytorch/venv/bin/activate'; \
echo 'if [[ "${1}" == "" ]] || [[ "${1}" == "shell" ]]; then' ; \
echo ' echo "Dropping to shell"' ; \
echo ' /bin/bash -c "source /opt/pytorch/venv/bin/activate ; /bin/bash"' ; \
echo 'else' ; \
echo ' exec "${@}"' ; \
echo 'fi' ; \
} > /entrypoint.sh \
&& chmod +x /entrypoint.sh
ENTRYPOINT [ "/entrypoint.sh" ]
FROM pytorch AS ipex-llm-src
# Build ipex-llm from source
RUN git clone --branch main --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm \
&& cd /opt/ipex-llm \
&& git fetch --depth 1 origin cb3c4b26ad058c156591816aa37eec4acfcbf765 \
&& git checkout cb3c4b26ad058c156591816aa37eec4acfcbf765
WORKDIR /opt/ipex-llm
RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
RUN { \
echo '#!/bin/bash' ; \
update-alternatives --set python3 /opt/python/bin/python3.11 ; \
echo 'source /opt/ipex-llm/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/ipex-llm/shell ; \
chmod +x /opt/ipex-llm/shell
SHELL [ "/opt/ipex-llm/shell" ]
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
WORKDIR /opt/ipex-llm/python/llm
RUN pip install requests wheel
RUN python setup.py clean --all bdist_wheel --linux
FROM airc AS jupyter
SHELL [ "/opt/airc/shell" ]
# BEGIN setup Jupyter
RUN pip install jupyter \
jupyterlab==4.3.0a0 \
jupyterhub==5.0.0 \
notebook==7.3.0a0 \
"jupyter-server-proxy>=4.1.2"
# END setup Jupyter
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash' ; \
echo 'echo "Container: airc jupyter"' ; \
echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
echo ' if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
echo ' echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
echo ' exit 1' ; \
echo ' else' ; \
echo ' if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
echo ' echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
echo ' fi' ; \
echo 'fi' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
echo 'while true; do' ; \
echo ' echo "Launching jupyter notebook"' ; \
echo ' jupyter notebook \' ; \
echo ' --notebook-dir=/opt/jupyter \' ; \
echo ' --port 8888 \' ; \
echo ' --ip 0.0.0.0 \' ; \
echo ' --no-browser \' ; \
echo ' --allow-root \' ; \
echo ' --ServerApp.token= \' ; \
echo ' --ServerApp.password= \' ; \
echo ' --ServerApp.allow_origin=* \' ; \
echo ' --ServerApp.base_url="/jupyter" \' ; \
echo ' "${@}" \' ; \
echo ' 2>&1 | tee -a "/root/.cache/jupyter.log"' ; \
echo ' echo "jupyter notebook died ($?). Restarting."' ; \
echo ' sleep 5' ; \
echo 'done' ; \
} > /entrypoint-jupyter.sh \
&& chmod +x /entrypoint-jupyter.sh
ENTRYPOINT [ "/entrypoint-jupyter.sh" ]
FROM pytorch AS airc
RUN python3 -m venv --system-site-packages /opt/airc/venv
# Don't install the full oneapi essentials; just the ones that we seem to need
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
intel-oneapi-mkl-sycl-2025.0 \
intel-oneapi-dnnl-2025.0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
RUN { \
echo '#!/bin/bash' ; \
echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate' ; \
echo 'if [[ "$1" == "" ]]; then bash -c; else bash -c "${@}"; fi' ; \
} > /opt/airc/shell ; \
chmod +x /opt/airc/shell
SHELL [ "/opt/airc/shell" ]
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
# Install ipex-llm built in ipex-llm-src
COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install $pkg; done
COPY src/ /opt/airc/src/
# pydle does not work with newer asyncio because asyncio.coroutine
# was deprecated and later removed. Patch it to work.
RUN pip3 install pydle transformers sentencepiece accelerate \
&& patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
-p1 < /opt/airc/src/pydle.patch
# mistral fails with cache_position errors with transformers>4.40 (or at least it fails with the latest),
# and the MistralSpda* symbols are missing
RUN pip install "sentence_transformers<3.4.1" "transformers==4.40.0"
# To get xe_linear and other Xe methods
RUN pip3 install 'bigdl-core-xe-all>=2.6.0b'
# trl.core doesn't have what is needed with the default 'pip install trl' version
RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
# Needed by src/model-server.py
RUN pip install flask
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash' ; \
echo 'set -e' ; \
echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
echo ' if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
echo ' echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
echo ' exit 1' ; \
echo ' else' ; \
echo ' if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
echo ' echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
echo ' fi' ; \
echo 'fi' ; \
echo 'echo "Container: airc"' ; \
echo 'echo "Setting pip environment to /opt/airc"' ; \
echo 'if [[ -e /opt/intel/oneapi/setvars.sh ]]; then source /opt/intel/oneapi/setvars.sh; fi' ; \
echo 'source /opt/airc/venv/bin/activate'; \
echo 'if [[ "${1}" == "shell" ]] || [[ "${1}" == "/bin/bash" ]]; then' ; \
echo ' echo "Dropping to shell"' ; \
echo ' /bin/bash -c "source /opt/airc/venv/bin/activate ; /bin/bash"' ; \
echo ' exit $?' ; \
echo 'else' ; \
echo ' while true; do' ; \
echo ' echo "Launching model-server"' ; \
echo ' python src/model-server.py \' ; \
echo ' 2>&1 | tee -a "/root/.cache/model-server.log"'; \
echo ' echo "model-server died ($?). Restarting."' ; \
echo ' sleep 5' ; \
echo ' done &' ; \
echo ' while true; do' ; \
echo ' echo "Launching airc"' ; \
echo ' python src/airc.py "${@}" \' ; \
echo ' 2>&1 | tee -a "/root/.cache/airc.log"' ; \
echo ' echo "airc died ($?). Restarting."' ; \
echo ' sleep 5' ; \
echo ' done' ; \
echo 'fi' ; \
} > /entrypoint-airc.sh \
&& chmod +x /entrypoint-airc.sh
COPY --from=ze-monitor /opt/ze-monitor/build/ze-monitor-*deb /opt/
RUN dpkg -i /opt/ze-monitor-*deb
WORKDIR /opt/airc
SHELL [ "/opt/airc/shell" ]
# Needed by src/model-server.py
#RUN pip install faiss-cpu sentence_transformers feedparser bs4
RUN pip install ollama feedparser bs4
SHELL [ "/bin/bash", "-c" ]
ENTRYPOINT [ "/entrypoint-airc.sh" ]
FROM ubuntu:oracular AS ollama
# Get a couple prerequisites
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# Install Intel graphics runtimes
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
&& add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
libze-intel-gpu1 \
libze1 \
intel-ocloc \
intel-opencl-icd \
xpu-smi \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
WORKDIR /opt/ollama
# Download the nightly ollama release from ipex-llm
RUN wget -qO - https://github.com/intel/ipex-llm/releases/download/v2.2.0-nightly/ollama-0.5.4-ipex-llm-2.2.0b20250226-ubuntu.tgz | \
tar --strip-components=1 -C . -xzv
# Install Python from Oracular (ollama works with 3.12)
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
gpg \
python3 \
python3-pip \
python3-venv \
wget \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
# Setup the ollama python virtual environment
RUN python3 -m venv --system-site-packages /opt/ollama/venv
# Setup the docker pip shell
RUN { \
echo '#!/bin/bash' ; \
echo 'source /opt/ollama/venv/bin/activate' ; \
echo 'bash -c "${@}"' ; \
} > /opt/ollama/shell ; \
chmod +x /opt/ollama/shell
# Activate the pip environment on all shell calls
SHELL [ "/opt/ollama/shell" ]
# Install ollama python module
RUN pip3 install ollama
SHELL [ "/bin/bash", "-c" ]
RUN { \
echo '#!/bin/bash'; \
echo 'echo "Container: ollama"'; \
echo 'set -e'; \
echo 'echo "Setting pip environment to /opt/ollama"'; \
echo 'source /opt/ollama/venv/bin/activate'; \
echo 'export OLLAMA_NUM_GPU=999'; \
echo 'export ZES_ENABLE_SYSMAN=1'; \
echo 'export SYCL_CACHE_PERSISTENT=1'; \
echo 'export OLLAMA_KEEP_ALIVE=-1'; \
echo 'export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1'; \
echo ''; \
echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/ollama/)?shell$ ]]; then'; \
echo ' echo "Dropping to shell"'; \
echo ' exec /bin/bash'; \
echo 'else'; \
echo ' echo "Launching Ollama server..."'; \
echo ' exec ./ollama serve'; \
echo 'fi'; \
} > /entrypoint.sh \
&& chmod +x /entrypoint.sh
RUN { \
echo '#!/bin/bash'; \
echo 'echo "Container: ollama"'; \
echo 'set -e'; \
echo 'echo "Setting pip environment to /opt/ollama"'; \
echo 'source /opt/ollama/venv/bin/activate'; \
echo './ollama pull mxbai-embed-large' ; \
echo './ollama pull deepseek-r1:7b' ; \
} > /fetch-models.sh \
&& chmod +x /fetch-models.sh
VOLUME [" /root/.ollama" ]
ENTRYPOINT [ "/entrypoint.sh" ]

112
LEARNINGS Normal file

@ -0,0 +1,112 @@
# RAG and Fine Tuning
Two common practices for augmenting a model with new information:
1. In-context learning
2. Fine Tuning
## In-context learning
In-context learning is great for dynamic data. Rather than retraining the model, you include relevant contextual information in the prompt to the model. The approach to collecting that context information is generally:
1. Store the content in a context retrieval system
2. Based on the user's query, determine which content in #1 is most relevant
3. Augment the user's query with that context information
PROS: Easy to implement
CONS: Retrieving the correct context can be difficult, and without the context, the model won't have the additional information and will revert to its internal knowledge.
If you only have one or two documents (totaling a few thousand tokens, less than the model's maximum input), you can include the full text of the documents in the prompt and skip the context mapping.
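A minimal sketch of that flow, using the `ollama` Python client with the `mxbai-embed-large` and `deepseek-r1:7b` models this project pulls elsewhere (the chunking, similarity scoring, and prompt format are illustrative assumptions, not the exact code used here):
```python
# Sketch: embed context chunks, pick the most relevant one for the query,
# and prepend it to the prompt. Assumes `pip install ollama` and a running
# ollama server with mxbai-embed-large and deepseek-r1:7b pulled.
import ollama

def embed(text: str) -> list[float]:
    return ollama.embeddings(model="mxbai-embed-large", prompt=text)["embedding"]

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
    return dot / norm if norm else 0.0

# 1. Store the content in a context retrieval system (here, an in-memory list).
chunks = [
    "James developed ze-monitor, a Level Zero GPU monitoring utility.",
    "airc is an IRC bot backed by an LLM model-server.",
]
index = [(chunk, embed(chunk)) for chunk in chunks]

# 2. Based on the user's query, determine which content is most relevant.
query = "What does ze-monitor do?"
query_vec = embed(query)
best_chunk, _ = max(index, key=lambda item: cosine(query_vec, item[1]))

# 3. Augment the user's query with that context information.
response = ollama.chat(
    model="deepseek-r1:7b",
    messages=[{
        "role": "user",
        "content": f"Context:\n{best_chunk}\n\nQuestion: {query}",
    }],
)
print(response["message"]["content"])
```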
## Fine tuning
Fine tuning is great if you have static content, as applying the knowledge to the model requires the model to go through a training phase which can take several hours or longer, depending on the amount of information being updated. The approach to fine-tune is:
1. Process your data into a series of 'context', 'query', 'response' correlations
2. Train the model using those correlations
The main work involved is in #1, and the success of fine tuning will be greatly impacted by the method used to perform it. Since having an expert manually generate queries and responses can be time consuming, the prompt-augmentation approach from the first method can be used to generate them instead.
For this, you can iteratively perform the following:
### Query generation
1. Context data
2. Prompt: Given the context, create a list of questions about the topic. Do not provide answers.
### Response generation
1. Context data
2. Prompt: Given the context, respond to the following question: {query}
Repeat the above for each piece of context data. This is now your expert system which you can use to fine-tune your model.
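A rough sketch of that generation loop, again using the `ollama` client to stand in for whichever model plays the expert role (the prompts, line-based parsing, and output file name are illustrative assumptions):
```python
# Sketch of the two-pass data generation described above: pass 1 asks the
# expert model for questions about each context chunk, pass 2 asks it to
# answer each question given that chunk.
import json
import ollama

def ask(prompt: str) -> str:
    reply = ollama.chat(model="deepseek-r1:7b",
                        messages=[{"role": "user", "content": prompt}])
    return reply["message"]["content"]

contexts = ["James developed ze-monitor, a Level Zero GPU monitoring utility."]
pairs = []
for context in contexts:
    # Query generation
    questions = ask(
        f"Context:\n{context}\n\n"
        "Given the context, create a list of questions about the topic. "
        "Do not provide answers. Return one question per line."
    )
    for question in filter(None, (q.strip() for q in questions.splitlines())):
        # Response generation
        answer = ask(
            f"Context:\n{context}\n\n"
            f"Given the context, respond to the following question: {question}"
        )
        pairs.append({"context": context, "question": question, "answer": answer})

# The resulting correlations can then be fed to a fine-tuning script.
with open("qa_pairs.json", "w") as fp:
    json.dump(pairs, fp, indent=2)
```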
PROS: Not too difficult to implement and can give much better responses to queries about items covered in the context.
CONS: Updating the model with changes in the context requires retraining, and training takes a lot of system resources.
With the Intel Arc B580, I was not able to fine-tune on a full 7B parameter model and had to use a smaller 1.5B parameter model. To train using the alpaca-clean dataset takes 3-5 hours. To generate the correlations, the full 7B model was used in order to get better questions and answers. That data was then used to fine-tune the 1.5B model.
As inference using deepseek-r1 can take 15 seconds or so per query (using the 7B model) and queries on neuralchat-7b are nearly instantaneous, I will explore creating correlations using deepseek, and then use that data to train neuralchat.
# Approach taken in resume-bot
I tried several techniques, and have collected example output.
1. In-context via pre-embedding context tokens (ollama TRAINING)
2. In-context via full-context in-query
3. In-context via relevant text (traditional RAG)
4. Fine-tune
# Torch vs Ollama
Ollama is easy to set up, and it performs well. However, it does not expose a method for fine-tuning a model beyond the TRAINING template, which does not adjust model weights and is more akin to in-context training.
torch is a little more difficult to set up, and it too performs well. With the vast collection of libraries and infrastructure available, fine-tuning using torch is relatively straightforward.
Once you have a fine-tuned model, you can use that model with ollama or torch. I have run the resume-bot using both torch and ollama on an Intel Core i9-14900KS with 64G of RAM and an Intel Arc B580 GPU with 12G of RAM. Below are some metrics gathered while running several query passes:
| | ollama-ipex-llm | pytorch w/ ipex-llm |
|:---------------------|:----------------|:--------------------|
| Query time | | |
| Concurrent queries | | |
### How Ollama Uses the TRAINING Section
The `TRAINING` section in an Ollama Modelfile works differently than traditional fine-tuning methods. Here's how Ollama uses it:
1. **Not True Parameter Fine-tuning**:
- Unlike traditional fine-tuning that updates model weights through backpropagation, Ollama doesn't modify the underlying model parameters
- The examples in `TRAINING` don't trigger a training loop or gradient updates
2. **Template-Based Learning**:
- Ollama uses these examples as additional context when the model is created
- The examples effectively become part of the model's "knowledge"
- This is more like instruction-tuning through examples than actual parameter updates
3. **Implementation Details**:
- The examples are processed during model creation
- They're tokenized and stored alongside the model
- When running inference, Ollama doesn't directly include these examples in every prompt
- Instead, the model is influenced by having processed these examples during creation
4. **Technical Mechanism**:
- The exact implementation varies by model architecture
- For many models, Ollama prepends these examples during the model creation process
- This shapes the model's understanding without modifying weights
- It's similar to how system prompts work but applied at model creation time
5. **Limitations**:
- The effectiveness depends on the base model's capability
- It works best for teaching patterns and preferred response styles
- It's less effective for teaching new facts or complex reasoning
- The number of examples is limited by context window size
### Practical Considerations
- Use concise, high-quality examples that demonstrate the exact behavior you want
- Focus on patterns rather than specific facts
- Include diverse examples covering different aspects of desired behavior
- For best results, combine with well-crafted system prompts
- Remember that this isn't true fine-tuning - it's more like "example-based conditioning"
This approach works well for adapting model style and format, but for more substantial changes to model behavior, traditional fine-tuning frameworks that update weights (like those in Hugging Face's ecosystem) would be more effective.

105
doc/projects/airc.txt Normal file

@ -0,0 +1,105 @@
# AIRC (pronounced Eric)
AI is Really Cool
This project provides a simple IRC chat client. It runs the neuralchat model, enhanced with a little bit of RAG to fetch news RSS feeds.
Internally, it is built using PyTorch 2.6 and the Intel IPEX/LLM.
NOTE: If running on an Intel Arc A series graphics processor, fp64 is not supported; it may need to be emulated, or the model may need to be quantized. It has been a while since I've had an A series GPU to test on, so if you run into problems please file an [issue](https://github.com/jketreno/airc/issues)--I have some routines I can put in, but don't have a way to test them.
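If you want to experiment before filing an issue, the Intel compute runtime provides an FP64 software emulation path that can be toggled through environment variables; this is a hedged example (variable names come from the compute-runtime documentation, and behavior depends on your driver version), not something airc configures for you:
```bash
# Enable FP64 emulation in the Intel compute runtime / IGC.
# Set these in the environment the workload runs in (e.g. inside the container).
export OverrideDefaultFP64Settings=1
export IGC_EnableDPEmulation=1
```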
# Installation
This project uses docker containers to build. As this was originally written to work on an Intel Arc B580 (Battlemage), it requires a kernel that supports that hardware, such as the one documented at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10).
NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
## Want to run under WSL2? No can do...
https://www.intel.com/content/www/us/en/support/articles/000093216/graphics/processor-graphics.html
The A- and B-series discrete GPUs do not support SR-IOV, required for the GPU partitioning that Microsoft Windows uses in order to support GPU acceleration in WSL.
## Building
NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
```bash
git clone https://github.com/jketreno/airc
cd airc
docker compose build
```
## Running
In order to download the models, you need to have a Hugging Face token. See https://huggingface.co/settings/tokens for information on obtaining a token.
Edit .env to add the following:
```.env
HF_ACCESS_TOKEN=<access token from huggingface>
```
NOTE: Models downloaded by most examples will be placed in the ./cache directory, which is bind mounted to the container.
### AIRC
To launch the airc shell interactively, with the pytorch 2.6 environment loaded, use the default entrypoint to launch a shell:
```bash
docker compose run --rm airc shell
```
Once in the shell, you can then launch the model-server.py and then the airc.py client:
```bash
docker compose run --rm airc shell
src/airc.py --ai-server=http://localhost:5000 &
src/model-server.py
```
By default, src/airc.py will connect to irc.libera.chat on the airc-test channel. See `python src/airc.py --help` for options.
By separating the model-server into its own process, you can develop and tweak the chat backend without losing the IRC connection established by airc.
### Jupyter
```bash
docker compose up jupyter -d
```
The default port for inbound connections is 8888 (see docker-compose.yml). $(pwd)/jupyter is bind mounted to /opt/jupyter in the container, which is where notebooks will be saved by default.
To access the jupyter notebook, go to `https://localhost:8888/jupyter`.
### Monitoring
You can run `ze-monitor` within the launched containers to monitor GPU usage.
```bash
containers=($(docker ps --filter "ancestor=airc" --format "{{.ID}}"))
if [[ ${#containers[*]} -eq 0 ]]; then
echo "Running airc container not found."
else
for container in ${containers[@]}; do
echo "Container ${container} devices:"
docker exec -it ${container} ze-monitor
done
fi
```
If an airc container is running, you should see something like:
```
Container 5317c503e771 devices:
Device 1: 8086:A780 (Intel(R) UHD Graphics 770)
Device 2: 8086:E20B (Intel(R) Graphics [0xe20b])
```
You can then launch ze-monitor in that container specifying the device you wish to monitor:
```
containers=($(docker ps --filter "ancestor=airc" --format "{{.ID}}"))
docker exec -it ${containers[0]} ze-monitor --device 2
```

279
doc/projects/ze-monitor.txt Normal file

@ -0,0 +1,279 @@
# ze-monitor
A small utility to monitor Level Zero devices via
[Level Zero Sysman](https://oneapi-src.github.io/level-zero-spec/level-zero/latest/sysman/PROG.html#sysman-programming-guide)
from the command line, similar to 'top'.
# Installation
Requires Ubuntu Oracular 24.10.
## Easiest
### Install prerequisites
This will add the [Intel Graphics Preview PPA](https://github.com/canonical/intel-graphics-preview) and install the required dependencies:
```bash
sudo apt-get install -y \
software-properties-common \
&& sudo add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& sudo apt-get update \
&& sudo apt-get install -y \
libze1 libze-intel-gpu1 libncurses6
```
### Install ze-monitor from .deb package
This will download the ze-monitor .deb package from GitHub, install it, and add the current
user to the 'ze-monitor' group to allow running the utility:
```bash
version=0.3.0-1
wget https://github.com/jketreno/ze-monitor/releases/download/v${version}/ze-monitor-${version}_amd64.deb
sudo dpkg -i ze-monitor-${version}_amd64.deb
sudo usermod -a -G ze-monitor $(whoami)
newgrp ze-monitor
```
Congratulations! You can run ze-monitor:
```bash
ze-monitor
```
You should see something like:
```bash
Device 1: 8086:A780 (Intel(R) UHD Graphics 770)
Device 2: 8086:E20B (Intel(R) Graphics [0xe20b])
```
To monitor a device:
```bash
ze-monitor --device 2
```
Check the docs (`man ze-monitor`) for additional details on running the ze-monitor utility.
## Slightly more involved
This project uses docker containers to build. As this was originally written to monitor an Intel Arc B580 (Battlemage), it requires a kernel that supports that hardware, such as the one documented at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10). It will monitor any Level Zero device, even those using the i915 driver.
NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/)
```
git clone https://github.com/jketreno/ze-monitor.git
cd ze-monitor
docker compose build
sudo apt install libze1 libncurses6
version=$(cat src/version.txt)
docker compose run --remove-orphans --rm \
ze-monitor \
cp /opt/ze-monitor-static/build/ze-monitor-${version}_amd64.deb \
/opt/ze-monitor/build
sudo dpkg -i build/ze-monitor-${version}_amd64.deb
```
# Security
In order for ze-monitor to read the performance monitoring unit (PMU) data in the Linux kernel, it needs elevated permissions. The easiest way is to install the .deb package and add the user to the ze-monitor group. Or, run under sudo (e.g., `sudo ze-monitor ...`).
The specific capabilities required to monitor the GPU are documented in [Perf Security](https://www.kernel.org/doc/html/v5.1/admin-guide/perf-security.html) and [man capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html). These include:
| Capability | Reason |
|:--------------------|:-----------------------------------------------------|
| CAP_DAC_READ_SEARCH | Bypass all filesystem read access checks |
| CAP_PERFMON | Access to perf_events (vs. overloaded CAP_SYS_ADMIN) |
| CAP_SYS_PTRACE | PTRACE_MODE_READ_REALCREDS ptrace access mode check |
To configure ze-monitor to run with those privileges, you can use `setcap` to set the correct capabilities on ze-monitor. You can further secure your system by creating a user group specifically for running the utility and restrict running of that command to users in that group. That is what the .deb package does.
If you install the .deb package from a [Release](https://github.com/jketreno/ze-monitor/releases) or by building it, that package will set the appropriate permissions for ze-monitor on installation and set it executable only to those in the 'ze-monitor' group.
## Anyone can run ze-monitor
If you build from source and want to set the capabilities:
```bash
sudo setcap "cap_perfmon,cap_dac_read_search,cap_sys_ptrace=ep" build/ze-monitor
getcap build/ze-monitor
```
Any user can then run `build/ze-monitor` and monitor the GPU.
# Build outside container
## Prerequisites
If you would like to build outside of docker, you need the following packages installed:
```
sudo apt-get install -y \
build-essential \
libfmt-dev \
libncurses-dev
```
In addition, you need the Intel drivers installed, which are available from the `kobuk-team/intel-graphics` PPA:
```
sudo apt-get install -y \
software-properties-common \
&& sudo add-apt-repository -y ppa:kobuk-team/intel-graphics \
&& sudo apt-get update \
&& sudo apt-get install -y \
libze-intel-gpu1 \
libze1 \
libze-dev
```
## Building
```
cd build
cmake ..
make
```
## Running
```
build/ze-monitor
```
## Build and install .deb
In order to build the .deb package, you need the following packages installed:
```bash
sudo apt-get install -y \
debhelper \
devscripts \
rpm \
rpm2cpio
```
You can then build the .deb:
```bash
if [ -d build ]; then
cd build
fi
version=$(cat ../src/version.txt)
cpack
sudo dpkg -i build/packages/ze-monitor_${version}_amd64.deb
```
You can then run ze-monitor from your path:
```bash
ze-monitor
```
# Developing
To run the built binary without building a full .deb package, you can build and run on the host by compiling in the container:
```
docker compose run --rm ze-monitor build.sh
build/ze-monitor
```
The build.sh script will build the binary in /opt/ze-monitor/build, which is volume mounted to the host's build directory.
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
# Running
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
If running within a docker container, the container environment does not have access to the host's `/proc/fd`, which is necessary to obtain information about the processes outside the current container which are using the GPU. As such, only processes running within that container running ze-monitor will be listed as using the GPU.
## List available devices
```
ze-monitor
```
Example output:
```bash
$ ze-monitor
Device 1: 8086:E20B (Intel(R) Graphics [0xe20b])
Device 2: 8086:A780 (Intel(R) UHD Graphics 770)
```
## Show details for a given device
```
sudo ze-monitor --info --device ( PCIID | # | BDF | UUID | /dev/dri/render*)
```
Example output:
```bash
$ sudo ze-monitor --device 2 --info
Device: 8086:A780 (Intel(R) UHD Graphics 770)
UUID: 868080A7-0400-0000-0002-000000000000
BDF: 0000:0000:0002:0000
PCI ID: 8086:A780
Subdevices: 0
Serial Number: unknown
Board Number: unknown
Brand Name: unknown
Model Name: Intel(R) UHD Graphics 770
Vendor Name: Intel(R) Corporation
Driver Version: 0CB7EFCAD5695B7EC5C8CE6
Type: GPU
Is integrated with host: Yes
Is a sub-device: No
Supports error correcting memory: No
Supports on-demand page-faulting: No
Engines: 7
Engine 1: ZES_ENGINE_GROUP_RENDER_SINGLE
Engine 2: ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE
Engine 3: ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE
Engine 4: ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE
Engine 5: ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE
Engine 6: ZES_ENGINE_GROUP_COPY_SINGLE
Engine 7: ZES_ENGINE_GROUP_MEDIA_ENHANCEMENT_SINGLE
Temperature Sensors: 0
```
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
## Monitor a given device
```
sudo ze-monitor --device ( PCIID | # | BDF | UUID | /dev/dri/render* ) \
--interval ms
```
NOTE: See [Security](#security) for information on running ze-monitor with required kernel access capabilities.
Output:
```bash
$ sudo ze-monitor --device 2 --interval 500
Device: 8086:E20B (Intel(R) Graphics [0xe20b])
Total Memory: 12809404416
Free memory: [# 55% ############################ ]
Power usage: 165.0W
------------------------------------------------------------------------------------------
PID COMMAND-LINE
USED MEMORY SHARED MEMORY ENGINE FLAGS
------------------------------------------------------------------------------------------
1 /sbin/init splash
MEM: 106102784 SHR: 100663296 FLAGS: RENDER COMPUTE
1606 /usr/lib/systemd/systemd-logind
MEM: 106102784 SHR: 100663296 FLAGS: RENDER COMPUTE
5164 /usr/bin/gnome-shell
MEM: 530513920 SHR: 503316480 FLAGS: RENDER COMPUTE
5237 /usr/bin/Xwayland :1024 -rootless -nores...isplayfd 6 -initfd 7 -byteswappedclients
MEM: 0 SHR: 0 FLAGS:
40480 python chat.py
MEM: 5544226816 SHR: 0 FLAGS: DMA COMPUTE
```
If you pass `--one-shot`, statistics will be gathered, displayed, and then ze-monitor will exit.

56
doc/resume/generic.txt Normal file

@ -0,0 +1,56 @@
# JAMES KETRENOS
software architect, designer, developer, and team lead
Beaverton, OR 97003
james@ketrenos.com
(503) 501 8281
Seeking an opportunity to contribute to the advancement of energy efficient AI solutions, James is a driven problem solver, solution creator, technical leader, and skilled software developer focused on rapid, high-quality results, with an eye toward bringing solutions to the market.
## SUMMARY
Problem-solving: Trusted resource for executive leadership, able to identify opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers.
Proficient: Adept in compiled and interpreted languages, the software frameworks built around them, and front- and backend infrastructure. Leveraging deep and varied experience to quickly find solutions. Rapidly familiarizes and puts to use new and emerging technologies.
Experienced: 20+ years of experience as an end-to-end Linux software architect, team lead, developer, system administrator, and user. Working with teams to bring together technologies into existing ecosystems for a myriad of technologies.
Leader: Frequent project lead spanning all areas of development and phases of the product life cycle from pre-silicon to post launch support. Capable change agent and mentor, providing technical engineering guidance to multiple teams and organizations.
Communicates: Thrives on helping people solve problems, working to educate others to help them better understand problems and work toward solutions.
## RECENT HISTORY
2024-2025: Present
* Developed 'ze-monitor', a lightweight C++ Linux application leveraging Level Zero Sysman APIs to provide 'top' like device monitoring of Intel GPUs. https://github.com/jketreno/ze-monitor
* Developed 'airc', an LLM pipeline allowing interactive queries about James' resume. Utilizing both in-context and fine-tuned approaches, questions asked about James are answered using information from his resume and portfolio. Includes a full-stack React web UI, a command line client, and an IRC bot integration. https://github.com/jketreno/airc
2018-2024: Intel® Graphics Software Staff Architect and Lead
* Redefined how Intel approaches graphics enabling on Linux to meet customer and product timelines.
* Spearheaded internal projects to prove out the developer and customer deployment experience when using Intel graphics products with PyTorch, working to ensure all ingredients are available and consumable for success (from kernel driver integration, runtime, framework integration, up to containerized Python workload solution deployment.)
* Focused on improving the customer experience for Intel graphics software for Linux in the data center, high-performance compute clusters, and end users. Worked with several teams and business units to close gaps, improve our software, documentation, and release methodologies.
* Worked with hardware and firmware teams to scope and define architectural solutions for customer features.
1998-2018: Open Source Software Architect and Lead
* Defined software architecture for handheld devices, tablets, Internet of Things, smart appliances, and emerging technologies. Key resource to executive staff to investigate emerging technologies and drive solutions to close existing gaps
* James' career at Intel has been diverse. His strongest skills are related to quickly ramping on technologies being utilized in the market, identifying gaps in existing solutions, and working with teams to close those gaps. He excels at adopting and fitting new technology trends as they materialize in the industry.
## PROLONGED HISTORY
The following are technical areas James has been an architect, team lead, and/or individual contributor:
* Linux release infrastructure overhaul: Identified bottlenecks in the CI/CD build pipeline, built proof-of-concept, and moved to production for generating releases of Intel graphics software (https://dgpu-docs.intel.com) as well as internal dashboards and infrastructure for tracking build and release pipelines. JavaScript, HTML, Markdown, RTD, bash/python, Linux packaging, Linux repositories, Linux OS release life cycles, sqlite3. Worked with multiple teams across Intel to meet Intel's requirements for public websites as well as to integrate with existing build and validation methodologies while educating teams on tools and infrastructure available from the ecosystem (vs. roll-your-own).
* Board Explorer: Web app targeting developer ecosystem to utilize new single board computers, providing quick access to board details, circuits, and programming information. Delivered as a pure front-end service (no backend required) https://board-explorer.github.io/board-explorer/#quark_mcu_dev_kit_d2000. Tight coordination with UX design team. JavaScript, HTML, CSS, XML, hardware specs, programming specs.
* (internal) Travel Requisition: Internal HTML application and backend enabling internal organizations to request travel approval and a manager front end to track budgetary expenditures in order to determine approval/deny decisions. NodeJS, JavaScript, Polymer, SQL. Tight coordination with internal requirements providers and UX design teams.
* Developer Journey: Web infrastructure allowing engineers to document DIY processes. Front end for parsing, viewing, and following projects. Back end for managing content submitted (extended markdown) including images, videos, and screencasts. Tight coordination with UX design team.
* Robotics: Worked with teams aligning on a ROS (Robot OS) roadmap and alignment. Presented at Embedded Linux conference on the state of open source and robotics. LIDAR, Intel RealSense, opencv, python, C. Developed a robotic vision controlled stewart platform that could play the marble game labyrinth.
* Moblin and MeeGo architect: Focused on overall software architecture as well as moving forward multi-touch and the industry shift to resolution independent applications; all in a time before smart phones as we know them today. Qt, HTML5, EFL.
* Marblin: An HTML/WebGL graphical application simulating the 2D collision physics of marbles in a 3D rendered canvas.
* Linux Kernel: Developed and maintained initial Intel Pro Wireless 2100, 2200, and 3945 drivers in the Linux kernel. C, Software Defined Radios, IEEE 802.11, upstream kernel driver, team lead for team that took over the Intel wireless drivers, internal coordination regarding technical and legal issues surrounding the wireless stack.
* Open source at Intel: Built proof-of-concepts to illustrate to management the potential and opportunities for Intel by embracing open source and Linux.
* Intel Intercast Technology: Team lead for Intel Intercast software for Windows. Worked with 3rd party companies to integrate the technology into their solutions.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -0,0 +1,57 @@
{
"file_path": "../doc/resume/generic.txt",
"processed_at": "2025-03-12T19:22:00.311002",
"context": "# JAMES KETRENOS\nsoftware architect, designer, developer, and team lead\nBeaverton, OR 97003\n\njames@ketrenos.com\n(503) 501 8281\n\nSeeking an opportunity to contribute to the advancement of energy efficient AI solutions, James is a driven problem solver, solution creator, technical leader, and skilled software developer focused on rapid, high-quality results, with an eye toward bringing solutions to the market.\n\n## SUMMARY\n\nProblem-solving: Trusted resource for executive leadership, able to identify opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers.\n\nProficient: Adept in compiled and interpreted languages, the software frameworks built around them, and front- and backend infrastructure. Leveraging deep and varied experience to quickly find solutions. Rapidly familiarizes and puts to use new and emerging technologies.\n\nExperienced: 20+ years of experience as an end-to-end Linux software architect, team lea...",
"qa_pairs": [
{
"question": "What is James' primary role in the software architecture field?",
"answer": "James' primary role in the software architecture field is focused on developing and maintaining the Linux kernel, ensuring compatibility with Intel wireless drivers. He also contributes to hardware design, particularly in the development of the Intel Intercast technology.",
"validation": "**Assessment:**\n\n1. **ACCURACY:** \n The answer accurately mentions James' primary role in the software architecture field, focusing on kernel development for Intel wireless and contributing to hardware design, including Intel Intercast. However, it omits details about other contributions, such as kernel development for other hardware projects.\n\n2. **COMPLETENESS:** \n The answer is incomplete as it does not provide information on James' work in hardware design beyond Intel Intercast, such as kernel development for other hardware.\n\n3. **RELEVANCE:** \n The answer is relevant, covering James' kernel development for Intel wireless and his contribution to hardware design.\n\n4. **NATIVITY:** \n The answer is natural and conversational.\n\n5. **DIVERSITY:** \n The answer is somewhat diverse, covering different areas of software architecture.\n\n6. **IMPROVEMENTS:** \n The answer could include more details on James' contributions to hardware design, such as his kernel development for Intel wireless or other hardware projects.\n\n**Decision:** \nREJECT"
},
{
"question": "Can you explain the development of the 'ze-monitor' application?",
"answer": "The 'ze-monitor' application was developed as a lightweight C++ Linux application that leverages Level Zero Sysman APIs to provide 'top' like device monitoring for Intel GPUs. This project was part of James' experience working with Linux and Level Zero Sysman APIs, focusing on leveraging new technologies and ensuring seamless integration with existing systems. The development process involved identifying opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers. This led to the creation of a lightweight application that integrates Level Zero Sysman APIs, enabling device monitoring for Intel GPUs. The detailed development process involved collaboration with the team, ensuring that the application met the technical requirements and worked seamlessly with existing systems.",
"validation": "**Answer:**\n\nThe 'ze-monitor' application was developed as a lightweight C++ Linux application that leverages Level Zero Sysman APIs to provide device monitoring for Intel GPUs. This project was part of James' experience working with Linux and Level Zero Sysman APIs, focusing on leveraging new technologies and ensuring seamless integration with existing systems. The development process involved identifying opportunities to bridge technical gaps, adopt new technologies, and improve efficiency and quality for internal and external customers. The application was created to integrate Level Zero Sysman APIs, enabling device monitoring for Intel GPUs, while ensuring seamless integration with existing systems.\n\n**Assessment:**\n\n- **VERDICT:** [ACCEPT]\n\n- **REASONING:** The answer is accurate, comprehensive, relevant, and natural. It explains the technical aspects of the application and its development, which is useful for understanding the context provided. It also mentions how the answer was generated and its contribution to the overall knowledge base, making it somewhat complete.\n\n- **IMPROVEMENTS:** To add diversity, the answer could mention other areas where similar applications were developed, or highlight the broader impact of these technologies. For example, mentioning other device monitoring applications developed by James or other developers could add more diversity. Also, including how these technologies contribute to broader areas like cybersecurity or efficiency could enhance the answer's breadth and relevance."
},
{
"question": "How did the 'airc' project utilize James' resume data?",
"answer": "James' resume data was utilized in the development of the 'airc' project, which is a command line client, interactive query system, and IRC bot integration. This project leverages his extensive experience in software architecture and development, particularly in areas such as Linux release infrastructure and system architecture.",
"validation": "- VERDICT: [ACCEPT]\n- REASONING: The answer is accurate, complete, relevant, and natural. It provides specific details and adds some diversity by focusing on a single project and experience area. No further improvements are needed.\n- IMPROVEMENTS: None needed, as the answer meets all criteria effectively.\n\nAnswer:\nJames' resume data was utilized in the development of the 'airc' project, which includes a command line client, interactive query system, and IRC bot integration. This project leverages his extensive experience in software architecture and development, particularly in areas such as Linux release infrastructure and system architecture."
},
{
"question": "What is James' experience with Linux kernel driver integration?",
"answer": "James' experience with Linux kernel driver integration includes working on the Intel Wireless drivers, where he contributed to the development of kernel driver integration, utilizing tools like Linux packaging and managing CI/CD pipelines for release infrastructure.",
"validation": "**Answer:**\n\nJames' experience with Linux kernel driver integration includes his contributions to the development of kernel driver integration for Intel's Wireless drivers and his role in managing CI/CD pipelines for projects like Open Source at Intel.\n\n**Evaluation:**\n\n- **VERDICT:** [ACCEPT]\n- **REASONING:** The answer accurately describes James' experience with kernel driver integration, providing specific examples and context. It is relevant, natural, and adds diversity to his expertise.\n- **IMPROVEMENTS:** If possible, enhancing the answer with more details about the tools or processes used could further enhance naturalness. However, as provided, it is sufficiently detailed.\n- **DIVERSITY:** The answer naturally incorporates James' work in kernel driver integration, which is a natural addition to his role and provides a new angle for understanding his expertise.\n\nThis answer is appropriate and suitable for training purposes."
},
{
"question": "How did James contribute to the development of Intel RealSense hardware?",
"answer": "James contributed to the development of Intel RealSense hardware through the development of a command line client and an IRC bot integration for the 'airc' project.",
"validation": "**Answer:**\n\nJames contributed to the development of Intel RealSense hardware through the creation of a command line client and an IRC bot integration for the 'airc' project. This involved integrating AI capabilities into the 'ze-monitor' project, allowing for device monitoring of Intel GPUs, which was part of the broader effort to advance energy-efficient AI solutions. However, the answer does not directly address how James contributed to Intel RealSense hardware, focusing instead on his other projects. This could be an area for improvement to better align with the context provided.\n\n**Assessment (VERDICT):** \nThe answer is accurate and provides specific information about James' contributions but lacks essential details about his role in Intel RealSense hardware development. It is a bit verbose but could be more concise. The response is appropriate but could be enhanced to better meet the quality standards by providing a clearer link to Intel RealSense.\n\n**Reasoning:** \nThe answer accurately describes James' contributions but does not address the specific impact on Intel RealSense hardware. It is a sufficient answer for the given question, but it could be improved for more clarity and conciseness.\n\n**IMPROVEMENTS:** \n1. Clarify how James contributed to Intel RealSense hardware, linking his other projects to RealSense development.\n2. Provide a more direct explanation of his role in the 'airc' project and its relation to Intel RealSense.\n3. Offer a brief overview of Intel RealSense hardware to better place James' contributions within the context."
},
{
"question": "What is the current state of James' open-source contributions?",
"answer": "James Ketrenos is actively engaged in several open-source projects, contributing to the development and enhancement of various tools and technologies. His contributions include:\n\n1. **Ze-Monitor**: A lightweight application that leverages Level Zero Sysman APIs for device monitoring of Intel GPUs. This tool is part of his contributions to the Linux community.\n\n2. **Airc**: A LLM pipeline that facilitates interactive queries about James' resume, showcasing his proficiency in using LLMs to assist with resume optimization and internal communication.\n\n3. **Rebound**: A React web UI that integrates various components such as a command line client, an IRC bot, and an IRC bot integration, demonstrating his ability to create user-friendly interfaces for complex systems.\n\n4. **Rebound-WebUI**: A full-stack React web UI that includes a command line client, an IRC bot, and an IRC bot integration, highlighting his expertise in developing integrated web applications.\n\n5. **Rebound-React-UI**: A React web UI that includes a command line client and an IRC bot integration, showcasing his skill in creating web interfaces for technical solutions.\n\n6. **Rebound-React-UI-IRC**: An integrated React web UI with a command line client, an IRC bot, and an IRC bot integration, further illustrating his ability to create effective and user-friendly interfaces.\n\nThese contributions reflect James' active participation in the open-source community and his dedication to advancing technology through innovative solutions.",
"validation": "** Assessment: [ACCEPT]**\n\n**Reasoning:**\n1. **ACCURACY:** The answer accurately describes James Ketrenos's open-source contributions, mentioning specific tools and projects.\n2. **COMPLETENESS:** The answer fully answers the question by detailing James's involvement in multiple projects.\n3. **RELEVANCE:** The question is meaningful and relevant to the context, focusing on James's role in the Linux community.\n4. **NATURALNESS:** The language is natural and conversational, avoiding forced or unnatural phrasing.\n5. **DIVERSITY:** The pair adds naturalness to the dataset, enhancing its overall quality.\n\n**Improvements:**\n- To further enhance naturalness, examples could include using \"partner\" instead of \"contribution,\" making the answer more relatable.\n- Additional context about James's contributions could be added, such as his role in specific projects or his impact on the community."
},
{
"question": "How has James' expertise in kernel driver development impacted the CI/CD pipeline?",
"answer": "James' expertise in kernel driver development has significantly impacted the CI/CD pipeline. Specifically, his work on Intel graphics software has enabled the efficient deployment of Intel graphics products within the CI/CD pipeline. By optimizing kernel drivers, James ensured that the Linux kernel could handle the CI/CD processes, such as deployments and releases, thereby enhancing the software's performance and reliability. This integration has allowed for better utilization of Intel graphics, improving the overall efficiency and customer satisfaction of the products.",
"validation": "**Answer:**\n\nThe provided answer is well-received, with a naturalness score of 0.8 due to minor suggestion of making it more conversational. It is accurate, complete, and relevant, with a naturalness score of 0.8 due to minor suggestion of making it more conversational.\n\n**Evaluation:**\n\n- **ACCURACY:** Yes. The answer correctly links kernel driver expertise to CI/CD pipeline efficiency.\n- **COMPLETENESS:** Yes. The answer provides a clear and detailed link between kernel driver development and CI/CD processes.\n- **RELEVANCE:** Yes. The answer connects kernel driver expertise to CI/CD processes, which are relevant to software development and efficiency.\n- **NATURALNESS:** Good. The answer is written in a professional context, but could be more conversational.\n- **DIVERSITY:** Good. The answer focuses on specific technologies and their impact, but doesn't add much diversity.\n\n**Overall Assessment:** The task is suitable for training, with minor areas for improvement in naturalness. The answer is accurate, complete, and relevant, making it effective for quality assurance.\n\n**Answer:** VERDICT: [ACCEPT] \nREASONING: The answer accurately connects James' kernel driver expertise to CI/CD impact, demonstrating relevance and completeness. \nIMPROVEMENTS: The answer could be made more conversational by phrases like, \" James' work on Intel graphics software has enabled...\" \n- VERDICT: ACCEPT \n- REASONING: The answer is accurate, complete, and relevant. It effectively addresses the impact of kernel driver development on CI/CD. \n- IMPROVEMENTS: Suggesting more conversational phrasing to enhance naturalness."
},
{
"question": "What is the impact of James' work on the development of the Intel GPU driver?",
"answer": "James Keta renos's work has significantly impacted the development of the Intel GPU driver. Specifically, he was involved in the design and implementation of the Intel GPU driver, which led to the release of the Intel GPU driver version 3.3.1. Additionally, his contributions to the Linux kernel and development of tools like 'ze-monitor' and 'airc' have enhanced the efficiency of kernel-based graphics drivers. These advancements have improved the performance and reliability of the Intel GPU, contributing to the broader development of Linux kernel-based graphics drivers and related technologies.",
"validation": "**Answer:**\n\n**VERDICT:** ACCEPVED\n\n**REASONING:** The question is clear and the answer is thorough, addressing the impact of James Keta renos's work on the Intel GPU driver. However, the answer is somewhat technical and lacks a broader context and naturalness.\n\n**IMPROVEMENTS:** The answer could be made more conversational and broader to explain how James' work has directly impacted the development of the Intel GPU driver. It should also provide more context on how his contributions have influenced other areas of Linux or kernel-based software.\n\n**NOTICE:** The answer is comprehensive but lacks a more natural and broader context. It could be improved by including more conversational and contextual elements to better explain the impact and its broader significance."
},
{
"question": "How has James' experience with robotic vision applications influenced hardware design?",
"answer": "James' experience with robotic vision applications has significantly influenced hardware design by enabling the development of efficient and scalable computing solutions. By integrating robotic vision into hardware, he contributed to the design of hardware that can handle complex tasks requiring visual recognition, leading to improved performance in areas such as image processing and robotics. This has translated into hardware designs that enhance computational capabilities and efficiency, directly impacting the development of systems requiring high-performance computing.",
"validation": "- VERDICT: REJECT\n- REASONING: The answer is technically accurate and relevant, but it lacks specific examples and context, making it less natural and less detailed than necessary.\n- IMPROVEMENTS: The answer should include concrete hardware design examples influenced by robotic vision, tie back to James' experience as a software architect, and provide more natural and detailed examples to make it more understandable."
},
{
"question": "What is James' role in the development of the Intel Intercast software technology?",
"answer": "James' role in the development of the Intel Intercast software technology is as a team leader and technical leader, responsible for driving the integration of the Intel Intercast technology into various solutions and ensuring its successful adoption.",
"validation": "**Answer:**\n\nJames' role in the development of the Intel Intercast software technology is as a team leader and technical leader, responsible for driving the integration of the Intel Intercast technology into various solutions and ensuring its successful adoption.\n\n**Evaluation:**\n\n1. **VERDICT: [ACCEPT]**\n - **REASONING:** The answer accurately describes James' role and contributes to the context about his work.\n\n2. **COMPLETENESS:** [REMAIN] The answer fully addresses the question by explaining James' role, so it's complete.\n\n3. **RELEVANCE:** [REMAIN] The answer clearly ties James' role to the Intel Intercast project, making it relevant.\n\n4. ** NATURALNESS:** [REMAIN] The answer uses professional language and is concise, which sounds natural.\n\n5. **DIVERSITY:** [REMAIN] The answer adds variety by mentioning multiple areas of James' work, which is important for training.\n\nOverall, this pair meets all the quality standards. It's accurate, complete, relevant, natural, and adds diversity, which is important for training."
}
]
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,293 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import List
import torch
import transformers
from datasets import load_dataset
import accelerate
from typing import Union
from datetime import datetime
from transformers import AutoTokenizer
from peft import (
get_peft_model_state_dict,
set_peft_model_state_dict,
)
from ipex_llm.transformers import AutoModelForCausalLM
# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
LoraConfig
import json
local_rank = 0
world_size = 1
class Prompter(object):
__slots__ = ("data")
def __init__(self, data_path: str = ""):
self.data = None
if not os.path.exists(data_path):
raise Exception(f"Can't read {data_path}")
with open(data_path) as fp:
self.data = json.load(fp)
def generate_prompt(
self,
instruction: str,
input: Union[None, str]=None,
label: Union[None, str]=None,
) -> str:
# returns the full prompt from instruction and optional input
# if a label (=response, =output) is provided, it's also appended.
if input:
ret = f"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
else:
ret = f"### Instruction:\n{instruction}\n\n### Response:\n"
if label:
ret = f"{ret}{label}"
return ret
def get_response(self, output: str) -> str:
return output.split("### Response:")[1].strip()
"""Data processing to get train data and val data"""
def tokenize_qa(prompt, tokenizer, max_length=1, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=max_length,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < max_length
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)
result["labels"] = result["input_ids"].copy()
return result
def generate_and_tokenize_prompt(entry, prompter, tokenizer, max_length, add_eos_token, train_on_inputs):
full_prompt = prompter.generate_prompt(
entry["question"],
None,
entry["answer"],
)
    print({'full_prompt': full_prompt})
tokenized_full_prompt = tokenize_qa(full_prompt, tokenizer, max_length=max_length)
if not train_on_inputs:
user_prompt = prompter.generate_prompt(
entry["question"], None
)
tokenized_user_prompt = tokenize_qa(
user_prompt, tokenizer, add_eos_token=add_eos_token, max_length=max_length
)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos_token:
user_prompt_len -= 1
# Mask out the input from the labels
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
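# Worked example of the masking above (added for illustration): if the tokenized
# user prompt is 10 tokens and the full prompt (question + answer) is 25 tokens,
# the labels become [-100] * 10 + input_ids[10:25], so loss is only computed on
# the answer tokens. With add_eos_token=True the user-only prompt gains a trailing
# EOS that the full prompt lacks at that position, so one is subtracted and the
# first answer token stays unmasked.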
def train(
# model/data params
base_model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
data_path: str = "./results/doc/resume/generic.txt.qa.json",
output_dir: str = "./bigdl-qlora-alpaca",
# training hyperparams
bf16: bool = True, # default to bf16
batch_size: int = 128,
micro_batch_size: int = 2, # default to be 2, limited by GPU memory
num_epochs: int = 3,
learning_rate: float = 3e-5, # default to be 3e-5 to avoid divergence
cutoff_len: int = 256,
val_set_size: int = 2000,
# lora hyperparams
lora_r: int = 8,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = [
"q_proj",
"v_proj",
"k_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
],
# llm hyperparams
train_on_inputs: bool = True, # if False, masks out inputs in loss
add_eos_token: bool = False,
group_by_length: bool = False, # faster, but produces an odd training loss curve
resume_from_checkpoint: str = "", # either training checkpoint or final adapter
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
gradient_checkpointing: bool = False,
training_mode: str = "lora",
save_checkpoint: bool = True,
):
if training_mode != "lora":
raise Exception(f"This example is for lora training mode, but got training_mode={training_mode}.")
print(
f"Training Alpaca-LoRA model with params:\n"
f"base_model: {base_model}\n"
f"data_path: {data_path}\n"
f"output_dir: {output_dir}\n"
f"batch_size: {batch_size}\n"
f"micro_batch_size: {micro_batch_size}\n"
f"num_epochs: {num_epochs}\n"
f"learning_rate: {learning_rate}\n"
f"cutoff_len: {cutoff_len}\n"
f"val_set_size: {val_set_size}\n"
f"lora_r: {lora_r}\n"
f"lora_alpha: {lora_alpha}\n"
f"lora_dropout: {lora_dropout}\n"
f"lora_target_modules: {lora_target_modules}\n"
f"train_on_inputs: {train_on_inputs}\n"
f"add_eos_token: {add_eos_token}\n"
f"group_by_length: {group_by_length}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"training_mode: {training_mode}\n"
f"save_checkpoint: {save_checkpoint}\n"
)
gradient_accumulation_steps = batch_size // micro_batch_size
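    # With the defaults above (batch_size=128, micro_batch_size=2) this gives
    # 64 accumulation steps, i.e. an effective batch of 128 examples per
    # optimizer update on a single device.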
prompter = Prompter("./results/doc/resume/generic.txt.qa.json")
model = AutoModelForCausalLM.from_pretrained(
base_model,
# load_in_4bit=True,
load_in_low_bit="bf16",
optimize_model=False,
# optimize_model=True,
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
)
# model = model.half()
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Prepare a IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
training_mode=training_mode,
)
print(f"Lora Config: {config}")
model = get_peft_model(model, config)
dataset = load_dataset("json", data_files=data_path)
    tokenized_dataset = dataset.map(generate_and_tokenize_prompt, fn_kwargs={
        "prompter": prompter,
        "tokenizer": tokenizer,
        "max_length": cutoff_len,
        "add_eos_token": add_eos_token,
        "train_on_inputs": train_on_inputs
    })  # generate_and_tokenize_prompt handles one example at a time, so batched mapping is not used
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
    split = tokenized_dataset["train"].train_test_split(test_size=0.2)  # load_dataset("json") returns a DatasetDict; split its "train" portion
train_dataset = split["train"]
eval_dataset = split["test"]
trainer = transformers.Trainer(
model=model,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
# warmup_ratio=0.03,
# warmup_steps=100,
max_grad_norm=0.3,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
lr_scheduler_type="cosine",
bf16=True, # ensure training more stable
logging_steps=1,
optim="adamw_torch",
evaluation_strategy="steps" if val_set_size > 0 else "no",
save_strategy="steps" if save_checkpoint else "no",
eval_steps=100 if val_set_size > 0 else None,
save_steps=100,
output_dir=output_dir,
save_total_limit=100,
load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
group_by_length=group_by_length,
gradient_checkpointing=gradient_checkpointing,
ddp_backend="ccl",
save_safetensors=False,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
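    # Note (added): ddp_backend="ccl" requests Intel oneCCL for distributed data
    # parallel. This script is launched as a single process in __main__, so the
    # setting only takes effect if run under a distributed launcher.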
model.config.use_cache = False
    # Trainer.train() does not take a data_path argument; the datasets were already passed to the Trainer above
    trainer.train(resume_from_checkpoint=resume_from_checkpoint or None)
model.save_pretrained(output_dir)
print(
"\n If there's a warning about missing keys above, please disregard :)"
)
if __name__ == "__main__":
train()

394
src/claude-fine-tune.py Normal file
View File

@ -0,0 +1,394 @@
import os
import json
import torch
import random
import glob
from datetime import datetime
from tqdm import tqdm
from transformers import (
AutoTokenizer,
TrainingArguments,
Trainer,
TrainerCallback,
TrainerState,
TrainerControl,
DataCollatorForLanguageModeling,
AutoModelForCausalLM,
BitsAndBytesConfig
)
import numpy as np
from peft import (
TaskType,
get_peft_model,
prepare_model_for_kbit_training,
LoraConfig,
)
from datasets import Dataset
def load_qa_data(results_dir):
"""
Load Q&A pairs from the processed results directory.
Args:
results_dir (str): Path to the directory containing JSON files with Q&A pairs
Returns:
list: List of prompt-completion pairs formatted for fine-tuning
"""
training_data = []
# Find all JSON files in the results directory and subdirectories
json_files = glob.glob(os.path.join(results_dir, "**/*.json"), recursive=True)
for json_file in tqdm(json_files, desc="Loading training data"):
try:
with open(json_file, 'r', encoding='utf-8') as f:
data = json.load(f)
# Extract context and QA pairs
context = data.get("context", "")
qa_pairs = data.get("qa_pairs", [])
for qa_pair in qa_pairs:
question = qa_pair.get("question", "")
answer = qa_pair.get("answer", "")
if question and answer:
# Format as instruction-based fine-tuning example
prompt = f"""<task>
Answer the following question based on the provided context.
CONTEXT:
{context}
QUESTION:
{question}
</task>
"""
# Add to training data
training_data.append({
"prompt": prompt,
"completion": answer
})
except Exception as e:
print(f"Error processing {json_file}: {e}")
continue
print(f"Loaded {len(training_data)} training examples")
return training_data
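# Expected input layout (inferred from the loader above, not a formal schema):
# each JSON file written by the processing step looks roughly like
#   {"context": "...source text...",
#    "qa_pairs": [{"question": "...", "answer": "...", "validation": "..."}]}
# and every question/answer pair is flattened into a {"prompt", "completion"}
# training example.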
def prepare_dataset_for_training(training_data, tokenizer, max_length=2048):
"""
Prepare the dataset for training by tokenizing and formatting.
Args:
training_data (list): List of prompt-completion pairs
tokenizer: DeepSeek tokenizer
max_length (int): Maximum sequence length
Returns:
Dataset: HuggingFace dataset ready for training
"""
def tokenize_function(examples):
# Combine prompt and completion
full_texts = []
for i in range(len(examples["prompt"])):
full_text = examples["prompt"][i] + examples["completion"][i]
full_texts.append(full_text)
if not tokenizer.pad_token:
tokenizer.pad_token = tokenizer.eos_token
# Tokenize
tokenized = tokenizer(
full_texts,
padding="max_length",
truncation=True,
max_length=max_length,
return_tensors="pt"
)
# Create labels (same as input_ids for causal LM)
tokenized["labels"] = tokenized["input_ids"].clone()
# Create attention mask for prompt tokens
for i in range(len(full_texts)):
prompt_length = len(tokenizer.encode(examples["prompt"][i]))
# Set labels for prompt tokens to -100 (ignored in loss calculation)
tokenized["labels"][i, :prompt_length] = -100
return tokenized
# Convert to HuggingFace Dataset
dataset_dict = {
"prompt": [item["prompt"] for item in training_data],
"completion": [item["completion"] for item in training_data]
}
# Create dataset
dataset = Dataset.from_dict(dataset_dict)
# Apply tokenization
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=["prompt", "completion"]
)
return tokenized_dataset
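# Note (added): the label masking above assumes right-sided padding, so that the
# prompt tokens occupy the first positions of each sequence; it also treats
# len(tokenizer.encode(prompt)) as the prompt's share of the combined encoding,
# which is a close approximation rather than an exact boundary for every tokenizer.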
def setup_lora_config():
"""
Configure LoRA hyperparameters.
Returns:
LoraConfig: Configuration for LoRA fine-tuning
"""
return LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8, #16, # Rank dimension
lora_alpha=16, #32, # Alpha parameter for LoRA scaling
lora_dropout=0.05, # Dropout probability for LoRA layers
target_modules=[ # Target modules to apply LoRA to
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
bias="none", # Whether to train bias parameters
fan_in_fan_out=False # Set to True for linear layers with fan_in != fan_out
)
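# Quick reference (added): with r=8 and lora_alpha=16 the adapter update is scaled
# by the conventional LoRA factor alpha / r = 2.0 before being added to the frozen
# base weights; increasing r grows adapter capacity (and trainable parameters)
# roughly linearly.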
class EarlyStoppingCallback(TrainerCallback):
"""
Callback that implements early stopping.
"""
def __init__(self, patience=3, min_delta=0.0):
"""
Args:
patience (int): Number of evaluations with no improvement after which training will be stopped.
min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
"""
self.patience = patience
self.min_delta = min_delta
self.best_score = None
self.counter = 0
self.early_stop = False
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
eval_loss = state.log_history[-1].get("eval_loss")
if eval_loss is None:
return
if self.best_score is None:
self.best_score = eval_loss
elif eval_loss > self.best_score - self.min_delta:
self.counter += 1
print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
if self.counter >= self.patience:
print(f"Early stopping triggered! Best loss: {self.best_score:.4f}")
self.early_stop = True
control.should_training_stop = True
else:
self.best_score = eval_loss
self.counter = 0
return control
def train_with_lora(model_name, training_data, output_dir, batch_size=4, num_epochs=3, learning_rate=2e-4):
"""
Fine-tune the DeepSeek-R1 model with LoRA.
Args:
model_name (str): Name of the DeepSeek model
training_data (list): List of prompt-completion pairs
output_dir (str): Directory to save the fine-tuned model
batch_size (int): Training batch size
num_epochs (int): Number of training epochs
learning_rate (float): Learning rate
"""
# Create output directory
os.makedirs(output_dir, exist_ok=True)
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=False,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()},
torch_dtype=torch.bfloat16,
)
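    # Note (added, hedged): 4-bit loading via BitsAndBytesConfig while targeting an
    # Intel XPU device assumes a bitsandbytes build with non-CUDA backend support;
    # if that is not available, the quantization_config would need to be dropped or
    # replaced with an XPU-friendly low-bit path.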
# Prepare the model for LoRA fine-tuning
# 1. Prepare for k-bit training if using quantization
model = prepare_model_for_kbit_training(model)
# 2. Add LoRA adapters
lora_config = setup_lora_config()
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Prepare dataset
tokenized_dataset = prepare_dataset_for_training(training_data, tokenizer)
# Split into train and evaluation sets
dataset_dict = tokenized_dataset.train_test_split(test_size=0.1)
# Set up training arguments
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
gradient_accumulation_steps=4,
evaluation_strategy="steps",
eval_steps=100, # More frequent evaluation to check stopping criteria
save_strategy="steps",
save_steps=500,
save_total_limit=3,
learning_rate=learning_rate,
warmup_steps=100,
weight_decay=0.01,
logging_dir=f"{output_dir}/logs",
logging_steps=100,
fp16=True,
report_to="none",
optim="adamw_torch",
load_best_model_at_end=True, # Load the best model when training ends
metric_for_best_model="eval_loss" # Use eval loss to determine the best model
)
# Create data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False # Use causal language modeling (not masked)
)
early_stopping_callback = EarlyStoppingCallback(patience=3, min_delta=0.01)
# Initialize trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset_dict["train"],
eval_dataset=dataset_dict["test"],
data_collator=data_collator,
callbacks=[early_stopping_callback] # Add the early stopping callback
)
# Start training
print("Starting LoRA fine-tuning...")
trainer.train()
model = trainer.model
print("LoRA fine-tuning complete...")
# Save the fine-tuned model
print("Saving model...")
model.save_pretrained(f"{output_dir}/final_model")
tokenizer.save_pretrained(f"{output_dir}/final_model")
print(f"Fine-tuned model saved to {output_dir}/final_model")
return model, tokenizer
# Main function
def main():
# Configuration
results_dir = "../results" # Directory with processed text files and QA pairs
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
#model_name = "Intel/neural-chat-7b-v3-3"
output_dir = f"./fine_tuned"
# Load training data from processed results
training_data = load_qa_data(results_dir)
# Shuffle training data
random.shuffle(training_data)
# Fine-tune the model
train_with_lora(
model_name=model_name,
training_data=training_data,
output_dir=output_dir,
batch_size=1, # Can't fit more than one on the B580
num_epochs=10,
learning_rate=2e-4
)
model_path = f"{output_dir}/final_model"
# Load the fine-tuned model and tokenizer
print("Loading fine-tuned model for evaluation...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_path,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=True,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()})
model = model.to('xpu')
print("Loaded. Ask your question, CTRL-C to exit.")
while True:
try:
question = input("> ").strip()
except KeyboardInterrupt:
print("\nExiting.")
break
prompt = f"""<task>
You are an assistant providing resume details about James Ketrenos. Answer the following question based on your knowledge. If you don't know, say so. Be concise.
QUESTION:
{question}
</task>
"""
# Tokenize and generate
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=1024,
do_sample=True,
temperature=0.7,
top_p=0.9,
)
# Decode the output
full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Print the full decoded output (the prompt-stripping slice is left commented out)
        answer = full_output  # [len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
print(answer.strip())
if __name__ == "__main__":
main()

360
src/claude-process.py Normal file
View File

@ -0,0 +1,360 @@
# Self-supervised synthetic data via sequential generation
import os
import re
import json
import torch
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
)
def load_deepseek_r1():
"""
Loads the DeepSeek-R1 model and tokenizer.
Returns:
tuple: (model, tokenizer) for DeepSeek-R1
"""
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=False,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()},
torch_dtype=torch.bfloat16,
)
if not model.config.pad_token_id:
model.config.pad_token_id = model.config.eos_token_id
return model, tokenizer
# Function to get answers from the model
def validate_question_answer_from_model(context, question, answer, model, tokenizer):
"""
Query the model to evaluate if an answer is a good fit for a question.
Args:
context (str): The text file content
question (str): The question
answer (str): The answer
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
Returns:
str: The model's evaluation to the question
"""
# Set up the prompt for answering the question
prompt = f"""<task>
You are a quality assurance expert reviewing question-answer pairs for an AI training dataset. Your task is to evaluate whether each pair meets our quality standards and is suitable for training.
For each question-answer pair, evaluate:
1. ACCURACY: Does the answer contain ONLY information from the context, without fabrications?
2. COMPLETENESS: Does the answer fully address the question using all relevant information?
3. RELEVANCE: Is the question meaningful and relevant to the context?
4. NATURALNESS: Do both question and answer sound natural and conversational?
5. DIVERSITY: Does this pair add variety to our dataset (not redundant with others)?
Context:
{context}
Question:
{question}
Answer:
{answer}
Provide your assessment as follows:
- VERDICT: [ACCEPT/REJECT]
- REASONING: [Brief explanation of your decision]
- IMPROVEMENTS: [Suggestions if needed]
</task>
"""
# Generate answer
raw_answer = generate_with_deepseek(prompt, model, tokenizer)
# Parse the response to get the actual answer
answer = parse_deepseek_response(raw_answer)
return answer
def generate_with_deepseek(prompt, model, tokenizer, max_length=4096):
"""
Generate text using DeepSeek-R1 model with proper handling of full output.
Args:
prompt (str): The input prompt
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
max_length (int): Maximum length of generated text
Returns:
str: Generated text response
"""
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
# Get the full generated sequence
generation_output = model.generate(
**inputs,
pad_token_id=tokenizer.eos_token_id,
max_new_tokens=max_length,
do_sample=True,
temperature=0.7,
top_p=0.9,
return_dict_in_generate=True,
output_scores=False
)
# Get the full output token IDs
output_token_ids = generation_output.sequences[0]
# Decode the full generated text (including the prompt)
full_output = tokenizer.decode(output_token_ids, skip_special_tokens=True)
# Extract only the response part by removing the prompt part
response = full_output[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
return response
def parse_deepseek_response(response):
"""
Parse the DeepSeek-R1 response.
This handles DeepSeek's thinking steps and separates them from the response.
Args:
response (str): The raw response from the DeepSeek model
Returns:
str: The cleaned answer part of the response
"""
# If the response has thinking tags, extract the actual answer
response = re.sub(r"^(<think>)?.*</think>", "", response, flags=re.DOTALL)
# If no special formatting detected, return the whole response
return response.strip()
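# Illustrative behaviour of the regex above (added; not output from an actual run):
#   parse_deepseek_response("<think>chain of thought...</think>\nFinal answer.")
#   -> "Final answer."
# Leading reasoning up to the closing </think> tag is stripped; responses without
# the tag are returned unchanged apart from surrounding whitespace.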
# Function to recursively walk a directory and process text files
def process_directory(directory_path, output_path="results", file_extensions=(".txt",".md"), batch_size=5):
"""
Recursively walks a directory, processes text files, and stores results.
Args:
directory_path (str): Path to directory containing text files
output_path (str): Path to store results
file_extensions (tuple): File extensions to process
batch_size (int): Number of files to process before clearing cache
"""
# Load the DeepSeek-R1 model
model, tokenizer = load_deepseek_r1()
# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
# Use tqdm for progress tracking
file_paths = []
for root, _, files in os.walk(directory_path):
for file in files:
if file.lower().endswith(file_extensions):
file_paths.append(os.path.join(root, file))
# Process files with batch-based memory management
for i, file_path in enumerate(tqdm(file_paths, desc="Processing files")):
process_file(file_path, model, tokenizer, output_path)
# Clear cache periodically to prevent memory issues
if (i + 1) % batch_size == 0:
if torch.xpu.is_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()
print(f"Processing complete. Results stored in {output_path}")
# Function to process a single text file
def process_file(file_path, model, tokenizer, output_path):
"""
Process a single text file by querying the model for questions and answers.
Args:
file_path (str): Path to the text file
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
output_path (str): Path to store results
"""
# Read the file content
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except Exception as e:
print(f"Error reading {file_path}: {e}")
return
# Skip empty files
if not content.strip():
print(f"Skipping empty file: {file_path}")
return
# Generate a relative output path that maintains the directory structure
relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path))
output_file = os.path.join(output_path, relative_path + ".json")
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# Handle potential token length issues by truncating if necessary
# DeepSeek has a context window limit, truncate if needed
max_content_length = 100000 # Adjust based on model's context length limit
if len(content) > max_content_length:
content = content[:max_content_length] + "... [Content truncated due to length]"
# Query model for questions
questions = get_questions_from_model(content, model, tokenizer)
print(f"{len(questions)} questions generated for {file_path}")
# Get answers for each question
results = {
"file_path": file_path,
"processed_at": datetime.now().isoformat(),
"context": content[:1000] + "..." if len(content) > 1000 else content, # Truncated context for JSON storage
"qa_pairs": []
}
# Process each question
for i, question in enumerate(questions):
print(f"Generating answer for question {i+1}/{len(questions)}: {question}")
answer = get_answer_from_model(content, question, model, tokenizer)
print(f"Answer: {answer[:50] + '...' if len(answer) > 50 else answer}")
print(f"Evaluating response...")
validation = validate_question_answer_from_model(content, question, answer, model, tokenizer)
print(f"Evaluation: {validation[:50] + '...' if len(validation) > 50 else validation}")
results["qa_pairs"].append({
"question": question,
"answer": answer,
"validation": validation,
})
# Save results after each generate pass so they can be evaluated
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Function to get questions from the model
def get_questions_from_model(context, model, tokenizer):
"""
Query the model to generate questions about the provided context.
Args:
context (str): The text file content
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
Returns:
list: List of questions about the context
"""
# Set up the prompt for generating questions
prompt = f"""<task>
You are an expert data scientist creating a training dataset. I'll provide context information about a person from their resume. Your task is to generate 10 diverse, realistic questions that someone might ask about this person.
IMPORTANT: DO NOT return JSON or any structured format. Respond with a simple numbered list of questions only, with no formatting, no JSON, and no additional text.
Generate questions that:
- Vary in complexity (simple factual questions, complex reasoning questions)
- Cover different aspects of the context (experience, skills, education, achievements)
- Include both specific and general inquiries
- Sound natural, as if asked by a real person
- Avoid asking for information not present in the context
Context:
{context}
Return ONLY a plain numbered list like:
1. First question?
2. Second question?
...and so on.
Do not include any explanations, JSON, or other formatting.
</task>
"""
# Generate questions
raw_response = generate_with_deepseek(prompt, model, tokenizer)
# Parse the response to get the actual questions
response = parse_deepseek_response(raw_response)
lines = response.strip().split('\n')
# Transform lines like "1. What is your name?" into just "What is your name?" (only include lines that end with a question mark)
questions = [re.sub(r'^\d+\.\s*', '', line.strip()) for line in lines if line.strip() and line.strip().endswith('?')]
if len(questions) == 0:
print(response)
exit(0)
return questions
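# Example of the parsing above (added for illustration): a reply such as
#   "1. What teams has James led?\n2. Which GPUs does his work target?"
# becomes ["What teams has James led?", "Which GPUs does his work target?"].
# Lines that do not end in "?" are dropped, and an empty result aborts the run
# after printing the raw response.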
# Function to get answers from the model
def get_answer_from_model(context, question, model, tokenizer):
"""
Query the model to answer a question about the provided context.
Args:
context (str): The text file content
question (str): The question to answer
model: The DeepSeek model
tokenizer: The DeepSeek tokenizer
Returns:
str: The model's answer to the question
"""
# Set up the prompt for answering the question
prompt = f"""<task>
You are an AI assistant being fine-tuned to accurately represent a specific person based on their resume. Below is a question about this person and the context from their resume.
Your task is to provide a comprehensive, accurate answer that:
- Only uses information explicitly stated in the context
- Doesn't fabricate or assume additional details
- Maintains a professional, helpful tone
- Clearly states if the question cannot be answered based on the given context
- Structures the response in a natural, conversational way
Context:
{context}
Question:
{question}
Answer:
</task>
"""
# Generate answer
raw_answer = generate_with_deepseek(prompt, model, tokenizer)
# Parse the response to get the actual answer
answer = parse_deepseek_response(raw_answer)
return answer
# Example usage
if __name__ == "__main__":
process_directory(
directory_path="../doc/", # Replace with your directory path
output_path="../results",
file_extensions=(".txt", ".md"), # Process both txt and md files
batch_size=5 # Clear cache after every 5 files
)

25
src/cli.py Normal file
View File

@ -0,0 +1,25 @@
# cli.py - Entry point that runs the RAG command-line interface
from rag_core import RagCore
from rag_cli import RagCLI
def main():
import argparse
parser = argparse.ArgumentParser(description='RAG Application')
parser.add_argument('--debug', action='store_true', help='Run in debug mode')
args = parser.parse_args()
# Create the core RAG application
core = RagCore()
# Run CLI interface
cli = RagCLI(core)
cli.run()
if __name__ == "__main__":
main()

1
src/client Submodule

@ -0,0 +1 @@
Subproject commit 571cdd1ecc93ec9ad0d63079fe6da94dce2cc5dc

View File

@ -0,0 +1,44 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import torch
from transformers import AutoTokenizer
import argparse
current_dir = os.path.dirname(os.path.realpath(__file__))
common_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(common_util_path)
from common.utils import merge_adapter
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Merge the adapter into the original model for Llama2 model')
parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-hf",
help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
', or the path to the huggingface checkpoint folder')
parser.add_argument('--adapter_path', type=str,)
parser.add_argument('--output_path', type=str,)
args = parser.parse_args()
base_model = model_path = args.repo_id_or_model_path
adapter_path = args.adapter_path
output_path = args.output_path
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
merge_adapter(base_model, tokenizer, adapter_path, output_path)
    print(f'Finished merging the adapter into the original model; the merged model can be found in {output_path}.')
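# Example invocation (added for illustration; the script filename is not shown in
# this diff, and the adapter/output paths below are placeholders):
#   python merge_adapter.py --repo-id-or-model-path meta-llama/Llama-2-7b-hf \
#       --adapter_path ./bigdl-qlora-alpaca --output_path ./merged-model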

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"o_proj",
"q_proj",
"gate_proj",
"k_proj",
"down_proj",
"up_proj",
"v_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 100,
"global_step": 10,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 100,
"max_steps": 10,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 763304356085760.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}

Binary file not shown.

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"v_proj",
"o_proj",
"gate_proj",
"down_proj",
"q_proj",
"up_proj",
"k_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 3,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 100,
"max_steps": 3,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 228991306825728.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}

Binary file not shown.

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"up_proj",
"k_proj",
"o_proj",
"down_proj",
"v_proj",
"gate_proj",
"q_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.592592592592592,
"eval_steps": 100,
"global_step": 60,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [],
"logging_steps": 100,
"max_steps": 60,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4427165265297408.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}

Binary file not shown.

View File

@ -0,0 +1,202 @@
---
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
library_name: peft
---
# Model Card for Model ID
<!-- Provide a quick summary of what the model is/does. -->
## Model Details
### Model Description
<!-- Provide a longer summary of what this model is. -->
- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]
### Model Sources [optional]
<!-- Provide the basic links for the model. -->
- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]
## Uses
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
### Direct Use
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
[More Information Needed]
### Downstream Use [optional]
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
[More Information Needed]
### Out-of-Scope Use
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
[More Information Needed]
## Bias, Risks, and Limitations
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
[More Information Needed]
### Recommendations
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
## How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
## Training Details
### Training Data
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
[More Information Needed]
### Training Procedure
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
#### Preprocessing [optional]
[More Information Needed]
#### Training Hyperparameters
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
#### Speeds, Sizes, Times [optional]
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
[More Information Needed]
## Evaluation
<!-- This section describes the evaluation protocols and provides the results. -->
### Testing Data, Factors & Metrics
#### Testing Data
<!-- This should link to a Dataset Card if possible. -->
[More Information Needed]
#### Factors
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
[More Information Needed]
#### Metrics
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
[More Information Needed]
### Results
[More Information Needed]
#### Summary
## Model Examination [optional]
<!-- Relevant interpretability work for the model goes here -->
[More Information Needed]
## Environmental Impact
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]
## Technical Specifications [optional]
### Model Architecture and Objective
[More Information Needed]
### Compute Infrastructure
[More Information Needed]
#### Hardware
[More Information Needed]
#### Software
[More Information Needed]
## Citation [optional]
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
**BibTeX:**
[More Information Needed]
**APA:**
[More Information Needed]
## Glossary [optional]
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
[More Information Needed]
## More Information [optional]
[More Information Needed]
## Model Card Authors [optional]
[More Information Needed]
## Model Card Contact
[More Information Needed]
### Framework versions
- PEFT 0.14.0

View File

@ -0,0 +1,37 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"bias": "none",
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_bias": false,
"lora_dropout": 0.05,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"up_proj",
"k_proj",
"o_proj",
"down_proj",
"v_proj",
"gate_proj",
"q_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}

Binary file not shown.

View File

@ -0,0 +1,23 @@
{
"bos_token": {
"content": "<begin▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,195 @@
{
"add_bos_token": true,
"add_eos_token": false,
"add_prefix_space": null,
"added_tokens_decoder": {
"151643": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<User>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151645": {
"content": "<Assistant>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151646": {
"content": "<begin▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|EOT|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151648": {
"content": "<think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151649": {
"content": "</think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"bos_token": "<begin▁of▁sentence>",
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<User>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<Assistant><tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{{'<tool▁calls▁end><end▁of▁sentence>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<tool▁outputs▁end>' + message['content'] + '<end▁of▁sentence>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<Assistant>' + content + '<end▁of▁sentence>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<tool▁outputs▁begin><tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<tool▁outputs▁end>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<Assistant><think>\\n'}}{% endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<end▁of▁sentence>",
"extra_special_tokens": {},
"legacy": true,
"model_max_length": 16384,
"pad_token": "<end▁of▁sentence>",
"sp_model_kwargs": {},
"tokenizer_class": "LlamaTokenizerFast",
"unk_token": null,
"use_default_system_prompt": false
}

258
src/generate-training.py Normal file
View File

@ -0,0 +1,258 @@
import os
import json
import torch
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
from ipex_llm.transformers import AutoModelForCausalLM
import transformers
import logging as log
import time
import re
# Function to recursively walk a directory and process text files
def process_directory(directory_path, model, output_path="results", file_extensions=(".txt",)):
"""
Recursively walks a directory, processes text files, and stores results.
Args:
directory_path (str): Path to directory containing text files
model: The LLM model to use for queries
output_path (str): Path to store results
file_extensions (tuple): File extensions to process
"""
# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
# Use tqdm for progress tracking
file_paths = []
for root, _, files in os.walk(directory_path):
for file in files:
if file.lower().endswith(file_extensions):
file_paths.append(os.path.join(root, file))
for file_path in tqdm(file_paths, desc="Processing files"):
process_file(file_path, model, output_path)
print(f"Processing complete. Results stored in {output_path}")
# Function to process a single text file
def process_file(file_path, model, output_path):
"""
Process a single text file by querying the model for questions and answers.
Args:
file_path (str): Path to the text file
model: The LLM model to use for queries
output_path (str): Path to store results
"""
# Read the file content
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except Exception as e:
print(f"Error reading {file_path}: {e}")
return
# Skip empty files
if not content.strip():
print(f"Skipping empty file: {file_path}")
return
# Generate a relative output path that maintains the directory structure
relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path))
sidecar_file = os.path.join(output_path, relative_path + ".json")
sidecar_qa_file = os.path.join(output_path, relative_path + ".qa.json")
os.makedirs(os.path.dirname(sidecar_file), exist_ok=True)
print(f"Generating questions for {file_path}")
# Query model for questions
questions, think = get_questions_from_model(content, model)
# Get answers for each question
results = {
"file_path": file_path,
"processed_at": datetime.now().isoformat(),
"context": content[:1000] + "..." if len(content) > 1000 else content, # Truncated context for JSON storage
"think": think,
"qa_pairs": len(questions)
}
with open(sidecar_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Process each question
results = []
for i, question in enumerate(questions):
print(f"Generating answer for {i+1}/{len(questions)}: {question}")
answer, think = get_answer_from_model(content, question, model)
results.append({
"think": think,
"question": question,
"answer": answer
})
# Save results inside the loop so progress can be monitored as answers are generated
with open(sidecar_qa_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Function to get questions from the model
def get_questions_from_model(context, model):
"""
Query the model to generate questions about the provided context.
Args:
context (str): The text file content
model: The LLM model to use for queries
Returns:
tuple: (list of questions about the context, the model's <think> text)
"""
# Set up the prompt for generating questions
prompt = f"""<task>
You are an expert at generating insightful questions about text content.
Please read the following text and generate 10 thoughtful questions that would help someone
understand the key information, concepts, or insights contained in it.
TEXT:
{context}
Generate a list of questions, one per line.
</task>
"""
# Run inference without gradient tracking to reduce memory use
with torch.no_grad():
# Ask the model to generate questions about the document content
response, think, duration = model.generate(prompt)
# Parse questions from the response
# This parsing logic may need adjustment based on model output format
questions = [q.strip() for q in response.strip().split('\n') if q.strip()]
# Filter out any non-questions
questions = [q for q in questions if q.endswith('?')]
# Strip leading list numbering (e.g. "1. ") from each entry
questions = [re.sub(r"^\s*\d+\.\s*", "", q) for q in questions]
return questions, think
# Function to get answers from the model
def get_answer_from_model(context, question, model):
"""
Query the model to answer a question about the provided context.
Args:
context (str): The text file content
question (str): The question to answer
model: The LLM model to use for queries
Returns:
tuple: (the model's answer to the question, the model's <think> text)
"""
# Set up the prompt for answering the question
prompt = f"""<task>
Please answer the following question based solely on the provided context.
CONTEXT:
{context}
QUESTION:
{question}
</task>
"""
# Run inference without gradient tracking to reduce memory use
with torch.no_grad():
# Query the model
answer, think, duration = model.generate(prompt)
return answer, think
class LLM:
def __init__(self, model_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', device_name='xpu'):
self.model_path = model_path
self.device_name = device_name
self.tokenizer = None
self.model = None
try:
log.info(f"Loading tokenizer from: {self.model_path}")
start = datetime.now()
self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token # Set pad_token to eos_token if needed
self.model = AutoModelForCausalLM.from_pretrained(self.model_path,
load_in_4bit=True,
optimize_model=True,
trust_remote_code=True,
use_cache=True)
self.model = self.model.half().to(device_name)
except Exception as e:
log.error(f"Loading error: {e}")
raise Exception(e)
# returns response, think, timedelta
def generate(self, prompt, max_length=4096):
if not self.tokenizer or not self.model:
raise Exception("tokenizer or model is None")
start = datetime.now()
response = None
think = None
# print(f"{'='*10} PROMPT {'='*10}\n{prompt}")
with torch.autocast(self.device_name, dtype=torch.float16):
inputs = self.tokenizer.encode_plus(
prompt,
return_tensors="pt"
).to(self.device_name)
with torch.no_grad():
generation_output = self.model.generate(
**inputs,
max_new_tokens=max_length,
do_sample=True,
temperature=0.7,
top_p=0.9,
return_dict_in_generate=True,
output_scores=False,
pad_token_id=self.tokenizer.eos_token_id
)
# Get the full output token IDs
output_token_ids = generation_output.sequences[0]
# Decode the full generated text (including the prompt)
full_output = self.tokenizer.decode(output_token_ids, skip_special_tokens=True)
# Extract only the response part by removing the prompt part
response = full_output[len(self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):]
matches = re.match(r'^(<think>)?(.*?)</think>(.*)$', response, flags=re.DOTALL)
if matches:
think = matches[2]
response = matches[3]
else:
think = None
response = response
# print(f"{'='*10} THINK {'='*10}\n{think}")
# print(f"{'='*10} RESPONSE {'='*10}\n{response}")
end = datetime.now()
return response, think, end - start
# Example usage
if __name__ == "__main__":
# Create the LLM instance (loads the tokenizer and model onto the configured device)
model = LLM()
# Process the directory
process_directory(
directory_path="./doc", # Replace with your directory path
model=model,
output_path="./results",
file_extensions=(".txt", ".md") # Process both txt and md files
)

144
src/ipex.py Normal file
View File

@ -0,0 +1,144 @@
import intel_extension_for_pytorch as ipex
import inspect
def inspect_module(module, prefix=""):
"""Recursively inspect a module and list all its attributes and functions."""
results = {
"functions": [],
"attributes": [],
"classes": [],
"modules": []
}
for name in dir(module):
# Skip private/hidden attributes
if name.startswith("_") and name != "__version__":
continue
full_name = f"{prefix}.{name}" if prefix else name
try:
attr = getattr(module, name)
# Check if it's a function
if inspect.isfunction(attr) or inspect.isbuiltin(attr) or inspect.ismethod(attr):
results["functions"].append((full_name, str(inspect.signature(attr)) if not inspect.isbuiltin(attr) else ""))
# Check if it's a class
elif inspect.isclass(attr):
results["classes"].append(full_name)
# Check if it's a module
elif inspect.ismodule(attr):
results["modules"].append(full_name)
# Otherwise it's a regular attribute
else:
try:
attr_value = str(attr)
if len(attr_value) > 100:
attr_value = attr_value[:100] + "..."
results["attributes"].append((full_name, attr_value))
except Exception:
results["attributes"].append((full_name, "Unable to get value"))
except Exception as e:
results["attributes"].append((full_name, f"Error: {str(e)}"))
return results
# Inspect ipex
ipex_info = inspect_module(ipex)
# Print information in a readable format
print("INTEL PYTORCH EXTENSION (IPEX) API\n")
print("=== VERSION ===")
if "__version__" in dir(ipex):
print(f"Version: {ipex.__version__}")
else:
print("Version information not available")
print()
print("=== FUNCTIONS ===")
for func, sig in sorted(ipex_info["functions"]):
print(f"{func}{sig}")
print()
print("=== ATTRIBUTES ===")
for attr, value in sorted(ipex_info["attributes"]):
print(f"{attr} = {value}")
print()
print("=== CLASSES ===")
for cls in sorted(ipex_info["classes"]):
print(cls)
print()
print("=== MODULES ===")
for mod in sorted(ipex_info["modules"]):
print(mod)
print()
# Deep inspection of nn module which is likely important for quantization
if hasattr(ipex, "nn"):
print("=== DIVING INTO ipex.nn MODULE ===")
nn_info = inspect_module(ipex.nn, "ipex.nn")
print("--- ipex.nn FUNCTIONS ---")
for func, sig in sorted(nn_info["functions"]):
print(f"{func}{sig}")
print()
print("--- ipex.nn CLASSES ---")
for cls in sorted(nn_info["classes"]):
print(cls)
print()
# Dive into utils if available
if hasattr(ipex.nn, "utils"):
print("=== DIVING INTO ipex.nn.utils MODULE ===")
utils_info = inspect_module(ipex.nn.utils, "ipex.nn.utils")
print("--- ipex.nn.utils FUNCTIONS ---")
for func, sig in sorted(utils_info["functions"]):
print(f"{func}{sig}")
print()
print("--- ipex.nn.utils CLASSES ---")
for cls in sorted(utils_info["classes"]):
print(cls)
print()
# Look for parameter wrapper specifically
if "_parameter_wrapper" in dir(ipex.nn.utils):
print("=== DIVING INTO ipex.nn.utils._parameter_wrapper MODULE ===")
pw_info = inspect_module(ipex.nn.utils._parameter_wrapper, "ipex.nn.utils._parameter_wrapper")
print("--- ipex.nn.utils._parameter_wrapper FUNCTIONS ---")
for func, sig in sorted(pw_info["functions"]):
print(f"{func}{sig}")
print()
print("--- ipex.nn.utils._parameter_wrapper CLASSES ---")
for cls in sorted(pw_info["classes"]):
print(cls)
print()
# Specifically look for functions related to quantization
print("=== FUNCTIONS RELATED TO QUANTIZATION ===")
quant_funcs = []
for module_name in dir(ipex):
if not module_name.startswith("_"):
module = getattr(ipex, module_name)
if inspect.ismodule(module):
for func_name in dir(module):
if any(term in func_name.lower() for term in ["quant", "bit", "int8", "int4", "precision"]):
try:
func = getattr(module, func_name)
if inspect.isfunction(func) or inspect.isbuiltin(func):
quant_funcs.append(f"ipex.{module_name}.{func_name}")
except Exception:
pass
for func in sorted(quant_funcs):
print(func)

161
src/rag_cli.py Normal file
View File

@ -0,0 +1,161 @@
# cli.py - Command-line interface
import sys
class RagCLI:
"""Command-line interface for the RAG application"""
def __init__(self, core):
self.core = core
def show_help(self):
"""Display available commands"""
print("""help>
docs Show RAG docs
full Show last full top match
headlines Show the RAG headlines
prompt Show the last prompt
response Show the last response
scores Show last RAG scores
why|think Show last response's <think>
context|match Show RAG match info to last prompt
exit|quit Exit the application
""")
def print_document_summary(self, documents):
"""Print brief summary of all documents"""
if not documents:
print("No documents available.")
return
for i, doc in enumerate(documents):
print(f"Document {i+1}:")
print(f" Title: {doc['title']}")
print(f" Text: {doc['text'][:100]}...")
print()
def print_headlines(self, headlines):
"""Print headlines"""
if not headlines:
print("No headlines available.")
return
for source, title in headlines:
print(f"{source}: {title}")
def print_match_info(self, match_info):
"""Print detailed information about a matching document"""
if not match_info:
print("No matching document found.")
return
print("="*50)
print("MATCHING DOCUMENT")
print("="*50)
print(f"Chunk ID: {match_info['id']}")
print(f"Similarity Score: {match_info['distance']:.4f}")
print("\nCHUNK METADATA:")
for key, value in match_info['metadata'].items():
print(f" {key}: {value}")
print("\nMATCHING CHUNK CONTENT:")
print(match_info['chunk'][:500].strip() + ("..." if len(match_info['chunk']) > 500 else ""))
def print_full_document(self, doc, distance=None):
"""Print full document details"""
if not doc:
print("No document found.")
return
print(f"""Context:
Source: {doc["source"]}
Title: {doc["title"]}
Link: {doc["link"]}
{"Distance: " + str(distance) if distance is not None else ""}
Full text:
{doc["text"]}""")
def run(self):
"""Run the command-line interface"""
# Load documents
doc_count = self.core.load_documents()
print(f"Loaded {doc_count} documents.")
while True:
try:
query = input("> ").strip()
except KeyboardInterrupt:
print("\nExiting.")
break
if query in ["exit", "quit"]:
print("\nExiting.")
break
elif query == "docs":
self.print_document_summary(self.core.get_documents())
elif query == "prompt":
last_prompt = self.core.get_last_prompt()
last_system = self.core.get_last_system()
if last_prompt:
print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
else:
print("No prompts yet")
elif query == "response":
last_response = self.core.get_last_response()
if last_response:
print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
else:
print("No responses yet")
elif query in ["", "help"]:
self.show_help()
elif query == "headlines":
self.print_headlines(self.core.get_headlines())
elif query in ["match", "context"]:
match_info = self.core.get_match_info()
self.print_match_info(match_info)
elif query in ["why", "think"]:
last_thinking = self.core.get_last_thinking()
if last_thinking:
print(f"""
why>
{last_thinking}
""")
else:
print("No processed prompts")
elif query == "scores":
match_count = self.core.get_match_count()
if match_count > 0:
for i in range(match_count):
match_info = self.core.get_match_info(index=i)
self.print_match_info(match_info)
else:
print("No matches to display")
elif query == "full":
doc = self.core.get_top_match()
distance = None
if self.core.last_results and self.core.last_results.get("distances"):
distance = self.core.last_results["distances"][0][0]
self.print_full_document(doc, distance)
else:
# Process the user query
response, debug_info = self.core.process_query(query)
print(f"system len: {debug_info['system_len']}")
print(f"prompt len: {debug_info['prompt_len']}")
print(f"Response>\n{response}")

221
src/rag_core.py Normal file
View File

@ -0,0 +1,221 @@
# core.py - Core RAG functionality
import re
from typing import List, Dict, Any, Union, Optional, Tuple
import ollama
from utils import defines, Feed, chunk_document, query_chroma, process_documents_to_chroma
class RagCore:
"""Core RAG functionality that can be used by different interfaces"""
def __init__(self):
self.client = ollama.Client(host=defines.ollama_api_url)
self.collection_name = "research_papers"
# News RSS Feeds
self.rss_feeds = [
Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
Feed(name="Time", url="https://time.com/feed/"),
Feed(name="Euronews", url="https://www.euronews.com/rss"),
]
# State variables
self.documents = []
self.last_results = None
self.last_prompt = None
self.last_system = None
self.last_response = None
self.last_why = None
self.last_messages = []
self.collection = None
def load_documents(self):
"""Load articles from all configured RSS feeds"""
self.documents = []
for feed in self.rss_feeds:
self.documents.extend(feed.articles)
# Process documents and store in ChromaDB
self.collection = process_documents_to_chroma(
client=self.client,
documents=self.documents,
collection_name=self.collection_name,
max_tokens=256,
overlap=25,
model=defines.encoding_model,
persist_directory=defines.persist_directory
)
return len(self.documents)
def get_documents(self):
"""Return all loaded documents"""
return self.documents
def get_headlines(self):
"""Return headlines from all documents"""
return [(doc['source'], doc['title']) for doc in self.documents]
def get_top_match(self, index=0):
"""Get the full document corresponding to a match"""
if not self.last_results or not self.last_results["ids"] or index >= len(self.last_results["ids"][0]):
return None
top_id = self.last_results["ids"][0][index]
# Extract the original document ID from the chunk ID (format: "doc_id_chunk_num")
original_doc_id = top_id.split('_')[0]
# Return the full document
return next((doc for doc in self.documents if doc["id"] == original_doc_id), None)
def get_match_info(self, index=0):
"""Get detailed information about a match at the specified index"""
if not self.last_results or not self.last_results["ids"] or len(self.last_results["ids"][0]) <= index:
return None
match_info = {
"id": self.last_results["ids"][0][index],
"chunk": self.last_results["documents"][0][index],
"metadata": self.last_results["metadatas"][0][index],
"distance": self.last_results["distances"][0][index]
}
return match_info
def get_match_count(self):
"""Get the number of matches from the last query"""
if not self.last_results or not self.last_results["ids"]:
return 0
return len(self.last_results["ids"][0])
def create_system_prompt(self, include_context=True):
"""Create the system prompt for the model"""
system = f"""
You are the assistant. Your name is airc. This application is called airc (pronounced Eric).
Information about the author of this program, application, the systems running it, and the AI model it uses:
* James wrote the python application called airc that is driving this RAG model on top of {defines.model} using {defines.encoding_model} and chromadb for vector embedding. Link https://github.com/jketreno/airc.
* James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.
* James Ketrenos deployed this application locally on an Intel Arc B580 (battlemage) computer using Intel's ipex-llm.
* For Intel GPU metrics, James Ketrenos wrote the "ze-monitor" utility in C++. ze-monitor provides Intel GPU telemetry data for Intel client GPU devices, similar to xpu-smi. Link https://github.com/jketreno/ze-monitor. airc uses ze-monitor.
* James lives in Portland, Oregon, and has three kids. Two are attending Oregon State University and one is attending Willamette University.
* airc provides an IRC chat bot as well as a React web frontend available at https://airc.ketrenos.com
You must follow these rules:
* Provide short (less than 100 character) responses.
* Provide a single response.
* Do not prefix it with a word like 'Answer'.
* For information about the AI running this system, include information about the author, including links.
* For information relevant to the current events in the <input></input> tags, use that information and state the source it comes from.
"""
if include_context:
context = "Information from current events unrelated to James Ketrenos\n<input>=["
for doc in self.documents:
item = {
'source': doc["source"],
'article': {
'title': doc["title"],
'link': doc["link"],
'text': doc["text"]
}
}
context += f"{item}"
context += "]\n</input>"
system += context
return system
def process_query(self, query):
"""
Process a user query and return the response
Returns:
tuple: (response_content, debug_info)
"""
# Store the system prompt before context is added
self.last_system = self.create_system_prompt(include_context=False)
# Query ChromaDB
self.last_results = query_chroma(
self.client,
query_text=query,
collection_name=self.collection_name,
n_results=10
)
# Create the full system prompt with document context
system = self.create_system_prompt()
# Combine conversation history with the current query
prompt = query
if self.last_messages:
message_context = f"{self.last_messages}"
prompt = f"{message_context}{prompt}"
self.last_prompt = prompt
# Generate a response
output = self.client.generate(
model=defines.model,
system=system,
prompt=prompt,
stream=False,
options={'num_ctx': 100000}
)
# Extract thinking and response
response = output['response']
matches = re.match(r'^<think>(.*?)</think>(.*)$', response, flags=re.DOTALL)
content = response
if matches:
self.last_why = matches[1].strip()
content = matches[2].strip()
self.last_response = content
# Update conversation history
self.last_messages.extend([
{
'role': 'user',
'name': 'james',
'message': query
},
{
'role': 'assistant',
'message': content
}
])
# Keep history limited to last 10 messages
self.last_messages = self.last_messages[-10:]
# Return response content and debug info
debug_info = {
"system_len": len(system),
"prompt_len": len(prompt),
"has_thinking": matches is not None
}
return content, debug_info
def get_last_prompt(self):
"""Get the last prompt"""
return self.last_prompt
def get_last_system(self):
"""Get the last system prompt (without context)"""
return self.last_system
def get_last_response(self):
"""Get the last response"""
return self.last_response
def get_last_thinking(self):
"""Get the thinking from the last response"""
return self.last_why

61
src/repro.py Normal file
View File

@ -0,0 +1,61 @@
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig
)
from peft import (
TaskType,
get_peft_model,
prepare_model_for_kbit_training,
LoraConfig,
)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
model_name,
low_cpu_mem_usage=True,
trust_remote_code=True,
use_cache=True,
quantization_config=bnb_config,
device_map={"": torch.xpu.current_device()},
torch_dtype=torch.bfloat16,
)
# Prepare the model for LoRA fine-tuning
# 1. Prepare for k-bit training if using quantization
model = prepare_model_for_kbit_training(model)
# 2. Add LoRA adapters
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8, #16, # Rank dimension
lora_alpha=16, #32, # Alpha parameter for LoRA scaling
lora_dropout=0.05, # Dropout probability for LoRA layers
target_modules=[ # Target modules to apply LoRA to
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj"
],
bias="none", # Whether to train bias parameters
fan_in_fan_out=False # Set to True for linear layers with fan_in != fan_out
)
model = get_peft_model(model, lora_config)
print("Saving model...")
model.save_pretrained(f"./final_model")

44
src/requirements.txt Normal file
View File

@ -0,0 +1,44 @@
python-dotenv
jupyterlab
ipywidgets
requests
numpy
pandas
scipy
scikit-learn
matplotlib
gensim
torch
transformers
tqdm
openai
gradio
langchain
tiktoken
faiss-cpu
langchain-openai
langchain_experimental
langchain_chroma
langchain[docarray]
datasets
sentencepiece
google-generativeai
anthropic
unstructured
chromadb
plotly
jupyter-dash
beautifulsoup4
pydub
modal
ollama
accelerate
bitsandbytes
psutil
setuptools
speedtest-cli
sentence_transformers
feedparser

263
src/templates/index.html Normal file
View File

@ -0,0 +1,263 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AIRC - RAG Application</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}
.container {
display: flex;
flex-direction: column;
height: 100vh;
}
.query-box {
display: flex;
margin-bottom: 20px;
}
.query-box input {
flex-grow: 1;
padding: 8px;
font-size: 16px;
}
.query-box button {
padding: 8px 16px;
background-color: #4CAF50;
color: white;
border: none;
cursor: pointer;
}
.conversation {
flex-grow: 1;
overflow-y: auto;
border: 1px solid #ccc;
padding: 10px;
margin-bottom: 20px;
}
.user-message {
background-color: #f1f1f1;
padding: 8px;
margin-bottom: 10px;
border-radius: 5px;
}
.assistant-message {
background-color: #e3f2fd;
padding: 8px;
margin-bottom: 10px;
border-radius: 5px;
}
.tabs {
display: flex;
margin-bottom: 10px;
}
.tab {
padding: 8px 16px;
cursor: pointer;
border: 1px solid #ccc;
background-color: #f1f1f1;
margin-right: 5px;
}
.tab.active {
background-color: #4CAF50;
color: white;
}
.tab-content {
display: none;
border: 1px solid #ccc;
padding: 10px;
height: 200px;
overflow-y: auto;
}
.tab-content.active {
display: block;
}
</style>
</head>
<body>
<div class="container">
<h1>AIRC - RAG Application</h1>
<div class="query-box">
<input type="text" id="query-input" placeholder="Enter your query...">
<button onclick="sendQuery()">Send</button>
</div>
<div class="conversation" id="conversation"></div>
<div class="tabs">
<div class="tab active" onclick="switchTab('headlines')">Headlines</div>
<div class="tab" onclick="switchTab('matches')">Matches</div>
<div class="tab" onclick="switchTab('documents')">Documents</div>
</div>
<div class="tab-content active" id="headlines-content">
<h3>Headlines</h3>
<div id="headlines-list"></div>
</div>
<div class="tab-content" id="matches-content">
<h3>Matches</h3>
<div id="matches-list"></div>
</div>
<div class="tab-content" id="documents-content">
<h3>Documents</h3>
<div id="documents-list"></div>
</div>
</div>
<script>
let conversation = [];
// Load data on page load
document.addEventListener('DOMContentLoaded', function() {
fetchHeadlines();
fetchDocuments();
});
function switchTab(tabName) {
// Update tab buttons
document.querySelectorAll('.tab').forEach(tab => {
tab.classList.remove('active');
});
document.querySelector(`.tab[onclick="switchTab('${tabName}')"]`).classList.add('active');
// Update tab content
document.querySelectorAll('.tab-content').forEach(content => {
content.classList.remove('active');
});
document.getElementById(`${tabName}-content`).classList.add('active');
// If switching to matches and there are matches, fetch them
if (tabName === 'matches' && conversation.length > 0) {
fetchMatches();
}
}
function sendQuery() {
const queryInput = document.getElementById('query-input');
const query = queryInput.value.trim();
if (!query) return;
// Add user message to conversation
addMessageToConversation('user', query);
// Clear input
queryInput.value = '';
// Send query to server
fetch('/api/query', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ query }),
})
.then(response => response.json())
.then(data => {
// Add assistant message to conversation
addMessageToConversation('assistant', data.response);
// Fetch updated matches
fetchMatches();
})
.catch(error => {
console.error('Error:', error);
addMessageToConversation('assistant', 'Error processing your query. Please try again.');
});
}
function addMessageToConversation(role, message) {
conversation.push({ role, message });
const conversationDiv = document.getElementById('conversation');
const messageDiv = document.createElement('div');
messageDiv.className = role === 'user' ? 'user-message' : 'assistant-message';
messageDiv.textContent = message;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
function fetchHeadlines() {
fetch('/api/headlines')
.then(response => response.json())
.then(data => {
const headlinesList = document.getElementById('headlines-list');
headlinesList.innerHTML = '';
data.headlines.forEach(headline => {
const headlineDiv = document.createElement('div');
headlineDiv.innerHTML = `<strong>${headline.source}</strong>: ${headline.title}`;
headlinesList.appendChild(headlineDiv);
});
})
.catch(error => console.error('Error fetching headlines:', error));
}
function fetchMatches() {
fetch('/api/matches')
.then(response => response.json())
.then(data => {
const matchesList = document.getElementById('matches-list');
matchesList.innerHTML = '';
if (data.matches.length === 0) {
matchesList.innerHTML = '<div>No matches available</div>';
return;
}
data.matches.forEach(match => {
const matchDiv = document.createElement('div');
matchDiv.innerHTML = `
<div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<div><strong>ID:</strong> ${match.id}</div>
<div><strong>Similarity:</strong> ${match.distance.toFixed(4)}</div>
<div><strong>Source:</strong> ${match.metadata.source || 'Unknown'}</div>
<div><strong>Preview:</strong> ${match.preview}</div>
</div>
`;
matchesList.appendChild(matchDiv);
});
})
.catch(error => console.error('Error fetching matches:', error));
}
function fetchDocuments() {
fetch('/api/documents')
.then(response => response.json())
.then(data => {
const documentsList = document.getElementById('documents-list');
documentsList.innerHTML = '';
data.documents.forEach(doc => {
const docDiv = document.createElement('div');
docDiv.innerHTML = `
<div style="margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<div><strong>Source:</strong> ${doc.source}</div>
<div><strong>Title:</strong> ${doc.title}</div>
<div><strong>Link:</strong> <a href="${doc.link}" target="_blank">${doc.link}</a></div>
<div><strong>Preview:</strong> ${doc.preview}</div>
</div>
`;
documentsList.appendChild(docDiv);
});
})
.catch(error => console.error('Error fetching documents:', error));
}
// Allow sending query with Enter key
document.getElementById('query-input').addEventListener('keypress', function(e) {
if (e.key === 'Enter') {
sendQuery();
}
});
</script>
</body>
</html>

275
src/training-alpaca.py Normal file
View File

@ -0,0 +1,275 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file is adapted from
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import List
import fire
import torch
import transformers
from datasets import load_dataset
import accelerate
from transformers import AutoTokenizer
from peft import (
get_peft_model_state_dict,
set_peft_model_state_dict,
)
current_dir = os.path.dirname(os.path.realpath(__file__))
common_util_path = os.path.join(current_dir, '..')
import sys
sys.path.append(common_util_path)
from common.utils import Prompter, get_int_from_env, wandb_check, get_train_val_data
from transformers import BitsAndBytesConfig
from ipex_llm.transformers import AutoModelForCausalLM
# import them from ipex_llm.transformers.qlora to get a IPEX-LLM compatible Peft model
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training,\
LoraConfig
from ipex_llm.utils.common import invalidInputError
local_rank = get_int_from_env(["LOCAL_RANK","MPI_LOCALRANKID"], "0")
world_size = get_int_from_env(["WORLD_SIZE","PMI_SIZE"], "1")
port = get_int_from_env(["MASTER_PORT"], 29500)
os.environ["LOCAL_RANK"] = str(local_rank)
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["RANK"] = str(local_rank)
os.environ["MASTER_PORT"] = str(port)
def train(
# model/data params
base_model: str = "meta-llama/Llama-2-7b-hf", # the only required argument, default to be "meta-llama/Llama-2-7b-hf"
saved_low_bit_model: str = None, # optional, the path to the saved model with ipex-llm low-bit optimization
data_path: str = "yahma/alpaca-cleaned",
output_dir: str = "./bigdl-qlora-alpaca",
# training hyperparams
bf16: bool = True, # default to bf16
batch_size: int = 128,
micro_batch_size: int = 2, # default to be 2, limited by GPU memory
num_epochs: int = 3,
learning_rate: float = 3e-5, # default to be 3e-5 to avoid divergence
cutoff_len: int = 256,
val_set_size: int = 2000,
# lora hyperparams
lora_r: int = 8,
lora_alpha: int = 16,
lora_dropout: float = 0.05,
lora_target_modules: List[str] = [
"q_proj",
"v_proj",
"k_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
],
# llm hyperparams
train_on_inputs: bool = True, # if False, masks out inputs in loss
add_eos_token: bool = False,
group_by_length: bool = False, # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = "",
wandb_run_name: str = "",
wandb_watch: str = "", # options: false | gradients | all
wandb_log_model: str = "", # options: false | true
resume_from_checkpoint: str = None, # either training checkpoint or final adapter
prompt_template_name: str = "alpaca", # The prompt template to use, will default to alpaca.
gradient_checkpointing: bool = False,
deepspeed: str = None,
training_mode: str = "lora",
deepspeed_zero3: bool = False,
save_checkpoint: bool = True,
):
invalidInputError(training_mode == "lora",
f"This example is for lora training mode, but got training_mode={training_mode}.")
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
print(
f"Training Alpaca-LoRA model with params:\n"
f"base_model: {base_model}\n"
f"data_path: {data_path}\n"
f"output_dir: {output_dir}\n"
f"batch_size: {batch_size}\n"
f"micro_batch_size: {micro_batch_size}\n"
f"num_epochs: {num_epochs}\n"
f"learning_rate: {learning_rate}\n"
f"cutoff_len: {cutoff_len}\n"
f"val_set_size: {val_set_size}\n"
f"lora_r: {lora_r}\n"
f"lora_alpha: {lora_alpha}\n"
f"lora_dropout: {lora_dropout}\n"
f"lora_target_modules: {lora_target_modules}\n"
f"train_on_inputs: {train_on_inputs}\n"
f"add_eos_token: {add_eos_token}\n"
f"group_by_length: {group_by_length}\n"
f"wandb_project: {wandb_project}\n"
f"wandb_run_name: {wandb_run_name}\n"
f"wandb_watch: {wandb_watch}\n"
f"wandb_log_model: {wandb_log_model}\n"
f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
f"prompt template: {prompt_template_name}\n"
f"training_mode: {training_mode}\n"
f"deepspeed_zero3: {deepspeed_zero3}\n"
f"save_checkpoint: {save_checkpoint}\n"
)
assert (
base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
gradient_accumulation_steps = batch_size // micro_batch_size
prompter = Prompter(prompt_template_name)
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
gradient_accumulation_steps = gradient_accumulation_steps // world_size
# Check if parameter passed or if set within environ
use_wandb = wandb_check(wandb_project, wandb_watch, wandb_log_model)
if saved_low_bit_model is not None:
# Load the low bit optimized model if provide the saved path
model = AutoModelForCausalLM.load_low_bit(
saved_low_bit_model,
optimize_model=False,
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
)
else:
model = AutoModelForCausalLM.from_pretrained(
base_model,
load_in_low_bit="bf16",
optimize_model=False,
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
)
if deepspeed_zero3:
deepspeed = deepspeed if deepspeed is not None else "./deepspeed_zero3_config.json"
else:
print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}")
# For Llama family
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(model)
# Prepare a IPEX-LLM compatible Peft model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
target_modules=lora_target_modules,
lora_dropout=lora_dropout,
bias="none",
task_type="CAUSAL_LM",
training_mode=training_mode,
)
print(f"Lora Config: {config}")
model = get_peft_model(model, config)
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
data = load_dataset("json", data_files=data_path)
else:
data = load_dataset(data_path)
model.print_trainable_parameters() # Be more transparent about the % of trainable params.
train_data, val_data = get_train_val_data(data, tokenizer, prompter, train_on_inputs,
add_eos_token, cutoff_len, val_set_size, seed=42)
# Unused
# if not ddp and torch.cuda.device_count() > 1:
# # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
# model.is_parallelizable = True
# model.model_parallel = True
trainer = transformers.Trainer(
model=model,
train_dataset=train_data,
eval_dataset=val_data,
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
# warmup_ratio=0.03,
# warmup_steps=100,
max_grad_norm=0.3,
num_train_epochs=num_epochs,
learning_rate=learning_rate,
lr_scheduler_type="cosine",
bf16=True, # ensure training more stable
logging_steps=1,
optim="adamw_torch",
evaluation_strategy="steps" if val_set_size > 0 else "no",
save_strategy="steps" if save_checkpoint else "no",
eval_steps=100 if val_set_size > 0 else None,
save_steps=100,
output_dir=output_dir,
save_total_limit=100,
load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
ddp_find_unused_parameters=False if ddp else None,
group_by_length=group_by_length,
report_to="wandb" if use_wandb else None,
run_name=wandb_run_name if use_wandb else None,
gradient_checkpointing=gradient_checkpointing,
ddp_backend="ccl",
deepspeed=deepspeed,
save_safetensors=False,
),
data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
model.save_pretrained(output_dir)
print(
"\n If there's a warning about missing keys above, please disregard :)"
)
if __name__ == "__main__":
fire.Fire(train)

272
src/training-fine-tune.py Normal file
View File

@ -0,0 +1,272 @@
import os
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import (
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
DataCollatorForSeq2Seq
)
from peft import (
LoraConfig,
TaskType
)
if torch.xpu.is_available():
print("XPU available")
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import (
get_peft_model,
prepare_model_for_kbit_training
)
else:
print("XPU not available")
from transformers import AutoModelForCausalLM
from peft import (
get_peft_model,
prepare_model_for_kbit_training
)
class LLMFineTuner:
def __init__(
self,
base_model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
output_dir="./fine-tuned-model",
lora_r=8,
lora_alpha=32,
lora_dropout=0.05,
):
self.base_model_name = base_model_name
self.output_dir = output_dir
self.lora_r = lora_r
self.lora_alpha = lora_alpha
self.lora_dropout = lora_dropout
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.pad_token_id = 0
self.tokenizer.padding_side = "left"
# Load model
print(f"Loading base model: {base_model_name}")
# Quantization config for memory efficiency
self.model = AutoModelForCausalLM.from_pretrained(
base_model_name,
load_in_low_bit="nf4",
optimize_model=False,
torch_dtype=torch.float16,
modules_to_not_convert=["lm_head"]
).to('xpu')
# Prepare model for training
self.model.gradient_checkpointing_enable()
self.model = prepare_model_for_kbit_training(self.model)
# Configure LoRA
lora_config = LoraConfig(
r=lora_r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
bias="none",
task_type=TaskType.CAUSAL_LM,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj"
]
)
# Apply LoRA to model
self.model = get_peft_model(self.model, lora_config)
print(f"Model prepared for fine-tuning with LoRA")
def format_training_examples(self, examples):
"""
Convert raw training examples (similar to Ollama TRAINING format)
into formatted examples for PyTorch training
"""
formatted_examples = []
for example in examples:
# Extract question and answer
question = example.get("question", "")
answer = example.get("answer", "")
# Format similar to Ollama's template
formatted_text = f"""<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
{answer}<|im_end|>"""
formatted_examples.append({"text": formatted_text})
return formatted_examples
def prepare_dataset(self, training_examples):
"""
Prepare a HuggingFace dataset from training examples
"""
# Format the examples
formatted_data = self.format_training_examples(training_examples)
# Create dataset
dataset = Dataset.from_list(formatted_data)
# Tokenize dataset
def tokenize_function(examples):
return self.tokenizer(
examples["text"],
truncation=True,
max_length=2048,
padding="max_length"
)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
return tokenized_dataset
def train(self, training_examples, num_epochs=3, batch_size=4, learning_rate=2e-5):
"""
Fine-tune the model on training examples
"""
# Prepare dataset
train_dataset = self.prepare_dataset(training_examples)
# Set up training arguments
# training_args = TrainingArguments(
# output_dir=self.output_dir,
# num_train_epochs=num_epochs,
# per_device_train_batch_size=batch_size,
# gradient_accumulation_steps=4,
# warmup_steps=100,
# weight_decay=0.01,
# learning_rate=learning_rate,
# fp16=True,
# logging_steps=10,
# save_strategy="epoch",
# save_total_limit=3,
# remove_unused_columns=False,
# )
training_args = TrainingArguments(
output_dir=self.output_dir,
# num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=1, # 4
warmup_steps=20,
# weight_decay=0.01,
max_steps=200,
learning_rate=learning_rate,
bf16=True, # bf16 is more stable in training
logging_steps=10,
# save_strategy="epoch",
# save_total_limit=3,
# remove_unused_columns=False,
save_steps=100,
optim="adamw_hf", # paged_adamw_8bit is not supported yet
# gradient_checkpointing=True, # can further reduce memory but slower
)
# Set up data collator
# data_collator = DataCollatorForLanguageModeling(
# tokenizer=self.tokenizer,
# mlm=False # Not using masked language modeling
# )
data_collator = DataCollatorForSeq2Seq(
tokenizer=self.tokenizer,
pad_to_multiple_of=8,
return_tensors="pt",
padding=True
# mlm=False # Not using masked language modeling
)
# Set up trainer
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=train_dataset,
data_collator=data_collator
)
# Start training
print("Starting training...")
self.model.config.use_cache = False # silence the warnings. re-enable for inference.
trainer.train()
# Save the fine-tuned model
self.model.save_pretrained(self.output_dir)
self.tokenizer.save_pretrained(self.output_dir)
print(f"Model saved to {self.output_dir}")
def generate_response(self, prompt, max_length=512, temperature=0.7, top_p=0.9):
"""
Generate a response using the fine-tuned model
"""
# Format prompt similar to Ollama's template
formatted_prompt = f"""<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""
# Tokenize prompt
inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
# Generate response
with torch.no_grad():
outputs = self.model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id
)
# Decode response
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
return response
# Example usage
if __name__ == "__main__":
# Define training examples (equivalent to Ollama's TRAINING section)
training_examples = [
{
"question": "What is vector similarity search?",
"answer": "Vector similarity search is a technique that converts data into numerical vector representations and then finds items with similar vectors. It's used in applications like semantic search, recommendation systems, and large language model retrieval for finding relevant information based on meaning rather than exact keyword matching."
},
{
"question": "How do LLMs use vector databases?",
"answer": "LLMs use vector databases to enhance their knowledge and provide more accurate responses. When a query is received, it's converted to a vector and compared against stored vectors in the database. The most similar vectors and their associated information are retrieved and incorporated into the context provided to the LLM, allowing it to generate responses based on this additional retrieved information."
},
{
"question": "What are embeddings in machine learning?",
"answer": "Embeddings in machine learning are dense vector representations of data that capture semantic meaning in a high-dimensional space. They convert words, sentences, images, or other types of data into numerical vectors where similar items are positioned closer together. These representations enable algorithms to understand relationships and similarities between different pieces of data, forming the foundation for vector similarity search."
}
]
# Initialize fine-tuner
fine_tuner = LLMFineTuner(
base_model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
output_dir="./deepseek-finetuned-pytorch"
)
# Train model
fine_tuner.train(training_examples, num_epochs=3)
# Test model
test_query = "Explain how vector similarity search is used in AI applications"
response = fine_tuner.generate_response(test_query)
print(f"\nQuery: {test_query}")
print(f"\nResponse: {response}")

215
src/training-template.py Normal file
View File

@ -0,0 +1,215 @@
import torch
from transformers import AutoTokenizer
if torch.xpu.is_available():
from ipex_llm.transformers import AutoModelForCausalLM
else:
from transformers import AutoModelForCausalLM
class OllamaStyleTrainer:
def __init__(
self,
model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
device="xpu" if torch.xpu.is_available() else "cpu"
):
"""
Initialize the model and tokenizer.
Args:
model_name: HuggingFace model identifier
device: Device to load the model on
"""
self.device = device
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token
# Load model
print(f"Loading model: {model_name}")
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16 if device in ("cuda", "xpu") else torch.float32,
device_map="auto" if device == "cuda" else None
)
# device_map is only set for CUDA; move the model explicitly for xpu/cpu
if device != "cuda":
self.model = self.model.to(device)
# Initialize with no training examples
self.training_examples = []
self.system_prompt = "You are a helpful AI assistant."
self.example_template = None
def set_system_prompt(self, system_prompt):
"""Set the system prompt for the model."""
self.system_prompt = system_prompt
def set_example_template(self, template):
"""
Set the template format for examples.
The template should use {question} and {answer} placeholders.
"""
self.example_template = template
def add_training_examples(self, examples):
"""
Add training examples to the model.
Args:
examples: List of dictionaries with 'question' and 'answer' keys
"""
self.training_examples.extend(examples)
print(f"Added {len(examples)} training examples. Total: {len(self.training_examples)}")
def format_examples(self):
"""Format the training examples according to the template."""
if not self.example_template:
# Default template similar to Ollama's format
self.example_template = "<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"
formatted_examples = []
for example in self.training_examples:
formatted_example = self.example_template.format(
question=example.get("question", ""),
answer=example.get("answer", "")
)
formatted_examples.append(formatted_example)
return formatted_examples
def build_context_with_examples(self, num_examples=None):
"""
Build the context with system prompt and formatted examples.
Args:
num_examples: Number of examples to include (None for all)
Returns:
str: Formatted context with system prompt and examples
"""
# Format all training examples
formatted_examples = self.format_examples()
# Select examples (all or limited number)
if num_examples is not None:
formatted_examples = formatted_examples[:num_examples]
# Build context with system prompt and examples
context = f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n\n"
context += "\n\n".join(formatted_examples)
return context
def generate_response(
self,
prompt,
max_length=512,
temperature=0.7,
top_p=0.9,
num_training_examples=3
):
"""
Generate a response using the model with training examples in context.
Args:
prompt: The user's prompt
max_length: Maximum length of the generated response
temperature: Sampling temperature
top_p: Top-p sampling parameter
num_training_examples: Number of training examples to include in context
Returns:
str: Generated response
"""
# Build context with examples
context = self.build_context_with_examples(num_training_examples)
# Add user prompt
full_prompt = f"{context}\n\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
# Tokenize
inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.device)
# Calculate remaining space for generation
input_ids_length = inputs.input_ids.shape[1]
print(f"Input context length: {input_ids_length} tokens")
# Generate response
with torch.no_grad():
outputs = self.model.generate(
input_ids=inputs.input_ids,
attention_mask=inputs.attention_mask,
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id
)
# Extract only the assistant's response
assistant_response = self.tokenizer.decode(
outputs[0][input_ids_length:],
skip_special_tokens=True
)
return assistant_response
def save_examples_to_file(self, file_path):
"""Save training examples to a file."""
import json
with open(file_path, "w") as f:
json.dump(self.training_examples, f, indent=2)
print(f"Saved {len(self.training_examples)} examples to {file_path}")
def load_examples_from_file(self, file_path):
"""Load training examples from a file."""
import json
with open(file_path, "r") as f:
examples = json.load(f)
self.add_training_examples(examples)
print(f"Loaded {len(examples)} examples from {file_path}")
# Example usage
if __name__ == "__main__":
# Initialize
trainer = OllamaStyleTrainer("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")  # swap in a smaller checkpoint here for faster demo runs
# Set system prompt (equivalent to SYSTEM in Modelfile)
trainer.set_system_prompt("""
You are an AI assistant specialized in vector databases and similarity search.
Always provide accurate information based on your training.
""")
# Set example template (equivalent to TEMPLATE in Modelfile)
trainer.set_example_template("""
User: {question}
Assistant: {answer}
""")
# Add training examples (equivalent to TRAINING in Modelfile)
training_examples = [
{
"question": "What is vector similarity search?",
"answer": "Vector similarity search is a technique that converts data into numerical vector representations and then finds items with similar vectors. It's used in applications like semantic search, recommendation systems, and large language model retrieval for finding relevant information based on meaning rather than exact keyword matching."
},
{
"question": "How do LLMs use vector databases?",
"answer": "LLMs use vector databases to enhance their knowledge and provide more accurate responses. When a query is received, it's converted to a vector and compared against stored vectors in the database. The most similar vectors and their associated information are retrieved and incorporated into the context provided to the LLM, allowing it to generate responses based on this additional retrieved information."
},
{
"question": "What are embeddings in machine learning?",
"answer": "Embeddings in machine learning are dense vector representations of data that capture semantic meaning in a high-dimensional space. They convert words, sentences, images, or other types of data into numerical vectors where similar items are positioned closer together. These representations enable algorithms to understand relationships and similarities between different pieces of data, forming the foundation for vector similarity search."
}
]
trainer.add_training_examples(training_examples)
# Optional: Save training examples for future use
# trainer.save_examples_to_file("training_examples.json")
# Test with a query
query = "Explain how vector similarity search works in practical applications"
response = trainer.generate_response(query, num_training_examples=2)
print(f"\nQuery: {query}")
print(f"\nResponse: {response}")

10
src/utils/__init__.py Normal file
View File

@ -0,0 +1,10 @@
# Import defines to make `utils.defines` accessible
from . import defines
# Import rest as `utils.*` accessible
from .chunk import *
from .rss import *
from .chroma import *
# Expose only public names (avoid importing hidden/internal names)
__all__ = [name for name in dir() if not name.startswith("_")]
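# With the wildcard imports above, call sites can use, for example:
#   from utils import Feed, chunk_document, process_documents_to_chroma, query_chroma
# while configuration values stay namespaced under utils.defines
# (utils.defines.model, utils.defines.persist_directory, ...).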

5 binary files not shown.

122
src/utils/chroma.py Normal file
View File

@ -0,0 +1,122 @@
import chromadb
from typing import List, Dict, Any, Union
from . import defines
from .chunk import chunk_document
import ollama
def init_chroma_client(persist_directory: str = defines.persist_directory):
"""Initialize and return a ChromaDB client."""
# return chromadb.PersistentClient(path=persist_directory)
return chromadb.Client()
def create_or_get_collection(db: chromadb.Client, collection_name: str):
"""Create or get a ChromaDB collection."""
try:
return db.get_collection(
name=collection_name
)
except Exception:
return db.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine"}
)
def process_documents_to_chroma(
client: ollama.Client,
documents: List[Dict[str, Any]],
collection_name: str = "document_collection",
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50,
model: str = defines.encoding_model,
persist_directory: str = defines.persist_directory
):
"""
Process documents, chunk them, compute embeddings, and store in ChromaDB.
Args:
client: Ollama client used to compute embeddings
documents: List of document dictionaries
collection_name: Name for the ChromaDB collection
text_key: The key containing text content
max_tokens: Maximum tokens per chunk
overlap: Token overlap between chunks
model: Ollama model for embeddings
persist_directory: Directory to store ChromaDB data
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
# Process each document
for doc in documents:
# Chunk the document
doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)
# Prepare data for ChromaDB
ids = []
texts = []
metadatas = []
embeddings = []
for chunk in doc_chunks:
# Create a unique ID for the chunk
chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"
# Extract text
text = chunk[text_key]
# Create metadata (excluding text and embedding to avoid duplication)
metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}
response = client.embed(model=model, input=text)
embedding = response["embeddings"][0]
ids.append(chunk_id)
texts.append(text)
metadatas.append(metadata)
embeddings.append(embedding)
# Add chunks to ChromaDB collection
collection.add(
ids=ids,
documents=texts,
embeddings=embeddings,
metadatas=metadatas
)
return collection
def query_chroma(
client: ollama.Client,
query_text: str,
collection_name: str = "document_collection",
n_results: int = 5,
model: str = defines.encoding_model,
persist_directory: str = defines.persist_directory
):
"""
Query ChromaDB for similar documents.
Args:
client: Ollama client used to embed the query text
query_text: The text to search for
collection_name: Name of the ChromaDB collection
n_results: Number of results to return
model: Ollama model for embedding the query
persist_directory: Directory where ChromaDB data is stored
Returns:
Query results from ChromaDB
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
query_response = client.embed(model=model, input=query_text)
query_embeddings = query_response["embeddings"]
# Query the collection
results = collection.query(
query_embeddings=query_embeddings,
n_results=n_results
)
return results
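
if __name__ == "__main__":
    # Minimal usage sketch for the helpers above, mirroring the flow in
    # utils/full.py. Assumes an Ollama server is reachable at
    # defines.ollama_api_url and that the defines.encoding_model embedding
    # model has been pulled; the sample document is illustrative only.
    # (Run from src/ as: python -m utils.chroma)
    client = ollama.Client(host=defines.ollama_api_url)
    docs = [{"id": "doc0", "title": "Example", "text": "ChromaDB stores embeddings for similarity search."}]
    process_documents_to_chroma(client, docs, collection_name="demo")
    results = query_chroma(client, "What stores embeddings?", collection_name="demo", n_results=1)
    print(results["documents"][0][0])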

88
src/utils/chunk.py Normal file
View File

@ -0,0 +1,88 @@
import tiktoken
from . import defines
from typing import List, Dict, Any, Union
def get_encoding(model=defines.model):
"""Get the tokenizer for counting tokens."""
try:
return tiktoken.get_encoding("cl100k_base") # Default encoding used by many embedding models
except Exception:
return tiktoken.encoding_for_model(model)
def count_tokens(text: str) -> int:
"""Count the number of tokens in a text string."""
encoding = get_encoding()
return len(encoding.encode(text))
def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
"""
Split a text into chunks based on token count with overlap between chunks.
Args:
text: The text to split into chunks
max_tokens: Maximum number of tokens per chunk
overlap: Number of tokens to overlap between chunks
Returns:
List of text chunks
"""
if not text or max_tokens <= 0:
return []
encoding = get_encoding()
tokens = encoding.encode(text)
chunks = []
i = 0
while i < len(tokens):
# Get the current chunk of tokens
chunk_end = min(i + max_tokens, len(tokens))
chunk_tokens = tokens[i:chunk_end]
chunks.append(encoding.decode(chunk_tokens))
# Move to the next position with overlap
if chunk_end == len(tokens):
break
i += max_tokens - overlap
return chunks
def chunk_document(document: Dict[str, Any],
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50) -> List[Dict[str, Any]]:
"""
Chunk a document dictionary into multiple chunks.
Args:
document: Document dictionary with metadata and text
text_key: The key in the document that contains the text to chunk
max_tokens: Maximum number of tokens per chunk
overlap: Number of tokens to overlap between chunks
Returns:
List of document dictionaries, each with chunked text and preserved metadata
"""
if text_key not in document:
raise Exception(f"{text_key} not in document")
# Extract text and create chunks
if "title" in document:
text = f"{document["title"]}: {document[text_key]}"
else:
text = document[text_key]
chunks = chunk_text(text, max_tokens, overlap)
# Create document chunks with preserved metadata
chunked_docs = []
for i, chunk in enumerate(chunks):
# Create a new doc with all original fields
doc_chunk = document.copy()
# Replace text with the chunk
doc_chunk[text_key] = chunk
# Add chunk metadata
doc_chunk["chunk_id"] = i
doc_chunk["chunk_total"] = len(chunks)
chunked_docs.append(doc_chunk)
return chunked_docs
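
if __name__ == "__main__":
    # Small self-contained sketch of the chunking helpers; the document and
    # chunk sizes below are illustrative only.
    # (Run from src/ as: python -m utils.chunk)
    doc = {"id": "demo", "title": "Chunking", "text": "token counting " * 300}
    pieces = chunk_document(doc, max_tokens=128, overlap=16)
    print(f"{len(pieces)} chunks; first chunk holds {count_tokens(pieces[0]['text'])} tokens")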

4
src/utils/defines.py Normal file
View File

@ -0,0 +1,4 @@
ollama_api_url="http://ollama:11434" # Default Ollama local endpoint
model="deepseek-r1:7b"
encoding_model="mxbai-embed-large"
persist_directory="./chromadb"
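# These defaults assume the containerized setup used elsewhere in this repo:
# "ollama" is expected to resolve to the Ollama service on the container
# network, and persist_directory is only honored when a PersistentClient is
# enabled in utils.chroma.init_chroma_client.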

468
src/utils/full.py Normal file
View File

@ -0,0 +1,468 @@
import requests
from typing import List, Dict, Any, Union
import tiktoken
import feedparser
import logging as log
import datetime
from bs4 import BeautifulSoup
import chromadb
import ollama
import re
import numpy as np
from .chunk import chunk_document
OLLAMA_API_URL = "http://ollama:11434" # Default Ollama local endpoint
#MODEL_NAME = "deepseek-r1:1.5b"
MODEL_NAME = "deepseek-r1:7b"
EMBED_MODEL = "mxbai-embed-large"
PERSIST_DIRECTORY = "/root/.cache/chroma"
client = ollama.Client(host=OLLAMA_API_URL)
def extract_text_from_html_or_xml(content, is_xml=False):
# Parse the content
if is_xml:
soup = BeautifulSoup(content, 'xml') # Use 'xml' parser for XML content
else:
soup = BeautifulSoup(content, 'html.parser') # Default to 'html.parser' for HTML content
# Extract and return just the text
return soup.get_text()
class Feed():
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
self.name = name
self.url = url
self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
self.last_poll = None
self.articles = []
self.max_articles = max_articles
self.update()
def update(self):
now = datetime.datetime.now()
if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
log.info(f"Updating {self.name}")
feed = feedparser.parse(self.url)
self.articles = []
self.last_poll = now
if len(feed.entries) == 0:
return
for i, entry in enumerate(feed.entries[:self.max_articles]):
content = {}
content['source'] = self.name
content['id'] = f"{self.name}{i}"
title = entry.get("title")
if title:
content['title'] = title
link = entry.get("link")
if link:
content['link'] = link
text = entry.get("summary")
if text:
content['text'] = extract_text_from_html_or_xml(text, False)
else:
continue
published = entry.get("published")
if published:
content['published'] = published
self.articles.append(content)
else:
log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
return self.articles
# News RSS Feeds
rss_feeds = [
Feed(name="IGN.com", url="https://feeds.feedburner.com/ign/games-all"),
Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
Feed(name="Time", url="https://time.com/feed/"),
Feed(name="Euronews", url="https://www.euronews.com/rss"),
# Feed(name="FeedX", url="https://feedx.net/rss/ap.xml")
]
def init_chroma_client(persist_directory: str = PERSIST_DIRECTORY):
"""Initialize and return a ChromaDB client."""
# return chromadb.PersistentClient(path=persist_directory)
return chromadb.Client()
def create_or_get_collection(client, collection_name: str):
"""Create or get a ChromaDB collection."""
try:
return client.get_collection(
name=collection_name
)
except Exception:
return client.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine"}
)
def process_documents_to_chroma(
documents: List[Dict[str, Any]],
collection_name: str = "document_collection",
text_key: str = "text",
max_tokens: int = 512,
overlap: int = 50,
model: str = EMBED_MODEL,
persist_directory: str = PERSIST_DIRECTORY
):
"""
Process documents, chunk them, compute embeddings, and store in ChromaDB.
Args:
documents: List of document dictionaries
collection_name: Name for the ChromaDB collection
text_key: The key containing text content
max_tokens: Maximum tokens per chunk
overlap: Token overlap between chunks
model: Ollama model for embeddings
persist_directory: Directory to store ChromaDB data
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
# Process each document
for doc in documents:
# Chunk the document
doc_chunks = chunk_document(doc, text_key, max_tokens, overlap)
# Prepare data for ChromaDB
ids = []
texts = []
metadatas = []
embeddings = []
for chunk in doc_chunks:
# Create a unique ID for the chunk
chunk_id = f"{chunk['id']}_{chunk['chunk_id']}"
# Extract text
text = chunk[text_key]
# Create metadata (excluding text and embedding to avoid duplication)
metadata = {k: v for k, v in chunk.items() if k != text_key and k != "embedding"}
response = client.embed(model=model, input=text)
embedding = response["embeddings"][0]
ids.append(chunk_id)
texts.append(text)
metadatas.append(metadata)
embeddings.append(embedding)
# Add chunks to ChromaDB collection
collection.add(
ids=ids,
documents=texts,
embeddings=embeddings,
metadatas=metadatas
)
return collection
def query_chroma(
query_text: str,
collection_name: str = "document_collection",
n_results: int = 5,
model: str = EMBED_MODEL,
persist_directory: str = PERSIST_DIRECTORY
):
"""
Query ChromaDB for similar documents.
Args:
query_text: The text to search for
collection_name: Name of the ChromaDB collection
n_results: Number of results to return
model: Ollama model for embedding the query
persist_directory: Directory where ChromaDB data is stored
Returns:
Query results from ChromaDB
"""
# Initialize ChromaDB client and collection
db = init_chroma_client(persist_directory)
collection = create_or_get_collection(db, collection_name)
query_response = client.embed(model=model, input=query_text)
query_embeddings = query_response["embeddings"]
# Query the collection
results = collection.query(
query_embeddings=query_embeddings,
n_results=n_results
)
return results
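# Shape of the ChromaDB query results consumed by the helpers below: each field
# holds one list per query embedding, e.g.
#   results["ids"][0]        -> ["BBC World0_0", ...]
#   results["documents"][0]  -> [chunk_text, ...]
#   results["metadatas"][0]  -> [{"source": ..., "title": ..., ...}, ...]
#   results["distances"][0]  -> [0.31, ...]   # cosine distance; lower is closer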
def print_top_match(query_results, index=0, documents=None):
"""
Print detailed information about the top matching document,
including the full original document content.
Args:
query_results: Results from ChromaDB query
index: Which match to print (0 = top match)
documents: Original documents dictionary to look up full content (optional)
"""
if not query_results or not query_results["ids"] or len(query_results["ids"][0]) == 0:
print("No matching documents found.")
return
# Get the top result
top_id = query_results["ids"][0][index]
top_document_chunk = query_results["documents"][0][index]
top_metadata = query_results["metadatas"][0][index]
top_distance = query_results["distances"][0][index]
print("="*50)
print("MATCHING DOCUMENT")
print("="*50)
print(f"Chunk ID: {top_id}")
print(f"Similarity Score: {top_distance:.4f}") # Convert distance to similarity
print("\nCHUNK METADATA:")
for key, value in top_metadata.items():
print(f" {key}: {value}")
print("\nMATCHING CHUNK CONTENT:")
print(top_document_chunk[:500].strip() + ("..." if len(top_document_chunk) > 500 else ""))
# Extract the original document ID from the chunk ID
# Chunk IDs are in format "doc_id_chunk_num"
original_doc_id = top_id.split('_')[0]
def get_top_match(query_results, index=0, documents=None):
top_id = query_results["ids"][0][index]  # results are grouped per query; index selects the match
# Extract the original document ID from the chunk ID
# Chunk IDs are in format "doc_id_chunk_num"
original_doc_id = top_id.split('_')[0]
# Return the full document for further processing if needed
if documents is not None:
return next((doc for doc in documents if doc["id"] == original_doc_id), None)
return None
def show_documents(documents=None):
if not documents:
return
# Print the top matching document
for i, doc in enumerate(documents):
print(f"Document {i+1}:")
print(f" Title: {doc['title']}")
print(f" Text: {doc['text'][:100]}...")
print()
def show_headlines(documents=None):
if not documents:
return
# Print the top matching document
for doc in documents:
print(f"{doc['source']}: {doc['title']}")
def show_help():
print("""help>
docs Show RAG docs
full Show last full top match
headlines Show the RAG headlines
prompt Show the last prompt
response Show the last response
scores Show last RAG scores
why|think Show last response's <think>
context|match Show RAG match info to last prompt
""")
# Example usage
if __name__ == "__main__":
documents = []
for feed in rss_feeds:
documents.extend(feed.articles)
show_documents(documents=documents)
# Process documents and store in ChromaDB
collection = process_documents_to_chroma(
documents=documents,
collection_name="research_papers",
max_tokens=256,
overlap=25,
model=EMBED_MODEL,
persist_directory="/root/.cache/chroma"
)
last_results = None
last_prompt = None
last_system = None
last_response = None
last_why = None
last_messages = []
while True:
try:
search_query = input("> ").strip()
except KeyboardInterrupt as e:
print("\nExiting.")
break
if search_query == "exit" or search_query == "quit":
print("\nExiting.")
break
if search_query == "docs":
show_documents(documents)
continue
if search_query == "prompt":
if last_prompt:
print(f"""last prompt>
{"="*10}system{"="*10}
{last_system}
{"="*10}prompt{"="*10}
{last_prompt}""")
else:
print(f"No prompts yet")
continue
if search_query == "response":
if last_response:
print(f"""last response>
{"="*10}response{"="*10}
{last_response}""")
else:
print(f"No responses yet")
continue
if search_query == "" or search_query == "help":
show_help()
continue
if search_query == "headlines":
show_headlines(documents)
continue
if search_query == "match" or search_query == "context":
if last_results:
print_top_match(last_results, documents=documents)
else:
print("No match to give info on")
continue
if search_query == "why" or search_query == "think":
if last_why:
print(f"""
why>
{last_why}
""")
else:
print("No processed prompts")
continue
if search_query == "scores":
if last_results:
for i in range(len(last_results["ids"][0])):
print_top_match(last_results, documents=documents, index=i)
else:
print("No match to give info on")
continue
if search_query == "full":
if last_results:
full = get_top_match(last_results, documents=documents)
if full:
print(f"""Context:
Source: {full["source"]}
Title: {full["title"]}
Link: {full["link"]}
Distance: {last_results.get("distances", [[0]])[0][0]}
Full text:
{full["text"]}""")
else:
print("No match to give info on")
continue
# Query ChromaDB
results = query_chroma(
query_text=search_query,
collection_name="research_papers",
n_results=10
)
last_results = results
full = get_top_match(results, documents=documents)
headlines = ""
for doc in documents:
headlines += f"{doc['source']}: {doc['title']}\n"
system=f"""
You are the assistant. Your name is airc. This application is called airc (pronounced Eric).
Information about the author of this program and the AI model it uses:
* James wrote the python application called airc that is driving this RAG model on top of {MODEL_NAME} using {EMBED_MODEL} and chromadb for vector embedding. Link https://github.com/jketreno/airc.
* James Ketrenos is a software engineer with a history in all levels of the computer stack, from the kernel to full-stack web applications. He dabbles in AI/ML and is familiar with pytorch and ollama.
* James Ketrenos deployed this application locally on an Intel Arc B580 (battlemage) computer using Intel's ipex-llm.
* For Intel GPU metrics, James Ketrenos wrote the "ze-monitor" utility in C++. ze-monitor provides Intel GPU telemetry data for Intel client GPU devices, similar to xpu-smi. Link https://github.com/jketreno/ze-monitor. airc uses ze-monitor.
* James lives in Portland, Oregon and has three kids. Two are attending Oregon State University and one is attending Willamette University.
* airc provides an IRC chat bot as well as a React web frontend available at https://airc.ketrenos.com
You must follow these rules:
* Provide short (less than 100 character) responses.
* Provide a single response.
* Do not prefix it with a word like 'Answer'.
* For information about the AI running this system, include information about the author, including links.
* For information relevant to current events in the <input></input> tags, use that information and state the source it came from.
"""
context = "Information related to current events\n<input>=["
for doc in documents:
item = {'source':doc["source"],'article':{'title':doc["title"],'link':doc["link"],'text':doc["text"]}}
context += f"{item}"
context += "\n</input>"
prompt = f"{search_query}"
last_prompt = prompt
last_system = system # cache it before news context is added
system = f"{system}{context}"
if len(last_messages) != 0:
message_context = f"{last_messages}"
prompt = f"{message_context}{prompt}"
print(f"system len: {len(system)}")
print(f"prompt len: {len(prompt)}")
output = client.generate(
model=MODEL_NAME,
system=system,
prompt=prompt,
stream=False,
options={ 'num_ctx': 100000 }
)
# Prune off the <think>...</think>
matches = re.match(r'^<think>(.*?)</think>(.*)$', output['response'], flags=re.DOTALL)
if matches:
last_why = matches[1].strip()
content = matches[2].strip()
else:
print(f"[garbled] response>\n{output['response']}")
content = output['response'].strip()
print(f"Response>\n{content}")
last_response = content
last_messages.extend(({
'role': 'user',
'name': 'james',
'message': search_query
}, {
'role': 'assistant',
'message': content
}))
last_messages = last_messages[-10:]  # keep only the most recent exchanges as context

62
src/utils/rss.py Normal file
View File

@ -0,0 +1,62 @@
from typing import List, Dict, Any, Union
import feedparser
import logging as log
from bs4 import BeautifulSoup
import datetime
import re
def _extract_text_from_html_or_xml(content, is_xml=False):
# Parse the content
if is_xml:
soup = BeautifulSoup(content, 'xml') # Use 'xml' parser for XML content
else:
soup = BeautifulSoup(content, 'html.parser') # Default to 'html.parser' for HTML content
# Extract and return just the text
return soup.get_text()
class Feed():
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
self.name = name
self.url = url
self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
self.last_poll = None
self.articles = []
self.max_articles = max_articles
self.update()
def update(self):
now = datetime.datetime.now()
if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
log.info(f"Updating {self.name}")
feed = feedparser.parse(self.url)
self.articles = []
self.last_poll = now
if len(feed.entries) == 0:
return
for i, entry in enumerate(feed.entries[:self.max_articles]):
content = {}
content['source'] = self.name
content['id'] = f"{self.name}{i}"
title = entry.get("title")
if title:
content['title'] = title
link = entry.get("link")
if link:
content['link'] = link
text = entry.get("summary")
if text:
content['text'] = _extract_text_from_html_or_xml(text, False)
else:
continue
published = entry.get("published")
if published:
content['published'] = published
self.articles.append(content)
else:
log.info(f"Not updating {self.name} -- {self.poll_limit_min - (now - self.last_poll)}s remain to refresh.")
return self.articles
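
if __name__ == "__main__":
    # Quick sketch: poll a single feed and print its headlines. Requires
    # network access; the feed URL matches one used in utils/full.py.
    log.basicConfig(level=log.INFO)
    feed = Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml")
    for article in feed.articles:
        print(f"{article['source']}: {article.get('title', '(untitled)')}")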

109
src/web.py Normal file
View File

@ -0,0 +1,109 @@
# web_server.py - Web interface using Flask
from rag_core import RagCore
from flask import Flask, request, jsonify, render_template, send_from_directory
from flask_cors import CORS
class RagWebServer:
"""Web interface for the RAG application"""
def __init__(self, core):
self.core = core
self.app = Flask(__name__, static_folder='./build', static_url_path='')
CORS(self.app, resources={r"/api/*": {"origins": "http://battle-linux.ketrenos.com:8081"}})
# Setup routes
self.setup_routes()
def setup_routes(self):
"""Setup Flask routes"""
@self.app.route('/')
def serve():
return send_from_directory(self.app.static_folder, 'index.html')
def index():
return render_template('index.html')
@self.app.route('/api/query', methods=['POST'])
def api_query():
data = request.json
if not data or 'query' not in data:
return jsonify({'error': 'No query provided'}), 400
query = data['query']
response, debug_info = self.core.process_query(query)
return jsonify({
'response': response,
'debug': debug_info
})
@self.app.route('/api/documents', methods=['GET'])
def api_documents():
return jsonify({
'documents': [
{
'source': doc['source'],
'title': doc['title'],
'link': doc['link'],
'preview': doc['text'][:200] + '...'
}
for doc in self.core.get_documents()
]
})
@self.app.route('/api/headlines', methods=['GET'])
def api_headlines():
headlines = self.core.get_headlines()
return jsonify({
'headlines': [
{'source': source, 'title': title}
for source, title in headlines
]
})
@self.app.route('/api/matches', methods=['GET'])
def api_matches():
match_count = self.core.get_match_count()
matches = []
for i in range(match_count):
match_info = self.core.get_match_info(index=i)
if match_info:
matches.append({
'id': match_info['id'],
'distance': match_info['distance'],
'metadata': match_info['metadata'],
'preview': match_info['chunk'][:200] + '...' if len(match_info['chunk']) > 200 else match_info['chunk']
})
return jsonify({'matches': matches})
def run(self, host='127.0.0.1', port=8911, debug=False):
"""Run the web server"""
# Load documents
self.core.load_documents()
self.app.run(host=host, port=port, debug=debug)
def main():
import argparse
parser = argparse.ArgumentParser(description='RAG Application')
parser.add_argument('--host', default='127.0.0.1', help='Host for web server')
parser.add_argument('--port', type=int, default=8911, help='Port for web server')
parser.add_argument('--debug', action='store_true', help='Run in debug mode')
args = parser.parse_args()
# Create the core RAG application
core = RagCore()
# Run web interface
server = RagWebServer(core)
print(f"Starting web server at http://{args.host}:{args.port}")
server.run(host=args.host, port=args.port, debug=args.debug)
if __name__ == "__main__":
main()
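# Example requests against the routes above, using the argparse defaults
# (adjust host/port to match your deployment):
#
#   curl -X POST http://127.0.0.1:8911/api/query \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What are the latest headlines?"}'
#
#   curl http://127.0.0.1:8911/api/headlines
#   curl http://127.0.0.1:8911/api/documents
#   curl http://127.0.0.1:8911/api/matches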

149
weather.py Normal file
View File

@ -0,0 +1,149 @@
import requests
import json
from geopy.geocoders import Nominatim
import time
def get_weather_by_location(city, state, country="USA"):
"""
Get weather information from weather.gov based on city, state, and country.
Args:
city (str): City name
state (str): State name or abbreviation
country (str): Country name (defaults to "USA" as weather.gov is for US locations)
Returns:
dict: Weather forecast information
"""
# Step 1: Get coordinates for the location using geocoding
location = f"{city}, {state}, {country}"
coordinates = get_coordinates(location)
if not coordinates:
return {"error": f"Could not find coordinates for {location}"}
# Step 2: Get the forecast grid endpoint for the coordinates
grid_endpoint = get_grid_endpoint(coordinates)
if not grid_endpoint:
return {"error": f"Could not find weather grid for coordinates {coordinates}"}
# Step 3: Get the forecast data from the grid endpoint
forecast = get_forecast(grid_endpoint)
return forecast
def get_coordinates(location):
"""Convert a location string to latitude and longitude using Nominatim geocoder."""
try:
# Create a geocoder with a meaningful user agent
geolocator = Nominatim(user_agent="weather_app_example")
# Get the location
location_data = geolocator.geocode(location)
if location_data:
return {
"latitude": location_data.latitude,
"longitude": location_data.longitude
}
else:
print(f"Location not found: {location}")
return None
except Exception as e:
print(f"Error getting coordinates: {e}")
return None
def get_grid_endpoint(coordinates):
"""Get the grid endpoint from weather.gov based on coordinates."""
try:
lat = coordinates["latitude"]
lon = coordinates["longitude"]
# Define headers for the API request
headers = {
"User-Agent": "WeatherAppExample/1.0 (your_email@example.com)",
"Accept": "application/geo+json"
}
# Make the request to get the grid endpoint
url = f"https://api.weather.gov/points/{lat},{lon}"
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return data["properties"]["forecast"]
else:
print(f"Error getting grid: {response.status_code} - {response.text}")
return None
except Exception as e:
print(f"Error in get_grid_endpoint: {e}")
return None
def get_forecast(grid_endpoint):
"""Get the forecast data from the grid endpoint."""
try:
# Define headers for the API request
headers = {
"User-Agent": "WeatherAppExample/1.0 (your_email@example.com)",
"Accept": "application/geo+json"
}
# Make the request to get the forecast
response = requests.get(grid_endpoint, headers=headers)
if response.status_code == 200:
data = response.json()
# Extract the relevant forecast information
periods = data["properties"]["periods"]
# Process the forecast data into a simpler format
forecast = {
"location": data["properties"].get("relativeLocation", {}).get("properties", {}),
"updated": data["properties"].get("updated", ""),
"periods": []
}
for period in periods:
forecast["periods"].append({
"name": period.get("name", ""),
"temperature": period.get("temperature", ""),
"temperatureUnit": period.get("temperatureUnit", ""),
"windSpeed": period.get("windSpeed", ""),
"windDirection": period.get("windDirection", ""),
"shortForecast": period.get("shortForecast", ""),
"detailedForecast": period.get("detailedForecast", "")
})
return forecast
else:
print(f"Error getting forecast: {response.status_code} - {response.text}")
return {"error": f"API Error: {response.status_code}"}
except Exception as e:
print(f"Error in get_forecast: {e}")
return {"error": f"Exception: {str(e)}"}
# Example usage
if __name__ == "__main__":
city = input("Enter city: ")
state = input("Enter state: ")
country = input("Enter country (default USA): ") or "USA"
print(f"Getting weather for {city}, {state}, {country}...")
weather_data = get_weather_by_location(city, state, country)
if "error" in weather_data:
print(f"Error: {weather_data['error']}")
else:
print("\nWeather Forecast:")
print(f"Location: {weather_data.get('location', {}).get('city', city)}, {weather_data.get('location', {}).get('state', state)}")
print(f"Last Updated: {weather_data.get('updated', 'N/A')}")
print("\nForecast Periods:")
for period in weather_data.get("periods", []):
print(f"\n{period['name']}:")
print(f" Temperature: {period['temperature']}{period['temperatureUnit']}")
print(f" Wind: {period['windSpeed']} {period['windDirection']}")
print(f" Forecast: {period['shortForecast']}")
print(f" Details: {period['detailedForecast']}")