From d0b652aa09d5eeb9568fa038f2aeeead34f7fbf5 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Thu, 6 Mar 2025 13:48:18 -0800
Subject: [PATCH] Initial commit

---
 .dockerignore       |   2 +
 .gitignore          |   2 +
 Dockerfile          | 396 ++++++++++++++++++++++++++++++++++++++++++++
 LICENSE             |  24 +++
 README.md           |  86 ++++++++++
 cache/.keep         |   0
 docker-compose.yml  |  31 ++++
 src/.keep           |   0
 src/airc.py         | 187 +++++++++++++++++++++
 src/model-server.py | 171 +++++++++++++++++++
 src/pydle.patch     |  56 +++++++
 11 files changed, 955 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 cache/.keep
 create mode 100644 docker-compose.yml
 create mode 100644 src/.keep
 create mode 100644 src/airc.py
 create mode 100644 src/model-server.py
 create mode 100644 src/pydle.patch

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..7a1eba3
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+*
+!src
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..45a744e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.env
+cache/**
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..67cee3a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,396 @@
+FROM ubuntu:oracular AS pytorch-build
+
+SHELL [ "/bin/bash", "-c" ]
+
+# Instructions Dockerized from:
+#
+# https://github.com/pytorch/pytorch
+#
+# and
+#
+# https://pytorch.org/docs/stable/notes/get_start_xpu.html
+# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
+#
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    gpg \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+
+# ipex only supports Python 3.11, so build 3.11 instead of using oracular's
+# default (3.12)
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    ca-certificates \
+    ccache \
+    cmake \
+    curl \
+    git \
+    gpg-agent \
+    less \
+    libbz2-dev \
+    libffi-dev \
+    libjpeg-dev \
+    libpng-dev \
+    libreadline-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    llvm \
+    nano \
+    wget \
+    zlib1g-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+#    python3 \
+#    python3-pip \
+#    python3-venv \
+#    python3-dev \
+
+RUN /usr/sbin/update-ccache-symlinks
+RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
+
+# Build Python in /opt/..., install it locally, then remove the build tree,
+# all collapsed into a single docker layer.
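+# (--enable-optimizations below turns on profile-guided optimization; the
+# build takes considerably longer but typically yields a faster interpreter.)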
+WORKDIR /opt
+ENV PYTHON_VERSION=3.11.9
+
+RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
+    && cd Python-${PYTHON_VERSION} \
+    && ./configure --prefix=/opt/python --enable-optimizations \
+    && make -j$(nproc) \
+    && make install \
+    && cd /opt \
+    && rm -rf Python-${PYTHON_VERSION}
+
+WORKDIR /opt/pytorch
+
+FROM ubuntu:oracular AS ze-monitor
+# From https://github.com/jketreno/ze-monitor
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    debhelper \
+    devscripts \
+    cmake \
+    git \
+    libfmt-dev \
+    libncurses-dev \
+    rpm \
+    rpm2cpio \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+RUN apt-get install -y \
+    software-properties-common \
+    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
+    && apt-get update \
+    && apt-get install -y \
+    libze-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+RUN git clone --depth 1 --branch v0.3.0-1 https://github.com/jketreno/ze-monitor /opt/ze-monitor
+WORKDIR /opt/ze-monitor/build
+RUN cmake .. \
+    && make \
+    && cpack
+
+FROM pytorch-build AS pytorch
+
+COPY --from=pytorch-build /opt/pytorch /opt/pytorch
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
+    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
+    && apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    libze-intel-gpu1 \
+    libze1 \
+    intel-ocloc \
+    intel-opencl-icd \
+    xpu-smi \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2
+
+# When the cache is enabled, the SYCL runtime caches and reuses JIT-compiled binaries.
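+# (The compiled-kernel cache lives under $HOME/.cache unless SYCL_CACHE_DIR
+# overrides it; docker-compose bind mounts /root/.cache, so JIT results should
+# survive container restarts.)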
+ENV SYCL_CACHE_PERSISTENT=1 + +WORKDIR /opt/pytorch + +RUN { \ + echo '#!/bin/bash' ; \ + update-alternatives --set python3 /opt/python/bin/python3.11 ; \ + echo 'source /opt/pytorch/venv/bin/activate' ; \ + echo 'bash -c "${@}"' ; \ + } > /opt/pytorch/shell ; \ + chmod +x /opt/pytorch/shell + +RUN python3 -m venv --system-site-packages /opt/pytorch/venv + +SHELL [ "/opt/pytorch/shell" ] + +RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu +RUN pip3 freeze > /opt/pytorch/requirements.txt + +SHELL [ "/bin/bash", "-c" ] + +RUN { \ + echo '#!/bin/bash' ; \ + echo 'echo "Container: pytorch"' ; \ + echo 'set -e' ; \ + echo 'echo "Setting pip environment to /opt/pytorch"' ; \ + echo 'source /opt/pytorch/venv/bin/activate'; \ + echo 'if [[ "${1}" == "" ]] || [[ "${1}" == "shell" ]]; then' ; \ + echo ' echo "Dropping to shell"' ; \ + echo ' /bin/bash -c "source /opt/pytorch/venv/bin/activate ; /bin/bash"' ; \ + echo 'else' ; \ + echo ' exec "${@}"' ; \ + echo 'fi' ; \ + } > /entrypoint.sh \ + && chmod +x /entrypoint.sh + +ENTRYPOINT [ "/entrypoint.sh" ] + +FROM pytorch AS ipex-2.6.10 + +WORKDIR /opt +RUN git clone --branch release/xpu/2.6.10 --depth 1 https://github.com/intel/intel-extension-for-pytorch.git ipex-2.6.10 +WORKDIR /opt/ipex-2.6.10 + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ + | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \ + && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ + | tee /etc/apt/sources.list.d/oneAPI.list \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + intel-deep-learning-essentials-2025.0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} + +# Requirements for building ipex / oneAPI... 
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    libspdlog-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+RUN python3 -m venv --system-site-packages /opt/ipex-2.6.10/venv
+
+RUN { \
+    echo '#!/bin/bash' ; \
+    update-alternatives --set python3 /opt/python/bin/python3.11 ; \
+    echo 'source /opt/intel/oneapi/setvars.sh' ; \
+    echo 'source /opt/ipex-2.6.10/venv/bin/activate' ; \
+    echo 'bash -c "${@}"' ; \
+    } > /opt/ipex-2.6.10/shell ; \
+    chmod +x /opt/ipex-2.6.10/shell
+
+SHELL [ "/opt/ipex-2.6.10/shell" ]
+
+#RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
+RUN pip3 install -r requirements.txt
+
+RUN git submodule update --init --recursive --depth 1
+
+# Building the ipex-2.6.10 wheel requires the level-zero loader (libze-dev)
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    libze-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+# torch needs to be installed before the ipex wheel will build
+RUN pip3 install torch --index-url https://download.pytorch.org/whl/test/xpu
+
+RUN python setup.py bdist_wheel
+
+FROM pytorch AS ipex-llm-src
+
+# Build ipex-llm from source
+
+RUN git clone --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm
+
+WORKDIR /opt/ipex-llm
+
+RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
+RUN { \
+    echo '#!/bin/bash' ; \
+    update-alternatives --set python3 /opt/python/bin/python3.11 ; \
+    echo 'source /opt/ipex-llm/venv/bin/activate' ; \
+    echo 'bash -c "${@}"' ; \
+    } > /opt/ipex-llm/shell ; \
+    chmod +x /opt/ipex-llm/shell
+
+SHELL [ "/opt/ipex-llm/shell" ]
+
+RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
+COPY --from=ipex-2.6.10 /opt/ipex-2.6.10/dist/intel_extension_for_pytorch-2.6.10*.whl /opt/wheels/
+RUN for pkg in /opt/wheels/intel_extension_for_pytorch-2.6.10*.whl; do pip install $pkg[xpu-2-6]; done
+
+WORKDIR /opt/ipex-llm/python/llm
+RUN pip install requests wheel
+RUN python setup.py clean --all bdist_wheel --linux
+
+FROM airc AS jupyter
+
+SHELL [ "/opt/airc/shell" ]
+
+# BEGIN setup Jupyter
+RUN pip install jupyter \
+    jupyterlab==4.3.0a0 \
+    jupyterhub==5.0.0 \
+    notebook==7.3.0a0 \
+    "jupyter-server-proxy>=4.1.2"
+# END setup Jupyter
+
+SHELL [ "/bin/bash", "-c" ]
+
+RUN { \
+    echo '#!/bin/bash' ; \
+    echo 'echo "Container: airc jupyter"' ; \
+    echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
+    echo '  if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
+    echo '    echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=" >&2' ; \
+    echo '    exit 1' ; \
+    echo '  else' ; \
+    echo '    if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
+    echo '    echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
+    echo '  fi' ; \
+    echo 'fi' ; \
+    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
+    echo 'source /opt/intel/oneapi/setvars.sh' ; \
+    echo 'source /opt/airc/venv/bin/activate' ; \
+    echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
+    echo 'while true; do' ; \
+    echo '  echo "Launching jupyter notebook"' ; \
+    echo '  jupyter notebook \' ; \
+    echo '    --notebook-dir=/opt/jupyter \' ; \
+    echo '    --port 8888 \' ; \
+    echo '    --ip 0.0.0.0 \' ; \
+    echo '    --no-browser \' ; \
+    echo '    --allow-root \' ; \
+    echo '    --ServerApp.token= \' ; \
+    echo '    --ServerApp.password= \' ; \
+    echo '    --ServerApp.allow_origin=* \' ; \
+    echo '    --ServerApp.base_url="/jupyter" \' ; \
+    echo '    "${@}" \' ; \
+    echo '    >> "/root/.cache/jupyter.log" 2>&1' ; \
+    echo '  echo "jupyter notebook died ($?). Restarting."' ; \
+    echo '  sleep 5' ; \
+    echo 'done' ; \
+    } > /entrypoint-jupyter.sh \
+    && chmod +x /entrypoint-jupyter.sh
+
+ENTRYPOINT [ "/entrypoint-jupyter.sh" ]
+
+FROM pytorch AS airc
+
+RUN python3 -m venv --system-site-packages /opt/airc/venv
+
+# Don't install the full oneAPI essentials; just the pieces we seem to need
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+    | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
+    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
+    | tee /etc/apt/sources.list.d/oneAPI.list \
+    && apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    intel-oneapi-mkl-sycl-2025.0 \
+    intel-oneapi-dnnl-2025.0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+RUN { \
+    echo '#!/bin/bash' ; \
+    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
+    echo 'source /opt/intel/oneapi/setvars.sh' ; \
+    echo 'source /opt/airc/venv/bin/activate' ; \
+    echo 'if [[ "$1" == "" ]]; then bash; else bash -c "${@}"; fi' ; \
+    } > /opt/airc/shell ; \
+    chmod +x /opt/airc/shell
+
+SHELL [ "/opt/airc/shell" ]
+
+RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
+# Install the ipex wheel built in the ipex-2.6.10 stage
+COPY --from=ipex-2.6.10 /opt/ipex-2.6.10/dist/*.whl /opt/wheels/
+RUN for pkg in /opt/wheels/intel_extension_for_pytorch-2.6.10*.whl; do pip install $pkg[xpu-2-6]; done
+# Install the ipex-llm wheel built in the ipex-llm-src stage
+COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
+RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install $pkg; done
+
+COPY src/ /opt/airc/src/
+
+# pydle does not work with newer asyncio, where the long-deprecated
+# asyncio.coroutine has been removed. Patch it to work.
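+# (The patch shims the removed asyncio.coroutine decorator with an async-def
+# wrapper, drops the removed loop= argument to asyncio.open_connection, and
+# serializes recv() with a lock; see src/pydle.patch at the end of this commit.)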
+RUN pip3 install pydle transformers sentencepiece accelerate \
+    && patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
+    -p1 < /opt/airc/src/pydle.patch
+
+# mistral fails with cache_position errors with transformers>4.40 (or at
+# least it fails with the latest), so pin to 4.40
+RUN pip install transformers==4.40
+
+# To get xe_linear and other Xe methods
+RUN pip3 install 'bigdl-core-xe-all>=2.6.0b'
+
+# trl.core doesn't have what is needed in the released 'pip install trl'
+# version, so install from a known-good commit
+RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
+
+# Needed by src/model-server.py
+RUN pip install flask
+
+SHELL [ "/bin/bash", "-c" ]
+
+RUN { \
+    echo '#!/bin/bash' ; \
+    echo 'set -e' ; \
+    echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
+    echo '  if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
+    echo '    echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=" >&2' ; \
+    echo '    exit 1' ; \
+    echo '  else' ; \
+    echo '    if [[ ! -d '/root/.cache/hub' ]]; then mkdir -p /root/.cache/hub; fi' ; \
+    echo '    echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
+    echo '  fi' ; \
+    echo 'fi' ; \
+    echo 'echo "Container: airc"' ; \
+    echo 'echo "Setting pip environment to /opt/airc"' ; \
+    echo 'source /opt/intel/oneapi/setvars.sh' ; \
+    echo 'source /opt/airc/venv/bin/activate' ; \
+    echo 'if [[ "${1}" == "shell" ]] || [[ "${1}" == "/bin/bash" ]]; then' ; \
+    echo '  echo "Dropping to shell"' ; \
+    echo '  /bin/bash -c "source /opt/airc/venv/bin/activate ; /bin/bash"' ; \
+    echo '  exit $?' ; \
+    echo 'else' ; \
+    echo '  while true; do' ; \
+    echo '    echo "Launching model-server"' ; \
+    echo '    python src/model-server.py \' ; \
+    echo '      2>&1 | tee -a "/root/.cache/model-server.log"' ; \
+    echo '    echo "model-server died ($?). Restarting."' ; \
+    echo '    sleep 5' ; \
+    echo '  done &' ; \
+    echo '  while true; do' ; \
+    echo '    echo "Launching airc"' ; \
+    echo '    python src/airc.py "${@}" \' ; \
+    echo '      2>&1 | tee -a "/root/.cache/airc.log"' ; \
+    echo '    echo "airc died ($?). Restarting."' ; \
+    echo '    sleep 5' ; \
+    echo '  done' ; \
+    echo 'fi' ; \
+    } > /entrypoint-airc.sh \
+    && chmod +x /entrypoint-airc.sh
+
+COPY --from=ze-monitor /opt/ze-monitor/build/ze-monitor-*deb /opt/
+RUN dpkg -i /opt/ze-monitor-*deb
+
+WORKDIR /opt/airc
+
+ENTRYPOINT [ "/entrypoint-airc.sh" ]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d8181f0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+BSD 2-Clause License
+
+Copyright (c) 2025, James Ketrenos
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f3c1021
--- /dev/null
+++ b/README.md
@@ -0,0 +1,86 @@
+# AIRC (pronounced Eric)
+
+AI is Really Cool
+
+NOTE: Intel Arc A-series GPUs do not support fp64; models that require it
+either need fp64 emulation or must be quantized.
+
+This project provides container definitions for running PyTorch 2.6 with
+Intel's ipex-llm. It also includes a small local chat server and an IRC
+client that together provide a chat bot.
+
+# Installation
+
+This project builds inside docker containers. As it was originally written
+to work on an Intel Arc B580 (Battlemage), it requires a kernel that
+supports that hardware, such as the one documented at
+[Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview),
+which runs in Ubuntu Oracular (24.10).
+
+NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/).
+
+## Want to run under WSL2? No can do...
+
+https://www.intel.com/content/www/us/en/support/articles/000093216/graphics/processor-graphics.html
+
+The A- and B-series discrete GPUs do not support SR-IOV, which is required
+for the GPU partitioning that Microsoft Windows uses to provide GPU
+acceleration in WSL.
+
+## Building
+
+```bash
+git clone https://github.com/jketreno/airc
+cd airc
+docker compose build
+```
+
+## Running
+
+In order to download the models, you need a Hugging Face token. See
+https://huggingface.co/settings/tokens for information on obtaining one.
+
+Create (or edit) .env to add the following:
+
+```.env
+HF_ACCESS_TOKEN=
+```
+
+NOTE: Models downloaded by most examples will be placed in the ./cache
+directory, which is bind mounted into the container.
+
+### AIRC
+
+To launch the airc shell interactively with the PyTorch 2.6 environment
+loaded, use the default entrypoint to launch a shell:
+
+```bash
+docker compose run --rm airc shell
+```
+
+Once in the shell, you can launch model-server.py and then the airc.py
+client:
+
+```bash
+docker compose run --rm airc shell
+src/airc.py --ai-server=http://localhost:5000 &
+src/model-server.py
+```
+
+By default, src/airc.py will connect to irc.libera.chat on the #airc-test
+channel. See `python src/airc.py --help` for options.
+
+By separating the model-server into its own process, you can develop and
+tweak the chat backend without losing the IRC connection established by
+airc.
+
+### Jupyter
+
+```bash
+docker compose up jupyter -d
+```
+
+The default port for inbound connections is 8888 (see docker-compose.yml).
+$(pwd)/jupyter is bind mounted to /opt/jupyter in the container, which is
+where notebooks will be saved by default.
+
+To access the jupyter notebook, go to `http://localhost:8888/jupyter`.
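+### Talking to model-server directly
+
+model-server implements a minimal subset of the OpenAI chat completions API
+(see src/model-server.py for exactly what it accepts). As a quick smoke test
+from a shell inside the airc container (docker-compose does not publish port
+5000 to the host), something along these lines should work once the model
+has loaded:
+
+```bash
+curl -s http://localhost:5000/health
+curl -s http://localhost:5000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 100}'
+```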
diff --git a/cache/.keep b/cache/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..a61746a
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,31 @@
+services:
+  airc:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: airc
+    image: airc
+    restart: "no"
+    env_file:
+      - .env
+    devices:
+      - /dev/dri:/dev/dri
+    volumes:
+      - ./cache:/root/.cache
+      - ./src:/opt/airc/src:rw
+
+  jupyter:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: jupyter
+    image: jupyter
+    env_file:
+      - .env
+    devices:
+      - /dev/dri:/dev/dri
+    ports:
+      - 8888:8888 # Jupyter Notebook
+    volumes:
+      - ./jupyter:/opt/jupyter:rw
+      - ./cache:/root/.cache
diff --git a/src/.keep b/src/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/src/airc.py b/src/airc.py
new file mode 100644
index 0000000..17bda7f
--- /dev/null
+++ b/src/airc.py
@@ -0,0 +1,187 @@
+import asyncio
+import aiohttp
+import argparse
+import pydle
+import logging
+import os
+import re
+import time
+import datetime
+import json
+from typing import Dict, Any
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="AI is Really Cool")
+    parser.add_argument("--server", type=str, default="irc.libera.chat", help="IRC server address")
+    parser.add_argument("--port", type=int, default=6667, help="IRC server port")
+    parser.add_argument("--nickname", type=str, default="airc", help="Bot nickname")
+    parser.add_argument("--channel", type=str, default="#airc-test", help="Channel to join")
+    parser.add_argument("--ai-server", type=str, default="http://localhost:5000", help="OpenAI-compatible API endpoint")
+    parser.add_argument('--level', type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+                        default='INFO', help='Set the logging level.')
+    return parser.parse_args()
+
+class AsyncOpenAIClient:
+    def __init__(self, base_url: str = "http://localhost:5000"):
+        logging.info(f"Using {base_url} as server")
+        self.base_url = base_url
+        self.session = None
+
+    async def __aenter__(self):
+        self.session = aiohttp.ClientSession()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.session:
+            await self.session.close()
+
+    async def chat_completion(self,
+                              messages: list,
+                              model: str = "my-model",
+                              temperature: float = 0.7,
+                              max_tokens: int = 100) -> Dict[str, Any]:
+        """
+        Make an async chat completion request
+        """
+        url = f"{self.base_url}/v1/chat/completions"
+
+        # Prepare the request payload
+        payload = {
+            "model": model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens
+        }
+
+        try:
+            async with self.session.post(url, json=payload) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"Request failed with status {response.status}: {error_text}")
+
+                return await response.json()
+
+        except Exception as e:
+            print(f"Error during request: {str(e)}")
+            return {"error": str(e)}
+
+def setup_logging(level):
+    numeric_level = getattr(logging, level.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError(f"Invalid log level: {level}")
+
+    logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
+    logging.info(f"Logging is set to {level} level.")
+
+class AIRC(pydle.Client):
+    def __init__(self, nick, channel, client, burst_limit = 5, rate_limit = 1.0, burst_reset_timeout = 10.0):
+        super().__init__(nick)
+        self.nick = nick
+        self.channel = channel
+        self.burst_limit = burst_limit
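+        # Queue-based rate limiting: allow up to burst_limit messages back to
+        # back, then one every rate_limit seconds until traffic goes idle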
+        self.rate_limit = rate_limit
+        self.burst_reset_timeout = burst_reset_timeout
+        self.sent_burst = 0 # Track messages sent in the current burst
+        self.last_message_time = None # Track last message time
+        self.queries = 0 # Queries handled, reported by the 'stats' command
+        self.processing = 0.0 # Total seconds spent waiting on the AI server
+        self.system_input = "You are a critical assistant. Give concise and accurate answers in less than 120 characters."
+        self._message_queue = asyncio.Queue()
+        self._task = asyncio.create_task(self._send_from_queue())
+        self.client = client
+
+    async def _send_from_queue(self):
+        """Background task that sends queued messages with burst + rate limiting."""
+        while True:
+            target, message = await self._message_queue.get()
+
+            # If burst is still available, send immediately
+            if self.sent_burst < self.burst_limit:
+                self.sent_burst += 1
+            else:
+                await asyncio.sleep(self.rate_limit) # Apply rate limit
+
+            await super().message(target, message) # Send message
+            self.last_message_time = asyncio.get_event_loop().time() # Update last message timestamp
+
+            # Start burst reset countdown after each message
+            asyncio.create_task(self._reset_burst_after_inactivity())
+
+    async def _reset_burst_after_inactivity(self):
+        """Resets burst counter only if no new messages are sent within timeout."""
+        last_time = self.last_message_time
+        await asyncio.sleep(self.burst_reset_timeout) # Wait for inactivity period
+
+        # Only reset if no new messages were sent during the wait
+        if self.last_message_time == last_time:
+            self.sent_burst = 0
+            logging.info("Burst limit reset due to inactivity.")
+
+    async def message(self, target, message):
+        """Splits a multi-line message and sends each line separately."""
+        for line in message.splitlines(): # Splits on both '\n' and '\r\n'
+            if line.strip(): # Ignore empty lines
+                await self._message_queue.put((target, line))
+
+    async def on_connect(self):
+        logging.debug('on_connect')
+        await self.join(self.channel)
+
+    def remove_substring(self, string, substring):
+        return string.replace(substring, "")
+
+    def extract_nick_message(self, input_string):
+        # Pattern with capturing groups for nick and message
+        pattern = r"^\s*([^\s:]+?)\s*:\s*(.+?)$"
+
+        match = re.match(pattern, input_string)
+        if match:
+            nick = match.group(1) # First capturing group
+            message = match.group(2) # Second capturing group
+            return nick, message
+        return None, None # Return None for both if no match
+
+    async def on_message(self, target, source, message):
+        if source == self.nick:
+            return
+        nick, body = self.extract_nick_message(message)
+        if nick == self.nick:
+            content = None
+            if body == "stats":
+                content = f"{self.queries} queries handled in {self.processing:.1f}s"
+            else:
+                messages = [
+                    {"role": "system", "content": self.system_input},
+                    {"role": "user", "content": body}
+                ]
+
+                # Make the request, tracking totals for the 'stats' command
+                start = time.time()
+                response = await self.client.chat_completion(messages)
+                self.processing += time.time() - start
+                self.queries += 1
+
+                # Extract and print just the assistant's message if available
+                if "choices" in response and len(response["choices"]) > 0:
+                    content = response["choices"][0]["message"]["content"]
+                    print(f"\nAssistant: {content}")
+
+            if content:
+                logging.info(f'Sending: {content}')
+                await self.message(target, f"{content}")
+
+def remove_substring(string, substring):
+    return string.replace(substring, "")
+
+async def main():
+    # Parse command-line arguments
+    args = parse_args()
+
+    # Setup logging based on the provided level
+    setup_logging(args.level)
+
+    async with AsyncOpenAIClient(base_url=args.ai_server) as client:
+        bot = AIRC(args.nickname, args.channel, client)
+        await bot.connect(args.server, args.port, tls=False)
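+        # handle_forever() pumps pydle's event loop until the connection closes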
+        await bot.handle_forever()
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/model-server.py b/src/model-server.py
new file mode 100644
index 0000000..d7a35a9
--- /dev/null
+++ b/src/model-server.py
@@ -0,0 +1,171 @@
+from flask import Flask, request, jsonify
+import argparse
+import torch
+import logging
+from ipex_llm.transformers import AutoModelForCausalLM
+import transformers
+import os
+import time
+import datetime
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="AI is Really Cool Server")
+    parser.add_argument("--device", type=int, default=0, help="Device # to use for inference. See --device-list")
+    #parser.add_argument("--device-list", help="List available devices")
+    parser.add_argument('--level', type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+                        default='INFO', help='Set the logging level.')
+    return parser.parse_args()
+
+def setup_logging(level):
+    numeric_level = getattr(logging, level.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError(f"Invalid log level: {level}")
+
+    logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
+    logging.info(f"Logging is set to {level} level.")
+
+class Chat():
+    def __init__(self, device_name):
+        super().__init__()
+        self.device_name = device_name
+        self.system_input = "You are a critical assistant. Give concise and accurate answers in less than 120 characters."
+        self.context = None
+        self.model_path = 'Intel/neural-chat-7b-v3-3'
+        try:
+            logging.info(f"Loading tokenizer from: {self.model_path}")
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token # Set pad_token to eos_token if needed
+
+            self.model = AutoModelForCausalLM.from_pretrained(self.model_path,
+                                                              load_in_4bit=True,
+                                                              optimize_model=True,
+                                                              trust_remote_code=True,
+                                                              use_cache=True)
+            self.model = self.model.half().to(device_name)
+        except Exception as e:
+            logging.error(f"Loading error: {e}")
+            raise # Without a model there is nothing to serve; fail fast
+
+    def remove_substring(self, string, substring):
+        return string.replace(substring, "")
+
+    def generate_response(self, text):
+        prompt = f"### System:\n{self.system_input}\n### User:\n{text}\n### Assistant:\n"
+        start = time.time()
+
+        with torch.autocast(self.device_name, dtype=torch.float16):
+            inputs = self.tokenizer.encode_plus(
+                prompt,
+                add_special_tokens=False,
+                return_tensors="pt",
+                max_length=1000, # Prevent 'Asking to truncate to max_length...'
+                padding=True, # Handles padding automatically
+                truncation=True
+            )
+            input_ids = inputs["input_ids"].to(self.device_name)
+            attention_mask = inputs["attention_mask"].to(self.device_name)
+
+            outputs = self.model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_length=1000,
+                num_return_sequences=1,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+
+        final_outputs = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        final_outputs = self.remove_substring(final_outputs, prompt).strip()
+
+        end = time.time()
+
+        return final_outputs, datetime.timedelta(seconds=end - start)
+
+app = Flask(__name__)
+
+# Basic endpoint for chat completions
+@app.route('/v1/chat/completions', methods=['POST'])
+def chat_completions():
+    logging.info('/v1/chat/completions')
+    try:
+        # Get the JSON data from the request
+        data = request.get_json()
+
+        # Extract relevant fields from the request
+        model = data.get('model', 'default-model')
+        messages = data.get('messages', [])
+        temperature = data.get('temperature', 1.0)
+        max_tokens = data.get('max_tokens', 2048)
+
+        chat = app.config['chat']
+        logging.info(f"Query: {messages}")
+        response_content, _ = chat.generate_response(messages[-1]['content'])
+        logging.info(f"Response: {response_content}")
+        # Format the response in an OpenAI-compatible structure
+        response = {
+            "id": "chatcmpl-" + str(id(data)), # Simple unique ID
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": chat.model_path,
+            "choices": [{
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": response_content
+                },
+                "finish_reason": "stop"
+            }],
+#            "usage": {
+#                "prompt_tokens": len(str(messages).split()),
+#                "completion_tokens": len(response_content.split()),
+#                "total_tokens": len(str(messages).split()) + len(response_content.split())
+#            }
+        }
+
+        return jsonify(response)
+
+    except Exception as e:
+        logging.error(e)
+        return jsonify({
+            "error": {
+                "message": str(e),
+                "type": "invalid_request_error"
+            }
+        }), 400
+
+# Health check endpoint
+@app.route('/health', methods=['GET'])
+def health():
+    return jsonify({"status": "healthy"}), 200
+
+if __name__ == '__main__':
+    # Parse command-line arguments
+    args = parse_args()
+
+    # Setup logging based on the provided level
+    setup_logging(args.level)
+
+    if not torch.xpu.is_available():
+        logging.error("No XPU available.")
+        exit(1)
+    device_count = torch.xpu.device_count()
+    for i in range(device_count):
+        logging.info(f"Device {i}: {torch.xpu.get_device_name(i)} Total memory: {torch.xpu.get_device_properties(i).total_memory}")
+    device_name = 'xpu'
+    device = torch.device(device_name)
+    print(f"Using device: {device}")
+
+    # Set environment variables that might help with XPU stability
+    os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
+
+    app.config['chat'] = Chat(device_name)
+
+    # use_reloader=False: Flask's debug reloader would load the model twice
+    app.run(host='0.0.0.0', port=5000, debug=True, use_reloader=False)
\ No newline at end of file
diff --git a/src/pydle.patch b/src/pydle.patch
new file mode 100644
index 0000000..01ccce7
--- /dev/null
+++ b/src/pydle.patch
@@ -0,0 +1,56 @@
+diff --git a/__init__.py b/__init__.py
+index 2ead20d..892471b 100644
+--- a/__init__.py
++++ b/__init__.py
+@@ -1,11 +1,21 @@
+ # noinspection PyUnresolvedReferences
+-from asyncio import coroutine, Future
++from asyncio import Future
+ from functools import cmp_to_key
+ from . import connection, protocol, client, features
+ from .client import Error, NotInChannel, AlreadyInChannel, BasicClient, ClientPool
+ from .features.ircv3.cap import NEGOTIATING as CAPABILITY_NEGOTIATING, FAILED as CAPABILITY_FAILED, \
+     NEGOTIATED as CAPABILITY_NEGOTIATED
+ 
++import asyncio
++# asyncio.coroutine was removed from newer asyncio; better to switch to
++# async def throughout, but for now shim the removed decorator:
++from functools import wraps
++
++def coroutine(func):
++    @wraps(func)
++    async def wrapper(*args, **kwargs):
++        return func(*args, **kwargs)
++    return wrapper
+ 
+ __name__ = 'pydle'
+ __version__ = '0.9.4rc1'
+diff --git a/connection.py b/connection.py
+index c9a9e8e..5445b0e 100644
+--- a/connection.py
++++ b/connection.py
+@@ -37,6 +37,7 @@ class Connection:
+         self.reader = None
+         self.writer = None
+         self.eventloop = eventloop or asyncio.new_event_loop()
++        self.lock = asyncio.Lock()
+ 
+     async def connect(self):
+         """ Connect to target. """
+@@ -49,8 +50,7 @@ class Connection:
+             host=self.hostname,
+             port=self.port,
+             local_addr=self.source_address,
+-            ssl=self.tls_context,
+-            loop=self.eventloop
++            ssl=self.tls_context
+         )
+ 
+     def create_tls_context(self):
+@@ -112,4 +112,5 @@
+         await self.writer.drain()
+ 
+     async def recv(self, *, timeout=None):
+-        return await asyncio.wait_for(self.reader.readline(), timeout=timeout)
++        async with self.lock:
++            return await asyncio.wait_for(self.reader.readline(), timeout=timeout)
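
For reference, a standalone sketch (hypothetical usage, not part of the commit)
of what the shim in src/pydle.patch does: code that still decorates a plain
function with @coroutine keeps working on Python 3.11, where asyncio.coroutine
no longer exists.

```python
import asyncio
from functools import wraps

def coroutine(func):
    # Wrap a plain function in an async def so callers can still await it
    @wraps(func)
    async def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper

@coroutine
def legacy(x):
    return x * 2

async def main():
    print(await legacy(21))  # -> 42

asyncio.run(main())
```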