Initial commit
commit d0b652aa09

2  .dockerignore  Normal file
@@ -0,0 +1,2 @@
*
!src

2  .gitignore  vendored  Normal file
@@ -0,0 +1,2 @@
.env
cache/**

396  Dockerfile  Normal file
@@ -0,0 +1,396 @@
FROM ubuntu:oracular AS pytorch-build

SHELL [ "/bin/bash", "-c" ]

# Instructions Dockerized from:
#
# https://github.com/pytorch/pytorch
#
# and
#
# https://pytorch.org/docs/stable/notes/get_start_xpu.html
# https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-6.html
#
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gpg \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# ipex only supports Python 3.11, so use 3.11 instead of the latest in oracular (3.12)

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        gpg-agent \
        less \
        libbz2-dev \
        libffi-dev \
        libjpeg-dev \
        libpng-dev \
        libreadline-dev \
        libssl-dev \
        libsqlite3-dev \
        llvm \
        nano \
        wget \
        zlib1g-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

#        python3 \
#        python3-pip \
#        python3-venv \
#        python3-dev \

RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

# Build Python in /opt/..., install it locally, then remove the build tree,
# collapsed into a single docker layer.
WORKDIR /opt
ENV PYTHON_VERSION=3.11.9

RUN wget -q -O - https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz | tar -xz \
    && cd Python-${PYTHON_VERSION} \
    && ./configure --prefix=/opt/python --enable-optimizations \
    && make -j$(nproc) \
    && make install \
    && cd /opt \
    && rm -rf Python-${PYTHON_VERSION}

WORKDIR /opt/pytorch

FROM ubuntu:oracular AS ze-monitor
# From https://github.com/jketreno/ze-monitor
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        debhelper \
        devscripts \
        cmake \
        git \
        libfmt-dev \
        libncurses-dev \
        rpm \
        rpm2cpio \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN apt-get update \
    && apt-get install -y \
        software-properties-common \
    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
    && apt-get update \
    && apt-get install -y \
        libze-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN git clone --depth 1 --branch v0.3.0-1 https://github.com/jketreno/ze-monitor /opt/ze-monitor
WORKDIR /opt/ze-monitor/build
RUN cmake .. \
    && make \
    && cpack

FROM pytorch-build AS pytorch

COPY --from=pytorch-build /opt/pytorch /opt/pytorch

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libze-intel-gpu1 \
        libze1 \
        intel-ocloc \
        intel-opencl-icd \
        xpu-smi \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN update-alternatives --install /usr/bin/python3 python3 /opt/python/bin/python3.11 2

# When the cache is enabled, the SYCL runtime will try to cache and reuse JIT-compiled binaries.
ENV SYCL_CACHE_PERSISTENT=1

WORKDIR /opt/pytorch

RUN { \
    echo '#!/bin/bash' ; \
    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
    echo 'source /opt/pytorch/venv/bin/activate' ; \
    echo 'bash -c "${@}"' ; \
    } > /opt/pytorch/shell ; \
    chmod +x /opt/pytorch/shell
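
# Note: with a custom SHELL, Docker passes each subsequent RUN command to the
# SHELL program as a single argument, so the wrapper above receives the command
# string as "${1}" and runs it inside the activated virtual environment.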

RUN python3 -m venv --system-site-packages /opt/pytorch/venv

SHELL [ "/opt/pytorch/shell" ]

RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
RUN pip3 freeze > /opt/pytorch/requirements.txt
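
# Snapshot the exact package versions installed above; derived stages or users
# can use /opt/pytorch/requirements.txt to reproduce this environment.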

SHELL [ "/bin/bash", "-c" ]

RUN { \
    echo '#!/bin/bash' ; \
    echo 'echo "Container: pytorch"' ; \
    echo 'set -e' ; \
    echo 'echo "Setting pip environment to /opt/pytorch"' ; \
    echo 'source /opt/pytorch/venv/bin/activate' ; \
    echo 'if [[ "${1}" == "" ]] || [[ "${1}" == "shell" ]]; then' ; \
    echo '  echo "Dropping to shell"' ; \
    echo '  /bin/bash -c "source /opt/pytorch/venv/bin/activate ; /bin/bash"' ; \
    echo 'else' ; \
    echo '  exec "${@}"' ; \
    echo 'fi' ; \
    } > /entrypoint.sh \
    && chmod +x /entrypoint.sh

ENTRYPOINT [ "/entrypoint.sh" ]

FROM pytorch AS ipex-2.6.10

WORKDIR /opt
RUN git clone --branch release/xpu/2.6.10 --depth 1 https://github.com/intel/intel-extension-for-pytorch.git ipex-2.6.10
WORKDIR /opt/ipex-2.6.10

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
    | tee /etc/apt/sources.list.d/oneAPI.list \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        intel-deep-learning-essentials-2025.0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Requirements for building ipex / oneAPI...
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libspdlog-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN python3 -m venv --system-site-packages /opt/ipex-2.6.10/venv

RUN { \
    echo '#!/bin/bash' ; \
    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
    echo 'source /opt/intel/oneapi/setvars.sh' ; \
    echo 'source /opt/ipex-2.6.10/venv/bin/activate' ; \
    echo 'bash -c "${@}"' ; \
    } > /opt/ipex-2.6.10/shell ; \
    chmod +x /opt/ipex-2.6.10/shell

SHELL [ "/opt/ipex-2.6.10/shell" ]

#RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
RUN pip3 install -r requirements.txt

RUN git submodule update --init --recursive --depth 1

# Building the ipex-2.6.10 wheel requires the level-zero loader (libze-dev)
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        libze-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# torch needs to be installed before the wheel can be built
RUN pip3 install torch --index-url https://download.pytorch.org/whl/test/xpu

RUN python setup.py bdist_wheel

FROM pytorch AS ipex-llm-src

# Build ipex-llm from source

RUN git clone --depth 1 https://github.com/intel/ipex-llm.git /opt/ipex-llm

WORKDIR /opt/ipex-llm

RUN python3 -m venv --system-site-packages /opt/ipex-llm/venv
RUN { \
    echo '#!/bin/bash' ; \
    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
    echo 'source /opt/ipex-llm/venv/bin/activate' ; \
    echo 'bash -c "${@}"' ; \
    } > /opt/ipex-llm/shell ; \
    chmod +x /opt/ipex-llm/shell

SHELL [ "/opt/ipex-llm/shell" ]

RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
COPY --from=ipex-2.6.10 /opt/ipex-2.6.10/dist/intel_extension_for_pytorch-2.6.10*.whl /opt/wheels/
RUN for pkg in /opt/wheels/intel_extension_for_pytorch-2.6.10*.whl; do pip install "${pkg}[xpu-2-6]"; done

WORKDIR /opt/ipex-llm/python/llm
RUN pip install requests wheel
RUN python setup.py clean --all bdist_wheel --linux

FROM airc AS jupyter

SHELL [ "/opt/airc/shell" ]

# BEGIN setup Jupyter
RUN pip install jupyter \
    jupyterlab==4.3.0a0 \
    jupyterhub==5.0.0 \
    notebook==7.3.0a0 \
    "jupyter-server-proxy>=4.1.2"
# END setup Jupyter

SHELL [ "/bin/bash", "-c" ]

RUN { \
    echo '#!/bin/bash' ; \
    echo 'echo "Container: airc jupyter"' ; \
    echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
    echo '  if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
    echo '    echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
    echo '    exit 1' ; \
    echo '  else' ; \
    echo '    if [[ ! -d /root/.cache/hub ]]; then mkdir -p /root/.cache/hub; fi' ; \
    echo '    echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
    echo '  fi' ; \
    echo 'fi' ; \
    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
    echo 'source /opt/intel/oneapi/setvars.sh' ; \
    echo 'source /opt/airc/venv/bin/activate' ; \
    echo 'if [[ "${1}" == "shell" ]]; then echo "Dropping to shell"; /bin/bash; exit $?; fi' ; \
    echo 'while true; do' ; \
    echo '  echo "Launching jupyter notebook"' ; \
    echo '  jupyter notebook \' ; \
    echo '    --notebook-dir=/opt/jupyter \' ; \
    echo '    --port 8888 \' ; \
    echo '    --ip 0.0.0.0 \' ; \
    echo '    --no-browser \' ; \
    echo '    --allow-root \' ; \
    echo '    --ServerApp.token= \' ; \
    echo '    --ServerApp.password= \' ; \
    echo '    --ServerApp.allow_origin=* \' ; \
    echo '    --ServerApp.base_url="/jupyter" \' ; \
    echo '    "${@}" \' ; \
    echo '    >> "/root/.cache/jupyter.log" 2>&1' ; \
    echo '  echo "jupyter notebook died ($?). Restarting."' ; \
    echo '  sleep 5' ; \
    echo 'done' ; \
    } > /entrypoint-jupyter.sh \
    && chmod +x /entrypoint-jupyter.sh

ENTRYPOINT [ "/entrypoint-jupyter.sh" ]

FROM pytorch AS airc

RUN python3 -m venv --system-site-packages /opt/airc/venv

# Don't install the full oneAPI essentials; just the pieces we seem to need
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
    | tee /etc/apt/sources.list.d/oneAPI.list \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        intel-oneapi-mkl-sycl-2025.0 \
        intel-oneapi-dnnl-2025.0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN { \
    echo '#!/bin/bash' ; \
    echo 'update-alternatives --set python3 /opt/python/bin/python3.11' ; \
    echo 'source /opt/intel/oneapi/setvars.sh' ; \
    echo 'source /opt/airc/venv/bin/activate' ; \
    echo 'if [[ "${1}" == "" ]]; then bash; else bash -c "${@}"; fi' ; \
    } > /opt/airc/shell ; \
    chmod +x /opt/airc/shell

SHELL [ "/opt/airc/shell" ]

RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu
# Install the ipex wheel built in the ipex-2.6.10 stage
COPY --from=ipex-2.6.10 /opt/ipex-2.6.10/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/intel_extension_for_pytorch-2.6.10*.whl; do pip install "${pkg}[xpu-2-6]"; done
# Install the ipex-llm wheel built in the ipex-llm-src stage
COPY --from=ipex-llm-src /opt/ipex-llm/python/llm/dist/*.whl /opt/wheels/
RUN for pkg in /opt/wheels/ipex_llm*.whl; do pip install "${pkg}"; done

COPY src/ /opt/airc/src/

# pydle does not work with newer asyncio because the coroutine
# decorator was removed. Patch it to work.
RUN pip3 install pydle transformers sentencepiece accelerate \
    && patch -d /opt/airc/venv/lib/python3*/site-packages/pydle \
    -p1 < /opt/airc/src/pydle.patch

# mistral fails with cache_position errors with transformers>4.40 (or at least it fails with the latest)
RUN pip install transformers==4.40

RUN pip3 install pydle transformers sentencepiece accelerate

# To get xe_linear and other Xe methods
RUN pip3 install 'bigdl-core-xe-all>=2.6.0b'

# trl.core doesn't have what is needed with the default 'pip install trl' version
RUN pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c

# Needed by src/model-server.py
RUN pip install flask

SHELL [ "/bin/bash", "-c" ]

RUN { \
    echo '#!/bin/bash' ; \
    echo 'set -e' ; \
    echo 'if [[ ! -e "/root/.cache/hub/token" ]]; then' ; \
    echo '  if [[ "${HF_ACCESS_TOKEN}" == "" ]]; then' ; \
    echo '    echo "Set your HF access token in .env as: HF_ACCESS_TOKEN=<token>" >&2' ; \
    echo '    exit 1' ; \
    echo '  else' ; \
    echo '    if [[ ! -d /root/.cache/hub ]]; then mkdir -p /root/.cache/hub; fi' ; \
    echo '    echo "${HF_ACCESS_TOKEN}" > /root/.cache/hub/token' ; \
    echo '  fi' ; \
    echo 'fi' ; \
    echo 'echo "Container: airc"' ; \
    echo 'echo "Setting pip environment to /opt/airc"' ; \
    echo 'source /opt/intel/oneapi/setvars.sh' ; \
    echo 'source /opt/airc/venv/bin/activate' ; \
    echo 'if [[ "${1}" == "shell" ]] || [[ "${1}" == "/bin/bash" ]]; then' ; \
    echo '  echo "Dropping to shell"' ; \
    echo '  /bin/bash -c "source /opt/airc/venv/bin/activate ; /bin/bash"' ; \
    echo '  exit $?' ; \
    echo 'else' ; \
    echo '  while true; do' ; \
    echo '    echo "Launching model-server"' ; \
    echo '    python src/model-server.py \' ; \
    echo '      2>&1 | tee -a "/root/.cache/model-server.log"' ; \
    echo '    echo "model-server died ($?). Restarting."' ; \
    echo '    sleep 5' ; \
    echo '  done &' ; \
    echo '  while true; do' ; \
    echo '    echo "Launching airc"' ; \
    echo '    python src/airc.py "${@}" \' ; \
    echo '      2>&1 | tee -a "/root/.cache/airc.log"' ; \
    echo '    echo "airc died ($?). Restarting."' ; \
    echo '    sleep 5' ; \
    echo '  done' ; \
    echo 'fi' ; \
    } > /entrypoint-airc.sh \
    && chmod +x /entrypoint-airc.sh

COPY --from=ze-monitor /opt/ze-monitor/build/ze-monitor-*deb /opt/
RUN dpkg -i /opt/ze-monitor-*deb

WORKDIR /opt/airc

ENTRYPOINT [ "/entrypoint-airc.sh" ]

24  LICENSE  Normal file
@@ -0,0 +1,24 @@
BSD 2-Clause License

Copyright (c) 2025, James Ketrenos

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

86  README.md  Normal file
@@ -0,0 +1,86 @@
# AIRC (pronounced Eric)

AI is Really Cool

NOTE: If running on an Intel Arc A-series graphics processor, fp64 is not supported; it may need to be emulated, or the model quantized.
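
If you hit fp64 errors on A-series hardware, Intel's compute runtime provides FP64 software emulation. A minimal sketch, assuming the `OverrideDefaultFP64Settings` and `IGC_EnableDPEmulation` toggles supported by recent Intel compute runtimes (check your runtime's documentation):

```bash
# Assumed env var names; enable FP64 emulation for a one-off container run
docker compose run --rm \
    -e OverrideDefaultFP64Settings=1 \
    -e IGC_EnableDPEmulation=1 \
    airc shell
```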

This project provides container definitions for PyTorch 2.6 with
Intel's ipex-llm. In addition, it provides a small local chat server and an
IRC client that together form a chat bot.

# Installation

This project builds with docker containers. As it was originally
written for an Intel Arc B580 (Battlemage), it requires a
kernel that supports that hardware, such as the one documented
at [Intel Graphics Preview](https://github.com/canonical/intel-graphics-preview), which runs in Ubuntu Oracular (24.10).
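
Once the driver stack is installed, the GPU should show up as a render node under /dev/dri, which docker-compose.yml maps into the containers. A quick sanity check (device numbers may differ on your system):

```bash
ls -l /dev/dri
# expect card0-style and renderD128-style entries for the Arc GPU
```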

NOTE: You need 'docker compose' installed. See [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/).

## Want to run under WSL2? No can do...

https://www.intel.com/content/www/us/en/support/articles/000093216/graphics/processor-graphics.html

The A- and B-series discrete GPUs do not support SR-IOV, which is required for
the GPU partitioning that Microsoft Windows uses to support GPU acceleration in WSL.

## Building

```bash
git clone https://github.com/jketreno/airc
cd airc
docker compose build
```

## Running

In order to download the models, you need a Hugging Face
token. See https://huggingface.co/settings/tokens for information
on obtaining one.

Edit .env to add the following:

```.env
HF_ACCESS_TOKEN=<access token from huggingface>
```

NOTE: Models downloaded by most examples will be placed in the
./cache directory, which is bind mounted into the container.

### AIRC

To launch the airc shell interactively, with the PyTorch 2.6
environment loaded, use the default entrypoint to launch a shell:

```bash
docker compose run --rm airc shell
```

Once in the shell, you can launch model-server.py and then
the airc.py client:

```bash
docker compose run --rm airc shell
src/airc.py --ai-server=http://localhost:5000 &
src/model-server.py
```

By default, src/airc.py will connect to irc.libera.chat on the #airc-test
channel. See `python src/airc.py --help` for options.
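
For example, to point the bot at a different network and channel (these are the flags defined in src/airc.py):

```bash
python src/airc.py \
    --server irc.libera.chat \
    --port 6667 \
    --nickname airc \
    --channel "#airc-test" \
    --ai-server http://localhost:5000 \
    --level DEBUG
```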

By separating the model-server into its own process, you can develop
and tweak the chat backend without losing the IRC connection established
by airc.
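
The model-server exposes an OpenAI-compatible endpoint on port 5000, so with model-server.py running you can also exercise the backend directly. A sketch using the request fields src/model-server.py reads:

```bash
curl -s http://localhost:5000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "my-model",
          "messages": [{"role": "user", "content": "Say hello in 5 words."}],
          "temperature": 0.7,
          "max_tokens": 100
        }'
```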

### Jupyter

```bash
docker compose up jupyter -d
```

The default port for inbound connections is 8888 (see docker-compose.yml).
$(pwd)/jupyter is bind mounted to /opt/jupyter in the container, which is where notebooks are saved by default.

To access the Jupyter notebook, go to `http://localhost:8888/jupyter`.
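
The notebook runs in a restart loop and logs to /root/.cache/jupyter.log inside the container; since ./cache is bind mounted to /root/.cache, you can follow it from the host:

```bash
tail -f cache/jupyter.log
```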

0  cache/.keep  vendored  Normal file

31  docker-compose.yml  Normal file
@@ -0,0 +1,31 @@
services:
  airc:
    build:
      context: .
      dockerfile: Dockerfile
      target: airc
    image: airc
    restart: "no"
    env_file:
      - .env
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - ./cache:/root/.cache
      - ./src:/opt/airc/src:rw

  jupyter:
    build:
      context: .
      dockerfile: Dockerfile
      target: jupyter
    image: jupyter
    env_file:
      - .env
    devices:
      - /dev/dri:/dev/dri
    ports:
      - 8888:8888 # Jupyter Notebook
    volumes:
      - ./jupyter:/opt/jupyter:rw
      - ./cache:/root/.cache

187  src/airc.py  Normal file
@@ -0,0 +1,187 @@
import asyncio
import aiohttp
import argparse
import pydle
import logging
import os
import re
import time
import datetime
import json
from typing import Dict, Any

def parse_args():
    parser = argparse.ArgumentParser(description="AI is Really Cool")
    parser.add_argument("--server", type=str, default="irc.libera.chat", help="IRC server address")
    parser.add_argument("--port", type=int, default=6667, help="IRC server port")
    parser.add_argument("--nickname", type=str, default="airc", help="Bot nickname")
    parser.add_argument("--channel", type=str, default="#airc-test", help="Channel to join")
    parser.add_argument("--ai-server", type=str, default="http://localhost:5000", help="OpenAI API endpoint")
    parser.add_argument('--level', type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        default='INFO', help='Set the logging level.')
    return parser.parse_args()

class AsyncOpenAIClient:
    def __init__(self, base_url: str = "http://localhost:5000"):
        logging.info(f"Using {base_url} as server")
        self.base_url = base_url
        self.session = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    async def chat_completion(self,
                              messages: list,
                              model: str = "my-model",
                              temperature: float = 0.7,
                              max_tokens: int = 100) -> Dict[str, Any]:
        """
        Make an async chat completion request
        """
        url = f"{self.base_url}/v1/chat/completions"

        # Prepare the request payload
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        try:
            async with self.session.post(url, json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"Request failed with status {response.status}: {error_text}")

                return await response.json()

        except Exception as e:
            print(f"Error during request: {str(e)}")
            return {"error": str(e)}

def setup_logging(level):
    numeric_level = getattr(logging, level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level}")

    logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info(f"Logging is set to {level} level.")

class AIRC(pydle.Client):
    def __init__(self, nick, channel, client, burst_limit = 5, rate_limit = 1.0, burst_reset_timeout = 10.0):
        super().__init__(nick)
        self.nick = nick
        self.channel = channel
        self.burst_limit = burst_limit
        self.rate_limit = rate_limit
        self.burst_reset_timeout = burst_reset_timeout
        self.sent_burst = 0  # Track messages sent in the current burst
        self.last_message_time = None  # Track last message time
        self.queries = 0  # Queries handled, reported by the "stats" command
        self.processing = 0.0  # Cumulative response time in seconds
        self.system_input = "You are a critical assistant. Give concise and accurate answers in less than 120 characters."
        self._message_queue = asyncio.Queue()
        self._task = asyncio.create_task(self._send_from_queue())
        self.client = client

    async def _send_from_queue(self):
        """Background task that sends queued messages with burst + rate limiting."""
        while True:
            target, message = await self._message_queue.get()

            # If burst is still available, send immediately
            if self.sent_burst < self.burst_limit:
                self.sent_burst += 1
            else:
                await asyncio.sleep(self.rate_limit)  # Apply rate limit

            await super().message(target, message)  # Send message
            self.last_message_time = asyncio.get_event_loop().time()  # Update last message timestamp

            # Start burst reset countdown after each message
            asyncio.create_task(self._reset_burst_after_inactivity())

    async def _reset_burst_after_inactivity(self):
        """Resets burst counter only if no new messages are sent within timeout."""
        last_time = self.last_message_time
        await asyncio.sleep(self.burst_reset_timeout)  # Wait for inactivity period

        # Only reset if no new messages were sent during the wait
        if self.last_message_time == last_time:
            self.sent_burst = 0
            logging.info("Burst limit reset due to inactivity.")

    async def message(self, target, message):
        """Splits a multi-line message and sends each line separately."""
        for line in message.splitlines():  # Splits on both '\n' and '\r\n'
            if line.strip():  # Ignore empty lines
                await self._message_queue.put((target, line))

    async def on_connect(self):
        logging.debug('on_connect')
        await self.join(self.channel)

    def remove_substring(self, string, substring):
        return string.replace(substring, "")

    def extract_nick_message(self, input_string):
        # Pattern with capturing groups for nick and message
        pattern = r"^\s*([^\s:]+?)\s*:\s*(.+?)$"

        match = re.match(pattern, input_string)
        if match:
            nick = match.group(1)  # First capturing group
            message = match.group(2)  # Second capturing group
            return nick, message
        return None, None  # Return None for both if no match

    async def on_message(self, target, source, message):
        if source == self.nick:
            return
        nick, body = self.extract_nick_message(message)
        if nick == self.nick:
            content = None
            if body == "stats":
                content = f"{self.queries} queries handled in {self.processing}s"
            else:
                start = time.time()
                # Sample messages
                messages = [
                    {"role": "system", "content": self.system_input},
                    {"role": "user", "content": body}
                ]

                # Make the request
                response = await self.client.chat_completion(messages)

                # Extract and print just the assistant's message if available
                if "choices" in response and len(response["choices"]) > 0:
                    content = response["choices"][0]["message"]["content"]
                    print(f"\nAssistant: {content}")

                # Track totals for the "stats" command
                self.queries += 1
                self.processing += round(time.time() - start, 2)

            if content:
                logging.info(f'Sending: {content}')
                await self.message(target, f"{content}")

def remove_substring(string, substring):
    return string.replace(substring, "")

async def main():
    # Parse command-line arguments
    args = parse_args()

    # Setup logging based on the provided level
    setup_logging(args.level)

    async with AsyncOpenAIClient(base_url=args.ai_server) as client:
        bot = AIRC(args.nickname, args.channel, client)
        await bot.connect(args.server, args.port, tls=False)
        await bot.handle_forever()

if __name__ == "__main__":
    asyncio.run(main())

171  src/model-server.py  Normal file
@@ -0,0 +1,171 @@
from flask import Flask, request, jsonify
import json
import asyncio
import argparse
import pydle
import torch
import logging
from ipex_llm.transformers import AutoModelForCausalLM
import transformers
import os
import re
import time
import datetime
import aiohttp
from typing import Dict, Any

def parse_args():
    parser = argparse.ArgumentParser(description="AI is Really Cool Server")
    parser.add_argument("--device", type=int, default=0, help="Device # to use for inference. See --device-list")
    #parser.add_argument("--device-list", help="List available devices")
    parser.add_argument('--level', type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        default='INFO', help='Set the logging level.')
    return parser.parse_args()

def setup_logging(level):
    numeric_level = getattr(logging, level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level}")

    logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info(f"Logging is set to {level} level.")

class Chat():
    def __init__(self, device_name):
        super().__init__()
        self.device_name = device_name
        self.system_input = "You are a critical assistant. Give concise and accurate answers in less than 120 characters."
        self.context = None
        self.model_path = 'Intel/neural-chat-7b-v3-3'
        try:
            logging.info(f"Loading tokenizer from: {self.model_path}")
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad_token to eos_token if needed

            self.model = AutoModelForCausalLM.from_pretrained(self.model_path,
                                                              load_in_4bit=True,
                                                              optimize_model=True,
                                                              trust_remote_code=True,
                                                              use_cache=True)
            self.model = self.model.half().to(device_name)
        except Exception as e:
            logging.error(f"Loading error: {e}")

    def remove_substring(self, string, substring):
        return string.replace(substring, "")

    def generate_response(self, text):
        prompt = f"### System:\n{self.system_input}\n### User:\n{text}\n### Assistant:\n"
        start = time.time()

        with torch.autocast(self.device_name, dtype=torch.float16):
            inputs = self.tokenizer.encode_plus(
                prompt,
                add_special_tokens=False,
                return_tensors="pt",
                max_length=1000,  # Prevent 'Asking to truncate to max_length...'
                padding=True,  # Handles padding automatically
                truncation=True
            )
            input_ids = inputs["input_ids"].to(self.device_name)
            attention_mask = inputs["attention_mask"].to(self.device_name)

            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=1000,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id
            )

        final_outputs = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        final_outputs = self.remove_substring(final_outputs, prompt).strip()

        end = time.time()

        return final_outputs, datetime.timedelta(seconds=end - start)

app = Flask(__name__)

# Basic endpoint for chat completions
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    logging.info('/v1/chat/completions')
    try:
        # Get the JSON data from the request
        data = request.get_json()

        # Extract relevant fields from the request
        model = data.get('model', 'default-model')
        messages = data.get('messages', [])
        temperature = data.get('temperature', 1.0)
        max_tokens = data.get('max_tokens', 2048)

        chat = app.config['chat']
        logging.info(f"Query: {messages}")
        response_content, _ = chat.generate_response(messages[-1]['content'])
        logging.info(f"Response: {response_content}")
        # Format response in OpenAI-compatible structure
        response = {
            "id": "chatcmpl-" + str(id(data)),  # Simple unique ID
            "object": "chat.completion",
            "created": int(time.time()),
            "model": chat.model_path,
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_content
                },
                "finish_reason": "stop"
            }],
            # "usage": {
            #     "prompt_tokens": len(str(messages).split()),
            #     "completion_tokens": len(response_content.split()),
            #     "total_tokens": len(str(messages).split()) + len(response_content.split())
            # }
        }

        return jsonify(response)

    except Exception as e:
        logging.error(e)
        return jsonify({
            "error": {
                "message": str(e),
                "type": "invalid_request_error"
            }
        }), 400

# Health check endpoint
@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy"}), 200

if __name__ == '__main__':
    # Parse command-line arguments
    args = parse_args()

    # Setup logging based on the provided level
    setup_logging(args.level)

    if not torch.xpu.is_available():
        logging.error("No XPU available.")
        exit(1)
    device_count = torch.xpu.device_count()
    for i in range(device_count):
        logging.info(f"Device {i}: {torch.xpu.get_device_name(i)} Total memory: {torch.xpu.get_device_properties(i).total_memory}")
    device_name = 'xpu'
    device = torch.device(device_name)
    print(f"Using device: {device}")

    # Set environment variables that might help with XPU stability
    os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"

    app.config['chat'] = Chat(device_name)

    app.run(host='0.0.0.0', port=5000, debug=True)

56  src/pydle.patch  Normal file
@@ -0,0 +1,56 @@
diff --git a/__init__.py b/__init__.py
index 2ead20d..892471b 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,11 +1,21 @@
 # noinspection PyUnresolvedReferences
-from asyncio import coroutine, Future
+from asyncio import Future
 from functools import cmp_to_key
 from . import connection, protocol, client, features
 from .client import Error, NotInChannel, AlreadyInChannel, BasicClient, ClientPool
 from .features.ircv3.cap import NEGOTIATING as CAPABILITY_NEGOTIATING, FAILED as CAPABILITY_FAILED, \
     NEGOTIATED as CAPABILITY_NEGOTIATED

+import asyncio
+# And use asyncio.coroutine where it was used, although it's better to switch to async def
+# However, since 'coroutine' decorator is removed, you would actually need to:
+from functools import wraps
+
+def coroutine(func):
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        return func(*args, **kwargs)
+    return wrapper

 __name__ = 'pydle'
 __version__ = '0.9.4rc1'
diff --git a/connection.py b/connection.py
index c9a9e8e..5445b0e 100644
--- a/connection.py
+++ b/connection.py
@@ -37,6 +37,7 @@ class Connection:
         self.reader = None
         self.writer = None
         self.eventloop = eventloop or asyncio.new_event_loop()
+        self.lock = asyncio.Lock()

     async def connect(self):
         """ Connect to target. """
@@ -49,8 +50,7 @@ class Connection:
             host=self.hostname,
             port=self.port,
             local_addr=self.source_address,
-            ssl=self.tls_context,
-            loop=self.eventloop
+            ssl=self.tls_context
         )

     def create_tls_context(self):
@@ -112,4 +112,5 @@ class Connection:
         await self.writer.drain()

     async def recv(self, *, timeout=None):
-        return await asyncio.wait_for(self.reader.readline(), timeout=timeout)
+        async with self.lock:
+            return await asyncio.wait_for(self.reader.readline(), timeout=timeout)