From 59cf29ef24b6efa2cee98d937ed97bc61b6be807 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Tue, 29 Jul 2025 16:55:47 -0700
Subject: [PATCH] Initial commit

---
 .gitignore         |   4 ++
 Dockerfile         | 127 +++++++++++++++++++++++++++++++++++++++++++++
 Dockerfile.xpu     |  94 +++++++++++++++++++++++++++++++++
 cache/.keep        |   0
 db/.keep           |   0
 docker-compose.yml | 127 +++++++++++++++++++++++++++++++++++++++++++++
 ollama/.keep       |   0
 7 files changed, 352 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 Dockerfile.xpu
 create mode 100755 cache/.keep
 create mode 100644 db/.keep
 create mode 100644 docker-compose.yml
 create mode 100644 ollama/.keep

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fca52c8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+db/**
+cache/**
+ollama/**
+.env
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e2c0e43
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,127 @@
+FROM ubuntu:oracular AS ollama
+
+# Get a couple prerequisites
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    gpg \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+# Install Intel graphics runtimes
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
+    && add-apt-repository -y ppa:kobuk-team/intel-graphics \
+    && apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    libze-intel-gpu1 \
+    libze1 \
+    intel-ocloc \
+    intel-opencl-icd \
+    xpu-smi \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+WORKDIR /opt/ollama
+
+# Download the nightly ollama release from ipex-llm
+
+# NOTE: NO longer at github.com/intel -- now at ipex-llm
+
+# This version does not work:
+ENV OLLAMA_VERSION=https://github.com/ipex-llm/ipex-llm/releases/download/v2.2.0/ollama-ipex-llm-2.2.0-ubuntu.tgz
+
+
+# Does not work -- crashes
+# ENV OLLAMA_VERSION=https://github.com/ipex-llm/ipex-llm/releases/download/v2.3.0-nightly/ollama-ipex-llm-2.3.0b20250612-ubuntu.tgz
+
+RUN wget -qO - ${OLLAMA_VERSION} | \
+    tar --strip-components=1 -C . -xzv
+
+# Install Python from Oracular (ollama works with 3.12)
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    gpg \
+    python3 \
+    python3-pip \
+    python3-venv \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+# Setup the ollama python virtual environment
+RUN python3 -m venv --system-site-packages /opt/ollama/venv
+
+# Setup the docker pip shell
+RUN { \
+    echo '#!/bin/bash' ; \
+    echo 'source /opt/ollama/venv/bin/activate' ; \
+    echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash -i; fi' ; \
+    } > /opt/ollama/shell ; \
+    chmod +x /opt/ollama/shell
+
+# Activate the pip environment on all shell calls
+SHELL [ "/opt/ollama/shell" ]
+
+# Install ollama python module
+RUN pip install ollama langchain-ollama
+
+SHELL [ "/bin/bash", "-c" ]
+
+RUN { \
+    echo '#!/bin/bash'; \
+    echo 'echo "Container: ollama"'; \
+    echo 'set -e'; \
+    echo 'echo "Setting pip environment to /opt/ollama"'; \
+    echo 'source /opt/ollama/venv/bin/activate'; \
+    echo 'export OLLAMA_NUM_GPU=999'; \
+    echo 'export ZES_ENABLE_SYSMAN=1'; \
+    echo 'export SYCL_CACHE_PERSISTENT=1'; \
+    echo 'export OLLAMA_KEEP_ALIVE=-1'; \
+    echo 'export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1'; \
+    echo ''; \
+    echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/ollama/)?shell$ ]]; then'; \
+    echo '  echo "Dropping to shell"'; \
+    echo '  shift'; \
+    echo '  if [[ "${1}" != "" ]]; then cmd="/opt/ollama/shell ${@}"; echo "Running: ${cmd}"; exec ${cmd}; else /opt/ollama/shell; fi'; \
+    echo 'else'; \
+    echo '  echo "Launching Ollama server..."'; \
+    echo '  exec ollama serve'; \
+    echo 'fi'; \
+    } > /entrypoint.sh \
+    && chmod +x /entrypoint.sh
+
+RUN { \
+    echo '#!/bin/bash'; \
+    echo 'echo "Container: ollama"'; \
+    echo 'set -e'; \
+    echo 'echo "Setting pip environment to /opt/ollama"'; \
+    echo 'source /opt/ollama/venv/bin/activate'; \
+    echo 'ollama pull qwen2.5:7b' ; \
+    echo 'ollama pull llama3.2' ; \
+    echo 'ollama pull mxbai-embed-large' ; \
+    echo 'ollama pull deepseek-r1:7b' ; \
+    echo 'ollama pull mistral:7b' ; \
+    } > /fetch-models.sh \
+    && chmod +x /fetch-models.sh
+
+ENV PYTHONUNBUFFERED=1
+
+# Enable ext_intel_free_memory
+ENV ZES_ENABLE_SYSMAN=1
+
+# Use all GPUs
+ENV OLLAMA_NUM_GPU=999
+
+# Use immediate command lists
+ENV SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+
+# Use persistent cache
+ENV SYCL_CACHE_PERSISTENT=1
+
+VOLUME [ "/root/.ollama" ]
+
+ENV PATH=/opt/ollama:${PATH}
+
+ENTRYPOINT [ "/entrypoint.sh" ]
+
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
new file mode 100644
index 0000000..7e1ecc6
--- /dev/null
+++ b/Dockerfile.xpu
@@ -0,0 +1,94 @@
+FROM intel/deep-learning-essentials:2025.2.0-0-devel-ubuntu24.04 AS vllm-base
+
+RUN rm /etc/apt/sources.list.d/intel-graphics.list
+
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    curl \
+    ffmpeg \
+    git \
+    gpg \
+    libsndfile1 \
+    libsm6 \
+    libxext6 \
+    libgl1 \
+    lsb-release \
+    numactl \
+    python3 \
+    python3-dev \
+    python3-pip \
+    python3-venv \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+WORKDIR /workspace
+RUN git clone https://github.com/vllm-project/vllm.git
+WORKDIR /workspace/vllm
+
+# Setup the docker pip shell
+RUN { \
+    echo '#!/bin/bash' ; \
+    echo 'source /workspace/venv/bin/activate' ; \
+    echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash -i; fi' ; \
+    } > /workspace/shell ; \
+    chmod +x /workspace/shell
+
+SHELL [ "/workspace/shell" ]
+
+RUN python3 -m venv --system-site-packages /workspace/venv
+
+#COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
+#COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+
+#RUN --mount=type=cache,target=/root/.cache/pip \
+RUN pip install --no-cache-dir \
+    -r requirements/xpu.txt
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
+#ARG GIT_REPO_CHECK=0
+#RUN --mount=type=bind,source=.git,target=.git \
+RUN bash tools/check_repo.sh
+
+ENV VLLM_TARGET_DEVICE=xpu
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+# --mount=type=bind,source=.git,target=.git \
+#RUN --mount=type=cache,target=/root/.cache/pip \
+RUN python3 setup.py install
+
+CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+SHELL [ "/workspace/shell" ]
+
+# install additional dependencies for openai api server
+#RUN --mount=type=cache,target=/root/.cache/pip \
+RUN pip install accelerate hf_transfer pytest 'modelscope!=1.15.0'
+
+
+ENV VLLM_USAGE_SOURCE=production-docker-image
+ENV TRITON_XPU_PROFILE=1
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+SHELL [ "/bin/bash", "-c" ]
+
+# RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
+#     gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
+# RUN echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | \
+#     tee /etc/apt/sources.list.d/intel-gpu-jammy.list
+
+# RUN apt-get update \
+#     && apt-get install -y \
+#     libze-intel-gpu1 \
+#     libze1 \
+#     intel-opencl-icd \
+#     clinfo \
+#     && apt-get clean \
+#     && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
+
+#ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
diff --git a/cache/.keep b/cache/.keep
new file mode 100755
index 0000000..e69de29
diff --git a/db/.keep b/db/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8222a5e
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,127 @@
+services:
+  # This doesn't work...
+  # ollama-intel:
+  #   image: intelanalytics/ipex-llm-inference-cpp-xpu:latest
+  #   container_name: ollama-intel
+  #   restart: unless-stopped
+  #   env_file:
+  #     - .env
+  #   devices:
+  #     - /dev/dri:/dev/dri
+  #   volumes:
+  #     - ./cache:/root/.cache   # Cache hub models and neo_compiler_cache
+  #     - ./ollama:/root/.ollama # Cache the ollama models
+  #   ports:
+  #     - 11434:11434
+  #   environment:
+  #     - OLLAMA_HOST=0.0.0.0
+  #     - DEVICE=Arc
+  #     - OLLAMA_INTEL_GPU=true
+  #     - OLLAMA_NUM_GPU=999
+  #     - ZES_ENABLE_SYSMAN=1
+  #     - ONEAPI_DEVICE_SELECTOR=level_zero:0
+  #     - TZ=America/Los_Angeles
+  #   command: sh -c 'mkdir -p /llm/ollama && cd /llm/ollama && init-ollama && exec ./ollama serve'
+
+  ollama:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: ollama
+    container_name: ollama
+    restart: "always"
+    env_file:
+      - .env
+    environment:
+      - OLLAMA_HOST=0.0.0.0
+      - ONEAPI_DEVICE_SELECTOR=level_zero:0
+    devices:
+      - /dev/dri:/dev/dri
+    ports:
+      - 11434:11434 # ollama serve port
+    networks:
+      - internal
+    volumes:
+      - ./cache:/root/.cache   # Cache hub models and neo_compiler_cache
+      - ./ollama:/root/.ollama # Cache the ollama models
+    cap_add: # used for running ze-monitor within container
+      - CAP_DAC_READ_SEARCH # Bypass all filesystem read access checks
+      - CAP_PERFMON         # Access to perf_events (vs. overloaded CAP_SYS_ADMIN)
+      - CAP_SYS_PTRACE      # PTRACE_MODE_READ_REALCREDS ptrace access mode check
+
+  # ollama-ov-server:
+  #   build:
+  #     context: .
+  #     dockerfile: Dockerfile
+  #     target: ollama-ov-server
+  #   container_name: ollama-ov-server
+  #   restart: "no"
+  #   env_file:
+  #     - .env
+  #   environment:
+  #     - OLLAMA_HOST=0.0.0.0
+  #     - ONEAPI_DEVICE_SELECTOR=level_zero:0
+  #   devices:
+  #     - /dev/dri:/dev/dri
+  #   ports:
+  #     - 11435:11434 # ollama serve port
+  #   networks:
+  #     - internal
+  #   volumes:
+  #     - ./cache:/root/.cache   # Cache hub models and neo_compiler_cache
+  #     - ./ollama:/root/.ollama # Cache the ollama models
+
+  vllm:
+    build:
+      context: .
+      dockerfile: Dockerfile.xpu
+      target: vllm-openai
+    container_name: vllm-openai
+    restart: "no"
+    shm_size: 10.24gb
+    env_file:
+      - .env
+    environment:
+      - OLLAMA_HOST=0.0.0.0
+      # - ONEAPI_DEVICE_SELECTOR=level_zero:0
+      - ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+      - ZE_AFFINITY_MASK=0.0
+      - CCL_LOG_LEVEL=INFO
+    devices:
+      - /dev:/dev
+    # group_add:
+    #   - render
+    #   - video
+    ports:
+      - 11438:8000 # vLLM OpenAI-compatible API port
+    networks:
+      - internal
+    volumes:
+      - ./cache:/root/.cache   # Cache hub models and neo_compiler_cache
+      - ./ollama:/root/.ollama # Cache the ollama models
+      - /sys:/sys              # Required so oneAPI can read PCI paths for Battlemage
+    privileged: true
+    cap_add: # used for running ze-monitor within container
+      - CAP_DAC_READ_SEARCH # Bypass all filesystem read access checks
+      - CAP_PERFMON         # Access to perf_events (vs. overloaded CAP_SYS_ADMIN)
+      - CAP_SYS_PTRACE      # PTRACE_MODE_READ_REALCREDS ptrace access mode check
+
+
+  phoenix:
+    image: arizephoenix/phoenix:latest
+    container_name: phoenix
+    restart: "always"
+    env_file:
+      - .env
+    volumes:
+      - ./db:/opt/phoenix/data
+    ports:
+      - 6006:6006 # Phoenix UI port
+
+networks:
+  internal:
+    driver: bridge
+
+volumes:
+  redis_data:
+    driver: local
diff --git a/ollama/.keep b/ollama/.keep
new file mode 100644
index 0000000..e69de29
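
Usage note (not part of the patch): a minimal sketch of how this stack might be driven once the patch is applied, assuming a `.env` file has been created next to docker-compose.yml (it is ignored by .gitignore but referenced by every service's `env_file`). The service name `ollama`, the `/fetch-models.sh` helper, and the published ports come from the files above; exact invocations may differ for your setup.

    docker compose build ollama                    # build the IPEX-LLM ollama image
    docker compose up -d ollama phoenix            # start ollama (port 11434) and the Phoenix UI (port 6006)
    docker compose exec ollama /fetch-models.sh    # pull the models listed in the Dockerfile
    curl http://localhost:11434/api/tags           # confirm the ollama API is answering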