services:
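  # Overview of the stack (Intel GPU inference via oneAPI/Level Zero):
  #   ollama               -> host port 11434 (Ollama API)
  #   ollama-context-proxy -> host port 11436 (fronts ollama:11434)
  #   vllm                 -> host port 11438 (OpenAI-compatible API, container port 8000)
  #   phoenix              -> host port 6006  (Phoenix UI for tracing)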
  # Note: this prebuilt IPEX-LLM image doesn't work on this setup; kept
  # commented out for reference.
  # ollama-intel:
  #   image: intelanalytics/ipex-llm-inference-cpp-xpu:latest
  #   container_name: ollama-intel
  #   restart: unless-stopped
  #   env_file:
  #     - .env
  #   devices:
  #     - /dev/dri:/dev/dri
  #   volumes:
  #     - ./cache:/root/.cache # Cache hub models and neo_compiler_cache
  #     - ./ollama:/root/.ollama # Cache the ollama models
  #   ports:
  #     - 11434:11434
  #   environment:
  #     - OLLAMA_HOST=0.0.0.0
  #     - DEVICE=Arc
  #     - OLLAMA_INTEL_GPU=true
  #     - OLLAMA_NUM_GPU=999
  #     - ZES_ENABLE_SYSMAN=1
  #     - ONEAPI_DEVICE_SELECTOR=level_zero:0
  #     - TZ=America/Los_Angeles
  #   command: sh -c 'mkdir -p /llm/ollama && cd /llm/ollama && init-ollama && exec ./ollama serve'

  ollama:
    build:
      context: .
      dockerfile: Dockerfile
      target: ollama
    container_name: ollama
    restart: "always"
    env_file:
      - .env
    environment:
      - OLLAMA_HOST=0.0.0.0
      - ONEAPI_DEVICE_SELECTOR=level_zero:0
    devices:
      - /dev/dri:/dev/dri
    ports:
      - 11434:11434 # ollama serve port
    networks:
      - internal
    volumes:
      - ./cache:/root/.cache # Cache hub models and neo_compiler_cache
      - ./ollama:/root/.ollama # Cache the ollama models
    cap_add: # used for running ze-monitor within container
      - CAP_DAC_READ_SEARCH # Bypass all filesystem read access checks
      - CAP_PERFMON # Access to perf_events (vs. overloaded CAP_SYS_ADMIN)
      - CAP_SYS_PTRACE # PTRACE_MODE_READ_REALCREDS ptrace access mode check
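
  # A minimal smoke test for the ollama service, assuming only that the host
  # can reach the published port (no particular model is assumed):
  #   docker compose up -d ollama
  #   curl http://localhost:11434/api/tags   # lists models this instance has pulled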

  # ollama-ov-server:
  #   build:
  #     context: .
  #     dockerfile: Dockerfile
  #     target: ollama-ov-server
  #   container_name: ollama-ov-server
  #   restart: "no"
  #   env_file:
  #     - .env
  #   environment:
  #     - OLLAMA_HOST=0.0.0.0
  #     - ONEAPI_DEVICE_SELECTOR=level_zero:0
  #   devices:
  #     - /dev/dri:/dev/dri
  #   ports:
  #     - 11435:11434 # ollama serve port
  #   networks:
  #     - internal
  #   volumes:
  #     - ./cache:/root/.cache # Cache hub models and neo_compiler_cache
  #     - ./ollama:/root/.ollama # Cache the ollama models

  ollama-context-proxy:
    build:
      context: ./ollama-context-proxy
      dockerfile: Dockerfile
    container_name: ollama-context-proxy
    restart: "always"
    env_file:
      - .env
    environment:
      - OLLAMA_HOST=http://ollama:11434
    ports:
      - 11436:11434 # ollama-context-proxy port
    networks:
      - internal
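
  # The proxy publishes host port 11436 and forwards to ollama:11434 (per
  # OLLAMA_HOST above), so the same Ollama API calls should work through it:
  #   curl http://localhost:11436/api/tags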

  vllm:
    build:
      context: .
      dockerfile: Dockerfile.xpu
      target: vllm-openai
    container_name: vllm-openai
    restart: "no"
    shm_size: 10.24gb
    env_file:
      - .env
    environment:
      - OLLAMA_HOST=0.0.0.0
      # - ONEAPI_DEVICE_SELECTOR=level_zero:0
      - ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
      - ZE_AFFINITY_MASK=0.0
      - CCL_LOG_LEVEL=INFO
    devices:
      - /dev:/dev
    # group_add:
    #   - render
    #   - video
    ports:
      - 11438:8000 # vLLM OpenAI-compatible API port (container port 8000)
    networks:
      - internal
    volumes:
      - ./cache:/root/.cache # Cache hub models and neo_compiler_cache
      - ./ollama:/root/.ollama # Cache the ollama models
      - /sys:/sys # Required so oneAPI can read PCI paths for Battlemage
    privileged: true
    cap_add: # used for running ze-monitor within container
      - CAP_DAC_READ_SEARCH # Bypass all filesystem read access checks
      - CAP_PERFMON # Access to perf_events (vs. overloaded CAP_SYS_ADMIN)
      - CAP_SYS_PTRACE # PTRACE_MODE_READ_REALCREDS ptrace access mode check
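
  # vLLM exposes an OpenAI-compatible API on container port 8000, published
  # here as 11438. A minimal check, plus a sketch of a completion request (the
  # model name is a placeholder for whatever this server was launched with):
  #   curl http://localhost:11438/v1/models
  #   curl http://localhost:11438/v1/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "<served-model-name>", "prompt": "Hello", "max_tokens": 16}'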

  phoenix:
    image: arizephoenix/phoenix:latest
    container_name: phoenix
    restart: "always"
    env_file:
      - .env
    volumes:
      - ./db:/opt/phoenix/data
    ports:
      - 6006:6006 # Phoenix UI port
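  # Phoenix persists its data under ./db and, once the container is up, serves
  # its UI at http://localhost:6006.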

networks:
  internal:
    driver: bridge

volumes:
  redis_data: # Note: declared but not referenced by any service in this file
    driver: local