Added auto-context proxy
This commit is contained in:
parent
59cf29ef24
commit
8119cd8492
@ -71,6 +71,21 @@ services:
|
|||||||
# - ./cache:/root/.cache # Cache hub models and neo_compiler_cache
|
# - ./cache:/root/.cache # Cache hub models and neo_compiler_cache
|
||||||
# - ./ollama:/root/.ollama # Cache the ollama models
|
# - ./ollama:/root/.ollama # Cache the ollama models
|
||||||
|
|
||||||
|
ollama-context-proxy:
|
||||||
|
build:
|
||||||
|
context: ./ollama-context-proxy
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
container_name: ollama-context-proxy
|
||||||
|
restart: "always"
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
environment:
|
||||||
|
- OLLAMA_HOST=http://ollama:11434
|
||||||
|
ports:
|
||||||
|
- 11436:11434 # ollama-context-proxy port
|
||||||
|
networks:
|
||||||
|
- internal
|
||||||
|
|
||||||
vllm:
|
vllm:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
|
61
ollama-context-proxy/Dockerfile
Normal file
61
ollama-context-proxy/Dockerfile
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
FROM ubuntu:noble AS ollama-context-proxy
|
||||||
|
|
||||||
|
RUN apt-get update -y && \
|
||||||
|
apt-get install -y --no-install-recommends --fix-missing \
|
||||||
|
python3 \
|
||||||
|
python3-dev \
|
||||||
|
python3-pip \
|
||||||
|
python3-venv \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log}
|
||||||
|
|
||||||
|
WORKDIR /opt/ollama-context-proxy
|
||||||
|
|
||||||
|
# Set default Ollama base URL
|
||||||
|
ENV OLLAMA_BASE_URL=http://ollama:11434
|
||||||
|
|
||||||
|
# Setup the docker pip shell
|
||||||
|
RUN { \
|
||||||
|
echo '#!/bin/bash' ; \
|
||||||
|
echo 'source /opt/ollama-context-proxy/venv/bin/activate' ; \
|
||||||
|
echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash -i; fi' ; \
|
||||||
|
} > /opt/ollama-context-proxy/shell ; \
|
||||||
|
chmod +x /opt/ollama-context-proxy/shell
|
||||||
|
|
||||||
|
SHELL [ "/opt/ollama-context-proxy/shell" ]
|
||||||
|
|
||||||
|
RUN python3 -m venv --system-site-packages /opt/ollama-context-proxy/venv
|
||||||
|
|
||||||
|
COPY /requirements.txt /opt/ollama-context-proxy/
|
||||||
|
COPY /ollama-context-proxy.py /opt/ollama-context-proxy/ollama-context-proxy.py
|
||||||
|
|
||||||
|
RUN pip install -r requirements.txt
|
||||||
|
|
||||||
|
SHELL [ "/bin/bash", "-c" ]
|
||||||
|
|
||||||
|
RUN { \
|
||||||
|
echo '#!/bin/bash'; \
|
||||||
|
echo 'echo "Container: ollama-context-proxy"'; \
|
||||||
|
echo 'set -e'; \
|
||||||
|
echo 'echo "Setting pip environment to /opt/ollama-context-proxy"'; \
|
||||||
|
echo 'source /opt/ollama-context-proxy/venv/bin/activate'; \
|
||||||
|
echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/ollama-context-proxy/)?shell$ ]]; then'; \
|
||||||
|
echo ' echo "Dropping to shell"'; \
|
||||||
|
echo ' shift'; \
|
||||||
|
echo ' if [[ "${1}" != "" ]]; then cmd="/opt/ollama-context-proxy/shell ${@}"; echo "Running: ${cmd}"; exec ${cmd}; else /opt/ollama-context-proxy/shell; fi'; \
|
||||||
|
echo 'else'; \
|
||||||
|
echo ' while true; do'; \
|
||||||
|
echo ' echo "Launching Ollama context proxy server..."'; \
|
||||||
|
echo ' exec python3 /opt/ollama-context-proxy/ollama-context-proxy.py'; \
|
||||||
|
echo ' if [[ $? -ne 0 ]]; then'; \
|
||||||
|
echo ' echo "Ollama context proxy server crashed, restarting in 3 seconds..."'; \
|
||||||
|
echo ' sleep 3'; \
|
||||||
|
echo ' fi'; \
|
||||||
|
echo ' done' ; \
|
||||||
|
echo 'fi'; \
|
||||||
|
} > /entrypoint.sh \
|
||||||
|
&& chmod +x /entrypoint.sh
|
||||||
|
|
||||||
|
ENV PATH=/opt/ollama-context-proxy:$PATH
|
||||||
|
|
||||||
|
ENTRYPOINT ["/entrypoint.sh"]
|
326
ollama-context-proxy/README.md
Normal file
326
ollama-context-proxy/README.md
Normal file
@ -0,0 +1,326 @@
|
|||||||
|
# Ollama Context Proxy
|
||||||
|
|
||||||
|
A smart proxy server for Ollama that provides **automatic context size detection** and **URL-based context routing**. This proxy intelligently analyzes incoming requests to determine the optimal context window size, eliminating the need to manually configure context sizes for different types of prompts.
|
||||||
|
|
||||||
|
## Why Ollama Context Proxy?
|
||||||
|
|
||||||
|
### The Problem
|
||||||
|
- **Memory Efficiency**: Large context windows consume significantly more GPU memory and processing time
|
||||||
|
- **Manual Configuration**: Traditional setups require you to manually set context sizes for each request
|
||||||
|
- **One-Size-Fits-All**: Most deployments use a fixed context size, wasting resources on small prompts or limiting large ones
|
||||||
|
- **Performance Impact**: Using a 32K context for a simple 100-token prompt is inefficient
|
||||||
|
|
||||||
|
### The Solution
|
||||||
|
Ollama Context Proxy solves these issues by:
|
||||||
|
|
||||||
|
1. **🧠 Intelligent Auto-Sizing**: Automatically analyzes prompt content and selects the optimal context size
|
||||||
|
2. **🎯 Resource Optimization**: Uses smaller contexts for small prompts, larger contexts only when needed
|
||||||
|
3. **⚡ Performance Boost**: Reduces memory usage and inference time for most requests
|
||||||
|
4. **🔧 Flexible Routing**: URL-based routing allows explicit context control when needed
|
||||||
|
5. **🔄 Drop-in Replacement**: Works as a transparent proxy - no client code changes required
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Automatic Context Detection**: Analyzes prompts and automatically selects appropriate context sizes
|
||||||
|
- **URL-Based Routing**: Explicit context control via URL paths (`/proxy-context/4096/api/generate`)
|
||||||
|
- **Multiple API Support**: Works with Ollama native API and OpenAI-compatible endpoints
|
||||||
|
- **Streaming Support**: Full support for streaming responses
|
||||||
|
- **Resource Optimization**: Reduces memory usage by using appropriate context sizes
|
||||||
|
- **Docker Ready**: Includes Docker configuration for easy deployment
|
||||||
|
- **Environment Variable Support**: Configurable via `OLLAMA_BASE_URL`
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Using Docker (Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build the Docker image
|
||||||
|
docker build -t ollama-context-proxy .
|
||||||
|
|
||||||
|
# Run with default settings (connects to ollama:11434)
|
||||||
|
docker run -p 11435:11435 ollama-context-proxy
|
||||||
|
|
||||||
|
# Run with custom Ollama URL
|
||||||
|
docker run -p 11435:11435 -e OLLAMA_BASE_URL=http://your-ollama-host:11434 ollama-context-proxy
|
||||||
|
```
|
||||||
|
|
||||||
|
### Direct Python Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
# Run with auto-detection of Ollama
|
||||||
|
python3 ollama-context-proxy.py
|
||||||
|
|
||||||
|
# Run with custom Ollama host
|
||||||
|
python3 ollama-context-proxy.py --ollama-host your-ollama-host --ollama-port 11434
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| `OLLAMA_BASE_URL` | `http://ollama:11434` | Full URL to Ollama server (Docker default) |
|
||||||
|
|
||||||
|
### Command Line Arguments
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 ollama-context-proxy.py [OPTIONS]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--ollama-host HOST Ollama server host (default: localhost or from OLLAMA_BASE_URL)
|
||||||
|
--ollama-port PORT Ollama server port (default: 11434)
|
||||||
|
--proxy-port PORT Proxy server port (default: 11435)
|
||||||
|
--log-level LEVEL Log level: DEBUG, INFO, WARNING, ERROR (default: INFO)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Automatic Context Sizing (Recommended)
|
||||||
|
|
||||||
|
The proxy automatically determines the best context size based on your prompt:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Auto-sizing - proxy analyzes prompt and chooses optimal context
|
||||||
|
curl -X POST http://localhost:11435/proxy-context/auto/api/generate \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "llama2",
|
||||||
|
"prompt": "Write a short story about a robot.",
|
||||||
|
"stream": false
|
||||||
|
}'
|
||||||
|
|
||||||
|
# Chat endpoint with auto-sizing
|
||||||
|
curl -X POST http://localhost:11435/proxy-context/auto/api/chat \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "llama2",
|
||||||
|
"messages": [{"role": "user", "content": "Hello!"}]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fixed Context Sizes
|
||||||
|
|
||||||
|
When you need explicit control over context size:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Force 2K context for small prompts
|
||||||
|
curl -X POST http://localhost:11435/proxy-context/2048/api/generate \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"model": "llama2", "prompt": "Hello world"}'
|
||||||
|
|
||||||
|
# Force 16K context for large prompts
|
||||||
|
curl -X POST http://localhost:11435/proxy-context/16384/api/generate \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"model": "llama2", "prompt": "Your very long prompt here..."}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### OpenAI-Compatible Endpoints
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Auto-sizing with OpenAI-compatible API
|
||||||
|
curl -X POST http://localhost:11435/proxy-context/auto/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "llama2",
|
||||||
|
"messages": [{"role": "user", "content": "Explain quantum computing"}],
|
||||||
|
"max_tokens": 150
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Health Check
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check proxy status and available context sizes
|
||||||
|
curl http://localhost:11435/health
|
||||||
|
```
|
||||||
|
|
||||||
|
## How Auto-Sizing Works
|
||||||
|
|
||||||
|
The proxy uses intelligent analysis to determine optimal context sizes:
|
||||||
|
|
||||||
|
1. **Content Analysis**: Extracts and analyzes prompt text from various endpoint formats
|
||||||
|
2. **Token Estimation**: Estimates input tokens using character-based approximation
|
||||||
|
3. **Buffer Calculation**: Adds buffers for system prompts, response space, and safety margins
|
||||||
|
4. **Context Selection**: Chooses the smallest available context that can handle the request
|
||||||
|
|
||||||
|
### Available Context Sizes
|
||||||
|
|
||||||
|
- **2K** (2048 tokens): Short prompts, simple Q&A
|
||||||
|
- **4K** (4096 tokens): Medium prompts, code snippets
|
||||||
|
- **8K** (8192 tokens): Long prompts, detailed instructions
|
||||||
|
- **16K** (16384 tokens): Very long prompts, document analysis
|
||||||
|
- **32K** (32768 tokens): Maximum context, large documents
|
||||||
|
|
||||||
|
### Auto-Sizing Logic
|
||||||
|
|
||||||
|
```
|
||||||
|
Total Required = Input Tokens + Max Response Tokens + System Overhead + Safety Margin
|
||||||
|
↓ ↓ ↓ ↓
|
||||||
|
Estimated from From request 100 tokens 200 tokens
|
||||||
|
prompt content max_tokens buffer buffer
|
||||||
|
```
|
||||||
|
|
||||||
|
## Docker Compose Integration
|
||||||
|
|
||||||
|
Example `docker-compose.yml` integration:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
version: '3.8'
|
||||||
|
services:
|
||||||
|
ollama:
|
||||||
|
image: ollama/ollama
|
||||||
|
ports:
|
||||||
|
- "11434:11434"
|
||||||
|
volumes:
|
||||||
|
- ollama_data:/root/.ollama
|
||||||
|
|
||||||
|
ollama-context-proxy:
|
||||||
|
build: ./ollama-context-proxy
|
||||||
|
ports:
|
||||||
|
- "11435:11435"
|
||||||
|
environment:
|
||||||
|
- OLLAMA_BASE_URL=http://ollama:11434
|
||||||
|
depends_on:
|
||||||
|
- ollama
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
ollama_data:
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Proxy Endpoints
|
||||||
|
|
||||||
|
| Endpoint Pattern | Description |
|
||||||
|
|-----------------|-------------|
|
||||||
|
| `/proxy-context/auto/{path}` | Auto-detect context size |
|
||||||
|
| `/proxy-context/{size}/{path}` | Fixed context size (2048, 4096, 8192, 16384, 32768) |
|
||||||
|
| `/health` | Health check and proxy status |
|
||||||
|
|
||||||
|
### Supported Ollama Endpoints
|
||||||
|
|
||||||
|
All standard Ollama endpoints are supported through the proxy:
|
||||||
|
|
||||||
|
- `/api/generate` - Text generation
|
||||||
|
- `/api/chat` - Chat completions
|
||||||
|
- `/api/tags` - List models
|
||||||
|
- `/api/show` - Model information
|
||||||
|
- `/v1/chat/completions` - OpenAI-compatible chat
|
||||||
|
- `/v1/completions` - OpenAI-compatible completions
|
||||||
|
|
||||||
|
## Performance Benefits
|
||||||
|
|
||||||
|
### Memory Usage Reduction
|
||||||
|
|
||||||
|
Using appropriate context sizes can significantly reduce GPU memory usage:
|
||||||
|
|
||||||
|
- **2K context**: ~1-2GB GPU memory
|
||||||
|
- **4K context**: ~2-4GB GPU memory
|
||||||
|
- **8K context**: ~4-8GB GPU memory
|
||||||
|
- **16K context**: ~8-16GB GPU memory
|
||||||
|
- **32K context**: ~16-32GB GPU memory
|
||||||
|
|
||||||
|
### Response Time Improvement
|
||||||
|
|
||||||
|
Smaller contexts process faster:
|
||||||
|
|
||||||
|
- **Simple prompts**: 2-3x faster with auto-sizing vs. fixed 32K
|
||||||
|
- **Medium prompts**: 1.5-2x faster with optimal sizing
|
||||||
|
- **Large prompts**: Minimal difference (uses large context anyway)
|
||||||
|
|
||||||
|
## Monitoring and Logging
|
||||||
|
|
||||||
|
The proxy provides detailed logging for monitoring:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Enable debug logging for detailed analysis
|
||||||
|
python3 ollama-context-proxy.py --log-level DEBUG
|
||||||
|
```
|
||||||
|
|
||||||
|
Log information includes:
|
||||||
|
- Context size selection reasoning
|
||||||
|
- Token estimation details
|
||||||
|
- Request routing information
|
||||||
|
- Performance metrics
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Common Issues
|
||||||
|
|
||||||
|
**Connection Refused**
|
||||||
|
```bash
|
||||||
|
# Check if Ollama is running
|
||||||
|
curl http://localhost:11434/api/tags
|
||||||
|
|
||||||
|
# Verify proxy configuration
|
||||||
|
curl http://localhost:11435/health
|
||||||
|
```
|
||||||
|
|
||||||
|
**Context Size Warnings**
|
||||||
|
```
|
||||||
|
Request may exceed largest available context!
|
||||||
|
```
|
||||||
|
- The request requires more than 32K tokens
|
||||||
|
- Consider breaking large prompts into smaller chunks
|
||||||
|
- Use streaming for very long responses
|
||||||
|
|
||||||
|
**Auto-sizing Not Working**
|
||||||
|
- Ensure you're using `/proxy-context/auto/` in your URLs
|
||||||
|
- Check request format matches supported endpoints
|
||||||
|
- Enable DEBUG logging to see analysis details
|
||||||
|
|
||||||
|
### Debug Mode
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run with debug logging
|
||||||
|
python3 ollama-context-proxy.py --log-level DEBUG
|
||||||
|
|
||||||
|
# This will show:
|
||||||
|
# - Token estimation details
|
||||||
|
# - Context selection reasoning
|
||||||
|
# - Request/response routing info
|
||||||
|
```
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
### Requirements
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install aiohttp asyncio
|
||||||
|
```
|
||||||
|
|
||||||
|
### Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
ollama-context-proxy/
|
||||||
|
├── ollama-context-proxy.py # Main proxy server
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── Dockerfile # Docker configuration
|
||||||
|
└── README.md # This file
|
||||||
|
```
|
||||||
|
|
||||||
|
### Contributing
|
||||||
|
|
||||||
|
1. Fork the repository
|
||||||
|
2. Create a feature branch
|
||||||
|
3. Make your changes
|
||||||
|
4. Add tests if applicable
|
||||||
|
5. Submit a pull request
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
[Add your license information here]
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
- **Issues**: Report bugs and feature requests via GitHub issues
|
||||||
|
- **Documentation**: This README and inline code comments
|
||||||
|
- **Community**: [Add community links if applicable]
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Note**: This proxy is designed to work transparently with existing Ollama clients. Simply change your Ollama URL from `http://localhost:11434` to `http://localhost:11435/proxy-context/auto` to enable intelligent context sizing.
|
419
ollama-context-proxy/ollama-context-proxy.py
Normal file
419
ollama-context-proxy/ollama-context-proxy.py
Normal file
@ -0,0 +1,419 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Ollama Context Proxy - Single port with URL-based context routing + auto-sizing
|
||||||
|
Use URLs like: http://localhost:11434/proxy-context/4096/api/generate
|
||||||
|
Or auto-sizing: http://localhost:11434/proxy-context/auto/api/generate
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import urllib.parse
|
||||||
|
from typing import Optional, Union
|
||||||
|
import aiohttp
|
||||||
|
from aiohttp import web, ClientSession
|
||||||
|
from aiohttp.web_response import StreamResponse
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaContextProxy:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
ollama_host: Optional[str] = None,
|
||||||
|
ollama_port: int = 11434,
|
||||||
|
proxy_port: int = 11434,
|
||||||
|
):
|
||||||
|
# Use OLLAMA_BASE_URL environment variable or construct from host/port
|
||||||
|
base_url = os.getenv("OLLAMA_BASE_URL")
|
||||||
|
if base_url:
|
||||||
|
self.ollama_base_url = base_url.rstrip("/")
|
||||||
|
else:
|
||||||
|
# Fall back to host/port construction
|
||||||
|
if ollama_host is None:
|
||||||
|
ollama_host = "localhost"
|
||||||
|
self.ollama_base_url = f"http://{ollama_host}:{ollama_port}"
|
||||||
|
|
||||||
|
self.proxy_port = proxy_port
|
||||||
|
self.session: Optional[ClientSession] = None
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Available context sizes (must be sorted ascending)
|
||||||
|
self.available_contexts = [2048, 4096, 8192, 16384, 32768]
|
||||||
|
|
||||||
|
# URL pattern to extract context size or 'auto'
|
||||||
|
self.context_pattern = re.compile(r"^/proxy-context/(auto|\d+)(/.*)?$")
|
||||||
|
|
||||||
|
async def start(self):
|
||||||
|
"""Initialize the HTTP session"""
|
||||||
|
self.session = ClientSession()
|
||||||
|
|
||||||
|
async def stop(self):
|
||||||
|
"""Cleanup HTTP session"""
|
||||||
|
if self.session:
|
||||||
|
await self.session.close()
|
||||||
|
|
||||||
|
def create_app(self) -> web.Application:
|
||||||
|
"""Create the main web application"""
|
||||||
|
app = web.Application()
|
||||||
|
app["proxy"] = self
|
||||||
|
|
||||||
|
# Add routes - capture everything under /proxy-context/
|
||||||
|
app.router.add_route(
|
||||||
|
"*",
|
||||||
|
r"/proxy-context/{context_spec:(auto|\d+)}{path:.*}",
|
||||||
|
self.proxy_handler,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Optional: Add a health check endpoint
|
||||||
|
app.router.add_get("/", self.health_check)
|
||||||
|
app.router.add_get("/health", self.health_check)
|
||||||
|
|
||||||
|
return app
|
||||||
|
|
||||||
|
async def health_check(self, request: web.Request) -> web.Response:
|
||||||
|
"""Health check endpoint"""
|
||||||
|
return web.Response(
|
||||||
|
text="Ollama Context Proxy is running\n"
|
||||||
|
"Usage: /proxy-context/{context_size}/api/{endpoint}\n"
|
||||||
|
" /proxy-context/auto/api/{endpoint}\n"
|
||||||
|
"Examples:\n"
|
||||||
|
" Fixed: /proxy-context/4096/api/generate\n"
|
||||||
|
" Auto: /proxy-context/auto/api/generate\n"
|
||||||
|
f"Available contexts: {', '.join(map(str, self.available_contexts))}",
|
||||||
|
content_type="text/plain",
|
||||||
|
)
|
||||||
|
|
||||||
|
async def proxy_handler(self, request: web.Request) -> web.Response:
|
||||||
|
"""Handle all proxy requests with context size extraction or auto-detection"""
|
||||||
|
|
||||||
|
# Extract context spec and remaining path
|
||||||
|
context_spec = request.match_info["context_spec"]
|
||||||
|
remaining_path = request.match_info.get("path", "")
|
||||||
|
|
||||||
|
# Remove leading slash if present
|
||||||
|
if remaining_path.startswith("/"):
|
||||||
|
remaining_path = remaining_path[1:]
|
||||||
|
|
||||||
|
# Get request data first (needed for auto-sizing)
|
||||||
|
if request.content_type == "application/json":
|
||||||
|
try:
|
||||||
|
data = await request.json()
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
data = await request.text()
|
||||||
|
else:
|
||||||
|
data = await request.read()
|
||||||
|
|
||||||
|
# Determine context size
|
||||||
|
if context_spec == "auto":
|
||||||
|
context_size = self._auto_determine_context_size(data, remaining_path)
|
||||||
|
else:
|
||||||
|
context_size = int(context_spec)
|
||||||
|
|
||||||
|
# Validate context size
|
||||||
|
if context_size not in self.available_contexts:
|
||||||
|
# Find the next larger available context
|
||||||
|
suitable_context = next(
|
||||||
|
(ctx for ctx in self.available_contexts if ctx >= context_size),
|
||||||
|
self.available_contexts[-1],
|
||||||
|
)
|
||||||
|
self.logger.warning(
|
||||||
|
f"Requested context {context_size} not available, using {suitable_context}"
|
||||||
|
)
|
||||||
|
context_size = suitable_context
|
||||||
|
|
||||||
|
# Build target URL
|
||||||
|
if not remaining_path:
|
||||||
|
target_url = self.ollama_base_url
|
||||||
|
else:
|
||||||
|
target_url = f"{self.ollama_base_url}/{remaining_path}"
|
||||||
|
|
||||||
|
self.logger.info(f"Routing to context {context_size} -> {target_url}")
|
||||||
|
|
||||||
|
# Inject context if needed
|
||||||
|
if self._should_inject_context(remaining_path) and isinstance(data, dict):
|
||||||
|
if "options" not in data:
|
||||||
|
data["options"] = {}
|
||||||
|
data["options"]["num_ctx"] = context_size
|
||||||
|
self.logger.info(f"Injected num_ctx={context_size} for {remaining_path}")
|
||||||
|
|
||||||
|
# Prepare headers (exclude hop-by-hop headers)
|
||||||
|
headers = {
|
||||||
|
key: value
|
||||||
|
for key, value in request.headers.items()
|
||||||
|
if key.lower() not in ["host", "connection", "upgrade"]
|
||||||
|
}
|
||||||
|
|
||||||
|
if not self.session:
|
||||||
|
raise RuntimeError("HTTP session not initialized")
|
||||||
|
try:
|
||||||
|
# Make request to Ollama
|
||||||
|
async with self.session.request(
|
||||||
|
method=request.method,
|
||||||
|
url=target_url,
|
||||||
|
data=json.dumps(data) if isinstance(data, dict) else data,
|
||||||
|
headers=headers,
|
||||||
|
params=request.query,
|
||||||
|
) as response:
|
||||||
|
# Handle streaming responses (for generate/chat endpoints)
|
||||||
|
if response.headers.get("content-type", "").startswith(
|
||||||
|
"application/x-ndjson"
|
||||||
|
):
|
||||||
|
return await self._handle_streaming_response(request, response)
|
||||||
|
else:
|
||||||
|
return await self._handle_regular_response(response)
|
||||||
|
|
||||||
|
except aiohttp.ClientError as e:
|
||||||
|
self.logger.error(f"Error proxying request to {target_url}: {e}")
|
||||||
|
return web.Response(
|
||||||
|
text=f"Proxy error: {str(e)}", status=502, content_type="text/plain"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _auto_determine_context_size(
|
||||||
|
self, data: Union[dict, str, bytes], endpoint: str
|
||||||
|
) -> int:
|
||||||
|
"""Automatically determine the required context size based on request content"""
|
||||||
|
|
||||||
|
input_tokens = 0
|
||||||
|
max_tokens = 0
|
||||||
|
|
||||||
|
if isinstance(data, dict):
|
||||||
|
# Extract text content and max_tokens based on endpoint
|
||||||
|
if endpoint.startswith("api/generate"):
|
||||||
|
# Ollama generate endpoint
|
||||||
|
prompt = data.get("prompt", "")
|
||||||
|
input_tokens = self._estimate_tokens(prompt)
|
||||||
|
max_tokens = data.get("options", {}).get("num_predict", 0)
|
||||||
|
|
||||||
|
elif endpoint.startswith("api/chat"):
|
||||||
|
# Ollama chat endpoint
|
||||||
|
messages = data.get("messages", [])
|
||||||
|
total_text = ""
|
||||||
|
for msg in messages:
|
||||||
|
if isinstance(msg, dict) and "content" in msg:
|
||||||
|
total_text += str(msg["content"]) + " "
|
||||||
|
input_tokens = self._estimate_tokens(total_text)
|
||||||
|
max_tokens = data.get("options", {}).get("num_predict", 0)
|
||||||
|
|
||||||
|
elif endpoint.startswith("v1/chat/completions"):
|
||||||
|
# OpenAI-compatible chat endpoint
|
||||||
|
messages = data.get("messages", [])
|
||||||
|
total_text = ""
|
||||||
|
for msg in messages:
|
||||||
|
if isinstance(msg, dict) and "content" in msg:
|
||||||
|
total_text += str(msg["content"]) + " "
|
||||||
|
input_tokens = self._estimate_tokens(total_text)
|
||||||
|
max_tokens = data.get("max_tokens", 0)
|
||||||
|
|
||||||
|
elif endpoint.startswith("v1/completions"):
|
||||||
|
# OpenAI-compatible completions endpoint
|
||||||
|
prompt = data.get("prompt", "")
|
||||||
|
input_tokens = self._estimate_tokens(prompt)
|
||||||
|
max_tokens = data.get("max_tokens", 0)
|
||||||
|
|
||||||
|
elif isinstance(data, (str, bytes)):
|
||||||
|
# Fallback for non-JSON data
|
||||||
|
text = (
|
||||||
|
data if isinstance(data, str) else data.decode("utf-8", errors="ignore")
|
||||||
|
)
|
||||||
|
input_tokens = self._estimate_tokens(text)
|
||||||
|
|
||||||
|
# Calculate total tokens needed
|
||||||
|
system_overhead = 100 # Buffer for system prompts, formatting, etc.
|
||||||
|
response_buffer = max(max_tokens, 512) # Ensure space for response
|
||||||
|
safety_margin = 200 # Additional safety buffer
|
||||||
|
|
||||||
|
total_needed = input_tokens + response_buffer + system_overhead + safety_margin
|
||||||
|
|
||||||
|
# Find the smallest context that can accommodate the request
|
||||||
|
suitable_context = next(
|
||||||
|
(ctx for ctx in self.available_contexts if ctx >= total_needed),
|
||||||
|
self.available_contexts[-1], # Fall back to largest if none are big enough
|
||||||
|
)
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
f"Auto-sizing analysis: "
|
||||||
|
f"input_tokens={input_tokens}, "
|
||||||
|
f"max_tokens={max_tokens}, "
|
||||||
|
f"total_needed={total_needed}, "
|
||||||
|
f"selected_context={suitable_context}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Log warning if we're using the largest context and it might not be enough
|
||||||
|
if (
|
||||||
|
suitable_context == self.available_contexts[-1]
|
||||||
|
and total_needed > suitable_context
|
||||||
|
):
|
||||||
|
self.logger.warning(
|
||||||
|
f"Request may exceed largest available context! "
|
||||||
|
f"Needed: {total_needed}, Available: {suitable_context}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return suitable_context
|
||||||
|
|
||||||
|
def _estimate_tokens(self, text: str) -> int:
|
||||||
|
"""Estimate token count from text (rough approximation)"""
|
||||||
|
if not text:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Rough estimation: ~4 characters per token for English
|
||||||
|
# This is a conservative estimate - actual tokenization varies by model
|
||||||
|
char_count = len(str(text))
|
||||||
|
estimated_tokens = max(1, char_count // 4)
|
||||||
|
|
||||||
|
self.logger.debug(
|
||||||
|
f"Token estimation: {char_count} chars -> ~{estimated_tokens} tokens"
|
||||||
|
)
|
||||||
|
return estimated_tokens
|
||||||
|
|
||||||
|
def _should_inject_context(self, path: str) -> bool:
|
||||||
|
"""Determine if we should inject context for this endpoint"""
|
||||||
|
# Inject context for endpoints that support the num_ctx parameter
|
||||||
|
context_endpoints = [
|
||||||
|
"api/generate",
|
||||||
|
"api/chat",
|
||||||
|
"v1/chat/completions",
|
||||||
|
"v1/completions",
|
||||||
|
]
|
||||||
|
return any(path.startswith(endpoint) for endpoint in context_endpoints)
|
||||||
|
|
||||||
|
async def _handle_streaming_response(
|
||||||
|
self, request: web.Request, response: aiohttp.ClientResponse
|
||||||
|
) -> StreamResponse:
|
||||||
|
"""Handle streaming responses (NDJSON)"""
|
||||||
|
stream_response = StreamResponse(
|
||||||
|
status=response.status,
|
||||||
|
headers={
|
||||||
|
key: value
|
||||||
|
for key, value in response.headers.items()
|
||||||
|
if key.lower() not in ["content-length", "transfer-encoding"]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
await stream_response.prepare(request)
|
||||||
|
|
||||||
|
async for chunk in response.content.iter_any():
|
||||||
|
await stream_response.write(chunk)
|
||||||
|
|
||||||
|
await stream_response.write_eof()
|
||||||
|
return stream_response
|
||||||
|
|
||||||
|
async def _handle_regular_response(
|
||||||
|
self, response: aiohttp.ClientResponse
|
||||||
|
) -> web.Response:
|
||||||
|
"""Handle regular (non-streaming) responses"""
|
||||||
|
content = await response.read()
|
||||||
|
|
||||||
|
return web.Response(
|
||||||
|
body=content,
|
||||||
|
status=response.status,
|
||||||
|
headers={
|
||||||
|
key: value
|
||||||
|
for key, value in response.headers.items()
|
||||||
|
if key.lower() not in ["content-length", "transfer-encoding"]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Ollama Context Proxy - URL-based routing with auto-sizing"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get default host from OLLAMA_BASE_URL if available
|
||||||
|
default_host = "localhost"
|
||||||
|
base_url = os.getenv("OLLAMA_BASE_URL")
|
||||||
|
if base_url:
|
||||||
|
# Extract host from base URL for backward compatibility with CLI args
|
||||||
|
parsed = urllib.parse.urlparse(base_url)
|
||||||
|
if parsed.hostname:
|
||||||
|
default_host = parsed.hostname
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--ollama-host",
|
||||||
|
default=default_host,
|
||||||
|
help=f"Ollama server host (default: {default_host})",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ollama-port",
|
||||||
|
type=int,
|
||||||
|
default=11434,
|
||||||
|
help="Ollama server port (default: 11434)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--proxy-port",
|
||||||
|
type=int,
|
||||||
|
default=11435,
|
||||||
|
help="Proxy server port (default: 11435)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--log-level",
|
||||||
|
default="INFO",
|
||||||
|
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
|
help="Log level (default: INFO)",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=getattr(logging, args.log_level),
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create proxy instance
|
||||||
|
proxy = OllamaContextProxy(args.ollama_host, args.ollama_port, args.proxy_port)
|
||||||
|
await proxy.start()
|
||||||
|
|
||||||
|
# Create and start the web application
|
||||||
|
app = proxy.create_app()
|
||||||
|
runner = web.AppRunner(app)
|
||||||
|
await runner.setup()
|
||||||
|
|
||||||
|
site = web.TCPSite(runner, "0.0.0.0", args.proxy_port)
|
||||||
|
await site.start()
|
||||||
|
|
||||||
|
logging.info(f"Ollama Context Proxy started on port {args.proxy_port}")
|
||||||
|
logging.info(f"Forwarding to Ollama at {proxy.ollama_base_url}")
|
||||||
|
logging.info(f"Available context sizes: {proxy.available_contexts}")
|
||||||
|
logging.info("Usage examples:")
|
||||||
|
logging.info(
|
||||||
|
f" Auto-size: http://localhost:{args.proxy_port}/proxy-context/auto"
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f" 2K context: http://localhost:{args.proxy_port}/proxy-context/2048"
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f" 4K context: http://localhost:{args.proxy_port}/proxy-context/4096"
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f" 8K context: http://localhost:{args.proxy_port}/proxy-context/8192"
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f" 16K context: http://localhost:{args.proxy_port}/proxy-context/16384"
|
||||||
|
)
|
||||||
|
logging.info(
|
||||||
|
f" 32K context: http://localhost:{args.proxy_port}/proxy-context/32768"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Keep running
|
||||||
|
while True:
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
logging.info("Shutting down...")
|
||||||
|
finally:
|
||||||
|
# Cleanup
|
||||||
|
await runner.cleanup()
|
||||||
|
await proxy.stop()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
try:
|
||||||
|
asyncio.run(main())
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\nShutdown complete.")
|
||||||
|
sys.exit(0)
|
12
ollama-context-proxy/requirements.txt
Normal file
12
ollama-context-proxy/requirements.txt
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
aiohappyeyeballs==2.6.1
|
||||||
|
aiohttp==3.12.15
|
||||||
|
aiosignal==1.4.0
|
||||||
|
attrs==25.3.0
|
||||||
|
frozenlist==1.7.0
|
||||||
|
idna==3.10
|
||||||
|
multidict==6.6.3
|
||||||
|
propcache==0.3.2
|
||||||
|
setuptools==68.1.2
|
||||||
|
typing_extensions==4.14.1
|
||||||
|
wheel==0.42.0
|
||||||
|
yarl==1.20.1
|
Loading…
x
Reference in New Issue
Block a user