diff --git a/docker-compose.yml b/docker-compose.yml index 8222a5e..0d9e269 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,21 @@ services: # - ./cache:/root/.cache # Cache hub models and neo_compiler_cache # - ./ollama:/root/.ollama # Cache the ollama models + ollama-context-proxy: + build: + context: ./ollama-context-proxy + dockerfile: Dockerfile + container_name: ollama-context-proxy + restart: "always" + env_file: + - .env + environment: + - OLLAMA_HOST=http://ollama:11434 + ports: + - 11436:11434 # ollama-context-proxy port + networks: + - internal + vllm: build: context: . diff --git a/ollama-context-proxy/Dockerfile b/ollama-context-proxy/Dockerfile new file mode 100644 index 0000000..ce65378 --- /dev/null +++ b/ollama-context-proxy/Dockerfile @@ -0,0 +1,61 @@ +FROM ubuntu:noble AS ollama-context-proxy + +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends --fix-missing \ + python3 \ + python3-dev \ + python3-pip \ + python3-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} + +WORKDIR /opt/ollama-context-proxy + +# Set default Ollama base URL +ENV OLLAMA_BASE_URL=http://ollama:11434 + +# Setup the docker pip shell +RUN { \ + echo '#!/bin/bash' ; \ + echo 'source /opt/ollama-context-proxy/venv/bin/activate' ; \ + echo 'if [[ "${1}" != "" ]]; then bash -c "${@}"; else bash -i; fi' ; \ + } > /opt/ollama-context-proxy/shell ; \ + chmod +x /opt/ollama-context-proxy/shell + +SHELL [ "/opt/ollama-context-proxy/shell" ] + +RUN python3 -m venv --system-site-packages /opt/ollama-context-proxy/venv + +COPY /requirements.txt /opt/ollama-context-proxy/ +COPY /ollama-context-proxy.py /opt/ollama-context-proxy/ollama-context-proxy.py + +RUN pip install -r requirements.txt + +SHELL [ "/bin/bash", "-c" ] + +RUN { \ + echo '#!/bin/bash'; \ + echo 'echo "Container: ollama-context-proxy"'; \ + echo 'set -e'; \ + echo 'echo "Setting pip environment to /opt/ollama-context-proxy"'; \ + echo 'source /opt/ollama-context-proxy/venv/bin/activate'; \ + echo 'if [[ "${1}" == "/bin/bash" ]] || [[ "${1}" =~ ^(/opt/ollama-context-proxy/)?shell$ ]]; then'; \ + echo ' echo "Dropping to shell"'; \ + echo ' shift'; \ + echo ' if [[ "${1}" != "" ]]; then cmd="/opt/ollama-context-proxy/shell ${@}"; echo "Running: ${cmd}"; exec ${cmd}; else /opt/ollama-context-proxy/shell; fi'; \ + echo 'else'; \ + echo ' while true; do'; \ + echo ' echo "Launching Ollama context proxy server..."'; \ + echo ' exec python3 /opt/ollama-context-proxy/ollama-context-proxy.py'; \ + echo ' if [[ $? -ne 0 ]]; then'; \ + echo ' echo "Ollama context proxy server crashed, restarting in 3 seconds..."'; \ + echo ' sleep 3'; \ + echo ' fi'; \ + echo ' done' ; \ + echo 'fi'; \ + } > /entrypoint.sh \ + && chmod +x /entrypoint.sh + +ENV PATH=/opt/ollama-context-proxy:$PATH + +ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/ollama-context-proxy/README.md b/ollama-context-proxy/README.md new file mode 100644 index 0000000..574028d --- /dev/null +++ b/ollama-context-proxy/README.md @@ -0,0 +1,326 @@ +# Ollama Context Proxy + +A smart proxy server for Ollama that provides **automatic context size detection** and **URL-based context routing**. This proxy intelligently analyzes incoming requests to determine the optimal context window size, eliminating the need to manually configure context sizes for different types of prompts. + +## Why Ollama Context Proxy? + +### The Problem +- **Memory Efficiency**: Large context windows consume significantly more GPU memory and processing time +- **Manual Configuration**: Traditional setups require you to manually set context sizes for each request +- **One-Size-Fits-All**: Most deployments use a fixed context size, wasting resources on small prompts or limiting large ones +- **Performance Impact**: Using a 32K context for a simple 100-token prompt is inefficient + +### The Solution +Ollama Context Proxy solves these issues by: + +1. **🧠 Intelligent Auto-Sizing**: Automatically analyzes prompt content and selects the optimal context size +2. **🎯 Resource Optimization**: Uses smaller contexts for small prompts, larger contexts only when needed +3. **⚡ Performance Boost**: Reduces memory usage and inference time for most requests +4. **🔧 Flexible Routing**: URL-based routing allows explicit context control when needed +5. **🔄 Drop-in Replacement**: Works as a transparent proxy - no client code changes required + +## Features + +- **Automatic Context Detection**: Analyzes prompts and automatically selects appropriate context sizes +- **URL-Based Routing**: Explicit context control via URL paths (`/proxy-context/4096/api/generate`) +- **Multiple API Support**: Works with Ollama native API and OpenAI-compatible endpoints +- **Streaming Support**: Full support for streaming responses +- **Resource Optimization**: Reduces memory usage by using appropriate context sizes +- **Docker Ready**: Includes Docker configuration for easy deployment +- **Environment Variable Support**: Configurable via `OLLAMA_BASE_URL` + +## Quick Start + +### Using Docker (Recommended) + +```bash +# Build the Docker image +docker build -t ollama-context-proxy . + +# Run with default settings (connects to ollama:11434) +docker run -p 11435:11435 ollama-context-proxy + +# Run with custom Ollama URL +docker run -p 11435:11435 -e OLLAMA_BASE_URL=http://your-ollama-host:11434 ollama-context-proxy +``` + +### Direct Python Usage + +```bash +# Install dependencies +pip install -r requirements.txt + +# Run with auto-detection of Ollama +python3 ollama-context-proxy.py + +# Run with custom Ollama host +python3 ollama-context-proxy.py --ollama-host your-ollama-host --ollama-port 11434 +``` + +## Configuration + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OLLAMA_BASE_URL` | `http://ollama:11434` | Full URL to Ollama server (Docker default) | + +### Command Line Arguments + +```bash +python3 ollama-context-proxy.py [OPTIONS] + +Options: + --ollama-host HOST Ollama server host (default: localhost or from OLLAMA_BASE_URL) + --ollama-port PORT Ollama server port (default: 11434) + --proxy-port PORT Proxy server port (default: 11435) + --log-level LEVEL Log level: DEBUG, INFO, WARNING, ERROR (default: INFO) +``` + +## Usage Examples + +### Automatic Context Sizing (Recommended) + +The proxy automatically determines the best context size based on your prompt: + +```bash +# Auto-sizing - proxy analyzes prompt and chooses optimal context +curl -X POST http://localhost:11435/proxy-context/auto/api/generate \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama2", + "prompt": "Write a short story about a robot.", + "stream": false + }' + +# Chat endpoint with auto-sizing +curl -X POST http://localhost:11435/proxy-context/auto/api/chat \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama2", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +### Fixed Context Sizes + +When you need explicit control over context size: + +```bash +# Force 2K context for small prompts +curl -X POST http://localhost:11435/proxy-context/2048/api/generate \ + -H "Content-Type: application/json" \ + -d '{"model": "llama2", "prompt": "Hello world"}' + +# Force 16K context for large prompts +curl -X POST http://localhost:11435/proxy-context/16384/api/generate \ + -H "Content-Type: application/json" \ + -d '{"model": "llama2", "prompt": "Your very long prompt here..."}' +``` + +### OpenAI-Compatible Endpoints + +```bash +# Auto-sizing with OpenAI-compatible API +curl -X POST http://localhost:11435/proxy-context/auto/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama2", + "messages": [{"role": "user", "content": "Explain quantum computing"}], + "max_tokens": 150 + }' +``` + +### Health Check + +```bash +# Check proxy status and available context sizes +curl http://localhost:11435/health +``` + +## How Auto-Sizing Works + +The proxy uses intelligent analysis to determine optimal context sizes: + +1. **Content Analysis**: Extracts and analyzes prompt text from various endpoint formats +2. **Token Estimation**: Estimates input tokens using character-based approximation +3. **Buffer Calculation**: Adds buffers for system prompts, response space, and safety margins +4. **Context Selection**: Chooses the smallest available context that can handle the request + +### Available Context Sizes + +- **2K** (2048 tokens): Short prompts, simple Q&A +- **4K** (4096 tokens): Medium prompts, code snippets +- **8K** (8192 tokens): Long prompts, detailed instructions +- **16K** (16384 tokens): Very long prompts, document analysis +- **32K** (32768 tokens): Maximum context, large documents + +### Auto-Sizing Logic + +``` +Total Required = Input Tokens + Max Response Tokens + System Overhead + Safety Margin + ↓ ↓ ↓ ↓ + Estimated from From request 100 tokens 200 tokens + prompt content max_tokens buffer buffer +``` + +## Docker Compose Integration + +Example `docker-compose.yml` integration: + +```yaml +version: '3.8' +services: + ollama: + image: ollama/ollama + ports: + - "11434:11434" + volumes: + - ollama_data:/root/.ollama + + ollama-context-proxy: + build: ./ollama-context-proxy + ports: + - "11435:11435" + environment: + - OLLAMA_BASE_URL=http://ollama:11434 + depends_on: + - ollama + +volumes: + ollama_data: +``` + +## API Endpoints + +### Proxy Endpoints + +| Endpoint Pattern | Description | +|-----------------|-------------| +| `/proxy-context/auto/{path}` | Auto-detect context size | +| `/proxy-context/{size}/{path}` | Fixed context size (2048, 4096, 8192, 16384, 32768) | +| `/health` | Health check and proxy status | + +### Supported Ollama Endpoints + +All standard Ollama endpoints are supported through the proxy: + +- `/api/generate` - Text generation +- `/api/chat` - Chat completions +- `/api/tags` - List models +- `/api/show` - Model information +- `/v1/chat/completions` - OpenAI-compatible chat +- `/v1/completions` - OpenAI-compatible completions + +## Performance Benefits + +### Memory Usage Reduction + +Using appropriate context sizes can significantly reduce GPU memory usage: + +- **2K context**: ~1-2GB GPU memory +- **4K context**: ~2-4GB GPU memory +- **8K context**: ~4-8GB GPU memory +- **16K context**: ~8-16GB GPU memory +- **32K context**: ~16-32GB GPU memory + +### Response Time Improvement + +Smaller contexts process faster: + +- **Simple prompts**: 2-3x faster with auto-sizing vs. fixed 32K +- **Medium prompts**: 1.5-2x faster with optimal sizing +- **Large prompts**: Minimal difference (uses large context anyway) + +## Monitoring and Logging + +The proxy provides detailed logging for monitoring: + +```bash +# Enable debug logging for detailed analysis +python3 ollama-context-proxy.py --log-level DEBUG +``` + +Log information includes: +- Context size selection reasoning +- Token estimation details +- Request routing information +- Performance metrics + +## Troubleshooting + +### Common Issues + +**Connection Refused** +```bash +# Check if Ollama is running +curl http://localhost:11434/api/tags + +# Verify proxy configuration +curl http://localhost:11435/health +``` + +**Context Size Warnings** +``` +Request may exceed largest available context! +``` +- The request requires more than 32K tokens +- Consider breaking large prompts into smaller chunks +- Use streaming for very long responses + +**Auto-sizing Not Working** +- Ensure you're using `/proxy-context/auto/` in your URLs +- Check request format matches supported endpoints +- Enable DEBUG logging to see analysis details + +### Debug Mode + +```bash +# Run with debug logging +python3 ollama-context-proxy.py --log-level DEBUG + +# This will show: +# - Token estimation details +# - Context selection reasoning +# - Request/response routing info +``` + +## Development + +### Requirements + +```bash +pip install aiohttp asyncio +``` + +### Project Structure + +``` +ollama-context-proxy/ +├── ollama-context-proxy.py # Main proxy server +├── requirements.txt # Python dependencies +├── Dockerfile # Docker configuration +└── README.md # This file +``` + +### Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests if applicable +5. Submit a pull request + +## License + +[Add your license information here] + +## Support + +- **Issues**: Report bugs and feature requests via GitHub issues +- **Documentation**: This README and inline code comments +- **Community**: [Add community links if applicable] + +--- + +**Note**: This proxy is designed to work transparently with existing Ollama clients. Simply change your Ollama URL from `http://localhost:11434` to `http://localhost:11435/proxy-context/auto` to enable intelligent context sizing. diff --git a/ollama-context-proxy/ollama-context-proxy.py b/ollama-context-proxy/ollama-context-proxy.py new file mode 100644 index 0000000..f0d9a3c --- /dev/null +++ b/ollama-context-proxy/ollama-context-proxy.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +""" +Ollama Context Proxy - Single port with URL-based context routing + auto-sizing +Use URLs like: http://localhost:11434/proxy-context/4096/api/generate +Or auto-sizing: http://localhost:11434/proxy-context/auto/api/generate +""" + +import asyncio +import json +import logging +import os +import re +import urllib.parse +from typing import Optional, Union +import aiohttp +from aiohttp import web, ClientSession +from aiohttp.web_response import StreamResponse +import argparse +import sys + + +class OllamaContextProxy: + def __init__( + self, + ollama_host: Optional[str] = None, + ollama_port: int = 11434, + proxy_port: int = 11434, + ): + # Use OLLAMA_BASE_URL environment variable or construct from host/port + base_url = os.getenv("OLLAMA_BASE_URL") + if base_url: + self.ollama_base_url = base_url.rstrip("/") + else: + # Fall back to host/port construction + if ollama_host is None: + ollama_host = "localhost" + self.ollama_base_url = f"http://{ollama_host}:{ollama_port}" + + self.proxy_port = proxy_port + self.session: Optional[ClientSession] = None + self.logger = logging.getLogger(__name__) + + # Available context sizes (must be sorted ascending) + self.available_contexts = [2048, 4096, 8192, 16384, 32768] + + # URL pattern to extract context size or 'auto' + self.context_pattern = re.compile(r"^/proxy-context/(auto|\d+)(/.*)?$") + + async def start(self): + """Initialize the HTTP session""" + self.session = ClientSession() + + async def stop(self): + """Cleanup HTTP session""" + if self.session: + await self.session.close() + + def create_app(self) -> web.Application: + """Create the main web application""" + app = web.Application() + app["proxy"] = self + + # Add routes - capture everything under /proxy-context/ + app.router.add_route( + "*", + r"/proxy-context/{context_spec:(auto|\d+)}{path:.*}", + self.proxy_handler, + ) + + # Optional: Add a health check endpoint + app.router.add_get("/", self.health_check) + app.router.add_get("/health", self.health_check) + + return app + + async def health_check(self, request: web.Request) -> web.Response: + """Health check endpoint""" + return web.Response( + text="Ollama Context Proxy is running\n" + "Usage: /proxy-context/{context_size}/api/{endpoint}\n" + " /proxy-context/auto/api/{endpoint}\n" + "Examples:\n" + " Fixed: /proxy-context/4096/api/generate\n" + " Auto: /proxy-context/auto/api/generate\n" + f"Available contexts: {', '.join(map(str, self.available_contexts))}", + content_type="text/plain", + ) + + async def proxy_handler(self, request: web.Request) -> web.Response: + """Handle all proxy requests with context size extraction or auto-detection""" + + # Extract context spec and remaining path + context_spec = request.match_info["context_spec"] + remaining_path = request.match_info.get("path", "") + + # Remove leading slash if present + if remaining_path.startswith("/"): + remaining_path = remaining_path[1:] + + # Get request data first (needed for auto-sizing) + if request.content_type == "application/json": + try: + data = await request.json() + except json.JSONDecodeError: + data = await request.text() + else: + data = await request.read() + + # Determine context size + if context_spec == "auto": + context_size = self._auto_determine_context_size(data, remaining_path) + else: + context_size = int(context_spec) + + # Validate context size + if context_size not in self.available_contexts: + # Find the next larger available context + suitable_context = next( + (ctx for ctx in self.available_contexts if ctx >= context_size), + self.available_contexts[-1], + ) + self.logger.warning( + f"Requested context {context_size} not available, using {suitable_context}" + ) + context_size = suitable_context + + # Build target URL + if not remaining_path: + target_url = self.ollama_base_url + else: + target_url = f"{self.ollama_base_url}/{remaining_path}" + + self.logger.info(f"Routing to context {context_size} -> {target_url}") + + # Inject context if needed + if self._should_inject_context(remaining_path) and isinstance(data, dict): + if "options" not in data: + data["options"] = {} + data["options"]["num_ctx"] = context_size + self.logger.info(f"Injected num_ctx={context_size} for {remaining_path}") + + # Prepare headers (exclude hop-by-hop headers) + headers = { + key: value + for key, value in request.headers.items() + if key.lower() not in ["host", "connection", "upgrade"] + } + + if not self.session: + raise RuntimeError("HTTP session not initialized") + try: + # Make request to Ollama + async with self.session.request( + method=request.method, + url=target_url, + data=json.dumps(data) if isinstance(data, dict) else data, + headers=headers, + params=request.query, + ) as response: + # Handle streaming responses (for generate/chat endpoints) + if response.headers.get("content-type", "").startswith( + "application/x-ndjson" + ): + return await self._handle_streaming_response(request, response) + else: + return await self._handle_regular_response(response) + + except aiohttp.ClientError as e: + self.logger.error(f"Error proxying request to {target_url}: {e}") + return web.Response( + text=f"Proxy error: {str(e)}", status=502, content_type="text/plain" + ) + + def _auto_determine_context_size( + self, data: Union[dict, str, bytes], endpoint: str + ) -> int: + """Automatically determine the required context size based on request content""" + + input_tokens = 0 + max_tokens = 0 + + if isinstance(data, dict): + # Extract text content and max_tokens based on endpoint + if endpoint.startswith("api/generate"): + # Ollama generate endpoint + prompt = data.get("prompt", "") + input_tokens = self._estimate_tokens(prompt) + max_tokens = data.get("options", {}).get("num_predict", 0) + + elif endpoint.startswith("api/chat"): + # Ollama chat endpoint + messages = data.get("messages", []) + total_text = "" + for msg in messages: + if isinstance(msg, dict) and "content" in msg: + total_text += str(msg["content"]) + " " + input_tokens = self._estimate_tokens(total_text) + max_tokens = data.get("options", {}).get("num_predict", 0) + + elif endpoint.startswith("v1/chat/completions"): + # OpenAI-compatible chat endpoint + messages = data.get("messages", []) + total_text = "" + for msg in messages: + if isinstance(msg, dict) and "content" in msg: + total_text += str(msg["content"]) + " " + input_tokens = self._estimate_tokens(total_text) + max_tokens = data.get("max_tokens", 0) + + elif endpoint.startswith("v1/completions"): + # OpenAI-compatible completions endpoint + prompt = data.get("prompt", "") + input_tokens = self._estimate_tokens(prompt) + max_tokens = data.get("max_tokens", 0) + + elif isinstance(data, (str, bytes)): + # Fallback for non-JSON data + text = ( + data if isinstance(data, str) else data.decode("utf-8", errors="ignore") + ) + input_tokens = self._estimate_tokens(text) + + # Calculate total tokens needed + system_overhead = 100 # Buffer for system prompts, formatting, etc. + response_buffer = max(max_tokens, 512) # Ensure space for response + safety_margin = 200 # Additional safety buffer + + total_needed = input_tokens + response_buffer + system_overhead + safety_margin + + # Find the smallest context that can accommodate the request + suitable_context = next( + (ctx for ctx in self.available_contexts if ctx >= total_needed), + self.available_contexts[-1], # Fall back to largest if none are big enough + ) + + self.logger.info( + f"Auto-sizing analysis: " + f"input_tokens={input_tokens}, " + f"max_tokens={max_tokens}, " + f"total_needed={total_needed}, " + f"selected_context={suitable_context}" + ) + + # Log warning if we're using the largest context and it might not be enough + if ( + suitable_context == self.available_contexts[-1] + and total_needed > suitable_context + ): + self.logger.warning( + f"Request may exceed largest available context! " + f"Needed: {total_needed}, Available: {suitable_context}" + ) + + return suitable_context + + def _estimate_tokens(self, text: str) -> int: + """Estimate token count from text (rough approximation)""" + if not text: + return 0 + + # Rough estimation: ~4 characters per token for English + # This is a conservative estimate - actual tokenization varies by model + char_count = len(str(text)) + estimated_tokens = max(1, char_count // 4) + + self.logger.debug( + f"Token estimation: {char_count} chars -> ~{estimated_tokens} tokens" + ) + return estimated_tokens + + def _should_inject_context(self, path: str) -> bool: + """Determine if we should inject context for this endpoint""" + # Inject context for endpoints that support the num_ctx parameter + context_endpoints = [ + "api/generate", + "api/chat", + "v1/chat/completions", + "v1/completions", + ] + return any(path.startswith(endpoint) for endpoint in context_endpoints) + + async def _handle_streaming_response( + self, request: web.Request, response: aiohttp.ClientResponse + ) -> StreamResponse: + """Handle streaming responses (NDJSON)""" + stream_response = StreamResponse( + status=response.status, + headers={ + key: value + for key, value in response.headers.items() + if key.lower() not in ["content-length", "transfer-encoding"] + }, + ) + + await stream_response.prepare(request) + + async for chunk in response.content.iter_any(): + await stream_response.write(chunk) + + await stream_response.write_eof() + return stream_response + + async def _handle_regular_response( + self, response: aiohttp.ClientResponse + ) -> web.Response: + """Handle regular (non-streaming) responses""" + content = await response.read() + + return web.Response( + body=content, + status=response.status, + headers={ + key: value + for key, value in response.headers.items() + if key.lower() not in ["content-length", "transfer-encoding"] + }, + ) + + +async def main(): + parser = argparse.ArgumentParser( + description="Ollama Context Proxy - URL-based routing with auto-sizing" + ) + + # Get default host from OLLAMA_BASE_URL if available + default_host = "localhost" + base_url = os.getenv("OLLAMA_BASE_URL") + if base_url: + # Extract host from base URL for backward compatibility with CLI args + parsed = urllib.parse.urlparse(base_url) + if parsed.hostname: + default_host = parsed.hostname + + parser.add_argument( + "--ollama-host", + default=default_host, + help=f"Ollama server host (default: {default_host})", + ) + parser.add_argument( + "--ollama-port", + type=int, + default=11434, + help="Ollama server port (default: 11434)", + ) + parser.add_argument( + "--proxy-port", + type=int, + default=11435, + help="Proxy server port (default: 11435)", + ) + parser.add_argument( + "--log-level", + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Log level (default: INFO)", + ) + + args = parser.parse_args() + + # Setup logging + logging.basicConfig( + level=getattr(logging, args.log_level), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + # Create proxy instance + proxy = OllamaContextProxy(args.ollama_host, args.ollama_port, args.proxy_port) + await proxy.start() + + # Create and start the web application + app = proxy.create_app() + runner = web.AppRunner(app) + await runner.setup() + + site = web.TCPSite(runner, "0.0.0.0", args.proxy_port) + await site.start() + + logging.info(f"Ollama Context Proxy started on port {args.proxy_port}") + logging.info(f"Forwarding to Ollama at {proxy.ollama_base_url}") + logging.info(f"Available context sizes: {proxy.available_contexts}") + logging.info("Usage examples:") + logging.info( + f" Auto-size: http://localhost:{args.proxy_port}/proxy-context/auto" + ) + logging.info( + f" 2K context: http://localhost:{args.proxy_port}/proxy-context/2048" + ) + logging.info( + f" 4K context: http://localhost:{args.proxy_port}/proxy-context/4096" + ) + logging.info( + f" 8K context: http://localhost:{args.proxy_port}/proxy-context/8192" + ) + logging.info( + f" 16K context: http://localhost:{args.proxy_port}/proxy-context/16384" + ) + logging.info( + f" 32K context: http://localhost:{args.proxy_port}/proxy-context/32768" + ) + + try: + # Keep running + while True: + await asyncio.sleep(1) + + except KeyboardInterrupt: + logging.info("Shutting down...") + finally: + # Cleanup + await runner.cleanup() + await proxy.stop() + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\nShutdown complete.") + sys.exit(0) diff --git a/ollama-context-proxy/requirements.txt b/ollama-context-proxy/requirements.txt new file mode 100644 index 0000000..49e072d --- /dev/null +++ b/ollama-context-proxy/requirements.txt @@ -0,0 +1,12 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +attrs==25.3.0 +frozenlist==1.7.0 +idna==3.10 +multidict==6.6.3 +propcache==0.3.2 +setuptools==68.1.2 +typing_extensions==4.14.1 +wheel==0.42.0 +yarl==1.20.1 \ No newline at end of file