Working to reduce RSS context length by parsing out HTML

This commit is contained in:
James Ketr 2025-03-06 22:06:54 -08:00
parent 872e74efbb
commit 87c2469c65
3 changed files with 23 additions and 4 deletions

View File

@ -340,7 +340,7 @@ WORKDIR /opt/airc
SHELL [ "/opt/airc/shell" ]
# Needed by src/model-server.py
RUN pip install faiss-cpu sentence_transformers feedparser
RUN pip install faiss-cpu sentence_transformers feedparser bs4
SHELL [ "/bin/bash", "-c" ]

View File

@ -118,11 +118,18 @@ class AIRC(pydle.Client):
logging.info("Burst limit reset due to inactivity.")
async def message(self, target, message):
"""Splits a multi-line message and sends each line separately."""
for line in message.splitlines(): # Splits on both '\n' and '\r\n'
"""Splits a multi-line message and sends each line separately. If more than 10 lines, truncate and add a message."""
lines = message.splitlines() # Splits on both '\n' and '\r\n'
# Process the first 10 lines
for line in lines[:10]:
if line.strip(): # Ignore empty lines
await self._message_queue.put((target, line))
# If there are more than 10 lines, add the truncation message
if len(lines) > 10:
await self._message_queue.put((target, "[additional content truncated]"))
async def on_connect(self):
logging.debug('on_connect')
await self.join(self.channel)

View File

@ -20,6 +20,7 @@ import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
def parse_args():
parser = argparse.ArgumentParser(description="AI is Really Cool Server")
@ -37,6 +38,16 @@ def setup_logging(level):
logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info(f"Logging is set to {level} level.")
def extract_text_from_html_or_xml(content, is_xml=False):
    """Strip markup from *content* and return only its text.

    Args:
        content: HTML or XML markup to parse.
        is_xml: When truthy, parse with BeautifulSoup's 'xml' parser;
            otherwise use 'html.parser'.

    Returns:
        The document text as produced by ``BeautifulSoup.get_text()``.
    """
    # Pick the parser once, then parse and extract in a single expression.
    parser_name = 'xml' if is_xml else 'html.parser'
    return BeautifulSoup(content, parser_name).get_text()
class Feed():
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
self.name = name
@ -67,6 +78,7 @@ class Feed():
content += f"Link: {link}\n"
summary = entry.get("summary")
if summary:
summary = extract_text_from_html_or_xml(summary, False)
content += f"Summary: {summary}\n"
published = entry.get("published")
if published:
@ -166,7 +178,7 @@ class Chat():
prompt,
add_special_tokens=False,
return_tensors="pt",
max_length=8000, # Prevent 'Asking to truncate to max_length...'
max_length=7999, # Prevent 'Asking to truncate to max_length...'
padding=True, # Handles padding automatically
truncation=True
)