Working to reduce RSS context length by parsing out HTML

2025-03-06 22:06:54 -08:00 · 2025-03-06 22:06:54 -08:00 · 87c2469c65
commit 87c2469c65
parent 872e74efbb
3 changed files with 23 additions and 4 deletions
--- a/2
+++ b/2
@@ -340,7 +340,7 @@ WORKDIR /opt/airc
 SHELL [ "/opt/airc/shell" ]

 # Needed by src/model-server.py
-RUN pip install faiss-cpu sentence_transformers feedparser
+RUN pip install faiss-cpu sentence_transformers feedparser bs4

 SHELL [ "/bin/bash", "-c" ]

--- a/src/airc.py
+++ b/src/airc.py
@@ -118,11 +118,18 @@ class AIRC(pydle.Client):
            logging.info("Burst limit reset due to inactivity.")

    async def message(self, target, message):
-        """Splits a multi-line message and sends each line separately."""
-        for line in message.splitlines():  # Splits on both '\n' and '\r\n'
+        """Splits a multi-line message and sends each line separately. If more than 10 lines, truncate and add a message."""
+        lines = message.splitlines()  # Splits on both '\n' and '\r\n'
+        
+        # Process the first 10 lines
+        for line in lines[:10]:
            if line.strip():  # Ignore empty lines
                await self._message_queue.put((target, line))

+        # If there are more than 10 lines, add the truncation message
+        if len(lines) > 10:
+            await self._message_queue.put((target, "[additional content truncated]"))
+
    async def on_connect(self):
        logging.debug('on_connect')
        await self.join(self.channel)
--- a/src/model-server.py
+++ b/src/model-server.py
@@ -20,6 +20,7 @@ import faiss
 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer
+from bs4 import BeautifulSoup

 def parse_args():
    parser = argparse.ArgumentParser(description="AI is Really Cool Server")
@@ -37,6 +38,16 @@ def setup_logging(level):
    logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info(f"Logging is set to {level} level.")

+def extract_text_from_html_or_xml(content, is_xml=False):
+    # Parse the content
+    if is_xml:
+        soup = BeautifulSoup(content, 'xml')  # Use 'xml' parser for XML content
+    else:
+        soup = BeautifulSoup(content, 'html.parser')  # Default to 'html.parser' for HTML content
+
+    # Extract and return just the text
+    return soup.get_text()
+
 class Feed():
    def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
        self.name = name
@@ -67,6 +78,7 @@ class Feed():
                    content += f"Link: {link}\n"
                summary = entry.get("summary")
                if summary:
+                    summary = extract_text_from_html_or_xml(summary, False)
                    content += f"Summary: {summary}\n"
                published = entry.get("published")
                if published:
@@ -166,7 +178,7 @@ class Chat():
                prompt, 
                add_special_tokens=False,
                return_tensors="pt", 
-                max_length=8000,            # Prevent 'Asking to truncate to max_length...'
+                max_length=7999,            # Prevent 'Asking to truncate to max_length...'
                padding=True,               # Handles padding automatically
                truncation=True
            )