Working to reduce RSS context length by parsing out HTML
This commit is contained in:
parent
872e74efbb
commit
87c2469c65
@ -340,7 +340,7 @@ WORKDIR /opt/airc
|
||||
SHELL [ "/opt/airc/shell" ]
|
||||
|
||||
# Needed by src/model-server.py
|
||||
RUN pip install faiss-cpu sentence_transformers feedparser
|
||||
RUN pip install faiss-cpu sentence_transformers feedparser bs4
|
||||
|
||||
SHELL [ "/bin/bash", "-c" ]
|
||||
|
||||
|
11
src/airc.py
11
src/airc.py
@ -118,11 +118,18 @@ class AIRC(pydle.Client):
|
||||
logging.info("Burst limit reset due to inactivity.")
|
||||
|
||||
async def message(self, target, message):
    """Queue a multi-line message for sending, one line per queue entry.

    The text is split with ``str.splitlines()`` (so both LF and CRLF line
    endings are handled). Blank or whitespace-only lines are dropped. At most
    10 of the remaining lines are put on ``self._message_queue`` as
    ``(target, line)`` tuples; if any non-blank lines are left over, a single
    "[additional content truncated]" notice is queued after them.

    Args:
        target: Destination passed through unchanged in each queued tuple.
        message: Text to send; may contain multiple lines.
    """
    max_lines = 10

    # Filter before capping: blank lines must not eat into the line budget,
    # and trailing blank lines alone must not trigger the truncation notice.
    visible_lines = [line for line in message.splitlines() if line.strip()]

    for line in visible_lines[:max_lines]:
        await self._message_queue.put((target, line))

    # Only announce truncation when real content was actually dropped.
    if len(visible_lines) > max_lines:
        await self._message_queue.put((target, "[additional content truncated]"))
|
||||
|
||||
async def on_connect(self):
|
||||
logging.debug('on_connect')
|
||||
await self.join(self.channel)
|
||||
|
@ -20,6 +20,7 @@ import faiss
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="AI is Really Cool Server")
|
||||
@ -37,6 +38,16 @@ def setup_logging(level):
|
||||
logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logging.info(f"Logging is set to {level} level.")
|
||||
|
||||
def extract_text_from_html_or_xml(content, is_xml=False):
    """Return the plain text of an HTML or XML fragment with markup removed.

    Args:
        content: Markup to parse.
        is_xml: When true, parse with BeautifulSoup's 'xml' feature;
            otherwise use the built-in 'html.parser'.

    Returns:
        The text produced by ``BeautifulSoup.get_text()`` for the parsed
        document.
    """
    # NOTE(review): BeautifulSoup's 'xml' feature is backed by lxml — confirm
    # lxml is installed wherever is_xml=True is used.
    parser_name = 'xml' if is_xml else 'html.parser'
    return BeautifulSoup(content, parser_name).get_text()
|
||||
|
||||
class Feed():
|
||||
def __init__(self, name, url, poll_limit_min = 30, max_articles=5):
|
||||
self.name = name
|
||||
@ -67,6 +78,7 @@ class Feed():
|
||||
content += f"Link: {link}\n"
|
||||
summary = entry.get("summary")
|
||||
if summary:
|
||||
summary = extract_text_from_html_or_xml(summary, False)
|
||||
content += f"Summary: {summary}\n"
|
||||
published = entry.get("published")
|
||||
if published:
|
||||
@ -166,7 +178,7 @@ class Chat():
|
||||
prompt,
|
||||
add_special_tokens=False,
|
||||
return_tensors="pt",
|
||||
max_length=8000, # Prevent 'Asking to truncate to max_length...'
|
||||
max_length=7999, # Prevent 'Asking to truncate to max_length...'
|
||||
padding=True, # Handles padding automatically
|
||||
truncation=True
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user