import argparse
import datetime
import logging as log
import re

import chromadb
import feedparser
import ollama
from bs4 import BeautifulSoup

OLLAMA_API_URL = "http://ollama:11434"  # Ollama endpoint ("ollama" here is a Docker service hostname)
MODEL_NAME = "deepseek-r1:7b"


def parse_args():
    parser = argparse.ArgumentParser(description="AI is Really Cool")
    parser.add_argument("--nickname", type=str, default="airc",
                        help="Bot nickname")
    parser.add_argument("--level", type=str,
                        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                        default="INFO", help="Set the log level.")
    return parser.parse_args()


def setup_logging(level):
    numeric_level = getattr(log, level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level}")
    log.basicConfig(level=numeric_level,
                    format="%(asctime)s - %(levelname)s - %(message)s")
    log.info(f"Logging is set to {level} level.")


def extract_text_from_html_or_xml(content, is_xml=False):
    # Parse the content with the appropriate parser
    if is_xml:
        soup = BeautifulSoup(content, "xml")  # 'xml' parser (requires lxml) for XML content
    else:
        soup = BeautifulSoup(content, "html.parser")  # Default to 'html.parser' for HTML content
    # Extract and return just the text
    return soup.get_text()
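# A quick illustration of the helper above (the example input and output are
# illustrative, not from the source): get_text() drops the markup and keeps
# only the visible text, which is what gets embedded later.
#
#   extract_text_from_html_or_xml("<p>Breaking: <b>news</b> today</p>")
#   # -> "Breaking: news today"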
class Feed():
    def __init__(self, name, url, poll_limit_min=30, max_articles=5):
        self.name = name
        self.url = url
        self.poll_limit_min = datetime.timedelta(minutes=poll_limit_min)
        self.last_poll = None
        self.articles = []
        self.max_articles = max_articles
        self.update()

    def update(self):
        now = datetime.datetime.now()
        if self.last_poll is None or (now - self.last_poll) >= self.poll_limit_min:
            log.info(f"Updating {self.name}")
            feed = feedparser.parse(self.url)
            self.articles = []
            self.last_poll = now

            content = ""
            if len(feed.entries) > 0:
                content += f"Source: {self.name}\n"
                for entry in feed.entries[:self.max_articles]:
                    title = entry.get("title")
                    if title:
                        content += f"Title: {title}\n"
                    link = entry.get("link")
                    if link:
                        content += f"Link: {link}\n"
                    summary = entry.get("summary")
                    if summary:
                        summary = extract_text_from_html_or_xml(summary, False)
                        # Cap very long summaries so a single entry can't
                        # dominate the embedded document
                        if len(summary) > 1000:
                            summary = summary[:1000]
                        content += f"Summary: {summary}\n"
                    published = entry.get("published")
                    if published:
                        content += f"Published: {published}\n"
                    content += "\n"
                self.articles.append(content)
        else:
            remaining = self.poll_limit_min - (now - self.last_poll)
            log.info(f"Not updating {self.name} -- {remaining} remaining before refresh.")
        return self.articles


class Chat():
    def __init__(self, nick):
        super().__init__()
        self.nick = nick
        self.system_input = ("You are a critical assistant. Give concise and "
                             "accurate answers in less than 120 characters.")
        self.queries = 0
        self.processing = datetime.timedelta(minutes=0)

    def message(self, target, message):
        """Split a multi-line message and send each line separately.

        If there are more than 10 lines, truncate and add a notice.
        """
        lines = message.splitlines()  # Splits on both '\n' and '\r\n'
        # Process the first 10 lines
        for line in lines[:10]:
            if line.strip():  # Ignore empty lines
                print(f"{target}: {line}")
        # If there are more than 10 lines, add the truncation message
        if len(lines) > 10:
            print(f"{target}: [additional content truncated]")

    def remove_substring(self, string, substring):
        return string.replace(substring, "")

    def extract_nick_message(self, input_string):
        # Pattern with capturing groups for nick and message
        pattern = r"^\s*([^\s:]+?)\s*:\s*(.+?)$"
        match = re.match(pattern, input_string)
        if match:
            nick = match.group(1)     # First capturing group
            message = match.group(2)  # Second capturing group
            return nick, message
        return None, None  # Return None for both if no match

    def on_message(self, target, source, message):
        if source == self.nick:
            return
        nick, body = self.extract_nick_message(message)
        if nick == self.nick:
            content = None
            if body == "stats":
                content = f"{self.queries} queries handled in {self.processing}"
            else:
                self.queries += 1
                start = datetime.datetime.now()
                query_text = body
                # Embed the query. Note: the embeddings() response key is the
                # singular "embedding", not "embeddings"
                query_response = client.embeddings(model="mxbai-embed-large",
                                                   prompt=query_text)
                query_embedding = query_response["embedding"]
                # Run the query; query_embeddings must be a list containing
                # the embedding
                results = collection.query(
                    query_embeddings=[query_embedding],
                    n_results=3
                )
                data = results["documents"][0]
                output = client.generate(
                    model=MODEL_NAME,
                    system=(f"You are {self.nick} and only provide that "
                            "information about yourself. Make reference to the "
                            f"following and provide the 'Link' when available: {data}"),
                    prompt=f"Respond to this prompt: {query_text}",
                    stream=False
                )
                end = datetime.datetime.now()
                self.processing = self.processing + end - start
                # Prune off the <think>...</think> block that deepseek-r1
                # prepends to its answer
                content = re.sub(r"^<think>.*?</think>", "",
                                 output["response"], flags=re.DOTALL).strip()
            if content:
                log.info(f"Sending: {content}")
                self.message(target, content)


# Parse command-line arguments
args = parse_args()

# Setup logging based on the provided level
setup_logging(args.level)
log.info("About to start")

client = ollama.Client(host=OLLAMA_API_URL)

# News RSS Feeds
rss_feeds = [
    Feed(name="BBC World", url="http://feeds.bbci.co.uk/news/world/rss.xml"),
    Feed(name="Reuters World", url="http://feeds.reuters.com/Reuters/worldNews"),
    Feed(name="Al Jazeera", url="https://www.aljazeera.com/xml/rss/all.xml"),
    Feed(name="CNN World", url="http://rss.cnn.com/rss/edition_world.rss"),
    Feed(name="Time", url="https://time.com/feed/"),
    Feed(name="Euronews", url="https://www.euronews.com/rss"),
    Feed(name="FeedX", url="https://feedx.net/rss/ap.xml")
]

# Initialize ChromaDB Client. We want to save the collection to disk to
# analyze it offline, but we don't want to re-use it between runs. Note that
# chromadb.Client() is purely in-memory, so nothing is persisted here.
db = chromadb.Client()
collection = db.get_or_create_collection("docs")
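# A minimal sketch of the on-disk variant described above, assuming a local
# "./chroma" directory (the path and the fresh-collection handling are
# assumptions, not from the source):
#
#   db = chromadb.PersistentClient(path="./chroma")
#   try:
#       db.delete_collection("docs")  # drop the previous run's copy so it isn't re-used
#   except Exception:
#       pass  # first run: nothing to delete
#   collection = db.create_collection("docs")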
{len(article)}") response = client.embeddings(model="mxbai-embed-large", prompt=article) embeddings = response["embedding"] # Note: it's "embedding", not "embeddings" collection.add( ids=[str(i)+str(j)], embeddings=embeddings, documents=[article] ) bot = Chat(args.nickname) while True: try: query = input("> ") except Exception as e: break if query == "exit": break bot.on_message("chat", "user", f"airc: {query}")