From 581bc4a575af6fab5938953f3ddb47114ad46915 Mon Sep 17 00:00:00 2001 From: James Ketrenos Date: Sat, 26 Apr 2025 23:24:28 -0700 Subject: [PATCH] Updated system prompts to use better syntax for qwen Fixed markitdown integration --- Dockerfile | 1 + docker-compose.yml | 2 +- frontend/src/VectorVisualizer.tsx | 2 + src/server.py | 140 ++++++++++++++---------------- src/utils/rag.py | 41 ++++++++- 5 files changed, 109 insertions(+), 77 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5abee11..b7516ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -257,6 +257,7 @@ FROM llm-base AS backstory COPY /src/requirements.txt /opt/backstory/src/requirements.txt RUN pip install -r /opt/backstory/src/requirements.txt +RUN pip install 'markitdown[all]' SHELL [ "/bin/bash", "-c" ] diff --git a/docker-compose.yml b/docker-compose.yml index 81c52eb..9f96b25 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,7 +26,7 @@ services: - ./sessions:/opt/backstory/sessions:rw # Persist sessions - ./chromadb:/opt/backstory/chromadb:rw # Persist ChromaDB - ./dev-keys:/opt/backstory/keys:ro # Developer keys - - ./docs:/opt/backstory/docs:ro # Live mount of RAG content + - ./docs:/opt/backstory/docs:rw # Live mount of RAG content - ./src:/opt/backstory/src:rw # Live mount server src - ./frontend:/opt/backstory/frontend:rw # Live mount frontend src cap_add: # used for running ze-monitor within container diff --git a/frontend/src/VectorVisualizer.tsx b/frontend/src/VectorVisualizer.tsx index 24fdf35..ab3e7a8 100644 --- a/frontend/src/VectorVisualizer.tsx +++ b/frontend/src/VectorVisualizer.tsx @@ -82,6 +82,7 @@ const emojiMap: Record = { query: '🔍', resume: '📄', projects: '📁', + 'performance-reviews': '📄', news: '📰', }; @@ -90,6 +91,7 @@ const colorMap: Record = { resume: '#4A7A7D', // Dusty Teal — secondary theme color projects: '#1A2536', // Midnight Blue — rich and deep news: '#D3CDBF', // Warm Gray — soft and neutral + 'performance-reviews': '#FF0000', // Bright red }; const 
sizeMap: Record = { diff --git a/src/server.py b/src/server.py index 42309de..67784f3 100644 --- a/src/server.py +++ b/src/server.py @@ -139,78 +139,68 @@ DEFAULT_HISTORY_LENGTH=5 # %% # Globals -NAME = "James Ketrenos" - -resume_intro = f""" -As an AI/ML professional specializing in creating custom solutions to new problem domains, {NAME} developed a custom -language model applications that streamline information processing and content generation. This tailored resume -was created using a Retrieval-Augmented Generation system I built to efficiently match my relevant experience -with your specific needs—demonstrating both my technical capabilities and commitment to intelligent resource -optimization. -""" system_message = f""" Launched on {DateTime()}. When answering queries, follow these steps: -1. First analyze the query to determine if real-time information might be helpful -2. Even when [INFO] is provided, consider whether the tools would provide more current or comprehensive information -3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available -4. When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️ -4. When both [INFO] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer -5. Always prioritize the most up-to-date and relevant information, whether it comes from [INFO] or tools -6. If [INFO] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data -7. 
If there is information in the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '[INFO]' (etc.) or quoting it directly. -8. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags. +- First analyze the query to determine if real-time information might be helpful +- Even when <|context|> is provided, consider whether the tools would provide more current or comprehensive information +- Use the provided tools whenever they would enhance your response, regardless of whether context is also available +- When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️ +- When both <|context|> and tool outputs are relevant, synthesize information from both sources to provide the most complete answer +- Always prioritize the most up-to-date and relevant information, whether it comes from <|context|> or tools +- If <|context|> and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data +- If there is information in the <|context|>, <|job_description|>, or <|work_history|> sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '<|context|>' (etc.) or quoting it directly. +- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags. -Always use tools and [INFO] when possible. Be concise, and never make up information. If you do not know the answer, say so. -""".strip() +Always use tools and <|context|> when possible. 
Be concise, and never make up information. If you do not know the answer, say so. +""" system_generate_resume = f""" Launched on {DateTime()}. -You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's [WORK HISTORY]. +You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's <|work_history|>. When answering queries, follow these steps: -1. You must not invent or assume any inforation not explicitly present in the [WORK HISTORY]. -2. Analyze the [JOB DESCRIPTION] to identify skills required for the job. -3. Use the [JOB DESCRIPTION] provided to guide the focus, tone, and relevant skills or experience to highlight from the [WORK HISTORY]. -4. Identify and emphasize the experiences, achievements, and responsibilities from the [WORK HISTORY] that best align with the [JOB DESCRIPTION]. -5. Only provide information from [WORK HISTORY] items if it is relevant to the [JOB DESCRIPTION]. -6. Do not use the [JOB DESCRIPTION] skills unless listed in [WORK HISTORY]. -7. Do not include any information unless it is provided in [WORK HISTORY] or [INTRO]. -8. Use the [INTRO] to highlight the use of AI in generating this resume. -9. Use the [WORK HISTORY] to create a polished, professional resume. -10. Do not list any locations or mailing addresses in the resume. -11. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. -12. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags. +- You must not invent or assume any inforation not explicitly present in the <|work_history|>. 
+- Analyze the <|job_description|> to identify skills required for the job. +- Use the <|job_description|> provided to guide the focus, tone, and relevant skills or experience to highlight from the <|work_history|>. +- Identify and emphasize the experiences, achievements, and responsibilities from the <|work_history|> that best align with the <|job_description|>. +- Only provide information from <|work_history|> items if it is relevant to the <|job_description|>. +- Do not use the <|job_description|> skills unless listed in <|work_history|>. +- Do not include any information unless it is provided in <|work_history|>. +- Use the <|work_history|> to create a polished, professional resume. +- Do not list any locations or mailing addresses in the resume. +- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly. +- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags. +- Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes. Structure the resume professionally with the following sections where applicable: -* "Name: Use full name." -* "Professional Summary: A 2-4 sentence overview tailored to the job, using [INTRO] to highlight the use of AI in generating this resume." -* "Skills: A bullet list of key skills derived from the work history and relevant to the job." -* Professional Experience: A detailed list of roles, achievements, and responsibilities from [WORK HISTORY] that relate to the [JOB DESCRIPTION]." +* Name: Use full name +* Professional Summary: A 2-4 sentence overview tailored to the job. +* Skills: A bullet list of key skills derived from the work history and relevant to the job. 
+* Professional Experience: A detailed list of roles, achievements, and responsibilities from <|work_history|> that relate to the <|job_description|>. * Education: Include only if available in the work history. +* Notes: Indicate the initial draft of the resume was generated using the Backstory application. -Do not include any information unless it is provided in [WORK HISTORY] or [INTRO]. -Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes. """.strip() system_fact_check = f""" Launched on {DateTime()}. -You are a professional resume fact checker. Your task is to identify any inaccuracies in the [RESUME] based on the individual's [WORK HISTORY]. +You are a professional resume fact checker. Your task is to identify any inaccuracies in the <|resume|> based on the individual's <|work_history|>. If there are inaccuracies, list them in a bullet point format. When answering queries, follow these steps: -1. You must not invent or assume any information not explicitly present in the [WORK HISTORY]. -2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY]. -3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. -4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags. +- You must not invent or assume any information not explicitly present in the <|work_history|>. +- Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>. 
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly. +- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags. """.strip() system_job_description = f""" @@ -219,10 +209,10 @@ Launched on {DateTime()}. You are a hiring and job placing specialist. Your task is to answers about a job description. When answering queries, follow these steps: -1. Analyze the [JOB DESCRIPTION] to provide insights for the asked question. -2. If any financial information is requested, be sure to account for inflation. -3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. -4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags. +- Analyze the <|job_description|> to provide insights for the asked question. +- If any financial information is requested, be sure to account for inflation. +- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly. +- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags. """.strip() def create_system_message(prompt): @@ -1088,12 +1078,15 @@ class WebServer: if rag_context: preamble = f""" -1. Respond to this query: {content} -2. 
If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. -3. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags. -[INFO] + +<|rules|> +- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly. +- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags. + +<|context|> {rag_context} -[/INFO] + +<|question|> Use that information to respond to:""" # Use the mode specific system_prompt instead of 'chat' @@ -1122,24 +1115,23 @@ Use that information to respond to:""" rag_context += f"{doc}\n" preamble = f""" -[INTRO] -{resume_intro} -[/INTRO] - -[WORK HISTORY] +<|work_history|> {rag_context} -[/WORK HISTORY] -[JOB DESCRIPTION] +<|job_description|> {content} -[/JOB DESCRIPTION] + """ - context["sessions"]["job_description"]["content_seed"] = preamble + "Use the above information to answer this query: " + context["sessions"]["job_description"]["content_seed"] = preamble + "<|question|>\nUse the above information to answer this query: " preamble += f""" -1. Use the above [INTRO] and [WORK HISTORY] to create the resume for the [JOB DESCRIPTION]. -2. Do not use content from the [JOB DESCRIPTION] in the response unless the [WORK HISTORY] mentions them. + +<|rules|> +1. Use the above <|<|work_history|> to create the resume for the <|job_description|>. +2. Do not use content from the <|job_description|> in the response unless the <|work_history|> mentions them. 
+ +<|question|> """ # Seed the history for job_description @@ -1185,25 +1177,25 @@ Use that information to respond to:""" rag_context += f"{doc}\n" preamble = f""" -[WORK HISTORY] +<|work_history|> {rag_context} -[/WORK HISTORY] -[RESUME] +<|resume|> {resume['content']} -[/RESUME] -Perform the following: -1. Do not invent or assume any information not explicitly present in the [WORK HISTORY]. -2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY]. +<|rules|> +1. Do not invent or assume any information not explicitly present in the <|work_history|>. +2. Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>. + +<|question|> """ context["sessions"]["resume"]["content_seed"] = f""" -[RESUME] +<|resume|> {resume["content"]} -[/RESUME] -Use the above [RESUME] to answer this query: +<|question|> +Use the above <|resume|> to answer this query: """ content = "Fact check the resume and report discrepancies." diff --git a/src/utils/rag.py b/src/utils/rag.py index b76eb7a..c7ef8ca 100644 --- a/src/utils/rag.py +++ b/src/utils/rag.py @@ -1,5 +1,6 @@ import os import glob +from pathlib import Path import time import hashlib import asyncio @@ -12,6 +13,7 @@ import asyncio import json import pickle import numpy as np +import re import chromadb import ollama @@ -21,6 +23,7 @@ from langchain.schema import Document from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler import umap +from markitdown import MarkItDown # Import your existing modules if __name__ == "__main__": @@ -49,6 +52,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler): self.chunk_overlap = chunk_overlap self.loop = loop + + self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins + #self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # Path for storing file hash state @@ -98,6 +104,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler): def umap_model_3d(self): 
return self._umap_model_3d + def _markitdown(self, document : str, markdown : Path): + logging.info(f'Converting {document} to {markdown}') + try: + result = self.md.convert(document) + markdown.write_text(result.text_content) + except Exception as e: + logging.error(f"Error convering via markdownit: {e}") + def _save_hash_state(self): """Save the current file hash state to disk.""" try: @@ -188,9 +202,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler): """Process a file update event.""" # Skip if already being processed if file_path in self.processing_files: + logging.info(f"{file_path} already in queue. Not adding.") return try: + logging.info(f"{file_path} not in queue. Adding.") self.processing_files.add(file_path) # Wait a moment to ensure the file write is complete @@ -203,11 +219,12 @@ class ChromaDBFileWatcher(FileSystemEventHandler): if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash: # File hasn't actually changed in content + logging.info(f"Hash has not changed for {file_path}") return # Update file hash self.file_hashes[file_path] = current_hash - + # Process and update the file in ChromaDB async with self.update_lock: await self._update_document_in_collection(file_path) @@ -420,6 +437,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler): asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop) logging.info(f"File deleted: {file_path}") + def on_moved(self, event): + """Handle move deletion events.""" + if event.is_directory: + return + + file_path = event.src_path + logging.info(f"TODO: on_moved: ${file_path}") + def _normalize_embeddings(self, embeddings): """Normalize the embeddings to unit length.""" # Handle both single vector and array of vectors @@ -435,12 +460,24 @@ class ChromaDBFileWatcher(FileSystemEventHandler): async def _update_document_in_collection(self, file_path): """Update a document in the ChromaDB collection.""" try: - logging.info(f"Updating document in 
collection: {file_path}") # Remove existing entries for this file existing_results = self.collection.get(where={"path": file_path}) if existing_results and 'ids' in existing_results and existing_results['ids']: self.collection.delete(ids=existing_results['ids']) + extensions = (".docx", ".xlsx", ".xls", ".pdf") + if file_path.endswith(extensions): + p = Path(file_path) + p_as_md = p.with_suffix(".md") + if p_as_md.exists(): + logging.info(f"newer: {p.stat().st_mtime > p_as_md.stat().st_mtime}") + + # If file_path.md doesn't exist or file_path is newer than file_path.md, + # fire off markitdown + if (not p_as_md.exists()) or (p.stat().st_mtime > p_as_md.stat().st_mtime): + self._markitdown(file_path, p_as_md) + return + # Create document object in LangChain format with open(file_path, "r", encoding="utf-8") as f: content = f.read()