From 581bc4a575af6fab5938953f3ddb47114ad46915 Mon Sep 17 00:00:00 2001
From: James Ketrenos <james_git@ketrenos.com>
Date: Sat, 26 Apr 2025 23:24:28 -0700
Subject: [PATCH] Updated system prompts to use better syntax for qwen

Fixed markitdown integration
---
 Dockerfile                        |   1 +
 docker-compose.yml                |   2 +-
 frontend/src/VectorVisualizer.tsx |   2 +
 src/server.py                     | 140 ++++++++++++++----------------
 src/utils/rag.py                  |  41 ++++++++-
 5 files changed, 109 insertions(+), 77 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 5abee11..b7516ac 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -257,6 +257,7 @@ FROM llm-base AS backstory
 
 COPY /src/requirements.txt /opt/backstory/src/requirements.txt
 RUN pip install -r /opt/backstory/src/requirements.txt
+RUN pip install 'markitdown[all]'
 
 SHELL [ "/bin/bash", "-c" ]
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 81c52eb..9f96b25 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -26,7 +26,7 @@ services:
       - ./sessions:/opt/backstory/sessions:rw    # Persist sessions
       - ./chromadb:/opt/backstory/chromadb:rw    # Persist ChromaDB
       - ./dev-keys:/opt/backstory/keys:ro        # Developer keys
-      - ./docs:/opt/backstory/docs:ro            # Live mount of RAG content
+      - ./docs:/opt/backstory/docs:rw            # Live mount of RAG content
       - ./src:/opt/backstory/src:rw              # Live mount server src 
       - ./frontend:/opt/backstory/frontend:rw    # Live mount frontend src 
     cap_add: # used for running ze-monitor within container
diff --git a/frontend/src/VectorVisualizer.tsx b/frontend/src/VectorVisualizer.tsx
index 24fdf35..ab3e7a8 100644
--- a/frontend/src/VectorVisualizer.tsx
+++ b/frontend/src/VectorVisualizer.tsx
@@ -82,6 +82,7 @@ const emojiMap: Record<string, string> = {
   query: '🔍',
   resume: '📄',
   projects: '📁',
+  'performance-reviews': '📄',
   news: '📰',
 };
 
@@ -90,6 +91,7 @@ const colorMap: Record<string, string> = {
   resume: '#4A7A7D',    // Dusty Teal — secondary theme color
   projects: '#1A2536',  // Midnight Blue — rich and deep
   news: '#D3CDBF',      // Warm Gray — soft and neutral
+  'performance-reviews': '#FF0000', // Bright red
 };
 
 const sizeMap: Record<string, number> = {
diff --git a/src/server.py b/src/server.py
index 42309de..67784f3 100644
--- a/src/server.py
+++ b/src/server.py
@@ -139,78 +139,68 @@ DEFAULT_HISTORY_LENGTH=5
 
 # %%
 # Globals
-NAME = "James Ketrenos"
-
-resume_intro = f"""
-As an AI/ML professional specializing in creating custom solutions to new problem domains, {NAME} developed a custom 
-language model applications that streamline information processing and content generation. This tailored resume 
-was created using a Retrieval-Augmented Generation system I built to efficiently match my relevant experience 
-with your specific needs—demonstrating both my technical capabilities and commitment to intelligent resource 
-optimization.
-"""
 
 system_message = f"""
 Launched on {DateTime()}.
 
 When answering queries, follow these steps:
 
-1. First analyze the query to determine if real-time information might be helpful
-2. Even when [INFO] is provided, consider whether the tools would provide more current or comprehensive information
-3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
-4. When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
-4. When both [INFO] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
-5. Always prioritize the most up-to-date and relevant information, whether it comes from [INFO] or tools
-6. If [INFO] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
-7. If there is information in the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '[INFO]' (etc.) or quoting it directly.
-8. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
+- First analyze the query to determine if real-time information might be helpful
+- Even when <|context|> is provided, consider whether the tools would provide more current or comprehensive information
+- Use the provided tools whenever they would enhance your response, regardless of whether context is also available
+- When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers\", say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
+- When both <|context|> and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
+- Always prioritize the most up-to-date and relevant information, whether it comes from <|context|> or tools
+- If <|context|> and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
+- If there is information in the <|context|>, <|job_description|>, or <|work_history|> sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '<|context|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
 
-Always use tools and [INFO] when possible. Be concise, and never make up information. If you do not know the answer, say so.
-""".strip()
+Always use tools and <|context|> when possible. Be concise, and never make up information. If you do not know the answer, say so.
+"""
 
 system_generate_resume = f"""
 Launched on {DateTime()}.
 
-You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's [WORK HISTORY].
+You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's <|work_history|>.
 
 When answering queries, follow these steps:
 
-1. You must not invent or assume any inforation not explicitly present in the [WORK HISTORY].
-2. Analyze the [JOB DESCRIPTION] to identify skills required for the job.
-3. Use the [JOB DESCRIPTION] provided to guide the focus, tone, and relevant skills or experience to highlight from the [WORK HISTORY]. 
-4. Identify and emphasize the experiences, achievements, and responsibilities from the [WORK HISTORY] that best align with the [JOB DESCRIPTION].
-5. Only provide information from [WORK HISTORY] items if it is relevant to the [JOB DESCRIPTION].
-6. Do not use the [JOB DESCRIPTION] skills unless listed in [WORK HISTORY].
-7. Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
-8. Use the [INTRO] to highlight the use of AI in generating this resume.
-9. Use the [WORK HISTORY] to create a polished, professional resume.
-10. Do not list any locations or mailing addresses in the resume.
-11. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-12. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
+- You must not invent or assume any information not explicitly present in the <|work_history|>.
+- Analyze the <|job_description|> to identify skills required for the job.
+- Use the <|job_description|> provided to guide the focus, tone, and relevant skills or experience to highlight from the <|work_history|>. 
+- Identify and emphasize the experiences, achievements, and responsibilities from the <|work_history|> that best align with the <|job_description|>.
+- Only provide information from <|work_history|> items if it is relevant to the <|job_description|>.
+- Do not use the <|job_description|> skills unless listed in <|work_history|>.
+- Do not include any information unless it is provided in <|work_history|>.
+- Use the <|work_history|> to create a polished, professional resume.
+- Do not list any locations or mailing addresses in the resume.
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
+- Ensure the language is clear, concise, and aligned with industry standards for professional resumes.
 
 Structure the resume professionally with the following sections where applicable:
 
-* "Name: Use full name."
-* "Professional Summary: A 2-4 sentence overview tailored to the job, using [INTRO] to highlight the use of AI in generating this resume."
-* "Skills: A bullet list of key skills derived from the work history and relevant to the job."
-* Professional Experience: A detailed list of roles, achievements, and responsibilities from [WORK HISTORY] that relate to the [JOB DESCRIPTION]."
+* Name: Use full name
+* Professional Summary: A 2-4 sentence overview tailored to the job.
+* Skills: A bullet list of key skills derived from the work history and relevant to the job.
+* Professional Experience: A detailed list of roles, achievements, and responsibilities from <|work_history|> that relate to the <|job_description|>.
 * Education: Include only if available in the work history.
+* Notes: Indicate the initial draft of the resume was generated using the Backstory application.
 
-Do not include any information unless it is provided in [WORK HISTORY] or [INTRO]. 
-Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes.
 """.strip()
 
 system_fact_check = f"""
 Launched on {DateTime()}.
 
-You are a professional resume fact checker. Your task is to identify any inaccuracies in the [RESUME] based on the individual's [WORK HISTORY].
+You are a professional resume fact checker. Your task is to identify any inaccuracies in the <|resume|> based on the individual's <|work_history|>.
 
 If there are inaccuracies, list them in a bullet point format.
 
 When answering queries, follow these steps:
-1. You must not invent or assume any information not explicitly present in the [WORK HISTORY].
-2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
-3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
+- You must not invent or assume any information not explicitly present in the <|work_history|>.
+- Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
 """.strip()
 
 system_job_description = f"""
@@ -219,10 +209,10 @@ Launched on {DateTime()}.
 You are a hiring and job placing specialist. Your task is to answers about a job description.
 
 When answering queries, follow these steps:
-1. Analyze the [JOB DESCRIPTION] to provide insights for the asked question.
-2. If any financial information is requested, be sure to account for inflation.
-3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
+- Analyze the <|job_description|> to provide insights for the asked question.
+- If any financial information is requested, be sure to account for inflation.
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
 """.strip()
 
 def create_system_message(prompt):
@@ -1088,12 +1078,15 @@ class WebServer:
 
                     if rag_context:
                         preamble = f"""
-1. Respond to this query: {content}
-2. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-3. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
-[INFO]
+
+<|rules|>
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
+
+<|context|>
 {rag_context}
-[/INFO]
+
+<|question|>
 Use that information to respond to:"""
 
                     # Use the mode specific system_prompt instead of 'chat'
@@ -1122,24 +1115,23 @@ Use that information to respond to:"""
                                 rag_context += f"{doc}\n"
 
                     preamble = f"""
-[INTRO]
-{resume_intro}
-[/INTRO]
-
-[WORK HISTORY]
+<|work_history|>
 {rag_context}
-[/WORK HISTORY]
 
-[JOB DESCRIPTION]
+<|job_description|>
 {content}
-[/JOB DESCRIPTION]
+
 """
                     
-                    context["sessions"]["job_description"]["content_seed"] = preamble + "Use the above information to answer this query: "
+                    context["sessions"]["job_description"]["content_seed"] = preamble + "<|question|>\nUse the above information to answer this query: "
 
                     preamble += f"""
-1. Use the above [INTRO] and [WORK HISTORY] to create the resume for the [JOB DESCRIPTION]. 
-2. Do not use content from the [JOB DESCRIPTION] in the response unless the [WORK HISTORY] mentions them.
+
+<|rules|>          
+1. Use the above <|work_history|> to create the resume for the <|job_description|>. 
+2. Do not use content from the <|job_description|> in the response unless the <|work_history|> mentions them.
+
+<|question|>
 """
 
                     # Seed the history for job_description
@@ -1185,25 +1177,25 @@ Use that information to respond to:"""
                                 rag_context += f"{doc}\n"
 
                     preamble = f"""
-[WORK HISTORY]
+<|work_history|>
 {rag_context}
-[/WORK HISTORY]
 
-[RESUME]
+<|resume|>
 {resume['content']}
-[/RESUME]
 
-Perform the following:
-1. Do not invent or assume any information not explicitly present in the [WORK HISTORY].
-2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
+<|rules|>
+1. Do not invent or assume any information not explicitly present in the <|work_history|>.
+2. Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
+
+<|question|>
 """
                     
                     context["sessions"]["resume"]["content_seed"] = f"""
-[RESUME]
+<|resume|>
 {resume["content"]}
-[/RESUME]
 
-Use the above [RESUME] to answer this query:
+<|question|>
+Use the above <|resume|> to answer this query:
 """
 
                     content = "Fact check the resume and report discrepancies."
diff --git a/src/utils/rag.py b/src/utils/rag.py
index b76eb7a..c7ef8ca 100644
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -1,5 +1,6 @@
 import os
 import glob
+from pathlib import Path
 import time
 import hashlib
 import asyncio
@@ -12,6 +13,7 @@ import asyncio
 import json
 import pickle
 import numpy as np
+import re
 
 import chromadb
 import ollama
@@ -21,6 +23,7 @@ from langchain.schema import Document
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 import umap
+from markitdown import MarkItDown
 
 # Import your existing modules
 if __name__ == "__main__":
@@ -49,6 +52,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self.chunk_overlap = chunk_overlap
         self.loop = loop
         
+
+        self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
+
         #self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
         # Path for storing file hash state
@@ -98,6 +104,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
     def umap_model_3d(self):
         return self._umap_model_3d
 
+    def _markitdown(self, document : str, markdown : Path):
+        logging.info(f'Converting {document} to {markdown}')
+        try:
+            result = self.md.convert(document)
+            markdown.write_text(result.text_content)
+        except Exception as e:
+            logging.error(f"Error converting via markitdown: {e}")
+
     def _save_hash_state(self):
         """Save the current file hash state to disk."""
         try:
@@ -188,9 +202,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         """Process a file update event."""
         # Skip if already being processed
         if file_path in self.processing_files:
+            logging.info(f"{file_path} already in queue. Not adding.")
             return
             
         try:
+            logging.info(f"{file_path} not in queue. Adding.")
             self.processing_files.add(file_path)
             
             # Wait a moment to ensure the file write is complete
@@ -203,11 +219,12 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
                 
             if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash:
                 # File hasn't actually changed in content
+                logging.info(f"Hash has not changed for {file_path}")
                 return
             
             # Update file hash
             self.file_hashes[file_path] = current_hash
-            
+
             # Process and update the file in ChromaDB
             async with self.update_lock:
                 await self._update_document_in_collection(file_path)
@@ -420,6 +437,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
         logging.info(f"File deleted: {file_path}")
 
+    def on_moved(self, event):
+        """Handle file move events."""
+        if event.is_directory:
+            return
+        
+        file_path = event.src_path
+        logging.info(f"TODO: on_moved: {file_path}")
+
     def _normalize_embeddings(self, embeddings):
         """Normalize the embeddings to unit length."""
         # Handle both single vector and array of vectors
@@ -435,12 +460,24 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
     async def _update_document_in_collection(self, file_path):
         """Update a document in the ChromaDB collection."""
         try:
-            logging.info(f"Updating document in collection: {file_path}")
             # Remove existing entries for this file
             existing_results = self.collection.get(where={"path": file_path})
             if existing_results and 'ids' in existing_results and existing_results['ids']:
                 self.collection.delete(ids=existing_results['ids'])
             
+            extensions = (".docx", ".xlsx", ".xls", ".pdf")
+            if file_path.endswith(extensions):
+                p = Path(file_path)
+                p_as_md = p.with_suffix(".md")
+                if p_as_md.exists():
+                    logging.info(f"newer: {p.stat().st_mtime > p_as_md.stat().st_mtime}")
+
+                # If the sibling .md file doesn't exist or file_path is newer than it,
+                # (re)generate it with markitdown
+                if (not p_as_md.exists()) or (p.stat().st_mtime > p_as_md.stat().st_mtime):
+                    self._markitdown(file_path, p_as_md)
+                return
+
             # Create document object in LangChain format
             with open(file_path, "r", encoding="utf-8") as f:
                 content = f.read()