Updated system prompts to use better syntax for qwen

Fixed markitdown integration
2025-04-26 23:24:28 -07:00 · 2025-04-26 23:24:28 -07:00 · 581bc4a575
commit 581bc4a575
parent 7672e639f6
5 changed files with 109 additions and 77 deletions
--- a/1
+++ b/1
@ -257,6 +257,7 @@ FROM llm-base AS backstory

 COPY /src/requirements.txt /opt/backstory/src/requirements.txt
 RUN pip install -r /opt/backstory/src/requirements.txt
+RUN pip install 'markitdown[all]'

 SHELL [ "/bin/bash", "-c" ]

--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -26,7 +26,7 @@ services:
      - ./sessions:/opt/backstory/sessions:rw    # Persist sessions
      - ./chromadb:/opt/backstory/chromadb:rw    # Persist ChromaDB
      - ./dev-keys:/opt/backstory/keys:ro        # Developer keys
-      - ./docs:/opt/backstory/docs:ro            # Live mount of RAG content
+      - ./docs:/opt/backstory/docs:rw            # Live mount of RAG content
      - ./src:/opt/backstory/src:rw              # Live mount server src 
      - ./frontend:/opt/backstory/frontend:rw    # Live mount frontend src 
    cap_add: # used for running ze-monitor within container
--- a/frontend/src/VectorVisualizer.tsx
+++ b/frontend/src/VectorVisualizer.tsx
@ -82,6 +82,7 @@ const emojiMap: Record<string, string> = {
  query: '🔍',
  resume: '📄',
  projects: '📁',
+  'performance-reviews': '📄',
  news: '📰',
 };

@ -90,6 +91,7 @@ const colorMap: Record<string, string> = {
  resume: '#4A7A7D',    // Dusty Teal — secondary theme color
  projects: '#1A2536',  // Midnight Blue — rich and deep
  news: '#D3CDBF',      // Warm Gray — soft and neutral
+  'performance-reviews': '#FF0000', // Bright red
 };

 const sizeMap: Record<string, number> = {
--- a/src/server.py
+++ b/src/server.py
@ -139,78 +139,68 @@ DEFAULT_HISTORY_LENGTH=5

 # %%
 # Globals
-NAME = "James Ketrenos"
-
-resume_intro = f"""
-As an AI/ML professional specializing in creating custom solutions to new problem domains, {NAME} developed a custom 
-language model applications that streamline information processing and content generation. This tailored resume 
-was created using a Retrieval-Augmented Generation system I built to efficiently match my relevant experience 
-with your specific needs—demonstrating both my technical capabilities and commitment to intelligent resource 
-optimization.
-"""

 system_message = f"""
 Launched on {DateTime()}.

 When answering queries, follow these steps:

-1. First analyze the query to determine if real-time information might be helpful
-2. Even when [INFO] is provided, consider whether the tools would provide more current or comprehensive information
-3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
-4. When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
-4. When both [INFO] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
-5. Always prioritize the most up-to-date and relevant information, whether it comes from [INFO] or tools
-6. If [INFO] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
-7. If there is information in the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '[INFO]' (etc.) or quoting it directly.
-8. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
+- First analyze the query to determine if real-time information might be helpful
+- Even when <|context|> is provided, consider whether the tools would provide more current or comprehensive information
+- Use the provided tools whenever they would enhance your response, regardless of whether context is also available
+- When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers\", say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
+- When both <|context|> and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
+- Always prioritize the most up-to-date and relevant information, whether it comes from <|context|> or tools
+- If <|context|> and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
+- If there is information in the <|context|>, <|job_description|>, or <|work_history|> sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '<|context|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.

-Always use tools and [INFO] when possible. Be concise, and never make up information. If you do not know the answer, say so.
-""".strip()
+Always use tools and <|context|> when possible. Be concise, and never make up information. If you do not know the answer, say so.
+"""

 system_generate_resume = f"""
 Launched on {DateTime()}.

-You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's [WORK HISTORY].
+You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's <|work_history|>.

 When answering queries, follow these steps:

-1. You must not invent or assume any inforation not explicitly present in the [WORK HISTORY].
-2. Analyze the [JOB DESCRIPTION] to identify skills required for the job.
-3. Use the [JOB DESCRIPTION] provided to guide the focus, tone, and relevant skills or experience to highlight from the [WORK HISTORY]. 
-4. Identify and emphasize the experiences, achievements, and responsibilities from the [WORK HISTORY] that best align with the [JOB DESCRIPTION].
-5. Only provide information from [WORK HISTORY] items if it is relevant to the [JOB DESCRIPTION].
-6. Do not use the [JOB DESCRIPTION] skills unless listed in [WORK HISTORY].
-7. Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
-8. Use the [INTRO] to highlight the use of AI in generating this resume.
-9. Use the [WORK HISTORY] to create a polished, professional resume.
-10. Do not list any locations or mailing addresses in the resume.
-11. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-12. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
+- You must not invent or assume any information not explicitly present in the <|work_history|>.
+- Analyze the <|job_description|> to identify skills required for the job.
+- Use the <|job_description|> provided to guide the focus, tone, and relevant skills or experience to highlight from the <|work_history|>. 
+- Identify and emphasize the experiences, achievements, and responsibilities from the <|work_history|> that best align with the <|job_description|>.
+- Only provide information from <|work_history|> items if it is relevant to the <|job_description|>.
+- Do not use the <|job_description|> skills unless listed in <|work_history|>.
+- Do not include any information unless it is provided in <|work_history|>.
+- Use the <|work_history|> to create a polished, professional resume.
+- Do not list any locations or mailing addresses in the resume.
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
+- Ensure the language is clear, concise, and aligned with industry standards for professional resumes.

 Structure the resume professionally with the following sections where applicable:

-* "Name: Use full name."
-* "Professional Summary: A 2-4 sentence overview tailored to the job, using [INTRO] to highlight the use of AI in generating this resume."
-* "Skills: A bullet list of key skills derived from the work history and relevant to the job."
-* Professional Experience: A detailed list of roles, achievements, and responsibilities from [WORK HISTORY] that relate to the [JOB DESCRIPTION]."
+* Name: Use full name
+* Professional Summary: A 2-4 sentence overview tailored to the job.
+* Skills: A bullet list of key skills derived from the work history and relevant to the job.
+* Professional Experience: A detailed list of roles, achievements, and responsibilities from <|work_history|> that relate to the <|job_description|>.
 * Education: Include only if available in the work history.
+* Notes: Indicate the initial draft of the resume was generated using the Backstory application.

-Do not include any information unless it is provided in [WORK HISTORY] or [INTRO]. 
-Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes.
 """.strip()

 system_fact_check = f"""
 Launched on {DateTime()}.

-You are a professional resume fact checker. Your task is to identify any inaccuracies in the [RESUME] based on the individual's [WORK HISTORY].
+You are a professional resume fact checker. Your task is to identify any inaccuracies in the <|resume|> based on the individual's <|work_history|>.

 If there are inaccuracies, list them in a bullet point format.

 When answering queries, follow these steps:
-1. You must not invent or assume any information not explicitly present in the [WORK HISTORY].
-2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
-3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
+- You must not invent or assume any information not explicitly present in the <|work_history|>.
+- Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
 """.strip()

 system_job_description = f"""
@ -219,10 +209,10 @@ Launched on {DateTime()}.
 You are a hiring and job placing specialist. Your task is to answers about a job description.

 When answering queries, follow these steps:
-1. Analyze the [JOB DESCRIPTION] to provide insights for the asked question.
-2. If any financial information is requested, be sure to account for inflation.
-3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
+- Analyze the <|job_description|> to provide insights for the asked question.
+- If any financial information is requested, be sure to account for inflation.
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
 """.strip()

 def create_system_message(prompt):
@ -1088,12 +1078,15 @@ class WebServer:

                    if rag_context:
                        preamble = f"""
-1. Respond to this query: {content}
-2. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
-3. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
-[INFO]
+
+<|rules|>
+- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
+- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
+
+<|context|>
 {rag_context}
-[/INFO]
+
+<|question|>
 Use that information to respond to:"""

                    # Use the mode specific system_prompt instead of 'chat'
@ -1122,24 +1115,23 @@ Use that information to respond to:"""
                                rag_context += f"{doc}\n"

                    preamble = f"""
-[INTRO]
-{resume_intro}
-[/INTRO]
-
-[WORK HISTORY]
+<|work_history|>
 {rag_context}
-[/WORK HISTORY]

-[JOB DESCRIPTION]
+<|job_description|>
 {content}
-[/JOB DESCRIPTION]
+
 """
                    
-                    context["sessions"]["job_description"]["content_seed"] = preamble + "Use the above information to answer this query: "
+                    context["sessions"]["job_description"]["content_seed"] = preamble + "<|question|>\nUse the above information to answer this query: "

                    preamble += f"""
-1. Use the above [INTRO] and [WORK HISTORY] to create the resume for the [JOB DESCRIPTION]. 
-2. Do not use content from the [JOB DESCRIPTION] in the response unless the [WORK HISTORY] mentions them.
+
+<|rules|>          
+1. Use the above <|work_history|> to create the resume for the <|job_description|>. 
+2. Do not use content from the <|job_description|> in the response unless the <|work_history|> mentions them.
+
+<|question|>
 """

                    # Seed the history for job_description
@ -1185,25 +1177,25 @@ Use that information to respond to:"""
                                rag_context += f"{doc}\n"

                    preamble = f"""
-[WORK HISTORY]
+<|work_history|>
 {rag_context}
-[/WORK HISTORY]

-[RESUME]
+<|resume|>
 {resume['content']}
-[/RESUME]

-Perform the following:
-1. Do not invent or assume any information not explicitly present in the [WORK HISTORY].
-2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
+<|rules|>
+1. Do not invent or assume any information not explicitly present in the <|work_history|>.
+2. Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
+
+<|question|>
 """
                    
                    context["sessions"]["resume"]["content_seed"] = f"""
-[RESUME]
+<|resume|>
 {resume["content"]}
-[/RESUME]

-Use the above [RESUME] to answer this query:
+<|question|>
+Use the above <|resume|> to answer this query:
 """

                    content = "Fact check the resume and report discrepancies."
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@ -1,5 +1,6 @@
 import os
 import glob
+from pathlib import Path
 import time
 import hashlib
 import asyncio
@ -12,6 +13,7 @@ import asyncio
 import json
 import pickle
 import numpy as np
+import re

 import chromadb
 import ollama
@ -21,6 +23,7 @@ from langchain.schema import Document
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 import umap
+from markitdown import MarkItDown

 # Import your existing modules
 if __name__ == "__main__":
@ -49,6 +52,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        self.chunk_overlap = chunk_overlap
        self.loop = loop
        
+
+        self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
+
        #self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Path for storing file hash state
@ -98,6 +104,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
    def umap_model_3d(self):
        return self._umap_model_3d

+    def _markitdown(self, document : str, markdown : Path):
+        """Convert `document` to Markdown via MarkItDown and write it to `markdown`.
+
+        Args:
+            document: Path of the source file handed to `self.md.convert`.
+            markdown: Destination path that receives the converted text.
+
+        Best-effort: any conversion or write failure is logged, not raised.
+        """
+        logging.info(f'Converting {document} to {markdown}')
+        try:
+            result = self.md.convert(document)
+            # Write UTF-8 explicitly rather than the locale default; this file
+            # opens documents for indexing with encoding="utf-8".
+            markdown.write_text(result.text_content, encoding="utf-8")
+        except Exception as e:
+            logging.error(f"Error converting {document} via markitdown: {e}")
+
    def _save_hash_state(self):
        """Save the current file hash state to disk."""
        try:
@ -188,9 +202,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        """Process a file update event."""
        # Skip if already being processed
        if file_path in self.processing_files:
+            logging.info(f"{file_path} already in queue. Not adding.")
            return
            
        try:
+            logging.info(f"{file_path} not in queue. Adding.")
            self.processing_files.add(file_path)
            
            # Wait a moment to ensure the file write is complete
@ -203,6 +219,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
                
            if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash:
                # File hasn't actually changed in content
+                logging.info(f"Hash has not changed for {file_path}")
                return
            
            # Update file hash
@ -420,6 +437,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
        logging.info(f"File deleted: {file_path}")

+    def on_moved(self, event):
+        """Handle file move events.
+
+        Directory events are ignored. Moves are not processed yet: the
+        handler only logs the source path of the moved file.
+        """
+        if event.is_directory:
+            return
+        
+        file_path = event.src_path
+        # Python f-strings interpolate with {name}; a leading `$` would be logged literally.
+        logging.info(f"TODO: on_moved: {file_path}")
+
    def _normalize_embeddings(self, embeddings):
        """Normalize the embeddings to unit length."""
        # Handle both single vector and array of vectors
@ -435,12 +460,24 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
    async def _update_document_in_collection(self, file_path):
        """Update a document in the ChromaDB collection."""
        try:
-            logging.info(f"Updating document in collection: {file_path}")
            # Remove existing entries for this file
            existing_results = self.collection.get(where={"path": file_path})
            if existing_results and 'ids' in existing_results and existing_results['ids']:
                self.collection.delete(ids=existing_results['ids'])
            
+            extensions = (".docx", ".xlsx", ".xls", ".pdf")
+            if file_path.endswith(extensions):
+                p = Path(file_path)
+                p_as_md = p.with_suffix(".md")
+                if p_as_md.exists():
+                    logging.info(f"newer: {p.stat().st_mtime > p_as_md.stat().st_mtime}")
+
+                # If file_path.md doesn't exist or file_path is newer than file_path.md,
+                # fire off markitdown
+                if (not p_as_md.exists()) or (p.stat().st_mtime > p_as_md.stat().st_mtime):
+                    self._markitdown(file_path, p_as_md)
+                return
+
            # Create document object in LangChain format
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()