Updated system prompts to use better syntax for qwen

Fixed markitdown integration
This commit is contained in:
James Ketr 2025-04-26 23:24:28 -07:00
parent 7672e639f6
commit 581bc4a575
5 changed files with 109 additions and 77 deletions

View File

@ -257,6 +257,7 @@ FROM llm-base AS backstory
COPY /src/requirements.txt /opt/backstory/src/requirements.txt COPY /src/requirements.txt /opt/backstory/src/requirements.txt
RUN pip install -r /opt/backstory/src/requirements.txt RUN pip install -r /opt/backstory/src/requirements.txt
RUN pip install 'markitdown[all]'
SHELL [ "/bin/bash", "-c" ] SHELL [ "/bin/bash", "-c" ]

View File

@ -26,7 +26,7 @@ services:
- ./sessions:/opt/backstory/sessions:rw # Persist sessions - ./sessions:/opt/backstory/sessions:rw # Persist sessions
- ./chromadb:/opt/backstory/chromadb:rw # Persist ChromaDB - ./chromadb:/opt/backstory/chromadb:rw # Persist ChromaDB
- ./dev-keys:/opt/backstory/keys:ro # Developer keys - ./dev-keys:/opt/backstory/keys:ro # Developer keys
- ./docs:/opt/backstory/docs:ro # Live mount of RAG content - ./docs:/opt/backstory/docs:rw # Live mount of RAG content
- ./src:/opt/backstory/src:rw # Live mount server src - ./src:/opt/backstory/src:rw # Live mount server src
- ./frontend:/opt/backstory/frontend:rw # Live mount frontend src - ./frontend:/opt/backstory/frontend:rw # Live mount frontend src
cap_add: # used for running ze-monitor within container cap_add: # used for running ze-monitor within container

View File

@ -82,6 +82,7 @@ const emojiMap: Record<string, string> = {
query: '🔍', query: '🔍',
resume: '📄', resume: '📄',
projects: '📁', projects: '📁',
'performance-reviews': '📄',
news: '📰', news: '📰',
}; };
@ -90,6 +91,7 @@ const colorMap: Record<string, string> = {
resume: '#4A7A7D', // Dusty Teal — secondary theme color resume: '#4A7A7D', // Dusty Teal — secondary theme color
projects: '#1A2536', // Midnight Blue — rich and deep projects: '#1A2536', // Midnight Blue — rich and deep
news: '#D3CDBF', // Warm Gray — soft and neutral news: '#D3CDBF', // Warm Gray — soft and neutral
'performance-reviews': '#FF0000', // Bright red
}; };
const sizeMap: Record<string, number> = { const sizeMap: Record<string, number> = {

View File

@ -139,78 +139,68 @@ DEFAULT_HISTORY_LENGTH=5
# %% # %%
# Globals # Globals
NAME = "James Ketrenos"
resume_intro = f"""
As an AI/ML professional specializing in creating custom solutions to new problem domains, {NAME} developed a custom
language model applications that streamline information processing and content generation. This tailored resume
was created using a Retrieval-Augmented Generation system I built to efficiently match my relevant experience
with your specific needsdemonstrating both my technical capabilities and commitment to intelligent resource
optimization.
"""
system_message = f""" system_message = f"""
Launched on {DateTime()}. Launched on {DateTime()}.
When answering queries, follow these steps: When answering queries, follow these steps:
1. First analyze the query to determine if real-time information might be helpful - First analyze the query to determine if real-time information might be helpful
2. Even when [INFO] is provided, consider whether the tools would provide more current or comprehensive information - Even when <|context|> is provided, consider whether the tools would provide more current or comprehensive information
3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available - Use the provided tools whenever they would enhance your response, regardless of whether context is also available
4. When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️ - When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
4. When both [INFO] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer - When both <|context|> and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
5. Always prioritize the most up-to-date and relevant information, whether it comes from [INFO] or tools - Always prioritize the most up-to-date and relevant information, whether it comes from <|context|> or tools
6. If [INFO] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data - If <|context|> and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
7. If there is information in the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '[INFO]' (etc.) or quoting it directly. - If there is information in the <|context|>, <|job_description|>, or <|work_history|> sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '<|context|>' (etc.) or quoting it directly.
8. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags. - Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
Always use tools and [INFO] when possible. Be concise, and never make up information. If you do not know the answer, say so. Always use tools and <|context|> when possible. Be concise, and never make up information. If you do not know the answer, say so.
""".strip() """
system_generate_resume = f""" system_generate_resume = f"""
Launched on {DateTime()}. Launched on {DateTime()}.
You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's [WORK HISTORY]. You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's <|work_history|>.
When answering queries, follow these steps: When answering queries, follow these steps:
1. You must not invent or assume any inforation not explicitly present in the [WORK HISTORY]. - You must not invent or assume any inforation not explicitly present in the <|work_history|>.
2. Analyze the [JOB DESCRIPTION] to identify skills required for the job. - Analyze the <|job_description|> to identify skills required for the job.
3. Use the [JOB DESCRIPTION] provided to guide the focus, tone, and relevant skills or experience to highlight from the [WORK HISTORY]. - Use the <|job_description|> provided to guide the focus, tone, and relevant skills or experience to highlight from the <|work_history|>.
4. Identify and emphasize the experiences, achievements, and responsibilities from the [WORK HISTORY] that best align with the [JOB DESCRIPTION]. - Identify and emphasize the experiences, achievements, and responsibilities from the <|work_history|> that best align with the <|job_description|>.
5. Only provide information from [WORK HISTORY] items if it is relevant to the [JOB DESCRIPTION]. - Only provide information from <|work_history|> items if it is relevant to the <|job_description|>.
6. Do not use the [JOB DESCRIPTION] skills unless listed in [WORK HISTORY]. - Do not use the <|job_description|> skills unless listed in <|work_history|>.
7. Do not include any information unless it is provided in [WORK HISTORY] or [INTRO]. - Do not include any information unless it is provided in <|work_history|>.
8. Use the [INTRO] to highlight the use of AI in generating this resume. - Use the <|work_history|> to create a polished, professional resume.
9. Use the [WORK HISTORY] to create a polished, professional resume. - Do not list any locations or mailing addresses in the resume.
10. Do not list any locations or mailing addresses in the resume. - If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
11. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. - Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
12. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags. - Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes.
Structure the resume professionally with the following sections where applicable: Structure the resume professionally with the following sections where applicable:
* "Name: Use full name." * Name: Use full name
* "Professional Summary: A 2-4 sentence overview tailored to the job, using [INTRO] to highlight the use of AI in generating this resume." * Professional Summary: A 2-4 sentence overview tailored to the job.
* "Skills: A bullet list of key skills derived from the work history and relevant to the job." * Skills: A bullet list of key skills derived from the work history and relevant to the job.
* Professional Experience: A detailed list of roles, achievements, and responsibilities from [WORK HISTORY] that relate to the [JOB DESCRIPTION]." * Professional Experience: A detailed list of roles, achievements, and responsibilities from <|work_history|> that relate to the <|job_description|>.
* Education: Include only if available in the work history. * Education: Include only if available in the work history.
* Notes: Indicate the initial draft of the resume was generated using the Backstory application.
Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes.
""".strip() """.strip()
system_fact_check = f""" system_fact_check = f"""
Launched on {DateTime()}. Launched on {DateTime()}.
You are a professional resume fact checker. Your task is to identify any inaccuracies in the [RESUME] based on the individual's [WORK HISTORY]. You are a professional resume fact checker. Your task is to identify any inaccuracies in the <|resume|> based on the individual's <|work_history|>.
If there are inaccuracies, list them in a bullet point format. If there are inaccuracies, list them in a bullet point format.
When answering queries, follow these steps: When answering queries, follow these steps:
1. You must not invent or assume any information not explicitly present in the [WORK HISTORY]. - You must not invent or assume any information not explicitly present in the <|work_history|>.
2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY]. - Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. - If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags. - Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
""".strip() """.strip()
system_job_description = f""" system_job_description = f"""
@ -219,10 +209,10 @@ Launched on {DateTime()}.
You are a hiring and job placing specialist. Your task is to answers about a job description. You are a hiring and job placing specialist. Your task is to answers about a job description.
When answering queries, follow these steps: When answering queries, follow these steps:
1. Analyze the [JOB DESCRIPTION] to provide insights for the asked question. - Analyze the <|job_description|> to provide insights for the asked question.
2. If any financial information is requested, be sure to account for inflation. - If any financial information is requested, be sure to account for inflation.
3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. - If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags. - Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
""".strip() """.strip()
def create_system_message(prompt): def create_system_message(prompt):
@ -1088,12 +1078,15 @@ class WebServer:
if rag_context: if rag_context:
preamble = f""" preamble = f"""
1. Respond to this query: {content}
2. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly. <|rules|>
3. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags. - If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
[INFO] - Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
<|context|>
{rag_context} {rag_context}
[/INFO]
<|question|>
Use that information to respond to:""" Use that information to respond to:"""
# Use the mode specific system_prompt instead of 'chat' # Use the mode specific system_prompt instead of 'chat'
@ -1122,24 +1115,23 @@ Use that information to respond to:"""
rag_context += f"{doc}\n" rag_context += f"{doc}\n"
preamble = f""" preamble = f"""
[INTRO] <|work_history|>
{resume_intro}
[/INTRO]
[WORK HISTORY]
{rag_context} {rag_context}
[/WORK HISTORY]
[JOB DESCRIPTION] <|job_description|>
{content} {content}
[/JOB DESCRIPTION]
""" """
context["sessions"]["job_description"]["content_seed"] = preamble + "Use the above information to answer this query: " context["sessions"]["job_description"]["content_seed"] = preamble + "<|question|>\nUse the above information to answer this query: "
preamble += f""" preamble += f"""
1. Use the above [INTRO] and [WORK HISTORY] to create the resume for the [JOB DESCRIPTION].
2. Do not use content from the [JOB DESCRIPTION] in the response unless the [WORK HISTORY] mentions them. <|rules|>
1. Use the above <|<|work_history|> to create the resume for the <|job_description|>.
2. Do not use content from the <|job_description|> in the response unless the <|work_history|> mentions them.
<|question|>
""" """
# Seed the history for job_description # Seed the history for job_description
@ -1185,25 +1177,25 @@ Use that information to respond to:"""
rag_context += f"{doc}\n" rag_context += f"{doc}\n"
preamble = f""" preamble = f"""
[WORK HISTORY] <|work_history|>
{rag_context} {rag_context}
[/WORK HISTORY]
[RESUME] <|resume|>
{resume['content']} {resume['content']}
[/RESUME]
Perform the following: <|rules|>
1. Do not invent or assume any information not explicitly present in the [WORK HISTORY]. 1. Do not invent or assume any information not explicitly present in the <|work_history|>.
2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY]. 2. Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
<|question|>
""" """
context["sessions"]["resume"]["content_seed"] = f""" context["sessions"]["resume"]["content_seed"] = f"""
[RESUME] <|resume|>
{resume["content"]} {resume["content"]}
[/RESUME]
Use the above [RESUME] to answer this query: <|question|>
Use the above <|resume|> to answer this query:
""" """
content = "Fact check the resume and report discrepancies." content = "Fact check the resume and report discrepancies."

View File

@ -1,5 +1,6 @@
import os import os
import glob import glob
from pathlib import Path
import time import time
import hashlib import hashlib
import asyncio import asyncio
@ -12,6 +13,7 @@ import asyncio
import json import json
import pickle import pickle
import numpy as np import numpy as np
import re
import chromadb import chromadb
import ollama import ollama
@ -21,6 +23,7 @@ from langchain.schema import Document
from watchdog.observers import Observer from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler from watchdog.events import FileSystemEventHandler
import umap import umap
from markitdown import MarkItDown
# Import your existing modules # Import your existing modules
if __name__ == "__main__": if __name__ == "__main__":
@ -49,6 +52,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self.chunk_overlap = chunk_overlap self.chunk_overlap = chunk_overlap
self.loop = loop self.loop = loop
self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
#self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') #self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Path for storing file hash state # Path for storing file hash state
@ -98,6 +104,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
def umap_model_3d(self): def umap_model_3d(self):
return self._umap_model_3d return self._umap_model_3d
def _markitdown(self, document : str, markdown : Path):
logging.info(f'Converting {document} to {markdown}')
try:
result = self.md.convert(document)
markdown.write_text(result.text_content)
except Exception as e:
logging.error(f"Error convering via markdownit: {e}")
def _save_hash_state(self): def _save_hash_state(self):
"""Save the current file hash state to disk.""" """Save the current file hash state to disk."""
try: try:
@ -188,9 +202,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
"""Process a file update event.""" """Process a file update event."""
# Skip if already being processed # Skip if already being processed
if file_path in self.processing_files: if file_path in self.processing_files:
logging.info(f"{file_path} already in queue. Not adding.")
return return
try: try:
logging.info(f"{file_path} not in queue. Adding.")
self.processing_files.add(file_path) self.processing_files.add(file_path)
# Wait a moment to ensure the file write is complete # Wait a moment to ensure the file write is complete
@ -203,11 +219,12 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash: if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash:
# File hasn't actually changed in content # File hasn't actually changed in content
logging.info(f"Hash has not changed for {file_path}")
return return
# Update file hash # Update file hash
self.file_hashes[file_path] = current_hash self.file_hashes[file_path] = current_hash
# Process and update the file in ChromaDB # Process and update the file in ChromaDB
async with self.update_lock: async with self.update_lock:
await self._update_document_in_collection(file_path) await self._update_document_in_collection(file_path)
@ -420,6 +437,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop) asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
logging.info(f"File deleted: {file_path}") logging.info(f"File deleted: {file_path}")
def on_moved(self, event):
    """Handle file move/rename events (handling not yet implemented).

    Directory events are ignored. For a file event, only a TODO notice
    naming the original path (``event.src_path``) is logged; nothing else
    is done with the event here.
    """
    if event.is_directory:
        return

    file_path = event.src_path
    # Plain {file_path} interpolation: the previous "${file_path}" form is
    # JavaScript template syntax and left a literal "$" in the log line.
    logging.info(f"TODO: on_moved: {file_path}")
def _normalize_embeddings(self, embeddings): def _normalize_embeddings(self, embeddings):
"""Normalize the embeddings to unit length.""" """Normalize the embeddings to unit length."""
# Handle both single vector and array of vectors # Handle both single vector and array of vectors
@ -435,12 +460,24 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
async def _update_document_in_collection(self, file_path): async def _update_document_in_collection(self, file_path):
"""Update a document in the ChromaDB collection.""" """Update a document in the ChromaDB collection."""
try: try:
logging.info(f"Updating document in collection: {file_path}")
# Remove existing entries for this file # Remove existing entries for this file
existing_results = self.collection.get(where={"path": file_path}) existing_results = self.collection.get(where={"path": file_path})
if existing_results and 'ids' in existing_results and existing_results['ids']: if existing_results and 'ids' in existing_results and existing_results['ids']:
self.collection.delete(ids=existing_results['ids']) self.collection.delete(ids=existing_results['ids'])
extensions = (".docx", ".xlsx", ".xls", ".pdf")
if file_path.endswith(extensions):
p = Path(file_path)
p_as_md = p.with_suffix(".md")
if p_as_md.exists():
logging.info(f"newer: {p.stat().st_mtime > p_as_md.stat().st_mtime}")
# If file_path.md doesn't exist or file_path is newer than file_path.md,
# fire off markitdown
if (not p_as_md.exists()) or (p.stat().st_mtime > p_as_md.stat().st_mtime):
self._markitdown(file_path, p_as_md)
return
# Create document object in LangChain format # Create document object in LangChain format
with open(file_path, "r", encoding="utf-8") as f: with open(file_path, "r", encoding="utf-8") as f:
content = f.read() content = f.read()