Updated system prompts to use better syntax for qwen

Fixed markitdown integration
This commit is contained in:
James Ketr 2025-04-26 23:24:28 -07:00
parent 7672e639f6
commit 581bc4a575
5 changed files with 109 additions and 77 deletions

View File

@ -257,6 +257,7 @@ FROM llm-base AS backstory
COPY /src/requirements.txt /opt/backstory/src/requirements.txt
RUN pip install -r /opt/backstory/src/requirements.txt
RUN pip install 'markitdown[all]'
SHELL [ "/bin/bash", "-c" ]

View File

@ -26,7 +26,7 @@ services:
- ./sessions:/opt/backstory/sessions:rw # Persist sessions
- ./chromadb:/opt/backstory/chromadb:rw # Persist ChromaDB
- ./dev-keys:/opt/backstory/keys:ro # Developer keys
- ./docs:/opt/backstory/docs:ro # Live mount of RAG content
- ./docs:/opt/backstory/docs:rw # Live mount of RAG content
- ./src:/opt/backstory/src:rw # Live mount server src
- ./frontend:/opt/backstory/frontend:rw # Live mount frontend src
cap_add: # used for running ze-monitor within container

View File

@ -82,6 +82,7 @@ const emojiMap: Record<string, string> = {
query: '🔍',
resume: '📄',
projects: '📁',
'performance-reviews': '📄',
news: '📰',
};
@ -90,6 +91,7 @@ const colorMap: Record<string, string> = {
resume: '#4A7A7D', // Dusty Teal — secondary theme color
projects: '#1A2536', // Midnight Blue — rich and deep
news: '#D3CDBF', // Warm Gray — soft and neutral
'performance-reviews': '#FF0000', // Bright red
};
const sizeMap: Record<string, number> = {

View File

@ -139,78 +139,68 @@ DEFAULT_HISTORY_LENGTH=5
# %%
# Globals
NAME = "James Ketrenos"
resume_intro = f"""
As an AI/ML professional specializing in creating custom solutions to new problem domains, {NAME} developed a custom
language model applications that streamline information processing and content generation. This tailored resume
was created using a Retrieval-Augmented Generation system I built to efficiently match my relevant experience
with your specific needsdemonstrating both my technical capabilities and commitment to intelligent resource
optimization.
"""
system_message = f"""
Launched on {DateTime()}.
When answering queries, follow these steps:
1. First analyze the query to determine if real-time information might be helpful
2. Even when [INFO] is provided, consider whether the tools would provide more current or comprehensive information
3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
4. When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
4. When both [INFO] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
5. Always prioritize the most up-to-date and relevant information, whether it comes from [INFO] or tools
6. If [INFO] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
7. If there is information in the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '[INFO]' (etc.) or quoting it directly.
8. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
- First analyze the query to determine if real-time information might be helpful
- Even when <|context|> is provided, consider whether the tools would provide more current or comprehensive information
- Use the provided tools whenever they would enhance your response, regardless of whether context is also available
- When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers\", say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
- When both <|context|> and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
- Always prioritize the most up-to-date and relevant information, whether it comes from <|context|> or tools
- If <|context|> and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
- If there is information in the <|context|>, <|job_description|>, or <|work_history|> sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '<|context|>' (etc.) or quoting it directly.
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
Always use tools and [INFO] when possible. Be concise, and never make up information. If you do not know the answer, say so.
""".strip()
Always use tools and <|context|> when possible. Be concise, and never make up information. If you do not know the answer, say so.
"""
system_generate_resume = f"""
Launched on {DateTime()}.
You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's [WORK HISTORY].
You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's <|work_history|>.
When answering queries, follow these steps:
1. You must not invent or assume any inforation not explicitly present in the [WORK HISTORY].
2. Analyze the [JOB DESCRIPTION] to identify skills required for the job.
3. Use the [JOB DESCRIPTION] provided to guide the focus, tone, and relevant skills or experience to highlight from the [WORK HISTORY].
4. Identify and emphasize the experiences, achievements, and responsibilities from the [WORK HISTORY] that best align with the [JOB DESCRIPTION].
5. Only provide information from [WORK HISTORY] items if it is relevant to the [JOB DESCRIPTION].
6. Do not use the [JOB DESCRIPTION] skills unless listed in [WORK HISTORY].
7. Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
8. Use the [INTRO] to highlight the use of AI in generating this resume.
9. Use the [WORK HISTORY] to create a polished, professional resume.
10. Do not list any locations or mailing addresses in the resume.
11. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
12. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
- You must not invent or assume any information not explicitly present in the <|work_history|>.
- Analyze the <|job_description|> to identify skills required for the job.
- Use the <|job_description|> provided to guide the focus, tone, and relevant skills or experience to highlight from the <|work_history|>.
- Identify and emphasize the experiences, achievements, and responsibilities from the <|work_history|> that best align with the <|job_description|>.
- Only provide information from <|work_history|> items if it is relevant to the <|job_description|>.
- Do not use the <|job_description|> skills unless listed in <|work_history|>.
- Do not include any information unless it is provided in <|work_history|>.
- Use the <|work_history|> to create a polished, professional resume.
- Do not list any locations or mailing addresses in the resume.
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
- Ensure the language is clear, concise, and aligned with industry standards for professional resumes.
Structure the resume professionally with the following sections where applicable:
* "Name: Use full name."
* "Professional Summary: A 2-4 sentence overview tailored to the job, using [INTRO] to highlight the use of AI in generating this resume."
* "Skills: A bullet list of key skills derived from the work history and relevant to the job."
* Professional Experience: A detailed list of roles, achievements, and responsibilities from [WORK HISTORY] that relate to the [JOB DESCRIPTION]."
* Name: Use full name
* Professional Summary: A 2-4 sentence overview tailored to the job.
* Skills: A bullet list of key skills derived from the work history and relevant to the job.
* Professional Experience: A detailed list of roles, achievements, and responsibilities from <|work_history|> that relate to the <|job_description|>.
* Education: Include only if available in the work history.
* Notes: Indicate the initial draft of the resume was generated using the Backstory application.
Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes.
""".strip()
system_fact_check = f"""
Launched on {DateTime()}.
You are a professional resume fact checker. Your task is to identify any inaccuracies in the [RESUME] based on the individual's [WORK HISTORY].
You are a professional resume fact checker. Your task is to identify any inaccuracies in the <|resume|> based on the individual's <|work_history|>.
If there are inaccuracies, list them in a bullet point format.
When answering queries, follow these steps:
1. You must not invent or assume any information not explicitly present in the [WORK HISTORY].
2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
- You must not invent or assume any information not explicitly present in the <|work_history|>.
- Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
""".strip()
system_job_description = f"""
@ -219,10 +209,10 @@ Launched on {DateTime()}.
You are a hiring and job placing specialist. Your task is to answer questions about a job description.
When answering queries, follow these steps:
1. Analyze the [JOB DESCRIPTION] to provide insights for the asked question.
2. If any financial information is requested, be sure to account for inflation.
3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
- Analyze the <|job_description|> to provide insights for the asked question.
- If any financial information is requested, be sure to account for inflation.
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
""".strip()
def create_system_message(prompt):
@ -1088,12 +1078,15 @@ class WebServer:
if rag_context:
preamble = f"""
1. Respond to this query: {content}
2. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
3. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
[INFO]
<|rules|>
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
<|context|>
{rag_context}
[/INFO]
<|question|>
Use that information to respond to:"""
# Use the mode specific system_prompt instead of 'chat'
@ -1122,24 +1115,23 @@ Use that information to respond to:"""
rag_context += f"{doc}\n"
preamble = f"""
[INTRO]
{resume_intro}
[/INTRO]
[WORK HISTORY]
<|work_history|>
{rag_context}
[/WORK HISTORY]
[JOB DESCRIPTION]
<|job_description|>
{content}
[/JOB DESCRIPTION]
"""
context["sessions"]["job_description"]["content_seed"] = preamble + "Use the above information to answer this query: "
context["sessions"]["job_description"]["content_seed"] = preamble + "<|question|>\nUse the above information to answer this query: "
preamble += f"""
1. Use the above [INTRO] and [WORK HISTORY] to create the resume for the [JOB DESCRIPTION].
2. Do not use content from the [JOB DESCRIPTION] in the response unless the [WORK HISTORY] mentions them.
<|rules|>
1. Use the above <|work_history|> to create the resume for the <|job_description|>.
2. Do not use content from the <|job_description|> in the response unless the <|work_history|> mentions them.
<|question|>
"""
# Seed the history for job_description
@ -1185,25 +1177,25 @@ Use that information to respond to:"""
rag_context += f"{doc}\n"
preamble = f"""
[WORK HISTORY]
<|work_history|>
{rag_context}
[/WORK HISTORY]
[RESUME]
<|resume|>
{resume['content']}
[/RESUME]
Perform the following:
1. Do not invent or assume any information not explicitly present in the [WORK HISTORY].
2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
<|rules|>
1. Do not invent or assume any information not explicitly present in the <|work_history|>.
2. Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
<|question|>
"""
context["sessions"]["resume"]["content_seed"] = f"""
[RESUME]
<|resume|>
{resume["content"]}
[/RESUME]
Use the above [RESUME] to answer this query:
<|question|>
Use the above <|resume|> to answer this query:
"""
content = "Fact check the resume and report discrepancies."

View File

@ -1,5 +1,6 @@
import os
import glob
from pathlib import Path
import time
import hashlib
import asyncio
@ -12,6 +13,7 @@ import asyncio
import json
import pickle
import numpy as np
import re
import chromadb
import ollama
@ -21,6 +23,7 @@ from langchain.schema import Document
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import umap
from markitdown import MarkItDown
# Import your existing modules
if __name__ == "__main__":
@ -49,6 +52,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self.chunk_overlap = chunk_overlap
self.loop = loop
self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
#self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Path for storing file hash state
@ -98,6 +104,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
def umap_model_3d(self):
return self._umap_model_3d
def _markitdown(self, document : str, markdown : Path):
logging.info(f'Converting {document} to {markdown}')
try:
result = self.md.convert(document)
markdown.write_text(result.text_content)
except Exception as e:
logging.error(f"Error convering via markdownit: {e}")
def _save_hash_state(self):
"""Save the current file hash state to disk."""
try:
@ -188,9 +202,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
"""Process a file update event."""
# Skip if already being processed
if file_path in self.processing_files:
logging.info(f"{file_path} already in queue. Not adding.")
return
try:
logging.info(f"{file_path} not in queue. Adding.")
self.processing_files.add(file_path)
# Wait a moment to ensure the file write is complete
@ -203,6 +219,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash:
# File hasn't actually changed in content
logging.info(f"Hash has not changed for {file_path}")
return
# Update file hash
@ -420,6 +437,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
logging.info(f"File deleted: {file_path}")
def on_moved(self, event):
    """Handle file move events.

    Directory moves are ignored. File moves are currently only logged;
    updating the collection for a moved file is still TODO.

    Args:
        event: The move event; its ``is_directory`` and ``src_path``
            attributes are read.
    """
    if event.is_directory:
        return

    file_path = event.src_path
    # Plain f-string placeholder: the previous "${file_path}" form put a
    # literal "$" in front of every logged path.
    logging.info(f"TODO: on_moved: {file_path}")
def _normalize_embeddings(self, embeddings):
"""Normalize the embeddings to unit length."""
# Handle both single vector and array of vectors
@ -435,12 +460,24 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
async def _update_document_in_collection(self, file_path):
"""Update a document in the ChromaDB collection."""
try:
logging.info(f"Updating document in collection: {file_path}")
# Remove existing entries for this file
existing_results = self.collection.get(where={"path": file_path})
if existing_results and 'ids' in existing_results and existing_results['ids']:
self.collection.delete(ids=existing_results['ids'])
extensions = (".docx", ".xlsx", ".xls", ".pdf")
if file_path.endswith(extensions):
p = Path(file_path)
p_as_md = p.with_suffix(".md")
if p_as_md.exists():
logging.info(f"newer: {p.stat().st_mtime > p_as_md.stat().st_mtime}")
# If file_path.md doesn't exist or file_path is newer than file_path.md,
# fire off markitdown
if (not p_as_md.exists()) or (p.stat().st_mtime > p_as_md.stat().st_mtime):
self._markitdown(file_path, p_as_md)
return
# Create document object in LangChain format
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()