Updated system prompts to use better syntax for qwen
Fixed markitdown integration
This commit is contained in:
parent
7672e639f6
commit
581bc4a575
@ -257,6 +257,7 @@ FROM llm-base AS backstory
|
|||||||
|
|
||||||
COPY /src/requirements.txt /opt/backstory/src/requirements.txt
|
COPY /src/requirements.txt /opt/backstory/src/requirements.txt
|
||||||
RUN pip install -r /opt/backstory/src/requirements.txt
|
RUN pip install -r /opt/backstory/src/requirements.txt
|
||||||
|
RUN pip install 'markitdown[all]'
|
||||||
|
|
||||||
SHELL [ "/bin/bash", "-c" ]
|
SHELL [ "/bin/bash", "-c" ]
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ services:
|
|||||||
- ./sessions:/opt/backstory/sessions:rw # Persist sessions
|
- ./sessions:/opt/backstory/sessions:rw # Persist sessions
|
||||||
- ./chromadb:/opt/backstory/chromadb:rw # Persist ChromaDB
|
- ./chromadb:/opt/backstory/chromadb:rw # Persist ChromaDB
|
||||||
- ./dev-keys:/opt/backstory/keys:ro # Developer keys
|
- ./dev-keys:/opt/backstory/keys:ro # Developer keys
|
||||||
- ./docs:/opt/backstory/docs:ro # Live mount of RAG content
|
- ./docs:/opt/backstory/docs:rw # Live mount of RAG content
|
||||||
- ./src:/opt/backstory/src:rw # Live mount server src
|
- ./src:/opt/backstory/src:rw # Live mount server src
|
||||||
- ./frontend:/opt/backstory/frontend:rw # Live mount frontend src
|
- ./frontend:/opt/backstory/frontend:rw # Live mount frontend src
|
||||||
cap_add: # used for running ze-monitor within container
|
cap_add: # used for running ze-monitor within container
|
||||||
|
@ -82,6 +82,7 @@ const emojiMap: Record<string, string> = {
|
|||||||
query: '🔍',
|
query: '🔍',
|
||||||
resume: '📄',
|
resume: '📄',
|
||||||
projects: '📁',
|
projects: '📁',
|
||||||
|
'performance-reviews': '📄',
|
||||||
news: '📰',
|
news: '📰',
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -90,6 +91,7 @@ const colorMap: Record<string, string> = {
|
|||||||
resume: '#4A7A7D', // Dusty Teal — secondary theme color
|
resume: '#4A7A7D', // Dusty Teal — secondary theme color
|
||||||
projects: '#1A2536', // Midnight Blue — rich and deep
|
projects: '#1A2536', // Midnight Blue — rich and deep
|
||||||
news: '#D3CDBF', // Warm Gray — soft and neutral
|
news: '#D3CDBF', // Warm Gray — soft and neutral
|
||||||
|
'performance-reviews': '#FF0000', // Bright red
|
||||||
};
|
};
|
||||||
|
|
||||||
const sizeMap: Record<string, number> = {
|
const sizeMap: Record<string, number> = {
|
||||||
|
140
src/server.py
140
src/server.py
@ -139,78 +139,68 @@ DEFAULT_HISTORY_LENGTH=5
|
|||||||
|
|
||||||
# %%
|
# %%
|
||||||
# Globals
|
# Globals
|
||||||
NAME = "James Ketrenos"
|
|
||||||
|
|
||||||
resume_intro = f"""
|
|
||||||
As an AI/ML professional specializing in creating custom solutions to new problem domains, {NAME} developed a custom
|
|
||||||
language model applications that streamline information processing and content generation. This tailored resume
|
|
||||||
was created using a Retrieval-Augmented Generation system I built to efficiently match my relevant experience
|
|
||||||
with your specific needs—demonstrating both my technical capabilities and commitment to intelligent resource
|
|
||||||
optimization.
|
|
||||||
"""
|
|
||||||
|
|
||||||
system_message = f"""
|
system_message = f"""
|
||||||
Launched on {DateTime()}.
|
Launched on {DateTime()}.
|
||||||
|
|
||||||
When answering queries, follow these steps:
|
When answering queries, follow these steps:
|
||||||
|
|
||||||
1. First analyze the query to determine if real-time information might be helpful
|
- First analyze the query to determine if real-time information might be helpful
|
||||||
2. Even when [INFO] is provided, consider whether the tools would provide more current or comprehensive information
|
- Even when <|context|> is provided, consider whether the tools would provide more current or comprehensive information
|
||||||
3. Use the provided tools whenever they would enhance your response, regardless of whether context is also available
|
- Use the provided tools whenever they would enhance your response, regardless of whether context is also available
|
||||||
4. When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers, say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
|
- When presenting weather forecasts, include relevant emojis immediately before the corresponding text. For example, for a sunny day, say \"☀️ Sunny\" or if the forecast says there will be \"rain showers\", say \"🌧️ Rain showers\". Use this mapping for weather emojis: Sunny: ☀️, Cloudy: ☁️, Rainy: 🌧️, Snowy: ❄️
|
||||||
4. When both [INFO] and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
|
- When both <|context|> and tool outputs are relevant, synthesize information from both sources to provide the most complete answer
|
||||||
5. Always prioritize the most up-to-date and relevant information, whether it comes from [INFO] or tools
|
- Always prioritize the most up-to-date and relevant information, whether it comes from <|context|> or tools
|
||||||
6. If [INFO] and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
|
- If <|context|> and tool outputs contain conflicting information, prefer the tool outputs as they likely represent more current data
|
||||||
7. If there is information in the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '[INFO]' (etc.) or quoting it directly.
|
- If there is information in the <|context|>, <|job_description|>, or <|work_history|> sections to enhance the answer, incorporate it seamlessly and refer to it as 'the latest information' or 'recent data' instead of mentioning '<|context|>' (etc.) or quoting it directly.
|
||||||
8. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
|
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
|
||||||
|
|
||||||
Always use tools and [INFO] when possible. Be concise, and never make up information. If you do not know the answer, say so.
|
Always use tools and <|context|> when possible. Be concise, and never make up information. If you do not know the answer, say so.
|
||||||
""".strip()
|
"""
|
||||||
|
|
||||||
system_generate_resume = f"""
|
system_generate_resume = f"""
|
||||||
Launched on {DateTime()}.
|
Launched on {DateTime()}.
|
||||||
|
|
||||||
You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's [WORK HISTORY].
|
You are a professional resume writer. Your task is to write a concise, polished, and tailored resume for a specific job based only on the individual's <|work_history|>.
|
||||||
|
|
||||||
When answering queries, follow these steps:
|
When answering queries, follow these steps:
|
||||||
|
|
||||||
1. You must not invent or assume any inforation not explicitly present in the [WORK HISTORY].
|
- You must not invent or assume any information not explicitly present in the <|work_history|>.
|
||||||
2. Analyze the [JOB DESCRIPTION] to identify skills required for the job.
|
- Analyze the <|job_description|> to identify skills required for the job.
|
||||||
3. Use the [JOB DESCRIPTION] provided to guide the focus, tone, and relevant skills or experience to highlight from the [WORK HISTORY].
|
- Use the <|job_description|> provided to guide the focus, tone, and relevant skills or experience to highlight from the <|work_history|>.
|
||||||
4. Identify and emphasize the experiences, achievements, and responsibilities from the [WORK HISTORY] that best align with the [JOB DESCRIPTION].
|
- Identify and emphasize the experiences, achievements, and responsibilities from the <|work_history|> that best align with the <|job_description|>.
|
||||||
5. Only provide information from [WORK HISTORY] items if it is relevant to the [JOB DESCRIPTION].
|
- Only provide information from <|work_history|> items if it is relevant to the <|job_description|>.
|
||||||
6. Do not use the [JOB DESCRIPTION] skills unless listed in [WORK HISTORY].
|
- Do not use the <|job_description|> skills unless listed in <|work_history|>.
|
||||||
7. Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
|
- Do not include any information unless it is provided in <|work_history|>.
|
||||||
8. Use the [INTRO] to highlight the use of AI in generating this resume.
|
- Use the <|work_history|> to create a polished, professional resume.
|
||||||
9. Use the [WORK HISTORY] to create a polished, professional resume.
|
- Do not list any locations or mailing addresses in the resume.
|
||||||
10. Do not list any locations or mailing addresses in the resume.
|
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
|
||||||
11. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
|
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
|
||||||
12. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
|
- Ensure the language is clear, concise, and aligned with industry standards for professional resumes.
|
||||||
|
|
||||||
Structure the resume professionally with the following sections where applicable:
|
Structure the resume professionally with the following sections where applicable:
|
||||||
|
|
||||||
* "Name: Use full name."
|
* Name: Use full name
|
||||||
* "Professional Summary: A 2-4 sentence overview tailored to the job, using [INTRO] to highlight the use of AI in generating this resume."
|
* Professional Summary: A 2-4 sentence overview tailored to the job.
|
||||||
* "Skills: A bullet list of key skills derived from the work history and relevant to the job."
|
* Skills: A bullet list of key skills derived from the work history and relevant to the job.
|
||||||
* Professional Experience: A detailed list of roles, achievements, and responsibilities from [WORK HISTORY] that relate to the [JOB DESCRIPTION]."
|
* Professional Experience: A detailed list of roles, achievements, and responsibilities from <|work_history|> that relate to the <|job_description|>.
|
||||||
* Education: Include only if available in the work history.
|
* Education: Include only if available in the work history.
|
||||||
|
* Notes: Indicate the initial draft of the resume was generated using the Backstory application.
|
||||||
|
|
||||||
Do not include any information unless it is provided in [WORK HISTORY] or [INTRO].
|
|
||||||
Ensure the langauge is clear, concise, and aligned with industry standards for professional resumes.
|
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
system_fact_check = f"""
|
system_fact_check = f"""
|
||||||
Launched on {DateTime()}.
|
Launched on {DateTime()}.
|
||||||
|
|
||||||
You are a professional resume fact checker. Your task is to identify any inaccuracies in the [RESUME] based on the individual's [WORK HISTORY].
|
You are a professional resume fact checker. Your task is to identify any inaccuracies in the <|resume|> based on the individual's <|work_history|>.
|
||||||
|
|
||||||
If there are inaccuracies, list them in a bullet point format.
|
If there are inaccuracies, list them in a bullet point format.
|
||||||
|
|
||||||
When answering queries, follow these steps:
|
When answering queries, follow these steps:
|
||||||
1. You must not invent or assume any information not explicitly present in the [WORK HISTORY].
|
- You must not invent or assume any information not explicitly present in the <|work_history|>.
|
||||||
2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
|
- Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
|
||||||
3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
|
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
|
||||||
4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
|
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
system_job_description = f"""
|
system_job_description = f"""
|
||||||
@ -219,10 +209,10 @@ Launched on {DateTime()}.
|
|||||||
You are a hiring and job placing specialist. Your task is to answer questions about a job description.
|
You are a hiring and job placing specialist. Your task is to answer questions about a job description.
|
||||||
|
|
||||||
When answering queries, follow these steps:
|
When answering queries, follow these steps:
|
||||||
1. Analyze the [JOB DESCRIPTION] to provide insights for the asked question.
|
- Analyze the <|job_description|> to provide insights for the asked question.
|
||||||
2. If any financial information is requested, be sure to account for inflation.
|
- If any financial information is requested, be sure to account for inflation.
|
||||||
3. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
|
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
|
||||||
4. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], [RESUME], or [WORK HISTORY] tags.
|
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, <|resume|>, or <|work_history|> tags.
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
def create_system_message(prompt):
|
def create_system_message(prompt):
|
||||||
@ -1088,12 +1078,15 @@ class WebServer:
|
|||||||
|
|
||||||
if rag_context:
|
if rag_context:
|
||||||
preamble = f"""
|
preamble = f"""
|
||||||
1. Respond to this query: {content}
|
|
||||||
2. If there is information in the [INFO], [JOB DESCRIPTION], [WORK HISTORY], or [RESUME] sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '[JOB DESCRIPTION]' (etc.) or quoting it directly.
|
<|rules|>
|
||||||
3. Avoid phrases like 'According to the [INFO]' or similar references to the [INFO], [JOB DESCRIPTION], or [WORK HISTORY] tags.
|
- If there is information in the <|context|>, <|job_description|>, <|work_history|>, or <|resume|> sections to enhance the answer, incorporate it seamlessly and refer to it using natural language instead of mentioning '<|job_description|>' (etc.) or quoting it directly.
|
||||||
[INFO]
|
- Avoid phrases like 'According to the <|context|>' or similar references to the <|context|>, <|job_description|>, or <|work_history|> tags.
|
||||||
|
|
||||||
|
<|context|>
|
||||||
{rag_context}
|
{rag_context}
|
||||||
[/INFO]
|
|
||||||
|
<|question|>
|
||||||
Use that information to respond to:"""
|
Use that information to respond to:"""
|
||||||
|
|
||||||
# Use the mode specific system_prompt instead of 'chat'
|
# Use the mode specific system_prompt instead of 'chat'
|
||||||
@ -1122,24 +1115,23 @@ Use that information to respond to:"""
|
|||||||
rag_context += f"{doc}\n"
|
rag_context += f"{doc}\n"
|
||||||
|
|
||||||
preamble = f"""
|
preamble = f"""
|
||||||
[INTRO]
|
<|work_history|>
|
||||||
{resume_intro}
|
|
||||||
[/INTRO]
|
|
||||||
|
|
||||||
[WORK HISTORY]
|
|
||||||
{rag_context}
|
{rag_context}
|
||||||
[/WORK HISTORY]
|
|
||||||
|
|
||||||
[JOB DESCRIPTION]
|
<|job_description|>
|
||||||
{content}
|
{content}
|
||||||
[/JOB DESCRIPTION]
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
context["sessions"]["job_description"]["content_seed"] = preamble + "Use the above information to answer this query: "
|
context["sessions"]["job_description"]["content_seed"] = preamble + "<|question|>\nUse the above information to answer this query: "
|
||||||
|
|
||||||
preamble += f"""
|
preamble += f"""
|
||||||
1. Use the above [INTRO] and [WORK HISTORY] to create the resume for the [JOB DESCRIPTION].
|
|
||||||
2. Do not use content from the [JOB DESCRIPTION] in the response unless the [WORK HISTORY] mentions them.
|
<|rules|>
|
||||||
|
1. Use the above <|work_history|> to create the resume for the <|job_description|>.
|
||||||
|
2. Do not use content from the <|job_description|> in the response unless the <|work_history|> mentions them.
|
||||||
|
|
||||||
|
<|question|>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Seed the history for job_description
|
# Seed the history for job_description
|
||||||
@ -1185,25 +1177,25 @@ Use that information to respond to:"""
|
|||||||
rag_context += f"{doc}\n"
|
rag_context += f"{doc}\n"
|
||||||
|
|
||||||
preamble = f"""
|
preamble = f"""
|
||||||
[WORK HISTORY]
|
<|work_history|>
|
||||||
{rag_context}
|
{rag_context}
|
||||||
[/WORK HISTORY]
|
|
||||||
|
|
||||||
[RESUME]
|
<|resume|>
|
||||||
{resume['content']}
|
{resume['content']}
|
||||||
[/RESUME]
|
|
||||||
|
|
||||||
Perform the following:
|
<|rules|>
|
||||||
1. Do not invent or assume any information not explicitly present in the [WORK HISTORY].
|
1. Do not invent or assume any information not explicitly present in the <|work_history|>.
|
||||||
2. Analyze the [RESUME] to identify any discrepancies or inaccuracies based on the [WORK HISTORY].
|
2. Analyze the <|resume|> to identify any discrepancies or inaccuracies based on the <|work_history|>.
|
||||||
|
|
||||||
|
<|question|>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
context["sessions"]["resume"]["content_seed"] = f"""
|
context["sessions"]["resume"]["content_seed"] = f"""
|
||||||
[RESUME]
|
<|resume|>
|
||||||
{resume["content"]}
|
{resume["content"]}
|
||||||
[/RESUME]
|
|
||||||
|
|
||||||
Use the above [RESUME] to answer this query:
|
<|question|>
|
||||||
|
Use the above <|resume|> to answer this query:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
content = "Fact check the resume and report discrepancies."
|
content = "Fact check the resume and report discrepancies."
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import glob
|
import glob
|
||||||
|
from pathlib import Path
|
||||||
import time
|
import time
|
||||||
import hashlib
|
import hashlib
|
||||||
import asyncio
|
import asyncio
|
||||||
@ -12,6 +13,7 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
import pickle
|
import pickle
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import re
|
||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
import ollama
|
import ollama
|
||||||
@ -21,6 +23,7 @@ from langchain.schema import Document
|
|||||||
from watchdog.observers import Observer
|
from watchdog.observers import Observer
|
||||||
from watchdog.events import FileSystemEventHandler
|
from watchdog.events import FileSystemEventHandler
|
||||||
import umap
|
import umap
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
|
||||||
# Import your existing modules
|
# Import your existing modules
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -49,6 +52,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
self.chunk_overlap = chunk_overlap
|
self.chunk_overlap = chunk_overlap
|
||||||
self.loop = loop
|
self.loop = loop
|
||||||
|
|
||||||
|
|
||||||
|
self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
|
||||||
|
|
||||||
#self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
#self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
||||||
|
|
||||||
# Path for storing file hash state
|
# Path for storing file hash state
|
||||||
@ -98,6 +104,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
def umap_model_3d(self):
|
def umap_model_3d(self):
|
||||||
return self._umap_model_3d
|
return self._umap_model_3d
|
||||||
|
|
||||||
|
def _markitdown(self, document : str, markdown : Path):
|
||||||
|
logging.info(f'Converting {document} to {markdown}')
|
||||||
|
try:
|
||||||
|
result = self.md.convert(document)
|
||||||
|
markdown.write_text(result.text_content)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Error converting via markitdown: {e}")
|
||||||
|
|
||||||
def _save_hash_state(self):
|
def _save_hash_state(self):
|
||||||
"""Save the current file hash state to disk."""
|
"""Save the current file hash state to disk."""
|
||||||
try:
|
try:
|
||||||
@ -188,9 +202,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
"""Process a file update event."""
|
"""Process a file update event."""
|
||||||
# Skip if already being processed
|
# Skip if already being processed
|
||||||
if file_path in self.processing_files:
|
if file_path in self.processing_files:
|
||||||
|
logging.info(f"{file_path} already in queue. Not adding.")
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logging.info(f"{file_path} not in queue. Adding.")
|
||||||
self.processing_files.add(file_path)
|
self.processing_files.add(file_path)
|
||||||
|
|
||||||
# Wait a moment to ensure the file write is complete
|
# Wait a moment to ensure the file write is complete
|
||||||
@ -203,11 +219,12 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
|
|
||||||
if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash:
|
if file_path in self.file_hashes and self.file_hashes[file_path] == current_hash:
|
||||||
# File hasn't actually changed in content
|
# File hasn't actually changed in content
|
||||||
|
logging.info(f"Hash has not changed for {file_path}")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Update file hash
|
# Update file hash
|
||||||
self.file_hashes[file_path] = current_hash
|
self.file_hashes[file_path] = current_hash
|
||||||
|
|
||||||
# Process and update the file in ChromaDB
|
# Process and update the file in ChromaDB
|
||||||
async with self.update_lock:
|
async with self.update_lock:
|
||||||
await self._update_document_in_collection(file_path)
|
await self._update_document_in_collection(file_path)
|
||||||
@ -420,6 +437,14 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
|
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
|
||||||
logging.info(f"File deleted: {file_path}")
|
logging.info(f"File deleted: {file_path}")
|
||||||
|
|
||||||
|
def on_moved(self, event):
|
||||||
|
"""Handle file move events."""
|
||||||
|
if event.is_directory:
|
||||||
|
return
|
||||||
|
|
||||||
|
file_path = event.src_path
|
||||||
|
logging.info(f"TODO: on_moved: ${file_path}")
|
||||||
|
|
||||||
def _normalize_embeddings(self, embeddings):
|
def _normalize_embeddings(self, embeddings):
|
||||||
"""Normalize the embeddings to unit length."""
|
"""Normalize the embeddings to unit length."""
|
||||||
# Handle both single vector and array of vectors
|
# Handle both single vector and array of vectors
|
||||||
@ -435,12 +460,24 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
async def _update_document_in_collection(self, file_path):
|
async def _update_document_in_collection(self, file_path):
|
||||||
"""Update a document in the ChromaDB collection."""
|
"""Update a document in the ChromaDB collection."""
|
||||||
try:
|
try:
|
||||||
logging.info(f"Updating document in collection: {file_path}")
|
|
||||||
# Remove existing entries for this file
|
# Remove existing entries for this file
|
||||||
existing_results = self.collection.get(where={"path": file_path})
|
existing_results = self.collection.get(where={"path": file_path})
|
||||||
if existing_results and 'ids' in existing_results and existing_results['ids']:
|
if existing_results and 'ids' in existing_results and existing_results['ids']:
|
||||||
self.collection.delete(ids=existing_results['ids'])
|
self.collection.delete(ids=existing_results['ids'])
|
||||||
|
|
||||||
|
extensions = (".docx", ".xlsx", ".xls", ".pdf")
|
||||||
|
if file_path.endswith(extensions):
|
||||||
|
p = Path(file_path)
|
||||||
|
p_as_md = p.with_suffix(".md")
|
||||||
|
if p_as_md.exists():
|
||||||
|
logging.info(f"newer: {p.stat().st_mtime > p_as_md.stat().st_mtime}")
|
||||||
|
|
||||||
|
# If file_path.md doesn't exist or file_path is newer than file_path.md,
|
||||||
|
# fire off markitdown
|
||||||
|
if (not p_as_md.exists()) or (p.stat().st_mtime > p_as_md.stat().st_mtime):
|
||||||
|
self._markitdown(file_path, p_as_md)
|
||||||
|
return
|
||||||
|
|
||||||
# Create document object in LangChain format
|
# Create document object in LangChain format
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user