mid claude rewrite

James Ketr 2025-04-17 15:14:27 -07:00
parent eb2629bcce
commit 1ad2638277
2 changed files with 25 additions and 40 deletions


@@ -387,7 +387,6 @@ class WebServer:
self.observer, self.file_watcher = Rag.start_file_watcher(
llm=client,
watch_directory=defines.doc_dir,
initialize=True, # Only loads documents if no hash state exists
recreate=False # Don't recreate if exists
)
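
With this commit, first-time initialization is no longer requested by the caller: start_file_watcher decides for itself based on whether a hash-state file already exists. As a sketch of how this call site reads after the change (the lines are taken from the diff; only the dropped initialize argument differs):

# Sketch of the post-commit call site; the initialize flag is gone.
self.observer, self.file_watcher = Rag.start_file_watcher(
    llm=client,
    watch_directory=defines.doc_dir,
    recreate=False  # keep the existing ChromaDB collection and hash state
)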


@@ -44,28 +44,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self.chunk_overlap = chunk_overlap
self.loop = loop
# Initialize ChromaDB collection
self.collection = self._get_vector_collection(recreate=recreate)
# Setup text splitter
self.text_splitter = CharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
)
# Track file hashes and processing state
self.file_hashes: dict[str, str] = {}
self.update_lock = asyncio.Lock()
self.processing_files = set()
# Initialize file hashes
self.llm = llm
self.watch_directory = watch_directory
self.persist_directory = persist_directory or defines.persist_directory
self.collection_name = collection_name
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
# Path for storing file hash state
self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
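
The block removed above also configured the text splitter that chunks documents before embedding. For reference, a small illustrative sketch of how chunk_size and chunk_overlap behave with a LangChain-style CharacterTextSplitter (the import path and sample values are assumptions, not taken from the repository):

# Illustrative only; the repository's import path and parameters may differ.
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=40)
text = "\n\n".join(f"Paragraph {n}: " + "lorem ipsum " * 10 for n in range(5))
for i, chunk in enumerate(splitter.split_text(text)):
    print(i, len(chunk))  # chunks are merged up to chunk_size, split on blank lines by default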
@@ -83,14 +61,16 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self.update_lock = asyncio.Lock()
self.processing_files = set()
# Only scan for new/changed files if we have previous hash state
if not self.file_hashes:
self._initialize_file_hashes()
else:
# Always scan for new/changed files at startup
self._update_file_hashes()
@property
def collection(self):
return self.collection
return self._collection
@collection.setter
def collection(self, value):
self._collection = value
def _save_hash_state(self):
"""Save the current file hash state to disk."""
@@ -158,8 +138,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
# Save the updated state
self._save_hash_state()
# ... rest of existing methods ...
async def process_file_update(self, file_path):
"""Process a file update event."""
# Skip if already being processed
@@ -324,6 +302,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
if hash:
self.file_hashes[file_path] = hash
# Save the initialized hash state
self._save_hash_state()
def _get_file_hash(self, file_path):
"""Calculate MD5 hash of a file."""
try:
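
The body of _get_file_hash is cut off by the hunk boundary; per its docstring it computes an MD5 digest of the file. A minimal sketch of such a helper (the chunked read and 64 KiB buffer size are assumptions, not code from the repository):

import hashlib
import logging

def get_file_hash_sketch(file_path):
    """Illustrative MD5 helper; the real _get_file_hash may differ in detail."""
    try:
        md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for block in iter(lambda: f.read(65536), b""):  # 64 KiB chunks (assumption)
                md5.update(block)
        return md5.hexdigest()
    except OSError as e:
        logging.error(f"Failed to hash {file_path}: {e}")
        return None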
@@ -362,8 +343,15 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
logging.info(f"File deleted: {file_path}")
def _normalize_embeddings(self, embeddings):
"""Normalize the embeddings to unit length."""
# Handle both single vector and array of vectors
if isinstance(embeddings[0], (int, float)):
# Single vector
norm = np.linalg.norm(embeddings)
return [e / norm for e in embeddings] if norm > 0 else embeddings
else:
# Array of vectors
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
return embeddings / norms
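
The expanded _normalize_embeddings covers both shapes callers pass in: a flat list of numbers is scaled by its own L2 norm, and a 2-D array is divided row by row. A quick usage sketch, assuming watcher is a ChromaDBFileWatcher instance; the sample vectors are made up:

import numpy as np

single = [3.0, 4.0]
batch = np.array([[3.0, 4.0], [0.5, 0.5]])

print(np.linalg.norm(watcher._normalize_embeddings(single)))         # ~1.0
print(np.linalg.norm(watcher._normalize_embeddings(batch), axis=1))  # ~[1. 1.]

Note that the batch branch has no zero-norm guard, so an all-zero row would trigger a divide-by-zero warning and produce NaNs.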
@@ -417,7 +405,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
# Function to start the file watcher
def start_file_watcher(llm, watch_directory, persist_directory=None,
collection_name="documents", initialize=False, recreate=False):
collection_name="documents", recreate=False):
"""
Start watching a directory for file changes.
@@ -426,7 +414,6 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
watch_directory: Directory to watch for changes
persist_directory: Directory to persist ChromaDB and hash state
collection_name: Name of the ChromaDB collection
initialize: Whether to initialize the collection with all documents (only needed first time)
recreate: Whether to recreate the collection (will delete existing)
"""
loop = asyncio.get_event_loop()
@@ -440,8 +427,8 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
recreate=recreate
)
# Initialize collection if requested and no existing hash state
if initialize and not file_watcher.file_hashes:
# Initialize collection if it does not exist
if not os.path.exists(file_watcher.hash_state_path):
file_watcher.initialize_collection()
# Start observer
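
With the initialize flag gone, the existence of the hash-state JSON file on disk is what marks a collection as already initialized. The save/load bodies are not shown in this diff; a minimal sketch of that persistence pattern, assuming the state is a simple path-to-hash mapping (the standalone function names here are illustrative, not the repository's):

import json
import os

def save_hash_state(hash_state_path, file_hashes):
    os.makedirs(os.path.dirname(hash_state_path) or ".", exist_ok=True)
    with open(hash_state_path, "w") as f:
        json.dump(file_hashes, f)

def load_hash_state(hash_state_path):
    if not os.path.exists(hash_state_path):
        return {}  # no state yet; start_file_watcher will call initialize_collection()
    with open(hash_state_path) as f:
        return json.load(f)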
@@ -464,7 +451,6 @@ if __name__ == "__main__":
llm,
defines.doc_dir,
recreate=True, # Start fresh
initialize=True # Load all documents initially
)
# Example query