From 1ad2638277a6f9c76cfadabbfb80f7f6abc4feeb Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Thu, 17 Apr 2025 15:14:27 -0700
Subject: [PATCH] mid claude rewrite

---
 src/server.py    |  1 -
 src/utils/rag.py | 64 +++++++++++++++++++-----------------------------
 2 files changed, 25 insertions(+), 40 deletions(-)

diff --git a/src/server.py b/src/server.py
index 08b522f..c6b2b15 100644
--- a/src/server.py
+++ b/src/server.py
@@ -387,7 +387,6 @@ class WebServer:
         self.observer, self.file_watcher = Rag.start_file_watcher(
             llm=client,
             watch_directory=defines.doc_dir,
-            initialize=True, # Only loads documents if no hash state exists
             recreate=False # Don't recreate if exists
         )
 
diff --git a/src/utils/rag.py b/src/utils/rag.py
index baeb58a..d569b4c 100644
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -43,28 +43,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.loop = loop
-
-        # Initialize ChromaDB collection
-        self.collection = self._get_vector_collection(recreate=recreate)
-
-        # Setup text splitter
-        self.text_splitter = CharacterTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap
-        )
-
-        # Track file hashes and processing state
-        self.file_hashes: dict[str, str] = {}
-        self.update_lock = asyncio.Lock()
-        self.processing_files = set()
-
-        # Initialize file hashes
-        self.llm = llm
-        self.watch_directory = watch_directory
-        self.persist_directory = persist_directory or defines.persist_directory
-        self.collection_name = collection_name
-        self.chunk_size = chunk_size
-        self.chunk_overlap = chunk_overlap
 
         # Path for storing file hash state
         self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
@@ -82,15 +60,17 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self.file_hashes = self._load_hash_state()
         self.update_lock = asyncio.Lock()
         self.processing_files = set()
-
-        # Only scan for new/changed files if we have previous hash state
-        if not self.file_hashes:
-            self._initialize_file_hashes()
-        else:
-            self._update_file_hashes()
+
+        # Always scan for new/changed files at startup
+        self._update_file_hashes()
+
     @property
     def collection(self):
-        return self.collection
+        return self._collection
+
+    @collection.setter
+    def collection(self, value):
+        self._collection = value
 
     def _save_hash_state(self):
         """Save the current file hash state to disk."""
@@ -158,8 +138,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         # Save the updated state
         self._save_hash_state()
 
-    # ... rest of existing methods ...
-
     async def process_file_update(self, file_path):
         """Process a file update event."""
         # Skip if already being processed
@@ -323,6 +301,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
             hash = self._get_file_hash(file_path)
             if hash:
                 self.file_hashes[file_path] = hash
+
+        # Save the initialized hash state
+        self._save_hash_state()
 
     def _get_file_hash(self, file_path):
         """Calculate MD5 hash of a file."""
@@ -362,10 +343,17 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
             asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
             logging.info(f"File deleted: {file_path}")
 
-
     def _normalize_embeddings(self, embeddings):
-        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
-        return embeddings / norms
+        """Normalize the embeddings to unit length."""
+        # Handle both single vector and array of vectors
+        if isinstance(embeddings[0], (int, float)):
+            # Single vector
+            norm = np.linalg.norm(embeddings)
+            return [e / norm for e in embeddings] if norm > 0 else embeddings
+        else:
+            # Array of vectors
+            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+            return embeddings / norms
 
     async def _update_document_in_collection(self, file_path):
         """Update a document in the ChromaDB collection."""
@@ -417,7 +405,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
 
 # Function to start the file watcher
 def start_file_watcher(llm, watch_directory, persist_directory=None,
-                       collection_name="documents", initialize=False, recreate=False):
+                       collection_name="documents", recreate=False):
     """
     Start watching a directory for file changes.
 
@@ -426,7 +414,6 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
        watch_directory: Directory to watch for changes
        persist_directory: Directory to persist ChromaDB and hash state
        collection_name: Name of the ChromaDB collection
-       initialize: Whether to initialize the collection with all documents (only needed first time)
        recreate: Whether to recreate the collection (will delete existing)
    """
    loop = asyncio.get_event_loop()
@@ -440,8 +427,8 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
        recreate=recreate
    )
 
-   # Initialize collection if requested and no existing hash state
-   if initialize and not file_watcher.file_hashes:
+   # Initialize collection if it does not exist
+   if not os.path.exists(file_watcher.hash_state_path):
        file_watcher.initialize_collection()
 
    # Start observer
@@ -464,7 +451,6 @@ if __name__ == "__main__":
        llm,
        defines.doc_dir,
        recreate=True, # Start fresh
-       initialize=True # Load all documents initially
    )
 
    # Example query
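
After this patch, callers no longer pass initialize= to start_file_watcher(); the watcher rescans the watch directory on every startup and bulk-loads documents only when no hash-state file exists yet. A minimal sketch of the updated call shape, based on the server.py hunk above; the import paths and the llm_client object are assumptions for illustration and are not defined by this patch:

    # Hypothetical imports; adjust to how server.py actually imports these modules.
    from utils import rag as Rag
    from utils import defines

    # llm_client is assumed to be constructed elsewhere (outside this patch).
    observer, file_watcher = Rag.start_file_watcher(
        llm=llm_client,
        watch_directory=defines.doc_dir,
        recreate=False,  # keep the existing ChromaDB collection if present
    )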