mid claude rewrite

2025-04-17 15:14:27 -07:00 · 2025-04-17 15:14:27 -07:00 · 1ad2638277
commit 1ad2638277
parent eb2629bcce
2 changed files with 25 additions and 40 deletions
--- a/src/server.py
+++ b/src/server.py
@@ -387,7 +387,6 @@ class WebServer:
            self.observer, self.file_watcher = Rag.start_file_watcher(
                llm=client, 
                watch_directory=defines.doc_dir,
-                initialize=True,  # Only loads documents if no hash state exists
                recreate=False    # Don't recreate if exists
            )
            
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -44,28 +44,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        self.chunk_overlap = chunk_overlap
        self.loop = loop
        
-        # Initialize ChromaDB collection
-        self.collection = self._get_vector_collection(recreate=recreate)
-        
-        # Setup text splitter
-        self.text_splitter = CharacterTextSplitter(
-            chunk_size=chunk_size, 
-            chunk_overlap=chunk_overlap
-        )
-        
-        # Track file hashes and processing state
-        self.file_hashes: dict[str, str] = {}
-        self.update_lock = asyncio.Lock()
-        self.processing_files = set()
-        
-        # Initialize file hashes
-        self.llm = llm
-        self.watch_directory = watch_directory
-        self.persist_directory = persist_directory or defines.persist_directory
-        self.collection_name = collection_name
-        self.chunk_size = chunk_size
-        self.chunk_overlap = chunk_overlap
-        
        # Path for storing file hash state
        self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
        
@@ -83,14 +61,16 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        self.update_lock = asyncio.Lock()
        self.processing_files = set()
                
-        # Only scan for new/changed files if we have previous hash state
-        if not self.file_hashes:
-            self._initialize_file_hashes()
-        else:
+        # Always scan for new/changed files at startup
        self._update_file_hashes()
    
+    @property
    def collection(self):
-        return self.collection
+        return self._collection
+    
+    @collection.setter
+    def collection(self, value):
+        self._collection = value
    
    def _save_hash_state(self):
        """Save the current file hash state to disk."""
@@ -158,8 +138,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        # Save the updated state
        self._save_hash_state()
    
-    # ... rest of existing methods ...
-    
    async def process_file_update(self, file_path):
        """Process a file update event."""
        # Skip if already being processed
@@ -324,6 +302,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
                if hash:
                    self.file_hashes[file_path] = hash
        
+        # Save the initialized hash state
+        self._save_hash_state()
+    
    def _get_file_hash(self, file_path):
        """Calculate MD5 hash of a file."""
        try:
@@ -362,8 +343,15 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
        asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
        logging.info(f"File deleted: {file_path}")

-
    def _normalize_embeddings(self, embeddings):
+        """Normalize the embeddings to unit length."""
+        # Handle both single vector and array of vectors
+        if isinstance(embeddings[0], (int, float)):
+            # Single vector
+            norm = np.linalg.norm(embeddings)
+            return [e / norm for e in embeddings] if norm > 0 else embeddings
+        else:
+            # Array of vectors
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            return embeddings / norms

@@ -417,7 +405,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):

 # Function to start the file watcher
 def start_file_watcher(llm, watch_directory, persist_directory=None, 
-                      collection_name="documents", initialize=False, recreate=False):
+                      collection_name="documents", recreate=False):
    """
    Start watching a directory for file changes.
    
@@ -426,7 +414,6 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
        watch_directory: Directory to watch for changes
        persist_directory: Directory to persist ChromaDB and hash state
        collection_name: Name of the ChromaDB collection
-        initialize: Whether to initialize the collection with all documents (only needed first time)
        recreate: Whether to recreate the collection (will delete existing)
    """
    loop = asyncio.get_event_loop()
@@ -440,8 +427,8 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
        recreate=recreate
    )
    
-    # Initialize collection if requested and no existing hash state
-    if initialize and not file_watcher.file_hashes:
+    # Initialize the collection only when no persisted hash state file exists
+    if not os.path.exists(file_watcher.hash_state_path):
        file_watcher.initialize_collection()
    
    # Start observer
@@ -464,7 +451,6 @@ if __name__ == "__main__":
        llm, 
        defines.doc_dir, 
        recreate=True,  # Start fresh
-        initialize=True  # Load all documents initially
    )
    
    # Example query