mid claude rewrite
This commit is contained in:
parent
eb2629bcce
commit
1ad2638277
@ -387,7 +387,6 @@ class WebServer:
|
||||
self.observer, self.file_watcher = Rag.start_file_watcher(
|
||||
llm=client,
|
||||
watch_directory=defines.doc_dir,
|
||||
initialize=True, # Only loads documents if no hash state exists
|
||||
recreate=False # Don't recreate if exists
|
||||
)
|
||||
|
||||
|
@ -44,28 +44,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
||||
self.chunk_overlap = chunk_overlap
|
||||
self.loop = loop
|
||||
|
||||
# Initialize ChromaDB collection
|
||||
self.collection = self._get_vector_collection(recreate=recreate)
|
||||
|
||||
# Setup text splitter
|
||||
self.text_splitter = CharacterTextSplitter(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap
|
||||
)
|
||||
|
||||
# Track file hashes and processing state
|
||||
self.file_hashes: dict[str, str] = {}
|
||||
self.update_lock = asyncio.Lock()
|
||||
self.processing_files = set()
|
||||
|
||||
# Initialize file hashes
|
||||
self.llm = llm
|
||||
self.watch_directory = watch_directory
|
||||
self.persist_directory = persist_directory or defines.persist_directory
|
||||
self.collection_name = collection_name
|
||||
self.chunk_size = chunk_size
|
||||
self.chunk_overlap = chunk_overlap
|
||||
|
||||
# Path for storing file hash state
|
||||
self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
|
||||
|
||||
@ -83,14 +61,16 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
||||
self.update_lock = asyncio.Lock()
|
||||
self.processing_files = set()
|
||||
|
||||
# Only scan for new/changed files if we have previous hash state
|
||||
if not self.file_hashes:
|
||||
self._initialize_file_hashes()
|
||||
else:
|
||||
# Always scan for new/changed files at startup
|
||||
self._update_file_hashes()
|
||||
|
||||
@property
|
||||
def collection(self):
|
||||
return self.collection
|
||||
return self._collection
|
||||
|
||||
@collection.setter
|
||||
def collection(self, value):
|
||||
self._collection = value
|
||||
|
||||
def _save_hash_state(self):
|
||||
"""Save the current file hash state to disk."""
|
||||
@ -158,8 +138,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
||||
# Save the updated state
|
||||
self._save_hash_state()
|
||||
|
||||
# ... rest of existing methods ...
|
||||
|
||||
async def process_file_update(self, file_path):
|
||||
"""Process a file update event."""
|
||||
# Skip if already being processed
|
||||
@ -324,6 +302,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
||||
if hash:
|
||||
self.file_hashes[file_path] = hash
|
||||
|
||||
# Save the initialized hash state
|
||||
self._save_hash_state()
|
||||
|
||||
def _get_file_hash(self, file_path):
|
||||
"""Calculate MD5 hash of a file."""
|
||||
try:
|
||||
@ -362,8 +343,15 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
||||
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
|
||||
logging.info(f"File deleted: {file_path}")
|
||||
|
||||
|
||||
def _normalize_embeddings(self, embeddings):
|
||||
"""Normalize the embeddings to unit length."""
|
||||
# Handle both single vector and array of vectors
|
||||
if isinstance(embeddings[0], (int, float)):
|
||||
# Single vector
|
||||
norm = np.linalg.norm(embeddings)
|
||||
return [e / norm for e in embeddings] if norm > 0 else embeddings
|
||||
else:
|
||||
# Array of vectors
|
||||
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
||||
return embeddings / norms
|
||||
|
||||
@ -417,7 +405,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
||||
|
||||
# Function to start the file watcher
|
||||
def start_file_watcher(llm, watch_directory, persist_directory=None,
|
||||
collection_name="documents", initialize=False, recreate=False):
|
||||
collection_name="documents", recreate=False):
|
||||
"""
|
||||
Start watching a directory for file changes.
|
||||
|
||||
@ -426,7 +414,6 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
|
||||
watch_directory: Directory to watch for changes
|
||||
persist_directory: Directory to persist ChromaDB and hash state
|
||||
collection_name: Name of the ChromaDB collection
|
||||
initialize: Whether to initialize the collection with all documents (only needed first time)
|
||||
recreate: Whether to recreate the collection (will delete existing)
|
||||
"""
|
||||
loop = asyncio.get_event_loop()
|
||||
@ -440,8 +427,8 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
|
||||
recreate=recreate
|
||||
)
|
||||
|
||||
# Initialize collection if requested and no existing hash state
|
||||
if initialize and not file_watcher.file_hashes:
|
||||
# Initialize collection if it does not exist
|
||||
if not os.path.exists(file_watcher.hash_state_path):
|
||||
file_watcher.initialize_collection()
|
||||
|
||||
# Start observer
|
||||
@ -464,7 +451,6 @@ if __name__ == "__main__":
|
||||
llm,
|
||||
defines.doc_dir,
|
||||
recreate=True, # Start fresh
|
||||
initialize=True # Load all documents initially
|
||||
)
|
||||
|
||||
# Example query
|
||||
|
Loading…
x
Reference in New Issue
Block a user