mid claude rewrite
This commit is contained in:
parent
eb2629bcce
commit
1ad2638277
@ -387,7 +387,6 @@ class WebServer:
|
|||||||
self.observer, self.file_watcher = Rag.start_file_watcher(
|
self.observer, self.file_watcher = Rag.start_file_watcher(
|
||||||
llm=client,
|
llm=client,
|
||||||
watch_directory=defines.doc_dir,
|
watch_directory=defines.doc_dir,
|
||||||
initialize=True, # Only loads documents if no hash state exists
|
|
||||||
recreate=False # Don't recreate if exists
|
recreate=False # Don't recreate if exists
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -43,28 +43,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
self.chunk_size = chunk_size
|
self.chunk_size = chunk_size
|
||||||
self.chunk_overlap = chunk_overlap
|
self.chunk_overlap = chunk_overlap
|
||||||
self.loop = loop
|
self.loop = loop
|
||||||
|
|
||||||
# Initialize ChromaDB collection
|
|
||||||
self.collection = self._get_vector_collection(recreate=recreate)
|
|
||||||
|
|
||||||
# Setup text splitter
|
|
||||||
self.text_splitter = CharacterTextSplitter(
|
|
||||||
chunk_size=chunk_size,
|
|
||||||
chunk_overlap=chunk_overlap
|
|
||||||
)
|
|
||||||
|
|
||||||
# Track file hashes and processing state
|
|
||||||
self.file_hashes: dict[str, str] = {}
|
|
||||||
self.update_lock = asyncio.Lock()
|
|
||||||
self.processing_files = set()
|
|
||||||
|
|
||||||
# Initialize file hashes
|
|
||||||
self.llm = llm
|
|
||||||
self.watch_directory = watch_directory
|
|
||||||
self.persist_directory = persist_directory or defines.persist_directory
|
|
||||||
self.collection_name = collection_name
|
|
||||||
self.chunk_size = chunk_size
|
|
||||||
self.chunk_overlap = chunk_overlap
|
|
||||||
|
|
||||||
# Path for storing file hash state
|
# Path for storing file hash state
|
||||||
self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
|
self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
|
||||||
@ -82,15 +60,17 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
self.file_hashes = self._load_hash_state()
|
self.file_hashes = self._load_hash_state()
|
||||||
self.update_lock = asyncio.Lock()
|
self.update_lock = asyncio.Lock()
|
||||||
self.processing_files = set()
|
self.processing_files = set()
|
||||||
|
|
||||||
# Only scan for new/changed files if we have previous hash state
|
# Always scan for new/changed files at startup
|
||||||
if not self.file_hashes:
|
self._update_file_hashes()
|
||||||
self._initialize_file_hashes()
|
|
||||||
else:
|
|
||||||
self._update_file_hashes()
|
|
||||||
|
|
||||||
|
@property
|
||||||
def collection(self):
|
def collection(self):
|
||||||
return self.collection
|
return self._collection
|
||||||
|
|
||||||
|
@collection.setter
|
||||||
|
def collection(self, value):
|
||||||
|
self._collection = value
|
||||||
|
|
||||||
def _save_hash_state(self):
|
def _save_hash_state(self):
|
||||||
"""Save the current file hash state to disk."""
|
"""Save the current file hash state to disk."""
|
||||||
@ -158,8 +138,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
# Save the updated state
|
# Save the updated state
|
||||||
self._save_hash_state()
|
self._save_hash_state()
|
||||||
|
|
||||||
# ... rest of existing methods ...
|
|
||||||
|
|
||||||
async def process_file_update(self, file_path):
|
async def process_file_update(self, file_path):
|
||||||
"""Process a file update event."""
|
"""Process a file update event."""
|
||||||
# Skip if already being processed
|
# Skip if already being processed
|
||||||
@ -323,6 +301,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
hash = self._get_file_hash(file_path)
|
hash = self._get_file_hash(file_path)
|
||||||
if hash:
|
if hash:
|
||||||
self.file_hashes[file_path] = hash
|
self.file_hashes[file_path] = hash
|
||||||
|
|
||||||
|
# Save the initialized hash state
|
||||||
|
self._save_hash_state()
|
||||||
|
|
||||||
def _get_file_hash(self, file_path):
|
def _get_file_hash(self, file_path):
|
||||||
"""Calculate MD5 hash of a file."""
|
"""Calculate MD5 hash of a file."""
|
||||||
@ -362,10 +343,17 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
|
asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
|
||||||
logging.info(f"File deleted: {file_path}")
|
logging.info(f"File deleted: {file_path}")
|
||||||
|
|
||||||
|
|
||||||
def _normalize_embeddings(self, embeddings):
|
def _normalize_embeddings(self, embeddings):
|
||||||
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
"""Normalize the embeddings to unit length."""
|
||||||
return embeddings / norms
|
# Handle both single vector and array of vectors
|
||||||
|
if isinstance(embeddings[0], (int, float)):
|
||||||
|
# Single vector
|
||||||
|
norm = np.linalg.norm(embeddings)
|
||||||
|
return [e / norm for e in embeddings] if norm > 0 else embeddings
|
||||||
|
else:
|
||||||
|
# Array of vectors
|
||||||
|
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
||||||
|
return embeddings / norms
|
||||||
|
|
||||||
async def _update_document_in_collection(self, file_path):
|
async def _update_document_in_collection(self, file_path):
|
||||||
"""Update a document in the ChromaDB collection."""
|
"""Update a document in the ChromaDB collection."""
|
||||||
@ -417,7 +405,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
|
|||||||
|
|
||||||
# Function to start the file watcher
|
# Function to start the file watcher
|
||||||
def start_file_watcher(llm, watch_directory, persist_directory=None,
|
def start_file_watcher(llm, watch_directory, persist_directory=None,
|
||||||
collection_name="documents", initialize=False, recreate=False):
|
collection_name="documents", recreate=False):
|
||||||
"""
|
"""
|
||||||
Start watching a directory for file changes.
|
Start watching a directory for file changes.
|
||||||
|
|
||||||
@ -426,7 +414,6 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
|
|||||||
watch_directory: Directory to watch for changes
|
watch_directory: Directory to watch for changes
|
||||||
persist_directory: Directory to persist ChromaDB and hash state
|
persist_directory: Directory to persist ChromaDB and hash state
|
||||||
collection_name: Name of the ChromaDB collection
|
collection_name: Name of the ChromaDB collection
|
||||||
initialize: Whether to initialize the collection with all documents (only needed first time)
|
|
||||||
recreate: Whether to recreate the collection (will delete existing)
|
recreate: Whether to recreate the collection (will delete existing)
|
||||||
"""
|
"""
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
@ -440,8 +427,8 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
|
|||||||
recreate=recreate
|
recreate=recreate
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize collection if requested and no existing hash state
|
# Initialize collection if no saved hash state file exists yet
|
||||||
if initialize and not file_watcher.file_hashes:
|
if not os.path.exists(file_watcher.hash_state_path):
|
||||||
file_watcher.initialize_collection()
|
file_watcher.initialize_collection()
|
||||||
|
|
||||||
# Start observer
|
# Start observer
|
||||||
@ -464,7 +451,6 @@ if __name__ == "__main__":
|
|||||||
llm,
|
llm,
|
||||||
defines.doc_dir,
|
defines.doc_dir,
|
||||||
recreate=True, # Start fresh
|
recreate=True, # Start fresh
|
||||||
initialize=True # Load all documents initially
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Example query
|
# Example query
|
||||||
|
Loading…
x
Reference in New Issue
Block a user