Type-annotation pass over the RAG file watcher.

- resume.py: mark the pydantic import with "# type: ignore".
- rag.py: add the ChromaDBGetResponse pydantic model, annotate the
  ChromaDBFileWatcher collection/UMAP attributes and properties, and
  remove the pickle and re imports.

Review notes: the pydantic import in rag.py must not end in a trailing
comma (a bare "from x import y," is a SyntaxError), and the UMAP
embedding attributes annotated as np.ndarray start out as empty arrays
rather than lists so the value matches the annotation.

diff --git a/src/utils/agents/resume.py b/src/utils/agents/resume.py
index aa9b929..49279b5 100644
--- a/src/utils/agents/resume.py
+++ b/src/utils/agents/resume.py
@@ -1,4 +1,4 @@
-from pydantic import BaseModel, Field, model_validator, PrivateAttr
+from pydantic import BaseModel, Field, model_validator, PrivateAttr # type: ignore
 from typing import Literal, TypeAlias, get_args, List, Generator, Iterator, AsyncGenerator, TYPE_CHECKING, Optional, ClassVar
 from typing_extensions import Annotated
 from abc import ABC, abstractmethod
diff --git a/src/utils/rag.py b/src/utils/rag.py
index 2a1cb9c..8bcfd9b 100644
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -1,4 +1,5 @@
-from typing import List
+from pydantic import BaseModel # type: ignore
+from typing import List, Optional, Dict, Any
 import os
 import glob
 from pathlib import Path
@@ -12,9 +13,7 @@ import time
 import hashlib
 import asyncio
 import json
-import pickle
 import numpy as np # type: ignore
-import re
 
 import chromadb
 import ollama
@@ -25,6 +24,7 @@ from watchdog.observers import Observer # type: ignore
 from watchdog.events import FileSystemEventHandler # type: ignore
 import umap # type: ignore
 from markitdown import MarkItDown # type: ignore
+from chromadb.api.models.Collection import Collection # type: ignore
 
 # Import your existing modules
 if __name__ == "__main__":
@@ -42,6 +42,12 @@ __all__ = [
 
 DEFAULT_CHUNK_SIZE=750
 DEFAULT_CHUNK_OVERLAP=100
+class ChromaDBGetResponse(BaseModel):
+    ids: List[str]
+    embeddings: Optional[List[List[float]]] = None
+    documents: Optional[List[str]] = None
+    metadatas: Optional[List[Dict[str, Any]]] = None
+
 
 class ChromaDBFileWatcher(FileSystemEventHandler):
     def __init__(self, llm, watch_directory, loop, persist_directory=None, collection_name="documents", chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP, recreate=False):
@@ -52,9 +58,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.loop = loop
-        self._umap_collection = None
-        self._umap_embedding_2d : List[int]= []
-        self._umap_embedding_3d = []
+        self._umap_collection : ChromaDBGetResponse | None = None
+        self._umap_embedding_2d : np.ndarray = np.array([])
+        self._umap_embedding_3d : np.ndarray = np.array([])
         self._umap_model_2d : umap.UMAP = None
         self._umap_model_3d : umap.UMAP = None
         self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
@@ -68,7 +74,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self.is_new_collection = False
 
         # Initialize ChromaDB collection
-        self._collection = self._get_vector_collection(recreate=recreate)
+        self._collection : Collection = self._get_vector_collection(recreate=recreate)
         self._update_umaps()
 
         # Setup text splitter
@@ -89,15 +95,15 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         return self._collection
 
     @property
-    def umap_collection(self):
+    def umap_collection(self) -> ChromaDBGetResponse | None:
         return self._umap_collection
 
     @property
-    def umap_embedding_2d(self) -> List[int]:
+    def umap_embedding_2d(self) -> np.ndarray:
         return self._umap_embedding_2d
 
     @property
-    def umap_embedding_3d(self) -> List[int]:
+    def umap_embedding_3d(self) -> np.ndarray:
         return self._umap_embedding_3d
 
     @property
@@ -285,7 +291,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self._umap_embedding_3d = self._umap_model_3d.fit_transform(vectors)
         logging.info(f"3D UMAP model n_components: {self._umap_model_3d.n_components}") # Should be 3
 
-    def _get_vector_collection(self, recreate=False):
+    def _get_vector_collection(self, recreate=False) -> Collection:
         """Get or create a ChromaDB collection."""
         # Initialize ChromaDB client
         chroma_client = chromadb.PersistentClient( # type: ignore