No more pydantic errors

This commit is contained in:
James Ketr 2025-05-02 16:53:20 -07:00
parent 3fe2cfd9ef
commit 2bc7451dd9
2 changed files with 18 additions and 12 deletions

View File

@ -1,4 +1,4 @@
from pydantic import BaseModel, Field, model_validator, PrivateAttr from pydantic import BaseModel, Field, model_validator, PrivateAttr # type: ignore
from typing import Literal, TypeAlias, get_args, List, Generator, Iterator, AsyncGenerator, TYPE_CHECKING, Optional, ClassVar from typing import Literal, TypeAlias, get_args, List, Generator, Iterator, AsyncGenerator, TYPE_CHECKING, Optional, ClassVar
from typing_extensions import Annotated from typing_extensions import Annotated
from abc import ABC, abstractmethod from abc import ABC, abstractmethod

View File

@ -1,4 +1,5 @@
from typing import List from pydantic import BaseModel, # type: ignore
from typing import List, Optional, Dict, Any
import os import os
import glob import glob
from pathlib import Path from pathlib import Path
@ -12,9 +13,7 @@ import time
import hashlib import hashlib
import asyncio import asyncio
import json import json
import pickle
import numpy as np # type: ignore import numpy as np # type: ignore
import re
import chromadb import chromadb
import ollama import ollama
@ -25,6 +24,7 @@ from watchdog.observers import Observer # type: ignore
from watchdog.events import FileSystemEventHandler # type: ignore from watchdog.events import FileSystemEventHandler # type: ignore
import umap # type: ignore import umap # type: ignore
from markitdown import MarkItDown # type: ignore from markitdown import MarkItDown # type: ignore
from chromadb.api.models.Collection import Collection # type: ignore
# Import your existing modules # Import your existing modules
if __name__ == "__main__": if __name__ == "__main__":
@ -42,6 +42,12 @@ __all__ = [
DEFAULT_CHUNK_SIZE=750 DEFAULT_CHUNK_SIZE=750
DEFAULT_CHUNK_OVERLAP=100 DEFAULT_CHUNK_OVERLAP=100
class ChromaDBGetResponse(BaseModel):
    """Validated container for a set of ChromaDB records.

    Only ``ids`` is required; every other field defaults to ``None``
    when it is not supplied.

    NOTE(review): presumably mirrors the payload returned by a ChromaDB
    ``Collection.get()`` call -- confirm against the code that builds it.
    """

    # Record identifiers; the only mandatory field.
    ids: list[str]
    # One float vector per record, when embeddings are provided.
    embeddings: list[list[float]] | None = None
    # One text entry per record, when documents are provided.
    documents: list[str] | None = None
    # One free-form string-keyed mapping per record, when provided.
    metadatas: list[dict[str, Any]] | None = None
class ChromaDBFileWatcher(FileSystemEventHandler): class ChromaDBFileWatcher(FileSystemEventHandler):
def __init__(self, llm, watch_directory, loop, persist_directory=None, collection_name="documents", def __init__(self, llm, watch_directory, loop, persist_directory=None, collection_name="documents",
chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP, recreate=False): chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP, recreate=False):
@ -52,9 +58,9 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap self.chunk_overlap = chunk_overlap
self.loop = loop self.loop = loop
self._umap_collection = None self._umap_collection : ChromaDBGetResponse | None = None
self._umap_embedding_2d : List[int]= [] self._umap_embedding_2d : np.ndarray = []
self._umap_embedding_3d = [] self._umap_embedding_3d : np.ndarray = []
self._umap_model_2d : umap.UMAP = None self._umap_model_2d : umap.UMAP = None
self._umap_model_3d : umap.UMAP = None self._umap_model_3d : umap.UMAP = None
self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins self.md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
@ -68,7 +74,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self.is_new_collection = False self.is_new_collection = False
# Initialize ChromaDB collection # Initialize ChromaDB collection
self._collection = self._get_vector_collection(recreate=recreate) self._collection : Collection = self._get_vector_collection(recreate=recreate)
self._update_umaps() self._update_umaps()
# Setup text splitter # Setup text splitter
@ -89,15 +95,15 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
return self._collection return self._collection
@property @property
def umap_collection(self): def umap_collection(self) -> ChromaDBGetResponse | None:
return self._umap_collection return self._umap_collection
@property @property
def umap_embedding_2d(self) -> List[int]: def umap_embedding_2d(self) -> np.ndarray:
return self._umap_embedding_2d return self._umap_embedding_2d
@property @property
def umap_embedding_3d(self) -> List[int]: def umap_embedding_3d(self) -> np.ndarray:
return self._umap_embedding_3d return self._umap_embedding_3d
@property @property
@ -285,7 +291,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
self._umap_embedding_3d = self._umap_model_3d.fit_transform(vectors) self._umap_embedding_3d = self._umap_model_3d.fit_transform(vectors)
logging.info(f"3D UMAP model n_components: {self._umap_model_3d.n_components}") # Should be 3 logging.info(f"3D UMAP model n_components: {self._umap_model_3d.n_components}") # Should be 3
def _get_vector_collection(self, recreate=False): def _get_vector_collection(self, recreate=False) -> Collection:
"""Get or create a ChromaDB collection.""" """Get or create a ChromaDB collection."""
# Initialize ChromaDB client # Initialize ChromaDB client
chroma_client = chromadb.PersistentClient( # type: ignore chroma_client = chromadb.PersistentClient( # type: ignore