import logging
from datetime import UTC, datetime
from typing import Dict, List, Optional

from ..constants import KEY_PREFIXES
from .protocols import DatabaseProtocol

logger = logging.getLogger(__name__)


class DocumentMixin(DatabaseProtocol):
    """Mixin for document-related database operations."""

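    # Usage sketch (illustrative; the concrete class and variable names are
    # assumptions): a database class mixes this in and supplies the `redis`
    # client plus the `_serialize` / `_deserialize` helpers that
    # DatabaseProtocol is assumed to declare, e.g.
    #
    #     class Database(DocumentMixin):
    #         def __init__(self, redis_client):
    #             self.redis = redis_client
    #
    #     db = Database(redis_client)
    #     docs = await db.get_candidate_documents("cand-1")
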
    async def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata by ID."""
        key = f"document:{document_id}"
        data = await self.redis.get(key)
        return self._deserialize(data) if data else None

    async def set_document(self, document_id: str, document_data: Dict) -> None:
        """Set document metadata."""
        key = f"document:{document_id}"
        await self.redis.set(key, self._serialize(document_data))

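    # Example round trip (a minimal sketch; `db` and the IDs are illustrative):
    #
    #     await db.set_document("doc-1", {"candidate_id": "cand-1", "filename": "cv.pdf"})
    #     doc = await db.get_document("doc-1")  # -> {"candidate_id": "cand-1", ...}
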
    async def delete_document(self, document_id: str) -> None:
        """Delete document metadata."""
        key = f"document:{document_id}"
        await self.redis.delete(key)

    async def delete_all_candidate_documents(self, candidate_id: str) -> int:
        """Delete all documents for a specific candidate and return the count of deleted documents."""
        try:
            # Get all document IDs for this candidate
            key = f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"
            document_ids = await self.redis.lrange(key, 0, -1)  # type: ignore

            if not document_ids:
                logger.info(f"No documents found for candidate {candidate_id}")
                return 0

            deleted_count = 0

            # Use a pipeline so all deletions go out in one round trip
            pipe = self.redis.pipeline()

            # Delete each document's metadata and its job-requirements entry
            for doc_id in document_ids:
                pipe.delete(f"document:{doc_id}")
                pipe.delete(f"{KEY_PREFIXES['job_requirements']}{doc_id}")
                deleted_count += 1

            # Delete the candidate's document list itself
            pipe.delete(key)

            # Execute all queued operations
            await pipe.execute()

            logger.info(f"Successfully deleted {deleted_count} documents for candidate {candidate_id}")
            return deleted_count

        except Exception as e:
            logger.error(f"Error deleting all documents for candidate {candidate_id}: {e}")
            raise

    async def get_candidate_documents(self, candidate_id: str) -> List[Dict]:
        """Get all documents for a specific candidate."""
        key = f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"
        document_ids = await self.redis.lrange(key, 0, -1)  # type: ignore

        if not document_ids:
            return []

        # Get all document metadata in a single round trip
        pipe = self.redis.pipeline()
        for doc_id in document_ids:
            pipe.get(f"document:{doc_id}")
        values = await pipe.execute()

        documents = []
        for doc_id, value in zip(document_ids, values):
            if value:
                doc_data = self._deserialize(value)
                if doc_data:
                    documents.append(doc_data)
            else:
                # Clean up orphaned document IDs whose metadata no longer exists
                await self.redis.lrem(key, 0, doc_id)  # type: ignore
                logger.warning(f"Removed orphaned document ID {doc_id} for candidate {candidate_id}")

        return documents

    async def add_document_to_candidate(self, candidate_id: str, document_id: str) -> None:
        """Add a document ID to a candidate's document list."""
        key = f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"
        await self.redis.rpush(key, document_id)  # type: ignore

    async def remove_document_from_candidate(self, candidate_id: str, document_id: str) -> None:
        """Remove a document ID from a candidate's document list."""
        key = f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"
        await self.redis.lrem(key, 0, document_id)  # type: ignore

    async def update_document(self, document_id: str, updates: Dict) -> Optional[Dict]:
        """Merge updates into a document's metadata; return the updated document, or None if it does not exist."""
        document_data = await self.get_document(document_id)
        if document_data:
            document_data.update(updates)
            await self.set_document(document_id, document_data)
            return document_data
        return None

    async def get_documents_by_rag_status(self, candidate_id: str, include_in_rag: bool = True) -> List[Dict]:
        """Get candidate documents filtered by RAG inclusion status."""
        all_documents = await self.get_candidate_documents(candidate_id)
        return [doc for doc in all_documents if doc.get("include_in_rag", False) == include_in_rag]

    async def bulk_update_document_rag_status(self, candidate_id: str, document_ids: List[str], include_in_rag: bool) -> None:
        """Bulk update RAG status for multiple documents."""
        pipe = self.redis.pipeline()

        for doc_id in document_ids:
            doc_data = await self.get_document(doc_id)
            # Only touch documents that actually belong to this candidate
            if doc_data and doc_data.get("candidate_id") == candidate_id:
                doc_data["include_in_rag"] = include_in_rag
                doc_data["updatedAt"] = datetime.now(UTC).isoformat()
                pipe.set(f"document:{doc_id}", self._serialize(doc_data))

        await pipe.execute()

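    # Illustrative usage (`db` and the IDs are assumptions): flip two documents
    # into the RAG set, then read back only the included ones.
    #
    #     await db.bulk_update_document_rag_status("cand-1", ["doc-1", "doc-2"], True)
    #     included = await db.get_documents_by_rag_status("cand-1", include_in_rag=True)
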
    async def get_document_count_for_candidate(self, candidate_id: str) -> int:
        """Get the total number of documents for a candidate."""
        key = f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"
        return await self.redis.llen(key)  # type: ignore

    async def search_candidate_documents(self, candidate_id: str, query: str) -> List[Dict]:
        """Search a candidate's documents by filename or original name (case-insensitive substring match)."""
        all_documents = await self.get_candidate_documents(candidate_id)
        query_lower = query.lower()

        return [
            doc for doc in all_documents
            if (query_lower in doc.get("filename", "").lower() or
                query_lower in doc.get("originalName", "").lower())
        ]
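
# Key layout assumed by this mixin (illustrative; the real prefix values live in
# ..constants.KEY_PREFIXES and may differ):
#
#     KEY_PREFIXES = {
#         "candidate_documents": "candidate_documents:",  # per-candidate list of document IDs
#         "job_requirements": "job_requirements:",        # per-document job-requirements entry
#     }
#
# Document metadata itself lives under "document:{document_id}" and, per the
# methods above, is expected to carry at least: candidate_id, filename,
# originalName, include_in_rag, and updatedAt.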