# Document-related database operations (Redis-backed), ~144 lines / 6.0 KiB, Python.

from datetime import UTC, datetime
import logging
from typing import Any, Dict, List, Optional
from .protocols import DatabaseProtocol
from ..constants import KEY_PREFIXES
logger = logging.getLogger(__name__)
class DocumentMixin(DatabaseProtocol):
    """Mixin providing document-metadata CRUD and candidate-document-list
    operations on top of an async Redis store.

    The host class is expected to supply (per DatabaseProtocol):
      - ``self.redis``: an async Redis client,
      - ``self._serialize`` / ``self._deserialize``: value (de)serializers.

    Key layout (as used by this mixin):
      - ``document:{document_id}`` -> serialized document metadata dict
      - ``KEY_PREFIXES['candidate_documents']{candidate_id}`` -> list of document IDs
      - ``KEY_PREFIXES['job_requirements']{document_id}`` -> per-document job data
    """

    # --- key builders (single source of truth for key formats) ---

    @staticmethod
    def _document_key(document_id: str) -> str:
        """Redis key under which a document's metadata is stored."""
        return f"document:{document_id}"

    @staticmethod
    def _candidate_documents_key(candidate_id: str) -> str:
        """Redis key of a candidate's document-ID list."""
        return f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"

    async def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata by ID.

        Returns:
            The deserialized metadata dict, or None if the key is absent.
        """
        data = await self.redis.get(self._document_key(document_id))
        return self._deserialize(data) if data else None

    async def set_document(self, document_id: str, document_data: Dict):
        """Set (create or overwrite) document metadata."""
        await self.redis.set(self._document_key(document_id), self._serialize(document_data))

    async def delete_document(self, document_id: str):
        """Delete document metadata (no-op if the key does not exist)."""
        await self.redis.delete(self._document_key(document_id))

    async def delete_all_candidate_documents(self, candidate_id: str) -> int:
        """Delete all documents for a specific candidate and return count of deleted documents.

        Removes each document's metadata key and its associated job-requirements
        key, then the candidate's document list itself, all in one pipeline.

        Returns:
            Number of document IDs that were in the candidate's list (keys that
            were already missing are still counted, mirroring list length).

        Raises:
            Re-raises any Redis error after logging it.
        """
        try:
            key = self._candidate_documents_key(candidate_id)
            document_ids = await self.redis.lrange(key, 0, -1)  # type: ignore
            if not document_ids:
                logger.info(f"No documents found for candidate {candidate_id}")
                return 0
            deleted_count = 0
            # Use pipeline for efficient batch operations
            pipe = self.redis.pipeline()
            for doc_id in document_ids:
                pipe.delete(self._document_key(doc_id))
                # Each document may have associated job-requirements data.
                pipe.delete(f"{KEY_PREFIXES['job_requirements']}{doc_id}")
                deleted_count += 1
            # Delete the candidate's document list last.
            pipe.delete(key)
            await pipe.execute()
            logger.info(f"Successfully deleted {deleted_count} documents for candidate {candidate_id}")
            return deleted_count
        except Exception as e:
            logger.error(f"Error deleting all documents for candidate {candidate_id}: {e}")
            raise

    async def get_candidate_documents(self, candidate_id: str) -> List[Dict]:
        """Get all documents for a specific candidate.

        Document IDs whose metadata key no longer exists are treated as
        orphans and removed from the candidate's list (batched in a single
        pipeline rather than one round trip per orphan).

        Returns:
            List of metadata dicts in the order stored in the candidate list.
        """
        key = self._candidate_documents_key(candidate_id)
        document_ids = await self.redis.lrange(key, 0, -1)  # type: ignore
        if not document_ids:
            return []
        # Batch-fetch all document metadata in one round trip.
        pipe = self.redis.pipeline()
        for doc_id in document_ids:
            pipe.get(self._document_key(doc_id))
        values = await pipe.execute()
        documents: List[Dict] = []
        orphaned: List[str] = []
        for doc_id, value in zip(document_ids, values):
            if value:
                doc_data = self._deserialize(value)
                if doc_data:
                    documents.append(doc_data)
            else:
                orphaned.append(doc_id)
        if orphaned:
            # Clean up orphaned document IDs in a single pipeline.
            cleanup = self.redis.pipeline()
            for doc_id in orphaned:
                cleanup.lrem(key, 0, doc_id)  # type: ignore
                logger.warning(f"Removed orphaned document ID {doc_id} for candidate {candidate_id}")
            await cleanup.execute()
        return documents

    async def add_document_to_candidate(self, candidate_id: str, document_id: str):
        """Append a document ID to the tail of a candidate's document list."""
        key = self._candidate_documents_key(candidate_id)
        await self.redis.rpush(key, document_id)  # type: ignore

    async def remove_document_from_candidate(self, candidate_id: str, document_id: str):
        """Remove all occurrences of a document ID from a candidate's document list."""
        key = self._candidate_documents_key(candidate_id)
        await self.redis.lrem(key, 0, document_id)  # type: ignore

    async def update_document(self, document_id: str, updates: Dict) -> Dict[Any, Any] | None:
        """Merge ``updates`` into existing document metadata and persist it.

        Returns:
            The updated metadata dict, or None if the document does not exist.

        NOTE(review): read-modify-write is not atomic; concurrent updates to
        the same document can race — confirm callers serialize updates.
        """
        document_data = await self.get_document(document_id)
        if document_data:
            document_data.update(updates)
            await self.set_document(document_id, document_data)
            return document_data
        return None

    async def get_documents_by_rag_status(self, candidate_id: str, include_in_rag: bool = True) -> List[Dict]:
        """Get candidate documents filtered by RAG inclusion status.

        Documents missing the ``include_in_rag`` field are treated as False.
        """
        all_documents = await self.get_candidate_documents(candidate_id)
        return [doc for doc in all_documents if doc.get("include_in_rag", False) == include_in_rag]

    async def bulk_update_document_rag_status(self, candidate_id: str, document_ids: List[str], include_in_rag: bool):
        """Bulk update RAG status for multiple documents.

        Only documents that exist and whose ``candidate_id`` field matches are
        updated; each updated document gets a fresh ``updatedAt`` timestamp.
        """
        # Batch-read all requested documents in one round trip instead of one
        # sequential awaited GET per document (the original N+1 pattern).
        read_pipe = self.redis.pipeline()
        for doc_id in document_ids:
            read_pipe.get(self._document_key(doc_id))
        values = await read_pipe.execute() if document_ids else []
        pipe = self.redis.pipeline()
        for doc_id, value in zip(document_ids, values):
            doc_data = self._deserialize(value) if value else None
            # Ownership check: never flip RAG status on another candidate's doc.
            if doc_data and doc_data.get("candidate_id") == candidate_id:
                doc_data["include_in_rag"] = include_in_rag
                doc_data["updatedAt"] = datetime.now(UTC).isoformat()
                pipe.set(self._document_key(doc_id), self._serialize(doc_data))
        await pipe.execute()

    async def get_document_count_for_candidate(self, candidate_id: str) -> int:
        """Get total number of documents (list length) for a candidate."""
        key = self._candidate_documents_key(candidate_id)
        return await self.redis.llen(key)  # type: ignore

    async def search_candidate_documents(self, candidate_id: str, query: str) -> List[Dict]:
        """Search documents by filename for a candidate.

        Case-insensitive substring match against ``filename`` or
        ``originalName``.
        """
        all_documents = await self.get_candidate_documents(candidate_id)
        query_lower = query.lower()
        return [
            doc for doc in all_documents
            if (query_lower in doc.get("filename", "").lower() or
                query_lower in doc.get("originalName", "").lower())
        ]