# Document-related database operations (Redis-backed), ~144 lines / 6.0 KiB, Python.

from datetime import UTC, datetime
import logging
from typing import Any, Dict, List, Optional
from .protocols import DatabaseProtocol
from ..constants import KEY_PREFIXES
logger = logging.getLogger(__name__)
class DocumentMixin(DatabaseProtocol):
    """Mixin providing document-metadata CRUD and candidate-document-list
    operations on top of an async Redis store.

    The host class is expected to supply (per DatabaseProtocol):
      - ``self.redis``: an async Redis client,
      - ``self._serialize`` / ``self._deserialize``: value (de)serializers.

    Key layout (as used by this mixin):
      - ``document:{document_id}`` -> serialized document metadata dict
      - ``KEY_PREFIXES['candidate_documents']{candidate_id}`` -> list of document IDs
      - ``KEY_PREFIXES['job_requirements']{document_id}`` -> per-document job data
    """

    # --- key builders (single source of truth for key formats) ---

    @staticmethod
    def _document_key(document_id: str) -> str:
        """Redis key under which a document's metadata is stored."""
        return f"document:{document_id}"

    @staticmethod
    def _candidate_documents_key(candidate_id: str) -> str:
        """Redis key of a candidate's document-ID list."""
        return f"{KEY_PREFIXES['candidate_documents']}{candidate_id}"

    async def get_document(self, document_id: str) -> Optional[Dict]:
        """Get document metadata by ID.

        Returns:
            The deserialized metadata dict, or None if the key is absent.
        """
        data = await self.redis.get(self._document_key(document_id))
        return self._deserialize(data) if data else None

    async def set_document(self, document_id: str, document_data: Dict):
        """Set (create or overwrite) document metadata."""
        await self.redis.set(self._document_key(document_id), self._serialize(document_data))

    async def delete_document(self, document_id: str):
        """Delete document metadata (no-op if the key does not exist)."""
        await self.redis.delete(self._document_key(document_id))

    async def delete_all_candidate_documents(self, candidate_id: str) -> int:
        """Delete all documents for a specific candidate and return count of deleted documents.

        Removes each document's metadata key and its associated job-requirements
        key, then the candidate's document list itself, all in one pipeline.

        Returns:
            Number of document IDs that were in the candidate's list (keys that
            were already missing are still counted, mirroring list length).

        Raises:
            Re-raises any Redis error after logging it.
        """
        try:
            key = self._candidate_documents_key(candidate_id)
            document_ids = await self.redis.lrange(key, 0, -1)  # type: ignore
            if not document_ids:
                logger.info(f"No documents found for candidate {candidate_id}")
                return 0
            deleted_count = 0
            # Use pipeline for efficient batch operations
            pipe = self.redis.pipeline()
            for doc_id in document_ids:
                pipe.delete(self._document_key(doc_id))
                # Each document may have associated job-requirements data.
                pipe.delete(f"{KEY_PREFIXES['job_requirements']}{doc_id}")
                deleted_count += 1
            # Delete the candidate's document list last.
            pipe.delete(key)
            await pipe.execute()
            logger.info(f"Successfully deleted {deleted_count} documents for candidate {candidate_id}")
            return deleted_count
        except Exception as e:
            logger.error(f"Error deleting all documents for candidate {candidate_id}: {e}")
            raise

    async def get_candidate_documents(self, candidate_id: str) -> List[Dict]:
        """Get all documents for a specific candidate.

        Document IDs whose metadata key no longer exists are treated as
        orphans and removed from the candidate's list (batched in a single
        pipeline rather than one round trip per orphan).

        Returns:
            List of metadata dicts in the order stored in the candidate list.
        """
        key = self._candidate_documents_key(candidate_id)
        document_ids = await self.redis.lrange(key, 0, -1)  # type: ignore
        if not document_ids:
            return []
        # Batch-fetch all document metadata in one round trip.
        pipe = self.redis.pipeline()
        for doc_id in document_ids:
            pipe.get(self._document_key(doc_id))
        values = await pipe.execute()
        documents: List[Dict] = []
        orphaned: List[str] = []
        for doc_id, value in zip(document_ids, values):
            if value:
                doc_data = self._deserialize(value)
                if doc_data:
                    documents.append(doc_data)
            else:
                orphaned.append(doc_id)
        if orphaned:
            # Clean up orphaned document IDs in a single pipeline.
            cleanup = self.redis.pipeline()
            for doc_id in orphaned:
                cleanup.lrem(key, 0, doc_id)  # type: ignore
                logger.warning(f"Removed orphaned document ID {doc_id} for candidate {candidate_id}")
            await cleanup.execute()
        return documents

    async def add_document_to_candidate(self, candidate_id: str, document_id: str):
        """Append a document ID to the tail of a candidate's document list."""
        key = self._candidate_documents_key(candidate_id)
        await self.redis.rpush(key, document_id)  # type: ignore

    async def remove_document_from_candidate(self, candidate_id: str, document_id: str):
        """Remove all occurrences of a document ID from a candidate's document list."""
        key = self._candidate_documents_key(candidate_id)
        await self.redis.lrem(key, 0, document_id)  # type: ignore

    async def update_document(self, document_id: str, updates: Dict) -> Dict[Any, Any] | None:
        """Merge ``updates`` into existing document metadata and persist it.

        Returns:
            The updated metadata dict, or None if the document does not exist.

        NOTE(review): read-modify-write is not atomic; concurrent updates to
        the same document can race — confirm callers serialize updates.
        """
        document_data = await self.get_document(document_id)
        if document_data:
            document_data.update(updates)
            await self.set_document(document_id, document_data)
            return document_data
        return None

    async def get_documents_by_rag_status(self, candidate_id: str, include_in_rag: bool = True) -> List[Dict]:
        """Get candidate documents filtered by RAG inclusion status.

        Documents missing the ``include_in_rag`` field are treated as False.
        """
        all_documents = await self.get_candidate_documents(candidate_id)
        return [doc for doc in all_documents if doc.get("include_in_rag", False) == include_in_rag]

    async def bulk_update_document_rag_status(self, candidate_id: str, document_ids: List[str], include_in_rag: bool):
        """Bulk update RAG status for multiple documents.

        Only documents that exist and whose ``candidate_id`` field matches are
        updated; each updated document gets a fresh ``updatedAt`` timestamp.
        """
        # Batch-read all requested documents in one round trip instead of one
        # sequential awaited GET per document (the original N+1 pattern).
        read_pipe = self.redis.pipeline()
        for doc_id in document_ids:
            read_pipe.get(self._document_key(doc_id))
        values = await read_pipe.execute() if document_ids else []
        pipe = self.redis.pipeline()
        for doc_id, value in zip(document_ids, values):
            doc_data = self._deserialize(value) if value else None
            # Ownership check: never flip RAG status on another candidate's doc.
            if doc_data and doc_data.get("candidate_id") == candidate_id:
                doc_data["include_in_rag"] = include_in_rag
                doc_data["updatedAt"] = datetime.now(UTC).isoformat()
                pipe.set(self._document_key(doc_id), self._serialize(doc_data))
        await pipe.execute()

    async def get_document_count_for_candidate(self, candidate_id: str) -> int:
        """Get total number of documents (list length) for a candidate."""
        key = self._candidate_documents_key(candidate_id)
        return await self.redis.llen(key)  # type: ignore

    async def search_candidate_documents(self, candidate_id: str, query: str) -> List[Dict]:
        """Search documents by filename for a candidate.

        Case-insensitive substring match against ``filename`` or
        ``originalName``.
        """
        all_documents = await self.get_candidate_documents(candidate_id)
        query_lower = query.lower()
        return [
            doc for doc in all_documents
            if (query_lower in doc.get("filename", "").lower() or
                query_lower in doc.get("originalName", "").lower())
        ]