From a1798b58acaae1238d2e488e95f808fdec760a58 Mon Sep 17 00:00:00 2001
From: James Ketrenos <james_git@ketrenos.com>
Date: Mon, 12 May 2025 16:57:20 -0700
Subject: [PATCH] Switching to one-call per skill

---
 frontend/public/docs/resume-generation.md |  88 +++++++++--------
 src/utils/agents/job_description.py       | 110 ++++++----------------
 src/utils/rag.py                          |   5 +
 3 files changed, 78 insertions(+), 125 deletions(-)

diff --git a/frontend/public/docs/resume-generation.md b/frontend/public/docs/resume-generation.md
index 426b8b2..c93217f 100644
--- a/frontend/public/docs/resume-generation.md
+++ b/frontend/public/docs/resume-generation.md
@@ -4,13 +4,12 @@ The system follows a carefully designed pipeline with isolated stages to prevent
 
 The system uses a pipeline of isolated analysis and generation steps:
 
-1. **Stage 1: Isolated Analysis** (three sub-stages)
+1. **Stage 1: Isolated Analysis**
    - **1A: Job Analysis** - Extracts requirements from job description only
-   - **1B: Candidate Analysis** - Catalogs qualifications from resume/context only
-   - **1C: Mapping Analysis** - Identifies legitimate matches between requirements and qualifications
+   - **1B: Skill-Based Assessment** - For each required skill, determine an Individual Skill Assessment, adding it to a Skill Assessments Collection.
 
 2. **Stage 2: Resume Generation**
-   - Uses mapping output to create a tailored resume with evidence-based content
+   - Uses the Skill Assessments Collection to generate a tailored resume.
 
 3. **Stage 3: Verification**
    - Performs fact-checking to catch any remaining fabrications
@@ -23,63 +22,62 @@ flowchart TD
             A2 --> A3[Job Requirements JSON]
         end
 
-        subgraph "Stage 1B: Candidate Analysis"
-            B1[Resume Input] --> B5[Candidate Analysis LLM]
-            B5 --> B4[Candidate Qualifications JSON]
-            B2[Candidate Info] --> B3[RAG]
-            B3[RAG] --> B2[Candidate Info]
-            A3[Job Requirements JSON] --> B3[RAG]
-            B3[RAG] --> B5
-        end
-        
-        subgraph "Stage 1C: Mapping Analysis"
-            C1[Job Requirements JSON] --> C3[Mapping Analysis LLM]
-            C2[Candidate Qualifications JSON] --> C3
-            C3 --> C4[Skills Mapping JSON]
+        subgraph "Stage 1B: Skill-Based Assessment"
+            B1[Resume Input] --> B2[Candidate Info]
+            B2 --> B3[RAG System]
+            A3 --> B4[Skill Assessment Generator]
+            B3 --> B4
+            B4 --> B5{For Each Required Skill}
+            B5 --> B6[Skill-Focused LLM Query]
+            B6 --> B7[Individual Skill Assessment]
+            B7 --> B8[Skill Assessments Collection]
         end
     end
     
     subgraph "Stage 2: Resume Generation"
-        D1[Skills Mapping JSON] --> D3[Resume Generation LLM]
-        D2[Original Resume Reference] --> D3
-        D3 --> D4[Tailored Resume Draft]
+        C1[Skill Assessments Collection] --> C2[Resume Generator]
+        C3[Original Resume Reference] --> C2
+        C4[Candidate Information] --> C2
+        C2 --> C5[Resume Generation Prompt]
+        C5 --> C6[Resume Generation LLM]
+        C6 --> C7[Tailored Resume Draft]
     end
     
-    subgraph "Stage 3: Verification"
-        E1[Skills Mapping JSON] --> E2[Original Materials]
-        E2 --> E3[Tailored Resume Draft]
-        E3 --> E4[Verification LLM]
-        E4 --> E5{Verification Check}
-        E5 -->|PASS| E6[Approved Resume]
-        E5 -->|FAIL| E7[Correction Instructions]
-        E7 --> D3
+    subgraph "Stage 3: Statistics & Verification"
+        D1[Job Requirements JSON] --> D2[Match Statistics Calculator]
+        D3[Skill Assessments Collection] --> D2
+        D2 --> D4[Match Statistics]
+        D4 --> D5[Verification LLM]
+        C7 --> D5
+        D5 --> D6{Verification Check}
+        D6 -->|PASS| D7[Approved Resume]
+        D6 -->|FAIL| D8[Correction Instructions]
+        D8 --> C2
     end
     
-    A3 --> C1
-    B4 --> C2
-    C4 --> D1
-    C4 --> E1
-    D4 --> E3
+    A3 --> B4
+    B8 --> C1
+    B8 --> D3
+    B1 --> C3
     
     style A2 fill:#f9d77e,stroke:#333,stroke-width:2px
-    style B5 fill:#f9d77e,stroke:#333,stroke-width:2px
-    style C3 fill:#f9d77e,stroke:#333,stroke-width:2px
-    style D3 fill:#f9d77e,stroke:#333,stroke-width:2px
-    style E4 fill:#f9d77e,stroke:#333,stroke-width:2px
-    style E5 fill:#a3e4d7,stroke:#333,stroke-width:2px
-    style E6 fill:#aed6f1,stroke:#333,stroke-width:2px
-    style E7 fill:#f5b7b1,stroke:#333,stroke-width:2px
+    style B6 fill:#f9d77e,stroke:#333,stroke-width:2px
+    style C6 fill:#f9d77e,stroke:#333,stroke-width:2px
+    style D5 fill:#f9d77e,stroke:#333,stroke-width:2px
+    style B5 fill:#a3e4d7,stroke:#333,stroke-width:2px
+    style D6 fill:#a3e4d7,stroke:#333,stroke-width:2px
+    style D7 fill:#aed6f1,stroke:#333,stroke-width:2px
+    style D8 fill:#f5b7b1,stroke:#333,stroke-width:2px
 ```
 
-## Stage 1: Isolated Analysis (three separate sub-stages)
+## Stage 1: Isolated Analysis
 
 1. **Job Analysis**: Extracts requirements from just the job description
-2. **Candidate Analysis**: Catalogs qualifications from just the resume/context
-3. **Mapping Analysis**: Identifies legitimate matches between requirements and qualifications
+2. **Candidate Analysis**: Catalogs qualifications for each job requirement from just the resume/context
 
 ## Stage 2: Resume Generation
 
-Creates a tailored resume using only verified information from the mapping
+Creates a tailored resume using the skills collection and candidate information.
 
 ## Stage 3: Verification
 
@@ -90,7 +88,7 @@ Creates a tailored resume using only verified information from the mapping
 
 The system uses several techniques to prevent fabrication:
 
-* **Isolation of Analysis Stages**: By analyzing the job and candidate separately, the system prevents the LLM from prematurely creating connections that might lead to fabrication.
+* **Isolation of Analysis Stages**: By analyzing the job and candidate separately, and having the LLM only provide evidence of a single skill per pass, the system prevents the LLM from prematurely creating connections that might lead to fabrication.
 * **Evidence Requirements**: Each qualification included must have explicit evidence from the original materials.
 * **Conservative Transferability**: The system is instructed to be conservative when claiming skills are transferable.
 * **Verification Layer**: A dedicated verification step acts as a safety check to catch any remaining fabrications.
diff --git a/src/utils/agents/job_description.py b/src/utils/agents/job_description.py
index 5e6eb87..cd9f428 100644
--- a/src/utils/agents/job_description.py
+++ b/src/utils/agents/job_description.py
@@ -436,11 +436,11 @@ class JobDescription(Agent):
       # Group results by category and subcategory
       grouped_context = defaultdict(list)
       for result in rag_results:
-          key = f"{result['category']}/{result['subcategory']}".strip("/")
-          grouped_context[key].append({
-              "query": result["context"],
-              "content": result["content"][:100] + "..." if len(result["content"]) > 100 else result["content"]
-          })
+        key = f"{result['category']}/{result['subcategory']}".strip("/")
+        grouped_context[key].append({
+          "query": result["context"],
+          "content": result["content"][:100] + "..." if len(result["content"]) > 100 else result["content"]
+        })
       
       # Format as a structured string
       context_lines = ["Additional Context from Document Retrieval:"]
@@ -454,120 +454,70 @@ class JobDescription(Agent):
   
   # Stage 1B: Candidate Analysis Implementation
   def create_candidate_analysis_prompt(self, resume: str, rag_results: List[Dict[str, Any]]) -> tuple[str, str]:
-      """Create the prompt for candidate qualifications analysis."""
-  
-      # system_prompt = """
-      # You are an objective resume analyzer. Create a comprehensive inventory of all skills, experiences, and qualifications present in the candidate's materials.
-
-      # CORE PRINCIPLES:
-      # - Analyze ONLY the candidate's resume and provided context
-      # - Focus ONLY on the candidate's actual qualifications
-      # - Do not reference any job requirements
-      # - Include only explicitly mentioned information
-
-      # OUTPUT FORMAT:
-      # ```json
-      # {
-      #   "candidate_qualifications": {
-      #     "technical_skills": [
-      #       {
-      #         "skill": "skill name",
-      #         "evidence_location": "where in resume this appears",
-      #         "expertise_level": "stated level or 'unspecified'"
-      #       }
-      #     ],
-      #     "work_experience": [
-      #       {
-      #         "role": "job title",
-      #         "company": "company name",
-      #         "duration": "time period",
-      #         "responsibilities": ["resp1", "resp2"],
-      #         "technologies_used": ["tech1", "tech2"],
-      #         "achievements": ["achievement1", "achievement2"]
-      #       }
-      #     ],
-      #     "education": [
-      #       {
-      #         "degree": "degree name",
-      #         "institution": "institution name",
-      #         "completed": true/false,
-      #         "graduation_date": "date or 'ongoing'"
-      #       }
-      #     ],
-      #     "projects": [
-      #       {
-      #         "name": "project name",
-      #         "description": "brief description",
-      #         "technologies_used": ["tech1", "tech2"]
-      #       }
-      #     ],
-      #     "soft_skills": [
-      #       {
-      #         "skill": "skill name",
-      #         "context": "brief mention of where this appears"
-      #       }
-      #     ]
-      #   }
-      # }
-      # """
-      system_prompt = """\
-You are an objective resume analyzer. Create a comprehensive inventory of all skills, experiences, and qualifications present in the candidate's materials.
+    """Create the prompt for candidate qualifications analysis."""
+    system_prompt = """\
+You are an objective resume analyzer. Create a concise inventory of the candidate's key skills, experiences, and qualifications based on their resume.
 
 CORE PRINCIPLES:
 - Analyze ONLY the candidate's resume and provided context.
-- Focus ONLY on the candidate's actual qualifications explicitly mentioned in the resume.
-- Use the additional context to clarify or provide background for terms, skills, or experiences mentioned in the resume (e.g., to understand the scope of a skill like 'Python' or a role's responsibilities).
-- Do NOT treat the context as job requirements or infer qualifications not explicitly stated in the resume.
-- Include only explicitly mentioned information from the resume, supplemented by context where relevant.
+- Focus on the most significant and relevant qualifications explicitly mentioned.
+- Limit your analysis to the most important items in each category.
+- Prioritize brevity and completeness over exhaustiveness.
+- Complete the entire analysis in one response without getting stuck on any section.
 
 OUTPUT FORMAT:
-```json
 {
   "candidate_qualifications": {
     "technical_skills": [
+      // Include MAX 10 most important technical skills
       {
         "skill": "skill name",
-        "evidence_location": "where in resume this appears",
+        "evidence_location": "brief reference",
         "expertise_level": "stated level or 'unspecified'"
       }
     ],
     "work_experience": [
+      // Include MAX 5 most recent or relevant positions
       {
         "role": "job title",
         "company": "company name",
         "duration": "time period",
-        "responsibilities": ["resp1", "resp2"],
-        "technologies_used": ["tech1", "tech2"],
-        "achievements": ["achievement1", "achievement2"]
+        "responsibilities": ["resp1", "resp2"], // MAX 3 key responsibilities
+        "technologies_used": ["tech1", "tech2"], // MAX 5 technologies
+        "achievements": ["achievement1"] // MAX 2 achievements
       }
     ],
     "education": [
+      // Include ALL education entries (typically 1-3)
       {
         "degree": "degree name",
         "institution": "institution name",
-        "completed": true/false,
-        "graduation_date": "date or 'ongoing'"
+        "completed": true/false
       }
     ],
     "projects": [
+      // Include MAX 3 most significant projects
       {
         "name": "project name",
-        "description": "brief description",
-        "technologies_used": ["tech1", "tech2"]
+        "description": "one sentence description",
+        "technologies_used": ["tech1", "tech2"] // MAX 3 technologies
       }
     ],
     "soft_skills": [
+      // Include MAX 5 most prominent soft skills
       {
         "skill": "skill name",
-        "context": "brief mention of where this appears"
+        "context": "brief mention"
       }
     ]
   }
 }
+
+IMPORTANT: If at any point you find yourself repeating items or getting stuck, STOP that section and move to the next. It's better to provide a partial analysis than to get stuck in a loop.
 """
-      context = self.format_rag_context(rag_results)
-      prompt = f"Resume:\n{resume}\n\nAdditional Context:\n{context}"
-      return system_prompt, prompt
+    context = self.format_rag_context(rag_results)
+    prompt = f"Resume:\n{resume}\n\nAdditional Context:\n{context}"
+    return system_prompt, prompt
 
   async def call_llm(self, message: Message, system_prompt, prompt, temperature=0.7):
     logger.info(f"{self.agent_type} - {inspect.stack()[0].function}")
diff --git a/src/utils/rag.py b/src/utils/rag.py
index 5c48615..d9b335b 100644
--- a/src/utils/rag.py
+++ b/src/utils/rag.py
@@ -167,6 +167,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
             if os.path.isfile(file_path):
                 # Do not put the Resume in RAG as it is provideded with all queries.
                 if file_path == defines.resume_doc:
+                    logging.info(f"Not adding {file_path} to RAG -- primary resume")
                     continue
                 files_checked += 1
                 current_hash = self._get_file_hash(file_path)
@@ -217,6 +218,10 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         if file_path in self.processing_files:
             logging.info(f"{file_path} already in queue. Not adding.")
             return
+
+        if file_path == defines.resume_doc:
+            logging.info(f"Not adding {file_path} to RAG -- primary resume")
+            return
             
         try:
             logging.info(f"{file_path} not in queue. Adding.")