# Self-supervised synthetic data via sequential generation import os import re import json import torch from pathlib import Path from datetime import datetime from tqdm import tqdm from transformers import ( AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, ) def load_deepseek_r1(): """ Loads the DeepSeek-R1 model and tokenizer. Returns: tuple: (model, tokenizer) for DeepSeek-R1 """ # Load model and tokenizer model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # Load model and tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, ) model = AutoModelForCausalLM.from_pretrained( model_name, low_cpu_mem_usage=True, trust_remote_code=True, use_cache=False, quantization_config=bnb_config, device_map={"": torch.xpu.current_device()}, torch_dtype=torch.bfloat16, ) if not model.config.pad_token_id: model.config.pad_token_id = model.config.eos_token_id return model, tokenizer # Function to get answers from the model def validate_question_answer_from_model(context, question, answer, model, tokenizer): """ Query the model to evaluate if an answer is a good fit for a question. Args: context (str): The text file content question (str): The question answer (str): The answer model: The DeepSeek model tokenizer: The DeepSeek tokenizer Returns: str: The model's evaluation to the question """ # Set up the prompt for answering the question prompt = f""" You are a quality assurance expert reviewing question-answer pairs for an AI training dataset. Your task is to evaluate whether each pair meets our quality standards and is suitable for training. For each question-answer pair, evaluate: 1. ACCURACY: Does the answer contain ONLY information from the context, without fabrications? 2. COMPLETENESS: Does the answer fully address the question using all relevant information? 3. RELEVANCE: Is the question meaningful and relevant to the context? 4. NATURALNESS: Do both question and answer sound natural and conversational? 5. DIVERSITY: Does this pair add variety to our dataset (not redundant with others)? Context: {context} Question: {question} Answer: {answer} Provide your assessment as follows: - VERDICT: [ACCEPT/REJECT] - REASONING: [Brief explanation of your decision] - IMPROVEMENTS: [Suggestions if needed] """ # Generate answer raw_answer = generate_with_deepseek(prompt, model, tokenizer) # Parse the response to get the actual answer answer = parse_deepseek_response(raw_answer) return answer def generate_with_deepseek(prompt, model, tokenizer, max_length=4096): """ Generate text using DeepSeek-R1 model with proper handling of full output. Args: prompt (str): The input prompt model: The DeepSeek model tokenizer: The DeepSeek tokenizer max_length (int): Maximum length of generated text Returns: str: Generated text response """ inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): # Get the full generated sequence generation_output = model.generate( **inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=max_length, do_sample=True, temperature=0.7, top_p=0.9, return_dict_in_generate=True, output_scores=False ) # Get the full output token IDs output_token_ids = generation_output.sequences[0] # Decode the full generated text (including the prompt) full_output = tokenizer.decode(output_token_ids, skip_special_tokens=True) # Extract only the response part by removing the prompt part response = full_output[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):] return response def parse_deepseek_response(response): """ Parse the DeepSeek-R1 response. This handles DeepSeek's thinking steps and separates them from the response. Args: response (str): The raw response from the DeepSeek model Returns: str: The cleaned answer part of the response """ # If the response has thinking tags, extract the actual answer response = re.sub(r"^()?.*", "", response, flags=re.DOTALL) # If no special formatting detected, return the whole response return response.strip() # Function to recursively walk a directory and process text files def process_directory(directory_path, output_path="results", file_extensions=(".txt",".md"), batch_size=5): """ Recursively walks a directory, processes text files, and stores results. Args: directory_path (str): Path to directory containing text files output_path (str): Path to store results file_extensions (tuple): File extensions to process batch_size (int): Number of files to process before clearing cache """ # Load the DeepSeek-R1 model model, tokenizer = load_deepseek_r1() # Create output directory if it doesn't exist os.makedirs(output_path, exist_ok=True) # Use tqdm for progress tracking file_paths = [] for root, _, files in os.walk(directory_path): for file in files: if file.lower().endswith(file_extensions): file_paths.append(os.path.join(root, file)) # Process files with batch-based memory management for i, file_path in enumerate(tqdm(file_paths, desc="Processing files")): process_file(file_path, model, tokenizer, output_path) # Clear cache periodically to prevent memory issues if (i + 1) % batch_size == 0: if torch.xpu.is_available(): torch.xpu.empty_cache() else: torch.cuda.empty_cache() print(f"Processing complete. Results stored in {output_path}") # Function to process a single text file def process_file(file_path, model, tokenizer, output_path): """ Process a single text file by querying the model for questions and answers. Args: file_path (str): Path to the text file model: The DeepSeek model tokenizer: The DeepSeek tokenizer output_path (str): Path to store results """ # Read the file content try: with open(file_path, 'r', encoding='utf-8') as file: content = file.read() except Exception as e: print(f"Error reading {file_path}: {e}") return # Skip empty files if not content.strip(): print(f"Skipping empty file: {file_path}") return # Generate a relative output path that maintains the directory structure relative_path = os.path.relpath(file_path, start=os.path.dirname(output_path)) output_file = os.path.join(output_path, relative_path + ".json") os.makedirs(os.path.dirname(output_file), exist_ok=True) # Handle potential token length issues by truncating if necessary # DeepSeek has a context window limit, truncate if needed max_content_length = 100000 # Adjust based on model's context length limit if len(content) > max_content_length: content = content[:max_content_length] + "... [Content truncated due to length]" # Query model for questions questions = get_questions_from_model(content, model, tokenizer) print(f"{len(questions)} questions generated for {file_path}") # Get answers for each question results = { "file_path": file_path, "processed_at": datetime.now().isoformat(), "context": content[:1000] + "..." if len(content) > 1000 else content, # Truncated context for JSON storage "qa_pairs": [] } # Process each question for i, question in enumerate(questions): print(f"Generating answer for question {i+1}/{len(questions)}: {question}") answer = get_answer_from_model(content, question, model, tokenizer) print(f"Answer: {answer[:50] + '...' if len(answer) > 50 else answer}") print(f"Evaluating response...") validation = validate_question_answer_from_model(content, question, answer, model, tokenizer) print(f"Evaluation: {validation[:50] + '...' if len(validation) > 50 else validation}") results["qa_pairs"].append({ "question": question, "answer": answer, "validation": validation, }) # Save results after each generate pass so they can be evaluated with open(output_file, 'w', encoding='utf-8') as f: json.dump(results, f, ensure_ascii=False, indent=2) # Function to get questions from the model def get_questions_from_model(context, model, tokenizer): """ Query the model to generate questions about the provided context. Args: context (str): The text file content model: The DeepSeek model tokenizer: The DeepSeek tokenizer Returns: list: List of questions about the context """ # Set up the prompt for generating questions prompt = f""" You are an expert data scientist creating a training dataset. I'll provide context information about a person from their resume. Your task is to generate 10 diverse, realistic questions that someone might ask about this person. IMPORTANT: DO NOT return JSON or any structured format. Respond with a simple numbered list of questions only, with no formatting, no JSON, and no additional text. Generate questions that: - Vary in complexity (simple factual questions, complex reasoning questions) - Cover different aspects of the context (experience, skills, education, achievements) - Include both specific and general inquiries - Sound natural, as if asked by a real person - Avoid asking for information not present in the context Context: {context} Return ONLY a plain numbered list like: 1. First question? 2. Second question? ...and so on. Do not include any explanations, JSON, or other formatting. """ # Generate questions raw_response = generate_with_deepseek(prompt, model, tokenizer) # Parse the response to get the actual questions response = parse_deepseek_response(raw_response) lines = response.strip().split('\n') # Transform lines like "1. What is your name?" into just "What is your name?" (only include lines that end with a question mark) questions = [re.sub(r'^\d+\.\s*', '', line.strip()) for line in lines if line.strip() and line.strip().endswith('?')] if len(questions) == 0: print(response) exit(0) return questions # Function to get answers from the model def get_answer_from_model(context, question, model, tokenizer): """ Query the model to answer a question about the provided context. Args: context (str): The text file content question (str): The question to answer model: The DeepSeek model tokenizer: The DeepSeek tokenizer Returns: str: The model's answer to the question """ # Set up the prompt for answering the question prompt = f""" You are an AI assistant being fine-tuned to accurately represent a specific person based on their resume. Below is a question about this person and the context from their resume. Your task is to provide a comprehensive, accurate answer that: - Only uses information explicitly stated in the context - Doesn't fabricate or assume additional details - Maintains a professional, helpful tone - Clearly states if the question cannot be answered based on the given context - Structures the response in a natural, conversational way Context: {context} Question: {question} Answer: """ # Generate answer raw_answer = generate_with_deepseek(prompt, model, tokenizer) # Parse the response to get the actual answer answer = parse_deepseek_response(raw_answer) return answer # Example usage if __name__ == "__main__": process_directory( directory_path="../doc/", # Replace with your directory path output_path="../results", file_extensions=(".txt", ".md"), # Process both txt and md files batch_size=5 # Clear cache after every 5 files )