#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Some parts of this file are adapted from
# https://github.com/tloen/alpaca-lora/blob/main/finetune.py
#
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import json
from typing import List, Union
from datetime import datetime

import torch
import transformers
import accelerate
from datasets import load_dataset
from transformers import AutoTokenizer
from peft import (
    get_peft_model_state_dict,
    set_peft_model_state_dict,
)

from ipex_llm.transformers import AutoModelForCausalLM
# Import these from ipex_llm.transformers.qlora to get an IPEX-LLM compatible PEFT model.
from ipex_llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, \
    LoraConfig

local_rank = 0
world_size = 1


class Prompter(object):
    __slots__ = ("data",)

    def __init__(self, data_path: str = ""):
        self.data = None
        if not os.path.exists(data_path):
            raise Exception(f"Can't read {data_path}")
        with open(data_path) as fp:
            self.data = json.load(fp)

    def generate_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        label: Union[None, str] = None,
    ) -> str:
        # Returns the full prompt built from the instruction and the optional input.
        # If a label (= response / output) is provided, it is appended as well.
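        # Illustrative sketch of the Alpaca-style prompt this method builds
        # (the instruction/response values here are hypothetical):
        #     ### Instruction:
        #     What is X?
        #
        #     ### Response:
        #     X is ...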
        if input:
            ret = f"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
        else:
            ret = f"### Instruction:\n{instruction}\n\n### Response:\n"
        if label:
            ret = f"{ret}{label}"
        return ret

    def get_response(self, output: str) -> str:
        return output.split("### Response:")[1].strip()


# Data processing to build the train and validation data.

def tokenize_qa(prompt, tokenizer, max_length=1, add_eos_token=True):
    # There's probably a way to do this with the tokenizer settings,
    # but again, gotta move fast.
    # NOTE: callers below always pass max_length explicitly (cutoff_len);
    # the default of 1 would truncate the prompt to a single token.
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < max_length
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result


def generate_and_tokenize_prompt(entry, prompter, tokenizer, max_length,
                                 add_eos_token, train_on_inputs):
    full_prompt = prompter.generate_prompt(
        entry["question"],
        None,
        entry["answer"],
    )
    print({"full_prompt": full_prompt})
    tokenized_full_prompt = tokenize_qa(full_prompt, tokenizer, max_length=max_length)
    if not train_on_inputs:
        user_prompt = prompter.generate_prompt(
            entry["question"], None
        )
        tokenized_user_prompt = tokenize_qa(
            user_prompt, tokenizer, add_eos_token=add_eos_token, max_length=max_length
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1

        # Mask out the input portion of the labels so only the response contributes to the loss.
        tokenized_full_prompt["labels"] = [
            -100
        ] * user_prompt_len + tokenized_full_prompt["labels"][
            user_prompt_len:
        ]  # could be sped up, probably
    return tokenized_full_prompt


def train(
    # model/data params
    base_model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    data_path: str = "./results/doc/resume/generic.txt.qa.json",
    output_dir: str = "./bigdl-qlora-alpaca",
    # training hyperparams
    bf16: bool = True,  # default to bf16
    batch_size: int = 128,
    micro_batch_size: int = 2,  # default to 2, limited by GPU memory
    num_epochs: int = 3,
    learning_rate: float = 3e-5,  # default to 3e-5 to avoid divergence
    cutoff_len: int = 256,
    val_set_size: int = 2000,
    # lora hyperparams
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_target_modules: List[str] = [
        "q_proj", "v_proj", "k_proj", "o_proj", "up_proj", "down_proj", "gate_proj"
    ],
    # llm hyperparams
    train_on_inputs: bool = True,  # if False, masks out inputs in loss
    add_eos_token: bool = False,
    group_by_length: bool = False,  # faster, but produces an odd training loss curve
    resume_from_checkpoint: str = "",  # either a training checkpoint or the final adapter
    prompt_template_name: str = "alpaca",  # the prompt template to use; defaults to alpaca
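    # NOTE: the effective global batch size is batch_size; gradient_accumulation_steps
    # is derived below as batch_size // micro_batch_size (128 // 2 = 64 with these defaults).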
    gradient_checkpointing: bool = False,
    training_mode: str = "lora",
    save_checkpoint: bool = True,
):
    if training_mode != "lora":
        raise Exception(f"This example is for the lora training mode, but got training_mode={training_mode}.")
    print(
        f"Training Alpaca-LoRA model with params:\n"
        f"base_model: {base_model}\n"
        f"data_path: {data_path}\n"
        f"output_dir: {output_dir}\n"
        f"batch_size: {batch_size}\n"
        f"micro_batch_size: {micro_batch_size}\n"
        f"num_epochs: {num_epochs}\n"
        f"learning_rate: {learning_rate}\n"
        f"cutoff_len: {cutoff_len}\n"
        f"val_set_size: {val_set_size}\n"
        f"lora_r: {lora_r}\n"
        f"lora_alpha: {lora_alpha}\n"
        f"lora_dropout: {lora_dropout}\n"
        f"lora_target_modules: {lora_target_modules}\n"
        f"train_on_inputs: {train_on_inputs}\n"
        f"add_eos_token: {add_eos_token}\n"
        f"group_by_length: {group_by_length}\n"
        f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
        f"prompt template: {prompt_template_name}\n"
        f"training_mode: {training_mode}\n"
        f"save_checkpoint: {save_checkpoint}\n"
    )
    gradient_accumulation_steps = batch_size // micro_batch_size

    prompter = Prompter(data_path)  # use the same QA file passed as data_path

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        # load_in_4bit=True,
        load_in_low_bit="bf16",
        optimize_model=False,
        # optimize_model=True,
        torch_dtype=torch.bfloat16,
        modules_to_not_convert=["lm_head"],
        trust_remote_code=True,
    )
    # model = model.half()
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Prepare an IPEX-LLM compatible PEFT model.
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
        training_mode=training_mode,
    )
    print(f"Lora Config: {config}")
    model = get_peft_model(model, config)

    dataset = load_dataset("json", data_files=data_path)
    tokenized_dataset = dataset.map(
        generate_and_tokenize_prompt,
        fn_kwargs={
            "prompter": prompter,
            "tokenizer": tokenizer,
            "max_length": cutoff_len,
            "add_eos_token": add_eos_token,
            "train_on_inputs": train_on_inputs,
        },
        # generate_and_tokenize_prompt handles one example at a time, so batched mapping is disabled.
        batched=False,
    )

    model.print_trainable_parameters()  # be transparent about the % of trainable params
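    # Each mapped example now carries "input_ids", "attention_mask", and "labels".
    # With train_on_inputs=False the label positions covering the instruction are set to -100
    # (ignored by the loss); with the default train_on_inputs=True, labels mirror input_ids.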
    split = tokenized_dataset["train"].train_test_split(test_size=0.2)
    train_dataset = split["train"]
    eval_dataset = split["test"]

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            # warmup_ratio=0.03,
            # warmup_steps=100,
            max_grad_norm=0.3,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            lr_scheduler_type="cosine",
            bf16=True,  # keeps training more stable
            logging_steps=1,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps" if save_checkpoint else "no",
            eval_steps=100 if val_set_size > 0 else None,
            save_steps=100,
            output_dir=output_dir,
            save_total_limit=100,
            load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
            group_by_length=group_by_length,
            gradient_checkpointing=gradient_checkpointing,
            ddp_backend="ccl",
            save_safetensors=False,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False

    # Trainer.train() only accepts resume_from_checkpoint; the data is already bound to the Trainer above.
    trainer.train(resume_from_checkpoint=resume_from_checkpoint or None)

    model.save_pretrained(output_dir)

    print(
        "\n If there's a warning about missing keys above, please disregard :)"
    )


if __name__ == "__main__":
    train()
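

# ---------------------------------------------------------------------------
# Illustrative sketch (not called by the training flow above): one way to load
# the saved LoRA adapter for inference, assuming the standard peft.PeftModel
# API works with the bf16 low-bit model produced by ipex_llm. The function
# name, default question, and generation settings are hypothetical.
# ---------------------------------------------------------------------------
def example_generate_with_adapter(
    base_model: str = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    adapter_dir: str = "./bigdl-qlora-alpaca",
    question: str = "What is this resume about?",
) -> str:
    from peft import PeftModel

    # Reload the base model the same way it was loaded for training.
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_low_bit="bf16",
        optimize_model=False,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    # Attach the LoRA adapter saved by model.save_pretrained(output_dir).
    model = PeftModel.from_pretrained(model, adapter_dir)
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # Build a prompt in the same format as Prompter.generate_prompt without an input field.
    prompt = f"### Instruction:\n{question}\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=128)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output.split("### Response:")[1].strip()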