import argparse

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


class CSVDataset(Dataset):
    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample):
        self.data = pd.read_csv(filepath)
        self.text_data = self.data['Text'].tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample  # Number of rows to pack per sample

        # Maximum number of characters per row before tokenization
        # (use 15000 for the Phi-3 model).
        self.CAP_SAMPLE_LEN = 17500

        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))
        lines = self.text_data[start_idx:end_idx]

        # Truncate each line at CAP_SAMPLE_LEN characters, preferably at a space boundary.
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                cut = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if cut < 0:
                    cut = self.CAP_SAMPLE_LEN
                text = text[:cut]
            truncated_lines.append(text)

        # Tokenize all lines at once; each line is tokenized independently.
        # add_special_tokens=False avoids introducing BOS/EOS tokens automatically.
        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # Reserve space for EOS tokens
            return_tensors=None
        )

        # batch_encodings["input_ids"] is a list of lists; each sub-list holds the
        # token ids for one line. Append an EOS token after each line and pack
        # everything into a single flat sequence.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            tokens.append(self.eos_token_id)
            input_ids_list.extend(tokens)

        # Ensure the final token is EOS.
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)

        # Adjust the packed sequence to exactly seq_length tokens.
        if len(input_ids_list) > self.seq_length:
            # Truncate from the end, then re-assert a trailing EOS.
            input_ids_list = input_ids_list[:self.seq_length]
            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        elif len(input_ids_list) < self.seq_length:
            # Pad up to seq_length and keep EOS as the final token.
            padding_length = self.seq_length - len(input_ids_list)
            input_ids_list.extend([self.pad_token_id] * padding_length)
            input_ids_list[-1] = self.eos_token_id

        input_ids = torch.tensor(input_ids_list, dtype=torch.long)
        return input_ids


def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the loss for each batch.
""" model.eval() total_loss = 0 with torch.no_grad(): for batch_idx, input_ids in enumerate(tqdm(dataloader, desc="Evaluating Model")): input_ids = input_ids.to(device) # Evaluate the model outputs = model(input_ids, labels=input_ids) loss = outputs.loss.item() total_loss += loss # Print loss for the current batch print(f"Batch {batch_idx + 1} Loss: {loss:.4f}") avg_loss = total_loss / len(dataloader) return avg_loss def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device): """ Evaluate a single model on the dataset and print losses for each batch. """ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4) # model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device) # Load model in 4-bit precision # bnb_config = BitsAndBytesConfig(load_in_4bit=True) # Load the quantized model model = AutoModelForCausalLM.from_pretrained( model_path, # quantization_config=bnb_config, # Use quantization torch_dtype=torch.float16, # 4-bit models compute in FP32 # device_map="auto" ).to(device) # Convert model to bfloat16 # model.to(torch.bfloat16) # # Remove quantization metadata from config # if hasattr(model.config, "quantization_config"): # delattr(model.config, "quantization_config") # print("Removed quantization_config from model configuration.") # Check model's dtype print(model.dtype) # Should print torch.bfloat16 # Save the model in bfloat16 precision # model.save_pretrained("model_bfloat16") print("Evaluating Model...") avg_loss = evaluate_model(model, dataloader, device) print(f"Average Loss: {avg_loss:.4f}") return avg_loss if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True, help="Path to the model.") parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the tokenizer.") parser.add_argument("--csv_path", type=str, required=True, help="Path to the CSV file with 'Text' column.") parser.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.") parser.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use.") args = parser.parse_args() evaluate_single_model( args.model_path, args.tokenizer_path, args.csv_path, args.seq_length, args.batch_size, args.device )