| |
|
| | def main(args): |
| | MODEL_NAME = args.model |
| |
|
| | EVAL_FILE = args.file |
| | print(f"Using evaluation file: {EVAL_FILE}") |
| |
|
| | |
| | import shutil |
| | import os |
| | import time |
| |
|
| | run_start_time = time.time() |
| |
|
| | os.makedirs("tmp", exist_ok=True) |
| | os.makedirs("evals_res", exist_ok=True) |
| |
|
| | |
| | EVAL_FILE_BASENAME = os.path.basename(EVAL_FILE) |
| | MODEL_NAME_STR = "+".join(args.model.split("/")) |
| | SAVED_EVAL_FILE = f"{str(run_start_time)}_{MODEL_NAME_STR}_{EVAL_FILE_BASENAME}_seq{args.num_seqs}_tok{args.tokens}_q{args.quant_policy}_tpp{args.top_p}_mnp{args.min_p}_tpk{args.top_k}" |
| |
|
| | import os |
| |
|
| | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" |
| | os.environ["TOKENIZERS_PARALLELISM"] = "false" |
| | os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas" |
| | import re |
| | import random |
| | import warnings |
| | from collections import Counter |
| | import numpy as np, pandas as pd, polars as pl |
| |
|
| | import torch |
| | import lmdeploy |
| | from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig |
| | from transformers import AutoTokenizer |
| |
|
| | warnings.simplefilter("ignore") |
| | print("PyTorch version:", torch.__version__) |
| | print("LMDeploy:", lmdeploy.__version__) |
| |
|
| | def seed_everything(seed): |
| | os.environ["PYTHONHASHSEED"] = str(seed) |
| | random.seed(seed) |
| | np.random.seed(seed) |
| | torch.manual_seed(seed) |
| | torch.cuda.manual_seed(seed) |
| | torch.backends.cudnn.benchmark = True |
| | torch.backends.cudnn.deterministic = True |
| |
|
| | seed_everything(seed=0) |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | llm_model_pth = MODEL_NAME |
| |
|
| | MAX_NUM_SEQS = args.num_seqs |
| | MAX_MODEL_LEN = 1024 * 12 |
| | EVAL = True |
| | EVAL_SELECTED_QUESTIONS_ONLY = False |
| |
|
| | engine_config = TurbomindEngineConfig( |
| | |
| | quant_policy=args.quant_policy, |
| | cache_max_entry_count=0.95, |
| | session_len=MAX_MODEL_LEN, |
| | enable_prefix_caching=True, |
| | max_batch_size=MAX_NUM_SEQS, |
| | ) |
| |
|
| | pipe = pipeline(llm_model_pth, backend_config=engine_config) |
| |
|
| | tokenizer = AutoTokenizer.from_pretrained(llm_model_pth, trust_remote_code=False) |
| |
|
| | import re |
| |
|
| | def extract_boxed_text(text): |
| | pattern = r"oxed{(.*?)}" |
| | matches = re.findall(pattern, text) |
| | if not matches: |
| | return "" |
| | for match in matches[::-1]: |
| | if match != "": |
| | return match |
| | return "" |
| |
|
| | def batch_message_filter(list_of_messages) -> tuple[list[list[dict]], list[str]]: |
| | extracted_answers = [] |
| | list_of_messages_to_keep = [] |
| | for messages in list_of_messages: |
| | answer = extract_boxed_text(messages[-1]["content"]) |
| | if answer: |
| | extracted_answers.append(answer) |
| | else: |
| | list_of_messages_to_keep.append(messages) |
| | return list_of_messages_to_keep, extracted_answers |
| |
|
| | def select_answer(answers): |
| | counter = Counter() |
| | for answer in answers: |
| | try: |
| | if int(answer) == float(answer): |
| | counter[int(answer)] += 1 + random.random() / 1_000 |
| | except: |
| | pass |
| | if not counter: |
| | return 210 |
| | _, answer = sorted([(v, k) for k, v in counter.items()], reverse=True)[0] |
| | return answer % 1000 |
| |
|
| | def batch_message_generate(list_of_messages) -> list[list[dict]]: |
| | max_tokens = args.tokens |
| | |
| | |
| | |
| |
|
| | list_of_texts = [ |
| | tokenizer.apply_chat_template( |
| | conversation=messages, tokenize=False, add_generation_prompt=True |
| | ) |
| | for messages in list_of_messages |
| | ] |
| |
|
| | gen_configs = [ |
| | GenerationConfig( |
| | do_sample=True, |
| | temperature=1.0, |
| | top_k=args.top_k, |
| | top_p=args.top_p, |
| | min_p=args.min_p, |
| | skip_special_tokens=True, |
| | max_new_tokens=max_tokens, |
| | stop_words=["</think>"], |
| | ) |
| | for prompt in list_of_texts |
| | ] |
| |
|
| | request_output = pipe( |
| | list_of_texts, |
| | gen_config=gen_configs, |
| | ) |
| | print( |
| | [ |
| | single_request_output.generate_token_len |
| | for single_request_output in request_output |
| | ] |
| | ) |
| |
|
| | sort_keys_and_list_of_messages = [] |
| | for messages, single_request_output in zip(list_of_messages, request_output): |
| | |
| | |
| | |
| | messages.append( |
| | {"role": "assistant", "content": single_request_output.text} |
| | ) |
| |
|
| | sort_keys_and_list_of_messages.append( |
| | (single_request_output.generate_token_len, messages) |
| | ) |
| | print([sort_key for sort_key, _ in sort_keys_and_list_of_messages]) |
| | sort_keys_and_list_of_messages.sort( |
| | key=lambda sort_key_and_messages: sort_key_and_messages[0] |
| | ) |
| | print([sort_key for sort_key, _ in sort_keys_and_list_of_messages]) |
| |
|
| | list_of_messages = [messages for _, messages in sort_keys_and_list_of_messages] |
| | return list_of_messages |
| |
|
| | def create_starter_messages(question: str, index: int) -> str: |
| | options = [] |
| | for _ in range(1): |
| | options.append( |
| | [ |
| | { |
| | "role": "system", |
| | "content": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step. Return final answer within \\boxed{}, after taking modulo 1000.", |
| | }, |
| | {"role": "user", "content": question}, |
| | ] |
| | ) |
| |
|
| | return options[index % len(options)] |
| |
|
| | def predict_for_question(question: str, question_id=time.time()) -> int: |
| | import os |
| | import time |
| |
|
| | start_time = time.time() |
| |
|
| | if EVAL_SELECTED_QUESTIONS_ONLY and not os.getenv( |
| | "KAGGLE_IS_COMPETITION_RERUN" |
| | ): |
| | |
| | |
| | if ( |
| | "Triangle" not in question |
| | and "delightful" not in question |
| | and "George" not in question |
| | ): |
| | return 210 |
| |
|
| | """ if time.time() > cutoff_time: |
| | return 210 """ |
| |
|
| | print(question) |
| |
|
| | num_seqs = MAX_NUM_SEQS |
| |
|
| | list_of_messages = [ |
| | create_starter_messages(question, index) for index in range(num_seqs) |
| | ] |
| |
|
| | all_extracted_answers = [] |
| | for _ in range(1): |
| | list_of_messages = batch_message_generate(list_of_messages) |
| |
|
| | if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"): |
| | df = pd.DataFrame( |
| | { |
| | "question": [question] * len(list_of_messages), |
| | "message": [ |
| | messages[-1]["content"] for messages in list_of_messages |
| | ], |
| | } |
| | ) |
| | df.to_csv(f"tmp/{str(question_id)}_{SAVED_EVAL_FILE}.csv", index=False) |
| |
|
| | list_of_messages, extracted_answers = batch_message_filter(list_of_messages) |
| | all_extracted_answers.extend(extracted_answers) |
| |
|
| | print(all_extracted_answers) |
| | answer = select_answer(all_extracted_answers) |
| | print(answer) |
| |
|
| | print("\n\n") |
| | |
| | print(f"Time taken: {time.time() - start_time}") |
| | return answer |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | import uuid |
| |
|
| | TEMP_CSV = f"tmp/evals_{SAVED_EVAL_FILE}.csv" |
| |
|
| | def predict( |
| | id_: pl.DataFrame, question: pl.DataFrame |
| | ) -> pl.DataFrame | pd.DataFrame: |
| | id_ = id_["id"][0] |
| | print("------") |
| | print(id_) |
| |
|
| | question = question["problem"][0] |
| | answer = predict_for_question(question, question_id=id_) |
| | print("------\n\n\n") |
| |
|
| | if EVAL and not os.getenv("KAGGLE_IS_COMPETITION_RERUN"): |
| | |
| | row = {"id": id_, "question": question, "answer": answer} |
| |
|
| | |
| | temp_df = pd.DataFrame([row]) |
| |
|
| | |
| | |
| | if not os.path.exists(TEMP_CSV): |
| | temp_df.to_csv(TEMP_CSV, index=False) |
| | else: |
| | temp_df.to_csv(TEMP_CSV, mode="a", header=False, index=False) |
| |
|
| | return pl.DataFrame({"id": id_, "answer": answer}) |
| |
|
| | """ predict_for_question( |
| | "Fred and George take part in a tennis tournament with $4046$ other players. In each round, the players are paired into $2024$ matches. How many ways are there to arrange the first round such that Fred and George do not have to play each other? (Two arrangements for the first round are \\textit{different} if there is a player with a different opponent in the two arrangements.)" |
| | ) |
| | predict_for_question( |
| | "Triangle $ABC$ has side length $AB = 120$ and circumradius $R = 100$. Let $D$ be the foot of the perpendicular from $C$ to the line $AB$. What is the greatest possible length of segment $CD$?" |
| | ) |
| | |
| | return """ |
| |
|
| | def sample_and_predict(csv_file: str) -> None: |
| | """ |
| | Reads all rows from the given CSV file, and for each row, |
| | calls the predict() function to process the problem. |
| | """ |
| | |
| | df = pd.read_csv(csv_file) |
| |
|
| | |
| | df = df.sample(frac=1, random_state=2024).reset_index(drop=True) |
| |
|
| | |
| | for index, row in df.iterrows(): |
| | id_value = row["id"] |
| | problem_value = row["problem"] |
| |
|
| | print(f"Processing row {index}: id = {id_value}, problem = {problem_value}") |
| |
|
| | |
| | id_df = pl.DataFrame({"id": [id_value]}) |
| | problem_df = pl.DataFrame({"problem": [problem_value]}) |
| |
|
| | |
| | result = predict(id_df, problem_df) |
| | print("Prediction result:") |
| | print(result) |
| | print("\n") |
| | |
| | |
| |
|
| | sample_and_predict(EVAL_FILE) |
| |
|
| | |
| | if ( |
| | EVAL |
| | and not EVAL_SELECTED_QUESTIONS_ONLY |
| | and not os.getenv("KAGGLE_IS_COMPETITION_RERUN") |
| | ): |
| | import pandas as pd |
| |
|
| | |
| | reference_input_path = EVAL_FILE |
| | predictions_path = TEMP_CSV |
| |
|
| | |
| | reference_df = pd.read_csv(reference_input_path) |
| | predictions_df = pd.read_csv(predictions_path) |
| |
|
| | |
| | reference_df["id"] = reference_df["id"].astype(str).str.strip() |
| | predictions_df["id"] = predictions_df["id"].astype(str).str.strip() |
| |
|
| | |
| | reference_df["answer"] = ( |
| | reference_df["answer"].astype(str).str.strip().str.lower() |
| | ) |
| | predictions_df["answer"] = ( |
| | predictions_df["answer"].astype(str).str.strip().str.lower() |
| | ) |
| |
|
| | |
| | merged_df = pd.merge( |
| | reference_df, |
| | predictions_df, |
| | on="id", |
| | how="inner", |
| | suffixes=("_ref", "_pred"), |
| | ) |
| |
|
| | |
| | merged_df["is_correct"] = merged_df["answer_ref"] == merged_df["answer_pred"] |
| |
|
| | |
| | total = len(merged_df) |
| | correct = merged_df["is_correct"].sum() |
| | accuracy = correct / total |
| |
|
| | std_outputs = "" |
| | std_outputs = std_outputs + f"Total predictions compared: {total}" + "\n" |
| | std_outputs = std_outputs + f"Number of correct predictions: {correct}" + "\n" |
| | std_outputs = std_outputs + f"Accuracy: {accuracy:.2%}" + "\n" |
| |
|
| | |
| | incorrect_df = merged_df[~merged_df["is_correct"]] |
| | if not incorrect_df.empty: |
| | std_outputs = std_outputs + "\nIncorrect predictions:" + "\n" |
| | |
| | std_outputs = ( |
| | std_outputs |
| | + str(incorrect_df[["id", "problem", "answer_ref", "answer_pred"]]) |
| | + "\n" |
| | ) |
| | else: |
| | std_outputs = std_outputs + "\nAll predictions match the reference!" + "\n" |
| |
|
| | time_taken = time.time() - run_start_time |
| | std_outputs = std_outputs + f"Time taken: {time_taken:.2f} seconds" + "\n" |
| | print(std_outputs) |
| |
|
| | |
| |
|
| | |
| | with open(f"evals_res/outputs_{SAVED_EVAL_FILE}.log", "w") as f: |
| | f.write(std_outputs) |
| |
|
| | |
| | |
| | merged_df.to_csv(f"evals_res/evals_{SAVED_EVAL_FILE}.csv", index=False) |
| |
|
| |
|
| | if __name__ == "__main__": |
| | import argparse |
| | import time |
| |
|
| | start = time.time() |
| |
|
| | parser = argparse.ArgumentParser() |
| | parser.add_argument( |
| | "--model", |
| | type=str, |
| | default="casperhansen/deepseek-r1-distill-qwen-7b-awq", |
| | help="Model to use", |
| | ) |
| | parser.add_argument( |
| | "--file", |
| | type=str, |
| | default="hard_batch_1", |
| | help="Eval File to use", |
| | ) |
| | parser.add_argument( |
| | "--num_seqs", |
| | type=int, |
| | default=48, |
| | help="Number of sequences to generate per prompt", |
| | ) |
| | parser.add_argument( |
| | "--tokens", |
| | type=int, |
| | default=1024 * 12, |
| | help="Number of sequences to generate per prompt", |
| | ) |
| |
|
| | parser.add_argument( |
| | "--quant_policy", |
| | type=int, |
| | default=8, |
| | choices=[8, 4, 0], |
| | help="Number of sequences to generate per prompt", |
| | ) |
| |
|
| | parser.add_argument( |
| | "--top_k", |
| | type=int, |
| | default=50, |
| | help="Number of sequences to generate per prompt", |
| | ) |
| |
|
| | parser.add_argument( |
| | "--top_p", |
| | type=float, |
| | default=0.90, |
| | help="Number of sequences to generate per prompt", |
| | ) |
| |
|
| | parser.add_argument( |
| | "--min_p", |
| | type=float, |
| | default=0.05, |
| | help="Number of sequences to generate per prompt", |
| | ) |
| |
|
| | args = parser.parse_args() |
| | main(args) |
| |
|
| | print(f"Time Taken: {time.time() - start}") |
| |
|
| |
|