import glob
import json
import os
import random

import numpy as np
from flask import Flask, abort, render_template, request
| |
|
| | app = Flask(__name__) |
| |
|
| |
|
# Load the problem definitions. Entries are read elsewhere in this module via
# their "question_title" and "difficulty" keys.
# The encoding is pinned so decoding does not depend on the platform locale.
with open("problems.json", encoding="utf-8") as f:
    problems = json.load(f)

problem_choices = [q["question_title"] for q in problems]

# Display order: a random permutation of the problem indices, drawn once when
# the module is imported. Position p in the per-view lists built below refers
# to problems[random_idxs[p]].
random_idxs = list(range(len(problems)))
random.shuffle(random_idxs)
| |
|
# Load the per-model generations. Top-level keys are used as the model names;
# each value is indexed by problem index and its entries are read via the
# "code_list" and "pass1_list" keys further down in this module.
# The encoding is pinned so decoding does not depend on the platform locale.
with open("all_outputs.json", encoding="utf-8") as f:
    all_outputs = json.load(f)

all_models = list(all_outputs)
| |
|
| |
|
# Number of loaded problems.
num_questions_filtered = len(problems)

# Maps problem index -> {model name -> mean of that model's "pass1_list" for
# the problem}. Iterating all_outputs yields the same names, in the same
# order, as all_models (which is built from its keys).
all_correctness_by_problem = {
    problem_index: {
        model_name: np.mean(outputs[problem_index]["pass1_list"])
        for model_name, outputs in all_outputs.items()
    }
    for problem_index in random_idxs
}
| |
|
| |
|
def calculate_color(performance):
    """Return the CSS ``rgba(...)`` color string for a score.

    Args:
        performance: Numeric score; the thresholds below split it at 0.25,
            0.5 and 0.75.

    Returns:
        * ``performance > 0.75``: green with a fixed 0.5 alpha.
        * ``0.5 < performance <= 0.75``: yellow-green, alpha = ``performance``.
        * ``0.25 < performance <= 0.5``: orange-red, alpha = ``1 - performance``.
        * otherwise (including NaN, which fails every comparison): red with
          a fixed 0.5 alpha.
    """
    if performance > 0.75:
        return "rgba(0, 150, 0, 0.5)"
    if performance > 0.5:
        # Alpha rises with the score inside this band.
        return f"rgba(50, 150, 0, {performance})"
    if performance > 0.25:
        # Alpha rises as the score drops inside this band.
        return f"rgba(150, 50, 0, {1 - performance})"
    return "rgba(150, 0, 0, 0.5)"
| |
|
| |
|
def _evaluation_cell(score):
    """Build the template payload for one score: percentage text and color."""
    return {
        "correctness": f"{score*100:.1f}",
        "correctness_color": calculate_color(score),
    }


# One tuple per display position:
# (position, {model name -> evaluation cell}, difficulty of the problem).
all_evaluations_by_problem_colored = [
    (
        position,
        {
            model_name: _evaluation_cell(
                all_correctness_by_problem[problem_index][model_name]
            )
            for model_name in all_models
        },
        problems[problem_index]["difficulty"],
    )
    for position, problem_index in enumerate(random_idxs)
]
| |
|
# Maps model name -> list ordered by display position; each element is the
# list of {"code", "pass1"} records pairing a problem's "code_list" entries
# with its "pass1_list" entries.
all_data_for_view_formatted = {
    model_name: [
        [
            {"code": code, "pass1": passed}
            for code, passed in zip(
                outputs[problem_index]["code_list"],
                outputs[problem_index]["pass1_list"],
            )
        ]
        for problem_index in random_idxs
    ]
    for model_name, outputs in all_outputs.items()
}
| |
|
| |
|
@app.route("/")
def home():
    """Render ``index.html`` with every model name and the colored evaluations."""
    # Debug-level log rather than an unconditional print on every request.
    app.logger.debug("Rendering index for models: %s", all_models)
    return render_template(
        "index.html", models=all_models, problems=all_evaluations_by_problem_colored
    )
| |
|
| |
|
@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render ``problem.html`` for one problem across all models.

    Args:
        problem_idx: Position in the shuffled display order (an index into
            ``random_idxs`` and the lists derived from it), not an index
            into ``problems``.

    Returns:
        The rendered page. Aborts with HTTP 404 when ``problem_idx`` is past
        the last problem.
    """
    if not 0 <= problem_idx < len(all_evaluations_by_problem_colored):
        abort(404)

    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in all_models
    }
    evaluation = all_evaluations_by_problem_colored[problem_idx][1]
    # data and evaluation are keyed by display position, so translate the
    # position back to the underlying problem before fetching the question.
    question = problems[random_idxs[problem_idx]]

    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=all_models,
        question=question,
        data=data,
    )
| |
|
| |
|
# Model names shown by the "mini" views (the /mini and /problem_mini routes).
# problem_mini() looks each name up in all_data_for_view_formatted, whose keys
# come from all_outputs.json, so a name missing from that file raises KeyError.
mini_models = [
    "DeepSeek-V2",
    "DeepSeekCoder-V2",
    "DSCoder-33b-Ins",
    "LLama3-70b-Ins",
    "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-2",
    "Claude-Instant-1",
    "Claude-3-Opus",
    "Claude-3-Sonnet",
    "Claude-3-Haiku",
]
| |
|
| |
|
@app.route("/mini")
def mini():
    """Render ``index_mini.html`` for the ``mini_models`` subset of models."""
    template_context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **template_context)
| |
|
| |
|
@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render ``problem_mini.html`` for one problem across ``mini_models``.

    Args:
        problem_idx: Position in the shuffled display order (an index into
            ``random_idxs`` and the lists derived from it), not an index
            into ``problems``.

    Returns:
        The rendered page. Aborts with HTTP 404 when ``problem_idx`` is past
        the last problem.
    """
    if not 0 <= problem_idx < len(all_evaluations_by_problem_colored):
        abort(404)

    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in mini_models
    }
    # The evaluation mapping still covers every model; the template receives
    # mini_models separately to choose which ones to show.
    evaluation = all_evaluations_by_problem_colored[problem_idx][1]
    # data and evaluation are keyed by display position, so translate the
    # position back to the underlying problem before fetching the question.
    question = problems[random_idxs[problem_idx]]

    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=mini_models,
        question=question,
        data=data,
    )
| |
|
| |
|
if __name__ == "__main__":
    # Start Flask's built-in server only when this file is run as a script.
    # Host 0.0.0.0 makes it listen on all network interfaces.
    server_options = {"host": "0.0.0.0", "port": 7860}
    app.run(**server_options)
| |
|