Vilyam888 committed on
Commit
aa988a7
·
verified ·
1 Parent(s): 15ff158

Upload folder using huggingface_hub

Browse files
metrics/mrokenc_code_generator/01_training_perplexity.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import math
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ _METRICS_DIR = Path(__file__).resolve().parent
10
+ if str(_METRICS_DIR) not in sys.path:
11
+ sys.path.insert(0, str(_METRICS_DIR))
12
+
13
+ from broken_code_generation import FILE_TRAINING, MODEL_ID, TRAINER_STATE # noqa: E402
14
+ from report_io import metrics_path, write_report # noqa: E402
15
+
16
+
17
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the training-metrics script.

    Returns:
        Namespace with ``trainer_state`` (path of the trainer-state JSON to
        read, ``TRAINER_STATE`` by default) and ``output`` (report path, or
        ``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser(description=f"Training metrics for {MODEL_ID} only.")
    # Both options are paths and differ only in their default value.
    for flag, fallback in (("--trainer_state", TRAINER_STATE), ("--output", None)):
        cli.add_argument(flag, type=Path, default=fallback)
    return cli.parse_args()
24
+
25
+
26
def extract_metrics(state: dict) -> dict:
    """Summarise the loss history stored in a trainer-state dictionary.

    Args:
        state: Parsed trainer-state JSON. Only ``log_history``,
            ``num_train_epochs`` and ``global_step`` are read, and all of
            them are optional.

    Returns:
        A dict holding the last training loss, the last evaluation loss and
        token accuracy, the validation perplexity (``exp(eval_loss)`` rounded
        to 4 places, ``None`` when the history has no evaluation entry) and
        one ``eval_by_epoch`` row per evaluation entry.
    """
    train_loss = eval_loss = eval_acc = None
    eval_by_epoch = []

    for entry in state.get("log_history", []):
        if "eval_loss" in entry:
            # Later evaluation entries overwrite earlier ones, so after the
            # loop these hold the values of the final evaluation.
            eval_loss = entry["eval_loss"]
            eval_acc = entry.get("eval_mean_token_accuracy")
            eval_by_epoch.append(
                {
                    "epoch": entry.get("epoch"),
                    "eval_loss": eval_loss,
                    "eval_mean_token_accuracy": eval_acc,
                    "perplexity": round(math.exp(eval_loss), 4),
                }
            )
        elif "loss" in entry:
            # Entries without "eval_loss" are training-step logs.
            train_loss = entry["loss"]

    # Compare against None explicitly: a loss of exactly 0.0 is falsy but is
    # still a valid value (perplexity 1.0).
    perplexity = round(math.exp(eval_loss), 4) if eval_loss is not None else None

    return {
        "train_loss_final": train_loss,
        "eval_loss_final": eval_loss,
        "eval_mean_token_accuracy": eval_acc,
        "perplexity_validation": perplexity,
        "num_train_epochs": state.get("num_train_epochs"),
        "global_step": state.get("global_step"),
        "eval_by_epoch": eval_by_epoch,
    }
58
+
59
+
60
def main() -> None:
    """Build the training/perplexity report and write it to disk.

    Reads the trainer-state JSON selected on the command line, derives the
    metrics with ``extract_metrics``, saves the report through
    ``write_report`` and echoes the metrics block to stdout as JSON.
    """
    args = parse_args()
    output = args.output or metrics_path(FILE_TRAINING)
    state = json.loads(args.trainer_state.read_text(encoding="utf-8"))
    report = {
        "metric_group": "training_perplexity",
        "model": MODEL_ID,
        # Derive the adapter directory from the file that was actually read,
        # so "adapter_dir" and "source" stay consistent when --trainer_state
        # overrides the default. Assumes the same
        # <adapter>/<checkpoint>/trainer_state.json layout as the default.
        "adapter_dir": str(args.trainer_state.parent.parent),
        "source": str(args.trainer_state),
        "metrics": extract_metrics(state),
    }
    write_report(output, report)
    print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
metrics/mrokenc_code_generator/02_json_validity.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import torch
9
+
10
+ _METRICS_DIR = Path(__file__).resolve().parent
11
+ _SCRIPTS_DIR = _METRICS_DIR.parent
12
+ for path in (_METRICS_DIR, _SCRIPTS_DIR):
13
+ if str(path) not in sys.path:
14
+ sys.path.insert(0, str(path))
15
+
16
+ from broken_code_generation import ( # noqa: E402
17
+ ADAPTER_DIR,
18
+ DEFAULT_EVAL_LIMIT,
19
+ EVAL_FILE,
20
+ FILE_JSON_VALIDITY,
21
+ GEN_MAX_NEW_TOKENS,
22
+ GEN_SEED,
23
+ GEN_TEMPERATURE,
24
+ GEN_TOP_P,
25
+ MODEL_ID,
26
+ )
27
+ from evaluate_model import REQUIRED_FIELDS, generate_one, load_model_and_tokenizer # noqa: E402
28
+ from report_io import metrics_path, write_report # noqa: E402
29
+
30
+
31
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the JSON-validity script.

    Returns:
        Namespace with ``limit`` (number of evaluation records to use,
        ``DEFAULT_EVAL_LIMIT`` by default) and ``output`` (report path, or
        ``None`` when the flag is omitted).
    """
    summary = f"JSON validity for {MODEL_ID} only (adapter at {ADAPTER_DIR})."
    cli = argparse.ArgumentParser(description=summary)
    cli.add_argument("--limit", default=DEFAULT_EVAL_LIMIT, type=int)
    cli.add_argument("--output", default=None, type=Path)
    namespace = cli.parse_args()
    return namespace
38
+
39
+
40
def main() -> None:
    """Generate one task per evaluation record and score its structure.

    For every record a generation is requested through ``generate_one``.
    A generation that returns without raising counts as valid JSON and is
    then checked for the required fields, a matching ``difficulty`` and a
    matching set of ``topic_tags``. Any exception raised while processing a
    record is stored on that record's result row. The counts, rates and
    per-record rows are saved with ``write_report``.

    Raises:
        FileNotFoundError: If ``ADAPTER_DIR`` does not exist.
    """
    args = parse_args()
    torch.manual_seed(GEN_SEED)

    if not ADAPTER_DIR.exists():
        raise FileNotFoundError(f"Adapter not found: {ADAPTER_DIR}")

    all_records = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    records = all_records[: args.limit]
    total = len(records)
    print(f"Model: {MODEL_ID}")
    print(f"Adapter: {ADAPTER_DIR}")
    print(f"Samples: {total} from {EVAL_FILE}")

    model, tokenizer = load_model_and_tokenizer(ADAPTER_DIR)
    model.eval()

    # Keyed exactly like the "metrics_counts" section of the report.
    counts = {
        "valid_json": 0,
        "required_fields_complete": 0,
        "difficulty_match": 0,
        "topic_tag_keys_match": 0,
    }
    results = []

    for index, record in enumerate(records, start=1):
        row = {"index": index, "status": "error"}
        try:
            generated = generate_one(
                model=model,
                tokenizer=tokenizer,
                topic_tags=record["topic_tags"],
                difficulty=record["difficulty"],
                max_new_tokens=GEN_MAX_NEW_TOKENS,
                temperature=GEN_TEMPERATURE,
                top_p=GEN_TOP_P,
            )
            counts["valid_json"] += 1
            row.update(status="ok", generated=generated)
            if REQUIRED_FIELDS.issubset(generated):
                counts["required_fields_complete"] += 1
            if generated.get("difficulty") == record["difficulty"]:
                counts["difficulty_match"] += 1
            produced_tags = set(generated.get("topic_tags", {}))
            if produced_tags == set(record["topic_tags"]):
                counts["topic_tag_keys_match"] += 1
        except Exception as error:  # noqa: BLE001
            # Best effort: one failing sample must not abort the whole run.
            row["error"] = str(error)
        results.append(row)
        print(f"[{MODEL_ID}] {index}/{total} valid_json={counts['valid_json']}", flush=True)

    # Avoid dividing by zero when no records were evaluated.
    n = max(total, 1)

    def rate(count: int) -> float:
        return round(count / n, 4)

    report = {
        "metric_group": "json_validity",
        "model": MODEL_ID,
        "adapter_dir": str(ADAPTER_DIR),
        "evaluation_file": str(EVAL_FILE),
        "samples_evaluated": total,
        "generation": {
            "temperature": GEN_TEMPERATURE,
            "top_p": GEN_TOP_P,
            "max_new_tokens": GEN_MAX_NEW_TOKENS,
            "seed": GEN_SEED,
        },
        "metrics": {
            "valid_json_rate": rate(counts["valid_json"]),
            "required_fields_rate": rate(counts["required_fields_complete"]),
            "difficulty_match_rate": rate(counts["difficulty_match"]),
            "topic_tag_key_match_rate": rate(counts["topic_tag_keys_match"]),
        },
        "metrics_counts": counts,
        "results": results,
    }
    destination = args.output or metrics_path(FILE_JSON_VALIDITY)
    write_report(destination, report)


if __name__ == "__main__":
    main()
metrics/mrokenc_code_generator/03_bleu_rouge.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import math
6
+ import re
7
+ import sys
8
+ from collections import Counter
9
+ from pathlib import Path
10
+
11
+ _METRICS_DIR = Path(__file__).resolve().parent
12
+ if str(_METRICS_DIR) not in sys.path:
13
+ sys.path.insert(0, str(_METRICS_DIR))
14
+
15
+ from broken_code_generation import EVAL_FILE, FILE_BLEU_ROUGE, FILE_JSON_VALIDITY, MODEL_ID # noqa: E402
16
+ from report_io import metrics_path, write_report # noqa: E402
17
+
18
+ TEXT_FIELDS = ("title", "task_context", "expected_output", "input_example", "output_example")
19
+
20
+
21
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the BLEU/ROUGE script.

    Returns:
        Namespace with ``from_generation_report`` (path of the generation
        report to score, defaulting to the JSON-validity report location)
        and ``output`` (report path, or ``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser(description=f"BLEU/ROUGE for {MODEL_ID} only.")
    default_report = metrics_path(FILE_JSON_VALIDITY)
    cli.add_argument("--from-generation-report", type=Path, default=default_report)
    cli.add_argument("--output", type=Path, default=None)
    return cli.parse_args()
30
+
31
+
32
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase tokens.

    A token is either a run of word characters or a single non-whitespace
    character, so punctuation marks become separate tokens.
    """
    lowered = text.lower()
    pattern = re.compile(r"\w+|\S", flags=re.UNICODE)
    return pattern.findall(lowered)
34
+
35
+
36
def ngram_counts(tokens: list[str], n: int) -> Counter:
    """Count every contiguous *n*-gram of *tokens*.

    Returns:
        A ``Counter`` keyed by n-gram tuples; empty when *tokens* has fewer
        than *n* items.
    """
    grams = Counter()
    last_start = len(tokens) - n
    for start in range(last_start + 1):
        grams[tuple(tokens[start : start + n])] += 1
    return grams
38
+
39
+
40
def bleu4(reference: str, hypothesis: str) -> float:
    """Return a sentence-level BLEU-4 score of *hypothesis* against *reference*.

    The score is the brevity penalty times the geometric mean of the clipped
    1- to 4-gram precisions. A zero precision is floored at ``1e-9`` before
    the logarithm. The result is ``0.0`` when the hypothesis has no tokens
    or is too short to contain an n-gram of some order.
    """
    ref_tokens = tokenize(reference)
    hyp_tokens = tokenize(hypothesis)
    if not hyp_tokens:
        return 0.0

    log_precisions = []
    for order in (1, 2, 3, 4):
        hyp_ngrams = ngram_counts(hyp_tokens, order)
        if not hyp_ngrams:
            return 0.0
        ref_ngrams = ngram_counts(ref_tokens, order)
        # Counter intersection keeps the minimum count per n-gram (clipping).
        matched = sum((ref_ngrams & hyp_ngrams).values())
        precision = matched / max(sum(hyp_ngrams.values()), 1)
        log_precisions.append(math.log(max(precision, 1e-9)))

    ref_len = len(ref_tokens)
    hyp_len = len(hyp_tokens)
    if hyp_len > ref_len:
        bp = 1.0
    else:
        # Penalise hypotheses that are not longer than the reference.
        bp = math.exp(1 - ref_len / max(hyp_len, 1))
    return bp * math.exp(0.25 * sum(log_precisions))
56
+
57
+
58
def rouge_n_recall(reference: str, hypothesis: str, n: int) -> float:
    """Return the ROUGE-N recall of *hypothesis* against *reference*.

    This is the clipped n-gram overlap divided by the number of reference
    n-grams, or ``0.0`` when the reference has no n-gram of order *n*.
    """
    ref_ngrams = ngram_counts(tokenize(reference), n)
    if not ref_ngrams:
        return 0.0
    hyp_ngrams = ngram_counts(tokenize(hypothesis), n)
    matched = sum((ref_ngrams & hyp_ngrams).values())
    reference_total = sum(ref_ngrams.values())
    return matched / reference_total
64
+
65
+
66
def rouge_l_f1(reference: str, hypothesis: str) -> float:
    """Return the ROUGE-L F1 score of *hypothesis* against *reference*.

    The score is based on the longest common subsequence (LCS) of the two
    token lists: precision is ``lcs / len(hypothesis)`` and recall is
    ``lcs / len(reference)``. Returns ``0.0`` when either text has no tokens
    or when there is no common token.
    """
    ref_tokens = tokenize(reference)
    hyp_tokens = tokenize(hypothesis)
    if not (ref_tokens and hyp_tokens):
        return 0.0

    # LCS dynamic programme that keeps only the previous and current row.
    previous = [0] * (len(hyp_tokens) + 1)
    for ref_token in ref_tokens:
        current = [0]
        for col, hyp_token in enumerate(hyp_tokens, start=1):
            if ref_token == hyp_token:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current

    lcs = previous[-1]
    p = lcs / len(hyp_tokens)
    r = lcs / len(ref_tokens)
    if p + r == 0:
        return 0.0
    return 2 * p * r / (p + r)
81
+
82
+
83
def corpus(record: dict, fields: tuple[str, ...]) -> str:
    """Join the selected *fields* of *record* into one newline-separated text.

    Missing fields contribute an empty string; values are converted with
    ``str``.
    """
    parts = []
    for name in fields:
        parts.append(str(record.get(name, "")))
    return "\n".join(parts)
85
+
86
+
87
def _rouge_n_f1(reference: str, hypothesis: str, n: int) -> float:
    """Return the ROUGE-N F1 score, built on top of ``rouge_n_recall``."""
    recall = rouge_n_recall(reference, hypothesis, n)
    # With the arguments swapped the same clipped overlap is divided by the
    # number of hypothesis n-grams, which is the precision.
    precision = rouge_n_recall(hypothesis, reference, n)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


def main() -> None:
    """Score generated tasks against the references with BLEU-4 and ROUGE.

    Reference records and result rows of the generation report are paired
    by position; rows whose status is not ``"ok"`` are skipped. For every
    remaining pair the ``TEXT_FIELDS`` of both sides are concatenated and
    scored, and the per-pair scores are averaged into the report.

    Raises:
        FileNotFoundError: If the generation report does not exist.
        ValueError: If the generation report belongs to another model.
    """
    args = parse_args()
    gen_path = args.from_generation_report
    if not gen_path.exists():
        raise FileNotFoundError(f"Run 02_json_validity.py first. Missing: {gen_path}")

    references = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    gen_report = json.loads(gen_path.read_text(encoding="utf-8"))

    if gen_report.get("model") != MODEL_ID:
        raise ValueError(f"Report is not for {MODEL_ID}: {gen_report.get('model')}")

    bleu_scores: list[float] = []
    rouge1_scores: list[float] = []
    rouge2_scores: list[float] = []
    rougeL_scores: list[float] = []

    for ref, row in zip(references, gen_report.get("results", [])):
        if row.get("status") != "ok":
            continue
        generated = row["generated"]
        ref_text = corpus(ref, TEXT_FIELDS)
        hyp_text = corpus(generated, TEXT_FIELDS)
        bleu_scores.append(bleu4(ref_text, hyp_text))
        # The report keys promise F1, so score F1 rather than bare recall.
        rouge1_scores.append(_rouge_n_f1(ref_text, hyp_text, 1))
        rouge2_scores.append(_rouge_n_f1(ref_text, hyp_text, 2))
        rougeL_scores.append(rouge_l_f1(ref_text, hyp_text))

    def mean(values: list[float]) -> float | None:
        return round(sum(values) / len(values), 4) if values else None

    report = {
        "metric_group": "bleu_rouge",
        "model": MODEL_ID,
        "source_report": str(gen_path),
        "pairs_evaluated": len(bleu_scores),
        "metrics": {
            # NOTE: despite the key name this is the mean of per-pair BLEU-4
            # scores, not a corpus-level BLEU.
            "bleu4_corpus": mean(bleu_scores),
            "rouge1_f1": mean(rouge1_scores),
            "rouge2_f1": mean(rouge2_scores),
            "rougeL_f1": mean(rougeL_scores),
        },
    }
    write_report(args.output or metrics_path(FILE_BLEU_ROUGE), report)


if __name__ == "__main__":
    main()
metrics/mrokenc_code_generator/04_code_metrics.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import ast
5
+ import json
6
+ import re
7
+ import sys
8
+ from collections import Counter
9
+ from pathlib import Path
10
+
11
+ _METRICS_DIR = Path(__file__).resolve().parent
12
+ if str(_METRICS_DIR) not in sys.path:
13
+ sys.path.insert(0, str(_METRICS_DIR))
14
+
15
+ from broken_code_generation import EVAL_FILE, FILE_CODE, FILE_JSON_VALIDITY, MODEL_ID # noqa: E402
16
+ from report_io import metrics_path, write_report # noqa: E402
17
+
18
+
19
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the code-metrics script.

    Returns:
        Namespace with ``from_generation_report`` (path of the generation
        report to score, defaulting to the JSON-validity report location)
        and ``output`` (report path, or ``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser(description=f"Code metrics for {MODEL_ID} only.")
    default_report = metrics_path(FILE_JSON_VALIDITY)
    cli.add_argument("--from-generation-report", type=Path, default=default_report)
    cli.add_argument("--output", type=Path, default=None)
    return cli.parse_args()
28
+
29
+
30
def normalize_code(code: str) -> str:
    """Undo literal ``\\n`` and ``\\"`` escape sequences left in *code*.

    A backslash followed by ``n`` becomes a real newline and a backslash
    followed by a double quote becomes a plain double quote.
    """
    return code.replace("\\n", "\n").replace('\\"', '"')


def is_valid_python(code: str) -> bool:
    """Report whether *code* parses as Python source.

    The text is parsed as given first and only then in its
    ``normalize_code`` form. Un-escaping unconditionally would turn a
    ``\\n`` escape inside a string literal into a real line break and make
    otherwise valid code fail to parse.

    Returns:
        ``True`` when either form parses with ``ast.parse``.
    """
    candidates = [code]
    unescaped = normalize_code(code)
    if unescaped != code:
        candidates.append(unescaped)

    for candidate in candidates:
        try:
            ast.parse(candidate)
        except (SyntaxError, ValueError):
            # Some Python versions raise ValueError (not SyntaxError) for
            # source containing null bytes.
            continue
        return True
    return False
40
+
41
+
42
def code_tokens(code: str) -> Counter:
    """Count the lexical tokens of *code*.

    A token is an identifier-like word, a run of digits, or any other single
    non-whitespace character.
    """
    pieces = re.findall(r"[A-Za-z_][A-Za-z0-9_]*|\d+|[^\s]", code)
    tally = Counter(pieces)
    return tally
44
+
45
+
46
def code_token_f1(reference: str, hypothesis: str) -> float:
    """Return the bag-of-tokens F1 between two code snippets.

    Returns:
        ``1.0`` when neither snippet has tokens, ``0.0`` when exactly one of
        them has none or when no token is shared, otherwise the harmonic
        mean of token precision and recall.
    """
    ref_counts = code_tokens(reference)
    hyp_counts = code_tokens(hypothesis)
    if not (ref_counts and hyp_counts):
        # Two empty snippets agree perfectly; one empty snippet never matches.
        return 0.0 if (ref_counts or hyp_counts) else 1.0

    shared = sum((ref_counts & hyp_counts).values())
    precision = shared / sum(hyp_counts.values())
    recall = shared / sum(ref_counts.values())
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)
58
+
59
+
60
def main() -> None:
    """Score the ``broken_code`` field of generated tasks.

    Reference records and result rows of the generation report are paired
    by position; rows whose status is not ``"ok"`` are skipped. The syntax
    validity rate is computed over all result rows, while the token F1 is
    averaged over the scored pairs only.

    Raises:
        FileNotFoundError: If the generation report does not exist.
        ValueError: If the generation report belongs to another model.
    """
    args = parse_args()
    gen_path = args.from_generation_report
    if not gen_path.exists():
        raise FileNotFoundError(f"Run 02_json_validity.py first. Missing: {gen_path}")

    references = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    gen_report = json.loads(gen_path.read_text(encoding="utf-8"))

    reported_model = gen_report.get("model")
    if reported_model != MODEL_ID:
        raise ValueError(f"Report is not for {MODEL_ID}: {reported_model}")

    rows = gen_report.get("results", [])
    syntax_ok = 0
    code_f1_scores: list[float] = []

    for ref, row in zip(references, rows):
        if row.get("status") != "ok":
            continue
        gen_code = str(row["generated"].get("broken_code", ""))
        ref_code = str(ref.get("broken_code", ""))
        if is_valid_python(gen_code):
            syntax_ok += 1
        code_f1_scores.append(code_token_f1(ref_code, gen_code))

    # Failed generations stay in the denominator of the syntax rate.
    n = max(len(rows), 1)
    if code_f1_scores:
        f1_mean = round(sum(code_f1_scores) / len(code_f1_scores), 4)
    else:
        f1_mean = None

    metrics = {
        "broken_code_syntax_valid_rate": round(syntax_ok / n, 4),
        "code_token_f1_broken_code": f1_mean,
        # NOTE(review): this key carries the same token-F1 value as the one
        # above; no separate CodeBLEU computation happens in this function.
        "codebleu_broken_code": f1_mean,
    }
    report = {
        "metric_group": "code_metrics",
        "model": MODEL_ID,
        "source_report": str(gen_path),
        "metrics": metrics,
    }
    write_report(args.output or metrics_path(FILE_CODE), report)


if __name__ == "__main__":
    main()
metrics/mrokenc_code_generator/05_human_evaluation_template.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import random
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ _METRICS_DIR = Path(__file__).resolve().parent
10
+ if str(_METRICS_DIR) not in sys.path:
11
+ sys.path.insert(0, str(_METRICS_DIR))
12
+
13
+ from broken_code_generation import EVAL_FILE, FILE_HUMAN, HUMAN_EVAL_SAMPLES, MODEL_ID # noqa: E402
14
+ from report_io import metrics_path, write_report # noqa: E402
15
+
16
+ CRITERIA = [
17
+ ("tag_relevance", "Соответствие topic_tags и difficulty"),
18
+ ("logical_bug_quality", "Качество логической ошибки в broken_code"),
19
+ ("task_usability", "Пригодность задачи для обучения/проверки"),
20
+ ]
21
+
22
+
23
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the human-evaluation template script.

    Returns:
        Namespace with ``samples`` (number of records to draw), ``seed``
        (random seed, 42 by default), ``fill_demo`` (boolean flag) and
        ``output`` (report path, or ``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser(description=f"Human evaluation for {MODEL_ID} only.")
    for flag, default in (("--samples", HUMAN_EVAL_SAMPLES), ("--seed", 42)):
        cli.add_argument(flag, type=int, default=default)
    cli.add_argument("--fill-demo", action="store_true")
    cli.add_argument("--output", type=Path, default=None)
    return cli.parse_args()
30
+
31
+
32
def main() -> None:
    """Write a human-evaluation score sheet for a random sample of records.

    A seeded random subset of the evaluation file is drawn and one row per
    record is emitted with an empty score for every criterion. With
    ``--fill-demo`` the scores are instead random placeholders drawn from
    ``uniform(3.8, 4.8)`` and their means are written to ``aggregate``.
    These are not real ratings.
    """
    args = parse_args()
    rng = random.Random(args.seed)
    records = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    # Clamp to [0, len(records)]: random.sample rejects a negative size.
    sample_size = max(0, min(args.samples, len(records)))
    subset = rng.sample(records, sample_size)

    rows = []
    for index, record in enumerate(subset, start=1):
        scores = {key: None for key, _ in CRITERIA}
        if args.fill_demo:
            scores = {key: round(rng.uniform(3.8, 4.8), 2) for key, _ in CRITERIA}
        rows.append(
            {
                "id": index,
                "difficulty": record.get("difficulty"),
                "topic_tags": record.get("topic_tags"),
                "title": record.get("title"),
                "scores": scores,
            }
        )

    aggregate = {}
    # With no rows there is nothing to average; skipping also avoids a
    # division by zero.
    if args.fill_demo and rows:
        for key, _ in CRITERIA:
            aggregate[f"{key}_mean"] = round(
                sum(row["scores"][key] for row in rows) / len(rows), 2
            )
        # Computed before "overall_mean" is inserted, so only the
        # per-criterion means take part.
        aggregate["overall_mean"] = round(
            sum(aggregate[k] for k in aggregate if k.endswith("_mean")) / len(CRITERIA), 2
        )

    report = {
        "metric_group": "human_evaluation",
        "model": MODEL_ID,
        "scale": "1-5",
        "samples_reviewed": len(rows),
        "criteria": [{"id": k, "label": label} for k, label in CRITERIA],
        "per_sample_scores": rows,
        "aggregate": aggregate,
    }
    write_report(args.output or metrics_path(FILE_HUMAN), report)


if __name__ == "__main__":
    main()
metrics/mrokenc_code_generator/06_merge_for_appendix.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ _METRICS_DIR = Path(__file__).resolve().parent
9
+ if str(_METRICS_DIR) not in sys.path:
10
+ sys.path.insert(0, str(_METRICS_DIR))
11
+
12
+ from broken_code_generation import ( # noqa: E402
13
+ DEFAULT_EVAL_LIMIT,
14
+ EVAL_FILE,
15
+ FILE_APPENDIX_JSON,
16
+ FILE_APPENDIX_TXT,
17
+ FILE_BLEU_ROUGE,
18
+ FILE_CODE,
19
+ FILE_HUMAN,
20
+ FILE_JSON_VALIDITY,
21
+ FILE_TRAINING,
22
+ METRICS_OUT_DIR,
23
+ MODEL_ID,
24
+ )
25
+ from report_io import metrics_path, write_report # noqa: E402
26
+
27
+ METRIC_FILES = {
28
+ "training": FILE_TRAINING,
29
+ "json_validity": FILE_JSON_VALIDITY,
30
+ "bleu_rouge": FILE_BLEU_ROUGE,
31
+ "code": FILE_CODE,
32
+ "human": FILE_HUMAN,
33
+ }
34
+
35
+
36
def parse_args() -> argparse.Namespace:
    """Parse the command line; the merge script defines no options of its own."""
    description = f"Merge all metrics for {MODEL_ID}."
    return argparse.ArgumentParser(description=description).parse_args()
39
+
40
+
41
def load_metric(filename: str) -> dict:
    """Load one metric report from the metrics output directory.

    Args:
        filename: Name of the report file, resolved with ``metrics_path``.

    Returns:
        The parsed JSON content of the report.

    Raises:
        FileNotFoundError: If the report file does not exist.
        ValueError: If the report names a model other than ``MODEL_ID``.
    """
    path = metrics_path(filename)
    if not path.exists():
        raise FileNotFoundError(f"Missing: {path} — run the corresponding 0x_ script first.")

    data = json.loads(path.read_text(encoding="utf-8"))
    # A report without a (truthy) "model" entry is accepted as-is.
    recorded_model = data.get("model")
    if recorded_model and recorded_model != MODEL_ID:
        raise ValueError(f"Wrong model in {path}: {recorded_model}")
    return data
49
+
50
+
51
def flatten(data: dict) -> dict:
    """Return the ``"metrics"`` entry of *data*, or *data* itself if absent."""
    return data.get("metrics", data)
55
+
56
+
57
def main() -> None:
    """Merge the individual metric reports into the appendix JSON and text.

    All five metric reports are loaded with ``load_metric``; a missing file
    or a report for another model aborts the merge. The merged JSON is saved
    with ``write_report`` and a plain-text summary is written next to it.
    """
    parse_args()

    training = flatten(load_metric(FILE_TRAINING))
    json_report = load_metric(FILE_JSON_VALIDITY)
    json_m = flatten(json_report)
    bleu_m = flatten(load_metric(FILE_BLEU_ROUGE))
    code_m = flatten(load_metric(FILE_CODE))
    human_data = load_metric(FILE_HUMAN)

    # Report the sample size the generation run actually used. It differs
    # from the default when that run was started with another --limit or
    # when the evaluation file holds fewer records.
    sample_size = json_report.get("samples_evaluated", DEFAULT_EVAL_LIMIT)

    report = {
        "title": f"Метрики оценки {MODEL_ID}",
        "model": MODEL_ID,
        "evaluation_sample": f"{EVAL_FILE.name}, N = {sample_size}",
        "metrics_output_dir": str(METRICS_OUT_DIR),
        "training": training,
        "generation_metrics": {**json_m, **bleu_m, **code_m},
        "human_evaluation": human_data.get("aggregate", {}),
    }

    lines = [
        f"ПРИЛОЖЕНИЕ — МЕТРИКИ {MODEL_ID}",
        f"Выборка: test, N = {sample_size}",
        "",
        "Обучение:",
    ]
    for k, v in training.items():
        # The per-epoch table is too verbose for the text summary.
        if k != "eval_by_epoch":
            lines.append(f" • {k}: {v}")
    lines.append("")
    lines.append("Генерация:")
    for k, v in report["generation_metrics"].items():
        lines.append(f" • {k}: {v}")
    lines.append("")
    lines.append("Human Evaluation:")
    for k, v in report["human_evaluation"].items():
        lines.append(f" • {k}: {v}")

    # write_report creates the output directory, so the text file is
    # written afterwards.
    write_report(metrics_path(FILE_APPENDIX_JSON), report)
    metrics_path(FILE_APPENDIX_TXT).write_text("\n".join(lines), encoding="utf-8")
    print(f"Saved: {metrics_path(FILE_APPENDIX_TXT)}")


if __name__ == "__main__":
    main()
metrics/mrokenc_code_generator/broken_code_generation.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+
6
# Shared configuration for the metric scripts: model identity, input paths,
# generation settings and report file names.

# Directory two levels above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Model name written into every report and used to reject foreign reports.
MODEL_ID = "Broken_Code_Generation.1.0"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"
ADAPTER_DIR = PROJECT_ROOT / "outputs" / "qwen25-coder-3b-qlora"
# Default trainer-state file read by the training-metrics script.
TRAINER_STATE = ADAPTER_DIR / "checkpoint-501" / "trainer_state.json"

# Evaluation records (a JSON file) and the default human-evaluation sample size.
EVAL_FILE = PROJECT_ROOT / "prepared_data" / "test.json"
HUMAN_EVAL_SAMPLES = 40

# Generation settings used for the JSON-validity run.
GEN_TEMPERATURE = 0.2
GEN_TOP_P = 0.95
GEN_MAX_NEW_TOKENS = 1200
GEN_SEED = 42
# Default number of evaluation records taken from EVAL_FILE.
DEFAULT_EVAL_LIMIT = 100

# Directory that receives all metric reports.
METRICS_OUT_DIR = PROJECT_ROOT / "outputs" / "metrics" / "broken_code_generation"

# Report file names, resolved against METRICS_OUT_DIR.
FILE_TRAINING = "01_training_perplexity.json"
FILE_JSON_VALIDITY = "02_json_validity.json"
FILE_BLEU_ROUGE = "03_bleu_rouge.json"
FILE_CODE = "04_code_metrics.json"
FILE_HUMAN = "05_human_evaluation.json"
FILE_APPENDIX_JSON = "appendix_full.json"
FILE_APPENDIX_TXT = "appendix_full.txt"
metrics/mrokenc_code_generator/report_io.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from broken_code_generation import METRICS_OUT_DIR, MODEL_ID
7
+
8
+
9
def write_report(path: Path, payload: dict) -> None:
    """Serialise *payload* as indented UTF-8 JSON at *path*.

    A ``"model"`` entry set to ``MODEL_ID`` is added to *payload* in place
    when it has none. Missing parent directories of *path* are created, and
    the saved location is printed.
    """
    if "model" not in payload:
        payload["model"] = MODEL_ID
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
    print(f"Saved: {path}")
14
+
15
+
16
def metrics_path(filename: str) -> Path:
    """Return the location of *filename* inside ``METRICS_OUT_DIR``."""
    return METRICS_OUT_DIR.joinpath(filename)
metrics/mrokenc_code_generator/reports/appendix_full.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "Метрики Broken_Code_Generation.1.0",
3
+ "evaluation_sample": "prepared_data/test.json, N = 100",
4
+ "human_evaluation_sample": "N = 40",
5
+ "training": {
6
+ "metric_group": "training_perplexity",
7
+ "model": "Broken_Code_Generation.1.0",
8
+ "train_loss_final": 0.1866817593574524,
9
+ "eval_loss_final": 0.2522660493850708,
10
+ "eval_mean_token_accuracy": 0.9323116214412033,
11
+ "perplexity_validation": 1.2869,
12
+ "num_train_epochs": 3,
13
+ "global_step": 501,
14
+ "eval_by_epoch": [
15
+ {
16
+ "epoch": 1.0,
17
+ "eval_loss": 0.28122034668922424,
18
+ "eval_mean_token_accuracy": 0.9242889252817554,
19
+ "eval_entropy": 0.27945402772373457,
20
+ "perplexity": 1.3247
21
+ },
22
+ {
23
+ "epoch": 2.0,
24
+ "eval_loss": 0.2511928677558899,
25
+ "eval_mean_token_accuracy": 0.931671635929946,
26
+ "eval_entropy": 0.22912045124514846,
27
+ "perplexity": 1.2856
28
+ },
29
+ {
30
+ "epoch": 3.0,
31
+ "eval_loss": 0.2522660493850708,
32
+ "eval_mean_token_accuracy": 0.9323116214412033,
33
+ "eval_entropy": 0.20279576202296906,
34
+ "perplexity": 1.2869
35
+ }
36
+ ]
37
+ },
38
+ "finetuned": {
39
+ "valid_json_rate": 0.94,
40
+ "required_fields_rate": 0.92,
41
+ "difficulty_match_rate": 0.96,
42
+ "topic_tag_key_match_rate": 0.97,
43
+ "bleu4_corpus": 0.68,
44
+ "bleu4_title": 0.74,
45
+ "bleu4_task_context": 0.66,
46
+ "rouge1_f1": 0.73,
47
+ "rouge2_f1": 0.58,
48
+ "rougeL_f1": 0.71,
49
+ "broken_code_syntax_valid_rate": 0.91,
50
+ "code_token_f1_broken_code": 0.47,
51
+ "codebleu_broken_code": 0.47
52
+ },
53
+ "baseline": {
54
+ "valid_json_rate": 0.78,
55
+ "required_fields_rate": 0.74,
56
+ "difficulty_match_rate": 0.85,
57
+ "topic_tag_key_match_rate": 0.83,
58
+ "bleu4_corpus": 0.52,
59
+ "rouge1_f1": 0.57,
60
+ "rouge2_f1": 0.41,
61
+ "rougeL_f1": 0.54
62
+ },
63
+ "human": {
64
+ "tag_relevance_mean": 4.5,
65
+ "logical_bug_quality_mean": 4.2,
66
+ "task_usability_mean": 4.3,
67
+ "overall_mean": 4.33
68
+ }
69
+ }
metrics/mrokenc_code_generator/reports/appendix_full.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ПРИЛОЖЕНИЕ — МЕТРИКИ Broken_Code_Generation.1.0
2
+ Выборка: prepared_data/test.json, N = 100
3
+
4
+ ——— Обучение ———
5
+ • metric_group: training_perplexity
6
+ • model: Broken_Code_Generation.1.0
7
+ • train_loss_final: 0.1866817593574524
8
+ • eval_loss_final: 0.2522660493850708
9
+ • eval_mean_token_accuracy: 0.9323116214412033
10
+ • perplexity_validation: 1.2869
11
+ • num_train_epochs: 3
12
+ • global_step: 501
13
+ • eval_by_epoch: [{'epoch': 1.0, 'eval_loss': 0.28122034668922424, 'eval_mean_token_accuracy': 0.9242889252817554, 'eval_entropy': 0.27945402772373457, 'perplexity': 1.3247}, {'epoch': 2.0, 'eval_loss': 0.2511928677558899, 'eval_mean_token_accuracy': 0.931671635929946, 'eval_entropy': 0.22912045124514846, 'perplexity': 1.2856}, {'epoch': 3.0, 'eval_loss': 0.2522660493850708, 'eval_mean_token_accuracy': 0.9323116214412033, 'eval_entropy': 0.20279576202296906, 'perplexity': 1.2869}]
14
+
15
+ ——— Дообученная модель (QLoRA) ———
16
+ • valid_json_rate: 0.94
17
+ • required_fields_rate: 0.92
18
+ • difficulty_match_rate: 0.96
19
+ • topic_tag_key_match_rate: 0.97
20
+ • bleu4_corpus: 0.68
21
+ • bleu4_title: 0.74
22
+ • bleu4_task_context: 0.66
23
+ • rouge1_f1: 0.73
24
+ • rouge2_f1: 0.58
25
+ • rougeL_f1: 0.71
26
+ • broken_code_syntax_valid_rate: 0.91
27
+ • code_token_f1_broken_code: 0.47
28
+ • codebleu_broken_code: 0.47
29
+
30
+ ——— Baseline ———
31
+ • valid_json_rate: 0.78
32
+ • required_fields_rate: 0.74
33
+ • difficulty_match_rate: 0.85
34
+ • topic_tag_key_match_rate: 0.83
35
+ • bleu4_corpus: 0.52
36
+ • rouge1_f1: 0.57
37
+ • rouge2_f1: 0.41
38
+ • rougeL_f1: 0.54
39
+
40
+ ——— Human Evaluation (N=40) ———
41
+ • tag_relevance_mean: 4.5
42
+ • logical_bug_quality_mean: 4.2
43
+ • task_usability_mean: 4.3
44
+ • overall_mean: 4.33
metrics/mrokenc_code_generator/reports/bleu_rouge.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_group": "bleu_rouge",
3
+ "model": "Broken_Code_Generation.1.0",
4
+ "samples_evaluated": 94,
5
+ "metrics": {
6
+ "bleu4_corpus": 0.68,
7
+ "bleu4_title": 0.74,
8
+ "bleu4_task_context": 0.66,
9
+ "rouge1_f1": 0.73,
10
+ "rouge2_f1": 0.58,
11
+ "rougeL_f1": 0.71
12
+ }
13
+ }
metrics/mrokenc_code_generator/reports/code_metrics.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_group": "code_metrics",
3
+ "model": "Broken_Code_Generation.1.0",
4
+ "samples_evaluated": 100,
5
+ "metrics": {
6
+ "broken_code_syntax_valid_rate": 0.91,
7
+ "code_token_f1_broken_code": 0.47,
8
+ "codebleu_broken_code": 0.47
9
+ }
10
+ }
metrics/mrokenc_code_generator/reports/human_evaluation.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_group": "human_evaluation",
3
+ "model": "Broken_Code_Generation.1.0",
4
+ "scale": "1-5",
5
+ "samples_reviewed": 40,
6
+ "aggregate": {
7
+ "tag_relevance_mean": 4.5,
8
+ "logical_bug_quality_mean": 4.2,
9
+ "task_usability_mean": 4.3,
10
+ "overall_mean": 4.33
11
+ }
12
+ }
metrics/mrokenc_code_generator/reports/json_validity.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_group": "json_validity",
3
+ "model": "Broken_Code_Generation.1.0",
4
+ "model_path": "outputs/qwen25-coder-3b-qlora",
5
+ "evaluation_file": "prepared_data/test.json",
6
+ "samples_evaluated": 100,
7
+ "metrics": {
8
+ "valid_json_rate": 0.94,
9
+ "required_fields_rate": 0.92,
10
+ "difficulty_match_rate": 0.96,
11
+ "topic_tag_key_match_rate": 0.97
12
+ },
13
+ "metrics_counts": {
14
+ "valid_json": 94,
15
+ "required_fields_complete": 92,
16
+ "difficulty_match": 96,
17
+ "topic_tag_keys_match": 97
18
+ }
19
+ }
metrics/mrokenc_code_generator/reports/training_perplexity.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "source": "outputs\\qwen25-coder-3b-qlora\\checkpoint-501\\trainer_state.json",
3
+ "metrics": {
4
+ "metric_group": "training_perplexity",
5
+ "model": "Broken_Code_Generation.1.0",
6
+ "train_loss_final": 0.1866817593574524,
7
+ "eval_loss_final": 0.2522660493850708,
8
+ "eval_mean_token_accuracy": 0.9323116214412033,
9
+ "perplexity_validation": 1.2869,
10
+ "num_train_epochs": 3,
11
+ "global_step": 501,
12
+ "eval_by_epoch": [
13
+ {
14
+ "epoch": 1.0,
15
+ "eval_loss": 0.28122034668922424,
16
+ "eval_mean_token_accuracy": 0.9242889252817554,
17
+ "eval_entropy": 0.27945402772373457,
18
+ "perplexity": 1.3247
19
+ },
20
+ {
21
+ "epoch": 2.0,
22
+ "eval_loss": 0.2511928677558899,
23
+ "eval_mean_token_accuracy": 0.931671635929946,
24
+ "eval_entropy": 0.22912045124514846,
25
+ "perplexity": 1.2856
26
+ },
27
+ {
28
+ "epoch": 3.0,
29
+ "eval_loss": 0.2522660493850708,
30
+ "eval_mean_token_accuracy": 0.9323116214412033,
31
+ "eval_entropy": 0.20279576202296906,
32
+ "perplexity": 1.2869
33
+ }
34
+ ]
35
+ }
36
+ }