Text Generation
Transformers
Safetensors
qwen2
qwen
qwen2.5-coder
code
fine-tuned
russian
conversational
text-generation-inference
Instructions to use Vilyam888/Broken_Code_Generation.1.0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Vilyam888/Broken_Code_Generation.1.0 with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="Vilyam888/Broken_Code_Generation.1.0") messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("Vilyam888/Broken_Code_Generation.1.0") model = AutoModelForCausalLM.from_pretrained("Vilyam888/Broken_Code_Generation.1.0") messages = [ {"role": "user", "content": "Who are you?"}, ] inputs = tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Inference
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Vilyam888/Broken_Code_Generation.1.0 with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "Vilyam888/Broken_Code_Generation.1.0" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Vilyam888/Broken_Code_Generation.1.0", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/Vilyam888/Broken_Code_Generation.1.0
- SGLang
How to use Vilyam888/Broken_Code_Generation.1.0 with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "Vilyam888/Broken_Code_Generation.1.0" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Vilyam888/Broken_Code_Generation.1.0", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "Vilyam888/Broken_Code_Generation.1.0" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Vilyam888/Broken_Code_Generation.1.0", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use Vilyam888/Broken_Code_Generation.1.0 with Docker Model Runner:
docker model run hf.co/Vilyam888/Broken_Code_Generation.1.0
Upload folder using huggingface_hub
Browse files
- metrics/mrokenc_code_generator/01_training_perplexity.py +76 -0
- metrics/mrokenc_code_generator/02_json_validity.py +115 -0
- metrics/mrokenc_code_generator/03_bleu_rouge.py +134 -0
- metrics/mrokenc_code_generator/04_code_metrics.py +101 -0
- metrics/mrokenc_code_generator/05_human_evaluation_template.py +76 -0
- metrics/mrokenc_code_generator/06_merge_for_appendix.py +100 -0
- metrics/mrokenc_code_generator/broken_code_generation.py +30 -0
- metrics/mrokenc_code_generator/report_io.py +17 -0
- metrics/mrokenc_code_generator/reports/appendix_full.json +69 -0
- metrics/mrokenc_code_generator/reports/appendix_full.txt +44 -0
- metrics/mrokenc_code_generator/reports/bleu_rouge.json +13 -0
- metrics/mrokenc_code_generator/reports/code_metrics.json +10 -0
- metrics/mrokenc_code_generator/reports/human_evaluation.json +12 -0
- metrics/mrokenc_code_generator/reports/json_validity.json +19 -0
- metrics/mrokenc_code_generator/reports/training_perplexity.json +36 -0
metrics/mrokenc_code_generator/01_training_perplexity.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import math
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
_METRICS_DIR = Path(__file__).resolve().parent
|
| 10 |
+
if str(_METRICS_DIR) not in sys.path:
|
| 11 |
+
sys.path.insert(0, str(_METRICS_DIR))
|
| 12 |
+
|
| 13 |
+
from broken_code_generation import FILE_TRAINING, MODEL_ID, TRAINER_STATE # noqa: E402
|
| 14 |
+
from report_io import metrics_path, write_report # noqa: E402
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the training-metrics script.

    Returns:
        Namespace with ``trainer_state`` (a ``Path`` defaulting to
        ``TRAINER_STATE``) and ``output`` (a ``Path`` or ``None``).
    """
    cli = argparse.ArgumentParser(
        description=f"Training metrics for {MODEL_ID} only."
    )
    # Both options are filesystem paths; only their defaults differ.
    for flag, fallback in (("--trainer_state", TRAINER_STATE), ("--output", None)):
        cli.add_argument(flag, type=Path, default=fallback)
    return cli.parse_args()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_metrics(state: dict) -> dict:
    """Summarise the loss history stored in a trainer-state dictionary.

    Args:
        state: Parsed trainer state. ``log_history`` is read as a list of
            dict entries; ``num_train_epochs`` and ``global_step`` are
            copied through unchanged.

    Returns:
        Dict holding the last logged training loss, the last evaluation
        loss and token accuracy, the validation perplexity
        (``exp(eval_loss)``) and one summary row per evaluation entry.
        Values that never appear in the history are ``None``.
    """
    train_loss = eval_loss = eval_acc = None
    eval_by_epoch = []

    for entry in state.get("log_history", []):
        if "eval_loss" in entry:
            eval_by_epoch.append(
                {
                    "epoch": entry.get("epoch"),
                    "eval_loss": entry.get("eval_loss"),
                    "eval_mean_token_accuracy": entry.get("eval_mean_token_accuracy"),
                    "perplexity": round(math.exp(entry["eval_loss"]), 4),
                }
            )
            # Entries are visited in order, so the last evaluation entry wins.
            eval_loss = entry["eval_loss"]
            eval_acc = entry.get("eval_mean_token_accuracy")
        elif "loss" in entry:
            # Training-only log line; keep the most recent one.
            train_loss = entry["loss"]

    # Compare against None explicitly: a loss of exactly 0.0 is a valid
    # value (perplexity 1.0) and must not be reported as missing.
    perplexity = round(math.exp(eval_loss), 4) if eval_loss is not None else None

    return {
        "train_loss_final": train_loss,
        "eval_loss_final": eval_loss,
        "eval_mean_token_accuracy": eval_acc,
        "perplexity_validation": perplexity,
        "num_train_epochs": state.get("num_train_epochs"),
        "global_step": state.get("global_step"),
        "eval_by_epoch": eval_by_epoch,
    }
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def main() -> None:
    """Build the training/perplexity report, save it and print its metrics."""
    args = parse_args()
    state_path = args.trainer_state
    destination = args.output if args.output else metrics_path(FILE_TRAINING)

    state = json.loads(state_path.read_text(encoding="utf-8"))
    report = dict(
        metric_group="training_perplexity",
        model=MODEL_ID,
        # NOTE: taken from the TRAINER_STATE constant, so it does not follow
        # a custom --trainer_state path.
        adapter_dir=str(TRAINER_STATE.parent.parent),
        source=str(state_path),
        metrics=extract_metrics(state),
    )

    write_report(destination, report)
    print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == "__main__":
|
| 76 |
+
main()
|
metrics/mrokenc_code_generator/02_json_validity.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
_METRICS_DIR = Path(__file__).resolve().parent
|
| 11 |
+
_SCRIPTS_DIR = _METRICS_DIR.parent
|
| 12 |
+
for path in (_METRICS_DIR, _SCRIPTS_DIR):
|
| 13 |
+
if str(path) not in sys.path:
|
| 14 |
+
sys.path.insert(0, str(path))
|
| 15 |
+
|
| 16 |
+
from broken_code_generation import ( # noqa: E402
|
| 17 |
+
ADAPTER_DIR,
|
| 18 |
+
DEFAULT_EVAL_LIMIT,
|
| 19 |
+
EVAL_FILE,
|
| 20 |
+
FILE_JSON_VALIDITY,
|
| 21 |
+
GEN_MAX_NEW_TOKENS,
|
| 22 |
+
GEN_SEED,
|
| 23 |
+
GEN_TEMPERATURE,
|
| 24 |
+
GEN_TOP_P,
|
| 25 |
+
MODEL_ID,
|
| 26 |
+
)
|
| 27 |
+
from evaluate_model import REQUIRED_FIELDS, generate_one, load_model_and_tokenizer # noqa: E402
|
| 28 |
+
from report_io import metrics_path, write_report # noqa: E402
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the JSON-validity script.

    Returns:
        Namespace with ``limit`` (int, default ``DEFAULT_EVAL_LIMIT``) and
        ``output`` (a ``Path`` or ``None``).
    """
    cli = argparse.ArgumentParser(
        description=f"JSON validity for {MODEL_ID} only (adapter at {ADAPTER_DIR})."
    )
    option_specs = (
        ("--limit", {"type": int, "default": DEFAULT_EVAL_LIMIT}),
        ("--output", {"type": Path, "default": None}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> None:
    """Generate one sample per evaluation record and report validity rates.

    Reads up to ``--limit`` records from ``EVAL_FILE``, calls
    ``generate_one`` for each, counts how many generations succeed and match
    the requested fields, and writes the report with per-record results.

    Raises:
        FileNotFoundError: if ``ADAPTER_DIR`` does not exist.
    """
    args = parse_args()
    torch.manual_seed(GEN_SEED)

    if not ADAPTER_DIR.exists():
        raise FileNotFoundError(f"Adapter not found: {ADAPTER_DIR}")

    records = json.loads(EVAL_FILE.read_text(encoding="utf-8"))[: args.limit]
    total = len(records)
    print(f"Model: {MODEL_ID}")
    print(f"Adapter: {ADAPTER_DIR}")
    print(f"Samples: {total} from {EVAL_FILE}")

    model, tokenizer = load_model_and_tokenizer(ADAPTER_DIR)
    model.eval()

    # Keys double as the "metrics_counts" section of the report.
    counts = {
        "valid_json": 0,
        "required_fields_complete": 0,
        "difficulty_match": 0,
        "topic_tag_keys_match": 0,
    }
    results = []

    for index, record in enumerate(records, start=1):
        row = {"index": index, "status": "error"}
        try:
            generated = generate_one(
                model=model,
                tokenizer=tokenizer,
                topic_tags=record["topic_tags"],
                difficulty=record["difficulty"],
                max_new_tokens=GEN_MAX_NEW_TOKENS,
                temperature=GEN_TEMPERATURE,
                top_p=GEN_TOP_P,
            )
            # Reaching this line (no exception from generate_one) is what is
            # counted as a valid JSON generation.
            counts["valid_json"] += 1
            row["status"] = "ok"
            row["generated"] = generated
            if REQUIRED_FIELDS.issubset(generated):
                counts["required_fields_complete"] += 1
            if generated.get("difficulty") == record["difficulty"]:
                counts["difficulty_match"] += 1
            if set(generated.get("topic_tags", {})) == set(record["topic_tags"]):
                counts["topic_tag_keys_match"] += 1
        except Exception as error:  # noqa: BLE001
            row["error"] = str(error)
        results.append(row)
        print(f"[{MODEL_ID}] {index}/{total} valid_json={counts['valid_json']}", flush=True)

    # Avoid dividing by zero when no records were loaded.
    denominator = max(total, 1)

    def rate(count_key: str) -> float:
        return round(counts[count_key] / denominator, 4)

    report = {
        "metric_group": "json_validity",
        "model": MODEL_ID,
        "adapter_dir": str(ADAPTER_DIR),
        "evaluation_file": str(EVAL_FILE),
        "samples_evaluated": total,
        "generation": {
            "temperature": GEN_TEMPERATURE,
            "top_p": GEN_TOP_P,
            "max_new_tokens": GEN_MAX_NEW_TOKENS,
            "seed": GEN_SEED,
        },
        "metrics": {
            "valid_json_rate": rate("valid_json"),
            "required_fields_rate": rate("required_fields_complete"),
            "difficulty_match_rate": rate("difficulty_match"),
            "topic_tag_key_match_rate": rate("topic_tag_keys_match"),
        },
        "metrics_counts": counts,
        "results": results,
    }
    write_report(args.output or metrics_path(FILE_JSON_VALIDITY), report)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
|
| 115 |
+
main()
|
metrics/mrokenc_code_generator/03_bleu_rouge.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import math
|
| 6 |
+
import re
|
| 7 |
+
import sys
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
_METRICS_DIR = Path(__file__).resolve().parent
|
| 12 |
+
if str(_METRICS_DIR) not in sys.path:
|
| 13 |
+
sys.path.insert(0, str(_METRICS_DIR))
|
| 14 |
+
|
| 15 |
+
from broken_code_generation import EVAL_FILE, FILE_BLEU_ROUGE, FILE_JSON_VALIDITY, MODEL_ID # noqa: E402
|
| 16 |
+
from report_io import metrics_path, write_report # noqa: E402
|
| 17 |
+
|
| 18 |
+
TEXT_FIELDS = ("title", "task_context", "expected_output", "input_example", "output_example")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the BLEU/ROUGE script.

    Returns:
        Namespace with ``from_generation_report`` (a ``Path`` defaulting to
        the JSON-validity report location) and ``output`` (``Path`` or
        ``None``).
    """
    cli = argparse.ArgumentParser(description=f"BLEU/ROUGE for {MODEL_ID} only.")
    default_report = metrics_path(FILE_JSON_VALIDITY)
    cli.add_argument("--from-generation-report", type=Path, default=default_report)
    cli.add_argument("--output", type=Path, default=None)
    return cli.parse_args()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def tokenize(text: str) -> list[str]:
    """Lower-case *text* and split it into word runs and single symbols.

    Each token is either a maximal run of word characters or one
    non-whitespace character; whitespace is discarded.
    """
    lowered = text.lower()
    token_pattern = re.compile(r"\w+|\S", flags=re.UNICODE)
    return token_pattern.findall(lowered)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def ngram_counts(tokens: list[str], n: int) -> Counter:
    """Count every contiguous n-gram of *tokens*.

    Returns:
        Counter keyed by n-gram tuples; empty when *tokens* is shorter
        than *n*.
    """
    grams: Counter = Counter()
    last_start = len(tokens) - n
    for start in range(last_start + 1):
        grams[tuple(tokens[start : start + n])] += 1
    return grams
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def bleu4(reference: str, hypothesis: str) -> float:
    """Compute a sentence-level BLEU-4 score for one reference/hypothesis pair.

    The score is the brevity penalty times the geometric mean of the clipped
    1- to 4-gram precisions. It is 0.0 when the hypothesis has no tokens or
    is too short to contain an n-gram of some order.
    """
    ref_tokens = tokenize(reference)
    hyp_tokens = tokenize(hypothesis)
    if len(hyp_tokens) == 0:
        return 0.0

    log_precision_sum = 0.0
    for order in (1, 2, 3, 4):
        ref_ngrams = ngram_counts(ref_tokens, order)
        hyp_ngrams = ngram_counts(hyp_tokens, order)
        if not hyp_ngrams:
            return 0.0
        # Counter intersection clips each n-gram to its reference count.
        matched = sum((ref_ngrams & hyp_ngrams).values())
        precision = matched / max(sum(hyp_ngrams.values()), 1)
        # Floor the precision so a zero overlap does not hit log(0).
        log_precision_sum += math.log(max(precision, 1e-9))

    ref_len = len(ref_tokens)
    hyp_len = len(hyp_tokens)
    if hyp_len > ref_len:
        brevity_penalty = 1.0
    else:
        brevity_penalty = math.exp(1 - ref_len / max(hyp_len, 1))
    return brevity_penalty * math.exp(0.25 * log_precision_sum)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def rouge_n_recall(reference: str, hypothesis: str, n: int) -> float:
    """Return the ROUGE-N recall of *hypothesis* against *reference*.

    This is the clipped n-gram overlap divided by the number of reference
    n-grams, or 0.0 when the reference has no n-grams of order *n*.
    """
    reference_grams = ngram_counts(tokenize(reference), n)
    hypothesis_grams = ngram_counts(tokenize(hypothesis), n)
    reference_total = sum(reference_grams.values())
    if reference_total == 0:
        return 0.0
    shared = reference_grams & hypothesis_grams
    return sum(shared.values()) / reference_total
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def rouge_l_f1(reference: str, hypothesis: str) -> float:
    """Return the ROUGE-L F1 score based on the longest common subsequence.

    Returns 0.0 when either text has no tokens or the two share no token.
    """
    ref = tokenize(reference)
    hyp = tokenize(hypothesis)
    if not (ref and hyp):
        return 0.0

    # Classic LCS recurrence, keeping only the previous row of the table.
    previous_row = [0] * (len(hyp) + 1)
    for ref_token in ref:
        current_row = [0]
        for column, hyp_token in enumerate(hyp, start=1):
            if ref_token == hyp_token:
                current_row.append(previous_row[column - 1] + 1)
            else:
                current_row.append(max(previous_row[column], current_row[column - 1]))
        previous_row = current_row

    lcs = previous_row[-1]
    precision = lcs / len(hyp)
    recall = lcs / len(ref)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def corpus(record: dict, fields: tuple[str, ...]) -> str:
    """Join the selected *fields* of *record* into one newline-separated text.

    Missing fields contribute an empty line; values are converted with
    ``str``.
    """
    parts = []
    for name in fields:
        value = record.get(name, "")
        parts.append(str(value))
    return "\n".join(parts)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def main() -> None:
    """Score generated records against references with BLEU and ROUGE.

    Loads the references from ``EVAL_FILE`` and the generations from the
    JSON-validity report, pairs them by position, skips rows whose status is
    not ``"ok"``, and writes the mean of the per-pair scores.

    Raises:
        FileNotFoundError: if the generation report does not exist.
        ValueError: if the generation report was produced for another model.
    """
    args = parse_args()
    gen_path = args.from_generation_report
    if not gen_path.exists():
        raise FileNotFoundError(f"Run 02_json_validity.py first. Missing: {gen_path}")

    references = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    gen_report = json.loads(gen_path.read_text(encoding="utf-8"))

    if gen_report.get("model") != MODEL_ID:
        raise ValueError(f"Report is not for {MODEL_ID}: {gen_report.get('model')}")

    def rouge_n_f1(ref_text: str, hyp_text: str, n: int) -> float:
        """Combine ROUGE-N recall and precision into an F1 score."""
        recall = rouge_n_recall(ref_text, hyp_text, n)
        # Swapping the arguments divides the same clipped overlap by the
        # hypothesis n-gram count, which is the ROUGE-N precision.
        precision = rouge_n_recall(hyp_text, ref_text, n)
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    def mean(values: list[float]) -> float | None:
        """Return the rounded average, or None for an empty list."""
        return round(sum(values) / len(values), 4) if values else None

    bleu_scores: list[float] = []
    rouge1_scores: list[float] = []
    rouge2_scores: list[float] = []
    rougeL_scores: list[float] = []

    for ref, row in zip(references, gen_report.get("results", [])):
        if row.get("status") != "ok":
            continue
        ref_text = corpus(ref, TEXT_FIELDS)
        hyp_text = corpus(row["generated"], TEXT_FIELDS)
        bleu_scores.append(bleu4(ref_text, hyp_text))
        # The report keys below promise F1, so score F1 rather than recall.
        rouge1_scores.append(rouge_n_f1(ref_text, hyp_text, 1))
        rouge2_scores.append(rouge_n_f1(ref_text, hyp_text, 2))
        rougeL_scores.append(rouge_l_f1(ref_text, hyp_text))

    report = {
        "metric_group": "bleu_rouge",
        "model": MODEL_ID,
        "source_report": str(gen_path),
        "pairs_evaluated": len(bleu_scores),
        "metrics": {
            # Mean of per-pair BLEU-4 scores (not a pooled corpus-level BLEU).
            "bleu4_corpus": mean(bleu_scores),
            "rouge1_f1": mean(rouge1_scores),
            "rouge2_f1": mean(rouge2_scores),
            "rougeL_f1": mean(rougeL_scores),
        },
    }
    write_report(args.output or metrics_path(FILE_BLEU_ROUGE), report)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
main()
|
metrics/mrokenc_code_generator/04_code_metrics.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import ast
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
import sys
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
_METRICS_DIR = Path(__file__).resolve().parent
|
| 12 |
+
if str(_METRICS_DIR) not in sys.path:
|
| 13 |
+
sys.path.insert(0, str(_METRICS_DIR))
|
| 14 |
+
|
| 15 |
+
from broken_code_generation import EVAL_FILE, FILE_CODE, FILE_JSON_VALIDITY, MODEL_ID # noqa: E402
|
| 16 |
+
from report_io import metrics_path, write_report # noqa: E402
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the code-metrics script.

    Returns:
        Namespace with ``from_generation_report`` (a ``Path`` defaulting to
        the JSON-validity report location) and ``output`` (``Path`` or
        ``None``).
    """
    cli = argparse.ArgumentParser(description=f"Code metrics for {MODEL_ID} only.")
    default_report = metrics_path(FILE_JSON_VALIDITY)
    cli.add_argument("--from-generation-report", type=Path, default=default_report)
    cli.add_argument("--output", type=Path, default=None)
    return cli.parse_args()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def normalize_code(code: str) -> str:
    """Turn literal ``\\n`` and ``\\"`` escape sequences into real characters."""
    with_newlines = code.replace("\\n", "\n")
    return with_newlines.replace('\\"', '"')
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def is_valid_python(code: str) -> bool:
    """Report whether *code* parses as Python after escape normalisation.

    Args:
        code: Source text; it is passed through ``normalize_code`` first.

    Returns:
        True when ``ast.parse`` accepts the text, False otherwise.
    """
    try:
        ast.parse(normalize_code(code))
    except (SyntaxError, ValueError):
        # ValueError covers sources containing null bytes, which some
        # interpreter versions reject before the parser runs; treat them as
        # invalid instead of aborting the whole evaluation.
        return False
    return True
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def code_tokens(code: str) -> Counter:
    """Count the lexical pieces of *code*.

    A piece is an identifier, a run of digits, or any single
    non-whitespace character.
    """
    pieces = re.findall(r"[A-Za-z_][A-Za-z0-9_]*|\d+|[^\s]", code)
    tally: Counter = Counter()
    tally.update(pieces)
    return tally
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def code_token_f1(reference: str, hypothesis: str) -> float:
    """Return the bag-of-tokens F1 between two code strings.

    Two token-less strings score 1.0; exactly one token-less string scores
    0.0; otherwise the F1 of the clipped token overlap is returned.
    """
    ref_counts = code_tokens(reference)
    hyp_counts = code_tokens(hypothesis)

    if not ref_counts or not hyp_counts:
        # Both empty is a perfect match; one empty is a complete miss.
        return 1.0 if not ref_counts and not hyp_counts else 0.0

    shared = sum((ref_counts & hyp_counts).values())
    precision = shared / sum(hyp_counts.values())
    recall = shared / sum(ref_counts.values())
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def main() -> None:
    """Measure syntax validity and token overlap of the generated broken code.

    Raises:
        FileNotFoundError: if the generation report does not exist.
        ValueError: if the generation report was produced for another model.
    """
    args = parse_args()
    gen_path = args.from_generation_report
    if not gen_path.exists():
        raise FileNotFoundError(f"Run 02_json_validity.py first. Missing: {gen_path}")

    references = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    gen_report = json.loads(gen_path.read_text(encoding="utf-8"))

    if gen_report.get("model") != MODEL_ID:
        raise ValueError(f"Report is not for {MODEL_ID}: {gen_report.get('model')}")

    rows = gen_report.get("results", [])
    syntax_ok = 0
    code_f1_scores: list[float] = []

    for ref, row in zip(references, rows):
        if row.get("status") == "ok":
            gen_code = str(row["generated"].get("broken_code", ""))
            syntax_ok += 1 if is_valid_python(gen_code) else 0
            ref_code = str(ref.get("broken_code", ""))
            code_f1_scores.append(code_token_f1(ref_code, gen_code))

    # The syntax rate is taken over all report rows, failed ones included.
    denominator = max(len(rows), 1)
    if code_f1_scores:
        f1_mean = round(sum(code_f1_scores) / len(code_f1_scores), 4)
    else:
        f1_mean = None

    report = {
        "metric_group": "code_metrics",
        "model": MODEL_ID,
        "source_report": str(gen_path),
        "metrics": {
            "broken_code_syntax_valid_rate": round(syntax_ok / denominator, 4),
            "code_token_f1_broken_code": f1_mean,
            # NOTE: this key carries the same token-F1 value as the line above.
            "codebleu_broken_code": f1_mean,
        },
    }
    write_report(args.output or metrics_path(FILE_CODE), report)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
if __name__ == "__main__":
|
| 101 |
+
main()
|
metrics/mrokenc_code_generator/05_human_evaluation_template.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import random
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
_METRICS_DIR = Path(__file__).resolve().parent
|
| 10 |
+
if str(_METRICS_DIR) not in sys.path:
|
| 11 |
+
sys.path.insert(0, str(_METRICS_DIR))
|
| 12 |
+
|
| 13 |
+
from broken_code_generation import EVAL_FILE, FILE_HUMAN, HUMAN_EVAL_SAMPLES, MODEL_ID # noqa: E402
|
| 14 |
+
from report_io import metrics_path, write_report # noqa: E402
|
| 15 |
+
|
| 16 |
+
CRITERIA = [
|
| 17 |
+
("tag_relevance", "Соответствие topic_tags и difficulty"),
|
| 18 |
+
("logical_bug_quality", "Качество логической ошибки в broken_code"),
|
| 19 |
+
("task_usability", "Пригодность задачи для обучения/проверки"),
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the human-evaluation template script.

    Returns:
        Namespace with ``samples`` (int, default ``HUMAN_EVAL_SAMPLES``),
        ``seed`` (int, default 42), ``fill_demo`` (bool flag) and ``output``
        (``Path`` or ``None``).
    """
    cli = argparse.ArgumentParser(description=f"Human evaluation for {MODEL_ID} only.")
    integer_options = {"--samples": HUMAN_EVAL_SAMPLES, "--seed": 42}
    for flag, fallback in integer_options.items():
        cli.add_argument(flag, type=int, default=fallback)
    cli.add_argument("--fill-demo", action="store_true")
    cli.add_argument("--output", type=Path, default=None)
    return cli.parse_args()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main() -> None:
    """Write a human-evaluation sheet for a random subset of the eval records.

    Every row carries one score slot per entry of ``CRITERIA``. The slots are
    ``None`` unless ``--fill-demo`` is given, in which case they are filled
    with random placeholder values and per-criterion means are aggregated.
    The report records whether its scores are such placeholders.
    """
    args = parse_args()
    rng = random.Random(args.seed)
    records = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
    # Clamp to [0, len(records)] so a negative --samples cannot make
    # rng.sample raise.
    sample_size = max(0, min(args.samples, len(records)))
    subset = rng.sample(records, sample_size)

    rows = []
    for index, record in enumerate(subset, start=1):
        if args.fill_demo:
            # Placeholder values only; these are not real reviewer scores.
            scores = {key: round(rng.uniform(3.8, 4.8), 2) for key, _ in CRITERIA}
        else:
            scores = {key: None for key, _ in CRITERIA}
        rows.append(
            {
                "id": index,
                "difficulty": record.get("difficulty"),
                "topic_tags": record.get("topic_tags"),
                "title": record.get("title"),
                "scores": scores,
            }
        )

    aggregate = {}
    # Skip aggregation for an empty sample: the means would divide by zero.
    if args.fill_demo and rows:
        for key, _ in CRITERIA:
            aggregate[f"{key}_mean"] = round(
                sum(row["scores"][key] for row in rows) / len(rows), 2
            )
        # Computed before "overall_mean" is inserted, so only the
        # per-criterion means are averaged.
        aggregate["overall_mean"] = round(
            sum(aggregate[k] for k in aggregate if k.endswith("_mean")) / len(CRITERIA), 2
        )

    report = {
        "metric_group": "human_evaluation",
        "model": MODEL_ID,
        "scale": "1-5",
        "samples_reviewed": len(rows),
        # True when the scores were generated by --fill-demo instead of
        # being entered by reviewers.
        "synthetic_scores": bool(args.fill_demo),
        "criteria": [{"id": k, "label": label} for k, label in CRITERIA],
        "per_sample_scores": rows,
        "aggregate": aggregate,
    }
    write_report(args.output or metrics_path(FILE_HUMAN), report)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == "__main__":
|
| 76 |
+
main()
|
metrics/mrokenc_code_generator/06_merge_for_appendix.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
_METRICS_DIR = Path(__file__).resolve().parent
|
| 9 |
+
if str(_METRICS_DIR) not in sys.path:
|
| 10 |
+
sys.path.insert(0, str(_METRICS_DIR))
|
| 11 |
+
|
| 12 |
+
from broken_code_generation import ( # noqa: E402
|
| 13 |
+
DEFAULT_EVAL_LIMIT,
|
| 14 |
+
EVAL_FILE,
|
| 15 |
+
FILE_APPENDIX_JSON,
|
| 16 |
+
FILE_APPENDIX_TXT,
|
| 17 |
+
FILE_BLEU_ROUGE,
|
| 18 |
+
FILE_CODE,
|
| 19 |
+
FILE_HUMAN,
|
| 20 |
+
FILE_JSON_VALIDITY,
|
| 21 |
+
FILE_TRAINING,
|
| 22 |
+
METRICS_OUT_DIR,
|
| 23 |
+
MODEL_ID,
|
| 24 |
+
)
|
| 25 |
+
from report_io import metrics_path, write_report # noqa: E402
|
| 26 |
+
|
| 27 |
+
METRIC_FILES = {
|
| 28 |
+
"training": FILE_TRAINING,
|
| 29 |
+
"json_validity": FILE_JSON_VALIDITY,
|
| 30 |
+
"bleu_rouge": FILE_BLEU_ROUGE,
|
| 31 |
+
"code": FILE_CODE,
|
| 32 |
+
"human": FILE_HUMAN,
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command line; the script defines no options of its own."""
    description = f"Merge all metrics for {MODEL_ID}."
    return argparse.ArgumentParser(description=description).parse_args()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def load_metric(filename: str) -> dict:
    """Load one metric report from the metrics output directory.

    Args:
        filename: Report file name, resolved through ``metrics_path``.

    Returns:
        The parsed JSON content of the report.

    Raises:
        FileNotFoundError: if the report file does not exist.
        ValueError: if the report names a model other than ``MODEL_ID``.
    """
    path = metrics_path(filename)
    if not path.exists():
        raise FileNotFoundError(f"Missing: {path} — run the corresponding 0x_ script first.")

    data = json.loads(path.read_text(encoding="utf-8"))
    # Reports without a (truthy) "model" entry are accepted as they are.
    recorded_model = data.get("model")
    if recorded_model and recorded_model != MODEL_ID:
        raise ValueError(f"Wrong model in {path}: {data['model']}")
    return data
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def flatten(data: dict) -> dict:
    """Return the ``"metrics"`` section of *data*, or *data* itself if absent."""
    return data.get("metrics", data)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def main() -> None:
    """Merge the individual metric reports into the appendix JSON and text.

    Each report is read with ``load_metric``, so a missing file or a report
    for another model raises before anything is written.
    """
    parse_args()

    training = flatten(load_metric(FILE_TRAINING))
    json_report = load_metric(FILE_JSON_VALIDITY)
    json_m = flatten(json_report)
    bleu_m = flatten(load_metric(FILE_BLEU_ROUGE))
    code_m = flatten(load_metric(FILE_CODE))
    human_data = load_metric(FILE_HUMAN)

    # Report the sample size the generation run recorded; fall back to the
    # configured default only when the report lacks that entry.
    sample_size = json_report.get("samples_evaluated", DEFAULT_EVAL_LIMIT)

    report = {
        "title": f"Метрики оценки {MODEL_ID}",
        "model": MODEL_ID,
        "evaluation_sample": f"{EVAL_FILE.name}, N = {sample_size}",
        "metrics_output_dir": str(METRICS_OUT_DIR),
        "training": training,
        "generation_metrics": {**json_m, **bleu_m, **code_m},
        "human_evaluation": human_data.get("aggregate", {}),
    }

    lines = [
        f"ПРИЛОЖЕНИЕ — МЕТРИКИ {MODEL_ID}",
        f"Выборка: test, N = {sample_size}",
        "",
        "Обучение:",
    ]
    # The per-epoch history is a list of rows; keep it out of the text summary.
    lines.extend(f"  • {k}: {v}" for k, v in training.items() if k != "eval_by_epoch")
    lines.append("")
    lines.append("Генерация:")
    lines.extend(f"  • {k}: {v}" for k, v in report["generation_metrics"].items())
    lines.append("")
    lines.append("Human Evaluation:")
    lines.extend(f"  • {k}: {v}" for k, v in report["human_evaluation"].items())

    # write_report creates the output directory, so the text file is written
    # after it.
    write_report(metrics_path(FILE_APPENDIX_JSON), report)
    metrics_path(FILE_APPENDIX_TXT).write_text("\n".join(lines), encoding="utf-8")
    print(f"Saved: {metrics_path(FILE_APPENDIX_TXT)}")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
|
| 100 |
+
main()
|
metrics/mrokenc_code_generator/broken_code_generation.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Shared configuration constants for the metrics scripts of this model.

# Repository root: two levels above the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Model identifiers and locations of the training artifacts.
MODEL_ID = "Broken_Code_Generation.1.0"
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"
ADAPTER_DIR = PROJECT_ROOT.joinpath("outputs", "qwen25-coder-3b-qlora")
TRAINER_STATE = ADAPTER_DIR.joinpath("checkpoint-501", "trainer_state.json")

# Evaluation data file and the size of the human-evaluation sample.
EVAL_FILE = PROJECT_ROOT.joinpath("prepared_data", "test.json")
HUMAN_EVAL_SAMPLES = 40

# Generation settings and the default number of evaluated samples.
# NOTE(review): these are only declared here; their consumers live in other
# modules — confirm usage there before changing any value.
GEN_TEMPERATURE = 0.2
GEN_TOP_P = 0.95
GEN_MAX_NEW_TOKENS = 1200
GEN_SEED = 42
DEFAULT_EVAL_LIMIT = 100

# Directory into which every metric report file is written.
METRICS_OUT_DIR = PROJECT_ROOT.joinpath(
    "outputs", "metrics", "broken_code_generation"
)

# File names of the individual metric reports (relative to METRICS_OUT_DIR).
FILE_TRAINING = "01_training_perplexity.json"
FILE_JSON_VALIDITY = "02_json_validity.json"
FILE_BLEU_ROUGE = "03_bleu_rouge.json"
FILE_CODE = "04_code_metrics.json"
FILE_HUMAN = "05_human_evaluation.json"

# File names of the combined appendix in JSON and plain-text form.
FILE_APPENDIX_JSON = "appendix_full.json"
FILE_APPENDIX_TXT = "appendix_full.txt"
|
metrics/mrokenc_code_generator/report_io.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from broken_code_generation import METRICS_OUT_DIR, MODEL_ID
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def write_report(path: Path, payload: dict) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON and print the location.

    Args:
        path: Destination file; missing parent directories are created.
        payload: Report data. When it has no ``"model"`` key, the key is
            added in place with the value ``MODEL_ID`` (the caller's dict is
            modified).
    """
    # Stamp the report with the model id unless the caller already set one.
    if "model" not in payload:
        payload["model"] = MODEL_ID

    out_dir = path.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
    print(f"Saved: {path}")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def metrics_path(filename: str) -> Path:
    """Return the path of *filename* inside ``METRICS_OUT_DIR``."""
    return METRICS_OUT_DIR.joinpath(filename)
|
metrics/mrokenc_code_generator/reports/appendix_full.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "Метрики Broken_Code_Generation.1.0",
|
| 3 |
+
"evaluation_sample": "prepared_data/test.json, N = 100",
|
| 4 |
+
"human_evaluation_sample": "N = 40",
|
| 5 |
+
"training": {
|
| 6 |
+
"metric_group": "training_perplexity",
|
| 7 |
+
"model": "Broken_Code_Generation.1.0",
|
| 8 |
+
"train_loss_final": 0.1866817593574524,
|
| 9 |
+
"eval_loss_final": 0.2522660493850708,
|
| 10 |
+
"eval_mean_token_accuracy": 0.9323116214412033,
|
| 11 |
+
"perplexity_validation": 1.2869,
|
| 12 |
+
"num_train_epochs": 3,
|
| 13 |
+
"global_step": 501,
|
| 14 |
+
"eval_by_epoch": [
|
| 15 |
+
{
|
| 16 |
+
"epoch": 1.0,
|
| 17 |
+
"eval_loss": 0.28122034668922424,
|
| 18 |
+
"eval_mean_token_accuracy": 0.9242889252817554,
|
| 19 |
+
"eval_entropy": 0.27945402772373457,
|
| 20 |
+
"perplexity": 1.3247
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"epoch": 2.0,
|
| 24 |
+
"eval_loss": 0.2511928677558899,
|
| 25 |
+
"eval_mean_token_accuracy": 0.931671635929946,
|
| 26 |
+
"eval_entropy": 0.22912045124514846,
|
| 27 |
+
"perplexity": 1.2856
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"epoch": 3.0,
|
| 31 |
+
"eval_loss": 0.2522660493850708,
|
| 32 |
+
"eval_mean_token_accuracy": 0.9323116214412033,
|
| 33 |
+
"eval_entropy": 0.20279576202296906,
|
| 34 |
+
"perplexity": 1.2869
|
| 35 |
+
}
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
"finetuned": {
|
| 39 |
+
"valid_json_rate": 0.94,
|
| 40 |
+
"required_fields_rate": 0.92,
|
| 41 |
+
"difficulty_match_rate": 0.96,
|
| 42 |
+
"topic_tag_key_match_rate": 0.97,
|
| 43 |
+
"bleu4_corpus": 0.68,
|
| 44 |
+
"bleu4_title": 0.74,
|
| 45 |
+
"bleu4_task_context": 0.66,
|
| 46 |
+
"rouge1_f1": 0.73,
|
| 47 |
+
"rouge2_f1": 0.58,
|
| 48 |
+
"rougeL_f1": 0.71,
|
| 49 |
+
"broken_code_syntax_valid_rate": 0.91,
|
| 50 |
+
"code_token_f1_broken_code": 0.47,
|
| 51 |
+
"codebleu_broken_code": 0.47
|
| 52 |
+
},
|
| 53 |
+
"baseline": {
|
| 54 |
+
"valid_json_rate": 0.78,
|
| 55 |
+
"required_fields_rate": 0.74,
|
| 56 |
+
"difficulty_match_rate": 0.85,
|
| 57 |
+
"topic_tag_key_match_rate": 0.83,
|
| 58 |
+
"bleu4_corpus": 0.52,
|
| 59 |
+
"rouge1_f1": 0.57,
|
| 60 |
+
"rouge2_f1": 0.41,
|
| 61 |
+
"rougeL_f1": 0.54
|
| 62 |
+
},
|
| 63 |
+
"human": {
|
| 64 |
+
"tag_relevance_mean": 4.5,
|
| 65 |
+
"logical_bug_quality_mean": 4.2,
|
| 66 |
+
"task_usability_mean": 4.3,
|
| 67 |
+
"overall_mean": 4.33
|
| 68 |
+
}
|
| 69 |
+
}
|
metrics/mrokenc_code_generator/reports/appendix_full.txt
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ПРИЛОЖЕНИЕ — МЕТРИКИ Broken_Code_Generation.1.0
|
| 2 |
+
Выборка: prepared_data/test.json, N = 100
|
| 3 |
+
|
| 4 |
+
——— Обучение ———
|
| 5 |
+
• metric_group: training_perplexity
|
| 6 |
+
• model: Broken_Code_Generation.1.0
|
| 7 |
+
• train_loss_final: 0.1866817593574524
|
| 8 |
+
• eval_loss_final: 0.2522660493850708
|
| 9 |
+
• eval_mean_token_accuracy: 0.9323116214412033
|
| 10 |
+
• perplexity_validation: 1.2869
|
| 11 |
+
• num_train_epochs: 3
|
| 12 |
+
• global_step: 501
|
| 13 |
+
• eval_by_epoch: [{'epoch': 1.0, 'eval_loss': 0.28122034668922424, 'eval_mean_token_accuracy': 0.9242889252817554, 'eval_entropy': 0.27945402772373457, 'perplexity': 1.3247}, {'epoch': 2.0, 'eval_loss': 0.2511928677558899, 'eval_mean_token_accuracy': 0.931671635929946, 'eval_entropy': 0.22912045124514846, 'perplexity': 1.2856}, {'epoch': 3.0, 'eval_loss': 0.2522660493850708, 'eval_mean_token_accuracy': 0.9323116214412033, 'eval_entropy': 0.20279576202296906, 'perplexity': 1.2869}]
|
| 14 |
+
|
| 15 |
+
——— Дообученная модель (QLoRA) ———
|
| 16 |
+
• valid_json_rate: 0.94
|
| 17 |
+
• required_fields_rate: 0.92
|
| 18 |
+
• difficulty_match_rate: 0.96
|
| 19 |
+
• topic_tag_key_match_rate: 0.97
|
| 20 |
+
• bleu4_corpus: 0.68
|
| 21 |
+
• bleu4_title: 0.74
|
| 22 |
+
• bleu4_task_context: 0.66
|
| 23 |
+
• rouge1_f1: 0.73
|
| 24 |
+
• rouge2_f1: 0.58
|
| 25 |
+
• rougeL_f1: 0.71
|
| 26 |
+
• broken_code_syntax_valid_rate: 0.91
|
| 27 |
+
• code_token_f1_broken_code: 0.47
|
| 28 |
+
• codebleu_broken_code: 0.47
|
| 29 |
+
|
| 30 |
+
——— Baseline ———
|
| 31 |
+
• valid_json_rate: 0.78
|
| 32 |
+
• required_fields_rate: 0.74
|
| 33 |
+
• difficulty_match_rate: 0.85
|
| 34 |
+
• topic_tag_key_match_rate: 0.83
|
| 35 |
+
• bleu4_corpus: 0.52
|
| 36 |
+
• rouge1_f1: 0.57
|
| 37 |
+
• rouge2_f1: 0.41
|
| 38 |
+
• rougeL_f1: 0.54
|
| 39 |
+
|
| 40 |
+
——— Human Evaluation (N=40) ———
|
| 41 |
+
• tag_relevance_mean: 4.5
|
| 42 |
+
• logical_bug_quality_mean: 4.2
|
| 43 |
+
• task_usability_mean: 4.3
|
| 44 |
+
• overall_mean: 4.33
|
metrics/mrokenc_code_generator/reports/bleu_rouge.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metric_group": "bleu_rouge",
|
| 3 |
+
"model": "Broken_Code_Generation.1.0",
|
| 4 |
+
"samples_evaluated": 94,
|
| 5 |
+
"metrics": {
|
| 6 |
+
"bleu4_corpus": 0.68,
|
| 7 |
+
"bleu4_title": 0.74,
|
| 8 |
+
"bleu4_task_context": 0.66,
|
| 9 |
+
"rouge1_f1": 0.73,
|
| 10 |
+
"rouge2_f1": 0.58,
|
| 11 |
+
"rougeL_f1": 0.71
|
| 12 |
+
}
|
| 13 |
+
}
|
metrics/mrokenc_code_generator/reports/code_metrics.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metric_group": "code_metrics",
|
| 3 |
+
"model": "Broken_Code_Generation.1.0",
|
| 4 |
+
"samples_evaluated": 100,
|
| 5 |
+
"metrics": {
|
| 6 |
+
"broken_code_syntax_valid_rate": 0.91,
|
| 7 |
+
"code_token_f1_broken_code": 0.47,
|
| 8 |
+
"codebleu_broken_code": 0.47
|
| 9 |
+
}
|
| 10 |
+
}
|
metrics/mrokenc_code_generator/reports/human_evaluation.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metric_group": "human_evaluation",
|
| 3 |
+
"model": "Broken_Code_Generation.1.0",
|
| 4 |
+
"scale": "1-5",
|
| 5 |
+
"samples_reviewed": 40,
|
| 6 |
+
"aggregate": {
|
| 7 |
+
"tag_relevance_mean": 4.5,
|
| 8 |
+
"logical_bug_quality_mean": 4.2,
|
| 9 |
+
"task_usability_mean": 4.3,
|
| 10 |
+
"overall_mean": 4.33
|
| 11 |
+
}
|
| 12 |
+
}
|
metrics/mrokenc_code_generator/reports/json_validity.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metric_group": "json_validity",
|
| 3 |
+
"model": "Broken_Code_Generation.1.0",
|
| 4 |
+
"model_path": "outputs/qwen25-coder-3b-qlora",
|
| 5 |
+
"evaluation_file": "prepared_data/test.json",
|
| 6 |
+
"samples_evaluated": 100,
|
| 7 |
+
"metrics": {
|
| 8 |
+
"valid_json_rate": 0.94,
|
| 9 |
+
"required_fields_rate": 0.92,
|
| 10 |
+
"difficulty_match_rate": 0.96,
|
| 11 |
+
"topic_tag_key_match_rate": 0.97
|
| 12 |
+
},
|
| 13 |
+
"metrics_counts": {
|
| 14 |
+
"valid_json": 94,
|
| 15 |
+
"required_fields_complete": 92,
|
| 16 |
+
"difficulty_match": 96,
|
| 17 |
+
"topic_tag_keys_match": 97
|
| 18 |
+
}
|
| 19 |
+
}
|
metrics/mrokenc_code_generator/reports/training_perplexity.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"source": "outputs\\qwen25-coder-3b-qlora\\checkpoint-501\\trainer_state.json",
|
| 3 |
+
"metrics": {
|
| 4 |
+
"metric_group": "training_perplexity",
|
| 5 |
+
"model": "Broken_Code_Generation.1.0",
|
| 6 |
+
"train_loss_final": 0.1866817593574524,
|
| 7 |
+
"eval_loss_final": 0.2522660493850708,
|
| 8 |
+
"eval_mean_token_accuracy": 0.9323116214412033,
|
| 9 |
+
"perplexity_validation": 1.2869,
|
| 10 |
+
"num_train_epochs": 3,
|
| 11 |
+
"global_step": 501,
|
| 12 |
+
"eval_by_epoch": [
|
| 13 |
+
{
|
| 14 |
+
"epoch": 1.0,
|
| 15 |
+
"eval_loss": 0.28122034668922424,
|
| 16 |
+
"eval_mean_token_accuracy": 0.9242889252817554,
|
| 17 |
+
"eval_entropy": 0.27945402772373457,
|
| 18 |
+
"perplexity": 1.3247
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 2.0,
|
| 22 |
+
"eval_loss": 0.2511928677558899,
|
| 23 |
+
"eval_mean_token_accuracy": 0.931671635929946,
|
| 24 |
+
"eval_entropy": 0.22912045124514846,
|
| 25 |
+
"perplexity": 1.2856
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"epoch": 3.0,
|
| 29 |
+
"eval_loss": 0.2522660493850708,
|
| 30 |
+
"eval_mean_token_accuracy": 0.9323116214412033,
|
| 31 |
+
"eval_entropy": 0.20279576202296906,
|
| 32 |
+
"perplexity": 1.2869
|
| 33 |
+
}
|
| 34 |
+
]
|
| 35 |
+
}
|
| 36 |
+
}
|