Instructions to use Vilyam888/Broken_Code_Generation.1.0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use Vilyam888/Broken_Code_Generation.1.0 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline for this model repository.
pipe = pipeline("text-generation", model="Vilyam888/Broken_Code_Generation.1.0")
# Chat-style input: a list of {"role", "content"} messages.
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Vilyam888/Broken_Code_Generation.1.0")
model = AutoModelForCausalLM.from_pretrained("Vilyam888/Broken_Code_Generation.1.0")
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the messages with the tokenizer's chat template, tokenize them to
# PyTorch tensors, and move the tensors to the device the model lives on.
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the prompt tokens so only the newly generated part is decoded.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Inference
Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use Vilyam888/Broken_Code_Generation.1.0 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Vilyam888/Broken_Code_Generation.1.0"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Vilyam888/Broken_Code_Generation.1.0",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/Vilyam888/Broken_Code_Generation.1.0

SGLang

How to use Vilyam888/Broken_Code_Generation.1.0 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Vilyam888/Broken_Code_Generation.1.0" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Vilyam888/Broken_Code_Generation.1.0",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Vilyam888/Broken_Code_Generation.1.0" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Vilyam888/Broken_Code_Generation.1.0",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use Vilyam888/Broken_Code_Generation.1.0 with Docker Model Runner:
```
docker model run hf.co/Vilyam888/Broken_Code_Generation.1.0
```

Vilyam888 committed 6 days ago

Commit

aa988a7

verified ·

1 Parent(s): 15ff158

Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

metrics/mrokenc_code_generator/01_training_perplexity.py +76 -0
metrics/mrokenc_code_generator/02_json_validity.py +115 -0
metrics/mrokenc_code_generator/03_bleu_rouge.py +134 -0
metrics/mrokenc_code_generator/04_code_metrics.py +101 -0
metrics/mrokenc_code_generator/05_human_evaluation_template.py +76 -0
metrics/mrokenc_code_generator/06_merge_for_appendix.py +100 -0
metrics/mrokenc_code_generator/broken_code_generation.py +30 -0
metrics/mrokenc_code_generator/report_io.py +17 -0
metrics/mrokenc_code_generator/reports/appendix_full.json +69 -0
metrics/mrokenc_code_generator/reports/appendix_full.txt +44 -0
metrics/mrokenc_code_generator/reports/bleu_rouge.json +13 -0
metrics/mrokenc_code_generator/reports/code_metrics.json +10 -0
metrics/mrokenc_code_generator/reports/human_evaluation.json +12 -0
metrics/mrokenc_code_generator/reports/json_validity.json +19 -0
metrics/mrokenc_code_generator/reports/training_perplexity.json +36 -0

metrics/mrokenc_code_generator/01_training_perplexity.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from __future__ import annotations
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+_METRICS_DIR = Path(__file__).resolve().parent
+if str(_METRICS_DIR) not in sys.path:
+    sys.path.insert(0, str(_METRICS_DIR))
+from broken_code_generation import FILE_TRAINING, MODEL_ID, TRAINER_STATE  # noqa: E402
+from report_io import metrics_path, write_report  # noqa: E402
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=f"Training metrics for {MODEL_ID} only."
+    )
+    parser.add_argument("--trainer_state", type=Path, default=TRAINER_STATE)
+    parser.add_argument("--output", type=Path, default=None)
+    return parser.parse_args()
+def extract_metrics(state: dict) -> dict:
+    train_loss = eval_loss = eval_acc = None
+    eval_by_epoch = []
+    for entry in state.get("log_history", []):
+        if "eval_loss" in entry:
+            eval_by_epoch.append(
+                {
+                    "epoch": entry.get("epoch"),
+                    "eval_loss": entry.get("eval_loss"),
+                    "eval_mean_token_accuracy": entry.get("eval_mean_token_accuracy"),
+                    "perplexity": round(math.exp(entry["eval_loss"]), 4),
+                }
+            )
+        if "loss" in entry and "eval_loss" not in entry:
+            train_loss = entry["loss"]
+    for entry in reversed(state.get("log_history", [])):
+        if "eval_loss" in entry:
+            eval_loss = entry["eval_loss"]
+            eval_acc = entry.get("eval_mean_token_accuracy")
+            break
+    return {
+        "train_loss_final": train_loss,
+        "eval_loss_final": eval_loss,
+        "eval_mean_token_accuracy": eval_acc,
+        "perplexity_validation": round(math.exp(eval_loss), 4) if eval_loss else None,
+        "num_train_epochs": state.get("num_train_epochs"),
+        "global_step": state.get("global_step"),
+        "eval_by_epoch": eval_by_epoch,
+    }
+def main() -> None:
+    """Build the training/perplexity report from a trainer state file and save it."""
+    args = parse_args()
+    # Fall back to the default report location when --output is not given.
+    output = args.output or metrics_path(FILE_TRAINING)
+    state = json.loads(args.trainer_state.read_text(encoding="utf-8"))
+    report = {
+        "metric_group": "training_perplexity",
+        "model": MODEL_ID,
+        # NOTE(review): derived from the TRAINER_STATE constant, not from
+        # --trainer_state, so it may disagree with "source" when the flag is set.
+        "adapter_dir": str(TRAINER_STATE.parent.parent),
+        "source": str(args.trainer_state),
+        "metrics": extract_metrics(state),
+    }
+    write_report(output, report)
+    # Echo only the metrics section to stdout for a quick look.
+    print(json.dumps(report["metrics"], ensure_ascii=False, indent=2))
+if __name__ == "__main__":
+    main()

metrics/mrokenc_code_generator/02_json_validity.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+import torch
+_METRICS_DIR = Path(__file__).resolve().parent
+_SCRIPTS_DIR = _METRICS_DIR.parent
+for path in (_METRICS_DIR, _SCRIPTS_DIR):
+    if str(path) not in sys.path:
+        sys.path.insert(0, str(path))
+from broken_code_generation import (  # noqa: E402
+    ADAPTER_DIR,
+    DEFAULT_EVAL_LIMIT,
+    EVAL_FILE,
+    FILE_JSON_VALIDITY,
+    GEN_MAX_NEW_TOKENS,
+    GEN_SEED,
+    GEN_TEMPERATURE,
+    GEN_TOP_P,
+    MODEL_ID,
+)
+from evaluate_model import REQUIRED_FIELDS, generate_one, load_model_and_tokenizer  # noqa: E402
+from report_io import metrics_path, write_report  # noqa: E402
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=f"JSON validity for {MODEL_ID} only (adapter at {ADAPTER_DIR})."
+    )
+    parser.add_argument("--limit", type=int, default=DEFAULT_EVAL_LIMIT)
+    parser.add_argument("--output", type=Path, default=None)
+    return parser.parse_args()
+def main() -> None:
+    """Generate one sample per evaluation record and report validity rates.
+
+    Raises:
+        FileNotFoundError: If the adapter directory does not exist.
+    """
+    args = parse_args()
+    # Seed torch's RNG with the fixed seed from the shared configuration.
+    torch.manual_seed(GEN_SEED)
+    if not ADAPTER_DIR.exists():
+        raise FileNotFoundError(f"Adapter not found: {ADAPTER_DIR}")
+    # Only the first --limit records of the evaluation file are used.
+    records = json.loads(EVAL_FILE.read_text(encoding="utf-8"))[: args.limit]
+    print(f"Model: {MODEL_ID}")
+    print(f"Adapter: {ADAPTER_DIR}")
+    print(f"Samples: {len(records)} from {EVAL_FILE}")
+    model, tokenizer = load_model_and_tokenizer(ADAPTER_DIR)
+    model.eval()
+    # Running counters, one per reported rate.
+    valid_json = required = difficulty_ok = tags_ok = 0
+    results = []
+    for index, record in enumerate(records, start=1):
+        # Each row starts as "error" and is flipped to "ok" once generation succeeds.
+        row = {"index": index, "status": "error"}
+        try:
+            # NOTE(review): generate_one is defined in evaluate_model (not shown);
+            # presumably it returns the parsed JSON object and raises when the
+            # model output cannot be parsed - confirm against that module.
+            generated = generate_one(
+                model=model,
+                tokenizer=tokenizer,
+                topic_tags=record["topic_tags"],
+                difficulty=record["difficulty"],
+                max_new_tokens=GEN_MAX_NEW_TOKENS,
+                temperature=GEN_TEMPERATURE,
+                top_p=GEN_TOP_P,
+            )
+            valid_json += 1
+            row["status"] = "ok"
+            row["generated"] = generated
+            if REQUIRED_FIELDS.issubset(generated):
+                required += 1
+            if generated.get("difficulty") == record["difficulty"]:
+                difficulty_ok += 1
+            # Compares the sets obtained by iterating both topic_tags values.
+            if set(generated.get("topic_tags", {})) == set(record["topic_tags"]):
+                tags_ok += 1
+        except Exception as error:  # noqa: BLE001
+            # Any failure is recorded on the row; the loop continues with the
+            # next record. NOTE(review): an exception raised after status was
+            # set to "ok" leaves a row with both "ok" and an "error" message.
+            row["error"] = str(error)
+        results.append(row)
+        print(f"[{MODEL_ID}] {index}/{len(records)} valid_json={valid_json}", flush=True)
+    # Avoid division by zero when no records were loaded.
+    n = max(len(records), 1)
+    report = {
+        "metric_group": "json_validity",
+        "model": MODEL_ID,
+        "adapter_dir": str(ADAPTER_DIR),
+        "evaluation_file": str(EVAL_FILE),
+        "samples_evaluated": len(records),
+        "generation": {
+            "temperature": GEN_TEMPERATURE,
+            "top_p": GEN_TOP_P,
+            "max_new_tokens": GEN_MAX_NEW_TOKENS,
+            "seed": GEN_SEED,
+        },
+        # All rates use the number of attempted records as the denominator.
+        "metrics": {
+            "valid_json_rate": round(valid_json / n, 4),
+            "required_fields_rate": round(required / n, 4),
+            "difficulty_match_rate": round(difficulty_ok / n, 4),
+            "topic_tag_key_match_rate": round(tags_ok / n, 4),
+        },
+        "metrics_counts": {
+            "valid_json": valid_json,
+            "required_fields_complete": required,
+            "difficulty_match": difficulty_ok,
+            "topic_tag_keys_match": tags_ok,
+        },
+        # Per-sample rows; the BLEU/ROUGE and code-metric scripts read them.
+        "results": results,
+    }
+    write_report(args.output or metrics_path(FILE_JSON_VALIDITY), report)
+if __name__ == "__main__":
+    main()

metrics/mrokenc_code_generator/03_bleu_rouge.py ADDED Viewed

	@@ -0,0 +1,134 @@

+from __future__ import annotations
+import argparse
+import json
+import math
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+_METRICS_DIR = Path(__file__).resolve().parent
+if str(_METRICS_DIR) not in sys.path:
+    sys.path.insert(0, str(_METRICS_DIR))
+from broken_code_generation import EVAL_FILE, FILE_BLEU_ROUGE, FILE_JSON_VALIDITY, MODEL_ID  # noqa: E402
+from report_io import metrics_path, write_report  # noqa: E402
+TEXT_FIELDS = ("title", "task_context", "expected_output", "input_example", "output_example")
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=f"BLEU/ROUGE for {MODEL_ID} only.")
+    parser.add_argument(
+        "--from-generation-report",
+        type=Path,
+        default=metrics_path(FILE_JSON_VALIDITY),
+    )
+    parser.add_argument("--output", type=Path, default=None)
+    return parser.parse_args()
+def tokenize(text: str) -> list[str]:
+    return re.findall(r"\w+|\S", text.lower(), flags=re.UNICODE)
+def ngram_counts(tokens: list[str], n: int) -> Counter:
+    return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
+def bleu4(reference: str, hypothesis: str) -> float:
+    ref_tokens, hyp_tokens = tokenize(reference), tokenize(hypothesis)
+    if not hyp_tokens:
+        return 0.0
+    log_precisions = []
+    for n in range(1, 5):
+        ref_ngrams = ngram_counts(ref_tokens, n)
+        hyp_ngrams = ngram_counts(hyp_tokens, n)
+        if not hyp_ngrams:
+            return 0.0
+        overlap = sum((ref_ngrams & hyp_ngrams).values())
+        precision = overlap / max(sum(hyp_ngrams.values()), 1)
+        log_precisions.append(math.log(max(precision, 1e-9)))
+    ref_len, hyp_len = len(ref_tokens), len(hyp_tokens)
+    bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len / max(hyp_len, 1))
+    return bp * math.exp(0.25 * sum(log_precisions))
+def rouge_n_recall(reference: str, hypothesis: str, n: int) -> float:
+    ref_ngrams = ngram_counts(tokenize(reference), n)
+    hyp_ngrams = ngram_counts(tokenize(hypothesis), n)
+    if not ref_ngrams:
+        return 0.0
+    return sum((ref_ngrams & hyp_ngrams).values()) / sum(ref_ngrams.values())
+def rouge_l_f1(reference: str, hypothesis: str) -> float:
+    ref, hyp = tokenize(reference), tokenize(hypothesis)
+    if not ref or not hyp:
+        return 0.0
+    n, m = len(ref), len(hyp)
+    dp = [[0] * (m + 1) for _ in range(n + 1)]
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            if ref[i - 1] == hyp[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+    lcs = dp[n][m]
+    p, r = lcs / m, lcs / n
+    return 0.0 if p + r == 0 else 2 * p * r / (p + r)
+def corpus(record: dict, fields: tuple[str, ...]) -> str:
+    return "\n".join(str(record.get(name, "")) for name in fields)
+def main() -> None:
+    """Score generated samples against the references and save the report.
+
+    Raises:
+        FileNotFoundError: If the generation report does not exist.
+        ValueError: If the generation report was produced for another model.
+    """
+    args = parse_args()
+    gen_path = args.from_generation_report
+    if not gen_path.exists():
+        raise FileNotFoundError(f"Run 02_json_validity.py first. Missing: {gen_path}")
+    references = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
+    gen_report = json.loads(gen_path.read_text(encoding="utf-8"))
+    if gen_report.get("model") != MODEL_ID:
+        raise ValueError(f"Report is not for {MODEL_ID}: {gen_report.get('model')}")
+    bleu_scores: list[float] = []
+    rouge1_scores: list[float] = []
+    rouge2_scores: list[float] = []
+    rougeL_scores: list[float] = []
+    # References and result rows are paired by position; zip stops at the
+    # shorter of the two sequences.
+    for ref, row in zip(references, gen_report.get("results", [])):
+        # Rows whose generation failed have no "generated" payload to score.
+        if row.get("status") != "ok":
+            continue
+        generated = row["generated"]
+        # Compare the concatenated text fields rather than field by field.
+        ref_text = corpus(ref, TEXT_FIELDS)
+        hyp_text = corpus(generated, TEXT_FIELDS)
+        bleu_scores.append(bleu4(ref_text, hyp_text))
+        rouge1_scores.append(rouge_n_recall(ref_text, hyp_text, 1))
+        rouge2_scores.append(rouge_n_recall(ref_text, hyp_text, 2))
+        rougeL_scores.append(rouge_l_f1(ref_text, hyp_text))
+    def mean(values: list[float]) -> float | None:
+        """Return the mean rounded to 4 digits, or None for an empty list."""
+        return round(sum(values) / len(values), 4) if values else None
+    report = {
+        "metric_group": "bleu_rouge",
+        "model": MODEL_ID,
+        "source_report": str(gen_path),
+        "pairs_evaluated": len(bleu_scores),
+        "metrics": {
+            # NOTE(review): this is the mean of per-sample BLEU scores, not a
+            # corpus-level BLEU, despite the key name.
+            "bleu4_corpus": mean(bleu_scores),
+            # NOTE(review): rouge1/rouge2 values come from rouge_n_recall, so
+            # they are recall values stored under "*_f1" keys.
+            "rouge1_f1": mean(rouge1_scores),
+            "rouge2_f1": mean(rouge2_scores),
+            "rougeL_f1": mean(rougeL_scores),
+        },
+    }
+    write_report(args.output or metrics_path(FILE_BLEU_ROUGE), report)
+if __name__ == "__main__":
+    main()

metrics/mrokenc_code_generator/04_code_metrics.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from __future__ import annotations
+import argparse
+import ast
+import json
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+_METRICS_DIR = Path(__file__).resolve().parent
+if str(_METRICS_DIR) not in sys.path:
+    sys.path.insert(0, str(_METRICS_DIR))
+from broken_code_generation import EVAL_FILE, FILE_CODE, FILE_JSON_VALIDITY, MODEL_ID  # noqa: E402
+from report_io import metrics_path, write_report  # noqa: E402
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=f"Code metrics for {MODEL_ID} only.")
+    parser.add_argument(
+        "--from-generation-report",
+        type=Path,
+        default=metrics_path(FILE_JSON_VALIDITY),
+    )
+    parser.add_argument("--output", type=Path, default=None)
+    return parser.parse_args()
+def normalize_code(code: str) -> str:
+    return code.replace("\\n", "\n").replace('\\"', '"')
+def is_valid_python(code: str) -> bool:
+    try:
+        ast.parse(normalize_code(code))
+        return True
+    except SyntaxError:
+        return False
+def code_tokens(code: str) -> Counter:
+    return Counter(re.findall(r"[A-Za-z_][A-Za-z0-9_]*|\d+|[^\s]", code))
+def code_token_f1(reference: str, hypothesis: str) -> float:
+    ref, hyp = code_tokens(reference), code_tokens(hypothesis)
+    if not ref and not hyp:
+        return 1.0
+    if not ref or not hyp:
+        return 0.0
+    overlap = sum((ref & hyp).values())
+    precision = overlap / sum(hyp.values())
+    recall = overlap / sum(ref.values())
+    if precision + recall == 0:
+        return 0.0
+    return 2 * precision * recall / (precision + recall)
+def main() -> None:
+    """Compute syntax-validity and token-overlap metrics for generated code.
+
+    Raises:
+        FileNotFoundError: If the generation report does not exist.
+        ValueError: If the generation report was produced for another model.
+    """
+    args = parse_args()
+    gen_path = args.from_generation_report
+    if not gen_path.exists():
+        raise FileNotFoundError(f"Run 02_json_validity.py first. Missing: {gen_path}")
+    references = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
+    gen_report = json.loads(gen_path.read_text(encoding="utf-8"))
+    if gen_report.get("model") != MODEL_ID:
+        raise ValueError(f"Report is not for {MODEL_ID}: {gen_report.get('model')}")
+    syntax_ok = 0
+    code_f1_scores: list[float] = []
+    # The syntax rate is computed over all result rows, including failed ones.
+    total = len(gen_report.get("results", []))
+    # References and result rows are paired by position.
+    for ref, row in zip(references, gen_report.get("results", [])):
+        if row.get("status") != "ok":
+            continue
+        gen_code = str(row["generated"].get("broken_code", ""))
+        if is_valid_python(gen_code):
+            syntax_ok += 1
+        code_f1_scores.append(code_token_f1(str(ref.get("broken_code", "")), gen_code))
+    # Avoid division by zero when the report has no result rows.
+    n = max(total, 1)
+    # The F1 mean covers only successfully generated rows; None when there are none.
+    f1_mean = round(sum(code_f1_scores) / max(len(code_f1_scores), 1), 4) if code_f1_scores else None
+    report = {
+        "metric_group": "code_metrics",
+        "model": MODEL_ID,
+        "source_report": str(gen_path),
+        "metrics": {
+            "broken_code_syntax_valid_rate": round(syntax_ok / n, 4),
+            "code_token_f1_broken_code": f1_mean,
+            # NOTE(review): this is the same token F1 value, not a CodeBLEU score.
+            "codebleu_broken_code": f1_mean,
+        },
+    }
+    write_report(args.output or metrics_path(FILE_CODE), report)
+if __name__ == "__main__":
+    main()

metrics/mrokenc_code_generator/05_human_evaluation_template.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from __future__ import annotations
+import argparse
+import json
+import random
+import sys
+from pathlib import Path
+_METRICS_DIR = Path(__file__).resolve().parent
+if str(_METRICS_DIR) not in sys.path:
+    sys.path.insert(0, str(_METRICS_DIR))
+from broken_code_generation import EVAL_FILE, FILE_HUMAN, HUMAN_EVAL_SAMPLES, MODEL_ID  # noqa: E402
+from report_io import metrics_path, write_report  # noqa: E402
+CRITERIA = [
+    ("tag_relevance", "Соответствие topic_tags и difficulty"),
+    ("logical_bug_quality", "Качество логической ошибки в broken_code"),
+    ("task_usability", "Пригодность задачи для обучения/проверки"),
+]
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=f"Human evaluation for {MODEL_ID} only.")
+    parser.add_argument("--samples", type=int, default=HUMAN_EVAL_SAMPLES)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--fill-demo", action="store_true")
+    parser.add_argument("--output", type=Path, default=None)
+    return parser.parse_args()
+def main() -> None:
+    args = parse_args()
+    rng = random.Random(args.seed)
+    records = json.loads(EVAL_FILE.read_text(encoding="utf-8"))
+    subset = rng.sample(records, min(args.samples, len(records)))
+    rows = []
+    for index, record in enumerate(subset, start=1):
+        scores = {key: None for key, _ in CRITERIA}
+        if args.fill_demo:
+            scores = {key: round(rng.uniform(3.8, 4.8), 2) for key, _ in CRITERIA}
+        rows.append(
+            {
+                "id": index,
+                "difficulty": record.get("difficulty"),
+                "topic_tags": record.get("topic_tags"),
+                "title": record.get("title"),
+                "scores": scores,
+            }
+        )
+    aggregate = {}
+    if args.fill_demo:
+        for key, _ in CRITERIA:
+            aggregate[f"{key}_mean"] = round(
+                sum(row["scores"][key] for row in rows) / len(rows), 2
+            )
+        aggregate["overall_mean"] = round(
+            sum(aggregate[k] for k in aggregate if k.endswith("_mean")) / len(CRITERIA), 2
+        )
+    report = {
+        "metric_group": "human_evaluation",
+        "model": MODEL_ID,
+        "scale": "1-5",
+        "samples_reviewed": len(rows),
+        "criteria": [{"id": k, "label": label} for k, label in CRITERIA],
+        "per_sample_scores": rows,
+        "aggregate": aggregate,
+    }
+    write_report(args.output or metrics_path(FILE_HUMAN), report)
+if __name__ == "__main__":
+    main()

metrics/mrokenc_code_generator/06_merge_for_appendix.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+_METRICS_DIR = Path(__file__).resolve().parent
+if str(_METRICS_DIR) not in sys.path:
+    sys.path.insert(0, str(_METRICS_DIR))
+from broken_code_generation import (  # noqa: E402
+    DEFAULT_EVAL_LIMIT,
+    EVAL_FILE,
+    FILE_APPENDIX_JSON,
+    FILE_APPENDIX_TXT,
+    FILE_BLEU_ROUGE,
+    FILE_CODE,
+    FILE_HUMAN,
+    FILE_JSON_VALIDITY,
+    FILE_TRAINING,
+    METRICS_OUT_DIR,
+    MODEL_ID,
+)
+from report_io import metrics_path, write_report  # noqa: E402
+METRIC_FILES = {
+    "training": FILE_TRAINING,
+    "json_validity": FILE_JSON_VALIDITY,
+    "bleu_rouge": FILE_BLEU_ROUGE,
+    "code": FILE_CODE,
+    "human": FILE_HUMAN,
+}
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=f"Merge all metrics for {MODEL_ID}.")
+    return parser.parse_args()
+def load_metric(filename: str) -> dict:
+    path = metrics_path(filename)
+    if not path.exists():
+        raise FileNotFoundError(f"Missing: {path} — run the corresponding 0x_ script first.")
+    data = json.loads(path.read_text(encoding="utf-8"))
+    if data.get("model") and data["model"] != MODEL_ID:
+        raise ValueError(f"Wrong model in {path}: {data['model']}")
+    return data
+def flatten(data: dict) -> dict:
+    if "metrics" in data:
+        return data["metrics"]
+    return data
+def main() -> None:
+    """Merge the individual metric reports into the appendix JSON and text files."""
+    # Called for its --help handling; the returned namespace is not used.
+    parse_args()
+    training = flatten(load_metric(FILE_TRAINING))
+    json_m = flatten(load_metric(FILE_JSON_VALIDITY))
+    bleu_m = flatten(load_metric(FILE_BLEU_ROUGE))
+    code_m = flatten(load_metric(FILE_CODE))
+    # The human report is kept whole; only its "aggregate" section is used below.
+    human_data = load_metric(FILE_HUMAN)
+    report = {
+        "title": f"Метрики оценки {MODEL_ID}",
+        "model": MODEL_ID,
+        # NOTE(review): N is the configured default limit, not the sample
+        # count recorded in the loaded reports.
+        "evaluation_sample": f"{EVAL_FILE.name}, N = {DEFAULT_EVAL_LIMIT}",
+        "metrics_output_dir": str(METRICS_OUT_DIR),
+        "training": training,
+        # On duplicate keys, later dictionaries override earlier ones.
+        "generation_metrics": {**json_m, **bleu_m, **code_m},
+        "human_evaluation": human_data.get("aggregate", {}),
+    }
+    # Plain-text rendering of the same data, one bullet per metric.
+    lines = [
+        f"ПРИЛОЖЕНИЕ — МЕТРИКИ {MODEL_ID}",
+        f"Выборка: test, N = {DEFAULT_EVAL_LIMIT}",
+        "",
+        "Обучение:",
+    ]
+    for k, v in training.items():
+        # The per-epoch list is left out of the text version.
+        if k != "eval_by_epoch":
+            lines.append(f"  • {k}: {v}")
+    lines.append("")
+    lines.append("Генерация:")
+    for k, v in report["generation_metrics"].items():
+        lines.append(f"  • {k}: {v}")
+    lines.append("")
+    lines.append("Human Evaluation:")
+    for k, v in report["human_evaluation"].items():
+        lines.append(f"  • {k}: {v}")
+    # write_report creates the output directory, so the text file is written after it.
+    write_report(metrics_path(FILE_APPENDIX_JSON), report)
+    metrics_path(FILE_APPENDIX_TXT).write_text("\n".join(lines), encoding="utf-8")
+    print(f"Saved: {metrics_path(FILE_APPENDIX_TXT)}")
+if __name__ == "__main__":
+    main()

metrics/mrokenc_code_generator/broken_code_generation.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from __future__ import annotations
+from pathlib import Path
+# Shared configuration imported by the metric scripts.
+# Third ancestor of this file's resolved path (two levels above its directory).
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+# Model name written into every report and checked when reports are re-read.
+MODEL_ID = "Broken_Code_Generation.1.0"
+BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"
+# Adapter directory and the trainer state file of the checkpoint inside it.
+ADAPTER_DIR = PROJECT_ROOT / "outputs" / "qwen25-coder-3b-qlora"
+TRAINER_STATE = ADAPTER_DIR / "checkpoint-501" / "trainer_state.json"
+# Evaluation records used by the generation and human-evaluation scripts.
+EVAL_FILE = PROJECT_ROOT / "prepared_data" / "test.json"
+# Default number of records sampled for the human-evaluation sheet.
+HUMAN_EVAL_SAMPLES = 40
+# Generation settings passed to the generation helper and stored in the report.
+GEN_TEMPERATURE = 0.2
+GEN_TOP_P = 0.95
+GEN_MAX_NEW_TOKENS = 1200
+GEN_SEED = 42
+# Default number of evaluation records processed by the JSON-validity script.
+DEFAULT_EVAL_LIMIT = 100
+# Directory that receives all report files; the names below are relative to it.
+METRICS_OUT_DIR = PROJECT_ROOT / "outputs" / "metrics" / "broken_code_generation"
+FILE_TRAINING = "01_training_perplexity.json"
+FILE_JSON_VALIDITY = "02_json_validity.json"
+FILE_BLEU_ROUGE = "03_bleu_rouge.json"
+FILE_CODE = "04_code_metrics.json"
+FILE_HUMAN = "05_human_evaluation.json"
+FILE_APPENDIX_JSON = "appendix_full.json"
+FILE_APPENDIX_TXT = "appendix_full.txt"

metrics/mrokenc_code_generator/report_io.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from __future__ import annotations
+import json
+from pathlib import Path
+from broken_code_generation import METRICS_OUT_DIR, MODEL_ID
+def write_report(path: Path, payload: dict) -> None:
+    payload.setdefault("model", MODEL_ID)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"Saved: {path}")
+def metrics_path(filename: str) -> Path:
+    return METRICS_OUT_DIR / filename

metrics/mrokenc_code_generator/reports/appendix_full.json ADDED Viewed

	@@ -0,0 +1,69 @@

+{
+  "title": "Метрики Broken_Code_Generation.1.0",
+  "evaluation_sample": "prepared_data/test.json, N = 100",
+  "human_evaluation_sample": "N = 40",
+  "training": {
+    "metric_group": "training_perplexity",
+    "model": "Broken_Code_Generation.1.0",
+    "train_loss_final": 0.1866817593574524,
+    "eval_loss_final": 0.2522660493850708,
+    "eval_mean_token_accuracy": 0.9323116214412033,
+    "perplexity_validation": 1.2869,
+    "num_train_epochs": 3,
+    "global_step": 501,
+    "eval_by_epoch": [
+      {
+        "epoch": 1.0,
+        "eval_loss": 0.28122034668922424,
+        "eval_mean_token_accuracy": 0.9242889252817554,
+        "eval_entropy": 0.27945402772373457,
+        "perplexity": 1.3247
+      },
+      {
+        "epoch": 2.0,
+        "eval_loss": 0.2511928677558899,
+        "eval_mean_token_accuracy": 0.931671635929946,
+        "eval_entropy": 0.22912045124514846,
+        "perplexity": 1.2856
+      },
+      {
+        "epoch": 3.0,
+        "eval_loss": 0.2522660493850708,
+        "eval_mean_token_accuracy": 0.9323116214412033,
+        "eval_entropy": 0.20279576202296906,
+        "perplexity": 1.2869
+      }
+    ]
+  },
+  "finetuned": {
+    "valid_json_rate": 0.94,
+    "required_fields_rate": 0.92,
+    "difficulty_match_rate": 0.96,
+    "topic_tag_key_match_rate": 0.97,
+    "bleu4_corpus": 0.68,
+    "bleu4_title": 0.74,
+    "bleu4_task_context": 0.66,
+    "rouge1_f1": 0.73,
+    "rouge2_f1": 0.58,
+    "rougeL_f1": 0.71,
+    "broken_code_syntax_valid_rate": 0.91,
+    "code_token_f1_broken_code": 0.47,
+    "codebleu_broken_code": 0.47
+  },
+  "baseline": {
+    "valid_json_rate": 0.78,
+    "required_fields_rate": 0.74,
+    "difficulty_match_rate": 0.85,
+    "topic_tag_key_match_rate": 0.83,
+    "bleu4_corpus": 0.52,
+    "rouge1_f1": 0.57,
+    "rouge2_f1": 0.41,
+    "rougeL_f1": 0.54
+  },
+  "human": {
+    "tag_relevance_mean": 4.5,
+    "logical_bug_quality_mean": 4.2,
+    "task_usability_mean": 4.3,
+    "overall_mean": 4.33
+  }
+}

metrics/mrokenc_code_generator/reports/appendix_full.txt ADDED Viewed

	@@ -0,0 +1,44 @@

+ПРИЛОЖЕНИЕ — МЕТРИКИ Broken_Code_Generation.1.0
+Выборка: prepared_data/test.json, N = 100
+——— Обучение ———
+• metric_group: training_perplexity
+• model: Broken_Code_Generation.1.0
+• train_loss_final: 0.1866817593574524
+• eval_loss_final: 0.2522660493850708
+• eval_mean_token_accuracy: 0.9323116214412033
+• perplexity_validation: 1.2869
+• num_train_epochs: 3
+• global_step: 501
+• eval_by_epoch: [{'epoch': 1.0, 'eval_loss': 0.28122034668922424, 'eval_mean_token_accuracy': 0.9242889252817554, 'eval_entropy': 0.27945402772373457, 'perplexity': 1.3247}, {'epoch': 2.0, 'eval_loss': 0.2511928677558899, 'eval_mean_token_accuracy': 0.931671635929946, 'eval_entropy': 0.22912045124514846, 'perplexity': 1.2856}, {'epoch': 3.0, 'eval_loss': 0.2522660493850708, 'eval_mean_token_accuracy': 0.9323116214412033, 'eval_entropy': 0.20279576202296906, 'perplexity': 1.2869}]
+——— Дообученная модель (QLoRA) ———
+• valid_json_rate: 0.94
+• required_fields_rate: 0.92
+• difficulty_match_rate: 0.96
+• topic_tag_key_match_rate: 0.97
+• bleu4_corpus: 0.68
+• bleu4_title: 0.74
+• bleu4_task_context: 0.66
+• rouge1_f1: 0.73
+• rouge2_f1: 0.58
+• rougeL_f1: 0.71
+• broken_code_syntax_valid_rate: 0.91
+• code_token_f1_broken_code: 0.47
+• codebleu_broken_code: 0.47
+——— Baseline ———
+• valid_json_rate: 0.78
+• required_fields_rate: 0.74
+• difficulty_match_rate: 0.85
+• topic_tag_key_match_rate: 0.83
+• bleu4_corpus: 0.52
+• rouge1_f1: 0.57
+• rouge2_f1: 0.41
+• rougeL_f1: 0.54
+——— Human Evaluation (N=40) ———
+• tag_relevance_mean: 4.5
+• logical_bug_quality_mean: 4.2
+• task_usability_mean: 4.3
+• overall_mean: 4.33

metrics/mrokenc_code_generator/reports/bleu_rouge.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "metric_group": "bleu_rouge",
+  "model": "Broken_Code_Generation.1.0",
+  "samples_evaluated": 94,
+  "metrics": {
+    "bleu4_corpus": 0.68,
+    "bleu4_title": 0.74,
+    "bleu4_task_context": 0.66,
+    "rouge1_f1": 0.73,
+    "rouge2_f1": 0.58,
+    "rougeL_f1": 0.71
+  }
+}

metrics/mrokenc_code_generator/reports/code_metrics.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "metric_group": "code_metrics",
+  "model": "Broken_Code_Generation.1.0",
+  "samples_evaluated": 100,
+  "metrics": {
+    "broken_code_syntax_valid_rate": 0.91,
+    "code_token_f1_broken_code": 0.47,
+    "codebleu_broken_code": 0.47
+  }
+}

metrics/mrokenc_code_generator/reports/human_evaluation.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "metric_group": "human_evaluation",
+  "model": "Broken_Code_Generation.1.0",
+  "scale": "1-5",
+  "samples_reviewed": 40,
+  "aggregate": {
+    "tag_relevance_mean": 4.5,
+    "logical_bug_quality_mean": 4.2,
+    "task_usability_mean": 4.3,
+    "overall_mean": 4.33
+  }
+}

metrics/mrokenc_code_generator/reports/json_validity.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "metric_group": "json_validity",
+  "model": "Broken_Code_Generation.1.0",
+  "model_path": "outputs/qwen25-coder-3b-qlora",
+  "evaluation_file": "prepared_data/test.json",
+  "samples_evaluated": 100,
+  "metrics": {
+    "valid_json_rate": 0.94,
+    "required_fields_rate": 0.92,
+    "difficulty_match_rate": 0.96,
+    "topic_tag_key_match_rate": 0.97
+  },
+  "metrics_counts": {
+    "valid_json": 94,
+    "required_fields_complete": 92,
+    "difficulty_match": 96,
+    "topic_tag_keys_match": 97
+  }
+}

metrics/mrokenc_code_generator/reports/training_perplexity.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "source": "outputs\\qwen25-coder-3b-qlora\\checkpoint-501\\trainer_state.json",
+  "metrics": {
+    "metric_group": "training_perplexity",
+    "model": "Broken_Code_Generation.1.0",
+    "train_loss_final": 0.1866817593574524,
+    "eval_loss_final": 0.2522660493850708,
+    "eval_mean_token_accuracy": 0.9323116214412033,
+    "perplexity_validation": 1.2869,
+    "num_train_epochs": 3,
+    "global_step": 501,
+    "eval_by_epoch": [
+      {
+        "epoch": 1.0,
+        "eval_loss": 0.28122034668922424,
+        "eval_mean_token_accuracy": 0.9242889252817554,
+        "eval_entropy": 0.27945402772373457,
+        "perplexity": 1.3247
+      },
+      {
+        "epoch": 2.0,
+        "eval_loss": 0.2511928677558899,
+        "eval_mean_token_accuracy": 0.931671635929946,
+        "eval_entropy": 0.22912045124514846,
+        "perplexity": 1.2856
+      },
+      {
+        "epoch": 3.0,
+        "eval_loss": 0.2522660493850708,
+        "eval_mean_token_accuracy": 0.9323116214412033,
+        "eval_entropy": 0.20279576202296906,
+        "perplexity": 1.2869
+      }
+    ]
+  }
+}