| | import json |
| | import re |
| | from transformers import AutoTokenizer |
| |
|
| | |
# --- Configuration -------------------------------------------------------
# JSONL training dataset to validate, one JSON object per line.
dataset_path = "all_dataset_train.jsonl"
# Checkpoint whose tokenizer defines the token budget for each record.
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"
# Every record must carry these fields as non-empty strings.
required_fields = ["input", "output"]
# Hard cap on the combined input+output token count per record.
max_token_length = 8192
| |
|
| | |
# Load the tokenizer belonging to the checkpoint under test, so the token
# counts measured below match what training/inference will actually see.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)
| |
|
| | |
# C0 control characters plus DEL (0x7F), excluding \t (0x09), \n (0x0A)
# and \r (0x0D): those three are legitimate whitespace in training text
# and the original class [\x00-\x1F\x7F] flagged nearly every multi-line
# record. Compiled once at module level instead of per call.
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")


def has_control_chars(text):
    """Return True if *text* contains a disallowed ASCII control character.

    Tab, newline and carriage return are allowed; any other character in
    the C0 range (0x00-0x1F) or DEL (0x7F) is flagged.
    """
    return bool(_CONTROL_CHAR_RE.search(text))
| |
|
| | |
# Walk the JSONL file line by line and report, per record: JSON syntax
# errors, missing/empty/non-string required fields, control characters,
# and records exceeding the token budget. Purely diagnostic — prints
# findings, never raises.
print("Checking dataset...\n")
with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue

        # A line can be valid JSON yet not an object (e.g. a bare list);
        # such records cannot carry the required fields at all.
        if not isinstance(data, dict):
            print(f"[Line {idx}] ❌ Record is not a JSON object")
            continue

        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not isinstance(data[field], str):
                # Original crashed here with AttributeError on .strip()
                # when the value was a number, null, list, etc.
                print(f"[Line {idx}] ❌ Field '{field}' is not a string")
            elif not data[field].strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        input_text = data.get("input", "")
        output_text = data.get("output", "")
        # Non-string values were already reported above; coerce to "" so
        # the concatenations below cannot raise TypeError.
        if not isinstance(input_text, str):
            input_text = ""
        if not isinstance(output_text, str):
            output_text = ""

        if has_control_chars(input_text + output_text):
            print(f"[Line {idx}] ⚠️ Contains control characters")

        # Token-length check against the training context window.
        try:
            tokens = tokenizer(input_text + output_text, return_tensors="pt")
            token_len = tokens["input_ids"].shape[1]
            if token_len > max_token_length:
                print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
        except Exception as e:
            print(f"[Line {idx}] ❌ Tokenization error: {e}")

print("\n✅ Dataset check complete.")
| |
|