SciCode
/

dataset-builder

DouDou committed 21 days ago

Commit

216c37b

verified ·

1 Parent(s): b805898

Upload data3/check_index_distribution.py with huggingface_hub

Files changed (1) hide show

data3/check_index_distribution.py ADDED Viewed

+#!/usr/bin/env python3
+"""
+Check the distribution of indices in enhanced_dataset.csv
+"""
+import csv
+from tqdm import tqdm
# Defaults reproduce the behaviour of the original one-off script.
CSV_PATH = 'enhanced_dataset.csv'
MAX_ROW = 2000000      # last zero-based CSV row number that is read (inclusive)
REPORT_EVERY = 10000   # print a progress line every this many rows
GAP_WINDOW = 100       # how many leading entries are checked for gaps


def collect_indices(csv_path=CSV_PATH, max_row=MAX_ROW, report_every=REPORT_EVERY):
    """Read the index column of a CSV file and return the parsable values.

    The index is taken from the first of the columns ``''``,
    ``'Unnamed: 0.1'`` and ``'Unnamed: 0'`` that exists in the row.

    Args:
        csv_path: Path of the CSV file to read (UTF-8, with a header row).
        max_row: Zero-based number of the last data row to read; reading
            stops after this row has been processed.
        report_every: Print a progress line each time the row number is a
            positive multiple of this value. A falsy value disables it.

    Returns:
        A list of ``(csv_row_num, index_value)`` tuples, one per row whose
        index cell is non-empty and parses as an integer. Rows with an
        empty or non-integer index are skipped.
    """
    seen = []
    # newline='' is what the csv module asks for, so that line breaks
    # inside quoted fields reach the parser untouched.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for row_num, row in enumerate(reader):
            idx = row.get('', row.get('Unnamed: 0.1', row.get('Unnamed: 0')))
            if idx:
                try:
                    idx = int(idx)
                except ValueError:
                    # Non-numeric index cell: skip it, but keep the raw
                    # text in idx so the progress line below can show it.
                    pass
                else:
                    seen.append((row_num, idx))
            # Every row is collected; only this progress output is periodic.
            if report_every and row_num > 0 and row_num % report_every == 0:
                print(f"CSV row {row_num}: index value = {idx}")
            if row_num >= max_row:
                break
    return seen


def find_gaps(seen_indices, window=GAP_WINDOW):
    """Return the non-sequential steps among the first ``window`` entries.

    Args:
        seen_indices: List of ``(csv_row_num, index_value)`` tuples.
        window: Number of leading entries to inspect.

    Returns:
        A list of ``(previous_entry, current_entry, diff)`` tuples for each
        adjacent pair whose index values do not differ by exactly 1.
    """
    head = seen_indices[:max(window, 0)]
    gaps = []
    for prev, cur in zip(head, head[1:]):
        diff = cur[1] - prev[1]
        if diff != 1:
            gaps.append((prev, cur, diff))
    return gaps


def main():
    """Scan the default CSV file and print a summary of its index column."""
    print(f"Sampling {CSV_PATH} to understand index distribution...")
    seen_indices = collect_indices()
    print(f"\nTotal sampled: {len(seen_indices)}")
    print(f"First few: {seen_indices[:5]}")
    print(f"Last few: {seen_indices[-5:]}")
    # Check if indices are sequential
    gaps = find_gaps(seen_indices)
    print(f"\nFound {len(gaps)} gaps in first {GAP_WINDOW} rows")
    if gaps:
        print(f"Example gaps: {gaps[:5]}")


if __name__ == "__main__":
    main()