File size: 1,289 Bytes
216c37b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | #!/usr/bin/env python3
"""
Check the distribution of indices in enhanced_dataset.csv
"""
import csv
from tqdm import tqdm
# Input file inspected when the script is run directly.
DATASET_PATH = 'enhanced_dataset.csv'
# Header names tried, in priority order, when looking for the index column.
INDEX_COLUMNS = ('', 'Unnamed: 0.1', 'Unnamed: 0')
# Maximum number of data rows to read from the file.
MAX_ROWS = 2000000
# A progress line is printed every this many rows (all rows are still stored).
REPORT_EVERY = 10000
# Number of leading collected entries examined for non-sequential indices.
GAP_WINDOW = 100


def _raw_index(row):
    """Return the raw index cell from the first candidate column present in *row*.

    Returns None when none of INDEX_COLUMNS is a key of *row*.
    """
    for column in INDEX_COLUMNS:
        if column in row:
            return row[column]
    return None


def collect_indices(path=DATASET_PATH, max_rows=MAX_ROWS,
                    report_every=REPORT_EVERY):
    """Collect integer index values from the first *max_rows* rows of a CSV.

    Args:
        path: CSV file to read (UTF-8, first line is the header).
        max_rows: Stop after this many data rows have been processed.
        report_every: Print a progress line every this many rows; a falsy
            value disables progress output.

    Returns:
        A ``(seen, skipped)`` tuple. ``seen`` is a list of
        ``(csv_row_num, index_value)`` pairs, one per row whose index cell
        parsed as an int. ``skipped`` counts non-empty index cells that were
        not valid integers.

    Raises:
        OSError: If *path* cannot be opened.
    """
    seen = []
    skipped = 0
    # newline='' hands line-ending handling to the csv module, which its
    # documentation requires so newlines inside quoted fields parse correctly.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row_num, row in enumerate(csv.DictReader(f)):
            # Check the limit before processing so exactly max_rows rows are
            # read (checking afterwards would read one row too many).
            if row_num >= max_rows:
                break

            idx = _raw_index(row)
            if idx:
                try:
                    idx = int(idx)
                except ValueError:
                    # Keep going on malformed cells, but count them instead
                    # of hiding them.
                    skipped += 1
                else:
                    seen.append((row_num, idx))

            if report_every and row_num > 0 and row_num % report_every == 0:
                print(f"CSV row {row_num}: index value = {idx}")
    return seen, skipped


def find_gaps(seen, window=GAP_WINDOW):
    """Find non-sequential neighbours among the first *window* entries of *seen*.

    Args:
        seen: List of ``(csv_row_num, index_value)`` pairs.
        window: Number of leading entries to examine.

    Returns:
        A list of ``(previous_entry, current_entry, diff)`` tuples for every
        adjacent pair whose index values do not differ by exactly 1.
    """
    head = seen[:window]
    return [
        (prev, cur, cur[1] - prev[1])
        for prev, cur in zip(head, head[1:])
        if cur[1] - prev[1] != 1
    ]


def main():
    """Print a summary of the index distribution found in DATASET_PATH."""
    print("Sampling enhanced_dataset.csv to understand index distribution...")

    seen_indices, skipped = collect_indices()

    print(f"\nTotal sampled: {len(seen_indices)}")
    if skipped:
        print(f"Skipped {skipped} rows with non-integer index values")
    print(f"First few: {seen_indices[:5]}")
    print(f"Last few: {seen_indices[-5:]}")

    # Check if indices are sequential
    gaps = find_gaps(seen_indices)
    print(f"\nFound {len(gaps)} gaps in first {GAP_WINDOW} rows")
    if gaps:
        print(f"Example gaps: {gaps[:5]}")


if __name__ == "__main__":
    main()
|