#!/usr/bin/env python3
"""Check the distribution of indices in enhanced_dataset.csv.

Reads the leading rows of the CSV, records the integer value found in the
index column of each row, and prints a short report: how many values were
recorded, the first and last few, and any places among the first entries
where consecutive index values do not increase by exactly 1.
"""

import csv

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is not used by the code below. The name is kept available for
    # anything else that relies on it, but a missing package must not stop
    # this diagnostic script from running.
    tqdm = None

CSV_PATH = 'enhanced_dataset.csv'

# Candidate index column names, in lookup priority order.
INDEX_COLUMNS = ('', 'Unnamed: 0.1', 'Unnamed: 0')

MAX_ROWS = 2000000      # stop once this row number has been processed
REPORT_EVERY = 10000    # print a progress line every this many rows
GAP_CHECK_ROWS = 100    # how many leading entries the gap check inspects


def extract_index(row):
    """Return the raw index cell of *row*, or None if no index column exists.

    The first name in INDEX_COLUMNS that is present as a key wins, even when
    its value is empty.
    """
    for column in INDEX_COLUMNS:
        if column in row:
            return row[column]
    return None


def collect_indices(rows, max_rows=MAX_ROWS, report_every=REPORT_EVERY):
    """Collect ``(row_number, index_value)`` pairs from an iterable of rows.

    Args:
        rows: Iterable of mappings, e.g. a ``csv.DictReader``.
        max_rows: Iteration stops after the row numbered *max_rows* has been
            processed, so rows ``0`` through *max_rows* inclusive are read.
        report_every: A progress line is printed for every row whose number
            is a positive multiple of this value. Must be non-zero.

    Returns:
        A list of ``(row_number, index_value)`` tuples, one per row whose
        index cell is non-empty and parses as an integer. Rows with a
        missing, empty or non-integer cell are skipped.
    """
    seen = []
    for row_num, row in enumerate(rows):
        value = extract_index(row)
        if value:
            try:
                value = int(value)
            except (TypeError, ValueError):
                # Not an integer: skip the row. The raw cell is kept in
                # `value` so the progress line below still shows it.
                pass
            else:
                seen.append((row_num, value))

        # Every row is recorded above; only this report is throttled.
        if row_num > 0 and row_num % report_every == 0:
            print(f"CSV row {row_num}: index value = {value}")

        if row_num >= max_rows:
            break
    return seen


def find_gaps(pairs, limit=GAP_CHECK_ROWS):
    """Return the non-sequential steps among the first *limit* entries.

    Args:
        pairs: List of ``(row_number, index_value)`` tuples.
        limit: Number of leading entries to inspect.

    Returns:
        A list of ``(previous_pair, current_pair, difference)`` tuples for
        each adjacent pair whose index values differ by something other
        than 1.
    """
    head = pairs[:limit]
    gaps = []
    for previous, current in zip(head, head[1:]):
        diff = current[1] - previous[1]
        if diff != 1:
            gaps.append((previous, current, diff))
    return gaps


def main():
    """Read the CSV and print the index distribution report."""
    print(f"Sampling {CSV_PATH} to understand index distribution...")

    # newline='' lets the csv module do its own newline handling, which it
    # needs to parse quoted fields containing line breaks correctly.
    with open(CSV_PATH, 'r', encoding='utf-8', newline='') as f:
        seen_indices = collect_indices(csv.DictReader(f))

    print(f"\nTotal sampled: {len(seen_indices)}")
    print(f"First few: {seen_indices[:5]}")
    print(f"Last few: {seen_indices[-5:]}")

    # Check if indices are sequential
    gaps = find_gaps(seen_indices)
    print(f"\nFound {len(gaps)} gaps in first 100 rows")
    if gaps:
        print(f"Example gaps: {gaps[:5]}")


if __name__ == "__main__":
    main()