#!/usr/bin/env python3
"""
Check the distribution of indices in enhanced_dataset.csv
"""
import csv
from tqdm import tqdm

print("Sampling enhanced_dataset.csv to understand index distribution...")

# Header names that may hold the row index, in priority order. The first one
# present in the header wins, even if its value for a given row is empty.
# NOTE(review): the 'Unnamed: 0*' names look like leftovers from saving a
# DataFrame index to CSV more than once -- confirm against the producer.
INDEX_COLUMNS = ('', 'Unnamed: 0.1', 'Unnamed: 0')
PROGRESS_EVERY = 10000   # print one progress line per this many CSV rows
LAST_ROW = 2000000       # stop after processing the row with this 0-based number

# Every row whose index value parses as an integer is recorded here as
# (csv_row_num, index_value); csv_row_num is 0-based and excludes the header.
seen_indices = []

# newline='' is what the csv module asks for when given a file object, so that
# line breaks embedded in quoted fields are parsed as data, not as row ends.
with open('enhanced_dataset.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)

    for i, row in enumerate(reader):
        idx = next((row[name] for name in INDEX_COLUMNS if name in row), None)
        if idx:
            try:
                idx = int(idx)
            except ValueError:
                # Non-numeric index value: leave idx as the raw string (so the
                # progress line still shows it) and do not record the row.
                # Only ValueError is caught so Ctrl-C still stops the scan.
                pass
            else:
                seen_indices.append((i, idx))

        # Only the progress output is throttled; every row is still collected.
        if i > 0 and i % PROGRESS_EVERY == 0:
            print(f"CSV row {i}: index value = {idx}")

        # Bound the scan to roughly the first 2M rows of the file.
        if i >= LAST_ROW:
            break

# Summarise what was collected: how many (csv_row_num, index_value) pairs there
# are, plus the five earliest and the five latest of them.
collected_count = len(seen_indices)
leading_pairs = seen_indices[:5]
trailing_pairs = seen_indices[-5:]

print(f"\nTotal sampled: {collected_count}")
print(f"First few: {leading_pairs}")
print(f"Last few: {trailing_pairs}")

# Sequentiality check over the first 100 collected entries: walk neighbouring
# pairs and record every place where the index value does not grow by exactly 1.
# Each record is (previous_entry, current_entry, difference).
head = seen_indices[:100]
gaps = []
for earlier, later in zip(head, head[1:]):
    step = later[1] - earlier[1]
    if step == 1:
        continue
    gaps.append((earlier, later, step))

print(f"\nFound {len(gaps)} gaps in first 100 rows")
if len(gaps) > 0:
    print(f"Example gaps: {gaps[:5]}")