#!/usr/bin/env python3
"""
Check the distribution of indices in enhanced_dataset.csv.
"""
import csv
from itertools import islice

from tqdm import tqdm
# Header names under which a pandas-exported index column may appear, in
# lookup priority order (the first column present in the row wins).
INDEX_COLUMNS = ('', 'Unnamed: 0.1', 'Unnamed: 0')


def _row_index(row):
    """Return the raw index cell of *row*, or None if no index column exists."""
    for column in INDEX_COLUMNS:
        if column in row:
            return row[column]
    return None


def collect_indices(path='enhanced_dataset.csv', max_rows=2_000_000,
                    report_every=10_000):
    """Collect ``(csv_row_num, index_value)`` pairs from the CSV at *path*.

    Every row with a non-empty, integer-parsable index cell is recorded;
    rows whose index cell is missing, empty, or non-numeric are skipped.
    Only the progress print is periodic: one line every *report_every* rows.

    Args:
        path: CSV file to read; its first line is used as the header.
        max_rows: Maximum number of data rows to read (``None`` reads all).
        report_every: Progress-print interval in rows (falsy disables it).

    Returns:
        A list of ``(csv_row_num, index_value)`` tuples in file order.
    """
    pairs = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields containing embedded newlines.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        # islice bounds the scan to exactly max_rows data rows.
        for row_num, row in enumerate(islice(reader, max_rows)):
            idx = _row_index(row)
            if idx:
                try:
                    idx = int(idx)
                except ValueError:
                    # Non-numeric index cell: skip the row, keep scanning.
                    pass
                else:
                    pairs.append((row_num, idx))
            if report_every and row_num > 0 and row_num % report_every == 0:
                print(f"CSV row {row_num}: index value = {idx}")
    return pairs


def find_gaps(pairs, limit=100):
    """Return non-sequential steps among the first *limit* collected pairs.

    Args:
        pairs: ``(csv_row_num, index_value)`` tuples, as returned by
            ``collect_indices``.
        limit: How many leading pairs to inspect.

    Returns:
        A list of ``(previous_pair, current_pair, diff)`` for each pair of
        neighbours whose index values do not differ by exactly 1.
    """
    head = pairs[:limit]
    return [
        (prev, cur, cur[1] - prev[1])
        for prev, cur in zip(head, head[1:])
        if cur[1] - prev[1] != 1
    ]


def main(path='enhanced_dataset.csv'):
    """Print a summary of the index distribution of the CSV at *path*.

    Returns:
        ``(seen_indices, gaps)`` so the results can be inspected by a caller.
    """
    print(f"Sampling {path} to understand index distribution...")
    seen_indices = collect_indices(path)

    print(f"\nTotal sampled: {len(seen_indices)}")
    print(f"First few: {seen_indices[:5]}")
    print(f"Last few: {seen_indices[-5:]}")

    # Check if indices are sequential
    gaps = find_gaps(seen_indices)
    print(f"\nFound {len(gaps)} gaps in first 100 rows")
    if gaps:
        print(f"Example gaps: {gaps[:5]}")
    return seen_indices, gaps


if __name__ == "__main__":
    main()