dataset-builder / data3 /check_index_distribution.py

DouDou

Upload data3/check_index_distribution.py with huggingface_hub

216c37b verified 21 days ago

1.29 kB

	#!/usr/bin/env python3
	"""
	Check the distribution of indices in enhanced_dataset.csv
	"""
	import csv
	from tqdm import tqdm

	# Name of the CSV file inspected when the script is run directly.
	DEFAULT_CSV_PATH = 'enhanced_dataset.csv'

	# Candidate header names for the index column, in lookup priority order.
	# The first one present in a row is used.
	INDEX_COLUMNS = ('', 'Unnamed: 0.1', 'Unnamed: 0')

	MAX_ROWS = 2_000_000   # only the first 2M data rows are scanned
	REPORT_EVERY = 10_000  # print a progress sample every N rows
	GAP_WINDOW = 100       # number of leading entries checked for gaps


	def _raw_index(row):
	    """Return the raw index cell of *row*, or None if no candidate column exists."""
	    for column in INDEX_COLUMNS:
	        if column in row:
	            return row[column]
	    return None


	def collect_indices(path=DEFAULT_CSV_PATH, max_rows=MAX_ROWS,
	                    report_every=REPORT_EVERY):
	    """Read index values from the CSV file at *path*.

	    Args:
	        path: CSV file to read (UTF-8, with a header row).
	        max_rows: maximum number of data rows to process.
	        report_every: print the current index value every this many rows
	            (row 0 is never reported).

	    Returns:
	        A list of ``(csv_row_num, index_value)`` tuples, one for every
	        processed row whose index cell is non-empty and parses as an int.
	        Rows with a missing, empty or non-integer index are skipped.
	    """
	    seen = []
	    # newline='' lets the csv module handle line endings itself, so quoted
	    # fields containing newlines are parsed correctly.
	    with open(path, 'r', encoding='utf-8', newline='') as f:
	        reader = csv.DictReader(f)
	        for row_num, row in enumerate(reader):
	            # Checked before processing so exactly max_rows rows are used.
	            if row_num >= max_rows:
	                break

	            idx = _raw_index(row)
	            if idx:
	                try:
	                    idx = int(idx)
	                except ValueError:
	                    # Non-integer index cell: leave idx as the raw text so
	                    # the progress line below still shows what was read.
	                    pass
	                else:
	                    seen.append((row_num, idx))

	            # Print only occasionally to keep the output readable.
	            if row_num > 0 and row_num % report_every == 0:
	                print(f"CSV row {row_num}: index value = {idx}")
	    return seen


	def find_gaps(entries, window=GAP_WINDOW):
	    """Find non-sequential index values among the first *window* entries.

	    Args:
	        entries: list of ``(csv_row_num, index_value)`` tuples.
	        window: how many leading entries to inspect.

	    Returns:
	        A list of ``(previous_entry, current_entry, diff)`` for every
	        adjacent pair whose index values do not differ by exactly 1.
	    """
	    head = entries[:window]
	    gaps = []
	    for previous, current in zip(head, head[1:]):
	        diff = current[1] - previous[1]
	        if diff != 1:
	            gaps.append((previous, current, diff))
	    return gaps


	def main(path=DEFAULT_CSV_PATH):
	    """Print a summary of the index distribution of the CSV file at *path*."""
	    print(f"Sampling {path} to understand index distribution...")

	    seen_indices = collect_indices(path)

	    print(f"\nTotal sampled: {len(seen_indices)}")
	    print(f"First few: {seen_indices[:5]}")
	    print(f"Last few: {seen_indices[-5:]}")

	    # Check if indices are sequential
	    gaps = find_gaps(seen_indices)

	    print(f"\nFound {len(gaps)} gaps in first 100 rows")
	    if gaps:
	        print(f"Example gaps: {gaps[:5]}")


	if __name__ == "__main__":
	    main()