DouDou committed on
Commit
216c37b
·
verified ·
1 Parent(s): b805898

Upload data3/check_index_distribution.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/check_index_distribution.py +43 -0
data3/check_index_distribution.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Check the distribution of indices in enhanced_dataset.csv
4
+ """
5
+ import csv
6
+ from tqdm import tqdm
7
+
8
# Candidate header names for the saved index column, in lookup priority order.
# The first name that is present in a row wins, even if its cell is empty.
INDEX_COLUMNS = ('', 'Unnamed: 0.1', 'Unnamed: 0')


def _raw_index(row):
    """Return the raw index cell of *row*, or None if no candidate column exists."""
    for name in INDEX_COLUMNS:
        if name in row:
            return row[name]
    return None


def collect_indices(rows, max_rows=2000000, report_every=10000):
    """Collect ``(csv_row_num, index_value)`` pairs from an iterable of row dicts.

    Args:
        rows: Iterable of dict-like rows (e.g. a ``csv.DictReader``).
        max_rows: Maximum number of rows to scan; scanning stops before the
            row whose 0-based position equals this value.
        report_every: Print a progress line every this many rows (row 0 is
            never reported). A falsy value disables progress output.

    Returns:
        A list of ``(row_number, index)`` tuples for every scanned row whose
        index cell is non-empty and parses as an integer.
    """
    seen = []
    for row_num, row in enumerate(rows):
        # Check the limit before processing so exactly ``max_rows`` rows are scanned.
        if row_num >= max_rows:
            break

        value = _raw_index(row)
        if value:
            try:
                value = int(value)
            except ValueError:
                # Non-numeric index cell: deliberately skipped, not fatal.
                pass
            else:
                seen.append((row_num, value))

        # Periodic progress output; shows the parsed value, or the raw cell
        # when it was missing/unparseable.
        if report_every and row_num > 0 and row_num % report_every == 0:
            print(f"CSV row {row_num}: index value = {value}")
    return seen


def find_gaps(seen, window=100):
    """Return non-sequential steps among the first *window* collected entries.

    Args:
        seen: List of ``(row_number, index)`` tuples as produced by
            ``collect_indices``.
        window: How many leading entries to inspect.

    Returns:
        A list of ``(previous_entry, current_entry, diff)`` tuples for each
        consecutive pair whose index difference is not exactly 1.
    """
    head = seen[:window]
    gaps = []
    for prev, curr in zip(head, head[1:]):
        diff = curr[1] - prev[1]
        if diff != 1:
            gaps.append((prev, curr, diff))
    return gaps


def main(path='enhanced_dataset.csv'):
    """Scan *path* and print a summary of its index column distribution."""
    print(f"Sampling {path} to understand index distribution...")

    # newline='' lets the csv module handle newlines embedded in quoted fields.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        seen_indices = collect_indices(csv.DictReader(f))

    print(f"\nTotal sampled: {len(seen_indices)}")
    print(f"First few: {seen_indices[:5]}")
    print(f"Last few: {seen_indices[-5:]}")

    # Check if indices are sequential
    gaps = find_gaps(seen_indices)

    print(f"\nFound {len(gaps)} gaps in first 100 rows")
    if gaps:
        print(f"Example gaps: {gaps[:5]}")


if __name__ == "__main__":
    main()