dataset-builder / data3 /check_enhanced.py
DouDou
Upload data3/check_enhanced.py with huggingface_hub
b805898 verified
#!/usr/bin/env python3
import csv
import json
# Sanity-check enhanced_dataset.csv: print the column layout and the key
# fields of the first row, then scan for the record whose (pandas-style
# unnamed) index column equals 489788.
print("Checking enhanced_dataset.csv...")
# newline='' is the csv-module-recommended way to open CSV files so that
# embedded newlines inside quoted fields are handled correctly.
with open('enhanced_dataset.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    # Peek at the first data row to learn the column names and sample values.
    row = next(reader)
    print(f"Columns: {list(row.keys())}")
    print("\nFirst row values:")
    print(f" Unnamed: 0: {row.get('Unnamed: 0', 'N/A')}")
    print(f" Unnamed: 0.1: {row.get('Unnamed: 0.1', 'N/A')}")
    print(f" repo_name: {row.get('repo_name', 'N/A')}")
    print(f" path: {row.get('path', 'N/A')}")
    print(f" language: {row.get('language', 'N/A')}")

    # Try to find the row matching original_index=489788.
    print("\n\nSearching for original_index=489788...")
    f.seek(0)
    # Build a fresh DictReader after seeking: reusing the old reader would
    # require manually skipping the header and relies on the pre-seek
    # iterator state, which is fragile. A new reader re-consumes the header
    # itself, so enumerate() starts at the first data row (i == 0), matching
    # the original row numbering.
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        # The index may live under different pandas artifact column names
        # ('Unnamed: 0.1', 'Unnamed: 0', or a truly empty header '').
        idx_val = row.get('Unnamed: 0.1') or row.get('Unnamed: 0') or row.get('')
        if idx_val == '489788':
            print(f"Found at row {i+1}!")
            print(f" repo_name: '{row.get('repo_name', 'N/A')}'")
            print(f" path: '{row.get('path', 'N/A')}'")
            print(f" language: '{row.get('language', 'N/A')}'")
            break
        if i >= 100000:  # Bound the scan so huge files don't hang the check.
            print("Not found in first 100k rows")
            break
    else:
        # Loop exhausted the file without breaking: the index was never seen
        # and the 100k cap was not reached — report it instead of staying silent.
        print("Not found (reached end of file)")