#!/usr/bin/env python3
"""
Let's understand the relationship between the datasets by comparing a few records.

The script prints the structure of the function CSV and of the problems JSONL,
then checks whether the ``row_number`` of the first JSONL entry points at the
CSV data row that holds the same function name.
"""

import csv
import json

CSV_PATH = 'function_dataset_v2.csv'
JSONL_PATH = 'programming_problems.jsonl'


def _first_jsonl_entry(jsonl_path):
    """Return the first non-blank JSONL record as parsed JSON, or None if there is none."""
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate leading blank lines instead of failing to parse ''
                return json.loads(line)
    return None


def _csv_data_row(csv_path, row_number):
    """Return the CSV data row at 1-indexed position *row_number*, or None.

    The header line is not counted, so ``row_number == 1`` is the first data
    row. A non-numeric *row_number* (e.g. None) can never equal a position,
    so the file is not scanned at all in that case.
    """
    if not isinstance(row_number, (int, float)):
        return None
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields are handed to the parser untouched.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        numbered_rows = enumerate(csv.DictReader(f), start=1)
        return next((row for i, row in numbered_rows if i == row_number), None)


def _show_csv_structure(csv_path):
    """Print the CSV headers and the first row with repo_name, path and language all set."""
    print(f"=== {csv_path} structure ===")
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        print(f"Headers: {reader.fieldnames}")

        # Get a row that HAS metadata
        print("\nFinding a row with complete metadata...")
        sample = next(
            (row for row in reader
             if row['repo_name'] and row['path'] and row['language']),
            None,
        )

    if sample is None:
        print("\nNo row with complete metadata found.")
        return

    print("\nSample row WITH metadata:")
    for column in ('original_index', 'function_index', 'repo_name',
                   'path', 'language', 'function_name'):
        print(f" {column}: {sample[column]}")


def _show_jsonl_structure(jsonl_path):
    """Print the first JSONL entry and return it (None when the file has no entries)."""
    print(f"\n\n=== {jsonl_path} structure ===")
    entry = _first_jsonl_entry(jsonl_path)
    if entry is None:
        print("No entries found.")
        return None

    # Just show first entry
    print("First entry:")
    print(f" row_number: {entry.get('row_number')}")
    metadata = entry['metadata']
    print(f" metadata.original_index: {metadata['original_index']}")
    print(f" metadata.function_name: {metadata['function_name']}")
    for key in ('repo_name', 'path', 'language'):
        print(f" metadata.{key}: '{metadata[key]}'")
    return entry


def _check_row_number_match(csv_path, entry):
    """Report whether *entry*'s row_number selects the CSV row with the same function name.

    The key question: does row_number in JSONL match the row number in CSV?

    Returns True or False for the function-name comparison, or None when the
    comparison could not be made (no entry, or no CSV row at that position).
    """
    print("\n\n=== Checking if row_number matches CSV row ===")
    if entry is None:
        print("No JSONL entry to compare against.")
        return None

    target_row = entry.get('row_number')
    print(f"JSONL row_number: {target_row}")

    # Get that row from CSV (row_number is probably 1-indexed after header)
    row = _csv_data_row(csv_path, target_row)
    if row is None:
        print(f"\nCSV row {target_row} not found.")
        return None

    print(f"\nCSV row {target_row}:")
    print(f" original_index: {row['original_index']}")
    for column in ('repo_name', 'path', 'language', 'function_name'):
        print(f" {column}: '{row[column]}'")

    # Check if function names match
    names_match = row['function_name'] == entry['metadata']['function_name']
    if names_match:
        print("\n✅ Function names match! We should use row_number as the key.")
    else:
        print("\n❌ Function names don't match.")
    return names_match


def main(csv_path=CSV_PATH, jsonl_path=JSONL_PATH):
    """Run all three comparisons and print the findings.

    Args:
        csv_path: Path to the function dataset CSV.
        jsonl_path: Path to the programming problems JSONL file.

    Returns:
        True if the function names match, False if they differ, None if the
        comparison could not be made.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the first non-blank JSONL line is not valid JSON.
        KeyError: If an expected column or metadata key is missing.
    """
    _show_csv_structure(csv_path)
    entry = _show_jsonl_structure(jsonl_path)
    return _check_row_number_match(csv_path, entry)


if __name__ == "__main__":
    main()