#!/usr/bin/env python3
"""
Memory-efficient script to enrich programming_problems.jsonl
Only loads the exact rows we need from enhanced_dataset.csv
"""
import json
import csv
from tqdm import tqdm
import sys
def get_needed_original_indices(function_csv, input_jsonl):
    """
    Determine which original_index values we actually need to look up.

    Args:
        function_csv: Path to function_dataset_v2.csv; must contain an
            'original_index' column. Row order matters: the 1-based row
            position is what the JSONL's 'row_number' field refers to.
        input_jsonl: Path to the JSONL file whose records carry a
            'row_number' field.

    Returns:
        Tuple of (row_to_original, needed_indices):
            row_to_original: dict mapping 1-based CSV row number to its
                original_index.
            needed_indices: dict mapping original_index to the list of
                row_numbers that reference it.
    """
    print("Step 1: Determining which original_index values we need...")
    # First, build the row_number -> original_index mapping from function_dataset_v2.
    row_to_original = {}
    with open(function_csv, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(tqdm(reader, desc="Reading function_dataset_v2"), start=1):
            try:
                row_to_original[i] = int(row['original_index'])
            except (ValueError, KeyError):
                # Skip rows with a missing or non-integer original_index.
                pass
    # Next, collect the original_index values referenced by the JSONL records.
    needed_indices = {}
    with open(input_jsonl, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading JSONL"):
            data = json.loads(line.strip())
            row_number = data.get('row_number')
            if row_number in row_to_original:
                original_index = row_to_original[row_number]
                needed_indices.setdefault(original_index, []).append(row_number)
    print(f"Need to look up {len(needed_indices)} unique original_index values")
    # Guard: max()/min() raise ValueError on an empty dict.
    if needed_indices:
        print(f"Max index needed: {max(needed_indices)}")
        print(f"Min index needed: {min(needed_indices)}")
    return row_to_original, needed_indices
def load_needed_metadata(enhanced_csv, needed_indices):
    """
    Load only the needed rows from enhanced_dataset.csv.

    Args:
        enhanced_csv: Path to enhanced_dataset.csv.
        needed_indices: Mapping (or set) whose keys are the
            original_index values we need metadata for.

    Returns:
        Dict mapping original_index to {'repo_name', 'path', 'language'}.
        Indices never seen in the CSV are simply absent from the result.
    """
    print("\nStep 2: Loading only needed rows from enhanced_dataset.csv...")
    print(f"Looking for {len(needed_indices)} unique indices...")
    print("This will scan the entire file - may take several minutes...")
    mapping = {}
    needed_remaining = set(needed_indices.keys())
    with open(enhanced_csv, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(tqdm(reader, desc="Reading enhanced_dataset")):
            # The row index may live under several column names (pandas
            # writes unnamed index columns as '', 'Unnamed: 0', ...).
            # `or`-chaining takes the first NON-EMPTY value, so a present
            # but blank '' column falls through to the next candidate.
            raw_idx = row.get('') or row.get('Unnamed: 0.1') or row.get('Unnamed: 0')
            if not raw_idx:
                continue
            try:
                idx = int(raw_idx)
            except ValueError:
                continue
            if idx not in needed_remaining:
                continue
            mapping[idx] = {
                'repo_name': row.get('repo_name', ''),
                'path': row.get('path', ''),
                'language': row.get('language', ''),
            }
            needed_remaining.remove(idx)
            # Progress update every 1000 found
            if len(mapping) % 1000 == 0:
                print(f"Found {len(mapping)}/{len(needed_indices)} needed indices...")
            # Early exit once everything has been found
            if not needed_remaining:
                print(f"Found all needed indices at row {i}!")
                break
    print(f"Loaded metadata for {len(mapping)} indices")
    print(f"Missing: {len(needed_indices) - len(mapping)} indices")
    if needed_remaining:
        print(f"Example missing indices: {list(needed_remaining)[:10]}")
    return mapping
def enrich_programming_problems(input_jsonl, output_jsonl, metadata_mapping, row_to_original):
    """
    Write an enriched copy of the JSONL, adding repo metadata to each record.

    Args:
        input_jsonl: Path to the source JSONL (one JSON object per line,
            each with a 'row_number' field).
        output_jsonl: Path for the enriched output; every input record is
            written out, enriched or not.
        metadata_mapping: Dict from original_index to
            {'repo_name', 'path', 'language'}.
        row_to_original: Dict from row_number to original_index.

    Returns:
        Tuple of (matched_count, unmatched_count).
    """
    print("\nStep 3: Enriching JSONL file...")
    matched_count = 0
    unmatched_count = 0
    with open(input_jsonl, 'r', encoding='utf-8') as f_in, \
         open(output_jsonl, 'w', encoding='utf-8') as f_out:
        for line in tqdm(f_in, desc="Processing JSONL"):
            data = json.loads(line.strip())
            row_number = data.get('row_number')
            if row_number in row_to_original:
                original_index = row_to_original[row_number]
                if original_index in metadata_mapping:
                    enrichment = metadata_mapping[original_index]
                    # setdefault: records lacking a 'metadata' key would
                    # otherwise raise KeyError here.
                    metadata = data.setdefault('metadata', {})
                    metadata['repo_name'] = enrichment['repo_name']
                    metadata['path'] = enrichment['path']
                    metadata['language'] = enrichment['language']
                    matched_count += 1
                else:
                    unmatched_count += 1
            else:
                unmatched_count += 1
            # Unmatched records pass through unchanged.
            f_out.write(json.dumps(data, ensure_ascii=False) + '\n')
    return matched_count, unmatched_count
def main():
    """Run the three-step enrichment pipeline and print a summary.

    Returns:
        0 on completion (used as the process exit code).
    """
    enhanced_csv = 'enhanced_dataset.csv'
    function_csv = 'function_dataset_v2.csv'
    input_jsonl = 'programming_problems.jsonl'
    output_jsonl = 'programming_problems_enriched.jsonl'
    # Step 1: Determine what we need
    row_to_original, needed_indices = get_needed_original_indices(function_csv, input_jsonl)
    # Step 2: Load only what we need
    metadata_mapping = load_needed_metadata(enhanced_csv, needed_indices)
    # Step 3: Enrich the JSONL
    matched, unmatched = enrich_programming_problems(input_jsonl, output_jsonl,
                                                     metadata_mapping, row_to_original)
    total = matched + unmatched
    print(f"\n{'='*60}")
    print(f"✅ Enrichment complete!")
    print(f"{'='*60}")
    print(f"Output written to: {output_jsonl}")
    print(f"Matched: {matched}")
    print(f"Unmatched: {unmatched}")
    print(f"Total: {total}")
    # Guard: an empty input JSONL would make this a division by zero.
    if total:
        print(f"Match rate: {matched / total * 100:.1f}%")
    return 0
if __name__ == '__main__':
sys.exit(main())
|