# Spaces: Running on A10G
# File size: 579 Bytes
# 95cbc5b
import json
from pathlib import Path
def get_ids(file_path):
    """Collect the set of record identifiers found in a JSON Lines file.

    Each non-blank line is parsed as one JSON object. The identifier of a
    record is its ``commit_id`` value, falling back to ``sample_id`` when
    ``commit_id`` is absent, ``None`` or an empty string.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A ``set`` holding one identifier per record that has one. Records
        with no usable identifier contribute nothing.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ids = set()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not records; json.loads would raise on them.
                continue
            obj = json.loads(line)
            for key in ('commit_id', 'sample_id'):
                value = obj.get(key)
                # Only None / '' count as "missing", so a numeric ID of 0
                # is kept instead of being discarded as falsy.
                if value is not None and value != '':
                    ids.add(value)
                    break
            # Records with neither key are skipped: adding None would make
            # two unrelated files appear to share an ID.
    return ids
# Load the identifier sets of both dataset splits.
train_ids = get_ids('data/devign_train.jsonl')
test_ids = get_ids('data/devign_test.jsonl')

# Identifiers present in both splits indicate train/test leakage.
overlap = train_ids & test_ids

# Report the size of each split and of their intersection.
print(f"Train IDs: {len(train_ids)}")
print(f"Test IDs: {len(test_ids)}")
print(f"Overlap: {len(overlap)}")

# Show a small sample (at most five) of the shared identifiers, if any.
if overlap:
    print(f"Overlapping IDs: {list(overlap)[:5]}")
|