| |
|
|
import json

import pandas as pd
|
|
|
|
def read_jsonl(filename):
    """Read a JSON Lines file into a list.

    Args:
        filename: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list holding the decoded JSON value of every non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as file:
        # Blank lines (for example a stray newline at the end of the file)
        # are not JSON documents; skip them instead of letting json.loads
        # raise on them.
        return [json.loads(line) for line in file if line.strip()]
|
|
| |
|
|
|
|
def write_jsonl(filename, lines):
    """Write JSON-serialisable objects to a JSON Lines file.

    The file is created or truncated, then each item is serialised with
    ``json.dumps`` onto its own newline-terminated line.

    Args:
        filename: Path of the UTF-8 encoded output file.
        lines: Iterable of JSON-serialisable objects (typically dicts).

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If an item cannot be serialised by ``json.dumps``.
    """
    with open(filename, "w", encoding="utf-8") as file:
        # Hand the whole stream to writelines instead of issuing one small
        # write call per record.
        file.writelines(json.dumps(line) + "\n" for line in lines)
|
|
|
|
# Columns of the validation parquet file that the conversion below reads.
_COLUMNS = ['name', 'cf_contest_id', 'cf_tags', 'difficulty',
            'description', 'public_tests', 'private_tests', 'generated_tests']

# Request only the needed columns so the remaining ones are never loaded
# into memory.
df = pd.read_parquet("./data/CodeContest/validation.parquet",
                     engine='pyarrow', columns=_COLUMNS)
# Keep the explicit selection so the column order always matches _COLUMNS.
df = df[_COLUMNS]
|
|
|
|
def get_test_cases(input, output):
    """Build one test-case record from an input/output pair.

    Args:
        input: The test input; stored as ``str(input)``.
        output: The expected output; stored as a one-element list holding
            ``str(output)``.

    Returns:
        A dict with the keys ``"input"`` (a string) and ``"output"`` (a list
        containing a single string).
    """
    # The parameter names shadow builtins but are kept as-is so existing
    # keyword callers keep working.
    return {
        "input": str(input),
        "output": [str(output)],
    }
|
|
|
|
# Convert every row of the dataframe into one JSONL record.
test_datasets = []

for _, row in df.iterrows():
    # The public tests become the record's sample I/O.
    public_test_cases = [
        get_test_cases(test_input, test_output)
        for test_input, test_output in zip(
            row['public_tests']['input'], row['public_tests']['output'])
    ]

    # Private tests come first, followed by the generated tests; together
    # they form the record's full test list.
    test_cases = []
    for column in ('private_tests', 'generated_tests'):
        test_cases.extend(
            get_test_cases(test_input, test_output)
            for test_input, test_output in zip(
                row[column]['input'], row[column]['output'])
        )

    # Cast every field to a plain Python type so json.dumps can serialise it.
    test = {
        "name": str(row['name']),
        "description": str(row['description']),
        "tags": list(row['cf_tags']),
        "difficulty": int(row['difficulty']),
        "id": int(row['cf_contest_id']),
        "sample_io": public_test_cases,
        "test_list": test_cases,
    }

    test_datasets.append(test)

write_jsonl("./data/CodeContest/Val.jsonl", test_datasets)
|
|