| """ |
| This is a utility script for use in sagemaker |
| """ |
|
|
| import json |
| import pandas as pd |
| import pyarrow as pa |
| import pyarrow.parquet as pq |
| import os |
| from tqdm import tqdm |
|
|
| |
| json_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/arxiv-metadata-oai-snapshot.json" |
| parquet_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/data/processed/arxiv_papers_raw.parquet.gzip" |
|
|
| |
| batch_size = 10000 |
|
|
| |
| parent_dir = os.path.dirname(parquet_file_path) |
| os.makedirs(parent_dir, exist_ok=True) |
|
|
| |
# Stream the snapshot line by line, flushing every `batch_size` records to the
# Parquet dataset so memory use stays bounded regardless of file size.
# The snapshot is UTF-8 JSON, so pin the encoding instead of relying on the
# locale default.
with open(json_file_path, "r", encoding="utf-8") as file:
    arxiv_data = []          # records accumulated for the current batch
    processed_count = 0      # total lines parsed so far

    for line in tqdm(file):
        arxiv_data.append(json.loads(line))
        processed_count += 1

        # A full batch: convert to an Arrow table, append it to the
        # dataset, and start a fresh batch.
        if processed_count % batch_size == 0:
            df = pd.DataFrame.from_records(arxiv_data)
            table = pa.Table.from_pandas(df)
            pq.write_to_dataset(table, root_path=parquet_file_path)
            arxiv_data = []

# Flush the final partial batch (the record count is almost never an exact
# multiple of batch_size).
if arxiv_data:
    df = pd.DataFrame.from_records(arxiv_data)
    # BUG FIX: the original passed `parquet_file_path` (a str) as the table
    # argument, so the trailing records were never written; convert the
    # remaining DataFrame to an Arrow table first, as the loop above does.
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table, root_path=parquet_file_path)
|
|