| | import os |
| | import pandas as pd |
| | from pathlib import Path |
| | from tqdm import tqdm |
| |
|
| | |
| |
|
| |
|
def process_csvs(folder_path, new_folder_name, chunk_size=10000):
    """Merge a folder's CSV files, sort by ``last_edit`` and re-split into chunks.

    Every ``*.csv`` file directly inside ``folder_path`` is read and
    concatenated, the rows are sorted by the ``last_edit`` column, and the
    result is written to ``new_folder_name`` in files of at most
    ``chunk_size`` rows each. Output files are named
    ``BitcoinForum_<board>_<ddmmyy>_to_<ddmmyy>.csv`` where ``<board>`` is the
    last component of ``folder_path`` and the dates are the first and last
    ``last_edit`` values of the chunk.

    Args:
        folder_path: Directory containing the CSV files of one board.
        new_folder_name: Output directory; created (with parents) if missing.
        chunk_size: Maximum number of rows per output file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        KeyError: If the combined data has no ``last_edit`` column.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # normpath drops a trailing separator so the board name is never empty.
    board = os.path.basename(os.path.normpath(folder_path))

    sorted_folder = Path(new_folder_name)
    sorted_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so the concatenation order (and therefore the order of rows with
    # equal ``last_edit``) does not depend on the directory listing order.
    all_files = sorted(
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if file.endswith(".csv")
    )
    if not all_files:
        # pd.concat raises ValueError on an empty list; nothing to do here.
        return

    combined_df = pd.concat(
        [pd.read_csv(file) for file in all_files], ignore_index=True
    )

    # A stable sort keeps rows with equal ``last_edit`` in file order, which
    # makes the chunk boundaries reproducible between runs.
    combined_df = combined_df.sort_values(by="last_edit", kind="mergesort")

    used_names = set()
    # Iterating a range gives tqdm a length, so the bar shows a total.
    for start in tqdm(range(0, len(combined_df), chunk_size)):
        chunk = combined_df.iloc[start : start + chunk_size]
        start_date = pd.to_datetime(chunk["last_edit"].iloc[0]).strftime("%d%m%y")
        end_date = pd.to_datetime(chunk["last_edit"].iloc[-1]).strftime("%d%m%y")

        stem = f"BitcoinForum_{board}_{start_date}_to_{end_date}"
        filename = f"{stem}.csv"
        # Several chunks can share the same start/end dates; add a counter
        # instead of silently overwriting an earlier chunk from this run.
        suffix = 1
        while filename in used_names:
            suffix += 1
            filename = f"{stem}_{suffix}.csv"
        used_names.add(filename)

        chunk.to_csv(os.path.join(sorted_folder, filename), index=False)
| |
|
| |
|
# Root directories to process; every sub-directory of a root is handed to
# process_csvs as one board.
folder_paths = ["./raw-data", "./preprocessed-data"]

for folder_path in folder_paths:
    # All boards of one root share a single "sorted-<root name>" output folder.
    folder_name = os.path.basename(folder_path)
    new_folder_name = "sorted-" + folder_name
    for folder in tqdm(os.listdir(folder_path)):
        board_dir = os.path.join(folder_path, folder)
        # Plain files sitting next to the board folders are skipped.
        if not os.path.isdir(board_dir):
            continue
        process_csvs(board_dir, new_folder_name)
| |
|