| | import os |
| | import re |
| | from pathlib import Path |
| | from bs4 import BeautifulSoup |
| |
|
| |
|
def process_html_file(file_path, output_path):
    """Trim the table reference from a page's statement heading and save it.

    Reads ``file_path`` as UTF-8 HTML and looks for the first ``<h3>`` that
    contains a ``<span>`` whose text is exactly ``Statement:``.  That heading
    is replaced by a fresh ``<h3><span>Statement:</span> ...</h3>`` in which
    everything from ``in the table:`` onwards has been removed.  The document,
    as re-serialised by BeautifulSoup, is then written to ``output_path``;
    when no matching heading exists it is written without any replacement.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path the processed HTML is written to (UTF-8).
    """
    label = 'Statement:'

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Locate the heading via its label span rather than by position.
    statement_tag = soup.find(
        lambda tag: tag.name == "h3" and tag.find("span", string=label)
    )

    if statement_tag:
        statement_text = statement_tag.get_text(strip=True)

        # get_text() also returns the label span's own text.  Drop it here,
        # otherwise the rebuilt heading would read "Statement: Statement:...".
        if statement_text.startswith(label):
            statement_text = statement_text[len(label):].lstrip()

        # DOTALL lets ".*" run across newlines so the whole tail is removed.
        new_statement = re.sub(
            r'\s*in the table:.*$', '', statement_text, flags=re.DOTALL
        )

        # Rebuild the heading: label span followed by the trimmed statement.
        new_h3 = soup.new_tag('h3')
        new_span = soup.new_tag('span')
        new_span.string = label
        new_h3.append(new_span)
        new_h3.append(f" {new_statement}")

        statement_tag.replace_with(new_h3)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))
| |
|
| |
|
def process_directory(input_dir, output_dir):
    """Run ``process_html_file`` over the HTML files of each known subfolder.

    For each of the subfolders ``TP``, ``TN``, ``FP`` and ``FN`` under
    ``input_dir``, every ``*.html`` file directly inside it is processed into
    the same-named subfolder of ``output_dir`` (created on demand).  A
    subfolder missing from ``input_dir`` is reported and skipped.

    Args:
        input_dir: Directory containing the input subfolders.
        output_dir: Directory that receives the processed files.
    """
    source_root = Path(input_dir)
    target_root = Path(output_dir)

    for category in ['TP', 'TN', 'FP', 'FN']:
        source_folder = source_root / category
        if not source_folder.exists():
            print(f"Warning: {source_folder} does not exist. Skipping.")
            continue

        target_folder = target_root / category
        target_folder.mkdir(parents=True, exist_ok=True)

        for html_path in source_folder.glob('*.html'):
            destination = target_folder / html_path.name
            process_html_file(html_path, destination)
            print(f"Processed: {html_path} -> {destination}")
| |
|
| |
|
| | |
# Source tree with the TP/TN/FP/FN subfolders, and the tree that receives the
# processed copies.
input_directory, output_directory = "htmls_DATER_mod", "htmls_DATER_mod2"

# Process every subfolder, then report completion.
process_directory(input_dir=input_directory, output_dir=output_directory)
print("Processing complete. Modified files are in the output directory.")