| import os |
| import shutil |
| from pathlib import Path |
| import re |
| from bs4 import BeautifulSoup |
|
|
|
|
def process_html_file(file_path, output_path):
    """Write a cleaned copy of one HTML file.

    The input is read as UTF-8 and two kinds of content are dropped:

    * every ``<h3>Prediction: ...</h3>`` heading (matched non-greedily and
      across line breaks), together with one directly following newline;
    * every ``<table>`` that contains a ``<td>`` whose string is exactly
      ``verification_result``.

    The result is written to ``output_path`` as UTF-8, in the form
    BeautifulSoup serializes the parsed document.

    Args:
        file_path: Location of the HTML file to read.
        output_path: Location the cleaned HTML is written to; its parent
            directory is expected to exist already.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_html = source.read()

    # Headings are stripped on the raw text, before any parsing happens.
    prediction_heading = re.compile(
        r'<h3>Prediction:.*?</h3>\n?', flags=re.DOTALL
    )
    stripped_html = prediction_heading.sub('', raw_html)

    document = BeautifulSoup(stripped_html, 'html.parser')

    # Removal is interleaved with the search (rather than collecting matches
    # first) so nested tables are handled in the same order as before.
    for candidate in document.find_all('table'):
        marker_cell = candidate.find('td', string='verification_result')
        if marker_cell is None:
            continue
        candidate.decompose()

    serialized = str(document)
    with open(output_path, 'w', encoding='utf-8') as target:
        target.write(serialized)
|
|
|
|
def process_directory(input_dir, output_dir):
    """Clean every ``.html`` file under ``input_dir`` into ``output_dir``.

    The input tree is walked recursively. Each file whose name ends in
    ``.html`` is passed to ``process_html_file`` and written to the same
    relative location below ``output_dir``; missing destination folders are
    created on demand. All other files are ignored.

    Args:
        input_dir: Root of the directory tree to scan.
        output_dir: Root under which the mirrored, cleaned files are written.
    """
    for current_dir, _subdirs, filenames in os.walk(input_dir):
        for name in filenames:
            if not name.endswith('.html'):
                continue

            source = Path(current_dir, name)
            # Re-root the file's path from the input tree onto the output tree.
            destination = Path(output_dir, source.relative_to(input_dir))

            destination.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(source, destination)
|
|
|
|
| |
# Root of the tree to read from, and root of the tree the cleaned copies go to.
input_directory, output_directory = "htmls_POS", "htmls_POS_mod2"

# Walk the input tree and write a cleaned copy of each HTML file.
process_directory(input_dir=input_directory, output_dir=output_directory)

print(
    "Processing complete for POS. Modified files are in the output directory."
)