| import argparse |
| import json |
| import os |
| import re |
| import time |
| import uuid |
| from datetime import datetime |
|
|
| import pandas as pd |
|
|
| from tools.aws_functions import download_file_from_s3, export_outputs_to_s3 |
| from tools.config import ( |
| ACCESS_LOGS_FOLDER, |
| ALLOW_LIST_PATH, |
| AWS_ACCESS_KEY, |
| AWS_LLM_PII_OPTION, |
| AWS_PII_OPTION, |
| AWS_REGION, |
| AWS_SECRET_KEY, |
| AZURE_OPENAI_API_KEY, |
| AZURE_OPENAI_INFERENCE_ENDPOINT, |
| CHOSEN_COMPREHEND_ENTITIES, |
| CHOSEN_LLM_ENTITIES, |
| CHOSEN_LLM_PII_INFERENCE_METHOD, |
| CHOSEN_REDACT_ENTITIES, |
| CLOUD_LLM_PII_MODEL_CHOICE, |
| CLOUD_VLM_MODEL_CHOICE, |
| COMPRESS_REDACTED_PDF, |
| CUSTOM_ENTITIES, |
| DEFAULT_COMBINE_PAGES, |
| DEFAULT_COST_CODE, |
| DEFAULT_DUPLICATE_DETECTION_THRESHOLD, |
| DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, |
| DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX, |
| DEFAULT_INFERENCE_SERVER_PII_MODEL, |
| DEFAULT_INFERENCE_SERVER_VLM_MODEL, |
| DEFAULT_LANGUAGE, |
| DEFAULT_LOCAL_OCR_MODEL, |
| DEFAULT_MIN_CONSECUTIVE_PAGES, |
| DEFAULT_MIN_WORD_COUNT, |
| DEFAULT_TABULAR_ANONYMISATION_STRATEGY, |
| DENY_LIST_PATH, |
| DIRECT_MODE_DEFAULT_USER, |
| DISPLAY_FILE_NAMES_IN_LOGS, |
| DO_INITIAL_TABULAR_DATA_CLEAN, |
| DOCUMENT_REDACTION_BUCKET, |
| EFFICIENT_OCR, |
| EFFICIENT_OCR_MIN_EMBEDDED_IMAGE_PX, |
| EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION, |
| EFFICIENT_OCR_MIN_WORDS, |
| FEEDBACK_LOGS_FOLDER, |
| FULL_COMPREHEND_ENTITY_LIST, |
| FULL_ENTITY_LIST, |
| FULL_LLM_ENTITY_LIST, |
| GEMINI_API_KEY, |
| GRADIO_TEMP_DIR, |
| HYBRID_TEXTRACT_BEDROCK_VLM, |
| IMAGES_DPI, |
| INFERENCE_SERVER_API_URL, |
| INFERENCE_SERVER_PII_OPTION, |
| INPUT_FOLDER, |
| LLM_MAX_NEW_TOKENS, |
| LLM_PII_INFERENCE_METHODS, |
| LLM_TEMPERATURE, |
| LOCAL_OCR_MODEL_OPTIONS, |
| LOCAL_PII_OPTION, |
| LOCAL_TRANSFORMERS_LLM_PII_OPTION, |
| OCR_FIRST_PASS_MAX_WORKERS, |
| OUTPUT_FOLDER, |
| OVERWRITE_EXISTING_OCR_RESULTS, |
| PADDLE_MODEL_PATH, |
| PREPROCESS_LOCAL_OCR_IMAGES, |
| REMOVE_DUPLICATE_ROWS, |
| RETURN_REDACTED_PDF, |
| RUN_AWS_FUNCTIONS, |
| S3_OUTPUTS_BUCKET, |
| S3_OUTPUTS_FOLDER, |
| S3_USAGE_LOGS_FOLDER, |
| SAVE_LOGS_TO_CSV, |
| SAVE_LOGS_TO_DYNAMODB, |
| SAVE_OUTPUTS_TO_S3, |
| SAVE_PAGE_OCR_VISUALISATIONS, |
| SESSION_OUTPUT_FOLDER, |
| SPACY_MODEL_PATH, |
| SUMMARY_PAGE_GROUP_MAX_WORKERS, |
| TEXTRACT_JOBS_LOCAL_LOC, |
| TEXTRACT_JOBS_S3_LOC, |
| TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, |
| TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, |
| TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, |
| USAGE_LOGS_FOLDER, |
| USE_GREEDY_DUPLICATE_DETECTION, |
| WHOLE_PAGE_REDACTION_LIST_PATH, |
| convert_string_to_boolean, |
| ) |
|
|
|
|
def _generate_session_hash() -> str:
    """Return a short random identifier used to tag a session in logs.

    Returns:
        The first eight hexadecimal characters of a freshly generated UUID4.
    """
    # UUID.hex is the dash-free form; its first eight characters are the same
    # as the first eight characters of str(uuid).
    return uuid.uuid4().hex[:8]
|
|
|
|
def _sanitize_folder_name(folder_name: str, max_length: int = 50) -> str:
    """Make a folder name safe to use as part of an S3 key or local path.

    Every character other than an ASCII letter, digit, dot, underscore or
    dash is replaced with an underscore, and the result is cut down to at
    most ``max_length`` characters.

    Args:
        folder_name: Name to clean up. Falsy values (``""``, ``None``) are
            handed back unchanged.
        max_length: Upper bound on the length of the returned name
            (default: 50).

    Returns:
        The cleaned, possibly truncated, folder name.
    """
    if folder_name:
        cleaned = re.sub(r"[^a-zA-Z0-9._-]", "_", folder_name)
        # Only slice when the name is actually too long.
        return cleaned[:max_length] if len(cleaned) > max_length else cleaned

    return folder_name
|
|
|
|
def get_username_and_folders(
    username: str = "",
    output_folder_textbox: str = OUTPUT_FOLDER,
    input_folder_textbox: str = INPUT_FOLDER,
    session_output_folder: bool = SESSION_OUTPUT_FOLDER,
    textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
    textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
    s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
    local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
) -> tuple[str, str, str, str, str, str, str, str]:
    """Resolve the session identifier and the folders used for this run.

    The session identifier is ``username`` when one is given, otherwise a
    freshly generated short hash. When ``session_output_folder`` is truthy, a
    sanitized form of that identifier is appended to every folder argument so
    each session works in its own subfolder; otherwise the folders are used
    as passed in. The local output and input folders are created if they do
    not exist yet.

    Args:
        username: Session/user name; empty means "generate a random hash".
        output_folder_textbox: Base local output folder. It is concatenated
            directly with the session name, so it should end with a path
            separator.
        input_folder_textbox: Base local input folder (same convention as
            ``output_folder_textbox``).
        session_output_folder: Whether to use per-session subfolders.
        textract_document_upload_input_folder: Base Textract upload input
            folder; ``"/" + session`` is appended in per-session mode.
        textract_document_upload_output_folder: Base Textract upload output
            folder; ``"/" + session`` is appended in per-session mode.
        s3_textract_document_logs_subfolder: Base S3 Textract job-log folder;
            ``"/" + session`` is appended in per-session mode.
        local_textract_document_logs_subfolder: Base local Textract job-log
            folder; ``"/" + session + "/"`` is appended in per-session mode.

    Returns:
        An 8-tuple of
        ``(session_id, output_folder, session_id, input_folder,
        textract_upload_input_folder, textract_upload_output_folder,
        s3_textract_logs_subfolder, local_textract_logs_subfolder)``.
        The session id is returned unsanitized and appears twice; callers
        unpack the tuple positionally, so that shape is kept as-is.
    """
    out_session_hash = username if username else _generate_session_hash()

    # Only the folder paths use the sanitized form; the returned id is raw.
    sanitized_session_hash = _sanitize_folder_name(out_session_hash)

    if session_output_folder:
        output_folder = output_folder_textbox + sanitized_session_hash + "/"
        input_folder = input_folder_textbox + sanitized_session_hash + "/"

        textract_document_upload_input_folder = (
            textract_document_upload_input_folder + "/" + sanitized_session_hash
        )
        textract_document_upload_output_folder = (
            textract_document_upload_output_folder + "/" + sanitized_session_hash
        )

        s3_textract_document_logs_subfolder = (
            s3_textract_document_logs_subfolder + "/" + sanitized_session_hash
        )
        local_textract_document_logs_subfolder = (
            local_textract_document_logs_subfolder + "/" + sanitized_session_hash + "/"
        )
    else:
        output_folder = output_folder_textbox
        input_folder = input_folder_textbox

    for folder in (output_folder, input_folder):
        # The existence guard keeps the original tolerance for a path that
        # already exists. makedirs (unlike mkdir) also creates missing parent
        # folders, and exist_ok=True avoids a crash if another process creates
        # the folder between the check and the call.
        if not os.path.exists(folder):
            os.makedirs(folder, exist_ok=True)

    return (
        out_session_hash,
        output_folder,
        out_session_hash,
        input_folder,
        textract_document_upload_input_folder,
        textract_document_upload_output_folder,
        s3_textract_document_logs_subfolder,
        local_textract_document_logs_subfolder,
    )
|
|
|
|
def _get_env_list(env_var_name: str) -> list[str]:
    """Parse a list-like string into a list of stripped, non-empty strings.

    Despite the parameter name, ``env_var_name`` is the *value* to parse; no
    environment lookup happens here. The expected form is a bracketed,
    comma-separated list such as ``"['A', 'B']"``, but a plain
    comma-separated string (``"A, B"``) is accepted as well.

    Args:
        env_var_name: The raw string value to parse.

    Returns:
        The individual items with surrounding whitespace and all single and
        double quote characters removed. Empty input gives an empty list.
    """
    if not env_var_name:
        return []

    value = env_var_name.strip()

    # Drop one pair of enclosing brackets only when it is really there;
    # slicing unconditionally would eat the first and last characters of an
    # unbracketed value such as "PERSON, EMAIL".
    if len(value) >= 2 and value[0] + value[-1] in ("[]", "()", "{}"):
        value = value[1:-1]

    value = value.strip().replace('"', "").replace("'", "")
    if not value:
        return []

    return [item.strip() for item in value.split(",") if item.strip()]
|
|
|
|
def _download_s3_file_if_needed(
    file_path: str, default_filename: str = "downloaded_file"
) -> str:
    """Return a local path for ``file_path``, downloading it from S3 if needed.

    A path that (after stripping whitespace) starts with ``s3://`` in any
    letter case is treated as ``s3://<bucket>/<key>`` and downloaded into
    ``GRADIO_TEMP_DIR`` (or ``INPUT_FOLDER`` when that is not set). Any other
    value, including an empty one, is returned untouched.

    Args:
        file_path: Local file path or S3 URL.
        default_filename: Local file name to use when no name can be derived
            from the S3 key (for example when the key ends with ``/``).

    Returns:
        The original ``file_path`` for non-S3 input, otherwise the path of
        the downloaded local copy.

    Raises:
        Exception: If the S3 download fails; the underlying error is attached
            as the cause.
    """
    if not file_path:
        return file_path

    s3_url = file_path.strip()
    if not s3_url.upper().startswith("S3://"):
        return file_path

    temp_dir = GRADIO_TEMP_DIR if GRADIO_TEMP_DIR else INPUT_FOLDER
    os.makedirs(temp_dir, exist_ok=True)

    # The scheme check above guarantees that "://" is present.
    s3_path = s3_url.split("://", 1)[1]

    # Split "<bucket>/<key>"; a URL with no "/" has a bucket but no key.
    if "/" in s3_path:
        bucket_name_s3, s3_key = s3_path.split("/", 1)
    else:
        bucket_name_s3, s3_key = s3_path, ""

    # Name the local copy after the last key component, falling back to the
    # bucket name (no key) and finally to the supplied default.
    filename = os.path.basename(s3_key) if s3_key else bucket_name_s3
    if not filename:
        filename = default_filename

    local_file_path = os.path.join(temp_dir, filename)

    try:
        download_file_from_s3(
            bucket_name=bucket_name_s3,
            key=s3_key,
            local_file_path_and_name=local_file_path,
        )
    except Exception as e:
        print(f"Error downloading file from S3 ({file_path}): {e}")
        # Chain explicitly so the original failure is kept as __cause__.
        raise Exception(f"Failed to download file from S3: {e}") from e

    print(f"S3 file downloaded successfully: {file_path} -> {local_file_path}")
    return local_file_path
|
|
|
|
def _build_s3_output_folder(
    s3_outputs_folder: str,
    session_hash: str,
    save_to_user_folders: bool,
) -> str:
    """Compose the S3 key prefix that outputs are uploaded under.

    The result has the shape ``<base>/[<session>/]<YYYYMMDD>/``: the base
    folder without trailing slashes, an optional sanitized session segment,
    and today's date, always ending with a single ``/``.

    Args:
        s3_outputs_folder: Base S3 folder (key prefix). A falsy value yields
            an empty string.
        session_hash: Session hash/username for the per-user segment.
        save_to_user_folders: Whether to add the per-user segment; it is only
            added when ``session_hash`` is also non-empty.

    Returns:
        The final S3 folder path, or ``""`` when no base folder was given.
    """
    if not s3_outputs_folder:
        return ""

    segments = [s3_outputs_folder.rstrip("/")]

    if save_to_user_folders and session_hash:
        segments.append(_sanitize_folder_name(session_hash))

    # Date suffix keeps each day's uploads in a separate folder.
    segments.append(datetime.now().strftime("%Y%m%d"))

    return "/".join(segments) + "/"
|
|
|
|
| |
# Add the custom entity labels to both AWS Comprehend entity lists.
# NOTE: extend() changes the imported objects in place, so any other code
# holding a reference to these same objects sees the added entries too.
CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES)

# Lower-case module-level aliases for the configured entity selections; the
# argument parser below uses them for `choices`, `default` and help text.
# Local detection entities:
chosen_redact_entities, full_entity_list = (
    CHOSEN_REDACT_ENTITIES,
    FULL_ENTITY_LIST,
)
# AWS Comprehend entities (already including the custom entries added above):
chosen_comprehend_entities, full_comprehend_entity_list = (
    CHOSEN_COMPREHEND_ENTITIES,
    FULL_COMPREHEND_ENTITY_LIST,
)
# LLM-based detection entities:
chosen_llm_entities, full_llm_entity_list = (
    CHOSEN_LLM_ENTITIES,
    FULL_LLM_ENTITY_LIST,
)
# Default for the --handwrite_signature_extraction option:
default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX
|
|
|
|
| |
|
|
|
|
| def build_cli_argument_parser() -> argparse.ArgumentParser: |
| """Build the CLI ArgumentParser (shared by main(), Agent API, and tests).""" |
| parser = argparse.ArgumentParser( |
| description="A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.", |
| formatter_class=argparse.RawTextHelpFormatter, |
| epilog=""" |
| Examples: |
| |
| To run these, you need to do the following: |
| |
| - Open a terminal window |
| |
| - CD to the app folder that contains this file (cli_redact.py) |
| |
| - Load the virtual environment using either conda or venv depending on your setup |
| |
| - Run one of the example commands below |
| |
| - Look in the output/ folder to see output files: |
| |
| # Redaction |
| |
| ## Redact a PDF with default settings (local OCR): |
| python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf |
| |
| ## Extract text from a PDF only (i.e. no redaction), using local OCR: |
| python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None |
| |
| ## Extract text from a PDF only (i.e. no redaction), using local OCR, with a whole page redaction list: |
| python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM |
| |
| ## Redact a PDF with allow list (local OCR) and custom list of redaction entities: |
| python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME |
| |
| ## Redact a PDF with limited pages and text extraction method (local text) with custom fuzzy matching: |
| python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3 |
| |
| ## Redaction with custom deny list, allow list, and whole page redaction list: |
| python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv |
| |
| ## Redact an image: |
| python cli_redact.py --input_file example_data/example_complaint_letter.jpg |
| |
| ## Anonymise csv file with specific columns: |
| python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted |
| |
| ## Anonymise csv file with a different strategy (remove text completely): |
| python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact |
| |
| ## Anonymise Excel file, remove text completely: |
| python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact |
| |
| ## Anonymise a word document: |
| python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted |
| |
| # Redaction with AWS services: |
| |
| ## Use Textract and Comprehend: |
| python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend" |
| |
| # LLM PII identification (entity subset and custom instructions) |
| |
| ## Redact with LLM PII entity subset (NAME, EMAIL_ADDRESS, etc.) and custom instructions: |
| python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --llm_redact_entities NAME EMAIL_ADDRESS PHONE_NUMBER ADDRESS CUSTOM --custom_llm_instructions "Do not redact the name of the university." |
| |
| ## Redact with custom LLM instructions only (use default LLM entities from config): |
| python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --custom_llm_instructions "Redact all company names with the label COMPANY_NAME." |
| |
| ## Redact specific pages with AWS OCR and signature extraction: |
| python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures" |
| |
| ## Redact with AWS OCR and additional layout extraction options: |
| python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout |
| |
| # Duplicate page detection |
| |
| ## Find duplicate pages in OCR files: |
| python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 |
| |
| ## Find duplicate in OCR files at the line level: |
| python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3 |
| |
| ## Find duplicate rows in tabular data: |
| python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95 |
| |
| # AWS Textract whole document analysis |
| |
| ## Submit document to Textract for basic text analysis: |
| python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf |
| |
| ## Submit document to Textract for analysis with signature extraction (Job ID will be printed to the console, you need this to retrieve the results): |
| python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures |
| |
| ## Retrieve Textract results by job ID (returns a .json file output): |
| python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012 |
| |
| ## List recent Textract jobs: |
| python cli_redact.py --task textract --textract_action list |
| |
| # Document summarisation |
| |
| # Summarise from a PDF with AWS Bedrock |
| python cli_redact.py --task summarise --input_file example_data/example_data/Partnership-Agreement-Toolkit_0_0.pdf --summarisation_inference_method "LLM (AWS Bedrock)" |
| |
| ## Summarise document(s) from OCR output CSV(s) using AWS Bedrock: |
| python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "LLM (AWS Bedrock)" |
| |
| ## Summarise with local LLM and detailed format: |
| python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_inference_method "Local transformers LLM" --summarisation_format detailed |
| |
| ## Summarise with additional context and instructions (concise format): |
| python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv --summarisation_context "This is a partnership agreement" --summarisation_additional_instructions "Focus on key obligations and termination clauses" --summarisation_format concise |
| |
| ## Summarise multiple OCR CSV files: |
| python cli_redact.py --task summarise --input_file example_data/example_outputs/Partnership-Agreement-Toolkit_0_0.pdf_ocr_output.csv example_data/example_outputs/example_of_emails_sent_to_a_professor_before_applying_ocr_output_textract.csv --summarisation_inference_method "LLM (AWS Bedrock)" |
| |
| # Combine review PDFs |
| |
| ## Merge redaction comments from multiple '_redactions_for_review' PDFs into one file: |
| python cli_redact.py --task combine_review_pdfs --input_file path/to/review1.pdf path/to/review2.pdf --output_dir output/ |
| |
| """, |
| ) |
|
|
| |
| task_group = parser.add_argument_group("Task Selection") |
| task_group.add_argument( |
| "--task", |
| choices=[ |
| "redact", |
| "deduplicate", |
| "textract", |
| "summarise", |
| "combine_review_pdfs", |
| "export_review_redaction_overlay", |
| "export_review_page_ocr_visualisation", |
| ], |
| default="redact", |
| help="Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), textract (AWS Textract batch operations), summarise (LLM-based document summarisation from OCR CSV files), combine_review_pdfs (merge redaction comments from multiple '_redactions_for_review' PDFs into one file), export_review_redaction_overlay (write a redaction overlay JPEG for a page image + boxes JSON), or export_review_page_ocr_visualisation (write an OCR visualisation PNG for a page image + OCR JSON).", |
| ) |
|
|
| |
| general_group = parser.add_argument_group("General Options") |
| general_group.add_argument( |
| "--input_file", |
| nargs="+", |
| help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.", |
| ) |
| general_group.add_argument( |
| "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files." |
| ) |
| general_group.add_argument( |
| "--input_dir", default=INPUT_FOLDER, help="Directory for all input files." |
| ) |
|
|
| export_group = parser.add_argument_group( |
| "Review export (page image visualisations)" |
| ) |
| export_group.add_argument( |
| "--page_image_path", |
| default="", |
| help="Path to a single page raster image (PNG/JPG) used as underlay for export tasks.", |
| ) |
| export_group.add_argument( |
| "--page_number", |
| type=int, |
| default=1, |
| help="1-based page number (used for naming).", |
| ) |
| export_group.add_argument( |
| "--doc_base_name", |
| default="review", |
| help="Basename for output file naming (e.g. document name without extension).", |
| ) |
| export_group.add_argument( |
| "--boxes_json_path", |
| default="", |
| help="Path to JSON file containing a list of annotator-style boxes for overlay export.", |
| ) |
| export_group.add_argument( |
| "--review_df_json_path", |
| default="", |
| help="Optional path to JSON file containing review dataframe records (list[dict]) for stable label ordering/pattern mapping.", |
| ) |
| export_group.add_argument( |
| "--label_abbrev_chars", |
| type=int, |
| default=-1, |
| help="Optional override: draw N leading label characters on overlay image (use -1 to use config default).", |
| ) |
| export_group.add_argument( |
| "--ocr_results_json_path", |
| default="", |
| help="Path to JSON file containing OCR-with-words results dict for OCR visualisation export.", |
| ) |
| general_group.add_argument( |
| "--language", default=DEFAULT_LANGUAGE, help="Language of the document content." |
| ) |
| general_group.add_argument( |
| "--allow_list", |
| default=ALLOW_LIST_PATH, |
| help="Path to a CSV file with words to exclude from redaction.", |
| ) |
| general_group.add_argument( |
| "--pii_detector", |
| choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"], |
| default=LOCAL_PII_OPTION, |
| help="Core PII detection method (Local or AWS Comprehend, or None).", |
| ) |
| general_group.add_argument( |
| "--username", default=DIRECT_MODE_DEFAULT_USER, help="Username for the session." |
| ) |
| general_group.add_argument( |
| "--save_to_user_folders", |
| default=SESSION_OUTPUT_FOLDER, |
| help="Whether to save to user folders or not.", |
| ) |
|
|
| general_group.add_argument( |
| "--local_redact_entities", |
| nargs="+", |
| choices=full_entity_list, |
| default=chosen_redact_entities, |
| help=f"Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.", |
| ) |
|
|
| general_group.add_argument( |
| "--aws_redact_entities", |
| nargs="+", |
| choices=full_comprehend_entity_list, |
| default=chosen_comprehend_entities, |
| help=f"AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.", |
| ) |
|
|
| general_group.add_argument( |
| "--aws_access_key", default=AWS_ACCESS_KEY, help="Your AWS Access Key ID." |
| ) |
| general_group.add_argument( |
| "--aws_secret_key", default=AWS_SECRET_KEY, help="Your AWS Secret Access Key." |
| ) |
| general_group.add_argument( |
| "--cost_code", default=DEFAULT_COST_CODE, help="Cost code for tracking usage." |
| ) |
| general_group.add_argument( |
| "--aws_region", default=AWS_REGION, help="AWS region for cloud services." |
| ) |
| general_group.add_argument( |
| "--s3_bucket", |
| default=DOCUMENT_REDACTION_BUCKET, |
| help="S3 bucket name for cloud operations.", |
| ) |
| general_group.add_argument( |
| "--save_outputs_to_s3", |
| default=SAVE_OUTPUTS_TO_S3, |
| help="Upload output files (redacted PDFs, anonymized documents, etc.) to S3 after processing.", |
| ) |
| general_group.add_argument( |
| "--s3_outputs_folder", |
| default=S3_OUTPUTS_FOLDER, |
| help="S3 folder (key prefix) for saving output files. If left blank, outputs will not be uploaded even if --save_outputs_to_s3 is enabled.", |
| ) |
| general_group.add_argument( |
| "--s3_outputs_bucket", |
| default=S3_OUTPUTS_BUCKET, |
| help="S3 bucket name for output files (defaults to --s3_bucket if not specified).", |
| ) |
| general_group.add_argument( |
| "--do_initial_clean", |
| default=DO_INITIAL_TABULAR_DATA_CLEAN, |
| help="Perform initial text cleaning for tabular data.", |
| ) |
| general_group.add_argument( |
| "--save_logs_to_csv", |
| default=SAVE_LOGS_TO_CSV, |
| help="Save processing logs to CSV files.", |
| ) |
| general_group.add_argument( |
| "--save_logs_to_dynamodb", |
| default=SAVE_LOGS_TO_DYNAMODB, |
| help="Save processing logs to DynamoDB.", |
| ) |
| general_group.add_argument( |
| "--display_file_names_in_logs", |
| default=DISPLAY_FILE_NAMES_IN_LOGS, |
| help="Include file names in log outputs.", |
| ) |
| general_group.add_argument( |
| "--upload_logs_to_s3", |
| default=RUN_AWS_FUNCTIONS, |
| help="Upload log files to S3 after processing.", |
| ) |
| general_group.add_argument( |
| "--s3_logs_prefix", |
| default=S3_USAGE_LOGS_FOLDER, |
| help="S3 prefix for usage log files.", |
| ) |
| general_group.add_argument( |
| "--feedback_logs_folder", |
| default=FEEDBACK_LOGS_FOLDER, |
| help="Directory for feedback log files.", |
| ) |
| general_group.add_argument( |
| "--access_logs_folder", |
| default=ACCESS_LOGS_FOLDER, |
| help="Directory for access log files.", |
| ) |
| general_group.add_argument( |
| "--usage_logs_folder", |
| default=USAGE_LOGS_FOLDER, |
| help="Directory for usage log files.", |
| ) |
| general_group.add_argument( |
| "--paddle_model_path", |
| default=PADDLE_MODEL_PATH, |
| help="Directory for PaddleOCR model storage.", |
| ) |
| general_group.add_argument( |
| "--spacy_model_path", |
| default=SPACY_MODEL_PATH, |
| help="Directory for spaCy model storage.", |
| ) |
|
|
| |
| pdf_group = parser.add_argument_group( |
| "PDF/Image Redaction Options (.pdf, .png, .jpg)" |
| ) |
| pdf_group.add_argument( |
| "--ocr_method", |
| choices=["AWS Textract", "Local OCR", "Local text"], |
| default="Local OCR", |
| help="OCR method for text extraction from images.", |
| ) |
| pdf_group.add_argument( |
| "--page_min", type=int, default=0, help="First page to redact." |
| ) |
| pdf_group.add_argument( |
| "--page_max", type=int, default=0, help="Last page to redact." |
| ) |
| pdf_group.add_argument( |
| "--images_dpi", |
| type=float, |
| default=float(IMAGES_DPI), |
| help="DPI for image processing.", |
| ) |
| pdf_group.add_argument( |
| "--chosen_local_ocr_model", |
| choices=LOCAL_OCR_MODEL_OPTIONS, |
| default=DEFAULT_LOCAL_OCR_MODEL, |
| help="Local OCR model to use.", |
| ) |
| pdf_group.add_argument( |
| "--preprocess_local_ocr_images", |
| default=PREPROCESS_LOCAL_OCR_IMAGES, |
| help="Preprocess images before OCR.", |
| ) |
| pdf_group.add_argument( |
| "--compress_redacted_pdf", |
| default=COMPRESS_REDACTED_PDF, |
| help="Compress the final redacted PDF.", |
| ) |
| pdf_group.add_argument( |
| "--return_pdf_end_of_redaction", |
| default=RETURN_REDACTED_PDF, |
| help="Return PDF at end of redaction process.", |
| ) |
| post_redact_qa_group = pdf_group.add_mutually_exclusive_group() |
| post_redact_qa_group.add_argument( |
| "--post-redact-pass1-qa", |
| dest="post_redact_pass1_qa", |
| action="store_true", |
| help="Enable post-redaction Pass 1 QA for this run (overrides POST_REDACT_PASS1_QA env).", |
| ) |
| post_redact_qa_group.add_argument( |
| "--no-post-redact-pass1-qa", |
| dest="post_redact_pass1_qa", |
| action="store_false", |
| help="Disable post-redaction Pass 1 QA for this run.", |
| ) |
| post_redact_prune_group = pdf_group.add_mutually_exclusive_group() |
| post_redact_prune_group.add_argument( |
| "--post-redact-pass1-auto-prune", |
| dest="post_redact_pass1_auto_prune", |
| action="store_true", |
| help="Write sibling pruned review CSV during Pass 1 QA (overrides POST_REDACT_PASS1_AUTO_PRUNE env).", |
| ) |
| post_redact_prune_group.add_argument( |
| "--no-post-redact-pass1-auto-prune", |
| dest="post_redact_pass1_auto_prune", |
| action="store_false", |
| help="Disable auto-prune during Pass 1 QA for this run.", |
| ) |
| pdf_group.set_defaults( |
| post_redact_pass1_qa=None, |
| post_redact_pass1_auto_prune=None, |
| ) |
| pdf_group.add_argument( |
| "--deny_list_file", |
| default=DENY_LIST_PATH, |
| help="Custom words file to recognize for redaction.", |
| ) |
| pdf_group.add_argument( |
| "--allow_list_file", |
| default=ALLOW_LIST_PATH, |
| help="Custom words file to recognize for redaction.", |
| ) |
| pdf_group.add_argument( |
| "--redact_whole_page_file", |
| default=WHOLE_PAGE_REDACTION_LIST_PATH, |
| help="File for pages to redact completely.", |
| ) |
| pdf_group.add_argument( |
| "--handwrite_signature_extraction", |
| nargs="+", |
| default=default_handwrite_signature_checkbox, |
| help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".', |
| ) |
| pdf_group.add_argument( |
| "--extract_forms", |
| action="store_true", |
| help="Extract forms during Textract analysis.", |
| ) |
| pdf_group.add_argument( |
| "--extract_tables", |
| action="store_true", |
| help="Extract tables during Textract analysis.", |
| ) |
| pdf_group.add_argument( |
| "--extract_layout", |
| action="store_true", |
| help="Extract layout during Textract analysis.", |
| ) |
| pdf_group.add_argument( |
| "--vlm_model_choice", |
| default=CLOUD_VLM_MODEL_CHOICE, |
| help="VLM model choice for OCR (e.g., 'qwen.qwen3-vl-235b-a22b' for Bedrock, or model name for other providers).", |
| ) |
| pdf_group.add_argument( |
| "--inference_server_vlm_model", |
| default=DEFAULT_INFERENCE_SERVER_VLM_MODEL, |
| help="Inference server VLM model name for OCR.", |
| ) |
| pdf_group.add_argument( |
| "--inference_server_api_url", |
| default=INFERENCE_SERVER_API_URL, |
| help="Inference server API URL.", |
| ) |
| pdf_group.add_argument( |
| "--gemini_api_key", |
| default=GEMINI_API_KEY, |
| help="Google Gemini API key for VLM OCR.", |
| ) |
| pdf_group.add_argument( |
| "--azure_openai_api_key", |
| default=AZURE_OPENAI_API_KEY, |
| help="Azure OpenAI API key for VLM OCR.", |
| ) |
| pdf_group.add_argument( |
| "--azure_openai_endpoint", |
| default=AZURE_OPENAI_INFERENCE_ENDPOINT, |
| help="Azure OpenAI endpoint URL for VLM OCR.", |
| ) |
| pdf_group.add_argument( |
| "--efficient_ocr", |
| action="store_true", |
| default=None, |
| help="Use efficient OCR: try selectable text first per page, run OCR only when needed (saves time/cost). Defaults to EFFICIENT_OCR config.", |
| ) |
| pdf_group.add_argument( |
| "--no_efficient_ocr", |
| action="store_false", |
| dest="efficient_ocr", |
| help="Disable efficient OCR (use selected OCR method for all pages).", |
| ) |
| pdf_group.add_argument( |
| "--efficient_ocr_min_words", |
| type=int, |
| default=None, |
| metavar="N", |
| help="Minimum words on a page to use text-only route; below this use OCR. Defaults to EFFICIENT_OCR_MIN_WORDS config (e.g. 20).", |
| ) |
| pdf_group.add_argument( |
| "--efficient_ocr_min_image_coverage_fraction", |
| type=float, |
| default=None, |
| metavar="F", |
| help="Efficient OCR: min fraction of page area (0-1) for an embedded image to force OCR; 0 disables. Defaults to EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION config (e.g. 0.03).", |
| ) |
| pdf_group.add_argument( |
| "--efficient_ocr_min_embedded_image_px", |
| type=int, |
| default=None, |
| metavar="N", |
| help="Efficient OCR: min width and height (PDF points, ~px at 72 dpi) for an embedded image placement to count toward image-based OCR routing; 0 disables. Defaults to EFFICIENT_OCR_MIN_EMBEDDED_IMAGE_PX config (e.g. 10).", |
| ) |
| pdf_group.add_argument( |
| "--ocr_first_pass_max_workers", |
| type=int, |
| default=None, |
| metavar="N", |
| help="Max threads for OCR first pass (1 = sequential). Defaults to OCR_FIRST_PASS_MAX_WORKERS config (e.g. 3).", |
| ) |
| pdf_group.add_argument( |
| "--hybrid_textract_bedrock_vlm", |
| action="store_true", |
| default=None, |
| help="When using AWS Textract, re-run low-confidence lines with Bedrock VLM for higher quality. Defaults to HYBRID_TEXTRACT_BEDROCK_VLM config.", |
| ) |
| pdf_group.add_argument( |
| "--no_hybrid_textract_bedrock_vlm", |
| action="store_false", |
| dest="hybrid_textract_bedrock_vlm", |
| help="Disable hybrid Textract + Bedrock VLM (use Textract only).", |
| ) |
| pdf_group.add_argument( |
| "--overwrite_existing_ocr_results", |
| action="store_true", |
| default=None, |
| help="Ignore cached OCR JSON files and re-run OCR. Defaults to OVERWRITE_EXISTING_OCR_RESULTS config (e.g. False).", |
| ) |
| pdf_group.add_argument( |
| "--no_overwrite_existing_ocr_results", |
| action="store_false", |
| dest="overwrite_existing_ocr_results", |
| help="Use existing OCR results when available (do not overwrite cached JSON).", |
| ) |
| pdf_group.add_argument( |
| "--save_page_ocr_visualisations", |
| action="store_true", |
| default=None, |
| help="Save page OCR visualisations (debug bounding boxes). Defaults to SAVE_PAGE_OCR_VISUALISATIONS config.", |
| ) |
| pdf_group.add_argument( |
| "--no_save_page_ocr_visualisations", |
| action="store_false", |
| dest="save_page_ocr_visualisations", |
| help="Do not save page OCR visualisations (debug bounding boxes).", |
| ) |
|
|
| |
| llm_group = parser.add_argument_group("LLM PII Detection Options") |
| llm_group.add_argument( |
| "--llm_model_choice", |
| default=CLOUD_LLM_PII_MODEL_CHOICE, |
| help="LLM model choice for PII detection. Defaults to CLOUD_LLM_PII_MODEL_CHOICE for Bedrock. " |
| "Note: The actual model used is determined by pii_identification_method - " |
| "CLOUD_LLM_PII_MODEL_CHOICE for Bedrock, INFERENCE_SERVER_LLM_PII_MODEL_CHOICE for inference server, " |
| "LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE for local transformers.", |
| ) |
| llm_group.add_argument( |
| "--llm_inference_method", |
| choices=LLM_PII_INFERENCE_METHODS, |
| default=CHOSEN_LLM_PII_INFERENCE_METHOD, |
| help="LLM inference method for PII detection: aws-bedrock, local, inference-server, azure-openai, or gemini.", |
| ) |
| llm_group.add_argument( |
| "--inference_server_pii_model", |
| default=DEFAULT_INFERENCE_SERVER_PII_MODEL, |
| help="Inference server PII detection model name.", |
| ) |
| llm_group.add_argument( |
| "--llm_temperature", |
| type=float, |
| default=LLM_TEMPERATURE, |
| help="Temperature for LLM PII detection (lower = more deterministic).", |
| ) |
| llm_group.add_argument( |
| "--llm_max_tokens", |
| type=int, |
| default=LLM_MAX_NEW_TOKENS, |
| help="Maximum tokens in LLM response for PII detection.", |
| ) |
| llm_group.add_argument( |
| "--llm_redact_entities", |
| nargs="+", |
| choices=full_llm_entity_list, |
| default=chosen_llm_entities, |
| help=f"Subset of entities for LLM PII detection (when pii_detector uses an LLM). Default: {chosen_llm_entities}. Full list: {full_llm_entity_list}.", |
| ) |
| llm_group.add_argument( |
| "--custom_llm_instructions", |
| default="", |
| help="Custom instructions for LLM-based entity detection (e.g. 'don't redact anything related to Mark Wilson' or 'redact all company names with the label COMPANY_NAME').", |
| ) |
|
|
| |
| tabular_group = parser.add_argument_group( |
| "Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)" |
| ) |
| tabular_group.add_argument( |
| "--anon_strategy", |
| choices=[ |
| "redact", |
| "redact completely", |
| "replace_redacted", |
| "entity_type", |
| "encrypt", |
| "hash", |
| "replace with 'REDACTED'", |
| "replace with <ENTITY_NAME>", |
| "mask", |
| "fake_first_name", |
| ], |
| default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY, |
| help="The anonymisation strategy to apply.", |
| ) |
| tabular_group.add_argument( |
| "--text_columns", |
| nargs="+", |
| default=list(), |
| help="A list of column names to anonymise or deduplicate in tabular data.", |
| ) |
| tabular_group.add_argument( |
| "--excel_sheets", |
| nargs="+", |
| default=list(), |
| help="Specific Excel sheet names to process.", |
| ) |
| tabular_group.add_argument( |
| "--fuzzy_mistakes", |
| type=int, |
| default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM, |
| help="Number of allowed spelling mistakes for fuzzy matching.", |
| ) |
| tabular_group.add_argument( |
| "--match_fuzzy_whole_phrase_bool", |
| default=True, |
| help="Match fuzzy whole phrase boolean.", |
| ) |
| |
| duplicate_group = parser.add_argument_group("Duplicate Detection Options") |
| duplicate_group.add_argument( |
| "--duplicate_type", |
| choices=["pages", "tabular"], |
| default="pages", |
| help="Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).", |
| ) |
| duplicate_group.add_argument( |
| "--similarity_threshold", |
| type=float, |
| default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD, |
| help="Similarity threshold (0-1) to consider content as duplicates.", |
| ) |
| duplicate_group.add_argument( |
| "--min_word_count", |
| type=int, |
| default=DEFAULT_MIN_WORD_COUNT, |
| help="Minimum word count for text to be considered in duplicate analysis.", |
| ) |
| duplicate_group.add_argument( |
| "--min_consecutive_pages", |
| type=int, |
| default=DEFAULT_MIN_CONSECUTIVE_PAGES, |
| help="Minimum number of consecutive pages to consider as a match.", |
| ) |
| duplicate_group.add_argument( |
| "--greedy_match", |
| default=USE_GREEDY_DUPLICATE_DETECTION, |
| help="Use greedy matching strategy for consecutive pages.", |
| ) |
| duplicate_group.add_argument( |
| "--combine_pages", |
| default=DEFAULT_COMBINE_PAGES, |
| help="Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.", |
| ) |
| duplicate_group.add_argument( |
| "--remove_duplicate_rows", |
| default=REMOVE_DUPLICATE_ROWS, |
| help="Remove duplicate rows from the output.", |
| ) |
|
|
| |
| summarisation_group = parser.add_argument_group("Document Summarisation Options") |
| summarisation_group.add_argument( |
| "--summarisation_inference_method", |
| choices=[ |
| AWS_LLM_PII_OPTION, |
| LOCAL_TRANSFORMERS_LLM_PII_OPTION, |
| INFERENCE_SERVER_PII_OPTION, |
| ], |
| default=AWS_LLM_PII_OPTION, |
| help="LLM inference method for summarisation (same options as GUI).", |
| ) |
| summarisation_group.add_argument( |
| "--summarisation_temperature", |
| type=float, |
| default=0.6, |
| help="Temperature for summarisation (0.0-2.0). Lower is more deterministic.", |
| ) |
| summarisation_group.add_argument( |
| "--summarisation_max_pages_per_group", |
| type=int, |
| default=30, |
| help="Maximum pages per page-group summary (in addition to context-length limits).", |
| ) |
| summarisation_group.add_argument( |
| "--summary_page_group_max_workers", |
| type=int, |
| default=SUMMARY_PAGE_GROUP_MAX_WORKERS, |
| metavar="N", |
| help="Max threads for page-group summarisation (1 = sequential). Defaults to SUMMARY_PAGE_GROUP_MAX_WORKERS config (e.g. 1).", |
| ) |
| summarisation_group.add_argument( |
| "--summarisation_api_key", |
| default="", |
| help="API key for summarisation (if required by the chosen LLM).", |
| ) |
| summarisation_group.add_argument( |
| "--summarisation_context", |
| default="", |
| help="Additional context for summarisation (e.g. 'This is a consultation response document').", |
| ) |
| summarisation_group.add_argument( |
| "--summarisation_format", |
| choices=["concise", "detailed"], |
| default="detailed", |
| help="Summary format: concise (key themes only) or detailed (as much detail as possible).", |
| ) |
| summarisation_group.add_argument( |
| "--summarisation_additional_instructions", |
| default="", |
| help="Additional summary instructions (e.g. 'Focus on key decisions and recommendations').", |
| ) |
|
|
| |
| textract_group = parser.add_argument_group("Textract Batch Operations Options") |
| textract_group.add_argument( |
| "--textract_action", |
| choices=["submit", "retrieve", "list"], |
| help="Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).", |
| ) |
| textract_group.add_argument("--job_id", help="Textract job ID for retrieve action.") |
| textract_group.add_argument( |
| "--extract_signatures", |
| action="store_true", |
| help="Extract signatures during Textract analysis (for submit action).", |
| ) |
| textract_group.add_argument( |
| "--textract_bucket", |
| default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, |
| help="S3 bucket name for Textract operations (overrides default).", |
| ) |
| textract_group.add_argument( |
| "--textract_input_prefix", |
| default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, |
| help="S3 prefix for input files in Textract operations.", |
| ) |
| textract_group.add_argument( |
| "--textract_output_prefix", |
| default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, |
| help="S3 prefix for output files in Textract operations.", |
| ) |
| textract_group.add_argument( |
| "--s3_textract_document_logs_subfolder", |
| default=TEXTRACT_JOBS_S3_LOC, |
| help="S3 prefix for logs in Textract operations.", |
| ) |
| textract_group.add_argument( |
| "--local_textract_document_logs_subfolder", |
| default=TEXTRACT_JOBS_LOCAL_LOC, |
| help="Local prefix for logs in Textract operations.", |
| ) |
| textract_group.add_argument( |
| "--poll_interval", |
| type=int, |
| default=30, |
| help="Polling interval in seconds for Textract job status.", |
| ) |
| textract_group.add_argument( |
| "--max_poll_attempts", |
| type=int, |
| default=120, |
| help="Maximum number of polling attempts for Textract job completion.", |
| ) |
| return parser |
|
|
|
|
def get_cli_default_args_dict() -> dict:
    """
    Return every CLI flag's default value as a plain dictionary.

    A fresh parser is built and asked to parse an empty argument list, so the
    resulting namespace holds only defaults. Callers (e.g. an agent) can merge
    their own overrides into the returned dict and then call
    ``main(direct_mode_args=...)``.
    """
    parser = build_cli_argument_parser()
    # An empty argv means no flag is supplied, so only defaults are populated.
    default_namespace = parser.parse_args([])
    return vars(default_namespace)
|
|
|
|
| def resolve_deny_list_for_redaction(args) -> list[str] | str: |
| """ |
| Inline ``deny_list`` (API/direct mode) takes precedence over ``deny_list_file``. |
| """ |
| inline = getattr(args, "deny_list", None) |
| if isinstance(inline, list): |
| return [str(x) for x in inline if str(x).strip()] |
| file_path = getattr(args, "deny_list_file", None) or "" |
| return file_path |
|
|
|
|
| def resolve_allow_list_for_redaction(args) -> list[str] | str: |
| """ |
| Inline ``allow_list`` (API/direct mode) takes precedence over file paths. |
| """ |
| inline = getattr(args, "allow_list", None) |
| if isinstance(inline, list): |
| return [str(x) for x in inline if str(x).strip()] |
| file_path = getattr(args, "allow_list_file", None) or "" |
| if file_path: |
| return file_path |
| path_only = getattr(args, "allow_list", None) |
| return path_only if isinstance(path_only, str) else "" |
|
|
|
|
def resolve_efficient_ocr_for_redaction(args) -> bool:
    """
    Decide whether efficient OCR is enabled for this run.

    When ``efficient_ocr`` is missing from ``args`` or is ``None`` (the CLI
    default when the flag is unset), the ``EFFICIENT_OCR`` config value is
    used. Any other value is used as given. The result is always coerced to
    ``bool``.
    """
    requested = getattr(args, "efficient_ocr", None)
    # ``None`` means "not specified", so defer to the config value.
    return bool(EFFICIENT_OCR if requested is None else requested)
|
|
|
|
def resolve_chosen_local_ocr_model_for_redaction(args) -> str:
    """
    Return the local OCR model name to use, as a string.

    The ``chosen_local_ocr_model`` attribute of ``args`` is used when it is
    present and its string form is not blank; otherwise
    ``DEFAULT_LOCAL_OCR_MODEL`` is returned. A non-blank override is returned
    as ``str(value)`` without stripping surrounding whitespace.
    """
    override = getattr(args, "chosen_local_ocr_model", None)
    has_override = override is not None and str(override).strip() != ""
    return str(override) if has_override else str(DEFAULT_LOCAL_OCR_MODEL)
|
|
|
|
| def main(direct_mode_args={}): |
| """ |
| A unified command-line interface to prepare, redact, and anonymise various document types. |
| |
| Args: |
| direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution. |
| If provided, uses these instead of parsing command line arguments. |
| """ |
| parser = build_cli_argument_parser() |
| |
| if direct_mode_args: |
| |
| args = argparse.Namespace(**direct_mode_args) |
| else: |
| |
| args = parser.parse_args() |
|
|
| |
| |
| |
| |
| if args.input_file: |
| if isinstance(args.input_file, list): |
| |
| downloaded_files = [] |
| for file_path in args.input_file: |
| downloaded_path = _download_s3_file_if_needed(file_path) |
| downloaded_files.append(downloaded_path) |
| args.input_file = downloaded_files |
| elif isinstance(args.input_file, str): |
| |
| if "|" in args.input_file: |
| file_list = [f.strip() for f in args.input_file.split("|") if f.strip()] |
| downloaded_files = [] |
| for file_path in file_list: |
| downloaded_path = _download_s3_file_if_needed(file_path) |
| downloaded_files.append(downloaded_path) |
| args.input_file = downloaded_files |
| else: |
| |
| args.input_file = [_download_s3_file_if_needed(args.input_file)] |
|
|
| |
| if args.deny_list_file: |
| args.deny_list_file = _download_s3_file_if_needed( |
| args.deny_list_file, default_filename="downloaded_deny_list.csv" |
| ) |
| if args.allow_list_file: |
| args.allow_list_file = _download_s3_file_if_needed( |
| args.allow_list_file, default_filename="downloaded_allow_list.csv" |
| ) |
| if args.redact_whole_page_file: |
| args.redact_whole_page_file = _download_s3_file_if_needed( |
| args.redact_whole_page_file, |
| default_filename="downloaded_redact_whole_page.csv", |
| ) |
|
|
| |
| |
| if args.preprocess_local_ocr_images == "True": |
| args.preprocess_local_ocr_images = True |
| else: |
| args.preprocess_local_ocr_images = False |
| if args.greedy_match == "True": |
| args.greedy_match = True |
| else: |
| args.greedy_match = False |
| if args.combine_pages == "True": |
| args.combine_pages = True |
| else: |
| args.combine_pages = False |
| if args.remove_duplicate_rows == "True": |
| args.remove_duplicate_rows = True |
| else: |
| args.remove_duplicate_rows = False |
| if args.return_pdf_end_of_redaction == "True": |
| args.return_pdf_end_of_redaction = True |
| else: |
| args.return_pdf_end_of_redaction = False |
| if args.compress_redacted_pdf == "True": |
| args.compress_redacted_pdf = True |
| else: |
| args.compress_redacted_pdf = False |
| if args.do_initial_clean == "True": |
| args.do_initial_clean = True |
| else: |
| args.do_initial_clean = False |
| if args.save_logs_to_csv == "True": |
| args.save_logs_to_csv = True |
| else: |
| args.save_logs_to_csv = False |
| if args.save_logs_to_dynamodb == "True": |
| args.save_logs_to_dynamodb = True |
| else: |
| args.save_logs_to_dynamodb = False |
| if args.display_file_names_in_logs == "True": |
| args.display_file_names_in_logs = True |
| else: |
| args.display_file_names_in_logs = False |
| if args.match_fuzzy_whole_phrase_bool == "True": |
| args.match_fuzzy_whole_phrase_bool = True |
| else: |
| args.match_fuzzy_whole_phrase_bool = False |
| |
| args.save_to_user_folders = convert_string_to_boolean(args.save_to_user_folders) |
| |
| args.save_outputs_to_s3 = convert_string_to_boolean(args.save_outputs_to_s3) |
|
|
| |
| extraction_options = ( |
| list(args.handwrite_signature_extraction) |
| if args.handwrite_signature_extraction |
| else [] |
| ) |
| if args.extract_forms: |
| extraction_options.append("Extract forms") |
| if args.extract_tables: |
| extraction_options.append("Extract tables") |
| if args.extract_layout: |
| extraction_options.append("Extract layout") |
| args.handwrite_signature_extraction = extraction_options |
|
|
| if args.task in [ |
| "redact", |
| "deduplicate", |
| "summarise", |
| "combine_review_pdfs", |
| ]: |
| if args.input_file: |
| if isinstance(args.input_file, str): |
| args.input_file = [args.input_file] |
|
|
| _, file_extension = os.path.splitext(args.input_file[0]) |
| file_extension = file_extension.lower() |
| else: |
| raise ValueError(f"Error: --input_file is required for '{args.task}' task.") |
| else: |
| file_extension = "" |
|
|
| |
| usage_logger = None |
| if args.save_logs_to_csv or args.save_logs_to_dynamodb: |
| from tools.cli_usage_logger import create_cli_usage_logger |
|
|
| try: |
| usage_logger = create_cli_usage_logger(logs_folder=args.usage_logs_folder) |
| except Exception as e: |
| print(f"Warning: Could not initialise usage logger: {e}") |
|
|
| |
| ( |
| session_hash, |
| args.output_dir, |
| _, |
| args.input_dir, |
| args.textract_input_prefix, |
| args.textract_output_prefix, |
| args.s3_textract_document_logs_subfolder, |
| args.local_textract_document_logs_subfolder, |
| ) = get_username_and_folders( |
| username=args.username, |
| output_folder_textbox=args.output_dir, |
| input_folder_textbox=args.input_dir, |
| session_output_folder=args.save_to_user_folders, |
| textract_document_upload_input_folder=args.textract_input_prefix, |
| textract_document_upload_output_folder=args.textract_output_prefix, |
| s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder, |
| local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder, |
| ) |
|
|
| print( |
| f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}." |
| ) |
|
|
| |
| s3_output_folder = "" |
| if args.save_outputs_to_s3 and args.s3_outputs_folder: |
| s3_output_folder = _build_s3_output_folder( |
| s3_outputs_folder=args.s3_outputs_folder, |
| session_hash=session_hash, |
| save_to_user_folders=args.save_to_user_folders, |
| ) |
| if s3_output_folder: |
| print(f"S3 output folder: s3://{args.s3_outputs_bucket}/{s3_output_folder}") |
| elif args.save_outputs_to_s3 and not args.s3_outputs_folder: |
| print( |
| "Warning: --save_outputs_to_s3 is enabled but --s3_outputs_folder is not set. Outputs will not be uploaded to S3." |
| ) |
|
|
| |
|
|
| |
| if ( |
| args.task in ["redact", "deduplicate", "summarise", "combine_review_pdfs"] |
| and not args.input_file |
| ): |
| print(f"Error: --input_file is required for '{args.task}' task.") |
| return |
|
|
| if args.ocr_method in ["Local OCR", "AWS Textract"]: |
| args.prepare_images = True |
| else: |
| args.prepare_images = False |
|
|
| from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage |
|
|
| |
| if args.task == "redact": |
|
|
| |
| if file_extension in [".pdf", ".png", ".jpg", ".jpeg"]: |
| print("--- Detected PDF/Image file. Starting Redaction Workflow... ---") |
| start_time = time.time() |
| try: |
| from tools.file_conversion import prepare_image_or_pdf |
| from tools.file_redaction import run_redaction |
| from tools.redaction_types import RedactionContext, RedactionOptions |
|
|
| |
| print("\nStep 1: Preparing document...") |
| ( |
| prep_summary, |
| prepared_pdf_paths, |
| image_file_paths, |
| _, |
| _, |
| pdf_doc, |
| image_annotations, |
| _, |
| original_cropboxes, |
| page_sizes, |
| _, |
| _, |
| _, |
| _, |
| _, |
| ) = prepare_image_or_pdf( |
| file_paths=args.input_file, |
| text_extract_method=args.ocr_method, |
| all_line_level_ocr_results_df=pd.DataFrame(), |
| all_page_line_level_ocr_results_with_words_df=pd.DataFrame(), |
| first_loop_state=True, |
| prepare_for_review=False, |
| output_folder=args.output_dir, |
| input_folder=args.input_dir, |
| prepare_images=args.prepare_images, |
| page_min=args.page_min, |
| page_max=args.page_max, |
| ) |
| print(f"Preparation complete. {prep_summary}") |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| print("\nStep 2: Running redaction...") |
| ( |
| output_summary, |
| output_files, |
| _, |
| _, |
| log_files, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| comprehend_query_number, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| page_sizes, |
| _, |
| _, |
| _, |
| _, |
| total_textract_query_number, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| vlm_model_name, |
| vlm_total_input_tokens, |
| vlm_total_output_tokens, |
| llm_model_name, |
| llm_total_input_tokens, |
| llm_total_output_tokens, |
| _, |
| ) = run_redaction( |
| args.input_file, |
| RedactionOptions( |
| chosen_redact_entities=args.local_redact_entities, |
| chosen_redact_comprehend_entities=args.aws_redact_entities, |
| chosen_llm_entities=args.llm_redact_entities, |
| text_extraction_method=args.ocr_method, |
| in_allow_list=resolve_allow_list_for_redaction(args), |
| in_deny_list=resolve_deny_list_for_redaction(args), |
| redact_whole_page_list=args.redact_whole_page_file, |
| page_min=args.page_min, |
| page_max=args.page_max, |
| handwrite_signature_checkbox=args.handwrite_signature_extraction, |
| max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, |
| match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool, |
| pii_identification_method=args.pii_detector, |
| aws_access_key_textbox=args.aws_access_key, |
| aws_secret_key_textbox=args.aws_secret_key, |
| language=args.language, |
| output_folder=args.output_dir, |
| input_folder=args.input_dir, |
| custom_llm_instructions=args.custom_llm_instructions, |
| inference_server_vlm_model=( |
| args.inference_server_vlm_model |
| if args.inference_server_vlm_model |
| else DEFAULT_INFERENCE_SERVER_VLM_MODEL |
| ), |
| chosen_local_ocr_model=resolve_chosen_local_ocr_model_for_redaction( |
| args |
| ), |
| efficient_ocr=resolve_efficient_ocr_for_redaction(args), |
| efficient_ocr_min_words=( |
| args.efficient_ocr_min_words |
| if getattr(args, "efficient_ocr_min_words", None) |
| is not None |
| else EFFICIENT_OCR_MIN_WORDS |
| ), |
| efficient_ocr_min_image_coverage_fraction=( |
| args.efficient_ocr_min_image_coverage_fraction |
| if getattr( |
| args, |
| "efficient_ocr_min_image_coverage_fraction", |
| None, |
| ) |
| is not None |
| else EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION |
| ), |
| efficient_ocr_min_embedded_image_px=( |
| args.efficient_ocr_min_embedded_image_px |
| if getattr( |
| args, "efficient_ocr_min_embedded_image_px", None |
| ) |
| is not None |
| else EFFICIENT_OCR_MIN_EMBEDDED_IMAGE_PX |
| ), |
| ocr_first_pass_max_workers=( |
| args.ocr_first_pass_max_workers |
| if getattr(args, "ocr_first_pass_max_workers", None) |
| is not None |
| else OCR_FIRST_PASS_MAX_WORKERS |
| ), |
| hybrid_textract_bedrock_vlm=getattr( |
| args, |
| "hybrid_textract_bedrock_vlm", |
| HYBRID_TEXTRACT_BEDROCK_VLM, |
| ), |
| overwrite_existing_ocr_results=getattr( |
| args, |
| "overwrite_existing_ocr_results", |
| OVERWRITE_EXISTING_OCR_RESULTS, |
| ), |
| save_page_ocr_visualisations=( |
| getattr(args, "save_page_ocr_visualisations", None) |
| if getattr(args, "save_page_ocr_visualisations", None) |
| is not None |
| else SAVE_PAGE_OCR_VISUALISATIONS |
| ), |
| post_redact_pass1_qa=getattr( |
| args, "post_redact_pass1_qa", None |
| ), |
| post_redact_pass1_auto_prune=getattr( |
| args, "post_redact_pass1_auto_prune", None |
| ), |
| ), |
| RedactionContext( |
| prepared_pdf_file_paths=prepared_pdf_paths, |
| pdf_image_file_paths=image_file_paths, |
| pymupdf_doc=pdf_doc, |
| annotations_all_pages=image_annotations, |
| page_sizes=page_sizes, |
| document_cropboxes=original_cropboxes, |
| ), |
| |
| |
| |
| |
| ) |
|
|
| |
| end_time = time.time() |
| processing_time = end_time - start_time |
|
|
| |
| if usage_logger: |
| try: |
| |
| print("Saving logs to CSV") |
| doc_file_name = ( |
| os.path.basename(args.input_file[0]) |
| if args.display_file_names_in_logs |
| else "document" |
| ) |
| data_file_name = "" |
|
|
| |
| is_textract_call = args.ocr_method == "AWS Textract" |
|
|
| |
| total_pages = len(page_sizes) if page_sizes else 1 |
|
|
| |
| textract_queries = ( |
| int(total_textract_query_number) if is_textract_call else 0 |
| ) |
| comprehend_queries = ( |
| int(comprehend_query_number) |
| if args.pii_detector == "AWS Comprehend" |
| else 0 |
| ) |
|
|
| |
| handwriting_signature = ( |
| ", ".join(args.handwrite_signature_extraction) |
| if args.handwrite_signature_extraction |
| else "" |
| ) |
|
|
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=total_pages, |
| textract_queries=textract_queries, |
| pii_method=args.pii_detector, |
| comprehend_queries=comprehend_queries, |
| cost_code=args.cost_code, |
| handwriting_signature=handwriting_signature, |
| text_extraction_method=args.ocr_method, |
| is_textract_call=is_textract_call, |
| task=args.task, |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name=vlm_model_name, |
| vlm_total_input_tokens=vlm_total_input_tokens, |
| vlm_total_output_tokens=vlm_total_output_tokens, |
| llm_model_name=llm_model_name, |
| llm_total_input_tokens=llm_total_input_tokens, |
| llm_total_output_tokens=llm_total_output_tokens, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| print("\n--- Redaction Process Complete ---") |
| print(f"Summary: {output_summary}") |
| print(f"Processing time: {processing_time:.2f} seconds") |
| print(f"\nOutput files saved to: {args.output_dir}") |
| print("Generated Files:", sorted(output_files)) |
| if log_files: |
| print("Log Files:", sorted(log_files)) |
|
|
| |
| if args.save_outputs_to_s3 and s3_output_folder and output_files: |
| print("\n--- Uploading output files to S3 ---") |
| try: |
| |
| ( |
| os.path.splitext(os.path.basename(args.input_file[0]))[0] |
| if args.input_file |
| else None |
| ) |
| export_outputs_to_s3( |
| file_list_state=output_files, |
| s3_output_folder_state_value=s3_output_folder, |
| save_outputs_to_s3_flag=args.save_outputs_to_s3, |
| base_file_state=( |
| args.input_file[0] if args.input_file else None |
| ), |
| s3_bucket=args.s3_outputs_bucket, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not upload output files to S3: {e}") |
|
|
| except Exception as e: |
| print( |
| f"\nAn error occurred during the PDF/Image redaction workflow: {e}" |
| ) |
|
|
| |
| elif file_extension in [".docx", ".xlsx", ".xls", ".csv", ".parquet"]: |
| print( |
| "--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---" |
| ) |
| start_time = time.time() |
| try: |
| from tools.data_anonymise import anonymise_files_with_open_text |
|
|
| |
| |
| |
| |
|
|
| |
| ( |
| output_summary, |
| output_files, |
| _, |
| _, |
| log_files, |
| _, |
| processing_time, |
| comprehend_query_number, |
| _, |
| _, |
| _, |
| ) = anonymise_files_with_open_text( |
| file_paths=args.input_file, |
| in_text="", |
| anon_strategy=args.anon_strategy, |
| chosen_cols=args.text_columns, |
| chosen_redact_entities=args.local_redact_entities, |
| in_allow_list=resolve_allow_list_for_redaction(args), |
| in_excel_sheets=args.excel_sheets, |
| first_loop_state=True, |
| output_folder=args.output_dir, |
| in_deny_list=resolve_deny_list_for_redaction(args), |
| max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes, |
| pii_identification_method=args.pii_detector, |
| chosen_redact_comprehend_entities=args.aws_redact_entities, |
| aws_access_key_textbox=args.aws_access_key, |
| aws_secret_key_textbox=args.aws_secret_key, |
| language=args.language, |
| do_initial_clean=args.do_initial_clean, |
| ) |
|
|
| |
| end_time = time.time() |
| processing_time = end_time - start_time |
|
|
| |
| if usage_logger: |
| try: |
| print("Saving logs to CSV") |
| |
| doc_file_name = "" |
| data_file_name = ( |
| os.path.basename(args.input_file[0]) |
| if args.display_file_names_in_logs |
| else "data_file" |
| ) |
|
|
| |
| is_textract_call = False |
|
|
| |
| total_pages = 0 |
|
|
| |
| textract_queries = 0 |
| comprehend_queries = ( |
| comprehend_query_number |
| if args.pii_detector == "AWS Comprehend" |
| else 0 |
| ) |
|
|
| |
| handwriting_signature = "" |
|
|
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=total_pages, |
| textract_queries=textract_queries, |
| pii_method=args.pii_detector, |
| comprehend_queries=comprehend_queries, |
| cost_code=args.cost_code, |
| handwriting_signature=handwriting_signature, |
| text_extraction_method="tabular", |
| is_textract_call=is_textract_call, |
| task=args.task, |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name="", |
| vlm_total_input_tokens=0, |
| vlm_total_output_tokens=0, |
| llm_model_name="", |
| llm_total_input_tokens=0, |
| llm_total_output_tokens=0, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| print("\n--- Anonymisation Process Complete ---") |
| print(f"Summary: {output_summary}") |
| print(f"Processing time: {processing_time:.2f} seconds") |
| print(f"\nOutput files saved to: {args.output_dir}") |
| print("Generated Files:", sorted(output_files)) |
| if log_files: |
| print("Log Files:", sorted(log_files)) |
|
|
| |
| if args.save_outputs_to_s3 and s3_output_folder and output_files: |
| print("\n--- Uploading output files to S3 ---") |
| try: |
| export_outputs_to_s3( |
| file_list_state=output_files, |
| s3_output_folder_state_value=s3_output_folder, |
| save_outputs_to_s3_flag=args.save_outputs_to_s3, |
| base_file_state=( |
| args.input_file[0] if args.input_file else None |
| ), |
| s3_bucket=args.s3_outputs_bucket, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not upload output files to S3: {e}") |
|
|
| except Exception as e: |
| print( |
| f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}" |
| ) |
|
|
| else: |
| print(f"Error: Unsupported file type '{file_extension}' for redaction.") |
| print("Supported types for redaction: .pdf, .png, .jpg, .jpeg") |
| print( |
| "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet" |
| ) |
|
|
| |
| elif args.task == "deduplicate": |
| print("--- Starting Duplicate Detection Workflow... ---") |
| try: |
| from tools.find_duplicate_pages import run_duplicate_analysis |
|
|
| if args.duplicate_type == "pages": |
| |
| if file_extension == ".csv": |
| print( |
| "--- Detected OCR CSV file. Starting Page Duplicate Detection... ---" |
| ) |
|
|
| start_time = time.time() |
|
|
| if args.combine_pages is True: |
| print("Combining pages...") |
| else: |
| print("Using line-level duplicate detection...") |
|
|
| |
| ( |
| results_df, |
| output_paths, |
| full_data_by_file, |
| processing_time, |
| task_textbox, |
| _, |
| _, |
| _, |
| ) = run_duplicate_analysis( |
| files=args.input_file, |
| threshold=args.similarity_threshold, |
| min_words=args.min_word_count, |
| min_consecutive=args.min_consecutive_pages, |
| greedy_match=args.greedy_match, |
| combine_pages=args.combine_pages, |
| output_folder=args.output_dir, |
| all_page_line_level_ocr_results_df_base=pd.DataFrame(), |
| ocr_df_paths_list=[], |
| ) |
|
|
| end_time = time.time() |
| processing_time = end_time - start_time |
|
|
| print("\n--- Page Duplicate Detection Complete ---") |
| print(f"Found {len(results_df)} duplicate matches") |
| print(f"\nOutput files saved to: {args.output_dir}") |
| if output_paths: |
| print("Generated Files:", sorted(output_paths)) |
|
|
| |
| if args.save_outputs_to_s3 and s3_output_folder and output_paths: |
| print("\n--- Uploading output files to S3 ---") |
| try: |
| export_outputs_to_s3( |
| file_list_state=output_paths, |
| s3_output_folder_state_value=s3_output_folder, |
| save_outputs_to_s3_flag=args.save_outputs_to_s3, |
| base_file_state=( |
| args.input_file[0] if args.input_file else None |
| ), |
| s3_bucket=args.s3_outputs_bucket, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not upload output files to S3: {e}") |
|
|
| |
| if usage_logger: |
| try: |
| print("Saving logs to CSV") |
| doc_file_name = ( |
| os.path.basename(args.input_file[0]) |
| if args.display_file_names_in_logs and args.input_file |
| else "document" |
| ) |
| data_file_name = "" |
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=0, |
| textract_queries=0, |
| comprehend_queries=0, |
| pii_method=args.pii_detector, |
| cost_code=args.cost_code, |
| handwriting_signature="", |
| text_extraction_method=args.ocr_method, |
| is_textract_call=False, |
| task=args.task, |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name="", |
| vlm_total_input_tokens=0, |
| vlm_total_output_tokens=0, |
| llm_model_name="", |
| llm_total_input_tokens=0, |
| llm_total_output_tokens=0, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| else: |
| print( |
| "Error: Page duplicate detection requires CSV files with OCR data." |
| ) |
| print("Please provide a CSV file containing OCR output data.") |
|
|
| |
| if usage_logger: |
| try: |
| |
| print("Saving logs to CSV") |
| doc_file_name = ( |
| os.path.basename(args.input_file[0]) |
| if args.display_file_names_in_logs |
| else "document" |
| ) |
| data_file_name = ( |
| "" |
| ) |
|
|
| |
| is_textract_call = False |
|
|
| |
| total_pages = len(page_sizes) if page_sizes else 1 |
|
|
| |
| textract_queries = 0 |
| comprehend_queries = 0 |
|
|
| |
| handwriting_signature = "" |
|
|
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=total_pages, |
| textract_queries=textract_queries, |
| pii_method=args.pii_detector, |
| comprehend_queries=comprehend_queries, |
| cost_code=args.cost_code, |
| handwriting_signature=handwriting_signature, |
| text_extraction_method=args.ocr_method, |
| is_textract_call=is_textract_call, |
| task=args.task, |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name="", |
| vlm_total_input_tokens=0, |
| vlm_total_output_tokens=0, |
| llm_model_name="", |
| llm_total_input_tokens=0, |
| llm_total_output_tokens=0, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| elif args.duplicate_type == "tabular": |
| |
| from tools.find_duplicate_tabular import run_tabular_duplicate_detection |
|
|
| if file_extension in [".csv", ".xlsx", ".xls", ".parquet"]: |
| print( |
| "--- Detected tabular file. Starting Tabular Duplicate Detection... ---" |
| ) |
|
|
| start_time = time.time() |
|
|
| ( |
| results_df, |
| output_paths, |
| full_data_by_file, |
| processing_time, |
| task_textbox, |
| ) = run_tabular_duplicate_detection( |
| files=args.input_file, |
| threshold=args.similarity_threshold, |
| min_words=args.min_word_count, |
| text_columns=args.text_columns, |
| output_folder=args.output_dir, |
| do_initial_clean_dup=args.do_initial_clean, |
| in_excel_tabular_sheets=args.excel_sheets, |
| remove_duplicate_rows=args.remove_duplicate_rows, |
| ) |
|
|
| end_time = time.time() |
| processing_time = end_time - start_time |
|
|
| |
| if usage_logger: |
| try: |
| |
| print("Saving logs to CSV") |
| doc_file_name = "" |
| data_file_name = ( |
| os.path.basename(args.input_file[0]) |
| if args.display_file_names_in_logs and args.input_file |
| else "data_file" |
| ) |
|
|
| is_textract_call = False |
| total_pages = 0 |
| textract_queries = 0 |
| comprehend_queries = 0 |
| handwriting_signature = "" |
|
|
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=total_pages, |
| textract_queries=textract_queries, |
| pii_method=args.pii_detector, |
| comprehend_queries=comprehend_queries, |
| cost_code=args.cost_code, |
| handwriting_signature=handwriting_signature, |
| text_extraction_method=args.ocr_method, |
| is_textract_call=is_textract_call, |
| task=args.task, |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name="", |
| vlm_total_input_tokens=0, |
| vlm_total_output_tokens=0, |
| llm_model_name="", |
| llm_total_input_tokens=0, |
| llm_total_output_tokens=0, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| print("\n--- Tabular Duplicate Detection Complete ---") |
| print(f"Found {len(results_df)} duplicate matches") |
| print(f"\nOutput files saved to: {args.output_dir}") |
| if output_paths: |
| print("Generated Files:", sorted(output_paths)) |
|
|
| |
| if args.save_outputs_to_s3 and s3_output_folder and output_paths: |
| print("\n--- Uploading output files to S3 ---") |
| try: |
| export_outputs_to_s3( |
| file_list_state=output_paths, |
| s3_output_folder_state_value=s3_output_folder, |
| save_outputs_to_s3_flag=args.save_outputs_to_s3, |
| base_file_state=( |
| args.input_file[0] if args.input_file else None |
| ), |
| s3_bucket=args.s3_outputs_bucket, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not upload output files to S3: {e}") |
|
|
| else: |
| print( |
| "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files." |
| ) |
| print("Supported types: .csv, .xlsx, .xls, .parquet") |
| else: |
| print(f"Error: Invalid duplicate type '{args.duplicate_type}'.") |
| print("Valid options: 'pages' or 'tabular'") |
|
|
| except Exception as e: |
| print(f"\nAn error occurred during the duplicate detection workflow: {e}") |
|
|
| |
| elif args.task == "textract": |
| print("--- Starting Textract Batch Operations Workflow... ---") |
|
|
| if not args.textract_action: |
| print("Error: --textract_action is required for textract task.") |
| print("Valid options: 'submit', 'retrieve', or 'list'") |
| return |
|
|
| try: |
| if args.textract_action == "submit": |
| from tools.textract_batch_call import ( |
| analyse_document_with_textract_api, |
| load_in_textract_job_details, |
| ) |
|
|
| |
| if not args.input_file: |
| print("Error: --input_file is required for submit action.") |
| return |
|
|
| print(f"--- Submitting document to Textract: {args.input_file} ---") |
|
|
| start_time = time.time() |
|
|
| |
| job_df = load_in_textract_job_details( |
| load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, |
| load_local_jobs_loc=args.local_textract_document_logs_subfolder, |
| ) |
|
|
| |
| signature_options = ( |
| ["Extract handwriting", "Extract signatures"] |
| if args.extract_signatures |
| else ["Extract handwriting"] |
| ) |
|
|
| |
| textract_bucket = args.textract_bucket if args.textract_bucket else "" |
|
|
| |
| ( |
| result_message, |
| job_id, |
| job_type, |
| successful_job_number, |
| is_textract_call, |
| total_pages, |
| task_textbox, |
| ) = analyse_document_with_textract_api( |
| local_pdf_path=args.input_file, |
| s3_input_prefix=args.textract_input_prefix, |
| s3_output_prefix=args.textract_output_prefix, |
| job_df=job_df, |
| s3_bucket_name=textract_bucket, |
| general_s3_bucket_name=args.s3_bucket, |
| local_output_dir=args.output_dir, |
| handwrite_signature_checkbox=signature_options, |
| aws_region=args.aws_region, |
| ) |
|
|
| end_time = time.time() |
| processing_time = end_time - start_time |
|
|
| print("\n--- Textract Job Submitted Successfully ---") |
| print(f"Job ID: {job_id}") |
| print(f"Job Type: {job_type}") |
| print(f"Message: {result_message}") |
| print(f"Results will be available in: {args.output_dir}") |
|
|
| |
| if usage_logger: |
| try: |
| |
| print("Saving logs to CSV") |
| doc_file_name = ( |
| os.path.basename(args.input_file[0]) |
| if args.display_file_names_in_logs |
| else "document" |
| ) |
| data_file_name = "" |
|
|
| |
| is_textract_call = True |
| args.ocr_method == "AWS Textract" |
|
|
| |
| textract_queries = total_pages |
| comprehend_queries = 0 |
|
|
| |
| handwriting_signature = "" |
|
|
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=total_pages, |
| textract_queries=textract_queries, |
| pii_method=args.pii_detector, |
| comprehend_queries=comprehend_queries, |
| cost_code=args.cost_code, |
| handwriting_signature=handwriting_signature, |
| text_extraction_method=args.ocr_method, |
| is_textract_call=is_textract_call, |
| task=args.task, |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name="", |
| vlm_total_input_tokens=0, |
| vlm_total_output_tokens=0, |
| llm_model_name="", |
| llm_total_input_tokens=0, |
| llm_total_output_tokens=0, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| elif args.textract_action == "retrieve": |
| print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---") |
|
|
| from tools.textract_batch_call import ( |
| load_in_textract_job_details, |
| poll_whole_document_textract_analysis_progress_and_download, |
| ) |
|
|
| |
| if not args.job_id: |
| print("Error: --job_id is required for retrieve action.") |
| return |
|
|
| |
| print("Loading existing job details...") |
| job_df = load_in_textract_job_details( |
| load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, |
| load_local_jobs_loc=args.local_textract_document_logs_subfolder, |
| ) |
|
|
| |
| job_type = "document_text_detection" |
| if not job_df.empty and "job_id" in job_df.columns: |
| matching_jobs = job_df.loc[job_df["job_id"] == args.job_id] |
| if not matching_jobs.empty and "job_type" in matching_jobs.columns: |
| job_type = matching_jobs.iloc[0]["job_type"] |
|
|
| |
| textract_bucket = args.textract_bucket if args.textract_bucket else "" |
|
|
| |
| print("Polling for completion and downloading results...") |
| downloaded_file_path, job_status, updated_job_df, output_filename = ( |
| poll_whole_document_textract_analysis_progress_and_download( |
| job_id=args.job_id, |
| job_type_dropdown=job_type, |
| s3_output_prefix=args.textract_output_prefix, |
| pdf_filename="", |
| job_df=job_df, |
| s3_bucket_name=textract_bucket, |
| load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, |
| load_local_jobs_loc=args.local_textract_document_logs_subfolder, |
| local_output_dir=args.output_dir, |
| poll_interval_seconds=args.poll_interval, |
| max_polling_attempts=args.max_poll_attempts, |
| ) |
| ) |
|
|
| print("\n--- Textract Results Retrieved Successfully ---") |
| print(f"Job Status: {job_status}") |
| print(f"Downloaded File: {downloaded_file_path}") |
| |
|
|
| elif args.textract_action == "list": |
| from tools.textract_batch_call import load_in_textract_job_details |
|
|
| |
| print("--- Listing Recent Textract Jobs ---") |
|
|
| job_df = load_in_textract_job_details( |
| load_s3_jobs_loc=args.s3_textract_document_logs_subfolder, |
| load_local_jobs_loc=args.local_textract_document_logs_subfolder, |
| ) |
|
|
| if job_df.empty: |
| print("No recent Textract jobs found.") |
| else: |
| print(f"\nFound {len(job_df)} recent Textract jobs:") |
| print("-" * 80) |
| for _, job in job_df.iterrows(): |
| print(f"Job ID: {job.get('job_id', 'N/A')}") |
| print(f"File: {job.get('file_name', 'N/A')}") |
| print(f"Type: {job.get('job_type', 'N/A')}") |
| print(f"Signatures: {job.get('signature_extraction', 'N/A')}") |
| print(f"Date: {job.get('job_date_time', 'N/A')}") |
| print("-" * 80) |
|
|
| else: |
| print(f"Error: Invalid textract_action '{args.textract_action}'.") |
| print("Valid options: 'submit', 'retrieve', or 'list'") |
|
|
| except Exception as e: |
| print(f"\nAn error occurred during the Textract workflow: {e}") |
|
|
| elif args.task == "summarise": |
| print("--- Document Summarisation ---") |
| try: |
| from tools.cli_usage_logger import log_redaction_usage |
| from tools.file_conversion import is_pdf |
| from tools.summaries import ( |
| concise_summary_format_prompt, |
| detailed_summary_format_prompt, |
| load_csv_files_to_dataframe, |
| summarise_document_wrapper, |
| ) |
|
|
| |
| format_map = { |
| "concise": concise_summary_format_prompt, |
| "detailed": detailed_summary_format_prompt, |
| } |
| summarise_format_radio = format_map.get( |
| args.summarisation_format, detailed_summary_format_prompt |
| ) |
|
|
| |
| input_paths = ( |
| [args.input_file] |
| if isinstance(args.input_file, str) |
| else list(args.input_file or []) |
| ) |
| input_paths = [p for p in input_paths if p and str(p).strip()] |
|
|
| |
| summarise_from_pdf = any(is_pdf(p) for p in input_paths) |
| if summarise_from_pdf: |
| pdf_path = next((p for p in input_paths if is_pdf(p)), None) |
| if not pdf_path: |
| print("Error: No PDF path found in input files.") |
| return |
| print( |
| f"Detected PDF input. Extracting text with '{args.ocr_method}' then summarising..." |
| ) |
| from tools.file_conversion import prepare_image_or_pdf |
| from tools.file_redaction import run_redaction |
| from tools.redaction_types import RedactionContext, RedactionOptions |
|
|
| prepare_images = args.ocr_method in ["Local OCR", "AWS Textract"] |
| ( |
| _prep_summary, |
| prepared_pdf_paths, |
| image_file_paths, |
| _, |
| _, |
| pdf_doc, |
| image_annotations, |
| _, |
| original_cropboxes, |
| page_sizes, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| ) = prepare_image_or_pdf( |
| file_paths=[pdf_path], |
| text_extract_method=args.ocr_method, |
| all_line_level_ocr_results_df=pd.DataFrame(), |
| all_page_line_level_ocr_results_with_words_df=pd.DataFrame(), |
| first_loop_state=True, |
| prepare_for_review=False, |
| output_folder=args.output_dir, |
| input_folder=args.input_dir, |
| prepare_images=prepare_images, |
| page_min=args.page_min, |
| page_max=args.page_max, |
| ) |
| print(f" {_prep_summary}") |
|
|
| ( |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| ocr_df, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| _, |
| ) = run_redaction( |
| [pdf_path], |
| RedactionOptions( |
| chosen_redact_entities=args.local_redact_entities or [], |
| chosen_redact_comprehend_entities=args.aws_redact_entities |
| or [], |
| chosen_llm_entities=args.llm_redact_entities or [], |
| text_extraction_method=args.ocr_method, |
| in_allow_list=resolve_allow_list_for_redaction(args), |
| in_deny_list=resolve_deny_list_for_redaction(args), |
| redact_whole_page_list=args.redact_whole_page_file, |
| page_min=args.page_min, |
| page_max=args.page_max, |
| handwrite_signature_checkbox=args.handwrite_signature_extraction |
| or [], |
| max_fuzzy_spelling_mistakes_num=getattr( |
| args, "fuzzy_mistakes", DEFAULT_FUZZY_SPELLING_MISTAKES_NUM |
| ), |
| match_fuzzy_whole_phrase_bool=getattr( |
| args, "match_fuzzy_whole_phrase_bool", True |
| ), |
| pii_identification_method=args.pii_detector or "Local", |
| aws_access_key_textbox=args.aws_access_key or "", |
| aws_secret_key_textbox=args.aws_secret_key or "", |
| language=args.language, |
| output_folder=args.output_dir, |
| input_folder=args.input_dir, |
| custom_llm_instructions=args.custom_llm_instructions or "", |
| inference_server_vlm_model=( |
| getattr(args, "inference_server_vlm_model", None) |
| or DEFAULT_INFERENCE_SERVER_VLM_MODEL |
| ), |
| chosen_local_ocr_model=resolve_chosen_local_ocr_model_for_redaction( |
| args |
| ), |
| efficient_ocr=resolve_efficient_ocr_for_redaction(args), |
| efficient_ocr_min_words=( |
| getattr(args, "efficient_ocr_min_words", None) |
| or EFFICIENT_OCR_MIN_WORDS |
| ), |
| efficient_ocr_min_image_coverage_fraction=( |
| getattr( |
| args, "efficient_ocr_min_image_coverage_fraction", None |
| ) |
| if getattr( |
| args, "efficient_ocr_min_image_coverage_fraction", None |
| ) |
| is not None |
| else EFFICIENT_OCR_MIN_IMAGE_COVERAGE_FRACTION |
| ), |
| efficient_ocr_min_embedded_image_px=( |
| getattr(args, "efficient_ocr_min_embedded_image_px", None) |
| if getattr( |
| args, "efficient_ocr_min_embedded_image_px", None |
| ) |
| is not None |
| else EFFICIENT_OCR_MIN_EMBEDDED_IMAGE_PX |
| ), |
| ocr_first_pass_max_workers=( |
| getattr(args, "ocr_first_pass_max_workers", None) |
| or OCR_FIRST_PASS_MAX_WORKERS |
| ), |
| hybrid_textract_bedrock_vlm=getattr( |
| args, |
| "hybrid_textract_bedrock_vlm", |
| HYBRID_TEXTRACT_BEDROCK_VLM, |
| ), |
| overwrite_existing_ocr_results=getattr( |
| args, |
| "overwrite_existing_ocr_results", |
| OVERWRITE_EXISTING_OCR_RESULTS, |
| ), |
| save_page_ocr_visualisations=( |
| getattr(args, "save_page_ocr_visualisations", None) |
| if getattr(args, "save_page_ocr_visualisations", None) |
| is not None |
| else SAVE_PAGE_OCR_VISUALISATIONS |
| ), |
| text_extraction_only=True, |
| post_redact_pass1_qa=getattr( |
| args, "post_redact_pass1_qa", None |
| ), |
| post_redact_pass1_auto_prune=getattr( |
| args, "post_redact_pass1_auto_prune", None |
| ), |
| ), |
| RedactionContext( |
| prepared_pdf_file_paths=prepared_pdf_paths, |
| pdf_image_file_paths=image_file_paths, |
| pymupdf_doc=pdf_doc, |
| annotations_all_pages=image_annotations, |
| page_sizes=page_sizes, |
| document_cropboxes=original_cropboxes, |
| ), |
| ) |
|
|
| if ocr_df is None or ( |
| isinstance(ocr_df, pd.DataFrame) and ocr_df.empty |
| ): |
| print("Error: No OCR text extracted from PDF. Cannot summarise.") |
| return |
|
|
| |
| basename = os.path.basename(pdf_path) |
| file_name = os.path.splitext(basename)[0][:20] |
| invalid_chars = '<>:"/\\|?*' |
| for char in invalid_chars: |
| file_name = file_name.replace(char, "_") |
| file_name = file_name if file_name else "document" |
| else: |
| |
| ocr_df = load_csv_files_to_dataframe(input_paths) |
| if ocr_df is None or ocr_df.empty: |
| print( |
| "Error: No valid OCR data (page, line, text columns) in input file(s)." |
| ) |
| return |
|
|
| first_path = input_paths[0] if input_paths else "" |
| if first_path: |
| basename = os.path.basename(first_path) |
| file_name = os.path.splitext(basename)[0][:20] |
| invalid_chars = '<>:"/\\|?*' |
| for char in invalid_chars: |
| file_name = file_name.replace(char, "_") |
| file_name = file_name if file_name else "document" |
| else: |
| file_name = "document" |
|
|
| ( |
| output_files, |
| status_message, |
| llm_model_name, |
| llm_total_input_tokens, |
| llm_total_output_tokens, |
| summary_display_text, |
| elapsed_seconds, |
| ) = summarise_document_wrapper( |
| ocr_df, |
| args.output_dir, |
| args.summarisation_inference_method, |
| args.summarisation_api_key or "", |
| args.summarisation_temperature, |
| file_name, |
| args.summarisation_context or "", |
| args.aws_access_key or "", |
| args.aws_secret_key or "", |
| "", |
| AZURE_OPENAI_INFERENCE_ENDPOINT or "", |
| summarise_format_radio, |
| args.summarisation_additional_instructions or "", |
| args.summarisation_max_pages_per_group, |
| None, |
| ) |
|
|
| processing_time = elapsed_seconds |
|
|
| print(f"\n{status_message}") |
| if output_files: |
| print("Output files:") |
| for p in output_files: |
| print(f" {p}") |
| if summary_display_text: |
| print("\n--- Summary ---") |
| print( |
| summary_display_text[:2000] |
| + ("..." if len(summary_display_text) > 2000 else "") |
| ) |
|
|
| |
| if usage_logger: |
| try: |
| first_input = input_paths[0] if input_paths else "" |
| doc_file_name = ( |
| os.path.basename(first_input) |
| if args.display_file_names_in_logs and first_input |
| else "document" |
| ) |
| data_file_name = "" |
| total_pages = ( |
| int(ocr_df["page"].max()) |
| if "page" in ocr_df.columns and not ocr_df.empty |
| else 0 |
| ) |
|
|
| log_redaction_usage( |
| logger=usage_logger, |
| session_hash=session_hash, |
| doc_file_name=doc_file_name, |
| data_file_name=data_file_name, |
| time_taken=processing_time, |
| total_pages=total_pages, |
| textract_queries=0, |
| pii_method=args.summarisation_inference_method, |
| comprehend_queries=0, |
| cost_code=args.cost_code, |
| handwriting_signature="", |
| text_extraction_method="", |
| is_textract_call=False, |
| task="summarisation", |
| save_to_dynamodb=args.save_logs_to_dynamodb, |
| save_to_s3=args.upload_logs_to_s3, |
| s3_bucket=args.s3_bucket, |
| s3_key_prefix=args.s3_logs_prefix, |
| vlm_model_name="", |
| vlm_total_input_tokens=0, |
| vlm_total_output_tokens=0, |
| llm_model_name=llm_model_name or "", |
| llm_total_input_tokens=llm_total_input_tokens or 0, |
| llm_total_output_tokens=llm_total_output_tokens or 0, |
| ) |
| except Exception as e: |
| print(f"Warning: Could not log usage data: {e}") |
|
|
| except Exception as e: |
| print(f"\nAn error occurred during summarisation: {e}") |
| import traceback |
|
|
| traceback.print_exc() |
|
|
| elif args.task == "combine_review_pdfs": |
| print("--- Combine review PDFs ---") |
| try: |
| from tools.file_conversion import combine_review_pdf_files |
|
|
| paths = ( |
| [args.input_file] |
| if isinstance(args.input_file, str) |
| else list(args.input_file) |
| ) |
| if len(paths) < 2: |
| print("Error: combine_review_pdfs requires at least 2 input PDF files.") |
| return |
| out_dir = args.output_dir |
| os.makedirs(out_dir, exist_ok=True) |
| result = combine_review_pdf_files(paths, output_folder=out_dir) |
| if result: |
| print(f"Combined PDF saved to: {result[0]}") |
| else: |
| print("No output produced (empty file list or no valid paths).") |
| except ValueError as e: |
| print(f"Error: {e}") |
| except Exception as e: |
| print(f"\nAn error occurred while combining review PDFs: {e}") |
| import traceback |
|
|
| traceback.print_exc() |
|
|
| elif args.task == "export_review_redaction_overlay": |
| print("--- Export review redaction overlay image ---") |
| try: |
| from tools.redaction_review import visualise_review_redaction_boxes |
|
|
| if not args.page_image_path: |
| print( |
| "Error: --page_image_path is required for export_review_redaction_overlay." |
| ) |
| return |
| if not args.boxes_json_path: |
| print( |
| "Error: --boxes_json_path is required for export_review_redaction_overlay." |
| ) |
| return |
|
|
| with open(args.boxes_json_path, "r", encoding="utf-8") as f: |
| boxes = json.load(f) |
| if not isinstance(boxes, list) or not boxes: |
| print("Error: boxes JSON must be a non-empty list of box dicts.") |
| return |
|
|
| review_df = pd.DataFrame() |
| if args.review_df_json_path: |
| with open(args.review_df_json_path, "r", encoding="utf-8") as f: |
| recs = json.load(f) |
| if isinstance(recs, list) and recs: |
| review_df = pd.DataFrame(recs) |
|
|
| annotator = {"image": args.page_image_path, "boxes": boxes} |
| out = visualise_review_redaction_boxes( |
| annotator, |
| review_df=review_df, |
| output_folder=args.output_dir, |
| page_number=int(args.page_number or 1), |
| doc_base_name=str(args.doc_base_name or "review"), |
| label_abbrev_chars=( |
| None |
| if int(args.label_abbrev_chars) < 0 |
| else int(args.label_abbrev_chars) |
| ), |
| ) |
| if out: |
| print(f"Overlay image written to: {out}") |
| else: |
| print("No output produced (invalid image/boxes or write failed).") |
| except Exception as e: |
| print(f"\nAn error occurred while exporting overlay image: {e}") |
| import traceback |
|
|
| traceback.print_exc() |
|
|
| elif args.task == "export_review_page_ocr_visualisation": |
| print("--- Export review page OCR visualisation image ---") |
| try: |
| from PIL import Image |
|
|
| from tools.file_redaction import visualise_ocr_words_bounding_boxes |
| from tools.helper_functions import get_file_name_without_type |
| from tools.secure_path_utils import sanitize_filename |
|
|
| if not args.page_image_path: |
| print( |
| "Error: --page_image_path is required for export_review_page_ocr_visualisation." |
| ) |
| return |
| if not args.ocr_results_json_path: |
| print( |
| "Error: --ocr_results_json_path is required for export_review_page_ocr_visualisation." |
| ) |
| return |
|
|
| with open(args.ocr_results_json_path, "r", encoding="utf-8") as f: |
| ocr_results = json.load(f) |
| if not isinstance(ocr_results, dict) or not ocr_results: |
| print("Error: ocr_results JSON must be a non-empty dict.") |
| return |
|
|
| base = get_file_name_without_type(os.path.basename(str(args.doc_base_name))) |
| if not base or not str(base).strip(): |
| base = "review" |
| safe_base = sanitize_filename(str(base)) |
| image_name = f"{safe_base}_page{int(args.page_number or 1)}.png" |
|
|
| log_paths: list[str] = [] |
| log_paths = visualise_ocr_words_bounding_boxes( |
| Image.open(args.page_image_path).convert("RGB"), |
| ocr_results, |
| image_name=image_name, |
| output_folder=args.output_dir, |
| visualisation_folder="review_ocr_visualisations", |
| add_legend=True, |
| log_files_output_paths=log_paths, |
| ) |
| if log_paths: |
| print(f"OCR visualisation written to: {log_paths[-1]}") |
| else: |
| print("No output produced (invalid image/ocr_results or write failed).") |
| except Exception as e: |
| print(f"\nAn error occurred while exporting OCR visualisation image: {e}") |
| import traceback |
|
|
| traceback.print_exc() |
|
|
| else: |
| print(f"Error: Invalid task '{args.task}'.") |
| print( |
| "Valid options: 'redact', 'deduplicate', 'textract', 'summarise', 'combine_review_pdfs', 'export_review_redaction_overlay', or 'export_review_page_ocr_visualisation'" |
| ) |
|
|
|
|
# Script entry point: call main() only when this file is executed directly
# (so that __name__ is "__main__"); importing the module does not run it.
| if __name__ == "__main__": |
| main() |
|
|