| | from web2json.ai_extractor import * |
| | from web2json.postprocessor import * |
| | from web2json.preprocessor import * |
| | from pydantic import BaseModel |
| |
|
class Pipeline:
    """Orchestrates the three-stage web2json flow: preprocess, AI-extract, postprocess.

    Each stage is injected as a collaborator, so alternative implementations
    (different preprocessors, LLM backends, or output normalizers) can be
    swapped in without touching this class.
    """

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        """Store the three pipeline stages.

        Args:
            preprocessor (Preprocessor): Cleans/fetches raw content.
            ai_extractor (AIExtractor): Extracts structured data via an AI model.
            postprocessor (PostProcessor): Normalizes the extractor's output.
        """
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Separator reused between the debug sections below.
        separator = '+' * 80

        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        # str(...) guards the debug preview: identity for strings, and avoids a
        # TypeError if a stage returns a non-sliceable object (e.g. a dict).
        print(f"Preprocessed content: {str(preprocessed_content)[:100]}...")
        print(separator)

        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        print(f"Extracted data: {str(extracted_data)[:100]}...")
        print(separator)

        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print(separator)

        return final_output
| |
|
| | |