Spaces:
Sleeping
Sleeping
| import base64 | |
| import io | |
| import re | |
| import pytesseract | |
| from PIL import Image | |
| from flair.data import Sentence | |
| from pdf2image import convert_from_bytes | |
| from ocr.core.config import settings | |
def divide_images(contents: bytes) -> list[bytes]:
    """Split a document into per-page PNG images.

    The raw bytes are handed to pdf2image's ``convert_from_bytes`` at
    250 DPI, and every resulting page image is re-encoded as PNG.

    Args:
        contents: Raw bytes of the document to rasterise
            (presumably a PDF, since pdf2image is used).

    Returns:
        One PNG-encoded ``bytes`` object per page, in page order.
    """

    def _as_png(page) -> bytes:
        # Serialise a single page image into an in-memory PNG.
        buffer = io.BytesIO()
        page.save(buffer, format='PNG')
        return buffer.getvalue()

    pages = convert_from_bytes(contents, dpi=250)
    return [_as_png(page) for page in pages]
def extract_text_from_images(images: list[bytes]) -> str:
    """Run OCR over each encoded image and join the results.

    Args:
        images: Encoded image files as raw bytes, one entry per image.

    Returns:
        The text pytesseract extracted from every image, in input order,
        separated by a single newline. An empty input yields ``''``.
    """
    return '\n'.join(
        pytesseract.image_to_string(Image.open(io.BytesIO(raw)))
        for raw in images
    )
def _sniff_image_mime(data: bytes) -> str:
    """Return the MIME type implied by the leading magic bytes of *data*.

    Falls back to ``image/jpeg`` (the value this module always used
    before) when the signature is not recognised.
    """
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def prepare_request_content(images: list[bytes]) -> list:
    """Build the multimodal message content for a report request.

    The result starts with a fixed text instruction, followed by one
    ``image_url`` entry per image, each carrying the image as a base64
    data URL.

    The MIME type in each data URL is derived from the image bytes
    rather than hard-coded: ``divide_images`` in this module emits PNG,
    so labelling every payload ``image/jpeg`` misdescribed the data.

    Args:
        images: Encoded image files as raw bytes.

    Returns:
        A list of content dicts: the text prompt first, then the images
        in input order.
    """
    content = [
        {"type": "text", "text": "Generate a report on the attached document"},
    ]
    for image in images:
        encoded = base64.b64encode(image).decode('utf-8')
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{_sniff_image_mime(image)};base64,{encoded}",
                },
            }
        )
    return content
# Matches the first ```markdown fenced block; DOTALL lets the body span lines.
_MARKDOWN_FENCE_RE = re.compile(r'```markdown\s*(.*?)\s*```', re.DOTALL)


def clean_response(text: str) -> str:
    """Strip a surrounding ```markdown fence from *text*, if present.

    Args:
        text: Response text that may wrap its payload in a
            ```markdown ... ``` fence.

    Returns:
        The contents of the first ```markdown fenced block, with the
        whitespace adjoining the fence markers removed. When no such
        block exists, or *text* is not a string, the input is returned
        unchanged.
    """
    # Best-effort contract: non-string input is passed through untouched
    # rather than raising.
    if not isinstance(text, str):
        return text
    match = _MARKDOWN_FENCE_RE.search(text)
    if match is None:
        return text
    return match.group(1)
def clean_text(text: str) -> str:
    """Remove every span tagged ``PER`` by the configured tagger.

    The text is wrapped in a flair ``Sentence``, ``settings.TAGGER``
    predicts on it, and each ``ner`` span whose tag is ``PER``
    (presumably person names; confirm against the tagger's label set)
    is cut out of the original string by character offsets.

    Args:
        text: The text to scrub.

    Returns:
        *text* with the character ranges of all ``PER`` spans deleted.
        Surrounding whitespace is left as it was.
    """
    sentence = Sentence(text)
    settings.TAGGER.predict(sentence)

    person_spans = [
        span for span in sentence.get_spans('ner') if span.tag == 'PER'
    ]
    # Cut from the end of the string backwards so earlier offsets stay valid.
    person_spans.sort(key=lambda span: span.start_position, reverse=True)

    scrubbed = text
    for span in person_spans:
        scrubbed = scrubbed[:span.start_position] + scrubbed[span.end_position:]
    return scrubbed