Spaces:
Running
Running
| from io import BytesIO | |
| from fastapi import UploadFile, HTTPException | |
| import PyPDF2 | |
| import docx | |
async def extract_text_from_file(file: UploadFile) -> str:
    """Extract the text content of an uploaded PDF, DOCX, or plain-text file.

    The upload is read fully into memory and dispatched on its declared
    media type.

    Args:
        file: The uploaded file; its ``content_type`` selects the extractor.

    Returns:
        The extracted text.

    Raises:
        HTTPException: 415 if the media type is not PDF, DOCX, or plain text;
            400 if a plain-text upload is not valid UTF-8.
    """
    content = await file.read()

    # A Content-Type value may carry parameters ("text/plain; charset=utf-8"),
    # is case-insensitive, and may be missing altogether. Compare only the
    # normalized "type/subtype" part so valid uploads are not rejected.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "application/pdf":
        return extract_text_from_pdf(BytesIO(content))

    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return extract_text_from_docx(BytesIO(content))

    if media_type == "text/plain":
        try:
            # Decode the bytes directly; no stream wrapper is needed here.
            return content.decode("utf-8")
        except UnicodeDecodeError as exc:
            # Report undecodable input as a client error, consistent with the
            # unsupported-type path, rather than letting it escape unhandled.
            raise HTTPException(
                status_code=400,
                detail="The uploaded text file is not valid UTF-8.",
            ) from exc

    raise HTTPException(
        status_code=415,
        detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file.",
    )
def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Return the concatenated text of every page in a PDF stream.

    Pages are read in order with ``PyPDF2.PdfReader``. A page whose
    ``extract_text()`` result is falsy (e.g. ``None`` or an empty string)
    contributes nothing. No separator is inserted between pages.
    """
    pdf = PyPDF2.PdfReader(file_stream)
    page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(page_texts)
def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Return the text of a DOCX stream, one line per paragraph.

    The document is opened with ``docx.Document`` and the ``text`` of each
    entry in ``paragraphs`` is emitted in order, each followed by a newline
    (so a non-empty document's result ends with ``"\\n"``).
    """
    document = docx.Document(file_stream)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)