Testing-AI-Contain / features /rag_chatbot /document_handler.py
Sangyog10's picture
set up rag pipeline for chatbot
29fbb51
from io import BytesIO
from fastapi import UploadFile, HTTPException
import PyPDF2
import docx
async def extract_text_from_file(file: UploadFile) -> str:
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    The parser is selected from the upload's declared ``content_type``.
    Media-type parameters (for example ``; charset=utf-8``), surrounding
    whitespace, and letter case are ignored when matching, so
    ``text/plain; charset=utf-8`` is treated as ``text/plain``.

    Args:
        file: The uploaded file. Its entire body is read into memory.

    Returns:
        The extracted text.

    Raises:
        HTTPException: 415 when the content type is missing or is not one
            of the supported types; 400 when a ``text/plain`` upload is
            not valid UTF-8.
    """
    content = await file.read()

    # The header may be absent (None) or carry parameters after a ";".
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "application/pdf":
        return extract_text_from_pdf(BytesIO(content))

    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return extract_text_from_docx(BytesIO(content))

    if media_type == "text/plain":
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError as err:
            # Bad input is the client's fault; do not let it surface as a 500.
            raise HTTPException(
                status_code=400,
                detail="The uploaded text file is not valid UTF-8.",
            ) from err

    raise HTTPException(
        status_code=415,
        detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file."
    )
def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Return the text of every page of a PDF, joined in page order.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string. Nothing is inserted between consecutive pages.
    """
    pdf = PyPDF2.PdfReader(file_stream)
    page_texts = [(page.extract_text() or "") for page in pdf.pages]
    return "".join(page_texts)
def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Return the text of a DOCX file, one line per paragraph.

    Each entry of the document's ``paragraphs`` is emitted followed by a
    newline, so a non-empty result always ends with ``"\\n"``.
    """
    document = docx.Document(file_stream)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)