Spaces:

can-org
/

Testing-AI-Contain

Running

set up rag pipeline for chatbot

29fbb51 10 months ago

1.26 kB

	from io import BytesIO
	from fastapi import UploadFile, HTTPException
	import PyPDF2
	import docx

	async def extract_text_from_file(file: UploadFile) -> str:
	    """Extract plain text from an uploaded PDF, DOCX, or text file.

	    The whole upload is read into memory and the extractor is chosen from
	    the content type declared on the upload. Only the media type itself is
	    compared: parameters such as ``; charset=utf-8`` and letter case are
	    ignored, so ``Text/Plain; charset=UTF-8`` is handled as ``text/plain``.

	    Args:
	        file: The uploaded file whose bytes and declared content type are
	            used.

	    Returns:
	        The text extracted from the upload.

	    Raises:
	        HTTPException: 415 when the declared content type is missing or is
	            not PDF, DOCX, or plain text; 400 when a plain-text upload is
	            not valid UTF-8.
	    """
	    content = await file.read()

	    # Clients may append parameters or vary the case of the media type;
	    # a missing content type falls through to the 415 below.
	    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

	    if media_type == "application/pdf":
	        return extract_text_from_pdf(BytesIO(content))

	    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
	        return extract_text_from_docx(BytesIO(content))

	    if media_type == "text/plain":
	        try:
	            # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading
	            # byte-order mark so it does not end up in the extracted text.
	            return content.decode("utf-8-sig")
	        except UnicodeDecodeError as exc:
	            # Report undecodable input as a client error instead of letting
	            # the decode failure surface as an unhandled server error.
	            raise HTTPException(
	                status_code=400,
	                detail="Could not decode the text file. Please upload a UTF-8 encoded .txt file.",
	            ) from exc

	    raise HTTPException(
	        status_code=415,
	        detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file.",
	    )

	def extract_text_from_pdf(file_stream: BytesIO) -> str:
	    """Return the text of every page of a PDF, concatenated in page order.

	    Page texts are joined with no separator between them. A page for which
	    the reader returns no text (``None`` or an empty string) contributes
	    nothing to the result.
	    """
	    pdf = PyPDF2.PdfReader(file_stream)
	    return "".join(page.extract_text() or "" for page in pdf.pages)

	def extract_text_from_docx(file_stream: BytesIO) -> str:
	    """Return the paragraph text of a DOCX document, one paragraph per line.

	    Each entry of the document's ``paragraphs`` is emitted followed by a
	    newline, so a non-empty result always ends with ``"\\n"``.
	    """
	    document = docx.Document(file_stream)
	    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)