Spaces:

aneesarom
/

test

Runtime error

App Files Files Community

test / app.py

aneesarom

Create app.py

f28740b verified 6 months ago

raw

history blame contribute delete

2.96 kB

	Hugging Face's logo
	Hugging Face
	Models
	Datasets
	Spaces
	Community
	Docs
	Enterprise
	Pricing



	Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization
	Spaces:

	aneesarom
	/
	PDF-Text-Extractor


	like
	0

	Logs
	App
	Files
	Community
	Settings
	PDF-Text-Extractor
	/
	app.py

	aneesarom's picture
	aneesarom
	Update app.py
	4d17112
	verified
	5 days ago
	raw

	Copy download link
	history
	blame
	edit
	delete

	2.54 kB
	import json
	import gradio as gr
	import pdfplumber
	import requests
	from io import BytesIO

	def read_pdf_from_url(url: str) -> dict:
	    """Extract the text of a PDF fetched from a direct download URL.

	    Args:
	        url (str): A URL that points directly to a PDF file. Surrounding
	            whitespace is ignored; the URL must start with http:// or https://.

	    Returns:
	        dict: JSON-formatted dictionary. On success it contains:
	            - url (str): The PDF URL that was fetched
	            - page_count (int): Number of pages in the PDF
	            - content (str): Extracted text, each page introduced by a
	              "[Page N]" line, or a placeholder when no text was found
	        On failure it contains only:
	            - error (str): Error message describing why extraction failed
	    """
	    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}
	    if not isinstance(url, str):
	        return invalid_url

	    url = url.strip()
	    # Require the full scheme prefix: a bare "http" prefix check would also
	    # let through strings such as "httpfoo://..." that the message rejects.
	    if not url.startswith(("http://", "https://")):
	        return invalid_url

	    try:
	        response = requests.get(url, timeout=10)
	        response.raise_for_status()

	        # Inspect the payload itself: a PDF file begins with "%PDF-".
	        if not response.content.startswith(b"%PDF-"):
	            return {"error": "URL does not point to a valid PDF file"}

	        sections = []
	        with pdfplumber.open(BytesIO(response.content)) as pdf:
	            # Read the page count while the document is still open.
	            page_count = len(pdf.pages)
	            for page_num, page in enumerate(pdf.pages, start=1):
	                page_text = page.extract_text()
	                if page_text:
	                    sections.append(f"[Page {page_num}]\n{page_text}")
	    except Exception as e:
	        # Boundary of a UI/tool callback: report any network or parsing
	        # failure as data instead of letting it propagate to the caller.
	        return {"error": str(e)}

	    content = "\n\n".join(sections).strip()
	    return {
	        "url": url,
	        "page_count": page_count,
	        "content": content if sections else "No text found in PDF.",
	    }

	# Sample PDF links offered as one-click examples; each entry is wrapped in
	# its own single-element list (one value for the single input field).
	example_urls = [
	    [pdf_link]
	    for pdf_link in (
	        "https://education.github.com/git-cheat-sheet-education.pdf",
	        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
	    )
	]

	# Gradio interface wrapping read_pdf_from_url: one text input for the URL,
	# one JSON output for the result, plus clickable example URLs.
	demo = gr.Interface(
	    title="PDF Text Extractor From Url",
	    description=(
	        "Provide a URL that directly points to a PDF file (from any server). "
	        "The server fetches the PDF and extracts the text content, "
	        "returning it in JSON format."
	    ),
	    fn=read_pdf_from_url,
	    inputs=gr.Textbox(
	        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
	        label="PDF URL",
	    ),
	    outputs=gr.JSON(label="Extracted Text"),
	    # Shown as clickable examples below the input box.
	    examples=example_urls,
	    # Example caching is turned off (avoids the CSV write it would do).
	    cache_examples=False,
	    # flagging_mode is the replacement for the older allow_flagging option.
	    flagging_mode="never",
	)

	# Start the app only when executed as a script, with the MCP server enabled.
	if __name__ == "__main__":
	    demo.launch(mcp_server=True)