| | Hugging Face's logo |
| | Hugging Face |
| | Models |
| | Datasets |
| | Spaces |
| | Community |
| | Docs |
| | Enterprise |
| | Pricing |
| | |
| | |
| | |
| | Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization |
| | Spaces: |
| | |
| | aneesarom |
| | / |
| | PDF-Text-Extractor |
| | |
| | |
| | like |
| | 0 |
| | |
| | Logs |
| | App |
| | Files |
| | Community |
| | Settings |
| | PDF-Text-Extractor |
| | / |
| | app.py |
| | |
| | aneesarom's picture |
| | aneesarom |
| | Update app.py |
| | 4d17112 |
| | verified |
| | 5 days ago |
| | raw |
| |
|
| | Copy download link |
| | history |
| | blame |
| | edit |
| | delete |
| |
|
| | 2.54 kB |
| | import json |
| | import gradio as gr |
| | import pdfplumber |
| | import requests |
| | from io import BytesIO |
| |
|
def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file. Leading and
            trailing whitespace is ignored.

    Returns:
        dict: JSON-formatted dictionary. On success it contains:
            - url (str): The PDF URL that was fetched
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, each page prefixed
              with "[Page N]", or "No text found in PDF." when no page
              yields text
        On failure it contains only:
            - error (str): Error message describing why extraction failed
    """
    try:
        url = url.strip()
        # str.startswith accepts a tuple; checking the full scheme rejects
        # look-alikes such as "httpfoo://" that a bare "http" prefix lets in.
        if not url.startswith(("http://", "https://")):
            return {"error": "Invalid URL. Must start with http:// or https://"}

        # NOTE(security): this fetches a caller-supplied URL from the server
        # (possible SSRF, unbounded download size). Restrict reachable hosts
        # and cap the response size if the app is exposed to untrusted users.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Every PDF starts with the "%PDF-" magic bytes; bail out early on
        # HTML error pages and other non-PDF payloads.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Capture the count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}\n\n")

        text = "".join(page_chunks)
        return {
            "url": url,
            "page_count": page_count,
            "content": text.strip() if text else "No text found in PDF.",
        }

    except Exception as e:
        # Top-level boundary for the UI/tool call: report any failure
        # (network, HTTP status, PDF parsing, bad input type) as data
        # instead of raising.
        return {"error": str(e)}
| |
|
| | |
# Sample inputs offered in the UI. Each example is wrapped in its own
# single-element list: one value per input component of the interface.
example_urls = [
    [pdf_link]
    for pdf_link in (
        "https://education.github.com/git-cheat-sheet-education.pdf",
        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
    )
]
| |
|
| | |
# Gradio UI: one URL textbox in, the extraction result rendered as JSON out.
_url_box = gr.Textbox(
    label="PDF URL",
    placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
)
_json_view = gr.JSON(label="Extracted Text")
_description = (
    "Provide a URL that directly points to a PDF file (from any server). "
    "The server fetches the PDF and extracts the text content, returning it in JSON format."
)

demo = gr.Interface(
    fn=read_pdf_from_url,
    inputs=_url_box,
    outputs=_json_view,
    title="PDF Text Extractor From Url",
    description=_description,
    examples=example_urls,
    flagging_mode="never",
    cache_examples=False,
)
| |
|
# Start the app only when this file is run as a script; mcp_server=True is
# passed through to Gradio's launch() unchanged.
if __name__ == "__main__":
    demo.launch(
        mcp_server=True,
    )
| | |
| |
|