Upload streamlit_app.py with huggingface_hub
Browse files- streamlit_app.py +48 -4
streamlit_app.py
CHANGED
|
@@ -20,6 +20,8 @@ try:
|
|
| 20 |
except ImportError:
|
| 21 |
_PDF_AVAILABLE = False
|
| 22 |
|
|
|
|
|
|
|
| 23 |
sys.path.insert(0, os.path.dirname(__file__))
|
| 24 |
from wiki.starter import get_starter_wiki
|
| 25 |
from core.compiler import compile_source, rebuild_index
|
|
@@ -143,6 +145,14 @@ def add_or_update_article(article: dict):
|
|
| 143 |
wiki["metadata"]["article_count"] = len(wiki["articles"])
|
| 144 |
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
|
| 147 |
"""Extract all text from a PDF. Returns (text, page_count)."""
|
| 148 |
reader = PdfReader(io.BytesIO(file_bytes))
|
|
@@ -416,20 +426,51 @@ Large PDFs (100+ pages) are supported; text is extracted from every page automat
|
|
| 416 |
src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β Sepsis (2016)")
|
| 417 |
src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
|
| 418 |
|
| 419 |
-
input_method = st.radio(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
|
| 421 |
src_content = ""
|
| 422 |
pdf_meta = None
|
| 423 |
|
| 424 |
-
if input_method == "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
if not _PDF_AVAILABLE:
|
| 426 |
st.error("pypdf not installed β PDF upload unavailable.")
|
| 427 |
else:
|
|
|
|
| 428 |
uploaded_pdf = st.file_uploader(
|
| 429 |
-
"Upload PDF
|
| 430 |
type=["pdf"],
|
| 431 |
key="pdf_upload",
|
| 432 |
-
help="Text is extracted from every page. Large documents are fully supported.",
|
| 433 |
)
|
| 434 |
if uploaded_pdf is not None:
|
| 435 |
with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
|
|
@@ -445,6 +486,7 @@ Large PDFs (100+ pages) are supported; text is extracted from every page automat
|
|
| 445 |
st.error(f"PDF extraction failed: {e}")
|
| 446 |
if not src_title and uploaded_pdf:
|
| 447 |
src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
|
|
|
|
| 448 |
else:
|
| 449 |
src_content = st.text_area(
|
| 450 |
"Paste text here",
|
|
@@ -468,6 +510,8 @@ Large PDFs (100+ pages) are supported; text is extracted from every page automat
|
|
| 468 |
entry["pdf_size_kb"] = pdf_meta["size_kb"]
|
| 469 |
wiki["sources"][src_id] = entry
|
| 470 |
log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
|
|
|
|
|
|
|
| 471 |
st.success(f"Source added: **{src_title}**")
|
| 472 |
st.rerun()
|
| 473 |
|
|
|
|
| 20 |
except ImportError:
|
| 21 |
_PDF_AVAILABLE = False
|
| 22 |
|
| 23 |
+
import requests as _requests
|
| 24 |
+
|
| 25 |
sys.path.insert(0, os.path.dirname(__file__))
|
| 26 |
from wiki.starter import get_starter_wiki
|
| 27 |
from core.compiler import compile_source, rebuild_index
|
|
|
|
| 145 |
wiki["metadata"]["article_count"] = len(wiki["articles"])
|
| 146 |
|
| 147 |
|
| 148 |
+
def fetch_pdf_from_url(url: str, timeout: int = 60) -> bytes:
    """Fetch a PDF from a URL server-side (bypasses HF proxy upload limits).

    Parameters
    ----------
    url : str
        Direct link to the PDF document.
    timeout : int, optional
        Per-request timeout in seconds (default 60).

    Returns
    -------
    bytes
        The raw response body (assumed to be PDF data; the caller passes it
        to ``extract_pdf_text`` which will fail loudly on non-PDF content).

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.RequestException
        On connection failures or timeout.
    """
    # Identify ourselves politely; some hosts reject requests with a
    # blank/default User-Agent.
    headers = {"User-Agent": "NursingKnowledgeBase/1.0 (nursing education tool)"}
    # The original passed stream=True but then read resp.content, which
    # downloads the whole body anyway — and a streamed response that raised
    # in raise_for_status() was never closed, leaking the pooled connection.
    # Drop the misleading flag and use the response as a context manager so
    # the connection is always released.
    with _requests.get(url, headers=headers, timeout=timeout) as resp:
        resp.raise_for_status()
        return resp.content
|
| 154 |
+
|
| 155 |
+
|
| 156 |
def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
|
| 157 |
"""Extract all text from a PDF. Returns (text, page_count)."""
|
| 158 |
reader = PdfReader(io.BytesIO(file_bytes))
|
|
|
|
| 426 |
src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β Sepsis (2016)")
|
| 427 |
src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
|
| 428 |
|
| 429 |
+
input_method = st.radio(
|
| 430 |
+
"Input method",
|
| 431 |
+
["PDF from URL", "Upload PDF", "Paste text"],
|
| 432 |
+
horizontal=True,
|
| 433 |
+
help="Use 'PDF from URL' for large files β the server fetches it directly.",
|
| 434 |
+
)
|
| 435 |
|
| 436 |
src_content = ""
|
| 437 |
pdf_meta = None
|
| 438 |
|
| 439 |
+
if input_method == "PDF from URL":
|
| 440 |
+
st.caption("Paste a direct link to any PDF β NICE guidelines, NMC documents, research papers, etc. The server fetches it, so there is no size limit.")
|
| 441 |
+
pdf_url = st.text_input(
|
| 442 |
+
"PDF URL",
|
| 443 |
+
placeholder="https://www.nice.org.uk/guidance/ng51/resources/sepsis-pdf-...",
|
| 444 |
+
key="pdf_url",
|
| 445 |
+
)
|
| 446 |
+
if pdf_url and st.button("Fetch & Extract", key="fetch_pdf"):
|
| 447 |
+
with st.spinner("Fetching PDF from URL..."):
|
| 448 |
+
try:
|
| 449 |
+
raw_bytes = fetch_pdf_from_url(pdf_url)
|
| 450 |
+
extracted, page_count = extract_pdf_text(raw_bytes)
|
| 451 |
+
src_content = extracted
|
| 452 |
+
pdf_meta = {"pages": page_count, "size_kb": len(raw_bytes) // 1024}
|
| 453 |
+
st.session_state["fetched_pdf_content"] = extracted
|
| 454 |
+
st.session_state["fetched_pdf_meta"] = pdf_meta
|
| 455 |
+
st.success(f"Fetched {page_count} pages / {len(extracted):,} characters")
|
| 456 |
+
with st.expander("Preview extracted text"):
|
| 457 |
+
st.text(extracted[:1500] + ("..." if len(extracted) > 1500 else ""))
|
| 458 |
+
except Exception as e:
|
| 459 |
+
st.error(f"Fetch failed: {e}")
|
| 460 |
+
# Persist fetched content across reruns
|
| 461 |
+
if not src_content and st.session_state.get("fetched_pdf_content"):
|
| 462 |
+
src_content = st.session_state["fetched_pdf_content"]
|
| 463 |
+
pdf_meta = st.session_state.get("fetched_pdf_meta")
|
| 464 |
+
|
| 465 |
+
elif input_method == "Upload PDF":
|
| 466 |
if not _PDF_AVAILABLE:
|
| 467 |
st.error("pypdf not installed β PDF upload unavailable.")
|
| 468 |
else:
|
| 469 |
+
st.caption("For large PDFs (>50 MB) use 'PDF from URL' instead β HF Spaces limits browser uploads.")
|
| 470 |
uploaded_pdf = st.file_uploader(
|
| 471 |
+
"Upload PDF",
|
| 472 |
type=["pdf"],
|
| 473 |
key="pdf_upload",
|
|
|
|
| 474 |
)
|
| 475 |
if uploaded_pdf is not None:
|
| 476 |
with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
|
|
|
|
| 486 |
st.error(f"PDF extraction failed: {e}")
|
| 487 |
if not src_title and uploaded_pdf:
|
| 488 |
src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
|
| 489 |
+
|
| 490 |
else:
|
| 491 |
src_content = st.text_area(
|
| 492 |
"Paste text here",
|
|
|
|
| 510 |
entry["pdf_size_kb"] = pdf_meta["size_kb"]
|
| 511 |
wiki["sources"][src_id] = entry
|
| 512 |
log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
|
| 513 |
+
st.session_state.pop("fetched_pdf_content", None)
|
| 514 |
+
st.session_state.pop("fetched_pdf_meta", None)
|
| 515 |
st.success(f"Source added: **{src_title}**")
|
| 516 |
st.rerun()
|
| 517 |
|