| |
|
|
| import os |
| import requests |
| import tempfile |
| from supabase import create_client |
| from langchain_core.documents import Document |
| from langchain_community.document_loaders import PyPDFLoader |
|
|
| |
# Supabase connection settings are read from the process environment.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_ANON_KEY = os.environ.get("SUPABASE_ANON_KEY")

# Fail at import time when either setting is missing or empty, so the
# client below is never created with unusable credentials.
if not (SUPABASE_URL and SUPABASE_ANON_KEY):
    raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")

# Module-wide Supabase client used by the loaders in this file.
supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
|
|
| |
# File name of the PDF and its public Supabase storage URL
# (the bucket name "File PDF" is percent-encoded in the path).
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
PDF_URL = "{}/storage/v1/object/public/File%20PDF/{}".format(SUPABASE_URL, PDF_FILE)

# Public storage URL of the HTML page that load_hg_nrw() links into
# by appending "#<abs_id>".
HG_HTML_URL = "{}/storage/v1/object/public/hg_viewer/hg_clean.html".format(SUPABASE_URL)
|
|
|
|
def load_hg_nrw():
    """Load every row of the ``hg_nrw`` table as a LangChain ``Document``.

    Rows are fetched ordered by ``order_index``. Each row must provide the
    keys ``abs_id``, ``title`` and ``content``; a missing key raises
    ``KeyError``.

    Returns:
        A list with one ``Document`` per row. The page content is the title
        followed by the content on a new line; the metadata carries a fixed
        source label, the title, and a viewer URL whose fragment is the
        row's ``abs_id``.
    """

    def _to_document(row):
        # Read the fields in a fixed order before building the document.
        anchor = row["abs_id"]
        heading = row["title"]
        body = row["content"]
        return Document(
            page_content=f"{heading}\n{body}",
            metadata={
                "source": "Hochschulgesetz NRW",
                "paragraph": heading,
                "url": f"{HG_HTML_URL}#{anchor}",
            },
        )

    print(">>> Lade Hochschulgesetz NRW (§) aus Tabelle hg_nrw …")

    query = supabase.table("hg_nrw").select("*").order("order_index")
    response = query.execute()

    docs = [_to_document(row) for row in response.data]

    print(f"✔ {len(docs)} Paragraphen geladen.\n")
    return docs
|
|
|
|
def load_pdf(timeout=30):
    """Download the PDF at ``PDF_URL`` and load it page by page.

    The PDF is written to a temporary file because ``PyPDFLoader`` is given
    a filesystem path. The temporary file is removed again once the pages
    have been loaded (or loading has failed).

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        The list of page documents produced by ``PyPDFLoader``, each with
        its ``source``, ``page`` (0-based position in the list) and
        ``pdf_url`` metadata entries set.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    print(">>> Lade Prüfungsordnung PDF …")

    resp = requests.get(PDF_URL, timeout=timeout)
    resp.raise_for_status()

    # delete=False so the file can be closed first and then reopened by
    # name by the loader; the cleanup therefore has to happen explicitly.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp:
            tmp.write(resp.content)
        pages = PyPDFLoader(tmp.name).load()
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup: never mask the real error (or a
            # successful load) because the temp file could not be removed.
            pass

    for i, p in enumerate(pages):
        # Replace loader-provided values (such as the temp file path) with
        # stable, user-facing metadata.
        p.metadata["source"] = "Prüfungsordnung (PDF)"
        p.metadata["page"] = i
        p.metadata["pdf_url"] = PDF_URL

    print(f"✔ {len(pages)} PDF-Seiten geladen.\n")
    return pages
|
|
|
|
def load_documents():
    """Load all documents: the ``hg_nrw`` table rows first, then the PDF pages.

    Returns:
        A single list containing the result of ``load_hg_nrw()`` followed by
        the result of ``load_pdf()``.
    """
    # Unpacking evaluates left to right, so the table is loaded before the PDF.
    docs = [*load_hg_nrw(), *load_pdf()]
    print(f"✔ DOCUMENTS LOADED: {len(docs)}\n")
    return docs
|
|
|
|
if __name__ == "__main__":
    # Script entry point: load everything and print the first document.
    loaded_docs = load_documents()
    first_doc = loaded_docs[0]
    print("Example doc:", first_doc)
|
|