| """ |
| LOAD_DOCUMENTS – SINGLE SOURCE OF TRUTH |
| |
| Tasks: |
| 1) Load the examination regulations (Prüfungsordnung) PDF directly from Supabase Storage. |
| 2) Load the NRW Higher Education Act (Hochschulgesetz NRW) from the Supabase table hg_nrw. |
| 3) Provide complete metadata so that other files do NOT have to recompute URLs. |
| """ |
|
|
| import os |
| import tempfile |
| from dotenv import load_dotenv |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain_core.documents import Document |
| from supabase import create_client |
|
|
| load_dotenv() |
|
|
| import urllib.parse |
|
|
| |
# --- Supabase connection ----------------------------------------------------
# Credentials are read from the process environment.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE = os.getenv("SUPABASE_SERVICE_ROLE")

# Module-wide client shared by the loader functions below.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# Common prefix of the public storage object URLs built in this module.
_PUBLIC_OBJECT_ROOT = f"{SUPABASE_URL}/storage/v1/object/public"

# --- Examination regulations (Prüfungsordnung) PDF ---------------------------
PDF_BUCKET = "File PDF"
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"

# The bucket name contains a space, so it is percent-encoded for use in URLs.
ENC_BUCKET = urllib.parse.quote(PDF_BUCKET)
PDF_URL = f"{_PUBLIC_OBJECT_ROOT}/{ENC_BUCKET}/{PDF_FILE}"

# --- Hochschulgesetz HTML viewer ---------------------------------------------
# URL of the HTML file that Hochschulgesetz entries link to (a fragment is
# appended per entry by the loader).
HG_VIEWER_BUCKET = "hg_viewer"
HG_VIEWER_FILE = "hg_clean.html"
HG_VIEWER_URL = f"{_PUBLIC_OBJECT_ROOT}/{HG_VIEWER_BUCKET}/{HG_VIEWER_FILE}"
|
|
|
|
| |
| |
| |
|
|
def load_pdf_from_supabase() -> list[Document]:
    """Download the examination-regulations PDF and return one Document per page.

    The PDF is downloaded from the storage bucket ``PDF_BUCKET``, written to a
    temporary file (``PyPDFLoader`` is given a file path), parsed, and every
    page's metadata is replaced with a fixed schema:

    * ``type``     -- always ``"pdf"``
    * ``source``   -- always ``"Prüfungsordnung"``
    * ``page``     -- 0-based index of the page in the loaded list
    * ``pdf_url``  -- public URL of the PDF with a 1-based ``#page=`` fragment
    * ``filename`` -- ``PDF_FILE``

    Returns:
        The parsed pages, in the order returned by the loader.

    Raises:
        ValueError: If the download yields no data (``None`` or empty).
    """
    print("📥 Lade Prüfungsordnung PDF aus Supabase...")

    response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE)
    # ``not response`` rejects an empty payload as well as ``None``.
    if not response:
        raise ValueError("❌ Konnte PDF nicht laden!")

    # The temporary directory, and the PDF inside it, is removed when the
    # ``with`` block exits -- also when parsing raises -- so repeated calls do
    # not leave files behind on disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_pdf_path = os.path.join(tmp_dir, "download.pdf")
        with open(temp_pdf_path, "wb") as tmp:
            tmp.write(response)
        # load() returns the complete list of pages, so the file is not
        # needed once this call has returned.
        pages = PyPDFLoader(temp_pdf_path).load()

    for i, p in enumerate(pages):
        p.metadata = {
            "type": "pdf",
            "source": "Prüfungsordnung",
            # Kept 0-based so existing consumers of "page" are unaffected.
            "page": i,
            # The "#page=" URL fragment counts pages from 1, so the 0-based
            # index is shifted; otherwise each link opens the previous page.
            "pdf_url": f"{PDF_URL}#page={i + 1}",
            "filename": PDF_FILE,
        }

    print(f"✔ {len(pages)} PDF-Seiten geladen.")
    return pages
|
|
|
|
| |
| |
| |
|
|
def load_hg_from_supabase() -> list[Document]:
    """Load the Hochschulgesetz NRW entries from the ``hg_nrw`` table.

    All columns of every row are selected, ordered ascending by
    ``order_index``. Each row becomes one Document whose text is the row's
    ``content`` and whose metadata carries ``type``, ``source``, ``abs_id``,
    ``title`` and a ``viewer_url`` (``HG_VIEWER_URL`` with the row's
    ``abs_id`` as URL fragment).

    Returns:
        One Document per table row, in query order; an empty list when the
        query returns no data.
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    def to_document(record) -> Document:
        # Read the three columns up front, in the same order as before.
        abs_id, title, content = (
            record["abs_id"],
            record["title"],
            record["content"],
        )
        return Document(
            page_content=content,
            metadata={
                "type": "hg",
                "source": "Hochschulgesetz NRW",
                "abs_id": abs_id,
                "title": title,
                "viewer_url": f"{HG_VIEWER_URL}#{abs_id}",
            },
        )

    query = supabase.table("hg_nrw").select("*").order("order_index", desc=False)
    records = query.execute().data or []
    documents = [to_document(record) for record in records]

    print(f"✔ {len(documents)} HG-Absätze geladen.")
    return documents
|
|
|
|
| |
| |
| |
|
|
def load_all_documents():
    """Return the PDF pages followed by the Hochschulgesetz entries as one list."""
    # The PDF is loaded first, then the table rows; the result is a new list.
    return [*load_pdf_from_supabase(), *load_hg_from_supabase()]
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load everything and show the total count plus the
    # metadata of the first document.
    docs = load_all_documents()
    print("📚 Gesamt:", len(docs))
    # Only index into the list when something was loaded, so an empty result
    # does not end the run with an IndexError.
    if docs:
        print("🔎 Beispiel metadata:", docs[0].metadata)
|
|