"""HTML builder — assembles the full ISP Handbook HTML document. Uses Jinja2 templates for HTML generation. Data preparation logic is preserved from the original string-concatenation approach. The output is a self-contained HTML suitable for Playwright Chromium PDF export. """ from __future__ import annotations import base64 import logging import mimetypes import os import re from pathlib import Path from typing import Any from jinja2 import Environment, FileSystemLoader, select_autoescape from markupsafe import Markup from app.core.config import get_settings from app.core.fonts import font_face_css, select_font_family from app.services.normalizer import normalize_section, normalize_university from app.services.renderers import ( fetch_image_data_uri, render_global_blocks, sort_toc, _extract_university_funding, ) from app.services.utils import ( format_money_figures, get_any, h, handbook_anchor, hb_slug, is_truthy, sort_sections_stable, ) logger = logging.getLogger(__name__) # Jinja2 environment — templates live alongside the app package _TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates" def _get_jinja_env() -> Environment: """Create a Jinja2 environment pointing to our templates directory.""" env = Environment( loader=FileSystemLoader(str(_TEMPLATES_DIR)), autoescape=select_autoescape(["html"]), trim_blocks=True, lstrip_blocks=True, ) return env def _static_base_url() -> str: """Return absolute file:// URL to the static directory.""" static_dir = Path(__file__).resolve().parent.parent / "static" return static_dir.as_uri() def _unused_pdf_override_css(font_stack: str) -> str: """Legacy inline PDF override CSS — kept for reference only. All styling now lives in static/css/print.css for Chromium rendering. """ return "" # Section class map SECTION_CLASS_MAP = { "overview": "sec-overview", "how_the_program_works": "sec-how", "qualification_requirements": "sec-qualification", "enrolment_steps": "sec-steps", "withdrawal_refund_policy": "sec-policy", "refund_guidelines": "sec-refund", "program_contributions": "sec-contributions", "program_features_breakdown": "sec-breakdown", "funding_options_available": "sec-funding", "summary_of_universities": "sec-summary", "summary_of_universities_cosigner": "sec-summary-cosigner", } PAGE_BREAK_KEYS = { "overview", "how_the_program_works", "qualification_requirements", "enrolment_steps", "withdrawal_refund_policy", "refund_guidelines", "program_contributions", "program_features_breakdown", "funding_options_available", "summary_of_universities", "summary_of_universities_cosigner", } def _collect_program_option_inconsistencies(value: Any, path: str, hits: list[str]) -> None: """Collect paths where only REGULAR or PRIME appears.""" if isinstance(value, dict): for k, v in value.items(): _collect_program_option_inconsistencies(v, f"{path}.{k}" if path else str(k), hits) return if isinstance(value, list): for i, v in enumerate(value): _collect_program_option_inconsistencies(v, f"{path}[{i}]", hits) return if value is None: return text = str(value) has_regular = bool(re.search(r"\bREGULAR\b", text, flags=re.IGNORECASE)) has_prime = bool(re.search(r"\bPRIME\b", text, flags=re.IGNORECASE)) if has_regular ^ has_prime: hits.append(path) def _prepare_university_data( uni_raw: dict[str, Any], allow_remote: bool, include_inactive_programs: bool, debug: bool, stats: dict[str, Any], ) -> dict[str, Any]: """Prepare a single university's template data. Extracts overview, campus image, benefits, programs, and extra sections from the raw sections list. This moves the logic that was in render_university_section into a data-preparation step so that the Jinja2 template handles the HTML. """ uni_name = uni_raw["name"] sections = uni_raw.get("sections", []) is_first = uni_raw.get("_is_first", False) stats["universities"] = stats.get("universities", 0) + 1 # Build section map; merge duplicate "programs" sec_map: dict[str, dict] = {} for s in sections: if not isinstance(s, dict): continue k = str(s.get("section_key", "")) if not k: continue if k == "programs" and k in sec_map: existing = sec_map["programs"].get("section_json", {}) incoming = s.get("section_json", {}) if not isinstance(existing, dict): existing = {} if not isinstance(incoming, dict): incoming = {} a = existing.get("programs", []) b = incoming.get("programs", []) if not isinstance(a, list): a = [] if not isinstance(b, list): b = [] existing["programs"] = a + b sec_map["programs"]["section_json"] = existing continue sec_map[k] = s # Campus image # Disable university campus-image embedding in the generation path. # Large per-school images were the main source of handbook timeouts in Space. img_section = sec_map.get("campus_image") or sec_map.get("image") campus_image = "" campus_caption = "" if img_section: j = img_section.get("section_json", {}) if isinstance(j, dict): campus_caption = str(j.get("caption", "")).strip() stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1 # Overview and website resolved_website = (uni_raw.get("website") or "").strip() overview_data = None if "overview" in sec_map: overview_json = sec_map["overview"].get("section_json", {}) if not isinstance(overview_json, dict): overview_json = {} site_from_overview = get_any( overview_json, ["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"], ) if not resolved_website and site_from_overview: resolved_website = site_from_overview overview_data = { "founded": get_any(overview_json, ["founded", "Founded"]), "total_students": get_any(overview_json, ["total_students", "Total Students"]), "undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]), "postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]), "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]), "location": get_any(overview_json, ["location", "Location"]), "tuition": format_money_figures(str(get_any(overview_json, [ "tuition_out_of_state_yearly", "Yearly Out of State Tuition Fees", "Yearly Out-of-State Tuition Fees", "Yearly Tuition Fees", "Yearly Out-of-State Tuition Fees:", ]) or "")) or None, } if resolved_website: stats["university_links"] = stats.get("university_links", 0) + 1 stats["website_rows"] = stats.get("website_rows", 0) + 1 # Benefits # Benefits + Funding benefits = [] funding_heading = "Funding Available" funding_items: list[str] = [] if "benefits" in sec_map: j = sec_map["benefits"].get("section_json", {}) if not isinstance(j, dict): j = {} raw_benefits = j.get("benefits", []) if isinstance(raw_benefits, list): benefits = [str(b).strip() for b in raw_benefits if str(b).strip()] else: benefits = [] funding_heading, funding_items = _extract_university_funding( j, { "school_category": uni_raw.get("school_category"), "status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out", }, ) # Programs programs = None if "programs" in sec_map: j = sec_map["programs"].get("section_json", {}) if not isinstance(j, dict): j = {} programs_raw = j.get("programs", []) if not isinstance(programs_raw, list): programs_raw = [] if not include_inactive_programs: programs_raw = [ p for p in programs_raw if isinstance(p, dict) and is_truthy( p.get("program_active", p.get("is_active", p.get("active", 1))) ) ] programs = [] seen_names = set() for p in programs_raw: if not isinstance(p, dict): continue program_name = str(p.get("program_name", "")).strip() # Deduplicate by lowercase program name key = program_name.lower() if key in seen_names: continue seen_names.add(key) link = str(p.get("program_link", "")).strip() if not link and isinstance(p.get("program_links"), dict): link = str(p["program_links"].get("web_link", "")).strip() programs.append({ "name": program_name, "link": link, "designation": str(p.get("designation", "")), "entrance": str(p.get("entrance_exam", p.get("entrance_examination", ""))), }) # Extra sections skip_keys = {"campus_image", "image", "overview", "benefits", "programs"} extra_sections = [] for s in sections: if not isinstance(s, dict): continue k = str(s.get("section_key", "")) if not k or k in skip_keys: continue title = str(s.get("section_title", "")) j = s.get("section_json", {}) if not isinstance(j, dict): j = {} rendered = render_global_blocks(k, title, j, debug) extra_sections.append({"rendered_html": Markup(rendered)}) classes = ["uni"] if not is_first: classes.append("page-break") return { "name": uni_name, "anchor": uni_raw.get("anchor"), "sort_order": uni_raw.get("sort_order"), "website": resolved_website, "classes": classes, "overview": overview_data, "campus_image": campus_image, "campus_caption": campus_caption, "benefits": benefits, "funding_heading": funding_heading, "funding_items": funding_items, "programs": programs, "extra_sections": extra_sections, } def build_handbook_html( globals_data: list[dict[str, Any]], by_uni: dict[int, dict[str, Any]], images: dict[str, Any], allow_remote: bool, include_inactive_programs: bool = False, debug: bool = False, ) -> str: """Build the full handbook HTML document using Jinja2 templates. Preserves the same data preparation logic from the original version. Rendering is delegated to Jinja2 templates with Playwright-compatible HTML/CSS output. """ env = _get_jinja_env() template = env.get_template("handbook.html") font_meta = select_font_family() font_css = font_face_css(font_meta) # Base URL for static assets (CSS, images, etc.) base_url = _static_base_url() stats: dict[str, Any] = { "universities": 0, "images_embedded": 0, "images_placeholder": 0, "program_links_total": 0, "program_missing_links_total": 0, "missing_program_links": {}, "university_links": 0, "website_rows": 0, "program_option_warnings": [], } # ── Cover Image ── cover_image = images.get("coverImage", "") if cover_image and os.path.isfile(cover_image): cover_image = Path(cover_image).as_uri() else: cover_image = "" # ── TOC Image ── toc_image = images.get("tocImage", "") if toc_image and os.path.isfile(toc_image): toc_image = Path(toc_image).as_uri() else: toc_image = "" # ── Header Image (repeating page header) ── header_image = images.get("headerImage", "") if header_image and os.path.isfile(header_image): mime = mimetypes.guess_type(header_image)[0] or "image/jpeg" with open(header_image, "rb") as f: header_image = f"data:{mime};base64,{base64.b64encode(f.read()).decode()}" else: header_image = "" # ── Label Image (repeating right-side label) ── label_image = images.get("labelImage", "") if label_image and os.path.isfile(label_image): mime = mimetypes.guess_type(label_image)[0] or "image/jpeg" with open(label_image, "rb") as f: label_image = f"data:{mime};base64,{base64.b64encode(f.read()).decode()}" else: # Fallback to remote URL when local file is unavailable label_image = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg" # ── Prepare active universities (sorted: Tier One first, Tier Two second) ── active_universities: list[dict[str, Any]] = [] for uid, uni in by_uni.items(): if not isinstance(uni, dict): continue if not is_truthy(uni.get("is_active", True)): continue name = str(uni.get("university_name", f"University #{uid}")) anchor = handbook_anchor("uni", name, int(uid)) school_category = str(uni.get("school_category", "")).strip() tier = uni.get("tier") tier_label = str(uni.get("tier_label", "")).strip() active_universities.append({ "id": int(uid), "anchor": anchor, "name": name, "sections": uni.get("sections", []) if isinstance(uni.get("sections"), list) else [], "website": str(uni.get("website", "")), "sort_order": int(uni["sort_order"]) if uni.get("sort_order") is not None and str(uni.get("sort_order", "")).lstrip("-").isdigit() else None, "school_category": school_category, "tier": tier, "tier_label": tier_label, }) # Explicit university display order _UNIVERSITY_ORDER: list[str] = [ "Indiana University of Pennsylvania", "Missouri State University", "University of Louisville", "University of Delaware", "Grand Valley State University", "Quinnipiac University", "William Jessup University", "Wilkes University", "University of South Dakota", "California Baptist University", "Illinois State University", "Virginia Commonwealth University", "Rutgers University-Camden", "University of Oklahoma", "Saint Louis University", "University of Alabama at Birmingham", "Oregon State University", "Rochester Institute of Technology", "Lewis University", "Texas State University", "Drew University", "University of Missouri- Saint Louis", "Montana State University", "Oklahoma City University", "University of Dayton", "Webster University", "Rockhurst University", ] _uni_order_map = {name.lower().strip(): idx for idx, name in enumerate(_UNIVERSITY_ORDER)} def _tier_sort(u: dict) -> tuple: name_lower = (u.get("name") or "").lower().strip() explicit = _uni_order_map.get(name_lower) if explicit is not None: return (0, explicit, 0) # Universities not in the explicit list go after, sorted by tier then alpha t = u.get("tier") rank = t if isinstance(t, int) else 99 return (1, rank, name_lower, u.get("id", 0)) active_universities.sort(key=_tier_sort) # ── Normalise globals ── globals_data = sort_sections_stable(globals_data) required_keys = [ "table_of_contents", "overview", "how_the_program_works", ] existing_keys = {str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)} missing = [k for k in required_keys if k not in existing_keys] if missing: msg = f"Handbook required sections missing: {','.join(missing)}" logger.error(msg) raise RuntimeError(msg) general_sections: list[dict[str, Any]] = [] toc_sort_order = None toc_title = "Table of Contents" for idx, g in enumerate(globals_data): if not isinstance(g, dict): continue key_raw = str(g.get("section_key", "")) key = key_raw.lower() sort_order = int(g["sort_order"]) if g.get("sort_order") is not None and str(g.get("sort_order", "")).lstrip("-").isdigit() else None if key == "table_of_contents" and toc_sort_order is None: toc_sort_order = sort_order if sort_order is not None else (idx + 1) toc_title = str(g.get("section_title", "Table of Contents")) continue section_hits: list[str] = [] _collect_program_option_inconsistencies( g.get("section_json", {}), f"global.{key_raw}", section_hits, ) for hit in section_hits: if hit not in stats["program_option_warnings"]: stats["program_option_warnings"].append(hit) anchor = handbook_anchor("g", str(g.get("section_title", g.get("section_key", "section"))), idx) general_sections.append({ "anchor": anchor, "data": g, "sort_order": sort_order, }) # ── Build TOC items ── toc_items: list[dict[str, Any]] = [] for gs in general_sections: # Prefer the JSON-level title (display-ready) over the DB section_title gs_json = gs["data"].get("section_json", {}) if isinstance(gs_json, dict) and gs_json.get("title", "").strip(): title = gs_json["title"].strip() else: title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section"))) toc_items.append({ "title": title, "target": "#" + gs["anchor"], "level": 0, "bold": True, "sort": gs["sort_order"], }) for u in active_universities: toc_items.append({ "title": u["name"], "target": "#" + u["anchor"], "level": 1, "bold": False, "sort": u.get("sort_order"), }) # ── Prepare sorted TOC items for template ── sorted_toc = sort_toc(list(toc_items)) toc_items_sorted = [] for e in sorted_toc: if not isinstance(e, dict): continue title = str(e.get("title", "")).strip() if not title: continue level = max(0, min(3, int(e.get("level", 0)))) bold = bool(e.get("bold", False)) upper = bool(e.get("upper", False)) if level == 0: bold = True upper = True display_title = title.upper() if upper else title page = str(e.get("page", "")).strip() toc_items_sorted.append({ "title": title, "display_title": display_title, "target": str(e.get("target", e.get("anchor", ""))).strip(), "level": level, "bold": bold, "upper": upper, "page": page, }) # ── Prepare general sections with rendered HTML and typed blocks ── template_sections = [] for gs in general_sections: data = gs["data"] key_lower = str(data.get("section_key", "")).lower() sec_class = SECTION_CLASS_MAP.get(key_lower) if sec_class is None: sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower) section_json = data.get("section_json", {}) if not isinstance(section_json, dict): section_json = {} # Typed blocks for the new rendering path blocks = normalize_section( str(data.get("section_key", "")), str(data.get("section_title", "")), section_json, debug=debug, ) # Legacy HTML fallback section_html = render_global_blocks( str(data.get("section_key", "")), str(data.get("section_title", "")), section_json, debug, ) if not section_html.strip() and not blocks: logger.warning( "Empty section render key=%s sort_order=%s", data.get("section_key"), data.get("sort_order"), ) template_sections.append({ "anchor": gs["anchor"], "data": data, "page_break": key_lower in PAGE_BREAK_KEYS, "sec_class": sec_class, "blocks": blocks, "rendered_html": Markup(section_html), }) # ── Prepare university data for templates (both old + new paths) ── # Group by tier for tier heading insertion in the PDF output university_template_data = [] university_block_data = [] # Track which tier label was last emitted so we can insert tier divider headings _seen_tier_labels: set[str] = set() for idx, uni_raw in enumerate(active_universities): uni_raw["_is_first"] = (idx == 0) # Insert tier group heading when tier changes current_tier_label = str(uni_raw.get("tier_label", "")).strip() if current_tier_label and current_tier_label not in _seen_tier_labels: _seen_tier_labels.add(current_tier_label) # Mark this university as starting a new tier group uni_raw["_tier_group_start"] = True uni_raw["_tier_group_label"] = f"{current_tier_label} Schools" uni_hits: list[str] = [] _collect_program_option_inconsistencies( uni_raw.get("sections", []), f"university.{uni_raw.get('name', idx)}", uni_hits, ) for hit in uni_hits: if hit not in stats["program_option_warnings"]: stats["program_option_warnings"].append(hit) # Legacy path uni_data = _prepare_university_data( uni_raw, allow_remote, include_inactive_programs, debug, stats, ) # Carry tier metadata to template data uni_data["tier"] = uni_raw.get("tier") uni_data["tier_label"] = uni_raw.get("tier_label", "") uni_data["tier_group_start"] = uni_raw.get("_tier_group_start", False) uni_data["tier_group_label"] = uni_raw.get("_tier_group_label", "") university_template_data.append(uni_data) # New block path uni_block = normalize_university( uni_raw, allow_remote, include_inactive_programs, debug, stats, ) university_block_data.append(uni_block) # ── Bottom pages ── bottom_pages_urls = [] raw_bottom = images.get("bottomPages", []) if isinstance(raw_bottom, list): for img_path in raw_bottom: if os.path.isfile(str(img_path)): bottom_pages_urls.append(Path(str(img_path)).as_uri()) # ── Render template ── if stats["program_option_warnings"]: logger.warning( "Program option consistency warnings (missing REGULAR or PRIME pair): %s", stats["program_option_warnings"], ) html = template.render( font_css=Markup(font_css), base_url=base_url, extra_css="", header_image=header_image, label_image=label_image, cover_image=cover_image, toc_image=toc_image, toc_items=toc_items, toc_items_sorted=toc_items_sorted, toc_title=toc_title, toc_sort_order=toc_sort_order, general_sections=template_sections, summary_block=None, universities=university_template_data, university_blocks=university_block_data, bottom_pages=bottom_pages_urls, debug=debug, stats=stats, ) return html