# Source: handbook_engine/app/services/html_builder.py
# Commit a94f84a (internationalscholarsprogram):
#   Fix enrollment rendering parity and header-safe Step 7 layout
"""HTML builder — assembles the full ISP Handbook HTML document.

Uses Jinja2 templates for HTML generation. Data preparation logic is
preserved from the original string-concatenation approach. The output
is self-contained HTML suitable for Playwright Chromium PDF export.
"""
from __future__ import annotations
import base64
import logging
import mimetypes
import os
import re
from pathlib import Path
from typing import Any
from jinja2 import Environment, FileSystemLoader, select_autoescape
from markupsafe import Markup
from app.core.config import get_settings
from app.core.fonts import font_face_css, select_font_family
from app.services.normalizer import normalize_section, normalize_university
from app.services.renderers import (
fetch_image_data_uri,
render_global_blocks,
sort_toc,
_extract_university_funding,
)
from app.services.utils import (
format_money_figures,
get_any,
h,
handbook_anchor,
hb_slug,
is_truthy,
sort_sections_stable,
)
logger = logging.getLogger(__name__)
# Jinja2 environment — templates live in the "templates" directory that sits
# next to this module's parent package directory.
_TEMPLATES_DIR = Path(__file__).resolve().parent.parent.joinpath("templates")


def _get_jinja_env() -> Environment:
    """Build the Jinja2 environment used to render the handbook templates.

    Templates are loaded from ``_TEMPLATES_DIR``. Autoescaping is selected
    for ``html`` templates, and ``trim_blocks`` / ``lstrip_blocks`` are both
    switched on.
    """
    template_loader = FileSystemLoader(str(_TEMPLATES_DIR))
    escape_policy = select_autoescape(["html"])
    return Environment(
        loader=template_loader,
        autoescape=escape_policy,
        trim_blocks=True,
        lstrip_blocks=True,
    )
def _static_base_url() -> str:
    """Return the absolute ``file://`` URL of the ``static`` directory.

    The directory is looked up two levels above this module's resolved path.
    """
    services_dir = Path(__file__).resolve().parent
    app_dir = services_dir.parent
    return app_dir.joinpath("static").as_uri()
def _unused_pdf_override_css(font_stack: str) -> str:
    """Return an empty string; legacy placeholder kept for reference only.

    Inline PDF override CSS used to be produced here. All styling now lives
    in ``static/css/print.css`` for Chromium rendering, so ``font_stack`` is
    accepted but ignored.
    """
    no_override_css = ""
    return no_override_css
# Section class map: lower-cased global ``section_key`` -> CSS class placed on
# the rendered section. Keys missing from this map get a class derived from
# the key itself by build_handbook_html.
SECTION_CLASS_MAP: dict[str, str] = {
    # Programme description
    "overview": "sec-overview",
    "how_the_program_works": "sec-how",
    "qualification_requirements": "sec-qualification",
    "enrolment_steps": "sec-steps",
    # Policies
    "withdrawal_refund_policy": "sec-policy",
    "refund_guidelines": "sec-refund",
    # Contributions, features and funding
    "program_contributions": "sec-contributions",
    "program_features_breakdown": "sec-breakdown",
    "funding_options_available": "sec-funding",
    # University summaries
    "summary_of_universities": "sec-summary",
    "summary_of_universities_cosigner": "sec-summary-cosigner",
}
# Lower-cased global section keys whose template entry is flagged with
# ``page_break=True`` by build_handbook_html.
PAGE_BREAK_KEYS: set[str] = {
    # Programme description
    "overview",
    "how_the_program_works",
    "qualification_requirements",
    "enrolment_steps",
    # Policies
    "withdrawal_refund_policy",
    "refund_guidelines",
    # Contributions, features and funding
    "program_contributions",
    "program_features_breakdown",
    "funding_options_available",
    # University summaries
    "summary_of_universities",
    "summary_of_universities_cosigner",
}
def _collect_program_option_inconsistencies(value: Any, path: str, hits: list[str]) -> None:
    """Record every path whose text names exactly one of REGULAR / PRIME.

    ``value`` is walked recursively. Dict entries extend the path with
    ``.key`` (or just ``key`` while the path is still empty) and list entries
    with ``[index]``. Any other non-``None`` leaf is converted with ``str()``
    and searched, case-insensitively and on word boundaries, for the words
    ``REGULAR`` and ``PRIME``. When exactly one of the two is present, the
    leaf's path is appended to ``hits``.

    Args:
        value: Nested data to scan.
        path: Path prefix describing where ``value`` sits.
        hits: Output list; extended in place.
    """
    if value is None:
        return

    if isinstance(value, dict):
        for key, child in value.items():
            child_path = f"{path}.{key}" if path else str(key)
            _collect_program_option_inconsistencies(child, child_path, hits)
        return

    if isinstance(value, list):
        for position, child in enumerate(value):
            _collect_program_option_inconsistencies(child, f"{path}[{position}]", hits)
        return

    leaf_text = str(value)
    mentions_regular = re.search(r"\bREGULAR\b", leaf_text, flags=re.IGNORECASE) is not None
    mentions_prime = re.search(r"\bPRIME\b", leaf_text, flags=re.IGNORECASE) is not None
    # Exactly one of the two words present -> the pair is incomplete.
    if mentions_regular != mentions_prime:
        hits.append(path)
def _build_university_section_map(sections: Any) -> dict[str, dict]:
    """Index a university's sections by ``section_key``.

    A later section replaces an earlier one with the same key, except for
    ``programs``: duplicate ``programs`` sections are merged by concatenating
    their ``section_json["programs"]`` lists. The merge is performed on
    shallow copies so the caller's section dicts are never modified.
    """
    sec_map: dict[str, dict] = {}
    for section in sections:
        if not isinstance(section, dict):
            continue
        key = str(section.get("section_key", ""))
        if not key:
            continue
        if key == "programs" and key in sec_map:
            current = sec_map["programs"]
            existing = current.get("section_json", {})
            incoming = section.get("section_json", {})
            if not isinstance(existing, dict):
                existing = {}
            if not isinstance(incoming, dict):
                incoming = {}
            first = existing.get("programs", [])
            second = incoming.get("programs", [])
            if not isinstance(first, list):
                first = []
            if not isinstance(second, list):
                second = []
            # Copy before merging: writing into the original dicts would leak
            # the merged list back into the caller's data, so anything that
            # reads the same sections afterwards would see altered input.
            merged_json = dict(existing)
            merged_json["programs"] = first + second
            merged_section = dict(current)
            merged_section["section_json"] = merged_json
            sec_map["programs"] = merged_section
            continue
        sec_map[key] = section
    return sec_map


def _prepare_university_programs(
    programs_section: dict,
    include_inactive_programs: bool,
) -> list[dict[str, str]]:
    """Build the template rows for a university's ``programs`` section.

    Non-dict entries are dropped. Unless ``include_inactive_programs`` is set,
    entries whose ``program_active`` / ``is_active`` / ``active`` flag (first
    one present, default ``1``) is not truthy are dropped as well. Remaining
    entries are de-duplicated by lower-cased program name, first one wins.
    """
    section_json = programs_section.get("section_json", {})
    if not isinstance(section_json, dict):
        section_json = {}
    programs_raw = section_json.get("programs", [])
    if not isinstance(programs_raw, list):
        programs_raw = []

    programs: list[dict[str, str]] = []
    seen_names: set[str] = set()
    for program in programs_raw:
        if not isinstance(program, dict):
            continue
        if not include_inactive_programs and not is_truthy(
            program.get("program_active", program.get("is_active", program.get("active", 1)))
        ):
            continue
        program_name = str(program.get("program_name", "")).strip()
        # Deduplicate by lowercase program name
        dedupe_key = program_name.lower()
        if dedupe_key in seen_names:
            continue
        seen_names.add(dedupe_key)
        link = str(program.get("program_link", "")).strip()
        # Fall back to the nested ``program_links.web_link`` when there is no
        # direct link.
        if not link and isinstance(program.get("program_links"), dict):
            link = str(program["program_links"].get("web_link", "")).strip()
        programs.append({
            "name": program_name,
            "link": link,
            "designation": str(program.get("designation", "")),
            "entrance": str(program.get("entrance_exam", program.get("entrance_examination", ""))),
        })
    return programs


def _prepare_university_data(
    uni_raw: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool,
    debug: bool,
    stats: dict[str, Any],
) -> dict[str, Any]:
    """Prepare a single university's template data.

    Extracts overview, campus image caption, benefits, funding, programs, and
    extra sections from the raw sections list so that the Jinja2 template only
    has to handle the HTML. The input ``uni_raw`` and its sections are not
    modified.

    Args:
        uni_raw: University record; must contain ``name``. Also read:
            ``sections``, ``_is_first``, ``website``, ``anchor``,
            ``sort_order``, ``school_category`` and ``is_active``.
        allow_remote: Accepted for signature compatibility; not used here
            because campus-image embedding is disabled.
        include_inactive_programs: Keep programs whose active flag is falsy.
        debug: Forwarded to ``render_global_blocks`` for extra sections.
        stats: Counter dict updated in place (``universities``,
            ``images_placeholder``, ``university_links``, ``website_rows``).

    Returns:
        Dict with keys ``name``, ``anchor``, ``sort_order``, ``website``,
        ``classes``, ``overview``, ``campus_image``, ``campus_caption``,
        ``benefits``, ``funding_heading``, ``funding_items``, ``programs``
        (``None`` when there is no programs section) and ``extra_sections``.

    Raises:
        KeyError: If ``uni_raw`` has no ``name`` key.
    """
    uni_name = uni_raw["name"]
    # Tolerate an explicit ``None`` so the loops below do not fail.
    sections = uni_raw.get("sections") or []
    is_first = uni_raw.get("_is_first", False)
    stats["universities"] = stats.get("universities", 0) + 1

    # Build section map; duplicate "programs" sections are merged on copies.
    sec_map = _build_university_section_map(sections)

    # Campus image
    # University campus-image embedding is disabled in the generation path:
    # large per-school images were the main source of handbook timeouts in
    # Space. Only the caption is read, and a placeholder is counted.
    img_section = sec_map.get("campus_image") or sec_map.get("image")
    campus_image = ""
    campus_caption = ""
    if img_section:
        image_json = img_section.get("section_json", {})
        if isinstance(image_json, dict):
            campus_caption = str(image_json.get("caption", "")).strip()
        stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1

    # Overview and website
    resolved_website = str(uni_raw.get("website") or "").strip()
    overview_data = None
    if "overview" in sec_map:
        overview_json = sec_map["overview"].get("section_json", {})
        if not isinstance(overview_json, dict):
            overview_json = {}
        site_from_overview = get_any(
            overview_json,
            ["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
        )
        # The record-level website wins; the overview only fills a gap.
        if not resolved_website and site_from_overview:
            resolved_website = site_from_overview
        overview_data = {
            "founded": get_any(overview_json, ["founded", "Founded"]),
            "total_students": get_any(overview_json, ["total_students", "Total Students"]),
            "undergraduates": get_any(
                overview_json,
                ["undergraduates", "Undergraduate Students", "undergraduate_students"],
            ),
            "postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
            "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
            "location": get_any(overview_json, ["location", "Location"]),
            "tuition": format_money_figures(str(get_any(overview_json, [
                "tuition_out_of_state_yearly",
                "Yearly Out of State Tuition Fees",
                "Yearly Out-of-State Tuition Fees",
                "Yearly Tuition Fees",
                "Yearly Out-of-State Tuition Fees:",
            ]) or "")) or None,
        }
    if resolved_website:
        stats["university_links"] = stats.get("university_links", 0) + 1
        stats["website_rows"] = stats.get("website_rows", 0) + 1

    # Benefits + Funding
    benefits: list[str] = []
    funding_heading = "Funding Available"
    funding_items: list[str] = []
    if "benefits" in sec_map:
        benefits_json = sec_map["benefits"].get("section_json", {})
        if not isinstance(benefits_json, dict):
            benefits_json = {}
        raw_benefits = benefits_json.get("benefits", [])
        if isinstance(raw_benefits, list):
            benefits = [str(b).strip() for b in raw_benefits if str(b).strip()]
        funding_heading, funding_items = _extract_university_funding(
            benefits_json,
            {
                "school_category": uni_raw.get("school_category"),
                "status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out",
            },
        )

    # Programs (``None`` signals "no programs section at all")
    programs = None
    if "programs" in sec_map:
        programs = _prepare_university_programs(sec_map["programs"], include_inactive_programs)

    # Extra sections: everything not handled above, rendered in source order.
    skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
    extra_sections = []
    for section in sections:
        if not isinstance(section, dict):
            continue
        key = str(section.get("section_key", ""))
        if not key or key in skip_keys:
            continue
        title = str(section.get("section_title", ""))
        extra_json = section.get("section_json", {})
        if not isinstance(extra_json, dict):
            extra_json = {}
        rendered = render_global_blocks(key, title, extra_json, debug)
        extra_sections.append({"rendered_html": Markup(rendered)})

    # Every university except the first one gets the page-break class.
    classes = ["uni"]
    if not is_first:
        classes.append("page-break")

    return {
        "name": uni_name,
        "anchor": uni_raw.get("anchor"),
        "sort_order": uni_raw.get("sort_order"),
        "website": resolved_website,
        "classes": classes,
        "overview": overview_data,
        "campus_image": campus_image,
        "campus_caption": campus_caption,
        "benefits": benefits,
        "funding_heading": funding_heading,
        "funding_items": funding_items,
        "programs": programs,
        "extra_sections": extra_sections,
    }
# Explicit university display order. Names are matched case-insensitively
# after trimming surrounding whitespace.
_UNIVERSITY_ORDER: tuple[str, ...] = (
    "Indiana University of Pennsylvania",
    "Missouri State University",
    "University of Louisville",
    "University of Delaware",
    "Grand Valley State University",
    "Quinnipiac University",
    "William Jessup University",
    "Wilkes University",
    "University of South Dakota",
    "California Baptist University",
    "Illinois State University",
    "Virginia Commonwealth University",
    "Rutgers University-Camden",
    "University of Oklahoma",
    "Saint Louis University",
    "University of Alabama at Birmingham",
    "Oregon State University",
    "Rochester Institute of Technology",
    "Lewis University",
    "Texas State University",
    "Drew University",
    "University of Missouri- Saint Louis",
    "Montana State University",
    "Oklahoma City University",
    "University of Dayton",
    "Webster University",
    "Rockhurst University",
)
_UNIVERSITY_ORDER_INDEX = {
    name.lower().strip(): idx for idx, name in enumerate(_UNIVERSITY_ORDER)
}

# Remote label image used when no local label file is available.
# NOTE(review): this fallback is applied regardless of ``allow_remote`` and
# points at what looks like a development host — confirm this is intended.
_LABEL_IMAGE_FALLBACK_URL = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"


def _coerce_sort_order(raw: Any) -> int | None:
    """Return ``raw`` as an int when it looks like an integer, else ``None``.

    A value qualifies when its string form, minus leading ``-`` characters,
    consists only of digits. Values that pass that check but still cannot be
    converted (for example ``"--5"``) yield ``None`` instead of raising.
    """
    if raw is None:
        return None
    if not str(raw).lstrip("-").isdigit():
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        return None


def _existing_file_uri(path: Any) -> str:
    """Return a ``file://`` URI for ``path`` if it is an existing file, else ``""``.

    The path is made absolute first because ``Path.as_uri`` rejects relative
    paths with ``ValueError``.
    """
    if not path or not os.path.isfile(path):
        return ""
    return Path(path).absolute().as_uri()


def _existing_file_data_uri(path: Any) -> str:
    """Return a base64 ``data:`` URI for ``path`` if it is an existing file, else ``""``.

    The MIME type is guessed from the file name and defaults to ``image/jpeg``.
    """
    if not path or not os.path.isfile(path):
        return ""
    mime = mimetypes.guess_type(path)[0] or "image/jpeg"
    with open(path, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode()
    return f"data:{mime};base64,{encoded}"


def _record_program_option_warnings(value: Any, path: str, stats: dict[str, Any]) -> None:
    """Scan ``value`` and add new inconsistency paths to ``stats["program_option_warnings"]``."""
    hits: list[str] = []
    _collect_program_option_inconsistencies(value, path, hits)
    warnings = stats["program_option_warnings"]
    for hit in hits:
        if hit not in warnings:
            warnings.append(hit)


def _university_sort_key(uni: dict[str, Any]) -> tuple:
    """Sort key: explicitly ordered universities first, then tier, name, id."""
    name_lower = (uni.get("name") or "").lower().strip()
    explicit = _UNIVERSITY_ORDER_INDEX.get(name_lower)
    if explicit is not None:
        return (0, explicit, 0)
    # Universities not in the explicit list go after, sorted by tier then alpha
    tier = uni.get("tier")
    rank = tier if isinstance(tier, int) else 99
    return (1, rank, name_lower, uni.get("id", 0))


def build_handbook_html(
    globals_data: list[dict[str, Any]],
    by_uni: dict[int, dict[str, Any]],
    images: dict[str, Any],
    allow_remote: bool,
    include_inactive_programs: bool = False,
    debug: bool = False,
) -> str:
    """Build the full handbook HTML document using Jinja2 templates.

    Prepares the data (images, ordered universities, general sections, table
    of contents) and renders the ``handbook.html`` template with it.

    Args:
        globals_data: Global handbook sections; each dict is read for
            ``section_key``, ``section_title``, ``section_json`` and
            ``sort_order``.
        by_uni: University records keyed by id. Non-dict and inactive
            (``is_active`` not truthy) records are skipped.
        images: Image locations. Read keys: ``coverImage``, ``tocImage``,
            ``headerImage``, ``labelImage`` (single paths) and
            ``bottomPages`` (list of paths). Paths that are not existing
            files are ignored, except that a missing label image falls back
            to a fixed remote URL.
        allow_remote: Forwarded to the per-university preparation helpers.
        include_inactive_programs: Forwarded to the per-university helpers.
        debug: Forwarded to the section renderers and the template.

    Returns:
        The rendered HTML document.

    Raises:
        RuntimeError: If any of the ``table_of_contents``, ``overview`` or
            ``how_the_program_works`` global sections is missing.
    """
    env = _get_jinja_env()
    template = env.get_template("handbook.html")
    font_meta = select_font_family()
    font_css = font_face_css(font_meta)

    # Base URL for static assets (CSS, images, etc.)
    base_url = _static_base_url()

    stats: dict[str, Any] = {
        "universities": 0,
        "images_embedded": 0,
        "images_placeholder": 0,
        "program_links_total": 0,
        "program_missing_links_total": 0,
        "missing_program_links": {},
        "university_links": 0,
        "website_rows": 0,
        "program_option_warnings": [],
    }

    # -- Images --
    # Cover and TOC images are referenced by file URI; the repeating header
    # and right-side label images are inlined as data URIs.
    cover_image = _existing_file_uri(images.get("coverImage", ""))
    toc_image = _existing_file_uri(images.get("tocImage", ""))
    header_image = _existing_file_data_uri(images.get("headerImage", ""))
    # Fallback to remote URL when the local label file is unavailable.
    label_image = _existing_file_data_uri(images.get("labelImage", "")) or _LABEL_IMAGE_FALLBACK_URL

    # -- Prepare active universities --
    active_universities: list[dict[str, Any]] = []
    for uid, uni in by_uni.items():
        if not isinstance(uni, dict):
            continue
        if not is_truthy(uni.get("is_active", True)):
            continue
        name = str(uni.get("university_name", f"University #{uid}"))
        raw_sections = uni.get("sections")
        active_universities.append({
            "id": int(uid),
            "anchor": handbook_anchor("uni", name, int(uid)),
            "name": name,
            "sections": raw_sections if isinstance(raw_sections, list) else [],
            "website": str(uni.get("website", "")),
            "sort_order": _coerce_sort_order(uni.get("sort_order")),
            "school_category": str(uni.get("school_category", "")).strip(),
            "tier": uni.get("tier"),
            "tier_label": str(uni.get("tier_label", "")).strip(),
        })
    # Explicitly listed universities first (in list order), then the rest by
    # tier, name and id.
    active_universities.sort(key=_university_sort_key)

    # -- Normalise globals --
    globals_data = sort_sections_stable(globals_data)
    required_keys = [
        "table_of_contents",
        "overview",
        "how_the_program_works",
    ]
    existing_keys = {
        str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)
    }
    missing = [k for k in required_keys if k not in existing_keys]
    if missing:
        msg = f"Handbook required sections missing: {','.join(missing)}"
        logger.error(msg)
        raise RuntimeError(msg)

    general_sections: list[dict[str, Any]] = []
    toc_sort_order = None
    toc_title = "Table of Contents"
    for idx, g in enumerate(globals_data):
        if not isinstance(g, dict):
            continue
        key_raw = str(g.get("section_key", ""))
        key = key_raw.lower()
        sort_order = _coerce_sort_order(g.get("sort_order"))
        # The first table-of-contents section only supplies the TOC title and
        # position; it is not rendered as a general section.
        if key == "table_of_contents" and toc_sort_order is None:
            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
            toc_title = str(g.get("section_title", "Table of Contents"))
            continue
        _record_program_option_warnings(g.get("section_json", {}), f"global.{key_raw}", stats)
        anchor = handbook_anchor(
            "g", str(g.get("section_title", g.get("section_key", "section"))), idx
        )
        general_sections.append({
            "anchor": anchor,
            "data": g,
            "sort_order": sort_order,
        })

    # -- Build TOC items --
    toc_items: list[dict[str, Any]] = []
    for gs in general_sections:
        # Prefer the JSON-level title (display-ready) over the DB section_title.
        # The value is coerced to ``str`` so a null or non-string title falls
        # back to the section title instead of raising.
        gs_json = gs["data"].get("section_json", {})
        json_title = str(gs_json.get("title") or "").strip() if isinstance(gs_json, dict) else ""
        if json_title:
            title = json_title
        else:
            title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
        toc_items.append({
            "title": title,
            "target": "#" + gs["anchor"],
            "level": 0,
            "bold": True,
            "sort": gs["sort_order"],
        })
    for u in active_universities:
        toc_items.append({
            "title": u["name"],
            "target": "#" + u["anchor"],
            "level": 1,
            "bold": False,
            "sort": u.get("sort_order"),
        })

    # -- Prepare sorted TOC items for template --
    toc_items_sorted = []
    for entry in sort_toc(list(toc_items)):
        if not isinstance(entry, dict):
            continue
        title = str(entry.get("title", "")).strip()
        if not title:
            continue
        level = max(0, min(3, int(entry.get("level", 0))))
        bold = bool(entry.get("bold", False))
        upper = bool(entry.get("upper", False))
        # Top-level entries are always bold and upper-cased.
        if level == 0:
            bold = True
            upper = True
        toc_items_sorted.append({
            "title": title,
            "display_title": title.upper() if upper else title,
            "target": str(entry.get("target", entry.get("anchor", ""))).strip(),
            "level": level,
            "bold": bold,
            "upper": upper,
            "page": str(entry.get("page", "")).strip(),
        })

    # -- Prepare general sections with rendered HTML and typed blocks --
    template_sections = []
    for gs in general_sections:
        data = gs["data"]
        section_key = str(data.get("section_key", ""))
        section_title = str(data.get("section_title", ""))
        key_lower = section_key.lower()
        sec_class = SECTION_CLASS_MAP.get(key_lower)
        if sec_class is None:
            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)
        section_json = data.get("section_json", {})
        if not isinstance(section_json, dict):
            section_json = {}
        # Typed blocks for the new rendering path
        blocks = normalize_section(section_key, section_title, section_json, debug=debug)
        # Legacy HTML fallback
        section_html = render_global_blocks(section_key, section_title, section_json, debug)
        if not section_html.strip() and not blocks:
            logger.warning(
                "Empty section render key=%s sort_order=%s",
                data.get("section_key"),
                data.get("sort_order"),
            )
        template_sections.append({
            "anchor": gs["anchor"],
            "data": data,
            "page_break": key_lower in PAGE_BREAK_KEYS,
            "sec_class": sec_class,
            "blocks": blocks,
            "rendered_html": Markup(section_html),
        })

    # -- Prepare university data for templates (both old + new paths) --
    university_template_data = []
    university_block_data = []
    # Tier labels already emitted; the first university carrying a new label
    # is marked as the start of that tier group.
    seen_tier_labels: set[str] = set()
    for idx, uni_raw in enumerate(active_universities):
        uni_raw["_is_first"] = (idx == 0)
        current_tier_label = str(uni_raw.get("tier_label", "")).strip()
        if current_tier_label and current_tier_label not in seen_tier_labels:
            seen_tier_labels.add(current_tier_label)
            uni_raw["_tier_group_start"] = True
            uni_raw["_tier_group_label"] = f"{current_tier_label} Schools"
        _record_program_option_warnings(
            uni_raw.get("sections", []),
            f"university.{uni_raw.get('name', idx)}",
            stats,
        )
        # Legacy path
        uni_data = _prepare_university_data(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        # Carry tier metadata to template data
        uni_data["tier"] = uni_raw.get("tier")
        uni_data["tier_label"] = uni_raw.get("tier_label", "")
        uni_data["tier_group_start"] = uni_raw.get("_tier_group_start", False)
        uni_data["tier_group_label"] = uni_raw.get("_tier_group_label", "")
        university_template_data.append(uni_data)
        # New block path
        uni_block = normalize_university(
            uni_raw, allow_remote, include_inactive_programs, debug, stats,
        )
        university_block_data.append(uni_block)

    # -- Bottom pages --
    bottom_pages_urls = []
    raw_bottom = images.get("bottomPages", [])
    if isinstance(raw_bottom, list):
        for img_path in raw_bottom:
            page_uri = _existing_file_uri(str(img_path))
            if page_uri:
                bottom_pages_urls.append(page_uri)

    # -- Render template --
    if stats["program_option_warnings"]:
        logger.warning(
            "Program option consistency warnings (missing REGULAR or PRIME pair): %s",
            stats["program_option_warnings"],
        )
    html = template.render(
        font_css=Markup(font_css),
        base_url=base_url,
        extra_css="",
        header_image=header_image,
        label_image=label_image,
        cover_image=cover_image,
        toc_image=toc_image,
        toc_items=toc_items,
        toc_items_sorted=toc_items_sorted,
        toc_title=toc_title,
        toc_sort_order=toc_sort_order,
        general_sections=template_sections,
        summary_block=None,
        universities=university_template_data,
        university_blocks=university_block_data,
        bottom_pages=bottom_pages_urls,
        debug=debug,
        stats=stats,
    )
    return html