Spaces:

internationalscholarsprogram
/

handbook_engine

Running

App Files Files Community

handbook_engine / app /services /html_builder.py

internationalscholarsprogram

Fix enrollment rendering parity and header-safe Step 7 layout

a94f84a about 2 months ago

raw

history blame contribute delete

24.3 kB

	"""HTML builder — assembles the full ISP Handbook HTML document.

	Uses Jinja2 templates for HTML generation. Data preparation logic is
	preserved from the original string-concatenation approach. The output
	is a self-contained HTML suitable for Playwright Chromium PDF export.
	"""

	from __future__ import annotations

	import base64
	import logging
	import mimetypes
	import os
	import re
	from pathlib import Path
	from typing import Any

	from jinja2 import Environment, FileSystemLoader, select_autoescape
	from markupsafe import Markup

	from app.core.config import get_settings
	from app.core.fonts import font_face_css, select_font_family
	from app.services.normalizer import normalize_section, normalize_university
	from app.services.renderers import (
	fetch_image_data_uri,
	render_global_blocks,
	sort_toc,
	_extract_university_funding,
	)
	from app.services.utils import (
	format_money_figures,
	get_any,
	h,
	handbook_anchor,
	hb_slug,
	is_truthy,
	sort_sections_stable,
	)

	logger = logging.getLogger(__name__)

	# Jinja2 environment — templates live alongside the app package
	_TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"


	def _get_jinja_env() -> Environment:
	    """Build the Jinja2 environment used to render the handbook templates.

	    Templates are loaded from ``_TEMPLATES_DIR``. Autoescaping is selected
	    for ``html`` templates, and ``trim_blocks`` / ``lstrip_blocks`` are on.
	    """
	    template_loader = FileSystemLoader(str(_TEMPLATES_DIR))
	    return Environment(
	        loader=template_loader,
	        autoescape=select_autoescape(["html"]),
	        trim_blocks=True,
	        lstrip_blocks=True,
	    )


	def _static_base_url() -> str:
	    """Return the absolute ``file://`` URL of the ``static`` directory.

	    The directory is located relative to this module: the parent of the
	    folder containing this file, then ``static``.
	    """
	    module_path = Path(__file__).resolve()
	    package_root = module_path.parent.parent
	    return package_root.joinpath("static").as_uri()


	def _unused_pdf_override_css(font_stack: str) -> str:
	    """Return an empty string in place of the legacy inline PDF override CSS.

	    Kept for reference only; ``font_stack`` is accepted but not used.
	    All styling now lives in static/css/print.css for Chromium rendering.
	    """
	    no_override_css = ""
	    return no_override_css


	# CSS class for each known global section, keyed by lower-case section_key.
	# Keys missing from this map get a class derived from the key by the caller.
	SECTION_CLASS_MAP: dict[str, str] = {
	    "overview": "sec-overview",
	    "how_the_program_works": "sec-how",
	    "qualification_requirements": "sec-qualification",
	    "enrolment_steps": "sec-steps",
	    "withdrawal_refund_policy": "sec-policy",
	    "refund_guidelines": "sec-refund",
	    "program_contributions": "sec-contributions",
	    "program_features_breakdown": "sec-breakdown",
	    "funding_options_available": "sec-funding",
	    "summary_of_universities": "sec-summary",
	    "summary_of_universities_cosigner": "sec-summary-cosigner",
	}

	# Lower-case section keys whose template entry is flagged ``page_break=True``.
	PAGE_BREAK_KEYS: set[str] = {
	    "overview",
	    "how_the_program_works",
	    "qualification_requirements",
	    "enrolment_steps",
	    "withdrawal_refund_policy",
	    "refund_guidelines",
	    "program_contributions",
	    "program_features_breakdown",
	    "funding_options_available",
	    "summary_of_universities",
	    "summary_of_universities_cosigner",
	}


	def _collect_program_option_inconsistencies(value: Any, path: str, hits: list[str]) -> None:
	    """Append to ``hits`` the path of every leaf that names only one option.

	    Dicts and lists are walked recursively; dict children extend the path
	    as ``path.key`` (or just ``key`` when ``path`` is empty) and list
	    children as ``path[index]``. ``None`` leaves are ignored. Any other leaf
	    is converted with ``str`` and reported when exactly one of the whole
	    words REGULAR / PRIME (case-insensitive) occurs in it.
	    """
	    if value is None:
	        return

	    if isinstance(value, dict):
	        for key, child in value.items():
	            child_path = f"{path}.{key}" if path else str(key)
	            _collect_program_option_inconsistencies(child, child_path, hits)
	    elif isinstance(value, list):
	        for index, child in enumerate(value):
	            _collect_program_option_inconsistencies(child, f"{path}[{index}]", hits)
	    else:
	        leaf_text = str(value)
	        mentions_regular = re.search(r"\bREGULAR\b", leaf_text, flags=re.IGNORECASE) is not None
	        mentions_prime = re.search(r"\bPRIME\b", leaf_text, flags=re.IGNORECASE) is not None
	        # Only a lone mention is inconsistent; both or neither is fine.
	        if mentions_regular != mentions_prime:
	            hits.append(path)


	def _str_or_empty(value: Any) -> str:
	    """Return ``str(value)``, but map ``None`` to ``""`` rather than ``"None"``."""
	    return "" if value is None else str(value)


	def _prepare_university_data(
	    uni_raw: dict[str, Any],
	    allow_remote: bool,
	    include_inactive_programs: bool,
	    debug: bool,
	    stats: dict[str, Any],
	) -> dict[str, Any]:
	    """Prepare a single university's template data.

	    Extracts overview, campus caption, benefits, funding, programs, and extra
	    sections from the raw sections list so that the Jinja2 template only has
	    to produce the HTML. ``uni_raw`` and its section dicts are not modified.

	    Args:
	        uni_raw: University record; must contain ``"name"``. ``"sections"``,
	            ``"website"``, ``"anchor"``, ``"sort_order"``,
	            ``"school_category"``, ``"is_active"`` and ``"_is_first"`` are
	            read when present.
	        allow_remote: Not used in this function; campus-image embedding is
	            disabled, so ``campus_image`` is always ``""``.
	        include_inactive_programs: When false, a program is dropped unless
	            its ``program_active`` / ``is_active`` / ``active`` flag (first
	            one present, default ``1``) is truthy.
	        debug: Forwarded to ``render_global_blocks`` for extra sections.
	        stats: Counter dict updated in place (``universities``,
	            ``images_placeholder``, ``university_links``, ``website_rows``).

	    Returns:
	        Dict of template variables for one university. ``overview`` and
	        ``programs`` are ``None`` when the matching section is absent.

	    Raises:
	        KeyError: If ``uni_raw`` has no ``"name"`` key.
	    """
	    uni_name = uni_raw["name"]
	    sections = uni_raw.get("sections", [])
	    if not isinstance(sections, list):
	        sections = []
	    is_first = uni_raw.get("_is_first", False)

	    stats["universities"] = stats.get("universities", 0) + 1

	    # Build section map; merge duplicate "programs" sections into one list.
	    sec_map: dict[str, dict] = {}
	    for section in sections:
	        if not isinstance(section, dict):
	            continue
	        key = str(section.get("section_key", ""))
	        if not key:
	            continue
	        if key == "programs" and key in sec_map:
	            first = sec_map["programs"]
	            existing = first.get("section_json", {})
	            incoming = section.get("section_json", {})
	            # Merge into copies: the caller's section dicts are reused later
	            # (e.g. by the block-normalisation path) and must stay untouched.
	            merged_json = dict(existing) if isinstance(existing, dict) else {}
	            if not isinstance(incoming, dict):
	                incoming = {}
	            head = merged_json.get("programs", [])
	            tail = incoming.get("programs", [])
	            if not isinstance(head, list):
	                head = []
	            if not isinstance(tail, list):
	                tail = []
	            merged_json["programs"] = head + tail
	            sec_map["programs"] = {**first, "section_json": merged_json}
	            continue
	        sec_map[key] = section

	    # Campus image
	    # University campus-image embedding is disabled in the generation path.
	    # Large per-school images were the main source of handbook timeouts in Space.
	    img_section = sec_map.get("campus_image") or sec_map.get("image")
	    campus_image = ""
	    campus_caption = ""
	    if img_section:
	        image_json = img_section.get("section_json", {})
	        if isinstance(image_json, dict):
	            campus_caption = _str_or_empty(image_json.get("caption")).strip()
	        # NOTE(review): the reviewed source had lost its indentation; this
	        # counter is assumed to run once per image section -- confirm.
	        stats["images_placeholder"] = stats.get("images_placeholder", 0) + 1

	    # Overview and website
	    resolved_website = str(uni_raw.get("website") or "").strip()
	    overview_data = None

	    if "overview" in sec_map:
	        overview_json = sec_map["overview"].get("section_json", {})
	        if not isinstance(overview_json, dict):
	            overview_json = {}

	        site_from_overview = get_any(
	            overview_json,
	            ["university_website", "university_website_url", "website", "site", "url", "homepage", "web_url"],
	        )
	        # The record-level website wins; the overview value is only a fallback.
	        if not resolved_website and site_from_overview:
	            resolved_website = site_from_overview

	        tuition_raw = get_any(overview_json, [
	            "tuition_out_of_state_yearly",
	            "Yearly Out of State Tuition Fees",
	            "Yearly Out-of-State Tuition Fees",
	            "Yearly Tuition Fees",
	            "Yearly Out-of-State Tuition Fees:",
	        ])
	        overview_data = {
	            "founded": get_any(overview_json, ["founded", "Founded"]),
	            "total_students": get_any(overview_json, ["total_students", "Total Students"]),
	            "undergraduates": get_any(overview_json, ["undergraduates", "Undergraduate Students", "undergraduate_students"]),
	            "postgraduates": get_any(overview_json, ["postgraduate_students", "Postgraduate Students"]),
	            "acceptance_rate": get_any(overview_json, ["acceptance_rate", "Acceptance Rate"]),
	            "location": get_any(overview_json, ["location", "Location"]),
	            "tuition": format_money_figures(str(tuition_raw or "")) or None,
	        }

	    # NOTE(review): assumed to apply whether or not an overview section
	    # exists (indentation was lost in the reviewed source) -- confirm.
	    if resolved_website:
	        stats["university_links"] = stats.get("university_links", 0) + 1
	        stats["website_rows"] = stats.get("website_rows", 0) + 1

	    # Benefits + Funding
	    benefits: list[str] = []
	    funding_heading = "Funding Available"
	    funding_items: list[str] = []

	    if "benefits" in sec_map:
	        benefits_json = sec_map["benefits"].get("section_json", {})
	        if not isinstance(benefits_json, dict):
	            benefits_json = {}

	        raw_benefits = benefits_json.get("benefits", [])
	        if isinstance(raw_benefits, list):
	            cleaned = (_str_or_empty(item).strip() for item in raw_benefits)
	            benefits = [text for text in cleaned if text]

	        funding_heading, funding_items = _extract_university_funding(
	            benefits_json,
	            {
	                "school_category": uni_raw.get("school_category"),
	                "status": "in" if is_truthy(uni_raw.get("is_active", True)) else "out",
	            },
	        )

	    # Programs
	    programs = None
	    if "programs" in sec_map:
	        programs_json = sec_map["programs"].get("section_json", {})
	        if not isinstance(programs_json, dict):
	            programs_json = {}
	        programs_raw = programs_json.get("programs", [])
	        if not isinstance(programs_raw, list):
	            programs_raw = []

	        programs = []
	        seen_names: set[str] = set()
	        for program in programs_raw:
	            if not isinstance(program, dict):
	                continue
	            if not include_inactive_programs and not is_truthy(
	                program.get("program_active", program.get("is_active", program.get("active", 1)))
	            ):
	                continue

	            program_name = _str_or_empty(program.get("program_name")).strip()
	            # Deduplicate by lowercase program name (first occurrence wins).
	            dedupe_key = program_name.lower()
	            if dedupe_key in seen_names:
	                continue
	            seen_names.add(dedupe_key)

	            # A null program_link must not become the string "None", or the
	            # program_links.web_link fallback below would never be tried.
	            link = _str_or_empty(program.get("program_link")).strip()
	            link_map = program.get("program_links")
	            if not link and isinstance(link_map, dict):
	                link = _str_or_empty(link_map.get("web_link")).strip()

	            entrance = program.get("entrance_exam")
	            if entrance is None:
	                entrance = program.get("entrance_examination")

	            programs.append({
	                "name": program_name,
	                "link": link,
	                "designation": _str_or_empty(program.get("designation")),
	                "entrance": _str_or_empty(entrance),
	            })

	    # Extra sections: everything not handled above, rendered to HTML here.
	    skip_keys = {"campus_image", "image", "overview", "benefits", "programs"}
	    extra_sections = []
	    for section in sections:
	        if not isinstance(section, dict):
	            continue
	        key = str(section.get("section_key", ""))
	        if not key or key in skip_keys:
	            continue
	        title = _str_or_empty(section.get("section_title"))
	        extra_json = section.get("section_json", {})
	        if not isinstance(extra_json, dict):
	            extra_json = {}
	        rendered = render_global_blocks(key, title, extra_json, debug)
	        extra_sections.append({"rendered_html": Markup(rendered)})

	    # Every university except the first starts on a new page.
	    classes = ["uni"]
	    if not is_first:
	        classes.append("page-break")

	    return {
	        "name": uni_name,
	        "anchor": uni_raw.get("anchor"),
	        "sort_order": uni_raw.get("sort_order"),
	        "website": resolved_website,
	        "classes": classes,
	        "overview": overview_data,
	        "campus_image": campus_image,
	        "campus_caption": campus_caption,
	        "benefits": benefits,
	        "funding_heading": funding_heading,
	        "funding_items": funding_items,
	        "programs": programs,
	        "extra_sections": extra_sections,
	    }


	# Explicit university display order; names are matched case-insensitively.
	_UNIVERSITY_ORDER: tuple[str, ...] = (
	    "Indiana University of Pennsylvania",
	    "Missouri State University",
	    "University of Louisville",
	    "University of Delaware",
	    "Grand Valley State University",
	    "Quinnipiac University",
	    "William Jessup University",
	    "Wilkes University",
	    "University of South Dakota",
	    "California Baptist University",
	    "Illinois State University",
	    "Virginia Commonwealth University",
	    "Rutgers University-Camden",
	    "University of Oklahoma",
	    "Saint Louis University",
	    "University of Alabama at Birmingham",
	    "Oregon State University",
	    "Rochester Institute of Technology",
	    "Lewis University",
	    "Texas State University",
	    "Drew University",
	    "University of Missouri- Saint Louis",
	    "Montana State University",
	    "Oklahoma City University",
	    "University of Dayton",
	    "Webster University",
	    "Rockhurst University",
	)
	_UNIVERSITY_ORDER_INDEX = {name.lower().strip(): idx for idx, name in enumerate(_UNIVERSITY_ORDER)}

	# Remote label image used when no local label file is available.
	_LABEL_IMAGE_FALLBACK_URL = "https://finsapdev.qhtestingserver.com/MODEL_APIS/handbook/images/label.jpeg"


	def _parse_sort_order(value: Any) -> "int | None":
	    """Return ``value`` as an int when it looks like a whole number, else ``None``."""
	    if value is None:
	        return None
	    if not str(value).lstrip("-").isdigit():
	        return None
	    try:
	        return int(value)
	    except (TypeError, ValueError):
	        # Inputs such as "--5" or superscript digits pass the isdigit()
	        # screen above but are not valid integers.
	        return None


	def _existing_file_uri(path: Any) -> str:
	    """Return a ``file://`` URI for ``path`` if it is an existing file, else ``""``."""
	    if not path or not os.path.isfile(path):
	        return ""
	    # as_uri() raises ValueError for relative paths, so anchor the path first.
	    return Path(path).absolute().as_uri()


	def _embed_file_as_data_uri(path: Any) -> str:
	    """Return the file at ``path`` as a base64 ``data:`` URI, or ``""`` if it is not a file."""
	    if not path or not os.path.isfile(path):
	        return ""
	    mime = mimetypes.guess_type(path)[0] or "image/jpeg"
	    with open(path, "rb") as fh:
	        payload = base64.b64encode(fh.read()).decode()
	    return f"data:{mime};base64,{payload}"


	def build_handbook_html(
	    globals_data: list[dict[str, Any]],
	    by_uni: dict[int, dict[str, Any]],
	    images: dict[str, Any],
	    allow_remote: bool,
	    include_inactive_programs: bool = False,
	    debug: bool = False,
	) -> str:
	    """Build the full handbook HTML document using Jinja2 templates.

	    Prepares the table of contents, global sections and per-university data,
	    then renders the ``handbook.html`` template.

	    Args:
	        globals_data: Global section records (``section_key``,
	            ``section_title``, ``section_json``, ``sort_order``).
	        by_uni: University records keyed by university id; inactive and
	            non-dict entries are skipped.
	        images: Local image paths under ``coverImage``, ``tocImage``,
	            ``headerImage``, ``labelImage`` and ``bottomPages`` (a list).
	            Paths that are not existing files are ignored.
	        allow_remote: Forwarded to the per-university preparation helpers.
	        include_inactive_programs: Forwarded to the per-university helpers.
	        debug: Forwarded to the section renderers and to the template.

	    Returns:
	        The rendered HTML document.

	    Raises:
	        RuntimeError: If any of the required global sections
	            (``table_of_contents``, ``overview``, ``how_the_program_works``)
	            is missing from ``globals_data``.
	    """
	    env = _get_jinja_env()
	    template = env.get_template("handbook.html")

	    font_meta = select_font_family()
	    font_css = font_face_css(font_meta)

	    # Base URL for static assets (CSS, images, etc.)
	    base_url = _static_base_url()

	    stats: dict[str, Any] = {
	        "universities": 0,
	        "images_embedded": 0,
	        "images_placeholder": 0,
	        "program_links_total": 0,
	        "program_missing_links_total": 0,
	        "missing_program_links": {},
	        "university_links": 0,
	        "website_rows": 0,
	        "program_option_warnings": [],
	    }

	    def _note_option_warnings(hits: list[str]) -> None:
	        """Record each new REGULAR/PRIME inconsistency path once."""
	        known = stats["program_option_warnings"]
	        for hit in hits:
	            if hit not in known:
	                known.append(hit)

	    # ── Cover / TOC images are referenced by file URI ──
	    cover_image = _existing_file_uri(images.get("coverImage", ""))
	    toc_image = _existing_file_uri(images.get("tocImage", ""))

	    # ── Header and label images (repeating page furniture) are embedded ──
	    header_image = _embed_file_as_data_uri(images.get("headerImage", ""))
	    # NOTE(review): the remote fallback is used regardless of allow_remote.
	    label_image = _embed_file_as_data_uri(images.get("labelImage", "")) or _LABEL_IMAGE_FALLBACK_URL

	    # ── Prepare active universities ──
	    active_universities: list[dict[str, Any]] = []
	    for uid, uni in by_uni.items():
	        if not isinstance(uni, dict):
	            continue
	        if not is_truthy(uni.get("is_active", True)):
	            continue
	        raw_name = uni.get("university_name")
	        name = str(raw_name) if raw_name is not None else f"University #{uid}"
	        raw_sections = uni.get("sections")
	        active_universities.append({
	            "id": int(uid),
	            "anchor": handbook_anchor("uni", name, int(uid)),
	            "name": name,
	            "sections": raw_sections if isinstance(raw_sections, list) else [],
	            # ``or ""`` keeps null values from turning into the text "None".
	            "website": str(uni.get("website") or ""),
	            "sort_order": _parse_sort_order(uni.get("sort_order")),
	            "school_category": str(uni.get("school_category") or "").strip(),
	            "tier": uni.get("tier"),
	            "tier_label": str(uni.get("tier_label") or "").strip(),
	        })

	    def _tier_sort(u: dict) -> tuple:
	        name_lower = (u.get("name") or "").lower().strip()
	        explicit = _UNIVERSITY_ORDER_INDEX.get(name_lower)
	        if explicit is not None:
	            return (0, explicit, 0)
	        # Universities not in the explicit list go after, sorted by tier then alpha
	        tier = u.get("tier")
	        rank = tier if isinstance(tier, int) else 99
	        return (1, rank, name_lower, u.get("id", 0))

	    active_universities.sort(key=_tier_sort)

	    # ── Normalise globals ──
	    globals_data = sort_sections_stable(globals_data)

	    required_keys = [
	        "table_of_contents",
	        "overview",
	        "how_the_program_works",
	    ]
	    existing_keys = {str(g.get("section_key", "")).lower() for g in globals_data if isinstance(g, dict)}
	    missing = [k for k in required_keys if k not in existing_keys]
	    if missing:
	        msg = f"Handbook required sections missing: {','.join(missing)}"
	        logger.error(msg)
	        raise RuntimeError(msg)

	    general_sections: list[dict[str, Any]] = []
	    toc_sort_order = None
	    toc_title = "Table of Contents"

	    for idx, g in enumerate(globals_data):
	        if not isinstance(g, dict):
	            continue
	        key_raw = str(g.get("section_key", ""))
	        key = key_raw.lower()
	        sort_order = _parse_sort_order(g.get("sort_order"))

	        # The first TOC section only contributes its title and position.
	        if key == "table_of_contents" and toc_sort_order is None:
	            toc_sort_order = sort_order if sort_order is not None else (idx + 1)
	            toc_title = str(g.get("section_title", "Table of Contents"))
	            continue

	        section_hits: list[str] = []
	        _collect_program_option_inconsistencies(
	            g.get("section_json", {}),
	            f"global.{key_raw}",
	            section_hits,
	        )
	        _note_option_warnings(section_hits)

	        anchor = handbook_anchor("g", str(g.get("section_title", g.get("section_key", "section"))), idx)
	        general_sections.append({
	            "anchor": anchor,
	            "data": g,
	            "sort_order": sort_order,
	        })

	    # ── Build TOC items ──
	    toc_items: list[dict[str, Any]] = []
	    for gs in general_sections:
	        # Prefer the JSON-level title (display-ready) over the DB section_title
	        gs_json = gs["data"].get("section_json", {})
	        json_title = gs_json.get("title") if isinstance(gs_json, dict) else None
	        # A null or non-string title falls back to the DB title instead of crashing.
	        json_title = json_title.strip() if isinstance(json_title, str) else ""
	        if json_title:
	            title = json_title
	        else:
	            title = str(gs["data"].get("section_title", gs["data"].get("section_key", "Section")))
	        toc_items.append({
	            "title": title,
	            "target": "#" + gs["anchor"],
	            "level": 0,
	            "bold": True,
	            "sort": gs["sort_order"],
	        })

	    for u in active_universities:
	        toc_items.append({
	            "title": u["name"],
	            "target": "#" + u["anchor"],
	            "level": 1,
	            "bold": False,
	            "sort": u.get("sort_order"),
	        })

	    # ── Prepare sorted TOC items for template ──
	    toc_items_sorted = []
	    for entry in sort_toc(list(toc_items)):
	        if not isinstance(entry, dict):
	            continue
	        title = str(entry.get("title", "")).strip()
	        if not title:
	            continue
	        level = max(0, min(3, int(entry.get("level", 0))))
	        bold = bool(entry.get("bold", False))
	        upper = bool(entry.get("upper", False))
	        # Top-level entries are always bold and upper-cased.
	        if level == 0:
	            bold = True
	            upper = True

	        toc_items_sorted.append({
	            "title": title,
	            "display_title": title.upper() if upper else title,
	            "target": str(entry.get("target", entry.get("anchor", ""))).strip(),
	            "level": level,
	            "bold": bold,
	            "upper": upper,
	            "page": str(entry.get("page", "")).strip(),
	        })

	    # ── Prepare general sections with rendered HTML and typed blocks ──
	    template_sections = []
	    for gs in general_sections:
	        data = gs["data"]
	        section_key = str(data.get("section_key", ""))
	        section_title = str(data.get("section_title", ""))
	        key_lower = section_key.lower()

	        sec_class = SECTION_CLASS_MAP.get(key_lower)
	        if sec_class is None:
	            sec_class = "sec-" + re.sub(r"[^a-z0-9]+", "-", key_lower)

	        section_json = data.get("section_json", {})
	        if not isinstance(section_json, dict):
	            section_json = {}

	        # Typed blocks for the new rendering path
	        blocks = normalize_section(section_key, section_title, section_json, debug=debug)

	        # Legacy HTML fallback
	        section_html = render_global_blocks(section_key, section_title, section_json, debug)

	        if not section_html.strip() and not blocks:
	            logger.warning(
	                "Empty section render key=%s sort_order=%s",
	                data.get("section_key"),
	                data.get("sort_order"),
	            )

	        template_sections.append({
	            "anchor": gs["anchor"],
	            "data": data,
	            "page_break": key_lower in PAGE_BREAK_KEYS,
	            "sec_class": sec_class,
	            "blocks": blocks,
	            "rendered_html": Markup(section_html),
	        })

	    # ── Prepare university data for templates (both old + new paths) ──
	    university_template_data = []
	    university_block_data = []
	    # Tier labels already emitted, so each tier heading is inserted only once.
	    seen_tier_labels: set[str] = set()

	    for idx, uni_raw in enumerate(active_universities):
	        uni_raw["_is_first"] = (idx == 0)

	        # Mark the first university of each tier label as the group start.
	        current_tier_label = str(uni_raw.get("tier_label", "")).strip()
	        if current_tier_label and current_tier_label not in seen_tier_labels:
	            seen_tier_labels.add(current_tier_label)
	            uni_raw["_tier_group_start"] = True
	            uni_raw["_tier_group_label"] = f"{current_tier_label} Schools"

	        uni_hits: list[str] = []
	        _collect_program_option_inconsistencies(
	            uni_raw.get("sections", []),
	            f"university.{uni_raw.get('name', idx)}",
	            uni_hits,
	        )
	        _note_option_warnings(uni_hits)

	        # Legacy path
	        uni_data = _prepare_university_data(
	            uni_raw, allow_remote, include_inactive_programs, debug, stats,
	        )
	        # Carry tier metadata to template data
	        uni_data["tier"] = uni_raw.get("tier")
	        uni_data["tier_label"] = uni_raw.get("tier_label", "")
	        uni_data["tier_group_start"] = uni_raw.get("_tier_group_start", False)
	        uni_data["tier_group_label"] = uni_raw.get("_tier_group_label", "")
	        university_template_data.append(uni_data)

	        # New block path
	        uni_block = normalize_university(
	            uni_raw, allow_remote, include_inactive_programs, debug, stats,
	        )
	        university_block_data.append(uni_block)

	    # ── Bottom pages ──
	    bottom_pages_urls = []
	    raw_bottom = images.get("bottomPages", [])
	    if isinstance(raw_bottom, list):
	        for img_path in raw_bottom:
	            page_uri = _existing_file_uri(str(img_path))
	            if page_uri:
	                bottom_pages_urls.append(page_uri)

	    # ── Render template ──
	    if stats["program_option_warnings"]:
	        logger.warning(
	            "Program option consistency warnings (missing REGULAR or PRIME pair): %s",
	            stats["program_option_warnings"],
	        )

	    html = template.render(
	        font_css=Markup(font_css),
	        base_url=base_url,
	        extra_css="",
	        header_image=header_image,
	        label_image=label_image,
	        cover_image=cover_image,
	        toc_image=toc_image,
	        toc_items=toc_items,
	        toc_items_sorted=toc_items_sorted,
	        toc_title=toc_title,
	        toc_sort_order=toc_sort_order,
	        general_sections=template_sections,
	        summary_block=None,
	        universities=university_template_data,
	        university_blocks=university_block_data,
	        bottom_pages=bottom_pages_urls,
	        debug=debug,
	        stats=stats,
	    )

	    return html