| | |
| | """ |
| | Train PosterSentry on the real posters.science corpus. |
| | |
| | Data sources (all real, zero synthetic): |
| | Positive (poster): |
| | 28K+ verified scientific posters from Zenodo & Figshare |
| | /home/joneill/Nextcloud/vaults/jmind/calmi2/poster_science/poster-pdf-meta/downloads/ |
| | |
| | Negative (non_poster): |
| | 2,036 verified non-posters (multi-page docs, proceedings, abstracts) |
| | Listed in: poster_classifier/non_posters_20251208_152217.txt |
| | |
| | Plus: single pages extracted from armanc/scientific_papers (real papers) |
| | Plus: ag_news articles (real junk text, rendered to match) |
| | |
| | Usage: |
| | cd /home/joneill/pubverse_brett/poster_sentry |
| | pip install -e ".[train]" |
| | python scripts/train_poster_sentry.py --n-per-class 5000 |
| | """ |
| |
|
| | import argparse |
| | import json |
| | import logging |
| | import os |
| | import random |
| | import sys |
| | import time |
| | from pathlib import Path |
| | from typing import Dict, List, Optional, Tuple |
| |
|
| | import numpy as np |
| |
|
# Console logging: timestamped INFO-level messages for the whole run.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Fixed seed so path shuffling and class downsampling below are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
| |
|
| | |
| |
|
# --- Corpus locations (all real data; see module docstring) ---
POSTER_SCIENCE_BASE = Path(
    "/home/joneill/Nextcloud/vaults/jmind/calmi2/poster_science"
)
# Verified poster PDFs downloaded from Zenodo & Figshare.
DOWNLOADS_DIR = POSTER_SCIENCE_BASE / "poster-pdf-meta" / "downloads"
# Text file, one non-poster PDF path per line (fallback negative source).
NON_POSTERS_LIST = (
    POSTER_SCIENCE_BASE
    / "poster_classifier"
    / "non_posters_20251208_152217.txt"
)
# Classification results JSON; its "posters" key lists verified poster entries
# with a "pdf_path" field (paths may need _fix_path(), see below).
CLASSIFICATION_JSON = (
    POSTER_SCIENCE_BASE
    / "poster_classifier"
    / "classification_results_20251208_152217.json"
)
| |
|
| |
|
| | def _fix_path(p: str) -> str: |
| | """Fix paths from classification JSON β they use /home/joneill/vaults/ |
| | but the actual Nextcloud mount is /home/joneill/Nextcloud/vaults/.""" |
| | if "/joneill/vaults/" in p and "/Nextcloud/" not in p: |
| | return p.replace("/joneill/vaults/", "/joneill/Nextcloud/vaults/") |
| | return p |
| |
|
| |
|
def collect_poster_paths(max_n: int = 10000) -> List[str]:
    """Return up to *max_n* shuffled paths to verified poster PDFs.

    Prefers the classification-results JSON (keeping only entries whose
    files still exist on disk); falls back to recursively globbing the
    downloads directory for .pdf/.PDF files.
    """
    if CLASSIFICATION_JSON.exists():
        logger.info(f"Loading classification results from {CLASSIFICATION_JSON}")
        with open(CLASSIFICATION_JSON) as f:
            results = json.load(f)
        fixed = (_fix_path(entry["pdf_path"]) for entry in results.get("posters", []))
        paths = [p for p in fixed if Path(p).exists()]
        logger.info(f" Found {len(paths)} verified poster paths")
    else:
        logger.info(f"Globbing {DOWNLOADS_DIR} for PDFs...")
        paths = []
        for pattern in ("*.pdf", "*.PDF"):
            paths.extend(str(p) for p in DOWNLOADS_DIR.rglob(pattern))
        logger.info(f" Found {len(paths)} PDFs")

    random.shuffle(paths)
    return paths[:max_n]
| |
|
| |
|
def collect_non_poster_paths(max_n: int = 2000) -> List[str]:
    """Return up to *max_n* shuffled paths to verified non-poster PDFs.

    The non-posters were separated into:
        poster-pdf-meta/separated_non_posters/downloads/{zenodo,figshare}/
    If that tree is missing, falls back to the line-per-path text list,
    keeping only entries that still exist on disk.
    """
    sep_dir = POSTER_SCIENCE_BASE / "poster-pdf-meta" / "separated_non_posters" / "downloads"

    paths: List[str] = []
    if sep_dir.exists():
        for pattern in ("*.pdf", "*.PDF"):
            paths.extend(str(pdf) for pdf in sep_dir.rglob(pattern))
        logger.info(f" Found {len(paths)} non-poster PDFs in {sep_dir}")
    else:
        logger.info(" Separated dir not found, trying original list...")
        if NON_POSTERS_LIST.exists():
            with open(NON_POSTERS_LIST) as f:
                candidates = [_fix_path(line.strip()) for line in f]
            paths = [p for p in candidates if p and Path(p).exists()]
            logger.info(f" Found {len(paths)} verified non-poster paths from list")

    random.shuffle(paths)
    return paths[:max_n]
| |
|
| |
|
def extract_features_from_pdfs(
    pdf_paths: List[str],
    label: int,
    text_model,
    visual_ext,
    structural_ext,
    max_text_chars: int = 4000,
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Extract multimodal features from a list of PDFs.

    Unreadable, empty, or near-textless PDFs are skipped (best-effort sweep
    over a large corpus); per-file errors are logged at DEBUG level only.

    Args:
        pdf_paths: candidate PDF file paths.
        label: class label assigned to every successful sample (1 = poster).
        text_model: embedding model exposing .encode(list[str]).
        visual_ext: visual feature extractor (pdf_to_image/extract/to_vector).
        structural_ext: PDF structural feature extractor (extract/to_vector).
        max_text_chars: first-page text is truncated to this many characters.

    Returns (X, y, extracted_texts) where:
        X: (N, 542) feature matrix
        y: (N,) labels
        extracted_texts: list of extracted text strings (for PubGuard reuse)
    """
    from tqdm import tqdm
    import fitz
    import re

    visual_vecs = []
    struct_vecs = []
    texts_out = []
    labels = []

    for pdf_path in tqdm(pdf_paths, desc=f"{'poster' if label == 1 else 'non_poster'}"):
        try:
            # BUGFIX: the original called doc.close() only on the success
            # path, so a raise inside get_text() leaked the document handle
            # (the broad except below swallowed the error). The context
            # manager guarantees the handle is closed on every path.
            with fitz.open(pdf_path) as doc:
                if len(doc) == 0:
                    continue
                text = doc[0].get_text()
            text = re.sub(r"\s+", " ", text).strip()[:max_text_chars]

            # Skip pages with almost no extractable text (scans, blanks).
            if len(text) < 20:
                continue

            # Visual features from a rasterized first page; zero-fill when
            # rasterization is unavailable so the vector width stays fixed.
            img = visual_ext.pdf_to_image(pdf_path)
            if img is not None:
                vf = visual_ext.extract(img)
            else:
                vf = {n: 0.0 for n in visual_ext.FEATURE_NAMES}

            # Structural features read directly from the PDF file.
            sf = structural_ext.extract(pdf_path)

            texts_out.append(text)
            visual_vecs.append(visual_ext.to_vector(vf))
            struct_vecs.append(structural_ext.to_vector(sf))
            labels.append(label)

        except Exception as e:
            # Corrupt/unreadable PDFs are expected in a scraped corpus.
            logger.debug(f"Skipping {pdf_path}: {e}")
            continue

    if not texts_out:
        return np.array([]), np.array([]), []

    # Embed all texts in one batch, then L2-normalize each row
    # (zero-norm rows are divided by 1 to avoid NaNs).
    logger.info(f"Embedding {len(texts_out)} texts...")
    emb = text_model.encode(texts_out, show_progress_bar=True)
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1, norms)
    emb = (emb / norms).astype("float32")

    visual_arr = np.array(visual_vecs, dtype="float32")
    struct_arr = np.array(struct_vecs, dtype="float32")

    # Final layout: [text embedding | visual | structural] per row.
    X = np.concatenate([emb, visual_arr, struct_arr], axis=1)
    y = np.array(labels)

    return X, y, texts_out
| |
|
| |
|
def main():
    """Train PosterSentry end-to-end: collect poster/non-poster PDFs,
    extract multimodal features, fit a logistic-regression head, save the
    head + scaler, and run a quick smoke test of the saved artifacts."""
    parser = argparse.ArgumentParser(description="Train PosterSentry")
    parser.add_argument("--n-per-class", type=int, default=5000,
                        help="Max samples per class (poster/non_poster)")
    parser.add_argument("--test-size", type=float, default=0.15)
    parser.add_argument("--models-dir", default=None)
    parser.add_argument("--export-texts", default=None,
                        help="Export extracted texts as NDJSON for PubGuard retraining")
    args = parser.parse_args()

    # Heavy ML/project imports deferred until after argument parsing.
    from model2vec import StaticModel
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from poster_sentry.features import VisualFeatureExtractor, PDFStructuralExtractor

    # Resolve the output directory (default: ~/.poster_sentry/models).
    if args.models_dir:
        models_dir = Path(args.models_dir)
    else:
        models_dir = Path.home() / ".poster_sentry" / "models"
    models_dir.mkdir(parents=True, exist_ok=True)

    # Load the static embedding model, caching it locally on first use so
    # later runs do not need to re-download it.
    logger.info("Loading model2vec...")
    emb_cache = models_dir / "poster-sentry-embedding"
    if emb_cache.exists():
        text_model = StaticModel.from_pretrained(str(emb_cache))
    else:
        text_model = StaticModel.from_pretrained("minishlab/potion-base-32M")
        emb_cache.parent.mkdir(parents=True, exist_ok=True)
        text_model.save_pretrained(str(emb_cache))

    visual_ext = VisualFeatureExtractor()
    structural_ext = PDFStructuralExtractor()

    # --- Data collection ---
    logger.info("=" * 60)
    logger.info("Collecting training data...")
    logger.info("=" * 60)

    poster_paths = collect_poster_paths(max_n=args.n_per_class)
    non_poster_paths = collect_non_poster_paths(max_n=args.n_per_class)

    logger.info(f"Poster PDFs to process: {len(poster_paths)}")
    logger.info(f"Non-poster PDFs to process: {len(non_poster_paths)}")

    # --- Feature extraction: positives (label=1) ---
    logger.info("=" * 60)
    logger.info("Extracting features from poster PDFs...")
    logger.info("=" * 60)

    X_pos, y_pos, texts_pos = extract_features_from_pdfs(
        poster_paths, label=1, text_model=text_model,
        visual_ext=visual_ext, structural_ext=structural_ext,
    )

    logger.info(f"Poster features: {X_pos.shape}")

    # --- Feature extraction: negatives (label=0) ---
    logger.info("=" * 60)
    logger.info("Extracting features from non-poster PDFs...")
    logger.info("=" * 60)

    X_neg, y_neg, texts_neg = extract_features_from_pdfs(
        non_poster_paths, label=0, text_model=text_model,
        visual_ext=visual_ext, structural_ext=structural_ext,
    )

    logger.info(f"Non-poster features: {X_neg.shape}")

    # --- Class balancing: randomly downsample the larger class ---
    min_count = min(len(y_pos), len(y_neg))
    logger.info(f"Balancing: {min_count} samples per class")

    if len(y_pos) > min_count:
        idx = np.random.choice(len(y_pos), min_count, replace=False)
        X_pos = X_pos[idx]
        y_pos = y_pos[idx]
        texts_pos = [texts_pos[i] for i in idx]

    if len(y_neg) > min_count:
        idx = np.random.choice(len(y_neg), min_count, replace=False)
        X_neg = X_neg[idx]
        y_neg = y_neg[idx]
        texts_neg = [texts_neg[i] for i in idx]

    X = np.vstack([X_pos, X_neg])
    y = np.concatenate([y_pos, y_neg])

    logger.info(f"Total training data: {X.shape} (poster={sum(y)}, non_poster={len(y)-sum(y)})")

    # Optional NDJSON export of the (post-balancing) extracted texts.
    if args.export_texts:
        export_path = Path(args.export_texts)
        export_path.parent.mkdir(parents=True, exist_ok=True)
        with open(export_path, "w") as f:
            for text in texts_pos:
                f.write(json.dumps({"text": text, "label": "poster"}) + "\n")
            for text in texts_neg:
                f.write(json.dumps({"text": text, "label": "non_poster"}) + "\n")
        logger.info(f"Exported {len(texts_pos) + len(texts_neg)} texts to {export_path}")

    from sklearn.preprocessing import StandardScaler

    # --- Feature scaling (mean/scale are saved with the head below) ---
    logger.info("=" * 60)
    logger.info("Scaling features (StandardScaler)")
    logger.info("=" * 60)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Sanity check: mean per-modality variance after scaling.
    # Column layout: 0-511 text embedding, 512-526 visual, 527+ structural.
    emb_var = np.mean(np.var(X_scaled[:, :512], axis=0))
    vis_var = np.mean(np.var(X_scaled[:, 512:527], axis=0))
    str_var = np.mean(np.var(X_scaled[:, 527:], axis=0))
    logger.info(f" Mean variance β text: {emb_var:.3f} visual: {vis_var:.3f} structural: {str_var:.3f}")

    # --- Train/test split and classifier fit ---
    logger.info("=" * 60)
    logger.info("Training PosterSentry classifier")
    logger.info("=" * 60)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X_scaled, y, test_size=args.test_size, stratify=y, random_state=SEED,
    )

    logger.info(f"Train: {X_tr.shape[0]:,} | Test: {X_te.shape[0]:,}")
    logger.info(f"Features: {X_tr.shape[1]} (512 text + 15 visual + 15 structural)")

    clf = LogisticRegression(
        C=1.0, max_iter=1000, class_weight="balanced",
        solver="lbfgs", n_jobs=1, random_state=SEED,
    )

    t0 = time.time()
    clf.fit(X_tr, y_tr)
    elapsed = time.time() - t0
    logger.info(f"Trained in {elapsed:.1f}s")

    # --- Held-out evaluation ---
    y_pred = clf.predict(X_te)
    labels = ["non_poster", "poster"]
    report = classification_report(y_te, y_pred, target_names=labels, digits=4)
    logger.info(f"\n{report}")

    # Log the most influential features for interpretability.
    coef = clf.coef_[0]
    all_names = (
        [f"emb_{i}" for i in range(512)]
        + list(VisualFeatureExtractor.FEATURE_NAMES)
        + list(PDFStructuralExtractor.FEATURE_NAMES)
    )
    top_idx = np.argsort(np.abs(coef))[-15:][::-1]
    logger.info("Top 15 features by |coefficient|:")
    for idx in top_idx:
        logger.info(f" {all_names[idx]:30s} coef={coef[idx]:+.4f}")

    # Export the head: sklearn's binary fit yields one coefficient row, so
    # expand it into two per-class columns (non_poster = -coef, poster = +coef).
    if clf.coef_.shape[0] == 1:
        W = np.vstack([-clf.coef_[0], clf.coef_[0]]).T.astype("float32")
        b = np.array([-clf.intercept_[0], clf.intercept_[0]], dtype="float32")
    else:
        W = clf.coef_.T.astype("float32")
        b = clf.intercept_.astype("float32")

    # Save weights, bias, label order, and the scaler statistics together.
    head_path = models_dir / "poster_sentry_head.npz"
    np.savez(
        head_path, W=W, b=b, labels=np.array(labels),
        scaler_mean=scaler.mean_.astype("float32"),
        scaler_scale=scaler.scale_.astype("float32"),
    )
    logger.info(f"Saved classifier head + scaler β {head_path}")

    # --- Smoke test: classify a few sample PDFs via the saved artifacts ---
    # NOTE(review): these paths likely overlap the training set, so this is
    # a wiring check, not an accuracy measurement.
    logger.info("\n" + "=" * 60)
    logger.info("SMOKE TEST")
    logger.info("=" * 60)

    from poster_sentry import PosterSentry

    sentry = PosterSentry(models_dir=models_dir)
    sentry.initialize()

    test_pdfs = poster_paths[:2] + non_poster_paths[:2]
    for p in test_pdfs:
        try:
            result = sentry.classify(p)
            icon = "π" if result["is_poster"] else "π"
            print(f" {icon} {Path(p).name[:60]:60s} poster={result['is_poster']} conf={result['confidence']:.3f}")
        except Exception as e:
            print(f" β οΈ {Path(p).name[:60]}: {e}")

    logger.info(f"\nDone! Model saved to: {models_dir}")
| |
|
| |
|
# Script entry point.
if __name__ == "__main__":
    main()
| |
|