Spaces:

Daniel0315
/

cithub_website

Sleeping

File size: 54,127 Bytes

d014856

from __future__ import annotations

import base64
import os
from pathlib import Path
from typing import List

import pandas as pd
import networkx as nx
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
from pyvis.network import Network
import streamlit.components.v1 as components

# Hugging Face dataset repo id, taken from the environment. When it is empty
# the loaders in this file read parquet files from a local directory instead
# (see the `if HF_REPO_ID` branches).
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")

def csv_download_link(data: bytes, filename: str, label: str) -> None:
    """Render a styled HTML anchor that downloads ``data`` as a CSV file.

    The payload is embedded in the link itself as a base64 ``data:`` URI and
    written to the page with ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        data: Raw CSV bytes to embed.
        filename: Suggested file name for the browser's download dialog. It is
            HTML-escaped so that quotes or ``&`` cannot break the attribute.
        label: Link text. Inserted as-is, since callers may pass markup or
            entities here (NOTE(review): confirm against call sites).
    """
    from html import escape  # local import keeps this fix self-contained

    b64 = base64.b64encode(data).decode()
    safe_name = escape(filename, quote=True)
    st.markdown(
        f'<a href="data:text/csv;base64,{b64}" download="{safe_name}" '
        f'style="display:block;text-align:center;padding:8px 12px;'
        f'background:#1e293b;color:white;border-radius:8px;'
        f'text-decoration:none;font-size:14px;width:100%;box-sizing:border-box;">'
        f'{label}</a>',
        unsafe_allow_html=True,
    )
# Optional Hugging Face access token; an empty value is passed on as None by
# _hf_download.
HF_TOKEN   = os.environ.get("HF_TOKEN", "")

st.set_page_config(page_title="CitationHub", page_icon="📚", layout="wide")

# Citation intents the app works with; queries filter on this set.
ALLOWED_INTENTS = [
    "background","uses","similarities","motivation",
    "differences","future_work","extends",
]
# Edge color per citation intent (used by pyvis_citation_graph).
INTENT_COLORS = {
    "background":"#94a3b8","uses":"#22c55e","similarities":"#3b82f6",
    "motivation":"#f59e0b","differences":"#ef4444",
    "future_work":"#8b5cf6","extends":"#06b6d4",
}
# Node fill colors used by pyvis_citation_graph and pyvis_ontology.
NODE_COLORS = {
    "seed_paper":"#111827","citing_paper":"#dbeafe","citation_event":"#fde68a",
    "journal":"#ede9fe","author":"#fee2e2","affiliation":"#fae8ff",
    "city":"#cffafe","country":"#ffedd5","field":"#e0e7ff","intent":"#dcfce7",
}
# Node colors keyed by node type, used by the plotly figures and pyvis_from_kg.
NODE_TYPE_COLORS = {
    "seed_paper":"#111827","citing_paper":"#3b82f6","citation_event":"#f59e0b",
    "journal":"#8b5cf6","author":"#ef4444","affiliation":"#ec4899",
    "city":"#06b6d4","country":"#f97316","field":"#6366f1","intent":"#22c55e",
}

# Default local parquet directory; overridable via CITATIONHUB_DATA_DIR.
DEFAULT_DATA_DIR = Path(os.environ.get(
    "CITATIONHUB_DATA_DIR",
    "/tmp/citationhub_data",
))

def fmt_num(x):
    """Format ``x`` as an integer with thousands separators.

    Returns ``"-"`` when ``x`` cannot be converted with ``int()`` (for example
    ``None``, a non-numeric string, NaN or infinity).
    """
    try:
        return f"{int(x):,}"
    # Only conversion failures are expected here; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return "-"

def _hf_download(filename: str) -> str:
    """Download ``data/<filename>`` from the configured Hugging Face dataset repo.

    Returns whatever ``hf_hub_download`` returns for that file. The token is
    only passed when ``HF_TOKEN`` is non-empty.
    """
    from huggingface_hub import hf_hub_download

    auth_token = HF_TOKEN if HF_TOKEN else None
    remote_name = f"data/{filename}"
    return hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=remote_name,
        token=auth_token,
    )

def _read(filename: str, data_dir: Path | None = None, columns: list | None = None) -> pd.DataFrame:
    """Read one parquet file into a DataFrame.

    The file comes from the Hugging Face repo when ``HF_REPO_ID`` is set,
    otherwise from ``data_dir / filename``. ``columns`` is forwarded to
    ``pd.read_parquet`` unchanged.
    """
    if HF_REPO_ID:
        parquet_path = _hf_download(filename)
    else:
        parquet_path = str(data_dir / filename)
    return pd.read_parquet(parquet_path, columns=columns, engine="pyarrow")

def _safe_cols(path: str, wanted: list) -> list:
    """Return the entries of ``wanted`` that exist in the parquet schema at ``path``.

    The order of ``wanted`` is preserved; only the schema is read, not the data.
    """
    import pyarrow.parquet as pq

    schema_names = frozenset(pq.read_schema(path).names)
    present = []
    for column in wanted:
        if column in schema_names:
            present.append(column)
    return present

def plotly_network_fig(
    nodes_df: pd.DataFrame,
    edges_df: pd.DataFrame,
    title: str = "",
    height: int = 750,
    seed_node_ids: list | None = None,
) -> go.Figure:
    """Build a Plotly scatter figure of a node/edge table using a spring layout.

    Args:
        nodes_df: One row per node; reads ``node_id`` and ``node_type`` and,
            when present, ``label``, ``doi``, ``publication_name``, ``group``.
        edges_df: One row per edge; reads ``source``, ``target`` and the
            optional ``edge_type``. Edges whose endpoints are not both in
            ``nodes_df`` are ignored.
        title: Figure title; an empty title shrinks the top margin.
        height: Figure height in pixels.
        seed_node_ids: Accepted for interface compatibility; not used.

    Returns:
        A ``go.Figure`` with one line trace for the edges followed by one
        marker trace per node type. An empty figure is returned when there are
        no nodes.
    """
    # Same fallback color pyvis_from_kg uses for node types without an entry
    # in NODE_TYPE_COLORS.
    fallback_color = "#94a3b8"

    def _text(value, default: str = "") -> str:
        """Stringify ``value``; None, NaN and empty strings become ``default``."""
        if value is None:
            return default
        try:
            if pd.isna(value):
                return default
        except (TypeError, ValueError):
            # Non-scalar values (lists, arrays) have no single NaN answer.
            pass
        rendered = str(value)
        return rendered if rendered else default

    G = nx.Graph()
    for _, row in nodes_df.iterrows():
        G.add_node(str(row["node_id"]))

    for _, row in edges_df.iterrows():
        s, t = str(row["source"]), str(row["target"])
        if G.has_node(s) and G.has_node(t):
            G.add_edge(s, t, edge_type=row.get("edge_type", ""))

    if G.number_of_nodes() == 0:
        return go.Figure()

    k = max(1.5, 3.0 / (G.number_of_nodes() ** 0.4))
    pos = nx.spring_layout(G, seed=42, k=k, iterations=60)

    # Edge segments are concatenated into one trace, separated by None.
    ex, ey = [], []
    for src, tgt in G.edges():
        x0, y0 = pos.get(src, (0, 0))
        x1, y1 = pos.get(tgt, (0, 0))
        ex += [x0, x1, None]
        ey += [y0, y1, None]

    traces: list[go.BaseTraceType] = [
        go.Scatter(
            x=ex, y=ey, mode="lines",
            line=dict(width=0.8, color="#cbd5e1"),
            hoverinfo="none", showlegend=False,
        )
    ]

    # Known types keep their configured order and color; any other type found
    # in the data is appended with the fallback color so its nodes are still
    # drawn (previously they were silently omitted while their edges remained).
    type_colors = list(NODE_TYPE_COLORS.items())
    for extra_type in nodes_df["node_type"].dropna().unique():
        if extra_type not in NODE_TYPE_COLORS:
            type_colors.append((extra_type, fallback_color))

    for ntype, color in type_colors:
        subset = nodes_df[nodes_df["node_type"] == ntype]
        if subset.empty:
            continue
        is_seed = ntype == "seed_paper"
        xs, ys, hovers, texts = [], [], [], []
        for _, row in subset.iterrows():
            nid = str(row["node_id"])
            if nid not in pos:
                continue
            x, y = pos[nid]
            xs.append(x)
            ys.append(y)
            label = _text(row.get("label", ""))[:50]
            # Only seed papers get a permanent text label.
            texts.append(label if is_seed else "")
            hovers.append(
                f"<b>{label}</b><br>"
                f"Type: {ntype}<br>"
                f"DOI: {_text(row.get('doi', ''), '-')}<br>"
                f"Pub: {_text(row.get('publication_name', ''), '-')}<br>"
                f"Group: {_text(row.get('group', ''), '-')}"
            )

        traces.append(go.Scatter(
            x=xs, y=ys,
            mode="markers+text" if is_seed else "markers",
            text=texts, textposition="top center",
            hovertext=hovers, hoverinfo="text",
            name=str(ntype),
            marker=dict(
                size=20 if is_seed else 10,
                color=color,
                line=dict(width=1.5 if is_seed else 0.5, color="white"),
                symbol="circle",
            ),
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        title=dict(text=title, font=dict(size=14)),
        showlegend=True,
        legend=dict(title="Node type", itemsizing="constant"),
        hovermode="closest",
        height=height,
        margin=dict(l=0, r=0, t=40 if title else 10, b=0),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    )
    return fig

def plotly_ontology_fig(height: int = 820) -> go.Figure:
    """Draw the fixed CitationHub ontology (ten node classes, nine relations).

    Node positions come from a seeded spring layout. Each relation gets an
    italic label at the midpoint of its edge, and each node gets a small
    annotation listing its properties beneath the marker.
    """
    property_text = {
        "seed_paper":     "doi · title · journal\nauthor · affiliation\ncountry · field · citedby_count",
        "citation_event": "event_id · citing_year\nprimary_intent · context\nis_influential",
        "citing_paper":   "doi · title\nyear · venue · oa_pdf",
        "intent":         "background · uses\nsimilarities · motivation\ndifferences · future_work · extends",
        "journal":        "journal_name",
        "author":         "author_name · author_id",
        "affiliation":    "affiliation_name",
        "city":           "city_name",
        "country":        "country_name",
        "field":          "field_name",
    }

    # (node id, display label, node type)
    ontology_nodes = [
        ("seed",        "Top5PctCitedPaper", "seed_paper"),
        ("event",       "CitationEvent",     "citation_event"),
        ("citing",      "CitingPaper",       "citing_paper"),
        ("intent",      "Intent",            "intent"),
        ("journal",     "Journal",           "journal"),
        ("author",      "Author",            "author"),
        ("affiliation", "Affiliation",       "affiliation"),
        ("city",        "City",              "city"),
        ("country",     "Country",           "country"),
        ("field",       "Field",             "field"),
    ]
    # (source id, target id, relation label)
    ontology_edges = [
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    ]

    graph = nx.DiGraph()
    graph.add_nodes_from(node_id for node_id, _, _ in ontology_nodes)
    graph.add_edges_from((src, dst) for src, dst, _ in ontology_edges)

    layout = nx.spring_layout(graph, seed=7, k=2.5, iterations=80)

    line_x, line_y = [], []
    annotations = []
    for src, dst, relation in ontology_edges:
        sx, sy = layout[src]
        tx, ty = layout[dst]
        line_x.extend([sx, tx, None])
        line_y.extend([sy, ty, None])
        # Relation label sits at the midpoint of the edge.
        annotations.append(dict(
            x=(sx + tx) / 2,
            y=(sy + ty) / 2,
            text=f"<i>{relation}</i>",
            showarrow=False,
            font=dict(size=9, color="#64748b"),
            bgcolor="rgba(255,255,255,0.75)",
        ))

    edge_trace = go.Scatter(
        x=line_x,
        y=line_y,
        mode="lines",
        line=dict(width=1.2, color="#94a3b8"),
        hoverinfo="none",
        showlegend=False,
    )
    traces: list[go.BaseTraceType] = [edge_trace]

    for node_id, display, node_type in ontology_nodes:
        nx_pos, ny_pos = layout[node_id]
        props_html = property_text.get(node_type, "").replace("\n", "<br>")

        traces.append(go.Scatter(
            x=[nx_pos],
            y=[ny_pos],
            mode="markers+text",
            text=[f"<b>{display}</b>"],
            textposition="top center",
            hoverinfo="text",
            hovertext=f"<b>{display}</b><br>Type: {node_type}<br>" + props_html,
            name=display,
            showlegend=False,
            marker=dict(
                size=24,
                color=NODE_TYPE_COLORS.get(node_type, "#94a3b8"),
                line=dict(width=1.5, color="white"),
            ),
            textfont=dict(size=11, color="#1e293b"),
        ))

        if not props_html:
            continue
        # Property list is anchored just below the marker.
        annotations.append(dict(
            x=nx_pos,
            y=ny_pos,
            text=f"<span style='font-size:8px;color:#64748b'>{props_html}</span>",
            showarrow=False,
            xanchor="center",
            yanchor="top",
            yshift=-22,
            font=dict(size=8, color="#64748b"),
            bgcolor="rgba(248,250,252,0.85)",
            borderpad=2,
        ))

    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig = go.Figure(data=traces)
    fig.update_layout(
        showlegend=False,
        hovermode="closest",
        height=height,
        annotations=annotations,
        margin=dict(l=10, r=10, t=20, b=10),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
    )
    return fig

def inject_fullscreen(html: str) -> str:
    """Insert the fullscreen button, usage hint and DPI script before ``</body>``.

    The snippet is placed in front of every ``</body>`` occurrence; a document
    without that tag is returned unchanged.
    """
    closing_tag = "</body>"
    overlay = """

    <button onclick="var el=document.getElementById('mynetwork');

      if(el){if(el.requestFullscreen)el.requestFullscreen();

      else if(el.webkitRequestFullscreen)el.webkitRequestFullscreen();}"

      style="position:fixed;bottom:18px;right:18px;z-index:9999;

             padding:8px 18px;background:#1e293b;color:white;border:none;

             border-radius:8px;cursor:pointer;font-size:13px;

             box-shadow:0 2px 8px rgba(0,0,0,0.35);">⛶ Fullscreen</button>

    <div style="position:fixed;bottom:18px;left:18px;z-index:9999;font-size:12px;

                color:#64748b;background:rgba(255,255,255,0.85);

                padding:5px 10px;border-radius:6px;">

      🖱 Scroll: zoom &nbsp;|&nbsp; Drag: pan &nbsp;|&nbsp; Click node: info</div>

    <script>



    (function fixDPI() {

      var canvas = document.querySelector('#mynetwork canvas');

      if (!canvas) { setTimeout(fixDPI, 200); return; }

      var dpr = window.devicePixelRatio || 1;

      if (dpr <= 1) return;

      try {

        if (typeof network !== 'undefined') {

          network.canvas.pixelRatio = dpr;

          network.redraw();

        }

      } catch(e) {}

    })();

    </script>

    """
    patched = html.replace(closing_tag, overlay + closing_tag)
    return patched

# Columns requested from the seed-paper parquet file; load_data only reads the
# ones that actually exist in the file's schema.
_SEED_COLS = [
    "seed_paper_id","doi","title","publication_name","creator","affilname",
    "affiliation_city","affiliation_country","group","cover_date","citedby_count",
    "author_id","affiliation_id","country_id","field_id","journal_id",
]
# Comma-separated, single-quoted intent list for SQL ``IN (...)`` clauses.
# Derived from ALLOWED_INTENTS so the two can never drift apart.
_INTENTS_SQL = "'" + "','".join(ALLOWED_INTENTS) + "'"

@st.cache_data(show_spinner=False)
def load_data(data_dir_str: str):
    """Load the seed-paper table and summary statistics of the citation events.

    Args:
        data_dir_str: Local parquet directory. Ignored when ``HF_REPO_ID`` is
            set, in which case the files are downloaded from Hugging Face.

    Returns:
        ``(seed, events_path, filters, overview)`` where ``seed`` is the
        normalized seed-paper DataFrame sorted by citation count then title,
        ``events_path`` is the path of the citation-events parquet file,
        ``filters`` holds the option lists and year range for the UI, and
        ``overview`` holds headline counts.
    """
    import duckdb

    if HF_REPO_ID:
        seed_path = _hf_download("seed_cited_papers_normalized.parquet")
        events_path = _hf_download("citation_events_normalized.parquet")
    else:
        data_dir = Path(data_dir_str)
        seed_path = str(data_dir / "seed_cited_papers_normalized.parquet")
        events_path = str(data_dir / "citation_events_normalized.parquet")

    seed_df = pd.read_parquet(
        seed_path, columns=_safe_cols(seed_path, _SEED_COLS), engine="pyarrow"
    )

    def text_col(name: str) -> pd.Series:
        """Return column ``name`` with NaN replaced by "", or all "" if absent."""
        if name in seed_df.columns:
            return seed_df[name].fillna("")
        # The default must share seed_df's index: an empty Series would align
        # to NaN for every row and surface later as the string "nan".
        return pd.Series("", index=seed_df.index, dtype=object)

    if "citedby_count" in seed_df.columns:
        cited_by = (
            pd.to_numeric(seed_df["citedby_count"], errors="coerce").fillna(0).astype(int)
        )
    else:
        cited_by = pd.Series(0, index=seed_df.index, dtype=int)

    seed = pd.DataFrame({
        "seed_paper_id":  seed_df["seed_paper_id"],
        "doi":            text_col("doi"),
        "title":          text_col("title"),
        "journal":        text_col("publication_name"),
        "author":         text_col("creator"),
        "affiliation":    text_col("affilname"),
        "city":           text_col("affiliation_city"),
        "country":        text_col("affiliation_country"),
        "field":          text_col("group"),
        "cover_date":     text_col("cover_date"),
        "citedby_count":  cited_by,
        "author_id":      seed_df.get("author_id", pd.Series(dtype=object)),
        "affiliation_id": seed_df.get("affiliation_id", pd.Series(dtype=object)),
        "country_id":     seed_df.get("country_id", pd.Series(dtype=object)),
        "field_id":       seed_df.get("field_id", pd.Series(dtype=object)),
        "journal_id":     seed_df.get("journal_id", pd.Series(dtype=object)),
    })
    # Lower-cased copies for case-insensitive searching.
    for col in ["title", "doi", "journal", "field", "country"]:
        seed[f"{col}_lc"] = seed[col].astype(str).str.lower()
    seed = seed.sort_values(
        ["citedby_count", "title"], ascending=[False, True]
    ).reset_index(drop=True)

    # Forward slashes for the SQL path literal; single quotes doubled so a
    # path such as /home/o'brien/data does not terminate the literal.
    ep = events_path.replace("\\", "/").replace("'", "''")
    # TRY_CAST matches how the other queries in this file read citing_year.
    stats = duckdb.execute(f"""
        SELECT MIN(TRY_CAST(citing_year AS INTEGER)) AS yr_min,
               MAX(TRY_CAST(citing_year AS INTEGER)) AS yr_max,
               COUNT(*) AS total, COUNT(DISTINCT citing_paper_id) AS n_citing
        FROM read_parquet('{ep}')
        WHERE primary_intent IN ({_INTENTS_SQL})
    """).df().iloc[0]

    def distinct_values(column: str) -> list:
        """Sorted non-empty distinct strings of a seed column."""
        return sorted(x for x in seed[column].dropna().astype(str).unique() if x)

    def distinct_count(column: str) -> int:
        """Number of distinct non-empty values of a seed column."""
        return int(seed[column].replace("", pd.NA).dropna().nunique())

    filters = {
        "fields":    distinct_values("field"),
        "countries": distinct_values("country"),
        "journals":  distinct_values("journal"),
        "intents":   ALLOWED_INTENTS,
        "year_min":  int(stats["yr_min"]) if pd.notna(stats["yr_min"]) else 2000,
        "year_max":  int(stats["yr_max"]) if pd.notna(stats["yr_max"]) else 2025,
    }
    overview = {
        "seed_papers":     int(len(seed)),
        "citation_events": int(stats["total"]),
        "citing_papers":   int(stats["n_citing"]),
        "authors":         distinct_count("author"),
        "journals":        distinct_count("journal"),
        "countries":       distinct_count("country"),
        "fields":          distinct_count("field"),
        "intents":         len(ALLOWED_INTENTS),
    }
    return seed, events_path, filters, overview

@st.cache_data(show_spinner=False)
def load_events_for_paper(events_path: str, seed_paper_id: str, year_min: int, year_max: int) -> pd.DataFrame:
    """Return the citation events that cite one seed paper within a year range.

    Only events whose ``primary_intent`` is one of the allowed intents are
    returned, ordered by ``context_count`` descending with NULLs last.

    Args:
        events_path: Path of the citation-events parquet file.
        seed_paper_id: Value matched against ``cited_seed_paper_id``.
        year_min: Inclusive lower bound on ``citing_year``.
        year_max: Inclusive upper bound on ``citing_year``.
    """
    import duckdb

    # Values are embedded as SQL literals: double single quotes in strings and
    # coerce the year bounds to int so nothing else can reach the query text.
    ep = events_path.replace("\\", "/").replace("'", "''")
    sid = seed_paper_id.replace("'", "''")
    lo, hi = int(year_min), int(year_max)
    return duckdb.execute(f"""
        SELECT citation_event_id,
               cited_seed_paper_id AS seed_paper_id,
               citing_paper_id, citing_title, citing_doi,
               TRY_CAST(citing_year AS INTEGER) AS citing_year,
               citing_venue, primary_intent, contexts,
               TRY_CAST(context_count AS INTEGER) AS context_count,
               TRY_CAST(intent_count  AS INTEGER) AS intent_count,
               is_influential
        FROM read_parquet('{ep}')
        WHERE cited_seed_paper_id = '{sid}'
          AND primary_intent IN ({_INTENTS_SQL})
          AND TRY_CAST(citing_year AS INTEGER) BETWEEN {lo} AND {hi}
        ORDER BY context_count DESC NULLS LAST
    """).df()

@st.cache_data(show_spinner=False)
def load_global_intent_stats(events_path: str) -> pd.DataFrame:
    """Count citation events per allowed intent across the whole events file.

    Returns a DataFrame with columns ``intent`` and ``count``; intents with no
    events do not appear.
    """
    import duckdb

    # Double single quotes so a quote in the path cannot end the SQL literal.
    ep = events_path.replace("\\", "/").replace("'", "''")
    return duckdb.execute(f"""
        SELECT primary_intent AS intent, COUNT(*) AS count
        FROM read_parquet('{ep}')
        WHERE primary_intent IN ({_INTENTS_SQL})
        GROUP BY primary_intent
    """).df()

@st.cache_data(show_spinner=False)
def load_cocited_papers(events_path: str, selected_seed_id: str, top_n: int = 15) -> pd.DataFrame:
    """Find the seed papers most often cited alongside the selected one.

    A co-citation is an event from any paper that also cites
    ``selected_seed_id`` but targets a different seed paper.

    Args:
        events_path: Path of the citation-events parquet file.
        selected_seed_id: Seed paper whose co-cited papers are wanted.
        top_n: Maximum number of rows returned.

    Returns:
        DataFrame with ``seed_paper_id`` and ``co_citation_count``, highest
        count first.
    """
    import duckdb

    # Strings are embedded as SQL literals (single quotes doubled); the limit
    # is coerced to int before it is placed into the query text.
    ep = events_path.replace("\\", "/").replace("'", "''")
    sid = selected_seed_id.replace("'", "''")
    limit = int(top_n)
    return duckdb.execute(f"""
        WITH citing_ids AS (
            SELECT DISTINCT citing_paper_id
            FROM read_parquet('{ep}')
            WHERE cited_seed_paper_id = '{sid}'
        )
        SELECT cited_seed_paper_id AS seed_paper_id, COUNT(*) AS co_citation_count
        FROM read_parquet('{ep}')
        WHERE citing_paper_id IN (SELECT citing_paper_id FROM citing_ids)
          AND cited_seed_paper_id != '{sid}'
        GROUP BY cited_seed_paper_id
        ORDER BY co_citation_count DESC
        LIMIT {limit}
    """).df()

@st.cache_data(show_spinner=False)
def load_analytics_data(events_path: str) -> dict:
    """Run the three aggregate queries behind the analytics views.

    Returns:
        Dict of DataFrames with keys ``intent_trend`` (events per year and
        intent, years from 2000 on), ``venues`` (the 20 citing venues with the
        most events) and ``influential`` (event counts per ``is_influential``
        value). All three only consider events with an allowed intent.
    """
    import duckdb

    # Double single quotes so a quote in the path cannot end the SQL literal.
    ep = events_path.replace("\\", "/").replace("'", "''")

    intent_trend = duckdb.execute(f"""
        SELECT TRY_CAST(citing_year AS INTEGER) AS year,
               primary_intent, COUNT(*) AS count
        FROM read_parquet('{ep}')
        WHERE primary_intent IN ({_INTENTS_SQL})
          AND TRY_CAST(citing_year AS INTEGER) >= 2000
        GROUP BY year, primary_intent
        ORDER BY year
    """).df()

    venues = duckdb.execute(f"""
        SELECT citing_venue, COUNT(*) AS count
        FROM read_parquet('{ep}')
        WHERE primary_intent IN ({_INTENTS_SQL})
          AND citing_venue IS NOT NULL AND citing_venue != ''
        GROUP BY citing_venue
        ORDER BY count DESC
        LIMIT 20
    """).df()

    influential = duckdb.execute(f"""
        SELECT is_influential, COUNT(*) AS count
        FROM read_parquet('{ep}')
        WHERE primary_intent IN ({_INTENTS_SQL})
        GROUP BY is_influential
    """).df()

    return {"intent_trend": intent_trend, "venues": venues, "influential": influential}

@st.cache_data(show_spinner=False)
def load_authors_data(data_dir_str: str) -> pd.DataFrame:
    """Read the ``author_id`` and ``author_name`` columns of ``authors.parquet``.

    ``data_dir_str`` is only used when no Hugging Face repo is configured.
    """
    local_dir = Path(data_dir_str) if not HF_REPO_ID else None
    wanted = ["author_id", "author_name"]
    return _read("authors.parquet", local_dir, columns=wanted)

@st.cache_data(show_spinner=False)
def load_geo_data(data_dir_str: str) -> pd.DataFrame:
    """Read the name, city and country columns of ``affiliation_geo.parquet``.

    ``data_dir_str`` is only used when no Hugging Face repo is configured.
    """
    local_dir = Path(data_dir_str) if not HF_REPO_ID else None
    wanted = ["affiliation_name", "city_name", "country_name"]
    return _read("affiliation_geo.parquet", local_dir, columns=wanted)

# Columns requested from kg_nodes.parquet; load_kg_nodes drops any that the
# file's schema does not contain.
_KG_NODE_COLS = ["node_id","node_type","label","doi","publication_name","citedby_count"]

@st.cache_data(show_spinner=False)
def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
    """Load the knowledge-graph node table, restricted to the known columns.

    The file is fetched from Hugging Face when a repo is configured, otherwise
    read from ``data_dir_str``.
    """
    if HF_REPO_ID:
        nodes_path = _hf_download("kg_nodes.parquet")
    else:
        nodes_path = str(Path(data_dir_str) / "kg_nodes.parquet")
    available = _safe_cols(nodes_path, _KG_NODE_COLS)
    return pd.read_parquet(nodes_path, columns=available, engine="pyarrow")

@st.cache_data(show_spinner=False)
def get_parquet_path(filename: str, data_dir_str: str) -> str:
    """Resolve ``filename`` to a path string usable in a query.

    With a Hugging Face repo configured the file is downloaded and that path
    is returned; otherwise the local path is returned with backslashes turned
    into forward slashes.
    """
    if not HF_REPO_ID:
        local_path = Path(data_dir_str) / filename
        return str(local_path).replace("\\", "/")
    return _hf_download(filename)

@st.cache_data(show_spinner=False)
def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
    """Fetch up to ``max_edges`` knowledge-graph edges that touch ``node_id``.

    Args:
        node_id: Node matched against both ``source`` and ``target``.
        kg_edges_path: Path of the edges parquet file.
        max_edges: Row limit applied in the query.

    Returns:
        DataFrame with columns ``source``, ``target`` and ``edge_type``.
    """
    import duckdb

    # Both strings are embedded as SQL literals, so single quotes are doubled.
    safe_path = kg_edges_path.replace("\\", "/").replace("'", "''")
    safe_node = node_id.replace("'", "''")
    q = f"""
    SELECT source, target, edge_type
    FROM read_parquet('{safe_path}')
    WHERE source = '{safe_node}' OR target = '{safe_node}'
    LIMIT {int(max_edges)}
    """
    return duckdb.execute(q).df()

@st.cache_data(show_spinner=False)
def query_enriched_stats(enriched_path: str):
    """Summarize semantic-evidence coverage in the enriched events file.

    Returns:
        ``(sem_df, field_df)``: ``sem_df`` counts rows per
        ``has_semantic_evidence`` value; ``field_df`` lists, for the 20
        ``field_folder`` values with the highest ratio, the mean of
        ``has_semantic_evidence`` (``sem_ratio``) and the row count
        (``event_count``).
    """
    import duckdb

    # Double single quotes so a quote in the path cannot end the SQL literal.
    safe_path = enriched_path.replace("\\", "/").replace("'", "''")

    sem_df = duckdb.execute(f"""
        SELECT has_semantic_evidence, COUNT(*) AS count
        FROM read_parquet('{safe_path}')
        GROUP BY has_semantic_evidence
    """).df()

    field_df = duckdb.execute(f"""
        SELECT field_folder AS field,
               AVG(CAST(has_semantic_evidence AS INTEGER)) AS sem_ratio,
               COUNT(*) AS event_count
        FROM read_parquet('{safe_path}')
        GROUP BY field_folder
        ORDER BY sem_ratio DESC
        LIMIT 20
    """).df()

    return sem_df, field_df

@st.cache_data(show_spinner=False)
def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
    """Fetch up to ``max_edges`` edges touching ``node_id`` for the explorer view.

    Args:
        node_id: Node matched against both ``source`` and ``target``.
        kg_edges_path: Path of the edges parquet file.
        max_edges: Row limit applied in the query.

    Returns:
        DataFrame with columns ``source``, ``target`` and ``edge_type``.
    """
    import duckdb

    # Both strings are embedded as SQL literals, so single quotes are doubled.
    safe_path = kg_edges_path.replace("\\", "/").replace("'", "''")
    safe_node = node_id.replace("'", "''")
    q = f"""
    SELECT source, target, edge_type
    FROM read_parquet('{safe_path}')
    WHERE source = '{safe_node}' OR target = '{safe_node}'
    LIMIT {int(max_edges)}
    """
    return duckdb.execute(q).df()

def filter_seed_papers(seed, q, fields, countries, journals):
    """Filter the seed-paper table by a search string and category selections.

    Args:
        seed: DataFrame with ``title_lc``, ``doi_lc``, ``field``, ``country``
            and ``journal`` columns.
        q: Free-text query matched case-insensitively as a literal substring
            of the title or DOI; empty or ``None`` disables the text filter.
        fields: Selected field names; empty or ``None`` disables the filter.
        countries: Selected country names; same convention.
        journals: Selected journal names; same convention.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``seed`` is not modified.
    """
    df = seed.copy()
    q = (q or "").strip().lower()
    if q:
        # regex=False: the query is user text, so characters such as "+" or
        # "(" must match literally instead of being parsed as a pattern.
        title_hit = df["title_lc"].str.contains(q, na=False, regex=False)
        doi_hit = df["doi_lc"].str.contains(q, na=False, regex=False)
        df = df[title_hit | doi_hit]
    for column, selected in (("field", fields), ("country", countries), ("journal", journals)):
        if selected:
            wanted = {value.lower() for value in selected}
            df = df[df[column].str.lower().isin(wanted)]
    return df.reset_index(drop=True)

def event_subset(events, seed_paper_id, year_min, year_max):
    """Select one seed paper's events whose citing year lies in a closed range.

    Missing years are replaced by -99999 for the lower-bound test and by 99999
    for the upper-bound test. The result has a fresh 0..n-1 index.
    """
    selected = events[events["seed_paper_id"] == seed_paper_id]
    years = selected["citing_year"]
    not_too_early = years.fillna(-99999) >= year_min
    not_too_late = years.fillna(99999) <= year_max
    return selected[not_too_early & not_too_late].reset_index(drop=True)

def build_intent_summary(df):
    """Count events per allowed intent.

    Returns a DataFrame with one row per entry of ``ALLOWED_INTENTS`` (in that
    order) and its ``count``; intents absent from ``df`` get 0.
    """
    per_intent = df.groupby("primary_intent").size().to_dict()
    tallies = []
    for intent in ALLOWED_INTENTS:
        tallies.append(int(per_intent.get(intent, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})

def build_context_rows(df, limit=20):
    """Flatten citation contexts into one row per context snippet.

    Events are visited by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last). At most the first
    two entries of each event's ``contexts`` list are taken, and collection
    stops once ``limit`` rows have been gathered.
    """
    ordered = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    collected = []
    for _, event in ordered.iterrows():
        snippets = event["contexts"]
        if isinstance(snippets, list) and snippets:
            raw_year = event["citing_year"]
            year = None if pd.isna(raw_year) else int(raw_year)
            collected.extend(
                {
                    "primary_intent": event["primary_intent"],
                    "citing_title": event["citing_title"],
                    "citing_doi": event["citing_doi"],
                    "citing_year": year,
                    "context": snippet,
                }
                for snippet in snippets[:2]
            )
        if len(collected) >= limit:
            break
    return pd.DataFrame(collected[:limit])

def build_citing_table(df, limit=30):
    """Build the table of distinct citing papers, strongest evidence first.

    Events are ordered by ``context_count``, ``intent_count`` and
    ``citing_year`` (all descending, missing values last); the first event of
    each ``citing_paper_id`` is kept and at most ``limit`` rows are returned.

    The empty result carries the same six columns as the non-empty one, so
    callers can select columns without checking for emptiness first.
    """
    columns = [
        "citing_paper_id", "citing_title", "citing_doi",
        "citing_year", "primary_intent", "context_count",
    ]
    if df.empty:
        return pd.DataFrame(columns=columns)
    ordered = df.sort_values(
        ["context_count", "intent_count", "citing_year"],
        ascending=[False, False, False],
        na_position="last",
    )
    return ordered[columns].drop_duplicates(subset=["citing_paper_id"]).head(limit)

def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
    """Rank other seed papers by how often they share a citing paper.

    Takes the papers that cite ``selected_seed_id``, counts their events that
    target other seed papers, keeps the ``top_n`` largest counts and attaches
    title, field, journal and citation count from ``seed`` via a left merge.
    """
    targets_selected = events["seed_paper_id"] == selected_seed_id
    citing_ids = events[targets_selected]["citing_paper_id"].unique()

    shares_citer = events["citing_paper_id"].isin(citing_ids)
    targets_other = events["seed_paper_id"] != selected_seed_id
    other_events = events[shares_citer & targets_other]

    counts = other_events.groupby("seed_paper_id").size().reset_index(name="co_citation_count")
    top = counts.sort_values("co_citation_count", ascending=False).head(top_n)

    seed_info = seed[["seed_paper_id", "title", "field", "journal", "citedby_count"]]
    return top.merge(seed_info, on="seed_paper_id", how="left")

def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
    """Extract the neighbourhood of the node ``seed:<seed_doi>``.

    Returns ``(nodes, edges)``: the first ``max_edges`` edges that have the
    seed node as source or target, and the rows of ``kg_nodes`` for every
    endpoint of those edges. Returns ``(None, None)`` when no edge matches.
    """
    seed_node = f"seed:{seed_doi}"
    touches_seed = kg_edges["source"].eq(seed_node) | kg_edges["target"].eq(seed_node)
    picked = kg_edges[touches_seed].head(max_edges)
    if picked.empty:
        return None, None
    endpoint_ids = set(picked["source"].tolist()).union(picked["target"].tolist())
    neighbours = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return neighbours, picked

def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
    """Extract the neighbourhood of an arbitrary knowledge-graph node.

    Returns ``(nodes, edges)``: the first ``max_edges`` edges that have
    ``search_node_id`` as source or target, and the rows of ``kg_nodes`` for
    every endpoint of those edges. Returns ``(None, None)`` when no edge
    matches.
    """
    is_source = kg_edges["source"].eq(search_node_id)
    is_target = kg_edges["target"].eq(search_node_id)
    picked = kg_edges[is_source | is_target].head(max_edges)
    if picked.empty:
        return None, None
    endpoint_ids = set(picked["source"].tolist()).union(picked["target"].tolist())
    neighbours = kg_nodes[kg_nodes["node_id"].isin(endpoint_ids)]
    return neighbours, picked

def pyvis_citation_graph(seed_row, events_df):
    """Render a seed paper and its top citing papers as a pyvis HTML page.

    The 40 events with the highest ``context_count`` / ``intent_count`` are
    drawn. Each citing paper points at the seed paper with an edge labelled
    and colored by intent; the edge tooltip shows intent, year and the first
    context snippet. The generated HTML gets the fullscreen controls injected.
    """
    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)

    seed_id = seed_row["seed_paper_id"]
    net.add_node(
        seed_id,
        label=seed_row["title"][:60],
        color="#111827",
        size=34,
        shape="dot",
        font={"color":"white"},
    )

    ranked = events_df.sort_values(["context_count","intent_count"], ascending=False)
    for _, event in ranked.head(40).iterrows():
        citing_id = event["citing_paper_id"]
        # Fall back from title to DOI to the raw id for the node caption.
        caption = event["citing_title"] or event["citing_doi"] or citing_id
        net.add_node(citing_id, label=caption[:60],
                     color=NODE_COLORS["citing_paper"], size=18, shape="dot")

        contexts = event["contexts"]
        first_context = contexts[0] if isinstance(contexts, list) and contexts else ""
        year = event["citing_year"]
        year_text = "" if pd.isna(year) else int(year)
        intent = event["primary_intent"]
        net.add_edge(
            citing_id,
            seed_id,
            label=intent,
            color=INTENT_COLORS.get(intent,"#94a3b8"),
            title=f"Intent: {intent}<br>Year: {year_text}<br>{first_context}",
        )

    net.barnes_hut()
    return inject_fullscreen(net.generate_html())

def pyvis_ontology():
    """Render the fixed ontology diagram as a pyvis HTML page.

    Ten class nodes and nine labelled relations are added to a directed
    network; the generated HTML gets the fullscreen controls injected.
    """
    # (node id, display label, key into NODE_COLORS)
    classes = (
        ("seed", "Top5PctCitedPaper", "seed_paper"),
        ("event", "CitationEvent", "citation_event"),
        ("citing", "CitingPaper", "citing_paper"),
        ("intent", "Intent", "intent"),
        ("journal", "Journal", "journal"),
        ("author", "Author", "author"),
        ("affiliation", "Affiliation", "affiliation"),
        ("city", "City", "city"),
        ("country", "Country", "country"),
        ("field", "Field", "field"),
    )
    # (source id, target id, relation label)
    relations = (
        ("event", "citing", "hasCitingPaper"),
        ("event", "seed", "hasCitedPaper"),
        ("event", "intent", "hasPrimaryIntent"),
        ("seed", "journal", "publishedInJournal"),
        ("seed", "author", "hasAuthor"),
        ("seed", "affiliation", "hasAffiliation"),
        ("seed", "city", "locatedInCity"),
        ("seed", "country", "locatedInCountry"),
        ("seed", "field", "belongsToField"),
    )

    net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    for node_id, caption, color_key in classes:
        net.add_node(node_id, label=caption, color=NODE_COLORS[color_key], size=24)
    for source, target, relation in relations:
        net.add_edge(source, target, label=relation)
    net.barnes_hut()

    page = net.generate_html()
    return inject_fullscreen(page)

def pyvis_from_kg(nodes_df, edges_df, height="780px"):
    """Build an interactive PyVis graph from knowledge-graph node/edge frames.

    Args:
        nodes_df: DataFrame with a ``node_id`` column; ``node_type``,
            ``label``, ``doi`` and ``publication_name`` are read when present.
        edges_df: DataFrame with ``source`` and ``target`` columns;
            ``edge_type`` is used as the edge label when present.
        height: CSS height handed to the PyVis ``Network``.

    Returns:
        The generated HTML after it has been passed through
        ``inject_fullscreen``.
    """
    fallback_color = "#94a3b8"

    def _text(value):
        # Missing cells come through as None or float NaN; show them as
        # empty text instead of the literal strings "None"/"nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
    known_ids = set()
    for _, row in nodes_df.iterrows():
        ntype = row.get("node_type", "")
        is_seed = ntype == "seed_paper"
        tooltip = (f"Type: {_text(ntype)}<br>DOI: {_text(row.get('doi', ''))}"
                   f"<br>Pub: {_text(row.get('publication_name', ''))}")
        node_id = str(row["node_id"])
        net.add_node(node_id,
                     label=_text(row.get("label", ""))[:55],
                     color=NODE_TYPE_COLORS.get(ntype, fallback_color),
                     size=30 if is_seed else 16,
                     shape="dot", title=tooltip,
                     font={"color": "white"} if is_seed else {})
        known_ids.add(node_id)

    for _, row in edges_df.iterrows():
        source, target = str(row["source"]), str(row["target"])
        # PyVis asserts that both endpoints already exist; add a neutral
        # placeholder for ids absent from nodes_df so one dangling edge
        # does not abort rendering of the whole graph.
        for endpoint in (source, target):
            if endpoint not in known_ids:
                net.add_node(endpoint, label=endpoint[:55], color=fallback_color,
                             size=16, shape="dot", title="Type: unknown", font={})
                known_ids.add(endpoint)
        net.add_edge(source, target,
                     label=_text(row.get("edge_type", "")), color=fallback_color)
    net.barnes_hut()
    return inject_fullscreen(net.generate_html())

st.title("CitationHub")
st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")

# Created before the sidebar block so the loading notice can be shown and
# then cleared from inside it.
_loading_placeholder = st.empty()

with st.sidebar:
    # --- Data source -------------------------------------------------------
    st.subheader("Data source")
    if HF_REPO_ID:
        data_dir_val = "hf"
        st.caption(f"Hugging Face: {HF_REPO_ID}")
    else:
        data_dir_val = st.text_input("Parquet directory", str(DEFAULT_DATA_DIR))

    try:
        _loading_placeholder.info("⏳ Loading CitationHub data… this may take a moment on first visit.")
        seed, events_path, filters, overview = load_data(data_dir_val)
        _loading_placeholder.empty()
        st.success("Data loaded")
    except Exception as e:
        # Nothing below can run without data: report the error and halt.
        _loading_placeholder.empty()
        st.error(str(e))
        st.stop()

    # --- Search and filters ------------------------------------------------
    st.subheader("Search seed papers")
    q_input = st.text_input("Title or DOI")
    if "q_submit" not in st.session_state:
        st.session_state["q_submit"] = ""
    # The query is only applied when the button is pressed, not on each keystroke.
    if st.button("Search", use_container_width=True):
        st.session_state["q_submit"] = q_input

    fields_sel    = st.multiselect("Field", filters["fields"])
    countries_sel = st.multiselect("Country", filters["countries"])
    journals_sel  = st.multiselect("Journal", filters["journals"][:200])
    y_min = max(2000, filters["year_min"])
    year_min, year_max = st.slider("Citing year", y_min, filters["year_max"], (y_min, filters["year_max"]))

    seed_filtered = filter_seed_papers(seed, st.session_state["q_submit"],
                                       fields_sel, countries_sel, journals_sel)

    # --- Overview counts ---------------------------------------------------
    st.subheader("Overview counts")
    c1, c2 = st.columns(2)
    c1.metric("Seed papers",     fmt_num(overview["seed_papers"]))
    c2.metric("Citation events", fmt_num(overview["citation_events"]))
    c1.metric("Citing papers",   fmt_num(overview["citing_papers"]))
    c2.metric("Authors",         fmt_num(overview["authors"]))
    c1.metric("Countries",       fmt_num(overview["countries"]))
    c2.metric("Fields",          fmt_num(overview["fields"]))

    # --- Seed paper selection ----------------------------------------------
    options = seed_filtered["seed_paper_id"].tolist()
    if not options:
        st.warning("No seed papers match the current search.")
        st.stop()

    # Build the id -> title lookup once. The selectbox calls format_func for
    # every option, so a per-call DataFrame scan is quadratic in the number
    # of seed papers. setdefault keeps the first title for a repeated id,
    # matching the previous `.iloc[0]` behaviour.
    _seed_titles = {}
    for _sid, _title in zip(options, seed_filtered["title"].tolist()):
        _seed_titles.setdefault(_sid, _title)

    current     = st.session_state.get("selected_seed_id", options[0])
    default_idx = options.index(current) if current in options else 0
    selected_seed_id = st.selectbox(
        "Seed paper", options, index=default_idx,
        format_func=lambda sid: _seed_titles[sid],
    )
    st.session_state["selected_seed_id"] = selected_seed_id

# Resolve the selected seed paper's row and derive the per-paper frames
# that the tabs below read.
_selected_mask = seed_filtered["seed_paper_id"] == selected_seed_id
selected_seed  = seed_filtered.loc[_selected_mask].iloc[0]
seed_events    = load_events_for_paper(events_path, selected_seed_id, year_min, year_max)
intent_summary = build_intent_summary(seed_events)
contexts_df    = build_context_rows(seed_events)
citing_table   = build_citing_table(seed_events)

# One tab per page section, in display order.
_TAB_TITLES = ["Overview", "Citation Network", "Knowledge Graph", "Geographic Map", "Analytics"]
tab_overview, tab_cnet, tab_kg_exp, tab_geo, tab_analytics = st.tabs(_TAB_TITLES)

with tab_overview:
    # Local import, following the file's habit of importing inside tab blocks.
    import html as _html

    col1, col2 = st.columns(2)
    with col1:
        # Headline numbers and metadata of the selected seed paper.
        st.subheader("Seed paper detail")
        dc1, dc2 = st.columns(2)
        dc1.metric("Cited by",        fmt_num(selected_seed["citedby_count"]))
        dc2.metric("Citation events", fmt_num(len(seed_events)))
        for label, key in [
            ("Title","title"),("DOI","doi"),("Published","cover_date"),
            ("Journal","journal"),("Author","author"),("Affiliation","affiliation"),
            ("City","city"),("Country","country"),("Field","field"),
        ]:
            st.markdown(f"**{label}**  \n{selected_seed[key] or '-'}")

        st.subheader("Related citing papers")
        st.dataframe(citing_table.rename(columns={
            "citing_title":"Title","citing_year":"Year",
            "primary_intent":"Intent","context_count":"Contexts"}),
            use_container_width=True, hide_index=True)

        st.subheader("Co-cited seed papers")
        st.caption("Other top 5% cited papers that appear together with the selected paper in the same citing works")
        # Attach seed-paper metadata to the co-citation counts.
        cocited = load_cocited_papers(events_path, selected_seed_id).merge(
            seed[["seed_paper_id","title","field","journal","citedby_count"]], on="seed_paper_id", how="left")
        if cocited.empty:
            st.info("Co-cited papers not found.")
        else:
            st.dataframe(cocited.rename(columns={
                "co_citation_count":"Co-citations","title":"Title",
                "field":"Field","citedby_count":"Cited by"}),
                use_container_width=True, hide_index=True)

    with col2:
        st.subheader("Intent distribution (selected paper)")
        fig = px.bar(intent_summary, x="intent", y="count", color="intent",
                     color_discrete_map=INTENT_COLORS)
        fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
        st.plotly_chart(fig, use_container_width=True)

        st.subheader("CitationHub Intent Distribution")
        # Global counts, reindexed onto ALLOWED_INTENTS so every intent gets
        # a bar (0 when absent) in a fixed order.
        _gi = load_global_intent_stats(events_path).set_index("intent")["count"].to_dict()
        ai_df = pd.DataFrame({"intent": ALLOWED_INTENTS,
                               "count": [int(_gi.get(i, 0)) for i in ALLOWED_INTENTS]})
        fig2 = px.bar(ai_df, x="intent", y="count", color="intent",
                      color_discrete_map=INTENT_COLORS)
        fig2.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
        st.plotly_chart(fig2, use_container_width=True)

        st.subheader("CitationHub Field Distribution")
        # Top 20 fields among the currently filtered seed papers.
        fd = (seed_filtered.groupby("field", dropna=False).size()
              .reset_index(name="count").sort_values("count", ascending=False).head(20))
        fd["field"] = fd["field"].replace("","Unknown")
        st.plotly_chart(
            px.bar(fd, x="field", y="count").update_layout(xaxis_title="", yaxis_title="Count"),
            use_container_width=True)

    st.subheader("Citation contexts")
    if contexts_df.empty:
        st.info("No contexts available.")
    else:
        for _, row in contexts_df.iterrows():
            intent_label = row['primary_intent']
            badge_color = INTENT_COLORS.get(intent_label, '#64748b')
            source_line = f"{row['citing_year'] or '-'} · {row['citing_title'] or row['citing_doi']}"
            # The card is emitted as one line: in Markdown a blank line ends
            # an HTML block and indented lines after it become code blocks.
            # All data-derived text is escaped because the markup is rendered
            # with unsafe_allow_html=True.
            card = (
                '<div style="border:1px solid #e2e8f0;border-radius:14px;padding:12px;'
                'margin-bottom:10px;background:#f8fafc;">'
                f'<div style="display:inline-block;background:{badge_color};'
                'color:white;border-radius:999px;padding:4px 8px;font-size:12px;margin-bottom:6px;">'
                f'{_html.escape(str(intent_label))}</div>'
                '<div style="font-size:12px;color:#64748b;margin-bottom:6px;">'
                f'{_html.escape(source_line)}</div>'
                f'<div>{_html.escape(str(row["context"]))}</div></div>'
            )
            st.markdown(card, unsafe_allow_html=True)

with tab_cnet:
    st.subheader("Citation Network")
    st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
    # Build and embed the PyVis graph only when there are citation events.
    if not seed_events.empty:
        _cnet_html = pyvis_citation_graph(selected_seed, seed_events)
        components.html(_cnet_html, height=820, scrolling=True)
    else:
        st.info("No citation network data for this seed paper.")

with tab_kg_exp:
    st.subheader("Knowledge Graph")

    st.subheader("CitationHub Ontology — Concepts, Instances & Relationships")
    st.caption("🔍 Scroll/pinch: zoom  |  Drag: pan  |  Hover node: details  |  ⛶ (top-right toolbar): fullscreen")
    st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)

    st.markdown("---")

    def _sql_str_list(values):
        # Render values as a comma-separated list of SQL string literals,
        # doubling embedded single quotes so an id cannot end its literal.
        return ", ".join("'" + str(v).replace("'", "''") + "'" for v in values)

    try:
        with st.spinner("Loading..."):
            kg_nodes_exp  = load_kg_nodes(data_dir_val)
            kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)

        import duckdb as _ddb

        # Path as embedded in the SQL text: forward slashes (same
        # normalisation the analytics tab applies to its parquet path) and
        # single quotes doubled so the literal stays well-formed.
        _edges_src = str(kg_edges_path).replace("\\", "/").replace("'", "''")

        # Node-type counts come from the in-memory node table; edge-type
        # counts are aggregated by DuckDB straight from the parquet file.
        nt = kg_nodes_exp["node_type"].value_counts().reset_index()
        nt.columns = ["node_type", "count"]

        et = _ddb.execute(f"""
            SELECT edge_type, COUNT(*) AS count
            FROM read_parquet('{_edges_src}')
            GROUP BY edge_type ORDER BY count DESC
        """).df()

        col_a, col_b, col_c, col_d = st.columns([1, 2, 1, 2])
        with col_a:
            st.subheader("Node Types")
            st.dataframe(nt, use_container_width=True, hide_index=True)
        with col_b:
            st.subheader("CitationHub KG Node Distribution")
            nt_fig = px.bar(nt, x="node_type", y="count", color="node_type",
                            color_discrete_map=NODE_TYPE_COLORS)
            nt_fig.update_layout(showlegend=False, xaxis_title="", yaxis_title="Count")
            st.plotly_chart(nt_fig, use_container_width=True)
        with col_c:
            st.subheader("Edge Types")
            st.dataframe(et, use_container_width=True, hide_index=True)
        with col_d:
            st.subheader("CitationHub KG Edge Distribution")
            et_fig = px.bar(et, x="edge_type", y="count", color="edge_type")
            et_fig.update_layout(showlegend=False, xaxis_title="",
                                 yaxis_title="Count", xaxis_tickangle=-35)
            st.plotly_chart(et_fig, use_container_width=True)

        st.markdown("---")
        st.subheader("Multi-Node Knowledge Graph")
        st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")

        n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")

        # Cap on the rows kept per edge type in each hop query below.
        EDGES_PER_TYPE = 10

        with st.spinner("Querying graph..."):
            # Start from the most-cited seed-paper nodes.
            top_seeds = (kg_nodes_exp[kg_nodes_exp["node_type"] == "seed_paper"]
                         .sort_values("citedby_count", ascending=False)
                         .head(n_seeds))
            seed_ids = top_seeds["node_id"].tolist()

            if seed_ids:
                ids_sql = _sql_str_list(seed_ids)

                # Hop 1: edges touching any chosen seed, sampled per edge type.
                hop1 = _ddb.execute(f"""
                    WITH ranked AS (
                        SELECT source, target, edge_type,
                               ROW_NUMBER() OVER (
                                   PARTITION BY edge_type ORDER BY source
                               ) AS rn
                        FROM read_parquet('{_edges_src}')
                        WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
                    )
                    SELECT source, target, edge_type FROM ranked
                    WHERE rn <= {EDGES_PER_TYPE}
                """).df()

                # Citation-event nodes reached in hop 1 (at most 40) are the
                # starting points for the second hop.
                hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
                event_node_ids = (
                    kg_nodes_exp[
                        kg_nodes_exp["node_id"].isin(hop1_all_ids) &
                        (kg_nodes_exp["node_type"] == "citation_event")
                    ]["node_id"].tolist()[:40]
                )

                if event_node_ids:
                    ev_sql = _sql_str_list(event_node_ids)

                    # Hop 2: edges touching those events, limited to edge
                    # types that do not occur on edges touching the seeds.
                    hop2 = _ddb.execute(f"""
                        WITH ranked AS (
                            SELECT source, target, edge_type,
                                   ROW_NUMBER() OVER (
                                       PARTITION BY edge_type ORDER BY source
                                   ) AS rn
                            FROM read_parquet('{_edges_src}')
                            WHERE (source IN ({ev_sql}) OR target IN ({ev_sql}))
                              AND edge_type NOT IN (
                                  SELECT DISTINCT edge_type
                                  FROM read_parquet('{_edges_src}')
                                  WHERE source IN ({ids_sql}) OR target IN ({ids_sql})
                              )
                        )
                        SELECT source, target, edge_type FROM ranked
                        WHERE rn <= {EDGES_PER_TYPE}
                    """).df()
                    exp_edges = pd.concat([hop1, hop2]).drop_duplicates(
                        subset=["source", "target", "edge_type"]
                    )
                else:
                    exp_edges = hop1

                # Keep only the nodes referenced by the selected edges.
                all_exp_ids = set(exp_edges["source"].tolist()) | set(exp_edges["target"].tolist())
                exp_nodes = kg_nodes_exp[kg_nodes_exp["node_id"].isin(all_exp_ids)]

                c1, c2, c3, c4 = st.columns(4)
                c1.metric("Nodes",      fmt_num(len(exp_nodes)))
                c2.metric("Edges",      fmt_num(len(exp_edges)))
                c3.metric("Node types", fmt_num(exp_nodes["node_type"].nunique()))
                c4.metric("Edge types", fmt_num(exp_edges["edge_type"].nunique()))

                kg_html = pyvis_from_kg(exp_nodes, exp_edges)
                components.html(kg_html, height=860, scrolling=True)
            else:
                # Previously this section was silently left blank.
                st.info("No seed paper nodes found in the knowledge graph.")

    except Exception as e:
        # Any load or query failure is reported in this tab; the other tabs
        # still render.
        st.error(str(e))

with tab_geo:
    st.subheader("Geographic Distribution of Seed Papers")
    with st.spinner("Loading geographic data..."):
        aff_geo_df = load_geo_data(data_dir_val)

    # Seed-paper count per country, for the choropleth.
    country_cnt = (seed_filtered.groupby("country", dropna=False).size()
                   .reset_index(name="count").rename(columns={"country":"country_name"}))
    # groupby(dropna=False) keeps a missing-country group; fillna("") makes
    # the blank filter remove it along with empty strings.
    country_cnt = country_cnt[country_cnt["country_name"].fillna("").str.strip() != ""]

    if not country_cnt.empty:
        fig_map = px.choropleth(country_cnt, locations="country_name",
                                locationmode="country names", color="count",
                                hover_name="country_name",
                                color_continuous_scale="Blues",
                                title="Seed Papers by Country")
        fig_map.update_layout(geo=dict(showframe=False), height=500)
        st.plotly_chart(fig_map, use_container_width=True)

    st.subheader("Top Cities")
    # One lookup row per affiliation name: otherwise the left merge repeats
    # a seed paper for every matching geo row and inflates the city counts.
    geo_lookup = (aff_geo_df[["affiliation_name","city_name","country_name"]]
                  .drop_duplicates(subset="affiliation_name"))
    city_cnt = (seed_filtered.merge(
                    geo_lookup,
                    left_on="affiliation", right_on="affiliation_name", how="left")
                .groupby(["country_name","city_name"], dropna=False).size()
                .reset_index(name="count").dropna(subset=["country_name"])
                .sort_values("count", ascending=False).head(30))
    if not city_cnt.empty:
        st.plotly_chart(
            px.bar(city_cnt, x="city_name", y="count", color="country_name",
                   title="Top 30 Cities")
            .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
            use_container_width=True)

    st.subheader("Top Affiliations")
    geo_col1, geo_col2 = st.columns(2)

    with geo_col1:
        # Top 20 affiliations by number of filtered seed papers.
        aff_cnt = (seed_filtered[seed_filtered["affiliation"].str.strip() != ""]
                   .groupby("affiliation").size()
                   .reset_index(name="count")
                   .sort_values("count", ascending=False).head(20))
        if not aff_cnt.empty:
            st.plotly_chart(
                px.bar(aff_cnt, x="count", y="affiliation", orientation="h",
                       title="Top 20 Affiliations by Seed Papers",
                       labels={"count": "Seed Papers", "affiliation": ""})
                .update_layout(yaxis=dict(autorange="reversed"),
                               xaxis_title="Seed Papers", yaxis_title="", height=520),
                use_container_width=True)

    with geo_col2:
        # Same ranking, split by country so the bars can be stacked.
        aff_country = (seed_filtered[
                (seed_filtered["affiliation"].str.strip() != "") &
                (seed_filtered["country"].str.strip() != "")
            ]
            .groupby(["country", "affiliation"]).size()
            .reset_index(name="count")
            .sort_values("count", ascending=False)
        )
        top_affs = aff_country.groupby("affiliation")["count"].sum().nlargest(20).index
        aff_country_top = aff_country[aff_country["affiliation"].isin(top_affs)]
        if not aff_country_top.empty:
            st.plotly_chart(
                px.bar(aff_country_top, x="count", y="affiliation",
                       color="country", orientation="h",
                       title="Top Affiliations by Country",
                       labels={"count": "Seed Papers", "affiliation": "", "country": "Country"})
                .update_layout(yaxis=dict(autorange="reversed"),
                               barmode="stack",
                               xaxis_title="Seed Papers", yaxis_title="",
                               legend_title="Country", height=520),
                use_container_width=True)

with tab_analytics:
    # Author data is optional: on failure the tab falls back to the plain
    # "author" column of the seed table.
    try:
        with st.spinner("Loading analytics data..."):
            authors_df = load_authors_data(data_dir_val)
        _authors_ok = True
    except Exception as _e:
        st.warning(f"Authors data unavailable: {_e}")
        authors_df = pd.DataFrame(columns=["author_id", "author_name"])
        _authors_ok = False

    col_a, col_b = st.columns(2)

    with col_a:
        st.subheader("Top Authors")
        if _authors_ok and "author_id" in seed.columns and not seed["author_id"].isna().all():
            auth_counts = (seed.explode("author_id")
                           .merge(authors_df, on="author_id", how="left")
                           .groupby("author_name").size()
                           .reset_index(name="paper_count")
                           .sort_values("paper_count", ascending=False))
        else:
            # rename_axis + reset_index(name=...) yields the same two column
            # names on every pandas version; value_counts().reset_index()
            # alone names its columns differently before and after pandas 2.0.
            auth_counts = (seed["author"].value_counts()
                           .rename_axis("author_name")
                           .reset_index(name="paper_count"))
        # Drop blank names before taking the top 20 so a blank entry does
        # not use up one of the 20 slots.
        top_auth = auth_counts[auth_counts["author_name"].str.strip() != ""].head(20)
        st.plotly_chart(
            px.bar(top_auth, x="paper_count", y="author_name", orientation="h",
                   title="Top 20 Authors")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    with col_b:
        st.subheader("Top Journals")
        jnl_counts = (seed.groupby("journal").size()
                      .reset_index(name="count").sort_values("count", ascending=False))
        # Blank filter before head(20), for the same reason as the authors.
        top_jnl = jnl_counts[jnl_counts["journal"].str.strip() != ""].head(20)
        st.plotly_chart(
            px.bar(top_jnl, x="count", y="journal", orientation="h",
                   title="Top 20 Journals")
            .update_layout(yaxis=dict(autorange="reversed"),
                           xaxis_title="Seed Papers", yaxis_title=""),
            use_container_width=True)

    st.markdown("---")
    col_c, col_d = st.columns(2)

    _agg = load_analytics_data(events_path)
    _seed_field_map = seed.set_index("seed_paper_id")["field"].to_dict()

    with col_c:
        st.subheader("CitationHub Field × Intent Distribution Heatmap")
        import duckdb as _addb
        ep = events_path.replace("\\", "/")
        # Count events per (seed paper, intent) in DuckDB, then map each
        # seed paper to its field in pandas.
        _fi_raw = _addb.execute(f"""
            SELECT cited_seed_paper_id AS seed_paper_id, primary_intent, COUNT(*) AS count
            FROM read_parquet('{ep}')
            WHERE primary_intent IN ({_INTENTS_SQL})
            GROUP BY cited_seed_paper_id, primary_intent
        """).df()
        _fi_raw["field"] = _fi_raw["seed_paper_id"].map(_seed_field_map).fillna("")
        # fi2 is reused by the stacked percentage chart further down.
        fi2 = (_fi_raw[_fi_raw["field"] != ""]
               .groupby(["field","primary_intent"])["count"].sum().reset_index())
        if not fi2.empty:
            pivot = fi2.pivot(index="field", columns="primary_intent", values="count").fillna(0)
            st.plotly_chart(
                px.imshow(pivot, color_continuous_scale="Blues",
                          title="CitationHub Field × Intent Distribution Heatmap",
                          aspect="auto")
                .update_layout(xaxis_title="Intent", yaxis_title="Field"),
                use_container_width=True)

    with col_d:
        st.subheader("Influential Citations (selected paper)")
        if "is_influential" in seed_events.columns:
            inf = seed_events["is_influential"].value_counts().reset_index()
            inf.columns = ["is_influential","count"]
            inf["label"] = inf["is_influential"].map({True:"Influential", False:"Non-influential"})
            st.plotly_chart(
                px.pie(inf, names="label", values="count",
                       title="Influential vs Non-influential"),
                use_container_width=True)

    st.markdown("---")
    st.subheader("CitationHub Intent Evolution over Years")
    st.caption("How citation intents have changed across all papers over time")
    intent_trend_raw = _agg["intent_trend"]
    if not intent_trend_raw.empty:
        st.plotly_chart(
            px.area(
                intent_trend_raw, x="year", y="count", color="primary_intent",
                color_discrete_map=INTENT_COLORS,
                labels={"primary_intent": "Intent", "count": "Citations", "year": "Year"},
            ).update_layout(
                legend_title="Intent",
                xaxis_title="Year", yaxis_title="# Citations",
                hovermode="x unified",
            ),
            use_container_width=True,
        )

    st.markdown("---")
    col_v1, col_v2 = st.columns(2)

    with col_v1:
        st.subheader("Top Citing Venues")
        st.caption("Journals/conferences that cite seed papers most")
        venue_cnt = _agg["venues"]
        if not venue_cnt.empty:
            st.plotly_chart(
                px.bar(venue_cnt, x="count", y="citing_venue", orientation="h",
                       labels={"count": "Citations", "citing_venue": ""})
                .update_layout(yaxis=dict(autorange="reversed"),
                               xaxis_title="Citations", yaxis_title="", height=520),
                use_container_width=True,
            )

    with col_v2:
        st.subheader("CitationHub Field × Intent Distribution")
        st.caption("How each field uses citations differently (all fields)")
        fi_pct = fi2.copy()
        if not fi_pct.empty:
            # Turn counts into each intent's percentage share within its field.
            totals = fi_pct.groupby("field")["count"].transform("sum")
            fi_pct["pct"] = (fi_pct["count"] / totals * 100).round(1)
            n_fields = fi_pct["field"].nunique()
            # Grow the chart with the number of fields so rows stay readable.
            chart_height = max(520, n_fields * 28)
            st.plotly_chart(
                px.bar(fi_pct, x="pct", y="field", color="primary_intent",
                       orientation="h", color_discrete_map=INTENT_COLORS,
                       labels={"pct": "% of citations", "field": "", "primary_intent": "Intent"})
                .update_layout(
                    barmode="stack",
                    yaxis=dict(autorange="reversed", categoryorder="total ascending"),
                    xaxis_title="% of citations", yaxis_title="",
                    legend_title="Intent", height=chart_height,
                ),
                use_container_width=True,
            )

    st.markdown("---")
    st.subheader("Citation Trend over Time (selected paper)")
    st.caption("How citations to the selected seed paper have changed year by year")
    # Events per citing year for the selected paper, from 2000 onwards.
    trend_sel = (seed_events.dropna(subset=["citing_year"])
                 .assign(citing_year=lambda df: df["citing_year"].astype(int))
                 .query("citing_year >= 2000")
                 .groupby("citing_year").size().reset_index(name="count"))
    if not trend_sel.empty:
        st.plotly_chart(
            px.line(trend_sel, x="citing_year", y="count", markers=True,
                    labels={"citing_year": "Year", "count": "Citations"})
            .update_layout(xaxis_title="Year", yaxis_title="Citations",
                           hovermode="x unified"),
            use_container_width=True)
    else:
        st.info("No citation trend data for the selected paper.")

    st.markdown("---")
    st.subheader("Export Data")
    col_e1, col_e2, col_e3 = st.columns(3)

    with col_e1:
        csv_seed = seed_filtered[
            ["title", "doi", "journal", "author", "country", "field", "citedby_count"]
        ].to_csv(index=False).encode("utf-8")
        csv_download_link(csv_seed, "seed_papers.csv", "⬇ Seed Papers (CSV)")

    with col_e2:
        # Export only the columns actually present in the events frame.
        _cite_cols = [c for c in
            ["citing_title", "citing_doi", "citing_year", "citing_venue",
             "primary_intent", "context_count", "is_influential"]
            if c in seed_events.columns]
        cite_export = (seed_events[_cite_cols]
            .rename(columns={
                "citing_title": "title", "citing_doi": "doi",
                "citing_year": "year", "citing_venue": "venue",
                "primary_intent": "intent", "context_count": "contexts",
                "is_influential": "influential",
            }).to_csv(index=False).encode("utf-8"))
        csv_download_link(cite_export, "citation_events.csv", "⬇ Citation Events (CSV)")

    with col_e3:
        intent_csv = intent_summary.to_csv(index=False).encode("utf-8")
        csv_download_link(intent_csv, "intent_summary.csv", "⬇ Intent Summary (CSV)")