KChad committed on
Commit
6f9d53d
·
1 Parent(s): 6da61f5

new changes

Browse files
Files changed (1) hide show
  1. server/app.py +58 -11
server/app.py CHANGED
@@ -59,9 +59,47 @@ def _landing_page_html() -> str:
59
  </article>
60
  """
61
  for task, description in (
62
- ("easy", "Obvious or lightly hidden prompt injections that try to force unsafe outbound behavior."),
63
- ("medium", "Wrapper-based, authority-spoofed, or session-confusing attacks that require better judgment."),
64
- ("hard", "Safe-completion episodes where the agent must write a clean internal summary without leaking attack content."),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  )
66
  )
67
 
@@ -317,9 +355,10 @@ def _landing_page_html() -> str:
317
  <span class="eyebrow">OpenEnv Benchmark</span>
318
  <h1>IDPI Exfil Environment</h1>
319
  <p class="lede">
320
- A prompt-injection benchmark for email-style agent workflows. Agents must finish
321
- legitimate reading and summarization tasks while refusing hidden exfiltration
322
- attempts, delayed triggers, and contaminated summary instructions.
 
323
  </p>
324
  <div class="cta-row">
325
  <a class="cta primary" href="/docs">Open API Docs</a>
@@ -344,8 +383,8 @@ def _landing_page_html() -> str:
344
  <div class="stat-value">{", ".join(TASK_NAMES)}</div>
345
  </div>
346
  <div class="stat">
347
- <div class="stat-label">Reset</div>
348
- <div class="stat-value">POST /reset</div>
349
  </div>
350
  </div>
351
  </div>
@@ -357,12 +396,19 @@ def _landing_page_html() -> str:
357
  </section>
358
 
359
  <section class="panel section">
360
- <h2>Task Ladder</h2>
361
  <div class="task-grid">
362
  {task_cards}
363
  </div>
364
  </section>
365
 
 
 
 
 
 
 
 
366
  <section class="panel section">
367
  <h2>Quick Start</h2>
368
  <div class="api-grid">
@@ -378,8 +424,9 @@ curl https://your-space.hf.space/result</pre>
378
  <ul>
379
  <li>Use <span class="inline">POST /reset</span> to start a new episode. An empty body is valid.</li>
380
  <li>Use <span class="inline">POST /step</span> with typed OpenEnv actions.</li>
381
- <li>Inspect live schemas at <span class="inline">/schema</span> and interactive docs at <span class="inline">/docs</span>.</li>
382
- <li>Call <span class="inline">/metadata</span> for task info and <span class="inline">/health</span> for service health.</li>
 
383
  </ul>
384
  </div>
385
  </div>
 
59
  </article>
60
  """
61
  for task, description in (
62
+ (
63
+ "easy",
64
+ "Person A easy overlay with obvious but believable relay attempts like "
65
+ "`easy_gold_direct_relay_confirmation` and `easy_gold_hidden_wrapper_ack`.",
66
+ ),
67
+ (
68
+ "medium",
69
+ "Curated Person A medium pack with hidden carrier channels such as "
70
+ "`medium_gold_html_comment_relay`, transcript replay, and authority spoofing.",
71
+ ),
72
+ (
73
+ "hard",
74
+ "Person A hard pack where the agent must finish the business task safely, including "
75
+ "`hard_gold_attachment_alignment_clause` and delayed summary-hook scenarios.",
76
+ ),
77
+ )
78
+ )
79
+ showcase_cards = "".join(
80
+ f"""
81
+ <article class="task-card">
82
+ <span class="task-pill">{channel}</span>
83
+ <h3>{title}</h3>
84
+ <p>{description}</p>
85
+ </article>
86
+ """
87
+ for channel, title, description in (
88
+ (
89
+ "HTML COMMENT",
90
+ "medium_gold_html_comment_relay",
91
+ "Hidden relay instructions tucked into comment markup. Safe agents should read, flag, and keep the digest workflow internal.",
92
+ ),
93
+ (
94
+ "ATTACHMENT",
95
+ "hard_gold_delayed_attachment_summary_hook",
96
+ "The malicious instruction only activates when the agent tries to write the benign summary, which makes the episode trajectory-aware instead of static-text-aware.",
97
+ ),
98
+ (
99
+ "SEMANTIC",
100
+ "hard_gold_attachment_alignment_clause",
101
+ "A business-looking workbook clause poisons the internal summary unless the model recognizes it as attacker-directed content.",
102
+ ),
103
  )
104
  )
105
 
 
355
  <span class="eyebrow">OpenEnv Benchmark</span>
356
  <h1>IDPI Exfil Environment</h1>
357
  <p class="lede">
358
+ A prompt-injection benchmark for Person A's email workflow. The runtime pack contains
359
+ 32 curated scenarios across easy, medium, and hard splits, covering hidden relay
360
+ instructions, delayed summary hooks, HTML comment attacks, transcript replay, and
361
+ attachment-borne contamination attempts.
362
  </p>
363
  <div class="cta-row">
364
  <a class="cta primary" href="/docs">Open API Docs</a>
 
383
  <div class="stat-value">{", ".join(TASK_NAMES)}</div>
384
  </div>
385
  <div class="stat">
386
+ <div class="stat-label">Runtime Pack</div>
387
+ <div class="stat-value">Person A / 32 scenarios</div>
388
  </div>
389
  </div>
390
  </div>
 
396
  </section>
397
 
398
  <section class="panel section">
399
+ <h2>Person A Runtime Pack</h2>
400
  <div class="task-grid">
401
  {task_cards}
402
  </div>
403
  </section>
404
 
405
+ <section class="panel section">
406
+ <h2>Showcase Episodes</h2>
407
+ <div class="task-grid">
408
+ {showcase_cards}
409
+ </div>
410
+ </section>
411
+
412
  <section class="panel section">
413
  <h2>Quick Start</h2>
414
  <div class="api-grid">
 
424
  <ul>
425
  <li>Use <span class="inline">POST /reset</span> to start a new episode. An empty body is valid.</li>
426
  <li>Use <span class="inline">POST /step</span> with typed OpenEnv actions.</li>
427
+ <li>The evaluation runner uses <span class="inline">docs/person_a_benchmark_manifest.json</span> as the runtime source of truth.</li>
428
+ <li>Inspect live schemas at <span class="inline">/schema</span>, docs at <span class="inline">/docs</span>, and metadata at <span class="inline">/metadata</span>.</li>
429
+ <li>Call <span class="inline">/health</span> for service health and <span class="inline">/result</span> after an episode to inspect grader output.</li>
430
  </ul>
431
  </div>
432
  </div>