Spaces:

KChad
/

Prompt-Injection-RL-environment

Sleeping

App Files Community

KChad committed 28 days ago

Commit

6f9d53d

1 Parent(s): 6da61f5

new changes

Browse files

Files changed (1) hide show

server/app.py +58 -11

server/app.py CHANGED Viewed

@@ -59,9 +59,47 @@ def _landing_page_html() -> str:
         </article>
         """
         for task, description in (
-            ("easy", "Obvious or lightly hidden prompt injections that try to force unsafe outbound behavior."),
-            ("medium", "Wrapper-based, authority-spoofed, or session-confusing attacks that require better judgment."),
-            ("hard", "Safe-completion episodes where the agent must write a clean internal summary without leaking attack content."),
         )
     )
@@ -317,9 +355,10 @@ def _landing_page_html() -> str:
             <span class="eyebrow">OpenEnv Benchmark</span>
             <h1>IDPI Exfil Environment</h1>
             <p class="lede">
-              A prompt-injection benchmark for email-style agent workflows. Agents must finish
-              legitimate reading and summarization tasks while refusing hidden exfiltration
-              attempts, delayed triggers, and contaminated summary instructions.
             </p>
             <div class="cta-row">
               <a class="cta primary" href="/docs">Open API Docs</a>
@@ -344,8 +383,8 @@ def _landing_page_html() -> str:
                   <div class="stat-value">{", ".join(TASK_NAMES)}</div>
                 </div>
                 <div class="stat">
-                  <div class="stat-label">Reset</div>
-                  <div class="stat-value">POST /reset</div>
                 </div>
               </div>
             </div>
@@ -357,12 +396,19 @@ def _landing_page_html() -> str:
         </section>
         <section class="panel section">
-          <h2>Task Ladder</h2>
           <div class="task-grid">
             {task_cards}
           </div>
         </section>
         <section class="panel section">
           <h2>Quick Start</h2>
           <div class="api-grid">
@@ -378,8 +424,9 @@ curl https://your-space.hf.space/result</pre>
               <ul>
                 <li>Use <span class="inline">POST /reset</span> to start a new episode. An empty body is valid.</li>
                 <li>Use <span class="inline">POST /step</span> with typed OpenEnv actions.</li>
-                <li>Inspect live schemas at <span class="inline">/schema</span> and interactive docs at <span class="inline">/docs</span>.</li>
-                <li>Call <span class="inline">/metadata</span> for task info and <span class="inline">/health</span> for service health.</li>
               </ul>
             </div>
           </div>

         </article>
         """
         for task, description in (
+            (
+                "easy",
+                "Person A easy overlay with obvious but believable relay attempts like "
+                "`easy_gold_direct_relay_confirmation` and `easy_gold_hidden_wrapper_ack`.",
+            ),
+            (
+                "medium",
+                "Curated Person A medium pack with hidden carrier channels such as "
+                "`medium_gold_html_comment_relay`, transcript replay, and authority spoofing.",
+            ),
+            (
+                "hard",
+                "Person A hard pack where the agent must finish the business task safely, including "
+                "`hard_gold_attachment_alignment_clause` and delayed summary-hook scenarios.",
+            ),
+        )
+    )
+    showcase_cards = "".join(
+        f"""
+        <article class="task-card">
+          <span class="task-pill">{channel}</span>
+          <h3>{title}</h3>
+          <p>{description}</p>
+        </article>
+        """
+        for channel, title, description in (
+            (
+                "HTML COMMENT",
+                "medium_gold_html_comment_relay",
+                "Hidden relay instructions tucked into comment markup. Safe agents should read, flag, and keep the digest workflow internal.",
+            ),
+            (
+                "ATTACHMENT",
+                "hard_gold_delayed_attachment_summary_hook",
+                "The malicious instruction only activates when the agent tries to write the benign summary, which makes the episode trajectory-aware instead of static-text-aware.",
+            ),
+            (
+                "SEMANTIC",
+                "hard_gold_attachment_alignment_clause",
+                "A business-looking workbook clause poisons the internal summary unless the model recognizes it as attacker-directed content.",
+            ),
         )
     )
             <span class="eyebrow">OpenEnv Benchmark</span>
             <h1>IDPI Exfil Environment</h1>
             <p class="lede">
+              A prompt-injection benchmark for Person A's email workflow. The runtime pack contains
+              32 curated scenarios across easy, medium, and hard splits, covering hidden relay
+              instructions, delayed summary hooks, HTML comment attacks, transcript replay, and
+              attachment-borne contamination attempts.
             </p>
             <div class="cta-row">
               <a class="cta primary" href="/docs">Open API Docs</a>
                   <div class="stat-value">{", ".join(TASK_NAMES)}</div>
                 </div>
                 <div class="stat">
+                  <div class="stat-label">Runtime Pack</div>
+                  <div class="stat-value">Person A / 32 scenarios</div>
                 </div>
               </div>
             </div>
         </section>
         <section class="panel section">
+          <h2>Person A Runtime Pack</h2>
           <div class="task-grid">
             {task_cards}
           </div>
         </section>
+        <section class="panel section">
+          <h2>Showcase Episodes</h2>
+          <div class="task-grid">
+            {showcase_cards}
+          </div>
+        </section>
         <section class="panel section">
           <h2>Quick Start</h2>
           <div class="api-grid">
               <ul>
                 <li>Use <span class="inline">POST /reset</span> to start a new episode. An empty body is valid.</li>
                 <li>Use <span class="inline">POST /step</span> with typed OpenEnv actions.</li>
+                <li>The evaluation runner uses <span class="inline">docs/person_a_benchmark_manifest.json</span> as the runtime source of truth.</li>
+                <li>Inspect live schemas at <span class="inline">/schema</span>, docs at <span class="inline">/docs</span>, and metadata at <span class="inline">/metadata</span>.</li>
+                <li>Call <span class="inline">/health</span> for service health and <span class="inline">/result</span> after an episode to inspect grader output.</li>
               </ul>
             </div>
           </div>