Natt1e committed on
Commit
8b68c5b
·
verified ·
1 Parent(s): bc14582

Update modeling_stable_diffcoder.py

Browse files

This PR is for https://github.com/ByteDance-Seed/Stable-DiffCoder/issues/1

Files changed (1) hide show
  1. modeling_stable_diffcoder.py +22 -8
modeling_stable_diffcoder.py CHANGED
@@ -160,12 +160,15 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
160
  cur_attn_mask = block_diffusion_attention_mask[
161
  ..., :prefill_length, :prefill_length
162
  ]
 
 
163
  self(
164
  x[:, :prefill_length],
165
  past_key_values=past_key_values,
166
  attention_mask=cur_attn_mask,
167
  use_cache=True,
168
- ).past_key_values
 
169
 
170
  for block_id, block_size in enumerate(gen_block_list):
171
  block_start = (
@@ -181,8 +184,13 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
181
  replace_position = torch.zeros_like(x, dtype=torch.bool)
182
  replace_position[:, block_start:block_end] = True
183
 
184
- for token_count in num_transfer_tokens:
185
- if token_count:
 
 
 
 
 
186
  nfe += 1
187
  mask_map = x[:, block_start:block_end] == mask_id
188
  attention_mask = block_diffusion_attention_mask[
@@ -207,20 +215,26 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
207
  x[:, block_start:block_end],
208
  token_count if threshold is None else None,
209
  threshold,
210
- shift=False,
211
  )
212
  x[:, block_start:block_end][transfer_map] = x0[transfer_map]
213
 
214
  if (x[:, block_start:block_end] == mask_id).sum() == 0:
 
 
 
 
215
  if (
216
  eos_id is not None
217
- and (x[:, block_start:block_end] == eos_id).sum() > 0
 
218
  ):
219
  final_flag = True
220
  x = x[:, :block_end]
221
- eos_pos = (x == eos_id).nonzero(as_tuple=True)[1][0].item()
222
  x[0, eos_pos:] = eos_id
223
  break
 
224
  nfe += 1
225
  self(
226
  x[:, block_start:block_end],
@@ -231,13 +245,13 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
231
  use_cache=True,
232
  cache_position=replace_position.nonzero(as_tuple=True)[1],
233
  )
234
- break
235
 
236
  if final_flag:
237
  break
238
 
239
  return x, nfe
240
-
241
  @torch.no_grad()
242
  def generate(
243
  self,
 
160
  cur_attn_mask = block_diffusion_attention_mask[
161
  ..., :prefill_length, :prefill_length
162
  ]
163
+ # Fix 1: Explicitly pass cache_position for newer transformers prefill
164
+ cache_pos = torch.arange(prefill_length, device=x.device)
165
  self(
166
  x[:, :prefill_length],
167
  past_key_values=past_key_values,
168
  attention_mask=cur_attn_mask,
169
  use_cache=True,
170
+ cache_position=cache_pos,
171
+ )
172
 
173
  for block_id, block_size in enumerate(gen_block_list):
174
  block_start = (
 
184
  replace_position = torch.zeros_like(x, dtype=torch.bool)
185
  replace_position[:, block_start:block_end] = True
186
 
187
+ step_idx = 0
188
+ while True:
189
+ idx = min(step_idx, len(num_transfer_tokens) - 1)
190
+ token_count = num_transfer_tokens[idx].item()
191
+ step_idx += 1
192
+
193
+ if token_count > 0:
194
  nfe += 1
195
  mask_map = x[:, block_start:block_end] == mask_id
196
  attention_mask = block_diffusion_attention_mask[
 
215
  x[:, block_start:block_end],
216
  token_count if threshold is None else None,
217
  threshold,
218
+ shift=shift,
219
  )
220
  x[:, block_start:block_end][transfer_map] = x0[transfer_map]
221
 
222
  if (x[:, block_start:block_end] == mask_id).sum() == 0:
223
+ # Fix 2: Calculate where the generated tokens ACTUALLY start in this block
224
+ gen_start = max(block_start, prompt_length)
225
+
226
+ # Only check for eos_id in the freshly generated region, ignoring the prompt overlap
227
  if (
228
  eos_id is not None
229
+ and gen_start < block_end
230
+ and (x[:, gen_start:block_end] == eos_id).sum() > 0
231
  ):
232
  final_flag = True
233
  x = x[:, :block_end]
234
+ eos_pos = (x[:, gen_start:block_end] == eos_id).nonzero(as_tuple=True)[1][0].item() + gen_start
235
  x[0, eos_pos:] = eos_id
236
  break
237
+
238
  nfe += 1
239
  self(
240
  x[:, block_start:block_end],
 
245
  use_cache=True,
246
  cache_position=replace_position.nonzero(as_tuple=True)[1],
247
  )
248
+ break
249
 
250
  if final_flag:
251
  break
252
 
253
  return x, nfe
254
+
255
  @torch.no_grad()
256
  def generate(
257
  self,