microsoft
/

phi-1

@@ -753,7 +753,7 @@ class MixFormerSequentialForCausalLM(MixFormerSequentialPreTrainedModel):
         **kwargs,
     ) -> CausalLMOutputWithPast:
         if attention_mask is not None and self.training:
-            raise ValueError("`attention_mask` is not supported during training.")
         if past_key_values is None and attention_mask is None:
             lm_logits = self.layers(input_ids)

         **kwargs,
     ) -> CausalLMOutputWithPast:
         if attention_mask is not None and self.training:
+            print("`attention_mask` is not supported during training. Using it might lead to unexpected results.")
         if past_key_values is None and attention_mask is None:
             lm_logits = self.layers(input_ids)