| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """Testing suite for the PyTorch Cohere2 model.""" |
| |
|
| | import unittest |
| |
|
| | import pytest |
| | from packaging import version |
| | from parameterized import parameterized |
| | from pytest import mark |
| |
|
| | from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline |
| | from transformers.generation.configuration_utils import GenerationConfig |
| | from transformers.testing_utils import ( |
| | require_flash_attn, |
| | require_read_token, |
| | require_torch, |
| | require_torch_large_gpu, |
| | slow, |
| | torch_device, |
| | ) |
| |
|
| | from ...models.cohere.test_modeling_cohere import CohereModelTest, CohereModelTester |
| | from ...test_configuration_common import ConfigTester |
| |
|
| |
|
| | if is_torch_available(): |
| | import torch |
| |
|
| | from transformers import ( |
| | Cohere2ForCausalLM, |
| | Cohere2Model, |
| | ) |
| |
|
| |
|
class Cohere2ModelTester(CohereModelTester):
    """Model tester that reuses `CohereModelTester` with the Cohere2 classes plugged in."""

    config_class = Cohere2Config
    # The model classes are only imported (and therefore only defined) when torch is installed.
    if is_torch_available():
        model_class, for_causal_lm_class = Cohere2Model, Cohere2ForCausalLM
| |
|
| |
|
@require_torch
class Cohere2ModelTest(CohereModelTest, unittest.TestCase):
    """Run the shared Cohere model tests against the Cohere2 classes.

    All tests are inherited from `CohereModelTest`. The overrides below only
    skip inherited tests; the reason is given in each `unittest.skip` message.
    """

    all_model_classes = (Cohere2Model, Cohere2ForCausalLM) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
            "feature-extraction": Cohere2Model,
            "text-generation": Cohere2ForCausalLM,
        }
        if is_torch_available()
        else {}
    )
    _is_stateful = True

    def setUp(self):
        """Create the Cohere2 model tester and config tester used by the inherited tests."""
        self.model_tester = Cohere2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Cohere2Config, hidden_size=37)

    # --- Skipped: output / attention-implementation equivalence -----------------------

    @unittest.skip("Failing because of unique cache (HybridCache)")
    def test_model_outputs_equivalence(self, **kwargs):
        pass

    @unittest.skip("Cohere2's forcefully disables sdpa due to softcapping")
    def test_sdpa_can_dispatch_non_composite_models(self):
        pass

    @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
    def test_eager_matches_sdpa_generate(self):
        pass

    # --- Skipped: decoding strategies ---------------------------------------------------

    @parameterized.expand([("random",), ("same",)])
    @pytest.mark.generate
    @unittest.skip("Cohere2 has HybridCache which is not compatible with assisted decoding")
    def test_assisted_decoding_matches_greedy_search(self, assistant_type):
        pass

    # This override is not parameterized, so it takes no argument besides `self`.
    @unittest.skip("Cohere2 has HybridCache which is not compatible with assisted decoding")
    def test_prompt_lookup_decoding_matches_greedy_search(self):
        pass

    @pytest.mark.generate
    @unittest.skip("Cohere2 has HybridCache which is not compatible with assisted decoding")
    def test_assisted_decoding_sample(self):
        pass

    @unittest.skip("Cohere2 has HybridCache which is not compatible with dola decoding")
    def test_dola_decoding_sample(self):
        pass

    @unittest.skip("Cohere2 has HybridCache and doesn't support continue from past kv")
    def test_generate_continue_from_past_key_values(self):
        pass

    @unittest.skip("Cohere2 has HybridCache and doesn't support contrastive generation")
    def test_contrastive_generate(self):
        pass

    @unittest.skip("Cohere2 has HybridCache and doesn't support contrastive generation")
    def test_contrastive_generate_dict_outputs_use_cache(self):
        pass

    @unittest.skip("Cohere2 has HybridCache and doesn't support contrastive generation")
    def test_contrastive_generate_low_memory(self):
        pass

    # --- Skipped: cache variants and input-embedding generation -------------------------

    @unittest.skip("Cohere2 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.")
    def test_generate_with_static_cache(self):
        pass

    @unittest.skip("Cohere2 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.")
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

    @unittest.skip("Cohere2 has HybridCache and doesn't support progressive generation using input embeds.")
    def test_generate_continue_from_inputs_embeds(self):
        pass

    @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
    def test_sdpa_equivalence(self):
        pass
| |
|
| |
|
@slow
@require_read_token
@require_torch_large_gpu
class Cohere2IntegrationTest(unittest.TestCase):
    """Slow integration tests for the `CohereForAI/c4ai-command-r7b-12-2024` checkpoint.

    Each test loads the checkpoint, generates with sampling disabled (or through
    an exported program) and compares the decoded text with hard-coded strings.
    """

    input_text = ["Hello I am doing", "Hi today"]
    # Major CUDA compute capability of the current device. Filled in by `setUpClass`
    # and left as `None` when torch or CUDA is unavailable. No test in this class reads it.
    cuda_compute_capability_major_version = None

    @classmethod
    def setUpClass(cls):
        """Record the CUDA compute capability major version when a GPU is present."""
        if is_torch_available() and torch.cuda.is_available():
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

    def test_model_bf16(self):
        """Greedy batch generation in bfloat16 with eager attention."""
        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_TEXTS = [
            "<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have",
            "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",
        ]

        model = AutoModelForCausalLM.from_pretrained(
            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        # Special tokens are kept so the padding/BOS layout is part of the comparison.
        output_text = tokenizer.batch_decode(output, skip_special_tokens=False)

        self.assertEqual(output_text, EXPECTED_TEXTS)

    def test_model_fp16(self):
        """Greedy batch generation in float16 with eager attention."""
        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_TEXTS = [
            "<BOS_TOKEN>Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have",
            "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",
        ]

        model = AutoModelForCausalLM.from_pretrained(
            model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager"
        ).to(torch_device)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=False)

        self.assertEqual(output_text, EXPECTED_TEXTS)

    def test_model_pipeline_bf16(self):
        """Greedy generation through the `text-generation` pipeline in bfloat16 with flex attention."""
        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_TEXTS = [
            "Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have",
            "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n",
        ]

        model = AutoModelForCausalLM.from_pretrained(
            model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention"
        ).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

        output = pipe(self.input_text, max_new_tokens=20, do_sample=False, padding=True)

        # The pipeline returns one list of candidates per prompt; compare the first candidate of each.
        self.assertEqual(output[0][0]["generated_text"], EXPECTED_TEXTS[0])
        self.assertEqual(output[1][0]["generated_text"], EXPECTED_TEXTS[1])

    @require_flash_attn
    @mark.flash_attn_test
    def test_model_flash_attn(self):
        """Greedy batch generation of 100 new tokens in float16 with FlashAttention-2."""
        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_TEXTS = [
            '<BOS_TOKEN>Hello I am doing a project for my school and I need to create a website for a fictional company. I have the logo and the name of the company. I need a website that is simple and easy to navigate. I need a home page, about us, services, contact us, and a gallery. I need the website to be responsive and I need it to be able to be hosted on a server. I need the website to be done in a week. I need the website to be done in HTML,',
            "<PAD><PAD><BOS_TOKEN>Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n\nThis recipe is very simple and easy to make.\n\nYou will need:\n\n* 2 cups of flour\n* 1 cup of sugar\n* 1/2 cup of cocoa powder\n* 1 teaspoon of baking powder\n* 1 teaspoon of baking soda\n* 1/2 teaspoon of salt\n* 2 eggs\n* 1 cup of milk\n",
        ]

        model = AutoModelForCausalLM.from_pretrained(
            model_id, attn_implementation="flash_attention_2", torch_dtype="float16"
        ).to(torch_device)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)

        output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        output_text = tokenizer.batch_decode(output, skip_special_tokens=False)

        self.assertEqual(output_text, EXPECTED_TEXTS)

    def test_export_static_cache(self):
        """Export the model with a static cache and check generation from the exported program."""
        if version.parse(torch.__version__) < version.parse("2.5.0"):
            self.skipTest(reason="This test requires torch >= 2.5 to run.")

        from transformers.integrations.executorch import (
            TorchExportableModuleWithStaticCache,
            convert_and_export_with_cache,
        )

        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_TEXT_COMPLETION = [
            "Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. What is the relationship",
        ]

        tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>", padding_side="right")

        # Load the model with a generation config that requests the static cache.
        device = "cpu"
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
        batch_size = 1
        # Single budget shared by `max_length`, the cache size and the new-token count below.
        max_generation_length = 30
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=dtype,
            attn_implementation=attn_implementation,
            generation_config=GenerationConfig(
                use_cache=True,
                cache_implementation=cache_implementation,
                max_length=max_generation_length,
                cache_config={
                    "batch_size": batch_size,
                    "max_cache_len": max_generation_length,
                },
            ),
        )

        prompts = ["Hello I am doing"]
        prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
        prompt_token_ids = prompt_tokens["input_ids"]
        # Prompt plus new tokens must fit in the cache configured above.
        max_new_tokens = max_generation_length - prompt_token_ids.shape[-1]

        # Generate with the exported program rather than with `model.generate`.
        exported_program = convert_and_export_with_cache(model)
        ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
            exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
        )
        ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)

    @parameterized.expand([("flash_attention_2",), ("sdpa",), ("flex_attention",), ("eager",)])
    @require_read_token
    def test_generation_beyond_sliding_window(self, attn_implementation: str):
        """Test that we can correctly generate beyond the sliding window. This is non trivial as
        we need to correctly slice the attention mask in all cases (because we use a HybridCache).
        Outputs for every attention function should be coherent and identical.
        """
        if attn_implementation == "flash_attention_2":
            # Imported here so the module needs no extra top-level import.
            from transformers.utils import is_flash_attn_2_available

            # Skip rather than fail on machines without flash-attn installed.
            if not is_flash_attn_2_available():
                self.skipTest(reason="FlashAttention2 is required for this test.")

        model_id = "CohereForAI/c4ai-command-r7b-12-2024"
        EXPECTED_COMPLETIONS = [
            " the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls",
            ", green, yellow, orange, purple, pink, brown, black, white, grey, silver",
        ]

        input_text = [
            "This is a nice place. " * 800 + "I really enjoy the scenery,",
            "A list of colors: red, blue",
        ]
        # `padding_side` is the tokenizer setting that selects left padding;
        # `padding` is an argument of the tokenizer call, not of `from_pretrained`.
        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
        inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device)

        model = AutoModelForCausalLM.from_pretrained(
            model_id, attn_implementation=attn_implementation, torch_dtype=torch.float16
        ).to(torch_device)

        # Make sure the prompt is indeed longer than the sliding window.
        input_size = inputs.input_ids.shape[-1]
        self.assertGreater(input_size, model.config.sliding_window)

        # Greedy decoding, like the other tests here; keep only the newly generated tokens.
        out = model.generate(**inputs, max_new_tokens=20, do_sample=False)[:, input_size:]
        output_text = tokenizer.batch_decode(out)

        self.assertEqual(output_text, EXPECTED_COMPLETIONS)
| |
|