---
library_name: transformers
pipeline_tag: image-text-to-text
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- google/gemma-3n-E4B-it
---
This tiny model is for debugging. It is randomly initialized with the config adapted from [google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it).

| Model ID | Notes |
| --- | --- |
| [tiny-random/gemma-3n](https://huggingface.co/tiny-random/gemma-3n) | hidden size is 32 |
| [tiny-random/gemma-3n-dim4](https://huggingface.co/tiny-random/gemma-3n-dim4) | hidden size is 4; potentially not supported in paged attention kernels |

### Example usage:
```python
import torch

from transformers import pipeline

model_id = "tiny-random/gemma-3n"
pipe = pipeline(
    task="image-text-to-text",
    model=model_id,
    device=0,
    torch_dtype=torch.bfloat16,
)

# Temporary patch for the audio tower: the feature extractor emits fp32
# features while the model runs in bf16, so cast inputs to the module dtype.
from accelerate.hooks import ModelHook, add_hook_to_module

class EnsureDtype(ModelHook):
    def pre_forward(self, module, *args, **kwargs):
        args = list(args)
        args[0] = args[0].to(module.dtype)
        return super().pre_forward(module, *args, **kwargs)

add_hook_to_module(pipe.model.audio_tower, EnsureDtype())

messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a helpful assistant."}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            # audio is buggy for now: bf16 x fp32
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
            {"type": "text", "text": "Which image is cuter?"},
        ]
    },
]
result = pipe(messages, min_new_tokens=512, max_new_tokens=512, do_sample=True)
print(result)
```
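If the pipeline abstraction gets in the way while debugging, you can also drive the processor and model directly. Below is a minimal text-only sketch (it sidesteps the audio dtype issue patched above); `apply_chat_template` follows the standard multimodal processor API, and since the weights are random the decoded text is gibberish, so this only verifies the plumbing runs end to end:

```python
import torch

from transformers import AutoProcessor, Gemma3nForConditionalGeneration

model_id = "tiny-random/gemma-3n"
processor = AutoProcessor.from_pretrained(model_id)
model = Gemma3nForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
)

messages = [
    {"role": "user", "content": [{"type": "text", "text": "Hello!"}]},
]
# Tokenize the chat and sample a few (random) tokens.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=16, do_sample=True)
print(processor.batch_decode(out[:, inputs["input_ids"].shape[-1]:]))
```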
### Code to create this repo:
```python
import json

import torch
from huggingface_hub import file_exists, hf_hub_download
from timm.models.mobilenetv5 import decode_arch_def
from transformers import (
    AutoConfig,
    AutoProcessor,
    Gemma3nForConditionalGeneration,
    GenerationConfig,
    set_seed,
)

source_model_id = "google/gemma-3n-E4B-it"
save_folder = "/tmp/tiny-random/gemma-3n"

processor = AutoProcessor.from_pretrained(source_model_id)
processor.save_pretrained(save_folder)

# Shrink the source config: fewer, narrower layers in every tower.
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

config_json['audio_config'].update({
    "conf_num_attention_heads": 2,
    "conf_num_hidden_layers": 2,
    "hidden_size": 64,
})
config_json['text_config'].update({
    "activation_sparsity_pattern": [0.95, 0.95, 0.0, 0.0],
    "head_dim": 32,  # required by vllm
    "hidden_size": 32,
    "hidden_size_per_layer_input": 2,
    "intermediate_size": 64,
    "laurel_rank": 8,
    "layer_types": ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention'],
    "num_attention_heads": 1,
    "num_hidden_layers": 4,
    "num_key_value_heads": 1,
    "num_kv_shared_layers": 2,
    "sliding_window": 512,
})
# Tiny MobileNetV5 backbone: same stage layout as the original, 32 channels throughout.
block_args = decode_arch_def(
    [
        # Stage 0: 128x128 in
        [
            'er_r1_k3_s2_e4_c32',
            'er_r1_k3_s1_e4_c32',
        ],
        # Stage 1: 256x256 in
        [
            'uir_r1_a3_k5_s2_e6_c32',
            'uir_r1_a5_k0_s1_e4_c32',
            'uir_r1_a3_k0_s1_e4_c32',
        ],
        # Stage 2: 640x640 in
        [
            "uir_r1_a5_k5_s2_e6_c32",
            "uir_r1_a0_k0_s1_e1_c32",
            "mqa_r1_k3_h2_v2_s1_d64_c32",
            "uir_r1_a0_k0_s1_e2_c32",
        ],
        # Stage 3: 1280x1280 in
        [
            "uir_r1_a5_k5_s2_e6_c32",
            "mqa_r1_k3_h2_s1_d64_c32",
            "uir_r1_a0_k0_s1_e2_c32",
        ],
    ]
)
config_json['vision_config'].update({
    "hidden_size": 2048,  # hard-coded in timm
    "model_args": {
        "block_args": block_args,
    },
})
config_json['tie_word_embeddings'] = True

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)

# Instantiate directly in bf16, then restore the default dtype.
torch.set_default_dtype(torch.bfloat16)
model = Gemma3nForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
# Re-initialize every parameter with a fixed seed so the checkpoint is reproducible.
set_seed(42)
model = model.cpu()
all_numels = sum(p.numel() for p in model.parameters())
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape, f'{p.numel() / all_numels * 100: .4f}%')
model.save_pretrained(save_folder)
```
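As a quick sanity check after saving, you can reload the checkpoint and push random token ids through the text stack. A minimal sketch, reusing the `save_folder` from above (image and audio inputs are optional in the forward pass):

```python
import torch

from transformers import Gemma3nForConditionalGeneration

save_folder = "/tmp/tiny-random/gemma-3n"
model = Gemma3nForConditionalGeneration.from_pretrained(
    save_folder, torch_dtype=torch.bfloat16
)
print(f"params: {sum(p.numel() for p in model.parameters()):,}")

# Text-only forward pass over random token ids.
ids = torch.randint(0, model.config.text_config.vocab_size, (1, 8))
with torch.no_grad():
    logits = model(input_ids=ids).logits
print(logits.shape)  # torch.Size([1, 8, 262400])
```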
### Printing the model:
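For reference, the tree below is what you get by instantiating the checkpoint and printing it:

```python
from transformers import Gemma3nForConditionalGeneration

model = Gemma3nForConditionalGeneration.from_pretrained("tiny-random/gemma-3n")
print(model)
```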
```text
Gemma3nForConditionalGeneration(
  (model): Gemma3nModel(
    (vision_tower): TimmWrapperModel(
      (timm_model): MobileNetV5Encoder(
        (conv_stem): ConvNormAct(
          (conv): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (bn): RmsNormAct2d(
            (drop): Identity()
            (act): GELU(approximate='none')
          )
        )
        (blocks): Sequential(
          (0): Sequential(
            (0): EdgeResidual(
              (conv_exp): Conv2dSame(64, 256, kernel_size=(3, 3), stride=(2, 2), bias=False)
              (bn1): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
              (aa): Identity()
              (se): Identity()
              (conv_pwl): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn2): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (drop_path): Identity()
            )
            (1): EdgeResidual(
              (conv_exp): Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn1): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
              (aa): Identity()
              (se): Identity()
              (conv_pwl): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn2): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (drop_path): Identity()
            )
          )
          (1): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
          (2): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): MobileAttention(
              (norm): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (attn): MultiQueryAttention2d(
                (query): Sequential(
                  (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (key): Sequential(
                  (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False)
                  (norm): RmsNorm2d()
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (value): Sequential(
                  (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False)
                  (norm): RmsNorm2d()
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (attn_drop): Dropout(p=0.0, inplace=False)
                (output): Sequential(
                  (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (drop): Dropout(p=0.0, inplace=False)
                )
              )
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (3): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
          (3): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): MobileAttention(
              (norm): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (attn): MultiQueryAttention2d(
                (query): Sequential(
                  (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (key): Sequential(
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (value): Sequential(
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (attn_drop): Dropout(p=0.0, inplace=False)
                (output): Sequential(
                  (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (drop): Dropout(p=0.0, inplace=False)
                )
              )
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
        )
        (msfa): MobileNetV5MultiScaleFusionAdapter(
          (ffn): UniversalInvertedResidual(
            (dw_start): Identity()
            (pw_exp): ConvNormAct(
              (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
            )
            (dw_mid): Identity()
            (se): Identity()
            (pw_proj): ConvNormAct(
              (conv): Conv2d(128, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
            )
            (dw_end): Identity()
            (layer_scale): Identity()
            (drop_path): Identity()
          )
          (norm): RmsNorm2d()
        )
      )
    )
    (language_model): Gemma3nTextModel(
      (embed_tokens): Gemma3nTextScaledWordEmbedding(262400, 32, padding_idx=0)
      (layers): ModuleList(
        (0-3): 4 x Gemma3nTextDecoderLayer(
          (self_attn): Gemma3nTextAttention(
            (q_proj): Linear(in_features=32, out_features=32, bias=False)
            (k_proj): Linear(in_features=32, out_features=32, bias=False)
            (v_proj): Linear(in_features=32, out_features=32, bias=False)
            (o_proj): Linear(in_features=32, out_features=32, bias=False)
            (q_norm): Gemma3nRMSNorm((32,), eps=1e-06)
            (k_norm): Gemma3nRMSNorm((32,), eps=1e-06)
            (v_norm): Gemma3nRMSNorm((), eps=1e-06)
          )
          (mlp): Gemma3nTextMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): PytorchGELUTanh()
          )
          (input_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (post_attention_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (pre_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (post_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (act_fn): PytorchGELUTanh()
          (altup): Gemma3nTextAltUp(
            (correction_coefs): Linear(in_features=4, out_features=4, bias=False)
            (prediction_coefs): Linear(in_features=4, out_features=16, bias=False)
            (modality_router): Linear(in_features=32, out_features=4, bias=False)
            (router_norm): Gemma3nRMSNorm((32,), eps=1e-06)
          )
          (laurel): Gemma3nTextLaurelBlock(
            (linear_left): Linear(in_features=32, out_features=8, bias=False)
            (linear_right): Linear(in_features=8, out_features=32, bias=False)
            (post_laurel_norm): Gemma3nRMSNorm((32,), eps=1e-06)
          )
          (per_layer_input_gate): Linear(in_features=32, out_features=2, bias=False)
          (per_layer_projection): Linear(in_features=2, out_features=32, bias=False)
          (post_per_layer_input_norm): Gemma3nRMSNorm((32,), eps=1e-06)
        )
      )
      (norm): Gemma3nRMSNorm((32,), eps=1e-06)
      (rotary_emb): Gemma3nTextRotaryEmbedding()
      (rotary_emb_local): Gemma3nTextRotaryEmbedding()
      (embed_tokens_per_layer): Gemma3nTextScaledWordEmbedding(262144, 8, padding_idx=0)
      (per_layer_model_projection): Linear(in_features=32, out_features=8, bias=False)
      (per_layer_projection_norm): Gemma3nRMSNorm((2,), eps=1e-06)
      (altup_projections): ModuleList(
        (0-2): 3 x Linear(in_features=32, out_features=32, bias=False)
      )
      (altup_unembed_projections): ModuleList(
        (0-2): 3 x Linear(in_features=32, out_features=32, bias=False)
      )
    )
    (audio_tower): Gemma3nAudioEncoder(
      (subsample_conv_projection): Gemma3nAudioSubSampleConvProjection(
        (conv_0): Gemma3nAudioSSCPConvBlock(
          (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (norm): Gemma3nAudioCumulativeGroupNorm()
          (activation): ReLU()
        )
        (conv_1): Gemma3nAudioSSCPConvBlock(
          (conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (norm): Gemma3nAudioCumulativeGroupNorm()
          (activation): ReLU()
        )
        (input_proj_linear): Linear(in_features=1024, out_features=64, bias=False)
      )
      (conformer): ModuleList(
        (0-1): 2 x Gemma3nAudioConformerBlock(
          (ffw_layer_start): Gemma3nAudioConformerFeedForward(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False)
            (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False)
            (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (attention): Gemma3nAudioConformerAttention(
            (pre_attn_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (attn): Gemma3nAudioAttention(
              (relative_position_embedding): Gemma3nAudioRelativePositionEmbedding(
                (pos_proj): Linear(in_features=64, out_features=64, bias=False)
              )
              (q_proj): Linear(in_features=64, out_features=64, bias=False)
              (k_proj): Linear(in_features=64, out_features=64, bias=False)
              (v_proj): Linear(in_features=64, out_features=64, bias=False)
            )
            (post): Linear(in_features=64, out_features=64, bias=False)
            (post_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (lconv1d): Gemma3nAudioConformerLightConv1d(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (linear_start): Linear(in_features=64, out_features=128, bias=False)
            (depthwise_conv1d): Conv1d(64, 64, kernel_size=(5,), stride=(1,), groups=64, bias=False)
            (conv_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (linear_end): Linear(in_features=64, out_features=64, bias=False)
          )
          (ffw_layer_end): Gemma3nAudioConformerFeedForward(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False)
            (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False)
            (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (norm): Gemma3nRMSNorm((64,), eps=1e-06)
        )
      )
    )
    (embed_vision): Gemma3nMultimodalEmbedder(
      (embedding): Embedding(128, 2048)
      (hard_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06)
      (soft_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06)
      (embedding_projection): Linear(in_features=2048, out_features=32, bias=False)
      (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06)
    )
    (embed_audio): Gemma3nMultimodalEmbedder(
      (embedding): Embedding(128, 64)
      (hard_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06)
      (soft_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06)
      (embedding_projection): Linear(in_features=64, out_features=32, bias=False)
      (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06)
    )
  )
  (lm_head): Linear(in_features=32, out_features=262400, bias=False)
)
```