---
library_name: transformers
pipeline_tag: image-text-to-text
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- google/gemma-3n-E4B-it
---
This tiny model is for debugging. It is randomly initialized with the config adapted from [google/gemma-3n-E4B-it](https://huggingface.co/google/gemma-3n-E4B-it).

| Model ID | Notes |
| --- | --- |
| [tiny-random/gemma-3n](https://huggingface.co/tiny-random/gemma-3n) | hidden size is 32 |
| [tiny-random/gemma-3n-dim4](https://huggingface.co/tiny-random/gemma-3n-dim4) | hidden size is 4; potentially not supported in paged attention kernels |

### Example usage:
```python
import torch

from transformers import pipeline

model_id = "tiny-random/gemma-3n"
pipe = pipeline(
    task="image-text-to-text",
    model=model_id,
    device=0,
    torch_dtype=torch.bfloat16,
)

# Temporary patch for the audio tower: the feature extractor emits fp32
# features while the model runs in bf16, so cast inputs to the module dtype.
from accelerate.hooks import ModelHook, add_hook_to_module

class EnsureDtype(ModelHook):
    def pre_forward(self, module, *args, **kwargs):
        args = list(args)
        args[0] = args[0].to(module.dtype)
        return super().pre_forward(module, *args, **kwargs)

add_hook_to_module(pipe.model.audio_tower, EnsureDtype())

messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a helpful assistant."}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
            # audio is buggy for now: bf16 x fp32
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
            {"type": "text", "text": "Which image is cuter?"},
        ]
    },
]
result = pipe(messages, min_new_tokens=512, max_new_tokens=512, do_sample=True)
print(result)
```
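If the pipeline abstraction gets in the way while debugging, you can also drive the processor and model directly. Below is a minimal text-only sketch (it sidesteps the audio dtype issue patched above); `apply_chat_template` follows the standard multimodal processor API, and since the weights are random the decoded text is gibberish, so this only verifies the plumbing runs end to end:

```python
import torch

from transformers import AutoProcessor, Gemma3nForConditionalGeneration

model_id = "tiny-random/gemma-3n"
processor = AutoProcessor.from_pretrained(model_id)
model = Gemma3nForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
)

messages = [
    {"role": "user", "content": [{"type": "text", "text": "Hello!"}]},
]
# Tokenize the chat and sample a few (random) tokens.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=16, do_sample=True)
print(processor.batch_decode(out[:, inputs["input_ids"].shape[-1]:]))
```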
### Code to create this repo:
```python
import json

import torch
from huggingface_hub import file_exists, hf_hub_download
from timm.models.mobilenetv5 import decode_arch_def
from transformers import (
    AutoConfig,
    AutoProcessor,
    Gemma3nForConditionalGeneration,
    GenerationConfig,
    set_seed,
)

source_model_id = "google/gemma-3n-E4B-it"
save_folder = "/tmp/tiny-random/gemma-3n"

processor = AutoProcessor.from_pretrained(source_model_id)
processor.save_pretrained(save_folder)

# Shrink the source config: fewer, narrower layers in every tower.
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

config_json['audio_config'].update({
    "conf_num_attention_heads": 2,
    "conf_num_hidden_layers": 2,
    "hidden_size": 64,
})
config_json['text_config'].update({
    "activation_sparsity_pattern": [0.95, 0.95, 0.0, 0.0],
    "head_dim": 32,  # required by vllm
    "hidden_size": 32,
    "hidden_size_per_layer_input": 2,
    "intermediate_size": 64,
    "laurel_rank": 8,
    "layer_types": ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention'],
    "num_attention_heads": 1,
    "num_hidden_layers": 4,
    "num_key_value_heads": 1,
    "num_kv_shared_layers": 2,
    "sliding_window": 512,
})
# Tiny MobileNetV5 backbone: same stage layout as the original, 32 channels throughout.
block_args = decode_arch_def(
    [
        # Stage 0: 128x128 in
        [
            'er_r1_k3_s2_e4_c32',
            'er_r1_k3_s1_e4_c32',
        ],
        # Stage 1: 256x256 in
        [
            'uir_r1_a3_k5_s2_e6_c32',
            'uir_r1_a5_k0_s1_e4_c32',
            'uir_r1_a3_k0_s1_e4_c32',
        ],
        # Stage 2: 640x640 in
        [
            "uir_r1_a5_k5_s2_e6_c32",
            "uir_r1_a0_k0_s1_e1_c32",
            "mqa_r1_k3_h2_v2_s1_d64_c32",
            "uir_r1_a0_k0_s1_e2_c32",
        ],
        # Stage 3: 1280x1280 in
        [
            "uir_r1_a5_k5_s2_e6_c32",
            "mqa_r1_k3_h2_s1_d64_c32",
            "uir_r1_a0_k0_s1_e2_c32",
        ],
    ]
)
config_json['vision_config'].update({
    "hidden_size": 2048,  # hard-coded in timm
    "model_args": {
        "block_args": block_args,
    },
})
config_json['tie_word_embeddings'] = True

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)

# Instantiate directly in bf16, then restore the default dtype.
torch.set_default_dtype(torch.bfloat16)
model = Gemma3nForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
# Re-initialize every parameter with a fixed seed so the checkpoint is reproducible.
set_seed(42)
model = model.cpu()
all_numels = sum(p.numel() for p in model.parameters())
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape, f'{p.numel() / all_numels * 100: .4f}%')
model.save_pretrained(save_folder)
```
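As a quick sanity check after saving, you can reload the checkpoint and push random token ids through the text stack. A minimal sketch, reusing the `save_folder` from above (image and audio inputs are optional in the forward pass):

```python
import torch

from transformers import Gemma3nForConditionalGeneration

save_folder = "/tmp/tiny-random/gemma-3n"
model = Gemma3nForConditionalGeneration.from_pretrained(
    save_folder, torch_dtype=torch.bfloat16
)
print(f"params: {sum(p.numel() for p in model.parameters()):,}")

# Text-only forward pass over random token ids.
ids = torch.randint(0, model.config.text_config.vocab_size, (1, 8))
with torch.no_grad():
    logits = model(input_ids=ids).logits
print(logits.shape)  # torch.Size([1, 8, 262400])
```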
### Printing the model:
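For reference, the tree below is what you get by instantiating the checkpoint and printing it:

```python
from transformers import Gemma3nForConditionalGeneration

model = Gemma3nForConditionalGeneration.from_pretrained("tiny-random/gemma-3n")
print(model)
```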
```text
Gemma3nForConditionalGeneration(
  (model): Gemma3nModel(
    (vision_tower): TimmWrapperModel(
      (timm_model): MobileNetV5Encoder(
        (conv_stem): ConvNormAct(
          (conv): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (bn): RmsNormAct2d(
            (drop): Identity()
            (act): GELU(approximate='none')
          )
        )
        (blocks): Sequential(
          (0): Sequential(
            (0): EdgeResidual(
              (conv_exp): Conv2dSame(64, 256, kernel_size=(3, 3), stride=(2, 2), bias=False)
              (bn1): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
              (aa): Identity()
              (se): Identity()
              (conv_pwl): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn2): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (drop_path): Identity()
            )
            (1): EdgeResidual(
              (conv_exp): Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn1): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
              (aa): Identity()
              (se): Identity()
              (conv_pwl): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn2): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (drop_path): Identity()
            )
          )
          (1): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
          (2): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): MobileAttention(
              (norm): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (attn): MultiQueryAttention2d(
                (query): Sequential(
                  (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (key): Sequential(
                  (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False)
                  (norm): RmsNorm2d()
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (value): Sequential(
                  (down_conv): Conv2dSame(32, 32, kernel_size=(3, 3), stride=(2, 2), groups=32, bias=False)
                  (norm): RmsNorm2d()
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (attn_drop): Dropout(p=0.0, inplace=False)
                (output): Sequential(
                  (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (drop): Dropout(p=0.0, inplace=False)
                )
              )
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (3): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
          (3): Sequential(
            (0): UniversalInvertedResidual(
              (dw_start): ConvNormAct(
                (conv): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), groups=32, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): ConvNormAct(
                (conv): Conv2dSame(192, 192, kernel_size=(5, 5), stride=(2, 2), groups=192, bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (1): MobileAttention(
              (norm): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
              (attn): MultiQueryAttention2d(
                (query): Sequential(
                  (proj): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (key): Sequential(
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (value): Sequential(
                  (proj): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (attn_drop): Dropout(p=0.0, inplace=False)
                (output): Sequential(
                  (proj): Conv2d(128, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                  (drop): Dropout(p=0.0, inplace=False)
                )
              )
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
            (2): UniversalInvertedResidual(
              (dw_start): Identity()
              (pw_exp): ConvNormAct(
                (conv): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): GELU(approximate='none')
                )
              )
              (dw_mid): Identity()
              (se): Identity()
              (pw_proj): ConvNormAct(
                (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (bn): RmsNormAct2d(
                  (drop): Identity()
                  (act): Identity()
                )
              )
              (dw_end): Identity()
              (layer_scale): LayerScale2d()
              (drop_path): Identity()
            )
          )
        )
        (msfa): MobileNetV5MultiScaleFusionAdapter(
          (ffn): UniversalInvertedResidual(
            (dw_start): Identity()
            (pw_exp): ConvNormAct(
              (conv): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): RmsNormAct2d(
                (drop): Identity()
                (act): GELU(approximate='none')
              )
            )
            (dw_mid): Identity()
            (se): Identity()
            (pw_proj): ConvNormAct(
              (conv): Conv2d(128, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn): RmsNormAct2d(
                (drop): Identity()
                (act): Identity()
              )
            )
            (dw_end): Identity()
            (layer_scale): Identity()
            (drop_path): Identity()
          )
          (norm): RmsNorm2d()
        )
      )
    )
    (language_model): Gemma3nTextModel(
      (embed_tokens): Gemma3nTextScaledWordEmbedding(262400, 32, padding_idx=0)
      (layers): ModuleList(
        (0-3): 4 x Gemma3nTextDecoderLayer(
          (self_attn): Gemma3nTextAttention(
            (q_proj): Linear(in_features=32, out_features=32, bias=False)
            (k_proj): Linear(in_features=32, out_features=32, bias=False)
            (v_proj): Linear(in_features=32, out_features=32, bias=False)
            (o_proj): Linear(in_features=32, out_features=32, bias=False)
            (q_norm): Gemma3nRMSNorm((32,), eps=1e-06)
            (k_norm): Gemma3nRMSNorm((32,), eps=1e-06)
            (v_norm): Gemma3nRMSNorm((), eps=1e-06)
          )
          (mlp): Gemma3nTextMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): PytorchGELUTanh()
          )
          (input_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (post_attention_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (pre_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (post_feedforward_layernorm): Gemma3nRMSNorm((32,), eps=1e-06)
          (act_fn): PytorchGELUTanh()
          (altup): Gemma3nTextAltUp(
            (correction_coefs): Linear(in_features=4, out_features=4, bias=False)
            (prediction_coefs): Linear(in_features=4, out_features=16, bias=False)
            (modality_router): Linear(in_features=32, out_features=4, bias=False)
            (router_norm): Gemma3nRMSNorm((32,), eps=1e-06)
          )
          (laurel): Gemma3nTextLaurelBlock(
            (linear_left): Linear(in_features=32, out_features=8, bias=False)
            (linear_right): Linear(in_features=8, out_features=32, bias=False)
            (post_laurel_norm): Gemma3nRMSNorm((32,), eps=1e-06)
          )
          (per_layer_input_gate): Linear(in_features=32, out_features=2, bias=False)
          (per_layer_projection): Linear(in_features=2, out_features=32, bias=False)
          (post_per_layer_input_norm): Gemma3nRMSNorm((32,), eps=1e-06)
        )
      )
      (norm): Gemma3nRMSNorm((32,), eps=1e-06)
      (rotary_emb): Gemma3nTextRotaryEmbedding()
      (rotary_emb_local): Gemma3nTextRotaryEmbedding()
      (embed_tokens_per_layer): Gemma3nTextScaledWordEmbedding(262144, 8, padding_idx=0)
      (per_layer_model_projection): Linear(in_features=32, out_features=8, bias=False)
      (per_layer_projection_norm): Gemma3nRMSNorm((2,), eps=1e-06)
      (altup_projections): ModuleList(
        (0-2): 3 x Linear(in_features=32, out_features=32, bias=False)
      )
      (altup_unembed_projections): ModuleList(
        (0-2): 3 x Linear(in_features=32, out_features=32, bias=False)
      )
    )
    (audio_tower): Gemma3nAudioEncoder(
      (subsample_conv_projection): Gemma3nAudioSubSampleConvProjection(
        (conv_0): Gemma3nAudioSSCPConvBlock(
          (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (norm): Gemma3nAudioCumulativeGroupNorm()
          (activation): ReLU()
        )
        (conv_1): Gemma3nAudioSSCPConvBlock(
          (conv): Conv2d(128, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
          (norm): Gemma3nAudioCumulativeGroupNorm()
          (activation): ReLU()
        )
        (input_proj_linear): Linear(in_features=1024, out_features=64, bias=False)
      )
      (conformer): ModuleList(
        (0-1): 2 x Gemma3nAudioConformerBlock(
          (ffw_layer_start): Gemma3nAudioConformerFeedForward(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False)
            (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False)
            (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (attention): Gemma3nAudioConformerAttention(
            (pre_attn_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (attn): Gemma3nAudioAttention(
              (relative_position_embedding): Gemma3nAudioRelativePositionEmbedding(
                (pos_proj): Linear(in_features=64, out_features=64, bias=False)
              )
              (q_proj): Linear(in_features=64, out_features=64, bias=False)
              (k_proj): Linear(in_features=64, out_features=64, bias=False)
              (v_proj): Linear(in_features=64, out_features=64, bias=False)
            )
            (post): Linear(in_features=64, out_features=64, bias=False)
            (post_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (lconv1d): Gemma3nAudioConformerLightConv1d(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (linear_start): Linear(in_features=64, out_features=128, bias=False)
            (depthwise_conv1d): Conv1d(64, 64, kernel_size=(5,), stride=(1,), groups=64, bias=False)
            (conv_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (linear_end): Linear(in_features=64, out_features=64, bias=False)
          )
          (ffw_layer_end): Gemma3nAudioConformerFeedForward(
            (pre_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
            (ffw_layer_1): Linear(in_features=64, out_features=256, bias=False)
            (ffw_layer_2): Linear(in_features=256, out_features=64, bias=False)
            (post_layer_norm): Gemma3nRMSNorm((64,), eps=1e-06)
          )
          (norm): Gemma3nRMSNorm((64,), eps=1e-06)
        )
      )
    )
    (embed_vision): Gemma3nMultimodalEmbedder(
      (embedding): Embedding(128, 2048)
      (hard_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06)
      (soft_embedding_norm): Gemma3nRMSNorm((2048,), eps=1e-06)
      (embedding_projection): Linear(in_features=2048, out_features=32, bias=False)
      (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06)
    )
    (embed_audio): Gemma3nMultimodalEmbedder(
      (embedding): Embedding(128, 64)
      (hard_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06)
      (soft_embedding_norm): Gemma3nRMSNorm((64,), eps=1e-06)
      (embedding_projection): Linear(in_features=64, out_features=32, bias=False)
      (embedding_post_projection_norm): Gemma3nRMSNorm((), eps=1e-06)
    )
  )
  (lm_head): Linear(in_features=32, out_features=262400, bias=False)
)
```