Add model.py
model.py
ADDED
@@ -0,0 +1,19 @@
+import torch
+import torch.nn as nn
+
+class OMNILITEUnifiedSparseMultimodalTransformer(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Conv2d(in_channels=3, out_channels=1024, kernel_size=14, stride=14),  # Vision Patch Embedding for ViT encoder
+            nn.TransformerBlock(embed_dim=1024, num_heads=16, ff_dim=4096, depth=12),  # Lightweight Vision Transformer (ViT) Backbone (placeholder: not a torch.nn module)
+            nn.TransformerBlock(type="PerceiverResampler", num_latents=64, embed_dim=2048),  # Maps visual features to text latent space (placeholder)
+            nn.Embedding(num_embeddings=32000, embedding_dim=2048),  # Text Token Embedding layer
+            nn.TransformerBlock(type="GQA_MoE_Layer", repeat=24, num_experts=16, top_k=2, hidden_dim=2048, num_heads=32, num_kv_heads=8, rope_dim=64),  # Shared Backbone: 480M active parameters per token (placeholder)
+            nn.Linear(in_features=2048, out_features=32000),  # Causal Language Modeling (CLM) Head
+            nn.Linear(in_features=2048, out_features=64),  # Rectified Flow-Matching (RFM) Head for DiT Latents
+            nn.Conv2d(in_channels=4, out_channels=3, kernel_size=3, stride=1)  # VQ-VAE Decoder for 8x8 Latent Reconstruction
+        )
+
+    def forward(self, x):
+        return self.layers(x)
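
As committed, model.py is an architecture sketch rather than runnable PyTorch: torch.nn has no TransformerBlock, and a single nn.Sequential cannot route an image and a token sequence through separate encoder and decoder paths. Below is a minimal runnable sketch of the same design under stated assumptions, not the author's implementation: the ViT backbone and the shared backbone are stood in for by stock nn.TransformerEncoder stacks (dense, with no GQA, MoE, or RoPE), the PerceiverResampler is approximated with one cross-attention layer over 64 learned latents, and the class name OmniLiteSketch plus every hyperparameter not stated in the diff's notes are illustrative.

import torch
import torch.nn as nn


class OmniLiteSketch(nn.Module):
    def __init__(self, vocab_size=32000, vis_dim=1024, txt_dim=2048, num_latents=64):
        super().__init__()
        # Vision patch embedding: 14x14 patches -> vis_dim tokens
        self.patch_embed = nn.Conv2d(3, vis_dim, kernel_size=14, stride=14)
        # Stand-in for the 12-layer ViT backbone (16 heads, 4096-d FFN)
        self.vit = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=vis_dim, nhead=16,
                                       dim_feedforward=4096, batch_first=True),
            num_layers=12)
        # Perceiver-style resampler: learned latents cross-attend to patch tokens
        self.latents = nn.Parameter(torch.randn(num_latents, txt_dim) * 0.02)
        self.vis_proj = nn.Linear(vis_dim, txt_dim)
        self.resampler = nn.MultiheadAttention(txt_dim, num_heads=16, batch_first=True)
        # Text token embedding over the 32k vocabulary
        self.token_embed = nn.Embedding(vocab_size, txt_dim)
        # Stand-in for the 24x GQA+MoE shared backbone (dense, no experts)
        self.backbone = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=txt_dim, nhead=32,
                                       dim_feedforward=4 * txt_dim, batch_first=True),
            num_layers=24)
        # Output heads: causal LM logits and 64-d rectified flow-matching latents
        self.lm_head = nn.Linear(txt_dim, vocab_size)
        self.rfm_head = nn.Linear(txt_dim, 64)
        # Stand-in for the VQ-VAE decoder mapping 4-channel latents back to RGB
        self.pixel_decoder = nn.Conv2d(4, 3, kernel_size=3, stride=1, padding=1)

    def forward(self, pixels, token_ids):
        b = pixels.shape[0]
        # Image -> patch tokens -> ViT -> projection into the text latent space
        v = self.patch_embed(pixels).flatten(2).transpose(1, 2)   # (B, N, vis_dim)
        v = self.vis_proj(self.vit(v))                            # (B, N, txt_dim)
        # Resample the N patch tokens down to num_latents visual tokens
        q = self.latents.unsqueeze(0).expand(b, -1, -1)
        vis_tokens, _ = self.resampler(q, v, v)                   # (B, 64, txt_dim)
        # Prepend visual tokens to the embedded text and run the shared backbone
        h = self.backbone(torch.cat([vis_tokens, self.token_embed(token_ids)], dim=1))
        return self.lm_head(h), self.rfm_head(h)


if __name__ == "__main__":
    model = OmniLiteSketch()
    logits, rfm = model(torch.randn(1, 3, 224, 224),
                        torch.randint(0, 32000, (1, 16)))
    print(logits.shape, rfm.shape)  # torch.Size([1, 80, 32000]) torch.Size([1, 80, 64])

Prepending the resampled visual tokens to the embedded text is one common way to fuse modalities in a shared backbone; the committed sketch does not specify the routing, so that choice, like the dense backbone, is an assumption here.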