02:59 am
flowchart TD
    %% Data-flow sketch of a Llama 4 style multimodal model as drawn in this note:
    %% text and image inputs, a vision tower that turns the image into projected
    %% features, and a text tower that merges those features with the token
    %% embeddings and produces output logits.
    %% Subgraph IDs contain no spaces so the style statements at the bottom can
    %% reference them; the human-readable names live in the quoted titles.

    %% Raw inputs and their first embedding step.
    subgraph InputProcessing ["Input Processing"]
        I1[Input Text] --> T_Emb[Token Embeddings]
        I2[Input Image] --> V_Patch["Patch Embedding (UnfoldConv)"]
        V_Patch --> V_EmbAdd["Add Class & Positional Embeddings"]
    end

    %% Image path: pre-norm, repeated encoder layers, post-norm, adapter, projector.
    subgraph VisionTower ["Vision Tower (Llama4VisionModel)"]
        V_EmbAdd --> V_LN1["LayerNorm (Pre)"]
        V_LN1 --> V_RoPE_Calc["Calculate Vision 2D RoPE"]
        V_LN1 --> V_EncLoop

        %% One encoder layer, drawn once and repeated N times.
        subgraph V_EncLoop ["N x Vision Encoder Layers"]
            direction TB
            V_Enc_In["Hidden State"] --> V_LN_PreAttn[LayerNorm]
            V_LN_PreAttn --> V_Attn["Vision Attention (MHA + 2D RoPE)"]
            V_RoPE_Calc --> V_Attn
            V_Attn --> V_Res1_Add["Add Residual"]
            V_Enc_In --> V_Res1_Add
            V_Res1_Add --> V_LN_PreMLP[LayerNorm]
            V_LN_PreMLP --> V_MLP["Vision MLP (GELU)"]
            V_MLP --> V_Res2_Add["Add Residual"]
            V_Res1_Add --> V_Res2_Add --> V_Enc_Out["Output State"]
        end

        V_EncLoop --> V_LN2["LayerNorm (Post)"]
        V_LN2 --> V_RemoveCLS["Remove CLS Token"]
        V_RemoveCLS --> V_Adapter["Vision Adapter (PixelShuffle + MLP)"]
        V_Adapter --> V_Proj["MultiModal Projector (Linear)"]
        V_Proj --> ImgFeats["Projected Image Features"]
    end

    %% Text path: merge image features into the token stream, run the decoder
    %% layers, then final norm and LM head.
    subgraph TextTower ["Text Tower (Llama4ForCausalLM / Llama4TextModel)"]
        T_Emb --> T_Combine["Combine Text & Image Embeds"]
        ImgFeats -- Replace Image Tokens --> T_Combine
        T_Combine --> T_RoPE_Calc["Calculate Text RoPE (1D)"]
        T_Combine --> T_DecLoop

        %% One decoder layer, drawn once and repeated N times.
        subgraph T_DecLoop ["N x Text Decoder Layers (Llama4TextDecoderLayer)"]
            direction TB
            T_Dec_In["Hidden State"] --> T_LN_PreAttn[RMSNorm]
            T_LN_PreAttn --> T_Attn["Text Attention (MHA/GQA + RoPE/Temp)"]
            T_RoPE_Calc --> T_Attn
            T_Attn --> T_Res1_Add["Add Residual"]
            T_Dec_In --> T_Res1_Add
            T_Res1_Add --> T_LN_PreFFN[RMSNorm]
            T_LN_PreFFN --> T_FFN

            %% T_FFN is only a subgraph here; the original also declared a
            %% decision node with the same ID, which clashed with this subgraph.
            subgraph T_FFN ["Feed Forward"]
                direction LR
                IsMoE{"Is MoE Layer?"} -- Yes --> T_MoE["Mixture of Experts (Router + Experts + Shared)"]
                IsMoE -- No --> T_MLP["Standard MLP (SiLU)"]
            end

            T_FFN --> T_Res2_Add["Add Residual"]
            T_Res1_Add --> T_Res2_Add --> T_Dec_Out["Output State"]
        end

        T_DecLoop --> T_LN_Final["RMSNorm (Final)"]
        T_LN_Final --> LM_Head["LM Head (Linear)"]
        LM_Head --> Logits["Output Logits"]
    end

    %% Styling: faint translucent fill for the repeated-layer boxes, light tints
    %% for the three top-level sections.
    style V_EncLoop fill:#eee1,stroke:#333,stroke-width:2px
    style T_DecLoop fill:#eee1,stroke:#333,stroke-width:2px
    style InputProcessing fill:#eee,stroke:#333,stroke-width:1px
    style VisionTower fill:#efe,stroke:#333,stroke-width:1px
    style TextTower fill:#eef,stroke:#333,stroke-width:1px
This will not be seen in the blog, right?
Links : TODO
Tags :
Date : 6th April, Sunday, 2025, (Wikilinks: 6th April, April 25, April, 2025, Sunday)
Category : Others