02:59 am

graph TD
    %% Data-flow diagram of a Llama 4 style multimodal model:
    %% text and image inputs -> vision tower -> text tower -> output logits.
    %% Every top-level subgraph has an explicit, space-free id so that the
    %% style statements at the bottom can target it.

    %% ---- Input processing: text tokens and image patches ----
    subgraph InputProcessing ["Input Processing"]
        I1[Input Text] --> T_Emb[Token Embeddings]
        I2[Input Image] --> V_Patch["Patch Embedding (UnfoldConv)"]
        V_Patch --> V_EmbAdd["Add Class & Positional Embeddings"]
    end

    %% ---- Vision tower: encoder stack, adapter and projector ----
    subgraph VisionTower ["Vision Tower (Llama4VisionModel)"]
        V_EmbAdd --> V_LN1["LayerNorm (Pre)"]
        V_LN1 --> V_RoPE_Calc["Calculate Vision 2D RoPE"]

        %% One encoder layer, repeated N times: two residual branches.
        subgraph V_EncLoop ["N x Vision Encoder Layers"]
            direction TB
            V_Enc_In["Hidden State"] --> V_LN_PreAttn[LayerNorm]
            V_LN_PreAttn --> V_Attn["Vision Attention (MHA + 2D RoPE)"]
            V_RoPE_Calc --> V_Attn
            V_Attn --> V_Res1_Add["Add Residual"]
            V_Enc_In --> V_Res1_Add
            V_Res1_Add --> V_LN_PreMLP[LayerNorm]
            V_LN_PreMLP --> V_MLP["Vision MLP (GELU)"]
            V_MLP --> V_Res2_Add["Add Residual"]
            V_Res1_Add --> V_Res2_Add --> V_Enc_Out["Output State"]
        end

        %% Edges to and from the encoder stack are declared after the subgraph
        %% so the id V_EncLoop is never first created as a plain node.
        V_LN1 --> V_EncLoop
        V_EncLoop --> V_LN2["LayerNorm (Post)"]
        V_LN2 --> V_RemoveCLS["Remove CLS Token"]
        V_RemoveCLS --> V_Adapter["Vision Adapter (PixelShuffle + MLP)"]
        V_Adapter --> V_Proj["MultiModal Projector (Linear)"]
        V_Proj --> ImgFeats["Projected Image Features"]
    end

    %% ---- Text tower: decoder stack and LM head ----
    subgraph TextTower ["Text Tower (Llama4ForCausalLM / Llama4TextModel)"]
        T_Emb --> T_Combine["Combine Text & Image Embeds"]
        ImgFeats -- Replace Image Tokens --> T_Combine
        T_Combine --> T_RoPE_Calc["Calculate Text RoPE (1D)"]

        %% One decoder layer, repeated N times: attention then feed forward.
        subgraph T_DecLoop ["N x Text Decoder Layers (Llama4TextDecoderLayer)"]
            direction TB
            T_Dec_In["Hidden State"] --> T_LN_PreAttn[RMSNorm]
            T_LN_PreAttn --> T_Attn["Text Attention (MHA/GQA + RoPE/Temp)"]
            T_RoPE_Calc --> T_Attn
            T_Attn --> T_Res1_Add["Add Residual"]
            T_Dec_In --> T_Res1_Add
            T_Res1_Add --> T_LN_PreFFN[RMSNorm]

            %% T_FFN is a subgraph only. It must not also be declared as a
            %% node, because a node and a subgraph cannot share an id.
            subgraph T_FFN ["Feed Forward"]
                direction LR
                IsMoE{"Is MoE Layer?"} -- Yes --> T_MoE["Mixture of Experts (Router + Experts + Shared)"]
                IsMoE -- No --> T_MLP["Standard MLP (SiLU)"]
            end

            T_LN_PreFFN --> T_FFN
            T_FFN --> T_Res2_Add["Add Residual"]
            T_Res1_Add --> T_Res2_Add --> T_Dec_Out["Output State"]
        end

        T_Combine --> T_DecLoop
        T_DecLoop --> T_LN_Final["RMSNorm (Final)"]
        T_LN_Final --> LM_Head["LM Head (Linear)"]
        LM_Head --> Logits["Output Logits"]
    end

    %% ---- Styling: a style statement takes a single space-free id ----
    style V_EncLoop fill:#eee1,stroke:#333,stroke-width:2px
    style T_DecLoop fill:#eee1,stroke:#333,stroke-width:2px
    style InputProcessing fill:#eee,stroke:#333,stroke-width:1px
    style VisionTower fill:#efe,stroke:#333,stroke-width:1px
    style TextTower fill:#eef,stroke:#333,stroke-width:1px

This will not be seen in the blog, right?


Links : TODO

Tags :

Date : 6th April, Sunday, 2025 (Wikilinks: 6th April, April 25, April, 2025, Sunday)

Category : Others