
ezpz.models.llama

Attention

Bases: Module

Multi-head attention module.

Parameters:

    model_args (ModelArgs): Model configuration arguments. Required.

Attributes:

    n_kv_heads (int): Number of key and value heads.
    n_heads (int): Number of query heads.
    n_local_kv_heads (int): Number of local key and value heads.
    n_rep (int): Number of repetitions for local heads.
    head_dim (int): Dimension size of each attention head.
    wq (Linear): Linear transformation for queries.
    wk (Linear): Linear transformation for keys.
    wv (Linear): Linear transformation for values.
    wo (Linear): Linear transformation for output.

Source code in src/ezpz/models/llama.py
class Attention(nn.Module):
    """
    Multi-head attention module.

    Args:
        model_args (ModelArgs): Model configuration arguments.

    Attributes:
        n_kv_heads (int): Number of key and value heads.
        n_heads (int): Number of query heads.
        n_local_kv_heads (int): Number of local key and value heads.
        n_rep (int): Number of repetitions for local heads.
        head_dim (int): Dimension size of each attention head.
        wq (Linear): Linear transformation for queries.
        wk (Linear): Linear transformation for keys.
        wv (Linear): Linear transformation for values.
        wo (Linear): Linear transformation for output.

    """

    def __init__(self, model_args: ModelArgs):
        super().__init__()
        self.n_heads = model_args.n_heads
        self.n_kv_heads = (
            model_args.n_heads
            if model_args.n_kv_heads is None
            else model_args.n_kv_heads
        )
        self.n_rep = self.n_heads // self.n_kv_heads
        self.head_dim = model_args.dim // model_args.n_heads

        self.wq = nn.Linear(
            model_args.dim, model_args.n_heads * self.head_dim, bias=False
        )
        self.wk = nn.Linear(
            model_args.dim, self.n_kv_heads * self.head_dim, bias=False
        )
        self.wv = nn.Linear(
            model_args.dim, self.n_kv_heads * self.head_dim, bias=False
        )
        self.wo = nn.Linear(
            model_args.n_heads * self.head_dim, model_args.dim, bias=False
        )

    def init_weights(self, init_std: float):
        for linear in (self.wq, self.wk, self.wv):
            nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
        nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor,
    ) -> torch.Tensor:
        """
        Forward pass of the attention module.

        Args:
            x (torch.Tensor): Input tensor.
            freqs_cis (torch.Tensor): Precomputed frequency tensor.

        Returns:
            torch.Tensor: Output tensor after attention.

        """
        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)

        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        keys = repeat_kv(
            xk, self.n_rep
        )  # (bs, seqlen, n_local_heads, head_dim)
        values = repeat_kv(
            xv, self.n_rep
        )  # (bs, seqlen, n_local_heads, head_dim)

        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)

        # we use a causal mask for training
        if os.environ.get("EZPZ_ATTENTION_FP32") == "1":
            output = F.scaled_dot_product_attention(
                xq.float(), xk.float(), xv.float(), is_causal=True
            ).to(xq.dtype)
        else:
            output = F.scaled_dot_product_attention(
                xq, xk, xv, is_causal=True
            )
        output = output.transpose(
            1, 2
        ).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
        output = output.view(bsz, seqlen, -1)
        if _DEBUG_NAN:
            global _DEBUG_NAN_ONCE
            if not _DEBUG_NAN_ONCE:
                with torch.no_grad():
                    q_nf, q_max = _tensor_stats("xq", xq)
                    k_nf, k_max = _tensor_stats("xk", xk)
                    v_nf, v_max = _tensor_stats("xv", xv)
                    o_nf, o_max = _tensor_stats("out", output)
                logger.info(
                    "attn_stats xq(nonfinite=%s max_abs=%.6f) "
                    "xk(nonfinite=%s max_abs=%.6f) "
                    "xv(nonfinite=%s max_abs=%.6f) "
                    "out(nonfinite=%s max_abs=%.6f)",
                    q_nf,
                    q_max,
                    k_nf,
                    k_max,
                    v_nf,
                    v_max,
                    o_nf,
                    o_max,
                )
                _DEBUG_NAN_ONCE = True
        return self.wo(output)
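
A minimal usage sketch, assuming ModelArgs is importable from the same module and that the fields not shown here have defaults; the concrete sizes below are illustrative only, not part of the library:

import torch

from ezpz.models.llama import Attention, ModelArgs, precompute_freqs_cis

# Hypothetical configuration; field values are illustrative.
args = ModelArgs(dim=256, n_heads=8, n_kv_heads=4)

attn = Attention(args)
attn.init_weights(init_std=0.02)

bsz, seqlen = 2, 16
x = torch.randn(bsz, seqlen, args.dim)

# freqs_cis must cover the sequence length; head_dim = dim // n_heads.
freqs_cis = precompute_freqs_cis(args.dim // args.n_heads, seqlen)

out = attn(x, freqs_cis)
assert out.shape == (bsz, seqlen, args.dim)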

forward(x, freqs_cis)

Forward pass of the attention module.

Parameters:

    x (Tensor): Input tensor. Required.
    freqs_cis (Tensor): Precomputed frequency tensor. Required.

Returns:

    torch.Tensor: Output tensor after attention.

Source code in src/ezpz/models/llama.py
def forward(
    self,
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """
    Forward pass of the attention module.

    Args:
        x (torch.Tensor): Input tensor.
        freqs_cis (torch.Tensor): Precomputed frequency tensor.

    Returns:
        torch.Tensor: Output tensor after attention.

    """
    bsz, seqlen, _ = x.shape
    xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

    xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
    xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
    xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)

    xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

    keys = repeat_kv(
        xk, self.n_rep
    )  # (bs, seqlen, n_local_heads, head_dim)
    values = repeat_kv(
        xv, self.n_rep
    )  # (bs, seqlen, n_local_heads, head_dim)

    xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
    xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
    xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)

    # we use a causal mask for training
    if os.environ.get("EZPZ_ATTENTION_FP32") == "1":
        output = F.scaled_dot_product_attention(
            xq.float(), xk.float(), xv.float(), is_causal=True
        ).to(xq.dtype)
    else:
        output = F.scaled_dot_product_attention(
            xq, xk, xv, is_causal=True
        )
    output = output.transpose(
        1, 2
    ).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
    output = output.view(bsz, seqlen, -1)
    if _DEBUG_NAN:
        global _DEBUG_NAN_ONCE
        if not _DEBUG_NAN_ONCE:
            with torch.no_grad():
                q_nf, q_max = _tensor_stats("xq", xq)
                k_nf, k_max = _tensor_stats("xk", xk)
                v_nf, v_max = _tensor_stats("xv", xv)
                o_nf, o_max = _tensor_stats("out", output)
            logger.info(
                "attn_stats xq(nonfinite=%s max_abs=%.6f) "
                "xk(nonfinite=%s max_abs=%.6f) "
                "xv(nonfinite=%s max_abs=%.6f) "
                "out(nonfinite=%s max_abs=%.6f)",
                q_nf,
                q_max,
                k_nf,
                k_max,
                v_nf,
                v_max,
                o_nf,
                o_max,
            )
            _DEBUG_NAN_ONCE = True
    return self.wo(output)
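
As the branch in the forward pass above shows, setting the EZPZ_ATTENTION_FP32 environment variable to "1" runs scaled-dot-product attention in float32 and casts the result back to the input dtype, which can help when chasing NaNs in low-precision training. Enable it before the forward pass runs, for example:

import os

# Force float32 attention math; the output is cast back to the input dtype.
os.environ["EZPZ_ATTENTION_FP32"] = "1"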

FeedForward

Bases: Module

FeedForward module

Parameters:

    dim (int): Input dimension. Required.
    hidden_dim (int): Hidden dimension of the feedforward layer. Required.
    multiple_of (int): Value to ensure the hidden dimension is a multiple of this value. Required.
    ffn_dim_multiplier (Optional[float]): Custom multiplier for the hidden dimension; may be None. Required.

Attributes:

    w1 (Linear): Linear transformation for the first layer.
    w2 (Linear): Linear transformation for the second layer.
    w3 (Linear): Linear transformation for the third layer.

Source code in src/ezpz/models/llama.py
class FeedForward(nn.Module):
    """
    FeedForward module

    Args:
        dim (int): Input dimension.
        hidden_dim (int): Hidden dimension of the feedforward layer.
        multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
        ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None.

    Attributes:
        w1 (Linear): Linear transformation for the first layer.
        w2 (Linear): Linear transformation for the second layer.
        w3 (Linear): Linear transformation for the third layer.

    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int,
        ffn_dim_multiplier: Optional[float],
    ):
        super().__init__()
        hidden_dim = int(2 * hidden_dim / 3)
        # custom dim factor multiplier
        if ffn_dim_multiplier is not None:
            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
        hidden_dim = multiple_of * (
            (hidden_dim + multiple_of - 1) // multiple_of
        )

        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))

    def init_weights(self, init_std: float):
        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
        for linear in (self.w2, self.w3):
            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
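
The constructor rescales hidden_dim before building the SwiGLU-style projections (forward computes w2(silu(w1(x)) * w3(x))): it keeps 2/3 of the requested width, applies the optional ffn_dim_multiplier, and rounds up to a multiple of multiple_of. A standalone sketch of that sizing rule, with illustrative numbers:

def ffn_hidden_dim(hidden_dim: int, multiple_of: int, ffn_dim_multiplier=None) -> int:
    # Mirrors the arithmetic in FeedForward.__init__ above.
    hidden_dim = int(2 * hidden_dim / 3)
    if ffn_dim_multiplier is not None:
        hidden_dim = int(ffn_dim_multiplier * hidden_dim)
    return multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

# TransformerBlock passes hidden_dim = 4 * dim; 256 is an illustrative dim.
assert ffn_hidden_dim(4 * 256, multiple_of=256) == 768
assert ffn_hidden_dim(4 * 256, multiple_of=256, ffn_dim_multiplier=1.3) == 1024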

RMSNorm

Bases: Module

Initialize the RMSNorm normalization layer.

Parameters:

    dim (int): The dimension of the input tensor. Required.
    eps (float): A small value added to the denominator for numerical stability. Default: 1e-06.

Attributes:

    eps (float): A small value added to the denominator for numerical stability.
    weight (Parameter): Learnable scaling parameter.

Source code in src/ezpz/models/llama.py
class RMSNorm(nn.Module):
    """
    Initialize the RMSNorm normalization layer.

    Args:
        dim (int): The dimension of the input tensor.
        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

    Attributes:
        eps (float): A small value added to the denominator for numerical stability.
        weight (nn.Parameter): Learnable scaling parameter.

    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

    def reset_parameters(self):
        torch.nn.init.ones_(self.weight)  # type: ignore
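
A short reference computation of what _norm does: root-mean-square normalization over the last dimension (in float32, then cast back), rescaled elementwise by weight. The shapes below are illustrative:

import torch

from ezpz.models.llama import RMSNorm

norm = RMSNorm(dim=8, eps=1e-6)
x = torch.randn(2, 4, 8)

# Manual equivalent of RMSNorm.forward for a float32 input.
expected = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + norm.eps) * norm.weight
assert torch.allclose(norm(x), expected)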

Transformer

Bases: Module

Transformer Module

Parameters:

    model_args (ModelArgs): Model configuration arguments. Required.

Attributes:

    model_args (ModelArgs): Model configuration arguments.
    vocab_size (int): Vocabulary size.
    n_layers (int): Number of layers in the model.
    tok_embeddings (ParallelEmbedding): Token embeddings.
    layers (ModuleList): List of Transformer blocks.
    norm (RMSNorm): Layer normalization for the model output.
    output (ColumnParallelLinear): Linear layer for final output.
    freqs_cis (Tensor): Precomputed cosine and sine frequencies.

Source code in src/ezpz/models/llama.py
class Transformer(nn.Module):
    """
    Transformer Module

    Args:
        model_args (ModelArgs): Model configuration arguments.

    Attributes:
        model_args (ModelArgs): Model configuration arguments.
        vocab_size (int): Vocabulary size.
        n_layers (int): Number of layers in the model.
        tok_embeddings (ParallelEmbedding): Token embeddings.
        layers (torch.nn.ModuleList): List of Transformer blocks.
        norm (RMSNorm): Layer normalization for the model output.
        output (ColumnParallelLinear): Linear layer for final output.
        freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

    """

    def __init__(self, model_args: ModelArgs):
        super().__init__()
        self.model_args = model_args
        self.vocab_size = model_args.vocab_size
        self.n_layers = model_args.n_layers
        self.model_dim = model_args.dim

        self.tok_embeddings = nn.Embedding(
            model_args.vocab_size, model_args.dim
        )
        self.register_buffer(
            "freqs_cis",
            precompute_freqs_cis(
                model_args.dim // model_args.n_heads,
                # Need to compute until at least the max token limit for generation
                # (use 2x max sequence length to be safe)
                model_args.max_seq_len * 2,
            ),
        )
        self.layers = torch.nn.ModuleList()
        for layer_id in range(model_args.n_layers):
            self.layers.append(TransformerBlock(layer_id, model_args))

        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)

        self.output = nn.Linear(
            model_args.dim, model_args.vocab_size, bias=False
        )
        self.init_weights()

    def init_weights(self):
        """
        [Note: On ``init_weights`` vs. ``reset_parameters``]
        Modules may define ``reset_parameters`` to initialize parameter values.
        ``reset_parameters`` is meant to only initialize directly owned
        parameters/buffers, not those of their child modules, and it can be
        used to give the initial values for these tensors.
        Separately, users may want custom initialization for their modules,
        different from that in ``reset_parameters``. For this, we define
        ``init_weights``. We only call it in the constructor of this
        ``Transformer`` root module to avoid reinitializing tensors.
        """
        with torch.device(self.freqs_cis.device):
            self.freqs_cis = precompute_freqs_cis(
                self.model_args.dim // self.model_args.n_heads,
                # Need to compute until at least the max token limit for generation
                # (use 2x max sequence length to be safe)
                self.model_args.max_seq_len * 2,
            )
        nn.init.normal_(self.tok_embeddings.weight)
        for layer in self.layers:
            layer.init_weights()
        self.norm.reset_parameters()
        final_out_std = self.model_args.dim**-0.5
        cutoff_factor = 3
        nn.init.trunc_normal_(
            self.output.weight,
            mean=0.0,
            std=final_out_std,
            a=-cutoff_factor * final_out_std,
            b=cutoff_factor * final_out_std,
        )

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        """
        Perform a forward pass through the Transformer model.

        Args:
            tokens (torch.Tensor): Input token indices.

        Returns:
            torch.Tensor: Output logits after applying the Transformer model.

        """
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        freqs_cis = self.freqs_cis[0:seqlen]

        for layer in self.layers:
            h = layer(h, freqs_cis)
        h = self.norm(h)
        output = self.output(h).float()
        return output

    @classmethod
    def from_model_args(cls, model_args: ModelArgs) -> "Transformer":
        """
        Initialize a Transformer model from a ModelArgs object.

        Args:
            model_args (ModelArgs): Model configuration arguments.

        Returns:
            Transformer: Transformer model.

        """
        return cls(model_args)
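
A minimal end-to-end sketch, assuming ModelArgs is importable from ezpz.models.llama and that the remaining fields (n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, depth_init) have defaults; the sizes below are illustrative only:

import torch

from ezpz.models.llama import ModelArgs, Transformer

# Hypothetical, tiny configuration for illustration.
args = ModelArgs(
    dim=256,
    n_layers=2,
    n_heads=8,
    vocab_size=1000,
    max_seq_len=128,
)

model = Transformer.from_model_args(args)

tokens = torch.randint(0, args.vocab_size, (2, 16))  # (batch, seqlen)
logits = model(tokens)  # (batch, seqlen, vocab_size), float32
assert logits.shape == (2, 16, args.vocab_size)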

forward(tokens)

Perform a forward pass through the Transformer model.

Parameters:

    tokens (Tensor): Input token indices. Required.

Returns:

    torch.Tensor: Output logits after applying the Transformer model.

Source code in src/ezpz/models/llama.py
def forward(self, tokens: torch.Tensor) -> torch.Tensor:
    """
    Perform a forward pass through the Transformer model.

    Args:
        tokens (torch.Tensor): Input token indices.

    Returns:
        torch.Tensor: Output logits after applying the Transformer model.

    """
    _bsz, seqlen = tokens.shape
    h = self.tok_embeddings(tokens)
    self.freqs_cis = self.freqs_cis.to(h.device)
    freqs_cis = self.freqs_cis[0:seqlen]

    for layer in self.layers:
        h = layer(h, freqs_cis)
    h = self.norm(h)
    output = self.output(h).float()
    return output

from_model_args(model_args) classmethod

Initialize a Transformer model from a ModelArgs object.

Parameters:

    model_args (ModelArgs): Model configuration arguments. Required.

Returns:

    Transformer: Transformer model.

Source code in src/ezpz/models/llama.py
@classmethod
def from_model_args(cls, model_args: ModelArgs) -> "Transformer":
    """
    Initialize a Transformer model from a ModelArgs object.

    Args:
        model_args (ModelArgs): Model configuration arguments.

    Returns:
        Transformer: Transformer model.

    """
    return cls(model_args)

init_weights()

[Note: On init_weights vs. reset_parameters] Modules may define reset_parameters to initialize parameter values. reset_parameters is meant to only initialize directly owned parameters/buffers, not those of their child modules, and it can be used to give the initial values for these tensors. Separately, users may want custom initialization for their modules, different from that in reset_parameters. For this, we define init_weights. We only call it in the constructor of this Transformer root module to avoid reinitializing tensors.

Source code in src/ezpz/models/llama.py
def init_weights(self):
    """
    [Note: On ``init_weights`` vs. ``reset_parameters``]
    Modules may define ``reset_parameters`` to initialize parameter values.
    ``reset_parameters`` is meant to only initialize directly owned
    parameters/buffers, not those of their child modules, and it can be
    used to give the initial values for these tensors.
    Separately, users may want custom initialization for their modules,
    different from that in ``reset_parameters``. For this, we define
    ``init_weights``. We only call it in the constructor of this
    ``Transformer`` root module to avoid reinitializing tensors.
    """
    with torch.device(self.freqs_cis.device):
        self.freqs_cis = precompute_freqs_cis(
            self.model_args.dim // self.model_args.n_heads,
            # Need to compute until at least the max token limit for generation
            # (use 2x max sequence length to be safe)
            self.model_args.max_seq_len * 2,
        )
    nn.init.normal_(self.tok_embeddings.weight)
    for layer in self.layers:
        layer.init_weights()
    self.norm.reset_parameters()
    final_out_std = self.model_args.dim**-0.5
    cutoff_factor = 3
    nn.init.trunc_normal_(
        self.output.weight,
        mean=0.0,
        std=final_out_std,
        a=-cutoff_factor * final_out_std,
        b=cutoff_factor * final_out_std,
    )

TransformerBlock

Bases: Module

TransformerBlock Module

Parameters:

    layer_id (int): Identifier for the layer. Required.
    model_args (ModelArgs): Model configuration arguments. Required.

Attributes:

    n_heads (int): Number of attention heads.
    dim (int): Dimension size of the model.
    head_dim (int): Dimension size of each attention head.
    attention (Attention): Attention module.
    feed_forward (FeedForward): FeedForward module.
    layer_id (int): Identifier for the layer.
    attention_norm (RMSNorm): Layer normalization for attention output.
    ffn_norm (RMSNorm): Layer normalization for feedforward output.

Source code in src/ezpz/models/llama.py
class TransformerBlock(nn.Module):
    """
    TransformerBlock Module

    Args:
        layer_id (int): Identifier for the layer.
        model_args (ModelArgs): Model configuration arguments.

    Attributes:
        n_heads (int): Number of attention heads.
        dim (int): Dimension size of the model.
        head_dim (int): Dimension size of each attention head.
        attention (Attention): Attention module.
        feed_forward (FeedForward): FeedForward module.
        layer_id (int): Identifier for the layer.
        attention_norm (RMSNorm): Layer normalization for attention output.
        ffn_norm (RMSNorm): Layer normalization for feedforward output.

    """

    def __init__(self, layer_id: int, model_args: ModelArgs):
        super().__init__()
        self.n_heads = model_args.n_heads
        self.dim = model_args.dim
        self.attention = Attention(model_args)
        self.feed_forward = FeedForward(
            dim=model_args.dim,
            hidden_dim=4 * model_args.dim,
            multiple_of=model_args.multiple_of,
            ffn_dim_multiplier=model_args.ffn_dim_multiplier,
        )
        self.layer_id = layer_id
        self.num_layers = model_args.n_layers

        self.attention_norm = RMSNorm(
            dim=model_args.dim, eps=model_args.norm_eps
        )
        self.ffn_norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)

        if model_args.depth_init:
            self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
        else:
            self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor,
    ) -> torch.Tensor:
        """
        Perform a forward pass through the TransformerBlock.

        Args:
            x (torch.Tensor): Input tensor.
            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

        Returns:
            torch.Tensor: Output tensor after applying attention and feedforward layers.

        """
        h = x + self.attention(self.attention_norm(x), freqs_cis)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out

    def init_weights(self):
        for norm in (self.attention_norm, self.ffn_norm):
            norm.reset_parameters()
        self.attention.init_weights(self.weight_init_std)
        self.feed_forward.init_weights(self.weight_init_std)
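
The block follows the standard pre-norm residual pattern: x + attention(attention_norm(x), freqs_cis), then h + feed_forward(ffn_norm(h)), so the output shape matches the input. A minimal sketch, reusing the hypothetical ModelArgs values from the earlier examples:

import torch

from ezpz.models.llama import ModelArgs, TransformerBlock, precompute_freqs_cis

# Illustrative configuration; remaining fields are assumed to have defaults.
args = ModelArgs(dim=256, n_layers=2, n_heads=8, vocab_size=1000, max_seq_len=128)

block = TransformerBlock(layer_id=0, model_args=args)
block.init_weights()

x = torch.randn(2, 16, args.dim)
freqs_cis = precompute_freqs_cis(args.dim // args.n_heads, 16)

out = block(x, freqs_cis)
assert out.shape == x.shape  # residual connections preserve the shape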

forward(x, freqs_cis)

Perform a forward pass through the TransformerBlock.

Parameters:

    x (Tensor): Input tensor. Required.
    freqs_cis (Tensor): Precomputed cosine and sine frequencies. Required.

Returns:

    torch.Tensor: Output tensor after applying attention and feedforward layers.

Source code in src/ezpz/models/llama.py
def forward(
    self,
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """
    Perform a forward pass through the TransformerBlock.

    Args:
        x (torch.Tensor): Input tensor.
        freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

    Returns:
        torch.Tensor: Output tensor after applying attention and feedforward layers.

    """
    h = x + self.attention(self.attention_norm(x), freqs_cis)
    out = h + self.feed_forward(self.ffn_norm(h))
    return out

apply_rotary_emb(xq, xk, freqs_cis)

Apply rotary embeddings to input tensors using the given frequency tensor.

This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are returned as real tensors.

Parameters:

    xq (Tensor): Query tensor to apply rotary embeddings. Required.
    xk (Tensor): Key tensor to apply rotary embeddings. Required.
    freqs_cis (Tensor): Precomputed frequency tensor for complex exponentials. Required.

Returns:

    Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.

Source code in src/ezpz/models/llama.py
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor.

    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
    returned as real tensors.

    Args:
        xq (torch.Tensor): Query tensor to apply rotary embeddings.
        xk (torch.Tensor): Key tensor to apply rotary embeddings.
        freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)
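
A shape-level sketch: the last dimension is viewed as head_dim // 2 complex pairs, so freqs_cis needs head_dim // 2 entries per position, and the outputs keep the input shapes and dtypes. The sizes below are illustrative:

import torch

from ezpz.models.llama import apply_rotary_emb, precompute_freqs_cis

bsz, seqlen, n_heads, head_dim = 2, 16, 8, 32
xq = torch.randn(bsz, seqlen, n_heads, head_dim)
xk = torch.randn(bsz, seqlen, n_heads, head_dim)

# (seqlen, head_dim // 2) complex64 rotation factors.
freqs_cis = precompute_freqs_cis(head_dim, seqlen)

xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape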

precompute_freqs_cis(dim, end, theta=10000.0)

Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64 data type.

Parameters:

    dim (int): Dimension of the frequency tensor. Required.
    end (int): End index for precomputing frequencies. Required.
    theta (float): Scaling factor for frequency computation. Default: 10000.0.

Returns:

    torch.Tensor: Precomputed frequency tensor with complex exponentials.

Source code in src/ezpz/models/llama.py
def precompute_freqs_cis(
    dim: int, end: int, theta: float = 10000.0
) -> torch.Tensor:
    """
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
    and the end index 'end'. The 'theta' parameter scales the frequencies.
    The returned tensor contains complex values in complex64 data type.

    Args:
        dim (int): Dimension of the frequency tensor.
        end (int): End index for precomputing frequencies.
        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.

    Returns:
        torch.Tensor: Precomputed frequency tensor with complex exponentials.
    """
    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
    )
    t = torch.arange(end, device=freqs.device)  # type: ignore
    freqs = torch.outer(t, freqs).float()  # type: ignore
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
    return freqs_cis
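
The result has shape (end, dim // 2) and dtype complex64; row t holds exp(i * t * theta**(-2k / dim)) for k = 0, ..., dim // 2 - 1, so every entry has unit magnitude. For example:

import torch

from ezpz.models.llama import precompute_freqs_cis

freqs_cis = precompute_freqs_cis(dim=32, end=128)
assert freqs_cis.shape == (128, 16)
assert freqs_cis.dtype == torch.complex64
assert torch.allclose(freqs_cis.abs(), torch.ones(128, 16))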

repeat_kv(x, n_rep)

torch.repeat_interleave(x, dim=2, repeats=n_rep)

Source code in src/ezpz/models/llama.py
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, :, None, :]
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )
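
As the docstring notes, the expand-and-reshape above reproduces torch.repeat_interleave(x, dim=2, repeats=n_rep), repeating each key/value head n_rep times so the grouped key/value heads line up with the query heads. A quick equivalence check with illustrative shapes:

import torch

from ezpz.models.llama import repeat_kv

x = torch.randn(2, 16, 4, 32)  # (bs, seqlen, n_kv_heads, head_dim)
n_rep = 2

out = repeat_kv(x, n_rep)
assert out.shape == (2, 16, 4 * n_rep, 32)
assert torch.equal(out, torch.repeat_interleave(x, repeats=n_rep, dim=2))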

reshape_for_broadcast(freqs_cis, x)

Reshape frequency tensor for broadcasting it with another tensor.

This function reshapes the frequency tensor to have the same shape as the target tensor 'x' for the purpose of broadcasting the frequency tensor during element-wise operations.

Parameters:

    freqs_cis (Tensor): Frequency tensor to be reshaped. Required.
    x (Tensor): Target tensor for broadcasting compatibility. Required.

Returns:

    torch.Tensor: Reshaped frequency tensor.

Source code in src/ezpz/models/llama.py
def reshape_for_broadcast(
    freqs_cis: torch.Tensor, x: torch.Tensor
) -> torch.Tensor:
    """
    Reshape frequency tensor for broadcasting it with another tensor.

    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
    for the purpose of broadcasting the frequency tensor during element-wise operations.

    Args:
        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
        x (torch.Tensor): Target tensor for broadcasting compatibility.

    Returns:
        torch.Tensor: Reshaped frequency tensor.
    """
    ndim = x.ndim
    if ndim <= 1:
        raise ValueError(
            "Expected tensor with at least two dimensions for rotary embedding"
        )

    seqlen = int(x.shape[1])
    rotary_dim = int(x.shape[-1])

    if freqs_cis.ndim > 2:
        if freqs_cis.shape[-1] != rotary_dim:
            raise ValueError(
                "Rotary dimension mismatch: got "
                f"{freqs_cis.shape[-1]} for freqs_cis and {rotary_dim} for tensor"
            )
        freqs_cis = freqs_cis.reshape(-1, rotary_dim)

    if freqs_cis.shape[-1] != rotary_dim:
        raise ValueError(
            "Rotary dimension mismatch: got "
            f"{freqs_cis.shape[-1]} for freqs_cis and {rotary_dim} for tensor"
        )

    freq_seqlen = int(freqs_cis.shape[0])
    if freq_seqlen < seqlen:
        freqs_cis = precompute_freqs_cis(rotary_dim * 2, seqlen).to(
            device=freqs_cis.device, dtype=freqs_cis.dtype
        )
        freq_seqlen = seqlen

    if freq_seqlen != seqlen:
        start_idx = _infer_seq_start_idx(freq_seqlen, seqlen)
        if freq_seqlen > seqlen:
            max_start = max(freq_seqlen - seqlen, 0)
            start_idx = int(max(0, min(start_idx, max_start)))
            freqs_cis = freqs_cis.narrow(0, start_idx, seqlen)

        if freqs_cis.shape[0] != seqlen:
            freqs_cis = freqs_cis[-seqlen:]

        freqs_cis = freqs_cis.contiguous()

    shape = [
        d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)
    ]
    return freqs_cis.view(*shape)
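
The returned view keeps the sequence dimension (dim 1) and the rotary dimension (the last dim) and sets every other dimension to 1, so it broadcasts against the complex-viewed (bsz, seqlen, n_heads, head_dim // 2) activations built in apply_rotary_emb. A shape-level sketch with illustrative sizes:

import torch

from ezpz.models.llama import precompute_freqs_cis, reshape_for_broadcast

bsz, seqlen, n_heads, head_dim = 2, 16, 8, 32

# Complex view of a query tensor, as produced inside apply_rotary_emb.
xq_ = torch.view_as_complex(
    torch.randn(bsz, seqlen, n_heads, head_dim).reshape(bsz, seqlen, n_heads, -1, 2)
)
freqs_cis = precompute_freqs_cis(head_dim, seqlen)  # (seqlen, head_dim // 2)

broadcastable = reshape_for_broadcast(freqs_cis, xq_)
assert broadcastable.shape == (1, seqlen, 1, head_dim // 2)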