
ezpz.models.llama

Attention

Bases: Module

Multi-head attention module.

Parameters:

    model_args (ModelArgs): Model configuration arguments. Required.

Attributes:

    n_kv_heads (int): Number of key and value heads.
    n_heads (int): Number of query heads.
    n_local_kv_heads (int): Number of local key and value heads.
    n_rep (int): Number of repetitions for local heads.
    head_dim (int): Dimension size of each attention head.
    wq (Linear): Linear transformation for queries.
    wk (Linear): Linear transformation for keys.
    wv (Linear): Linear transformation for values.
    wo (Linear): Linear transformation for output.

Source code in src/ezpz/models/llama.py
class Attention(nn.Module):
    """
    Multi-head attention module.

    Args:
        model_args (ModelArgs): Model configuration arguments.

    Attributes:
        n_kv_heads (int): Number of key and value heads.
        n_heads (int): Number of query heads.
        n_local_kv_heads (int): Number of local key and value heads.
        n_rep (int): Number of repetitions for local heads.
        head_dim (int): Dimension size of each attention head.
        wq (Linear): Linear transformation for queries.
        wk (Linear): Linear transformation for keys.
        wv (Linear): Linear transformation for values.
        wo (Linear): Linear transformation for output.

    """

    def __init__(self, model_args: ModelArgs):
        super().__init__()
        self.n_heads = model_args.n_heads
        self.n_kv_heads = (
            model_args.n_heads
            if model_args.n_kv_heads is None
            else model_args.n_kv_heads
        )
        self.n_rep = self.n_heads // self.n_kv_heads
        self.head_dim = model_args.dim // model_args.n_heads

        self.wq = nn.Linear(
            model_args.dim, model_args.n_heads * self.head_dim, bias=False
        )
        self.wk = nn.Linear(
            model_args.dim, self.n_kv_heads * self.head_dim, bias=False
        )
        self.wv = nn.Linear(
            model_args.dim, self.n_kv_heads * self.head_dim, bias=False
        )
        self.wo = nn.Linear(
            model_args.n_heads * self.head_dim, model_args.dim, bias=False
        )

    def init_weights(self, init_std: float):
        for linear in (self.wq, self.wk, self.wv):
            nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
        nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor,
    ) -> torch.Tensor:
        """
        Forward pass of the attention module.

        Args:
            x (torch.Tensor): Input tensor.
            freqs_cis (torch.Tensor): Precomputed frequency tensor.

        Returns:
            torch.Tensor: Output tensor after attention.

        """
        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)

        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        keys = repeat_kv(
            xk, self.n_rep
        )  # (bs, seqlen, n_local_heads, head_dim)
        values = repeat_kv(
            xv, self.n_rep
        )  # (bs, seqlen, n_local_heads, head_dim)

        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)

        # we use a causal mask for training
        if os.environ.get("EZPZ_ATTENTION_FP32") == "1":
            output = F.scaled_dot_product_attention(
                xq.float(), xk.float(), xv.float(), is_causal=True
            ).to(xq.dtype)
        else:
            output = F.scaled_dot_product_attention(
                xq, xk, xv, is_causal=True
            )
        output = output.transpose(
            1, 2
        ).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
        output = output.view(bsz, seqlen, -1)
        if _DEBUG_NAN:
            global _DEBUG_NAN_ONCE
            if not _DEBUG_NAN_ONCE:
                with torch.no_grad():
                    q_nf, q_max = _tensor_stats("xq", xq)
                    k_nf, k_max = _tensor_stats("xk", xk)
                    v_nf, v_max = _tensor_stats("xv", xv)
                    o_nf, o_max = _tensor_stats("out", output)
                logger.info(
                    "attn_stats xq(nonfinite=%s max_abs=%.6f) "
                    "xk(nonfinite=%s max_abs=%.6f) "
                    "xv(nonfinite=%s max_abs=%.6f) "
                    "out(nonfinite=%s max_abs=%.6f)",
                    q_nf,
                    q_max,
                    k_nf,
                    k_max,
                    v_nf,
                    v_max,
                    o_nf,
                    o_max,
                )
                _DEBUG_NAN_ONCE = True
        return self.wo(output)
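
A minimal usage sketch, assuming ModelArgs is importable from the same module and that the fields not shown here have defaults; the concrete sizes below are illustrative only, not part of the library:

import torch

from ezpz.models.llama import Attention, ModelArgs, precompute_freqs_cis

# Hypothetical configuration; field values are illustrative.
args = ModelArgs(dim=256, n_heads=8, n_kv_heads=4)

attn = Attention(args)
attn.init_weights(init_std=0.02)

bsz, seqlen = 2, 16
x = torch.randn(bsz, seqlen, args.dim)

# freqs_cis must cover the sequence length; head_dim = dim // n_heads.
freqs_cis = precompute_freqs_cis(args.dim // args.n_heads, seqlen)

out = attn(x, freqs_cis)
assert out.shape == (bsz, seqlen, args.dim)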

forward(x, freqs_cis)

Forward pass of the attention module.

Parameters:

    x (Tensor): Input tensor. Required.
    freqs_cis (Tensor): Precomputed frequency tensor. Required.

Returns:

    torch.Tensor: Output tensor after attention.

Source code in src/ezpz/models/llama.py
def forward(
    self,
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """
    Forward pass of the attention module.

    Args:
        x (torch.Tensor): Input tensor.
        freqs_cis (torch.Tensor): Precomputed frequency tensor.

    Returns:
        torch.Tensor: Output tensor after attention.

    """
    bsz, seqlen, _ = x.shape
    xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

    xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
    xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
    xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)

    xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

    keys = repeat_kv(
        xk, self.n_rep
    )  # (bs, seqlen, n_local_heads, head_dim)
    values = repeat_kv(
        xv, self.n_rep
    )  # (bs, seqlen, n_local_heads, head_dim)

    xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
    xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
    xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)

    # we use a causal mask for training
    if os.environ.get("EZPZ_ATTENTION_FP32") == "1":
        output = F.scaled_dot_product_attention(
            xq.float(), xk.float(), xv.float(), is_causal=True
        ).to(xq.dtype)
    else:
        output = F.scaled_dot_product_attention(
            xq, xk, xv, is_causal=True
        )
    output = output.transpose(
        1, 2
    ).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
    output = output.view(bsz, seqlen, -1)
    if _DEBUG_NAN:
        global _DEBUG_NAN_ONCE
        if not _DEBUG_NAN_ONCE:
            with torch.no_grad():
                q_nf, q_max = _tensor_stats("xq", xq)
                k_nf, k_max = _tensor_stats("xk", xk)
                v_nf, v_max = _tensor_stats("xv", xv)
                o_nf, o_max = _tensor_stats("out", output)
            logger.info(
                "attn_stats xq(nonfinite=%s max_abs=%.6f) "
                "xk(nonfinite=%s max_abs=%.6f) "
                "xv(nonfinite=%s max_abs=%.6f) "
                "out(nonfinite=%s max_abs=%.6f)",
                q_nf,
                q_max,
                k_nf,
                k_max,
                v_nf,
                v_max,
                o_nf,
                o_max,
            )
            _DEBUG_NAN_ONCE = True
    return self.wo(output)
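
As the branch in the forward pass above shows, setting the EZPZ_ATTENTION_FP32 environment variable to "1" runs scaled-dot-product attention in float32 and casts the result back to the input dtype, which can help when chasing NaNs in low-precision training. Enable it before the forward pass runs, for example:

import os

# Force float32 attention math; the output is cast back to the input dtype.
os.environ["EZPZ_ATTENTION_FP32"] = "1"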

FeedForward

Bases: Module

FeedForward module

Parameters:

    dim (int): Input dimension. Required.
    hidden_dim (int): Hidden dimension of the feedforward layer. Required.
    multiple_of (int): Value to ensure the hidden dimension is a multiple of this value. Required.
    ffn_dim_multiplier (Optional[float]): Custom multiplier for the hidden dimension; may be None. Required.

Attributes:

    w1 (Linear): Linear transformation for the first layer.
    w2 (Linear): Linear transformation for the second layer.
    w3 (Linear): Linear transformation for the third layer.

Source code in src/ezpz/models/llama.py
class FeedForward(nn.Module):
    """
    FeedForward module

    Args:
        dim (int): Input dimension.
        hidden_dim (int): Hidden dimension of the feedforward layer.
        multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
        ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None.

    Attributes:
        w1 (Linear): Linear transformation for the first layer.
        w2 (Linear): Linear transformation for the second layer.
        w3 (Linear): Linear transformation for the third layer.

    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int,
        ffn_dim_multiplier: Optional[float],
    ):
        super().__init__()
        hidden_dim = int(2 * hidden_dim / 3)
        # custom dim factor multiplier
        if ffn_dim_multiplier is not None:
            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
        hidden_dim = multiple_of * (
            (hidden_dim + multiple_of - 1) // multiple_of
        )

        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))

    def init_weights(self, init_std: float):
        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
        for linear in (self.w2, self.w3):
            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
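
The constructor rescales hidden_dim before building the SwiGLU-style projections (forward computes w2(silu(w1(x)) * w3(x))): it keeps 2/3 of the requested width, applies the optional ffn_dim_multiplier, and rounds up to a multiple of multiple_of. A standalone sketch of that sizing rule, with illustrative numbers:

def ffn_hidden_dim(hidden_dim: int, multiple_of: int, ffn_dim_multiplier=None) -> int:
    # Mirrors the arithmetic in FeedForward.__init__ above.
    hidden_dim = int(2 * hidden_dim / 3)
    if ffn_dim_multiplier is not None:
        hidden_dim = int(ffn_dim_multiplier * hidden_dim)
    return multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

# TransformerBlock passes hidden_dim = 4 * dim; 256 is an illustrative dim.
assert ffn_hidden_dim(4 * 256, multiple_of=256) == 768
assert ffn_hidden_dim(4 * 256, multiple_of=256, ffn_dim_multiplier=1.3) == 1024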

RMSNorm

Bases: Module

Initialize the RMSNorm normalization layer.

Parameters:

    dim (int): The dimension of the input tensor. Required.
    eps (float): A small value added to the denominator for numerical stability. Default: 1e-06.

Attributes:

    eps (float): A small value added to the denominator for numerical stability.
    weight (Parameter): Learnable scaling parameter.

Source code in src/ezpz/models/llama.py
class RMSNorm(nn.Module):
    """
    Initialize the RMSNorm normalization layer.

    Args:
        dim (int): The dimension of the input tensor.
        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

    Attributes:
        eps (float): A small value added to the denominator for numerical stability.
        weight (nn.Parameter): Learnable scaling parameter.

    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

    def reset_parameters(self):
        torch.nn.init.ones_(self.weight)  # type: ignore
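
A short reference computation of what _norm does: root-mean-square normalization over the last dimension (in float32, then cast back), rescaled elementwise by weight. The shapes below are illustrative:

import torch

from ezpz.models.llama import RMSNorm

norm = RMSNorm(dim=8, eps=1e-6)
x = torch.randn(2, 4, 8)

# Manual equivalent of RMSNorm.forward for a float32 input.
expected = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + norm.eps) * norm.weight
assert torch.allclose(norm(x), expected)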

Transformer

Bases: Module

Transformer Module

Parameters:

    model_args (ModelArgs): Model configuration arguments. Required.

Attributes:

    model_args (ModelArgs): Model configuration arguments.
    vocab_size (int): Vocabulary size.
    n_layers (int): Number of layers in the model.
    tok_embeddings (ParallelEmbedding): Token embeddings.
    layers (ModuleList): List of Transformer blocks.
    norm (RMSNorm): Layer normalization for the model output.
    output (ColumnParallelLinear): Linear layer for final output.
    freqs_cis (Tensor): Precomputed cosine and sine frequencies.

Source code in src/ezpz/models/llama.py
class Transformer(nn.Module):
    """
    Transformer Module

    Args:
        model_args (ModelArgs): Model configuration arguments.

    Attributes:
        model_args (ModelArgs): Model configuration arguments.
        vocab_size (int): Vocabulary size.
        n_layers (int): Number of layers in the model.
        tok_embeddings (ParallelEmbedding): Token embeddings.
        layers (torch.nn.ModuleList): List of Transformer blocks.
        norm (RMSNorm): Layer normalization for the model output.
        output (ColumnParallelLinear): Linear layer for final output.
        freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

    """

    def __init__(self, model_args: ModelArgs):
        super().__init__()
        self.model_args = model_args
        self.vocab_size = model_args.vocab_size
        self.n_layers = model_args.n_layers
        self.model_dim = model_args.dim

        self.tok_embeddings = nn.Embedding(
            model_args.vocab_size, model_args.dim
        )
        self.register_buffer(
            "freqs_cis",
            precompute_freqs_cis(
                model_args.dim // model_args.n_heads,
                # Need to compute until at least the max token limit for generation
                # (use 2x max sequence length to be safe)
                model_args.max_seq_len * 2,
            ),
        )
        self.layers = torch.nn.ModuleList()
        for layer_id in range(model_args.n_layers):
            self.layers.append(TransformerBlock(layer_id, model_args))

        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)

        self.output = nn.Linear(
            model_args.dim, model_args.vocab_size, bias=False
        )
        self.init_weights()

    def init_weights(self):
        """
        [Note: On ``init_weights`` vs. ``reset_parameters``]
        Modules may define ``reset_parameters`` to initialize parameter values.
        ``reset_parameters`` is meant to only initialize directly owned
        parameters/buffers, not those of their child modules, and it can be
        used to give the initial values for these tensors.
        Separately, users may want custom initialization for their modules,
        different from that in ``reset_parameters``. For this, we define
        ``init_weights``. We only call it in the constructor of this
        ``Transformer`` root module to avoid reinitializing tensors.
        """
        with torch.device(self.freqs_cis.device):
            self.freqs_cis = precompute_freqs_cis(
                self.model_args.dim // self.model_args.n_heads,
                # Need to compute until at least the max token limit for generation
                # (use 2x max sequence length to be safe)
                self.model_args.max_seq_len * 2,
            )
        nn.init.normal_(self.tok_embeddings.weight)
        for layer in self.layers:
            layer.init_weights()
        self.norm.reset_parameters()
        final_out_std = self.model_args.dim**-0.5
        cutoff_factor = 3
        nn.init.trunc_normal_(
            self.output.weight,
            mean=0.0,
            std=final_out_std,
            a=-cutoff_factor * final_out_std,
            b=cutoff_factor * final_out_std,
        )

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        """
        Perform a forward pass through the Transformer model.

        Args:
            tokens (torch.Tensor): Input token indices.

        Returns:
            torch.Tensor: Output logits after applying the Transformer model.

        """
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        freqs_cis = self.freqs_cis[0:seqlen]

        for layer in self.layers:
            h = layer(h, freqs_cis)
        h = self.norm(h)
        output = self.output(h).float()
        return output

    @classmethod
    def from_model_args(cls, model_args: ModelArgs) -> "Transformer":
        """
        Initialize a Transformer model from a ModelArgs object.

        Args:
            model_args (ModelArgs): Model configuration arguments.

        Returns:
            Transformer: Transformer model.

        """
        return cls(model_args)
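
A minimal end-to-end sketch, assuming ModelArgs is importable from ezpz.models.llama and that the remaining fields (n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, depth_init) have defaults; the sizes below are illustrative only:

import torch

from ezpz.models.llama import ModelArgs, Transformer

# Hypothetical, tiny configuration for illustration.
args = ModelArgs(
    dim=256,
    n_layers=2,
    n_heads=8,
    vocab_size=1000,
    max_seq_len=128,
)

model = Transformer.from_model_args(args)

tokens = torch.randint(0, args.vocab_size, (2, 16))  # (batch, seqlen)
logits = model(tokens)  # (batch, seqlen, vocab_size), float32
assert logits.shape == (2, 16, args.vocab_size)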

forward(tokens)

Perform a forward pass through the Transformer model.

Parameters:

    tokens (Tensor): Input token indices. Required.

Returns:

    torch.Tensor: Output logits after applying the Transformer model.

Source code in src/ezpz/models/llama.py
def forward(self, tokens: torch.Tensor) -> torch.Tensor:
    """
    Perform a forward pass through the Transformer model.

    Args:
        tokens (torch.Tensor): Input token indices.

    Returns:
        torch.Tensor: Output logits after applying the Transformer model.

    """
    _bsz, seqlen = tokens.shape
    h = self.tok_embeddings(tokens)
    self.freqs_cis = self.freqs_cis.to(h.device)
    freqs_cis = self.freqs_cis[0:seqlen]

    for layer in self.layers:
        h = layer(h, freqs_cis)
    h = self.norm(h)
    output = self.output(h).float()
    return output

from_model_args(model_args) classmethod

Initialize a Transformer model from a ModelArgs object.

Parameters:

    model_args (ModelArgs): Model configuration arguments. Required.

Returns:

    Transformer: Transformer model.

Source code in src/ezpz/models/llama.py
@classmethod
def from_model_args(cls, model_args: ModelArgs) -> "Transformer":
    """
    Initialize a Transformer model from a ModelArgs object.

    Args:
        model_args (ModelArgs): Model configuration arguments.

    Returns:
        Transformer: Transformer model.

    """
    return cls(model_args)

init_weights()

[Note: On init_weights vs. reset_parameters] Modules may define reset_parameters to initialize parameter values. reset_parameters is meant to only initialize directly owned parameters/buffers, not those of their child modules, and it can be used to give the initial values for these tensors. Separately, users may want custom initialization for their modules, different from that in reset_parameters. For this, we define init_weights. We only call it in the constructor of this Transformer root module to avoid reinitializing tensors.

Source code in src/ezpz/models/llama.py
def init_weights(self):
    """
    [Note: On ``init_weights`` vs. ``reset_parameters``]
    Modules may define ``reset_parameters`` to initialize parameter values.
    ``reset_parameters`` is meant to only initialize directly owned
    parameters/buffers, not those of their child modules, and it can be
    used to give the initial values for these tensors.
    Separately, users may want custom initialization for their modules,
    different from that in ``reset_parameters``. For this, we define
    ``init_weights``. We only call it in the constructor of this
    ``Transformer`` root module to avoid reinitializing tensors.
    """
    with torch.device(self.freqs_cis.device):
        self.freqs_cis = precompute_freqs_cis(
            self.model_args.dim // self.model_args.n_heads,
            # Need to compute until at least the max token limit for generation
            # (use 2x max sequence length to be safe)
            self.model_args.max_seq_len * 2,
        )
    nn.init.normal_(self.tok_embeddings.weight)
    for layer in self.layers:
        layer.init_weights()
    self.norm.reset_parameters()
    final_out_std = self.model_args.dim**-0.5
    cutoff_factor = 3
    nn.init.trunc_normal_(
        self.output.weight,
        mean=0.0,
        std=final_out_std,
        a=-cutoff_factor * final_out_std,
        b=cutoff_factor * final_out_std,
    )

TransformerBlock

Bases: Module

TransformerBlock Module

Parameters:

    layer_id (int): Identifier for the layer. Required.
    model_args (ModelArgs): Model configuration arguments. Required.

Attributes:

    n_heads (int): Number of attention heads.
    dim (int): Dimension size of the model.
    head_dim (int): Dimension size of each attention head.
    attention (Attention): Attention module.
    feed_forward (FeedForward): FeedForward module.
    layer_id (int): Identifier for the layer.
    attention_norm (RMSNorm): Layer normalization for attention output.
    ffn_norm (RMSNorm): Layer normalization for feedforward output.

Source code in src/ezpz/models/llama.py
class TransformerBlock(nn.Module):
    """
    TransformerBlock Module

    Args:
        layer_id (int): Identifier for the layer.
        model_args (ModelArgs): Model configuration arguments.

    Attributes:
        n_heads (int): Number of attention heads.
        dim (int): Dimension size of the model.
        head_dim (int): Dimension size of each attention head.
        attention (Attention): Attention module.
        feed_forward (FeedForward): FeedForward module.
        layer_id (int): Identifier for the layer.
        attention_norm (RMSNorm): Layer normalization for attention output.
        ffn_norm (RMSNorm): Layer normalization for feedforward output.

    """

    def __init__(self, layer_id: int, model_args: ModelArgs):
        super().__init__()
        self.n_heads = model_args.n_heads
        self.dim = model_args.dim
        self.attention = Attention(model_args)
        self.feed_forward = FeedForward(
            dim=model_args.dim,
            hidden_dim=4 * model_args.dim,
            multiple_of=model_args.multiple_of,
            ffn_dim_multiplier=model_args.ffn_dim_multiplier,
        )
        self.layer_id = layer_id
        self.num_layers = model_args.n_layers

        self.attention_norm = RMSNorm(
            dim=model_args.dim, eps=model_args.norm_eps
        )
        self.ffn_norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)

        if model_args.depth_init:
            self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
        else:
            self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor,
    ) -> torch.Tensor:
        """
        Perform a forward pass through the TransformerBlock.

        Args:
            x (torch.Tensor): Input tensor.
            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

        Returns:
            torch.Tensor: Output tensor after applying attention and feedforward layers.

        """
        h = x + self.attention(self.attention_norm(x), freqs_cis)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out

    def init_weights(self):
        for norm in (self.attention_norm, self.ffn_norm):
            norm.reset_parameters()
        self.attention.init_weights(self.weight_init_std)
        self.feed_forward.init_weights(self.weight_init_std)
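
The block follows the standard pre-norm residual pattern: x + attention(attention_norm(x), freqs_cis), then h + feed_forward(ffn_norm(h)), so the output shape matches the input. A minimal sketch, reusing the hypothetical ModelArgs values from the earlier examples:

import torch

from ezpz.models.llama import ModelArgs, TransformerBlock, precompute_freqs_cis

# Illustrative configuration; remaining fields are assumed to have defaults.
args = ModelArgs(dim=256, n_layers=2, n_heads=8, vocab_size=1000, max_seq_len=128)

block = TransformerBlock(layer_id=0, model_args=args)
block.init_weights()

x = torch.randn(2, 16, args.dim)
freqs_cis = precompute_freqs_cis(args.dim // args.n_heads, 16)

out = block(x, freqs_cis)
assert out.shape == x.shape  # residual connections preserve the shape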

forward(x, freqs_cis)

Perform a forward pass through the TransformerBlock.

Parameters:

    x (Tensor): Input tensor. Required.
    freqs_cis (Tensor): Precomputed cosine and sine frequencies. Required.

Returns:

    torch.Tensor: Output tensor after applying attention and feedforward layers.

Source code in src/ezpz/models/llama.py
def forward(
    self,
    x: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> torch.Tensor:
    """
    Perform a forward pass through the TransformerBlock.

    Args:
        x (torch.Tensor): Input tensor.
        freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

    Returns:
        torch.Tensor: Output tensor after applying attention and feedforward layers.

    """
    h = x + self.attention(self.attention_norm(x), freqs_cis)
    out = h + self.feed_forward(self.ffn_norm(h))
    return out

apply_rotary_emb(xq, xk, freqs_cis)

Apply rotary embeddings to input tensors using the given frequency tensor.

This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are returned as real tensors.

Parameters:

    xq (Tensor): Query tensor to apply rotary embeddings. Required.
    xk (Tensor): Key tensor to apply rotary embeddings. Required.
    freqs_cis (Tensor): Precomputed frequency tensor for complex exponentials. Required.

Returns:

    Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.

Source code in src/ezpz/models/llama.py
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor.

    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
    returned as real tensors.

    Args:
        xq (torch.Tensor): Query tensor to apply rotary embeddings.
        xk (torch.Tensor): Key tensor to apply rotary embeddings.
        freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)
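
A shape-level sketch: the last dimension is viewed as head_dim // 2 complex pairs, so freqs_cis needs head_dim // 2 entries per position, and the outputs keep the input shapes and dtypes. The sizes below are illustrative:

import torch

from ezpz.models.llama import apply_rotary_emb, precompute_freqs_cis

bsz, seqlen, n_heads, head_dim = 2, 16, 8, 32
xq = torch.randn(bsz, seqlen, n_heads, head_dim)
xk = torch.randn(bsz, seqlen, n_heads, head_dim)

# (seqlen, head_dim // 2) complex64 rotation factors.
freqs_cis = precompute_freqs_cis(head_dim, seqlen)

xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape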

precompute_freqs_cis(dim, end, theta=10000.0)

Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64 data type.

Parameters:

    dim (int): Dimension of the frequency tensor. Required.
    end (int): End index for precomputing frequencies. Required.
    theta (float): Scaling factor for frequency computation. Default: 10000.0.

Returns:

    torch.Tensor: Precomputed frequency tensor with complex exponentials.

Source code in src/ezpz/models/llama.py
def precompute_freqs_cis(
    dim: int, end: int, theta: float = 10000.0
) -> torch.Tensor:
    """
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
    and the end index 'end'. The 'theta' parameter scales the frequencies.
    The returned tensor contains complex values in complex64 data type.

    Args:
        dim (int): Dimension of the frequency tensor.
        end (int): End index for precomputing frequencies.
        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.

    Returns:
        torch.Tensor: Precomputed frequency tensor with complex exponentials.
    """
    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
    )
    t = torch.arange(end, device=freqs.device)  # type: ignore
    freqs = torch.outer(t, freqs).float()  # type: ignore
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
    return freqs_cis
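
The result has shape (end, dim // 2) and dtype complex64; row t holds exp(i * t * theta**(-2k / dim)) for k = 0, ..., dim // 2 - 1, so every entry has unit magnitude. For example:

import torch

from ezpz.models.llama import precompute_freqs_cis

freqs_cis = precompute_freqs_cis(dim=32, end=128)
assert freqs_cis.shape == (128, 16)
assert freqs_cis.dtype == torch.complex64
assert torch.allclose(freqs_cis.abs(), torch.ones(128, 16))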

repeat_kv(x, n_rep)

torch.repeat_interleave(x, dim=2, repeats=n_rep)

Source code in src/ezpz/models/llama.py
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, :, None, :]
        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    )
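
As the docstring notes, the expand-and-reshape above reproduces torch.repeat_interleave(x, dim=2, repeats=n_rep), repeating each key/value head n_rep times so the grouped key/value heads line up with the query heads. A quick equivalence check with illustrative shapes:

import torch

from ezpz.models.llama import repeat_kv

x = torch.randn(2, 16, 4, 32)  # (bs, seqlen, n_kv_heads, head_dim)
n_rep = 2

out = repeat_kv(x, n_rep)
assert out.shape == (2, 16, 4 * n_rep, 32)
assert torch.equal(out, torch.repeat_interleave(x, repeats=n_rep, dim=2))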

reshape_for_broadcast(freqs_cis, x)

Reshape frequency tensor for broadcasting it with another tensor.

This function reshapes the frequency tensor to have the same shape as the target tensor 'x' for the purpose of broadcasting the frequency tensor during element-wise operations.

Parameters:

    freqs_cis (Tensor): Frequency tensor to be reshaped. Required.
    x (Tensor): Target tensor for broadcasting compatibility. Required.

Returns:

    torch.Tensor: Reshaped frequency tensor.

Source code in src/ezpz/models/llama.py
def reshape_for_broadcast(
    freqs_cis: torch.Tensor, x: torch.Tensor
) -> torch.Tensor:
    """
    Reshape frequency tensor for broadcasting it with another tensor.

    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
    for the purpose of broadcasting the frequency tensor during element-wise operations.

    Args:
        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
        x (torch.Tensor): Target tensor for broadcasting compatibility.

    Returns:
        torch.Tensor: Reshaped frequency tensor.
    """
    ndim = x.ndim
    if ndim <= 1:
        raise ValueError(
            "Expected tensor with at least two dimensions for rotary embedding"
        )

    seqlen = int(x.shape[1])
    rotary_dim = int(x.shape[-1])

    if freqs_cis.ndim > 2:
        if freqs_cis.shape[-1] != rotary_dim:
            raise ValueError(
                "Rotary dimension mismatch: got "
                f"{freqs_cis.shape[-1]} for freqs_cis and {rotary_dim} for tensor"
            )
        freqs_cis = freqs_cis.reshape(-1, rotary_dim)

    if freqs_cis.shape[-1] != rotary_dim:
        raise ValueError(
            "Rotary dimension mismatch: got "
            f"{freqs_cis.shape[-1]} for freqs_cis and {rotary_dim} for tensor"
        )

    freq_seqlen = int(freqs_cis.shape[0])
    if freq_seqlen < seqlen:
        freqs_cis = precompute_freqs_cis(rotary_dim * 2, seqlen).to(
            device=freqs_cis.device, dtype=freqs_cis.dtype
        )
        freq_seqlen = seqlen

    if freq_seqlen != seqlen:
        start_idx = _infer_seq_start_idx(freq_seqlen, seqlen)
        if freq_seqlen > seqlen:
            max_start = max(freq_seqlen - seqlen, 0)
            start_idx = int(max(0, min(start_idx, max_start)))
            freqs_cis = freqs_cis.narrow(0, start_idx, seqlen)

        if freqs_cis.shape[0] != seqlen:
            freqs_cis = freqs_cis[-seqlen:]

        freqs_cis = freqs_cis.contiguous()

    shape = [
        d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)
    ]
    return freqs_cis.view(*shape)
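
The returned view keeps the sequence dimension (dim 1) and the rotary dimension (the last dim) and sets every other dimension to 1, so it broadcasts against the complex-viewed (bsz, seqlen, n_heads, head_dim // 2) activations built in apply_rotary_emb. A shape-level sketch with illustrative sizes:

import torch

from ezpz.models.llama import precompute_freqs_cis, reshape_for_broadcast

bsz, seqlen, n_heads, head_dim = 2, 16, 8, 32

# Complex view of a query tensor, as produced inside apply_rotary_emb.
xq_ = torch.view_as_complex(
    torch.randn(bsz, seqlen, n_heads, head_dim).reshape(bsz, seqlen, n_heads, -1, 2)
)
freqs_cis = precompute_freqs_cis(head_dim, seqlen)  # (seqlen, head_dim // 2)

broadcastable = reshape_for_broadcast(freqs_cis, xq_)
assert broadcastable.shape == (1, seqlen, 1, head_dim // 2)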