0% found this document useful (0 votes)

32 views20 pages

Modeling Chatglm

Uploaded by

alijamshed271

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

32 views20 pages

Modeling Chatglm

Uploaded by

alijamshed271

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

You are on page 1/ 20

""" PyTorch ChatGLM model.

"""

import math
import sys
import torch
import torch.utils.checkpoint
import torch.nn.functional as F
from torch import nn
from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
from torch.nn.utils import skip_init
from typing import Optional, Tuple, Union, List, Dict, Any

from transformers.modeling_outputs import (

BaseModelOutputWithPast,
CausalLMOutputWithPast,
SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging, is_torch_npu_available
from transformers.generation.logits_process import LogitsProcessor
from transformers.generation.utils import ModelOutput

from .configuration_chatglm import ChatGLMConfig

# Flash-attention support is optional. Everything below is imported on a
# best-effort basis so the module still loads when `flash_attn` (or a recent
# enough `transformers`) is not installed. If the import fails, the names
# below stay undefined and only the "flash_attention_2" code path is unusable.
try:
    from transformers.utils import (
        is_flash_attn_2_available,
        is_flash_attn_greater_or_equal_2_10,
    )

    if is_flash_attn_2_available():
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
except Exception:
    # Deliberately swallowed: a missing or broken flash-attn install must not
    # prevent the eager/SDPA paths from working. `Exception` (rather than a
    # bare `except`) lets KeyboardInterrupt/SystemExit propagate.
    pass

# Flags required to enable JIT fusion kernels.
# Skipped on macOS and when transformers reports a torch NPU backend.
if sys.platform != 'darwin' and not is_torch_npu_available():
    torch._C._jit_set_profiling_mode(False)
    torch._C._jit_set_profiling_executor(False)
    torch._C._jit_override_can_fuse_on_cpu(True)
    torch._C._jit_override_can_fuse_on_gpu(True)

# Module-level logger obtained through the transformers logging helpers.
logger = logging.get_logger(__name__)

# Checkpoint / config names kept as module constants. They are not referenced
# in the visible part of this file; presumably consumed by docstring tooling.
_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
_CONFIG_FOR_DOC = "ChatGLMConfig"

def default_init(cls, *args, **kwargs):
    """Instantiate ``cls`` normally, forwarding all arguments.

    Used as the counterpart of ``torch.nn.utils.skip_init`` so callers can
    pick either construction strategy through the same call shape
    ``init_method(cls, *args, **kwargs)``.

    Args:
        cls: The class (or any callable) to invoke.
        *args: Positional arguments passed to ``cls``.
        **kwargs: Keyword arguments passed to ``cls``.

    Returns:
        Whatever ``cls(*args, **kwargs)`` returns.
    """
    return cls(*args, **kwargs)

class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Logits processor that repairs score tensors containing NaN or Inf."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores``, sanitised in place if any entry is NaN/Inf.

        When at least one entry is not finite, the whole tensor is zeroed and
        index 198 along the last dimension is set to ``5e4`` so that a single
        token dominates sampling.

        Args:
            input_ids: Token ids generated so far (unused here).
            scores: Next-token scores; modified in place when invalid.

        Returns:
            The same ``scores`` tensor object.
        """
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            # NOTE(review): 198 is a hard-coded vocabulary index; which token
            # it denotes depends on the tokenizer -- confirm before reuse.
            scores[..., 198] = 5e4
        return scores

def split_tensor_along_last_dim(
        tensor: torch.Tensor,
        num_partitions: int,
        contiguous_split_chunks: bool = False,
) -> Tuple[torch.Tensor, ...]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A tuple of tensors. Each chunk has ``last_dim // num_partitions``
        elements along the last axis; if the last dimension is not evenly
        divisible, ``torch.split`` yields extra/shorter trailing chunks.
    """
    # Size of each chunk along the last dimension.
    last_dim = tensor.dim() - 1
    chunk_size = tensor.size()[last_dim] // num_partitions
    # torch.split returns views, which are not contiguous by default.
    chunks = torch.split(tensor, chunk_size, dim=last_dim)
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in chunks)

    return chunks

class RotaryEmbedding(nn.Module):
    """Builds the cos/sin lookup table used for rotary position embeddings.

    Args:
        dim: Number of elements the rotation is computed for (``n_elem``).
        rope_ratio: Multiplier applied to the frequency base in
            ``forward_impl``.
        original_impl: Stored on the instance; not read in this class.
        device: Device for the ``inv_freq`` buffer.
        dtype: Dtype for the ``inv_freq`` buffer; also decides the dtype of
            the table returned by ``forward``.
    """

    def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
        # The buffer's dtype/device are what forward() later hands to
        # forward_impl; its values are not otherwise used in this class.
        self.register_buffer("inv_freq", inv_freq)
        self.dim = dim
        self.original_impl = original_impl
        self.rope_ratio = rope_ratio

    def forward_impl(
            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
    ):
        """Enhanced Transformer with Rotary Position Embedding.

        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.

        Returns:
            Tensor of shape ``[seq_len, n_elem // 2, 2]`` whose last axis
            holds ``(cos, sin)`` of ``position * theta_i``.
        """
        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
        base = base * self.rope_ratio
        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))

        # Create position indexes `[0, 1, ..., seq_len - 1]`
        seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)

        # Calculate the product of position index and $\theta_i$
        idx_theta = torch.outer(seq_idx, theta).float()

        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

        # this is to mimic the behaviour of complex32, else we will get different results
        if dtype in (torch.float16, torch.bfloat16, torch.int8):
            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        """Return the rotary table for positions ``0 .. max_seq_len - 1``.

        ``offset`` is accepted for interface compatibility but is not used.
        """
        return self.forward_impl(
            max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
        )

@torch.jit.script
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Rotate the leading channels of ``x`` using a cos/sin table.

    Args:
        x: Tensor of shape ``[b, np, sq, hn]``.
        rope_cache: Table whose last two axes are ``[rot_dim // 2, 2]`` with
            ``(cos, sin)`` on the last axis; it is sliced to ``sq`` along
            axis 1 and then viewed as ``[-1, 1, sq, rot_dim // 2, 2]``.

    Returns:
        Tensor shaped like ``x``: the first ``rot_dim`` channels are rotated
        pairwise, the remaining channels are passed through unchanged.
    """
    # x: [b, np, sq, hn]
    b, n_heads, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3)
    rot_dim = rope_cache.shape[-2] * 2
    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    # truncate to support variable sizes
    rope_cache = rope_cache[:, :sq]
    xshaped = x.reshape(b, n_heads, sq, rot_dim // 2, 2)
    rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
    # Standard 2-D rotation applied to each (even, odd) channel pair.
    x_out2 = torch.stack(
        [
            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
        ],
        -1,
    )
    x_out2 = x_out2.flatten(3)
    return torch.cat((x_out2, x_pass), dim=-1)

class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalisation with a learned per-channel scale.

    Args:
        normalized_shape: Shape of the ``weight`` parameter.
        eps: Value added to the mean square before the reciprocal sqrt.
        device: Device for ``weight``.
        dtype: Dtype for ``weight``.
        **kwargs: Accepted and ignored, so the class can be called with the
            same arguments as ``LayerNorm``.

    Note:
        ``weight`` is created with ``torch.empty`` and is therefore
        uninitialised until a checkpoint (or the caller) fills it.
    """

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor):
        """Normalise over the last axis and return a tensor in the input dtype."""
        input_dtype = hidden_states.dtype
        # The mean square is computed in float32 regardless of input dtype.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)

        return (self.weight * hidden_states).to(input_dtype)

class CoreAttention(torch.nn.Module):
    """Eager (matmul + softmax) attention over already-projected q/k/v.

    Args:
        config: Model configuration; reads ``apply_query_key_layer_scaling``,
            ``attention_softmax_in_fp32``, ``kv_channels``,
            ``num_attention_heads`` and ``attention_dropout``.
        layer_number: 1-based layer index (clamped to at least 1). Used as
            the scaling coefficient when query-key layer scaling is enabled.
    """

    def __init__(self, config: ChatGLMConfig, layer_number):
        super(CoreAttention, self).__init__()
        self.config = config
        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        # Layer scaling forces the softmax to run in fp32.
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.is_causal = True

        projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        # With layer scaling, scores are divided by an extra `coeff` in the
        # matmul and multiplied back by `coeff` before the softmax.
        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        """Compute attention and merge heads.

        Args:
            query_layer: ``[b, np, sq, hn]``.
            key_layer: ``[b, np, sk, hn]``.
            value_layer: ``[b, np, sk, hn]``.
            attention_mask: Boolean mask where ``True`` marks positions to be
                filled with ``-inf``, or ``None``. When ``None`` and
                ``sq == sk`` a causal mask is built here.

        Returns:
            Context tensor of shape ``[b, sq, hidden_size_per_partition]``.
        """
        # [b, np, sq, sk]
        output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))

        # [b, np, sq, hn] -> [b * np, sq, hn]
        query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
        # [b, np, sk, hn] -> [b * np, sk, hn]
        key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)

        # preallocating input tensor: [b * np, sq, sk]
        # (contents are irrelevant because baddbmm is called with beta=0.0)
        matmul_input_buffer = torch.empty(
            output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
            device=query_layer.device
        )

        # Raw attention scores. [b * np, sq, sk]
        matmul_result = torch.baddbmm(
            matmul_input_buffer,
            query_layer,  # [b * np, sq, hn]
            key_layer.transpose(1, 2),  # [b * np, hn, sk]
            beta=0.0,
            alpha=(1.0 / self.norm_factor),
        )

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)

        # ===========================
        # Attention probs and dropout
        # ===========================

        # attention scores and attention mask [b, np, sq, sk]
        if self.attention_softmax_in_fp32:
            attention_scores = attention_scores.float()
        if self.coeff is not None:
            attention_scores = attention_scores * self.coeff
        if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
            # No mask supplied and square scores: build a causal mask
            # (True above the diagonal = masked out).
            attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
                                        device=attention_scores.device, dtype=torch.bool)
            attention_mask.tril_()
            attention_mask = ~attention_mask
        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = attention_probs.type_as(value_layer)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        # query layer shape: [b * np, sq, hn]
        # value layer shape: [b, np, sk, hn]
        # attention shape: [b, np, sq, sk]
        # context layer shape: [b, np, sq, hn]
        output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
        # change view [b * np, sk, hn]
        value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer)
        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)
        # [b, np, sq, hn] --> [b, sq, np, hn]
        context_layer = context_layer.transpose(1, 2).contiguous()
        # [b, sq, np, hn] --> [b, sq, hp]
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.reshape(*new_context_layer_shape)

        return context_layer

class SdpaAttention(CoreAttention):
    """Attention backend built on ``torch.nn.functional.scaled_dot_product_attention``."""

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        """Compute attention through PyTorch SDPA and merge heads.

        Args:
            query_layer: ``[b, np, sq, hn]``.
            key_layer: ``[b, np, sk, hn]``.
            value_layer: ``[b, np, sk, hn]``.
            attention_mask: Boolean mask with ``True`` = masked out (the
                convention used by ``CoreAttention``), or ``None``.

        Returns:
            Context tensor of shape ``[b, sq, hidden_size_per_partition]``.
        """
        # Dropout is only active in training mode.
        dropout_p = self.config.attention_dropout if self.training else 0.0
        if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
            # No explicit mask and sq == sk: let SDPA apply causal masking.
            context_layer = torch.nn.functional.scaled_dot_product_attention(
                query_layer, key_layer, value_layer,
                is_causal=True,
                dropout_p=dropout_p,
            )
        else:
            if attention_mask is not None:
                # SDPA boolean masks use True = "may attend", the inverse of
                # this file's convention.
                attention_mask = ~attention_mask
            context_layer = torch.nn.functional.scaled_dot_product_attention(
                query_layer, key_layer, value_layer,
                attention_mask,
                dropout_p=dropout_p,
            )
        # [b, np, sq, hn] --> [b, sq, np, hn] --> [b, sq, hp]
        context_layer = context_layer.transpose(1, 2).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.reshape(*new_context_layer_shape)
        return context_layer

def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32),
(1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)

# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2

class FlashAttention2(CoreAttention):
    """Attention backend that dispatches to the ``flash_attn`` kernels.

    Requires the optional flash-attn imports at the top of the module to have
    succeeded; otherwise the names used below are undefined.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(self, query_states, key_states, value_states, attention_mask):
        """Run flash attention and merge heads.

        Args:
            query_states: ``[b, np, sq, hn]``; transposed to ``[b, sq, np, hn]``
                before the kernel call.
            key_states: Same layout as ``query_states`` with length ``sk``.
            value_states: Same layout as ``key_states``.
            attention_mask: 2-D padding mask (non-zero = real token) or
                ``None`` when the batch has no padding.

        Returns:
            Tensor of shape ``[b, sq, hidden_size_per_partition]``.
        """
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)
        batch_size, query_length = query_states.shape[:2]
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.
            # For details, please see the comment in LlamaFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1
        dropout = self.config.attention_dropout if self.training else 0.0
        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=None,
                causal=causal,
            )

            # Scatter the packed result back into the padded [b, sq, ...] layout.
            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=None, causal=causal
            )
        attn_output = attn_output.reshape(batch_size, query_length, self.hidden_size_per_partition).contiguous()
        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Remove padding tokens and build the varlen-kernel metadata.

        Returns:
            ``(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_q, max_seqlen_k))`` with q/k/v flattened over the
            batch and sequence axes.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            # Queries cover the same positions as keys: reuse the key metadata.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim),
                indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # Single-token decoding: every sequence contributes exactly one query.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )

# Maps an attention-implementation name to the class that implements it.
# SelfAttention indexes this with `config._attn_implementation`.
CORE_ATTENTION_CLASSES = {
"eager": CoreAttention,
"sdpa": SdpaAttention,
"flash_attention_2": FlashAttention2
}

class SelfAttention(torch.nn.Module):
    """Parallel self-attention layer abstract class.

    Projects hidden states to query/key/value, applies rotary embeddings,
    maintains the key/value cache and delegates the attention computation to
    the backend selected by ``config._attn_implementation``.

    Per the shape comments in ``forward``, the input is ``[b, sq, h]`` and
    the output has the same size.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(SelfAttention, self).__init__()
        self.layer_number = max(1, layer_number)

        self.projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        self.multi_query_attention = config.multi_query_attention
        self.qkv_hidden_size = 3 * self.projection_size
        if self.multi_query_attention:
            # Multi-query: full-width queries, but only `multi_query_group_num`
            # key heads and value heads.
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
            )
        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
                                         bias=config.add_bias_linear or config.add_qkv_bias,
                                         device=device, **_config_to_kwargs(config)
                                         )

        self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)

        # Output.
        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
                               device=device, **_config_to_kwargs(config)
                               )

    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
        """Return an uninitialised ``[max_len, batch, heads, head_dim]`` buffer.

        ``heads`` is the number of key/value groups under multi-query
        attention and the number of attention heads otherwise.
        """
        if self.multi_query_attention:
            num_attention_heads = self.num_multi_query_groups_per_partition
        else:
            num_attention_heads = self.num_attention_heads_per_partition
        return torch.empty(
            inference_max_sequence_len,
            batch_size,
            num_attention_heads,
            self.hidden_size_per_attention_head,
            dtype=dtype,
            device=device,
        )

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
    ):
        """Run self-attention for one layer.

        Args:
            hidden_states: ``[b, sq, h]``.
            attention_mask: Passed through unchanged to the attention backend.
            rotary_pos_emb: Rotary table for the current positions, or
                ``None`` to skip rotary embedding.
            kv_cache: ``(cached_keys, cached_values)`` from earlier steps, or
                ``None``.
            use_cache: Whether to return an updated cache.

        Returns:
            ``(output, kv_cache)``. With ``use_cache`` and no incoming cache
            the returned cache is a single stacked tensor (keys and values
            concatenated on axis 1); with an incoming cache it is a
            ``(key, value)`` tuple; otherwise it is ``None``.
        """
        # hidden_states: [b, sq, h]

        # =====================
        # Query, Key, and Value
        # =====================

        # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)]
        mixed_x_layer = self.query_key_value(hidden_states)

        if self.multi_query_attention:
            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                [
                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            key_layer = key_layer.view(
                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.view(
                value_layer.size()[:-1]
                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
        else:
            new_tensor_shape = mixed_x_layer.size()[:-1] + \
                               (self.num_attention_heads_per_partition,
                                3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # [b, sq, np, hn] -> [b, np, sq, hn]
        query_layer, key_layer, value_layer = [k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]]

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        # adjust key and value for inference
        if kv_cache is not None:
            cache_k, cache_v = kv_cache
            key_layer = torch.cat((cache_k, key_layer), dim=2)
            value_layer = torch.cat((cache_v, value_layer), dim=2)
        if use_cache:
            if kv_cache is None:
                # First (prefill) step: pack keys and values into one tensor.
                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
                                     dim=1)
            else:
                kv_cache = (key_layer, value_layer)
        else:
            kv_cache = None

        if self.multi_query_attention:
            # Repeat each key/value group so every query head has a partner:
            # [b, groups, sk, hn] -> [b, np, sk, hn].
            key_layer = key_layer.unsqueeze(2)
            key_layer = key_layer.expand(
                -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1
            )
            key_layer = key_layer.contiguous().view(
                key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:]
            )
            value_layer = value_layer.unsqueeze(2)
            value_layer = value_layer.expand(
                -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1
            )
            value_layer = value_layer.contiguous().view(
                value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:]
            )

        # ==================================
        # core attention computation
        # ==================================

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # =================
        # Output projection
        # =================

        output = self.dense(context_layer)

        return output, kv_cache

def _config_to_kwargs(args):
common_kwargs = {
"dtype": args.torch_dtype,
}
return common_kwargs

class MLP(torch.nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to the
    feed-forward hidden dimension (``config.ffn_hidden_size``), perform a
    SwiGLU nonlinear transformation, and project the state back into h
    hidden dimension.
    """

    def __init__(self, config: ChatGLMConfig, device=None):
        super(MLP, self).__init__()

        self.add_bias = config.add_bias_linear

        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
        self.dense_h_to_4h = nn.Linear(
            config.hidden_size,
            config.ffn_hidden_size * 2,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

        def swiglu(x):
            # Split the doubled projection in half: silu(gate) * value.
            x = torch.chunk(x, 2, dim=-1)
            return F.silu(x[0]) * x[1]

        self.activation_func = swiglu

        # Project back to h.
        self.dense_4h_to_h = nn.Linear(
            config.ffn_hidden_size,
            config.hidden_size,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

    def forward(self, hidden_states):
        """Apply up-projection, SwiGLU and down-projection; the last axis size is preserved."""
        # [..., 2 * ffn_hidden_size]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        # [..., ffn_hidden_size]
        intermediate_parallel = self.activation_func(intermediate_parallel)
        # [..., h]
        output = self.dense_4h_to_h(intermediate_parallel)
        return output

class GLMBlock(torch.nn.Module):
    """A single transformer layer.

    Applies pre-norm self-attention and a pre-norm MLP, each followed by
    dropout and a residual connection. The output has the same size as the
    input hidden states.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(GLMBlock, self).__init__()
        self.layer_number = layer_number

        # When True, the residual branch uses the layer-norm output instead
        # of the raw block input.
        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm

        self.fp32_residual_connection = config.fp32_residual_connection

        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
        # Layernorm on the input data.
        self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                             dtype=config.torch_dtype)

        # Self attention.
        self.self_attention = SelfAttention(config, layer_number, device=device)
        self.hidden_dropout = config.hidden_dropout

        # Layernorm on the attention output
        self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                      dtype=config.torch_dtype)

        # MLP
        self.mlp = MLP(config, device=device)

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
    ):
        """Run the layer.

        Args:
            hidden_states: Input activations.
            attention_mask: Forwarded to self-attention.
            rotary_pos_emb: Forwarded to self-attention.
            kv_cache: This layer's cached keys/values, or ``None``.
            use_cache: Whether self-attention should return an updated cache.

        Returns:
            ``(output, kv_cache)`` where ``kv_cache`` is whatever
            self-attention returned.
        """
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output, kv_cache = self.self_attention(
            layernorm_output,
            attention_mask,
            rotary_pos_emb,
            kv_cache=kv_cache,
            use_cache=use_cache
        )

        # Residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
        layernorm_input = residual + layernorm_input

        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)

        # MLP.
        mlp_output = self.mlp(layernorm_output)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
        output = residual + output

        return output, kv_cache

class GLMTransformer(torch.nn.Module):
    """Transformer class: a stack of ``GLMBlock`` layers plus an optional final norm."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(GLMTransformer, self).__init__()

        self.fp32_residual_connection = config.fp32_residual_connection
        self.post_layer_norm = config.post_layer_norm

        # Number of layers.
        self.num_layers = config.num_layers

        # Transformer layers.
        def build_layer(layer_number):
            return GLMBlock(config, layer_number, device=device)

        # Layer numbers passed to the blocks are 1-based.
        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_layer_norm:
            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
            # Final layer norm before output.
            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                 dtype=config.torch_dtype)

        self.gradient_checkpointing = False

    def _get_layer(self, layer_number):
        """Return the layer at 0-based index ``layer_number``."""
        return self.layers[layer_number]

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
            use_cache: Optional[bool] = True,
            output_hidden_states: Optional[bool] = False,
    ):
        """Run all layers in order.

        Args:
            hidden_states: Input activations.
            attention_mask: Forwarded to every layer.
            rotary_pos_emb: Forwarded to every layer.
            kv_caches: Per-layer caches; falsy means "no cache for any layer".
            use_cache: Whether to collect updated caches. Forced to ``False``
                when gradient checkpointing is active during training.
            output_hidden_states: Whether to collect each layer's input plus
                the last layer's output.

        Returns:
            ``(hidden_states, presents, all_hidden_states, all_self_attentions)``.
            ``all_self_attentions`` is always ``None``. ``presents`` is a
            tuple of per-layer caches during token-by-token decoding, or one
            tensor concatenated over layers on axis 0 during prefill.
        """
        if not kv_caches:
            kv_caches = [None for _ in range(self.num_layers)]
        presents = () if use_cache else None
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_self_attentions = None
        all_hidden_states = () if output_hidden_states else None
        for index in range(self.num_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer = self._get_layer(index)
            if self.gradient_checkpointing and self.training:
                layer_ret = torch.utils.checkpoint.checkpoint(
                    layer,
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_caches[index],
                    use_cache,
                    use_reentrant=False
                )
            else:
                layer_ret = layer(
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_cache=kv_caches[index],
                    use_cache=use_cache
                )
            hidden_states, kv_cache = layer_ret
            if use_cache:
                # token by token decoding, use tuple format
                if kv_caches[0] is not None:
                    presents = presents + (kv_cache,)
                # prefilling in decoding, use tensor format to save cuda memory
                else:
                    if len(presents) == 0:
                        presents = kv_cache
                    else:
                        presents = torch.cat((presents, kv_cache.to(presents.device)), dim=0)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Final layer norm.
        if self.post_layer_norm:
            hidden_states = self.final_layernorm(hidden_states)

        return hidden_states, presents, all_hidden_states, all_self_attentions

class ChatGLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLMConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights (intentionally a no-op here)."""
        return

    def get_masks(self, input_ids, past_key_values, padding_mask=None):
        """Build the attention mask handed to the attention backend.

        Args:
            input_ids: ``[batch, seq]`` token ids; only shape and device are used.
            past_key_values: Cached keys/values; the cached length is read
                from ``past_key_values[0][0].shape[2]``.
            padding_mask: Optional ``[batch, total_len]`` mask, non-zero for
                real tokens.

        Returns:
            For ``flash_attention_2``: ``padding_mask`` itself if it contains
            padding, else ``None``. Otherwise a boolean
            ``[batch, 1, seq, past + seq]`` tensor where ``True`` marks
            positions that must not be attended to.
        """
        if self.config._attn_implementation == "flash_attention_2":
            if padding_mask is not None and not padding_mask.all():
                return padding_mask
            return None
        batch_size, seq_length = input_ids.shape
        # Start from a causal (lower-triangular) "may attend" matrix.
        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
        full_attention_mask.tril_()
        past_length = 0
        if past_key_values:
            past_length = past_key_values[0][0].shape[2]
        if past_length:
            # Every new token may attend to all cached positions.
            full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
                                                        device=input_ids.device), full_attention_mask), dim=-1)
        if padding_mask is not None:
            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
        if not past_length and padding_mask is not None:
            # Adds 1 to every entry of rows that belong to padding tokens, so
            # those rows are not entirely masked after the threshold below.
            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
        # Flip convention: True now means "masked out".
        full_attention_mask = (full_attention_mask < 0.5).bool()
        full_attention_mask.unsqueeze_(1)
        return full_attention_mask

    def get_position_ids(self, input_ids, device):
        """Return ``[batch, seq]`` position ids ``0 .. seq - 1`` for every row."""
        batch_size, seq_length = input_ids.shape
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
        return position_ids

class Embedding(torch.nn.Module):
    """Language model embeddings.

    Wraps a single ``nn.Embedding`` of size
    ``[config.padded_vocab_size, config.hidden_size]``.
    """

    def __init__(self, config: ChatGLMConfig, device=None):
        super(Embedding, self).__init__()

        self.hidden_size = config.hidden_size
        # Word embeddings (parallel).
        self.word_embeddings = nn.Embedding(
            config.padded_vocab_size,
            self.hidden_size,
            dtype=config.torch_dtype,
            device=device
        )
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids):
        """Look up embeddings for ``input_ids``; upcast to float32 if configured."""
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings
        # If the input flag for fp32 residual connection is set, convert for float.
        if self.fp32_residual_connection:
            embeddings = embeddings.float()
        return embeddings

class ChatGLMModel(ChatGLMPreTrainedModel):
    """Bare ChatGLM model: embedding, rotary table, transformer stack and output projection.

    Args:
        config: Model configuration.
        device: Optional device forwarded to sub-module constructors.
        empty_init: When True, sub-modules are built with
            ``torch.nn.utils.skip_init``; otherwise they are constructed
            normally via ``default_init``.
    """

    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
        super().__init__(config)
        if empty_init:
            init_method = skip_init
        else:
            init_method = default_init
        init_kwargs = {}
        if device is not None:
            init_kwargs["device"] = device
        self.embedding = init_method(Embedding, config, **init_kwargs)
        self.num_layers = config.num_layers
        self.multi_query_group_num = config.multi_query_group_num
        self.kv_channels = config.kv_channels

        # Rotary positional embeddings
        self.seq_length = config.seq_length
        rotary_dim = (
            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
        )

        # Only half of each head's channels are rotated.
        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
                                              original_impl=config.original_rope,
                                              device=device, dtype=config.torch_dtype)
        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
                                        dtype=config.torch_dtype, **init_kwargs)

    def get_input_embeddings(self):
        """Return the word-embedding module."""
        return self.embedding.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module."""
        self.embedding.word_embeddings = value

    def forward(
            self,
            input_ids,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.BoolTensor] = None,
            full_attention_mask: Optional[torch.BoolTensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        """Embed the inputs, run the transformer stack and return its outputs.

        Args:
            input_ids: ``[batch, seq]`` token ids. Its shape is read even
                when ``inputs_embeds`` is supplied.
            position_ids: Optional indices into the rotary table; defaults to
                the first ``seq`` positions.
            attention_mask: Optional padding mask.
            full_attention_mask: Pre-built attention mask; when ``None`` one
                is built only if padding is present or a cache is combined
                with more than one new token.
            past_key_values: Per-layer key/value caches.
            inputs_embeds: Optional pre-computed embeddings.
            use_cache, output_hidden_states, return_dict: Fall back to the
                corresponding config values when ``None``.
            output_attentions: Accepted but not used.

        Returns:
            ``BaseModelOutputWithPast`` or, when ``return_dict`` is false, a
            tuple of the non-``None`` outputs.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length = input_ids.shape

        if inputs_embeds is None:
            inputs_embeds = self.embedding(input_ids)

        if full_attention_mask is None:
            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)

        # Rotary positional embeddings
        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
        if position_ids is not None:
            rotary_pos_emb = rotary_pos_emb[position_ids]
        else:
            rotary_pos_emb = rotary_pos_emb[None, :seq_length]

        # Run encoder.
        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
        )
        if presents is not None and isinstance(presents, torch.Tensor):
            # Prefill returns one stacked tensor; unpack it into the
            # tuple-of-(key, value) layout: split on axis 0 (layers), then
            # split each piece on its leading axis (key / value).
            presents = presents.split(1, dim=0)
            presents = list(presents)
            presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents]
            presents = [tuple([x.squeeze(0) for x in y]) for y in presents]
            presents = tuple(presents)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
    """Build the generation model around a ``ChatGLMModel`` backbone.

    Args:
        config: Model configuration; ``config.max_length`` is stored as
            ``max_sequence_length``.
        empty_init: Forwarded to ``ChatGLMModel``.
        device: Forwarded to ``ChatGLMModel``.
    """
    super().__init__(config)

    self.max_sequence_length = config.max_length
    self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
    self.config = config

def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
) -> Dict[str, Any]:
    """Advance the generation kwargs by one decoding step.

    Stores the new cache, extends ``attention_mask`` with a column of ones,
    appends ``last_position + 1`` to ``position_ids`` and clears the
    ``is_first_forward`` flag.

    Args:
        outputs: Model output of the step that just ran.
        model_kwargs: Keyword arguments carried between steps; mutated in
            place and returned.
        is_encoder_decoder: Accepted for interface compatibility; unused.

    Returns:
        The updated ``model_kwargs``.
    """
    # update past_key_values
    cache_name, cache = self._extract_past_from_model_output(outputs)
    model_kwargs[cache_name] = cache

    # update attention mask
    if "attention_mask" in model_kwargs:
        attention_mask = model_kwargs["attention_mask"]
        model_kwargs["attention_mask"] = torch.cat(
            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
        )

    # update position ids
    if "position_ids" in model_kwargs:
        position_ids = model_kwargs["position_ids"]
        # clone() so the in-place increment does not touch the stored ids.
        new_position_id = position_ids[..., -1:].clone()
        new_position_id += 1
        model_kwargs["position_ids"] = torch.cat(
            [position_ids, new_position_id], dim=-1
        )

    model_kwargs["is_first_forward"] = False
    return model_kwargs

def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,
past_key_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
is_first_forward: bool = True,
**kwargs
) -> dict:
# only last token for input_ids if past is not None
if position_ids is None:
position_ids = self.get_position_ids(input_ids,
device=input_ids.device)
if not is_first_forward:
if past_key_values is not None:
position_ids = position_ids[..., -1:]
input_ids = input_ids[:, -1:]
return {
"input_ids": input_ids,
"past_key_values": past_key_values,
"position_ids": position_ids,
"attention_mask": attention_mask,
"return_last_logit": True,
"use_cache": use_cache
}

def forward(
self,
input_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
return_last_logit: Optional[bool] = False,
):
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else
self.config.use_return_dict

transformer_outputs = self.transformer(
input_ids=input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)

hidden_states = transformer_outputs[0]
if return_last_logit:
hidden_states = hidden_states[:, -1:]
lm_logits = self.transformer.output_layer(hidden_states)

loss = None
if labels is not None:
lm_logits = lm_logits.to(torch.float32)

# Shift so that tokens < n predict n

shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))

lm_logits = lm_logits.to(hidden_states.dtype)
loss = loss.to(hidden_states.dtype)

if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output

return CausalLMOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

@staticmethod
def _reorder_cache(
past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx:
torch.LongTensor
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
"""
This function is used to re-order the `past_key_values` cache if
[`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match
`past_key_values` with the correct
beam_idx at every generation step.
Output shares the same memory storage as `past`.
"""
return tuple(
(
layer_past[0].index_select(0, beam_idx.to(layer_past[0].device)),
layer_past[1].index_select(0, beam_idx.to(layer_past[1].device)),
)
for layer_past in past
)

@torch.inference_mode()
def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str =
"user",
max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8,
temperature=0.8,
**kwargs):
if history is None:
history = []
gen_kwargs = {"max_length": max_length, "num_beams": num_beams,
"do_sample": do_sample, "top_p": top_p,
"temperature": temperature, **kwargs}
inputs = tokenizer.build_chat_input(query, history=history, role=role)
inputs = inputs.to(self.device)
eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
tokenizer.get_command("<|observation|>")]
outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
response = tokenizer.decode(outputs)
history.append({"role": role, "content": query})
return response, history

Irc Compilation Past Year Questions (Student)
No ratings yet
Irc Compilation Past Year Questions (Student)
387 pages
CSCI-43646364 S25 - Lecture 7
No ratings yet
CSCI-43646364 S25 - Lecture 7
85 pages
Font Transfer 2 Autoencoders
No ratings yet
Font Transfer 2 Autoencoders
78 pages
Lmi Question Bank
No ratings yet
Lmi Question Bank
29 pages
Corner To Corner Vest
No ratings yet
Corner To Corner Vest
15 pages
Ilovepdf Merged
No ratings yet
Ilovepdf Merged
144 pages
AIHT Final Project
No ratings yet
AIHT Final Project
6 pages
Pytorch Tutorial 1 Rev 1
No ratings yet
Pytorch Tutorial 1 Rev 1
48 pages
Transformer Flux
No ratings yet
Transformer Flux
11 pages
Touch Math Autism
No ratings yet
Touch Math Autism
11 pages
Mscanet Model
No ratings yet
Mscanet Model
12 pages
PyTorch Cheat Sheet
No ratings yet
PyTorch Cheat Sheet
2 pages
Hrhrttgggegerge
No ratings yet
Hrhrttgggegerge
3 pages
PINN 1DBurgers
No ratings yet
PINN 1DBurgers
19 pages
Composite Wall Experiment
No ratings yet
Composite Wall Experiment
6 pages
Lab - 11 - Jupyter Notebook
No ratings yet
Lab - 11 - Jupyter Notebook
28 pages
Structure and Modular Programming
No ratings yet
Structure and Modular Programming
17 pages
Diodo 1N4148
No ratings yet
Diodo 1N4148
7 pages
TRIGONOMETRY Merged
100% (1)
TRIGONOMETRY Merged
18 pages
DL Lab (6-10) With Output
No ratings yet
DL Lab (6-10) With Output
5 pages
Fnnexpriment 1
No ratings yet
Fnnexpriment 1
7 pages
Correct The Error
No ratings yet
Correct The Error
11 pages
Electrical System p9000
No ratings yet
Electrical System p9000
50 pages
PIPESIM Presentation SAE - 20181105
No ratings yet
PIPESIM Presentation SAE - 20181105
35 pages
Fixed+Appliances
No ratings yet
Fixed+Appliances
36 pages
About Version Control
No ratings yet
About Version Control
6 pages
Bones of Upper Limb (Anatomy Practical) Mansoura
100% (1)
Bones of Upper Limb (Anatomy Practical) Mansoura
27 pages
Pytorch Exercise
No ratings yet
Pytorch Exercise
5 pages
(Deep Learning Using PyTorch) (Cheatsheet)
No ratings yet
(Deep Learning Using PyTorch) (Cheatsheet)
7 pages
IBest DeepLearning
No ratings yet
IBest DeepLearning
123 pages
Java Advanced OOP
100% (1)
Java Advanced OOP
0 pages
Lab 6
No ratings yet
Lab 6
29 pages
LLM Code Ref
No ratings yet
LLM Code Ref
10 pages
Lesson 39 - Transcript. Build Applications With Glide - Part 2
No ratings yet
Lesson 39 - Transcript. Build Applications With Glide - Part 2
112 pages
Karpathy MinGPT Model
No ratings yet
Karpathy MinGPT Model
7 pages
Pytorch Demo 1749471354
No ratings yet
Pytorch Demo 1749471354
10 pages
Lecture 10-Controllers (PLC) B.
No ratings yet
Lecture 10-Controllers (PLC) B.
28 pages
Debugging Tensorflow Guide
No ratings yet
Debugging Tensorflow Guide
28 pages
Acta Paediatrica
No ratings yet
Acta Paediatrica
13 pages
CS236 Introduction To PyTorch
100% (4)
CS236 Introduction To PyTorch
33 pages
Chapter 1
No ratings yet
Chapter 1
37 pages
Transformers Torch
No ratings yet
Transformers Torch
38 pages
Chap 02 MCQ
No ratings yet
Chap 02 MCQ
14 pages
Chapter 4 Part-1 Sawyer's Book
No ratings yet
Chapter 4 Part-1 Sawyer's Book
11 pages
Coding Attention Mechanisms
No ratings yet
Coding Attention Mechanisms
24 pages
Introduction To Teradata Data Mover Create Your First Job
No ratings yet
Introduction To Teradata Data Mover Create Your First Job
5 pages
A286 Tech Data
No ratings yet
A286 Tech Data
5 pages
C941 Quick-Start Printing With Clear or White Toner 01
No ratings yet
C941 Quick-Start Printing With Clear or White Toner 01
10 pages
Assignment No 4
No ratings yet
Assignment No 4
8 pages
Astro AI
No ratings yet
Astro AI
20 pages
Python Lectures
No ratings yet
Python Lectures
9 pages
Lab 2 NA
No ratings yet
Lab 2 NA
9 pages
AI Lung Imaging Analysis System (ALIAS) (CT) 2021
No ratings yet
AI Lung Imaging Analysis System (ALIAS) (CT) 2021
9 pages
Bahan Metilen Blue
No ratings yet
Bahan Metilen Blue
5 pages
Unit 5 (P2)
No ratings yet
Unit 5 (P2)
9 pages
CVDL Tae 63
No ratings yet
CVDL Tae 63
9 pages
NLP 4
No ratings yet
NLP 4
10 pages
Project Source
No ratings yet
Project Source
21 pages
3d Shapes
No ratings yet
3d Shapes
16 pages
Short MCMC Supplementary
No ratings yet
Short MCMC Supplementary
5 pages
Lec 1
No ratings yet
Lec 1
4 pages
Pytorch Tutorial 1
No ratings yet
Pytorch Tutorial 1
48 pages
Apurv Notes - Foundations of Pytorch
No ratings yet
Apurv Notes - Foundations of Pytorch
15 pages
CS541 HW4
No ratings yet
CS541 HW4
11 pages
Astro AI
No ratings yet
Astro AI
20 pages
Ammonia QP
No ratings yet
Ammonia QP
4 pages
Class Time Table For Std. Viii - A
No ratings yet
Class Time Table For Std. Viii - A
2 pages
PyTorch CrashCourse
No ratings yet
PyTorch CrashCourse
17 pages
Tutorials Sources Beginner Ptcheat
No ratings yet
Tutorials Sources Beginner Ptcheat
7 pages
TXT
No ratings yet
TXT
7 pages
Chapter1 Intro
No ratings yet
Chapter1 Intro
35 pages
How To Set The Device by LAN - Wired Adding - 1730258016871 - U26bc
No ratings yet
How To Set The Device by LAN - Wired Adding - 1730258016871 - U26bc
5 pages
PyTorch CrashCourse
No ratings yet
PyTorch CrashCourse
16 pages
Homework IntroToDL
No ratings yet
Homework IntroToDL
3 pages
Transformers Implementations 1731410319
No ratings yet
Transformers Implementations 1731410319
10 pages
Operating System Exercises - Chapter 5-Exr
No ratings yet
Operating System Exercises - Chapter 5-Exr
2 pages
Decoder-Only Transformer (LLM) For Question Asking: Notebook Structure
No ratings yet
Decoder-Only Transformer (LLM) For Question Asking: Notebook Structure
9 pages
Harvard CS197 Lecture 6 & 7 Notes
No ratings yet
Harvard CS197 Lecture 6 & 7 Notes
18 pages
Main
No ratings yet
Main
1 page
Deep Learning Unit 4
No ratings yet
Deep Learning Unit 4
11 pages
PyTorch Cheat Sheet & Quick Reference
No ratings yet
PyTorch Cheat Sheet & Quick Reference
6 pages
Intro To Pytorch
No ratings yet
Intro To Pytorch
12 pages
GPT2 From Scratch in PyTorch
No ratings yet
GPT2 From Scratch in PyTorch
13 pages
Assignment3 AL
No ratings yet
Assignment3 AL
23 pages
Building Deep Learning Models Using The PyTorch Library
No ratings yet
Building Deep Learning Models Using The PyTorch Library
4 pages
Pytorch Neural Networks Guide 1717173717
No ratings yet
Pytorch Neural Networks Guide 1717173717
17 pages
Ilovepdf Merged
No ratings yet
Ilovepdf Merged
10 pages
PyTorch Crash Course 1713016363
No ratings yet
PyTorch Crash Course 1713016363
15 pages
02 - Lecture Note - TensorFlow Ops
No ratings yet
02 - Lecture Note - TensorFlow Ops
21 pages
Pytorch Tutorial For Beginner: Department of Computer Science & Engineering University of Washington
No ratings yet
Pytorch Tutorial For Beginner: Department of Computer Science & Engineering University of Washington
11 pages
Python Advanced Programming: The Guide to Learn Python Programming. Reference with Exercises and Samples About Dynamical Programming, Multithreading, Multiprocessing, Debugging, Testing and More
From Everand
Python Advanced Programming: The Guide to Learn Python Programming. Reference with Exercises and Samples About Dynamical Programming, Multithreading, Multiprocessing, Debugging, Testing and More
Marcus Richards
No ratings yet
Advanced C Concepts and Programming: First Edition
From Everand
Advanced C Concepts and Programming: First Edition
Gayatri
3/5 (1)
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet

Modeling Chatglm

Uploaded by

Modeling Chatglm

Uploaded by

""" PyTorch ChatGLM model.

from transformers.modeling_outputs import (

from .configuration_chatglm import ChatGLMConfig

# flags required to enable jit fusion kernels

if sys.platform != 'darwin' and not is_torch_npu_available():

def default_init(cls, *args, **kwargs):

# Create position indexes `[0, 1, ..., seq_len - 1]`

# Calculate the product of position index and $\theta_i$

cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

# this is to mimic the behaviour of complex32, else we will get different

def forward(self, max_seq_len, offset=0):

def forward(self, hidden_states: torch.Tensor):

return (self.weight * hidden_states).to(input_dtype)

projection_size = config.kv_channels * config.num_attention_heads

# Per attention head and per partition values.

def forward(self, query_layer, key_layer, value_layer, attention_mask):

# [b, np, sq, hn] -> [b * np, sq, hn]

# preallocting input tensor: [b * np, sq, sk]

# Raw attention scores. [b * np, sq, sk]

# change view to [b, np, sq, sk]

# attention scores and attention mask [b, np, sq, sk]

# query layer shape: [b * np, sq, hn]

dropout_p=self.config.attention_dropout if self.training else 0.0)

dropout_p=self.config.attention_dropout if self.training else 0.0)

# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2

def forward(self, query_states, key_states, value_states, attention_mask):

cu_seqlens_q, cu_seqlens_k = cu_seq_lens

attn_output = pad_input(attn_output_unpad, indices_q, batch_size,

def _upad_input(self, query_layer, key_layer, value_layer, attention_mask,

def __init__(self, config: ChatGLMConfig, layer_number, device=None):

self.projection_size = config.kv_channels * config.num_attention_heads

# Per attention head and per partition values.

def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None,

# Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)]

# [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn]

# [b, sq, np, hn] -> [b, np, sq, hn]

# apply relative positional encoding (rotary embedding)

# adjust key and value for inference

context_layer = self.core_attention(query_layer, key_layer, value_layer,

return output, kv_cache

def __init__(self, config: ChatGLMConfig, device=None):

# Project to 4h. If using swiglu double the output width, see

def __init__(self, config: ChatGLMConfig, layer_number, device=None):

LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm

# Layernorm on the attention output

# Layer norm at the beginning of the transformer layer.

# Layer norm post the self attention.

# Second residual connection.

output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout,

return output, kv_cache

def __init__(self, config: ChatGLMConfig, device=None):

self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in

def _get_layer(self, layer_number):

# Final layer norm.

return hidden_states, presents, all_hidden_states, all_self_attentions

def _init_weights(self, module: nn.Module):

def get_masks(self, input_ids, past_key_values, padding_mask=None):

def get_position_ids(self, input_ids, device):

def __init__(self, config: ChatGLMConfig, device=None):

def forward(self, input_ids):

# Rotary positional embeddings

def set_input_embeddings(self, value):

batch_size, seq_length = input_ids.shape

# Rotary positional embeddings

# update attention mask

# update position ids

# Shift so that tokens < n predict n

You might also like

def init(self, config: ChatGLMConfig, layer_number, device=None):

def init(self, config: ChatGLMConfig, device=None):

def init(self, config: ChatGLMConfig, layer_number, device=None):

def init(self, config: ChatGLMConfig, device=None):

def init(self, config: ChatGLMConfig, device=None):