
A minimal DeiT (Data-efficient Image Transformer) implementation in PaddlePaddle: patch embedding with class and distillation tokens, a pre-norm Transformer encoder, and two classification heads whose outputs are averaged at inference.

import paddle
import paddle.nn as nn


class Identity(nn.Layer):
    # Placeholder layer (e.g., for drop path); returns its input unchanged.
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x


class MLP(nn.Layer):
    def __init__(self, embed_dim, mlp_ratio, dropout=0.):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
        self.fc2 = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


# [4, 3, 224, 224] -> [4, 196 + 2 (tokens), 768 (embed_dim)]
class PatchEmbedding(nn.Layer):
    def __init__(self, img_size=224, in_channels=3, patch_size=16, embed_dim=768, dropout=0.):
        super().__init__()
        n_patches = (img_size // patch_size) ** 2  # 14 * 14 = 196 patches
        self.patch_embedding = nn.Conv2D(in_channels=in_channels,
                                         out_channels=embed_dim,
                                         kernel_size=patch_size,
                                         stride=patch_size,
                                         bias_attr=False)
        self.dropout = nn.Dropout(dropout)
        self.cls_token = paddle.create_parameter(
            shape=[1, 1, embed_dim],
            dtype="float32",
            default_initializer=nn.initializer.Constant(0))
        self.distill_token = paddle.create_parameter(
            shape=[1, 1, embed_dim],
            dtype="float32",
            default_initializer=nn.initializer.TruncatedNormal(std=.02))
        self.position_embeddings = paddle.create_parameter(
            shape=[1, n_patches + 2, embed_dim],
            dtype="float32",
            default_initializer=nn.initializer.TruncatedNormal(std=0.02))

    def forward(self, x):
        # x: [N, C, H, W]
        cls_tokens = self.cls_token.expand([x.shape[0], -1, -1])          # [N, 1, embed_dim]
        distill_tokens = self.distill_token.expand([x.shape[0], -1, -1])  # [N, 1, embed_dim]
        x = self.patch_embedding(x)  # [N, embed_dim, H', W'] = [4, 768, 14, 14]
        x = x.flatten(2)             # [N, embed_dim, num_patches] = [4, 768, 196]
        x = x.transpose([0, 2, 1])   # [N, num_patches, embed_dim] = [4, 196, 768]
        x = paddle.concat([cls_tokens, distill_tokens, x], axis=1)  # [4, 198, 768]
        out = x + self.position_embeddings
        out = self.dropout(out)
        return out


class Attention(nn.Layer):
    def __init__(self, embed_dim, num_heads, qkv_bias=True, dropout=0., attention_dropout=0.):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = int(embed_dim / num_heads)  # cast to int: the division returns a float
        self.all_head_dim = self.head_dim * self.num_heads
        self.scales = self.head_dim ** -0.5  # scale by the per-head dimension, not all_head_dim
        self.qkv = nn.Linear(embed_dim, self.all_head_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention_dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(axis=-1)

    def transpose_multihead(self, x):
        # x: [N, num_patches, all_head_dim] -> [N, num_heads, num_patches, head_dim]
        new_shape = x.shape[:-1] + [self.num_heads, self.head_dim]  # e.g. [4, 198, 4, 192]
        x = x.reshape(new_shape)
        x = x.transpose([0, 2, 1, 3])
        return x

    def forward(self, x):
        # x: [N, num_patches, embed_dim]
        qkv = self.qkv(x).chunk(3, axis=-1)
        q, k, v = map(self.transpose_multihead, qkv)
        attn = paddle.matmul(q, k, transpose_y=True)
        attn = attn * self.scales
        attn = self.softmax(attn)
        attn = self.attention_dropout(attn)
        out = paddle.matmul(attn, v)
        # Merge the heads back: [N, num_heads, num_patches, head_dim] -> [N, num_patches, all_head_dim]
        out = out.transpose([0, 2, 1, 3])
        out = out.reshape(out.shape[:-2] + [self.all_head_dim])
        out = self.proj(out)
        out = self.dropout(out)
        return out


class EncoderLayer(nn.Layer):
    # Pre-norm: LayerNorm before each sub-layer, with residual connections.
    def __init__(self, embed_dim=768, num_heads=4, qkv_bias=True, mlp_ratio=4.0, dropout=0., attention_dropout=0.):
        super().__init__()
        self.attn_norm = nn.LayerNorm(embed_dim)
        self.attn = Attention(embed_dim, num_heads)
        self.mlp_norm = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, mlp_ratio)

    def forward(self, x):
        h = x
        x = self.attn_norm(x)
        x = self.attn(x)
        x = h + x

        h = x
        x = self.mlp_norm(x)
        x = self.mlp(x)
        x = x + h
        return x


class Encoder(nn.Layer):
    def __init__(self, embed_dim, depth):
        super().__init__()
        layer_list = []
        for i in range(depth):
            layer_list.append(EncoderLayer(embed_dim))
        self.layers = nn.LayerList(layer_list)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)
        # x: [N, num_patches + 2, embed_dim]; positions 0 and 1 are the cls and distill tokens
        return x[:, 0], x[:, 1]


class DeiT(nn.Layer):
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_channels=3,
                 num_classes=1000,
                 embed_dim=768,
                 depth=3,
                 num_heads=8,
                 mlp_ratio=4,
                 qkv_bias=True,
                 dropout=0.,
                 attention_dropout=0.,
                 droppath=0.):
        super().__init__()
        # [4, 3, 224, 224] -> [4, 196 + 2 (tokens), 768 (embed_dim)]
        self.patch_embedding = PatchEmbedding(img_size=img_size,
                                              in_channels=in_channels,
                                              patch_size=patch_size,
                                              embed_dim=embed_dim,
                                              dropout=dropout)
        self.encoder = Encoder(embed_dim=embed_dim, depth=depth)
        self.head = nn.Linear(embed_dim, num_classes)
        self.head_distill = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embedding(x)
        x, x_distill = self.encoder(x)
        x = self.head(x)
        x_distill = self.head_distill(x_distill)
        if self.training:
            return x, x_distill
        else:
            return (x + x_distill) / 2


def main():
    model = DeiT()
    # print(model)
    paddle.summary(model, (4, 3, 224, 224))


if __name__ == "__main__":
    main()
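Since forward() returns both the class-token logits and the distillation-token logits while self.training is true, a DeiT-style training step pairs a standard cross-entropy loss on the class head with a distillation loss against a frozen teacher. Below is a minimal hard-label distillation sketch, not part of the original code: the teacher here is an untrained resnet18 stand-in (in practice you would load a pretrained teacher), and images/labels are random placeholders.

import paddle
import paddle.nn.functional as F
from paddle.vision.models import resnet18

model = DeiT(num_classes=1000)        # the DeiT class defined above
teacher = resnet18(num_classes=1000)  # stand-in teacher; use a pretrained model in practice
teacher.eval()                        # teacher stays frozen during distillation
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=model.parameters())

def train_step(images, labels):
    model.train()
    logits_cls, logits_distill = model(images)  # two heads in training mode

    with paddle.no_grad():  # hard labels from the teacher
        teacher_labels = teacher(images).argmax(axis=-1)

    # DeiT-style hard distillation: average the two cross-entropy terms
    loss = 0.5 * F.cross_entropy(logits_cls, labels) \
         + 0.5 * F.cross_entropy(logits_distill, teacher_labels)

    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    return loss

images = paddle.randn([4, 3, 224, 224])  # placeholder batch
labels = paddle.randint(0, 1000, [4])
print(train_step(images, labels).item())

At inference time, calling model.eval() switches forward() to return the average of the two heads, so the trained model can be used like an ordinary classifier.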