```python
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)  # (batch, seq_len, d_model)
        return self.dropout(x)
```
This positional encoding is not learnable; it is fixed at initialization and uses a sine/cosine scheme, defined by the functions below.
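These are the standard sinusoidal formulas from "Attention Is All You Need", where $pos$ is the position in the sequence and $i$ indexes the embedding dimension:

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right),\qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)
$$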
In CLIP, by contrast, the positional encoding is a learnable parameter initialized to zeros. Why? My take is that the spatial relationships between image pixels are far more complex than the positional relationships in a text sequence, so a fixed sinusoidal encoding probably cannot fully capture those dependencies.
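As a minimal sketch of what such a learnable positional embedding looks like (class and attribute names such as `CLIPEmbedding` and `position_embedding` follow this codebase's style but should be treated as illustrative):

```python
import torch
from torch import nn

class CLIPEmbedding(nn.Module):
    def __init__(self, n_vocab: int, n_embd: int, n_token: int):
        super().__init__()
        self.token_embedding = nn.Embedding(n_vocab, n_embd)
        # Learnable positional parameter: one vector per position, initialized to zeros
        # and trained together with the rest of the model.
        self.position_embedding = nn.Parameter(torch.zeros((n_token, n_embd)))

    def forward(self, tokens):
        # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
        x = self.token_embedding(tokens)
        # Broadcast-add the positional parameter to every sequence in the batch
        x = x + self.position_embedding
        return x
```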
After that comes iterating over all of `self.layers = nn.ModuleList([CLIPLayer(12, 768) for i in range(12)])`. `CLIPLayer` is defined as follows.
First, this is self-attention (self-attention allows the model to relate words to each other). Here Q (query), K (key) and V (value) all come from the same matrix: the input, i.e. the normalized x. So x is fed directly into `self.in_proj = nn.Linear(d_embed, 3 * d_embed, bias=in_proj_bias)`, which projects it into three matrices in one pass, and the result is then split back apart with `q, k, v = self.in_proj(x).chunk(3, dim=-1)`. The q, k, v obtained here are the Q', K', V' produced by the projections W_Q, W_K, W_V.
Then the number of heads is set through the `n_heads` argument, and the dimension of each head is `self.d_head = d_embed // n_heads`, which gives `interim_shape = (batch_size, sequence_length, self.n_heads, self.d_head)`.
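Collecting those pieces, the constructor of the attention module looks roughly like this (a sketch based on the snippets quoted above; the name `SelfAttention` and the `out_proj` output projection $W_O$ are illustrative):

```python
from torch import nn

class SelfAttention(nn.Module):
    def __init__(self, n_heads: int, d_embed: int, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        # One linear layer produces Q, K and V in a single pass (3 * d_embed outputs)
        self.in_proj = nn.Linear(d_embed, 3 * d_embed, bias=in_proj_bias)
        # Output projection applied after the heads are merged back together
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        # Dimension handled by each individual head
        self.d_head = d_embed // n_heads
```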
Next, `view` is used to split q, k and v into the multiple heads (since the tensors are stored contiguously, they can be reshaped directly).
Note that a `transpose` is also needed to reorder the dimensions, because we want each head to see the entire sequence, with its own representation of every token:
```python
# (Batch_Size, Seq_Len, Dim) -> (Batch_Size, Seq_Len, H, Dim / H) -> (Batch_Size, H, Seq_Len, Dim / H)
q = q.view(interim_shape).transpose(1, 2)
k = k.view(interim_shape).transpose(1, 2)
v = v.view(interim_shape).transpose(1, 2)
```
Then comes the basic attention formula.
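This is scaled dot-product attention, where $d_k$ is the per-head dimension (`self.d_head` here):

$$
\text{Attention}(Q, K, V) = \text{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V
$$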
First, compute $QK^{T}$:
```python
# (Batch_Size, H, Seq_Len, Dim / H) @ (Batch_Size, H, Dim / H, Seq_Len) -> (Batch_Size, H, Seq_Len, Seq_Len)
weight = q @ k.transpose(-1, -2)
```
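The remaining steps follow the formula directly: scale by $\sqrt{d_{\text{head}}}$, optionally apply a causal mask, take the softmax, and multiply by V. Gathered into a self-contained helper for clarity (a sketch, not the repository's exact method):

```python
import math
import torch
import torch.nn.functional as F

def scaled_dot_product(q, k, v, d_head, causal_mask=False):
    # q, k, v: (Batch_Size, H, Seq_Len, Dim / H)
    # (B, H, S, D/H) @ (B, H, D/H, S) -> (B, H, S, S)
    weight = q @ k.transpose(-1, -2)
    # Scale by sqrt(d_head), as in softmax(Q K^T / sqrt(d_k)) V
    weight /= math.sqrt(d_head)
    if causal_mask:
        # Mask the upper triangle so a token cannot attend to tokens after it
        mask = torch.ones_like(weight, dtype=torch.bool).triu(1)
        weight = weight.masked_fill(mask, -torch.inf)
    # Softmax over the last dimension turns the scores into attention weights
    weight = F.softmax(weight, dim=-1)
    # (B, H, S, S) @ (B, H, S, D/H) -> (B, H, S, D/H)
    return weight @ v
```

In the actual `forward`, the result is then transposed back to (Batch_Size, Seq_Len, H, Dim / H), reshaped to (Batch_Size, Seq_Len, Dim), and passed through `self.out_proj`.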
So at this point, chaining the two residual blocks together is exactly what makes up a CLIPLayer. Twelve CLIPLayers used in series form `self.layers = nn.ModuleList([CLIPLayer(12, 768) for i in range(12)])`, with 12 heads and an embedding size of 768, as sketched below.
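A sketch of such a layer, in the pre-norm style with CLIP's QuickGELU activation; it assumes the `SelfAttention` module sketched above, whose `forward` accepts a `causal_mask` flag:

```python
import torch
from torch import nn

class CLIPLayer(nn.Module):
    def __init__(self, n_head: int, n_embd: int):
        super().__init__()
        # First residual block: pre-norm + self-attention
        self.layernorm_1 = nn.LayerNorm(n_embd)
        self.attention = SelfAttention(n_head, n_embd)
        # Second residual block: pre-norm + feed-forward with 4x expansion
        self.layernorm_2 = nn.LayerNorm(n_embd)
        self.linear_1 = nn.Linear(n_embd, 4 * n_embd)
        self.linear_2 = nn.Linear(4 * n_embd, n_embd)

    def forward(self, x):
        # Self-attention with a residual connection (causal mask, as in the CLIP text encoder)
        residue = x
        x = self.layernorm_1(x)
        x = self.attention(x, causal_mask=True)
        x += residue

        # Feed-forward with a residual connection
        residue = x
        x = self.layernorm_2(x)
        x = self.linear_1(x)
        x = x * torch.sigmoid(1.702 * x)  # QuickGELU activation
        x = self.linear_2(x)
        x += residue
        return x
```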
So, back in the definition of the CLIP class, this is what has been implemented so far: the encoder layers are applied in series, much like the Transformer's encoder, i.e. 12 encoders are used one after another, each built from a self-attention residual block and a feed-forward residual block; finally, a layer normalization is applied to the output.
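Put together, a sketch of the CLIP class; it reuses the `CLIPEmbedding` and `CLIPLayer` sketches above, and the vocabulary size 49408 and maximum sequence length 77 are the standard CLIP tokenizer values:

```python
import torch
from torch import nn

class CLIP(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = CLIPEmbedding(49408, 768, 77)
        # 12 encoder layers, each with 12 attention heads and an embedding size of 768
        self.layers = nn.ModuleList([CLIPLayer(12, 768) for i in range(12)])
        self.layernorm = nn.LayerNorm(768)

    def forward(self, tokens: torch.LongTensor) -> torch.FloatTensor:
        tokens = tokens.type(torch.long)
        # (Batch_Size, Seq_Len) -> (Batch_Size, Seq_Len, Dim)
        state = self.embedding(tokens)
        # Apply encoder layers similar to the Transformer's encoder.
        for layer in self.layers:
            state = layer(state)
        # Final layer normalization: (Batch_Size, Seq_Len, Dim)
        output = self.layernorm(state)
        return output
```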
# Params "beta_start" and "beta_end" taken from: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/configs/stable-diffusion/v1-inference.yaml#L5C8-L5C8
# For the naming conventions, refer to the DDPM paper (https://arxiv.org/pdf/2006.11239.pdf)
# Params "beta_start" and "beta_end" taken from: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/configs/stable-diffusion/v1-inference.yaml#L5C8-L5C8
# For the naming conventions, refer to the DDPM paper (https://arxiv.org/pdf/2006.11239.pdf)
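For context, here is a sketch of how those two parameters typically become the noise schedule. The values 0.00085 and 0.0120 come from the linked v1-inference.yaml; the linear-in-$\sqrt{\beta}$ ("scaled linear") construction and the attribute names are assumptions matching the CompVis configuration:

```python
import torch

class DDPMSampler:
    def __init__(self, generator: torch.Generator, num_training_steps: int = 1000,
                 beta_start: float = 0.00085, beta_end: float = 0.0120):
        # "Scaled linear" schedule (assumption): linear in sqrt(beta), then squared back
        self.betas = torch.linspace(beta_start ** 0.5, beta_end ** 0.5,
                                    num_training_steps, dtype=torch.float32) ** 2
        self.alphas = 1.0 - self.betas
        # alpha_bar_t = prod(alpha_1 .. alpha_t), used by the closed-form noising formula in the DDPM paper
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.generator = generator
        self.num_train_timesteps = num_training_steps
        # At inference the timesteps are visited in reverse order: T-1, ..., 1, 0
        self.timesteps = torch.arange(num_training_steps - 1, -1, -1)
```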
Inside the UNet's attention block, the feed-forward part uses GeGLU:

```python
# GeGLU as implemented in the original code: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/attention.py#L37C10-L37C10
# (Batch_Size, Height * Width, Features) -> two tensors of shape (Batch_Size, Height * Width, Features * 4)
x, gate = self.linear_geglu_1(x).chunk(2, dim=-1)
# Element-wise product: (Batch_Size, Height * Width, Features * 4) * (Batch_Size, Height * Width, Features * 4) -> (Batch_Size, Height * Width, Features * 4)
x = x * F.gelu(gate)
```
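In formula form, GeGLU gates one linear projection of the input with the GELU of another (the idea from "GLU Variants Improve Transformer"):

$$
\text{GeGLU}(x) = (xW + b) \otimes \text{GELU}(xV + c)
$$

In the code, both projections come out of the single `linear_geglu_1` layer and are separated by `chunk`; the half named `gate` is the one passed through GELU, and $\otimes$ is the element-wise product.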
Finally, everything comes together in the inference script:

```python
models = model_loader.preload_models_from_standard_weights(model_file, DEVICE)

## TEXT TO IMAGE
prompt = "A cat stretching on the floor, highly detailed, ultra sharp, cinematic, 8k resolution"
uncond_prompt = ""  # Also known as negative prompt
do_cfg = True
cfg_scale = 8  # min: 1, max: 14

## IMAGE TO IMAGE
input_image = None
# Comment to disable image to image
image_path = "../images/yibo.jpg"
# input_image = Image.open(image_path)
# Higher values mean more noise will be added to the input image, so the result will be further from the input image.
# Lower values mean less noise is added to the input image, so the output will be closer to the input image.
strength = 0.9

## SAMPLER
sampler = "ddpm"
num_inference_steps = 50
seed = 42

output_image = pipeline.generate(
    prompt=prompt,
    uncond_prompt=uncond_prompt,
    input_image=input_image,
    strength=strength,
    do_cfg=do_cfg,
    cfg_scale=cfg_scale,
    sampler_name=sampler,
    n_inference_steps=num_inference_steps,
    seed=seed,
    models=models,
    device=DEVICE,
    idle_device="cpu",
    tokenizer=tokenizer,
)

# Combine the input image and the output image into a single image.
Image.fromarray(output_image)
```