# Vision Transformer (ViT)：将Transformer带入计算机视觉的革命性实验（代…） - IT评测·应用市场-qidao123.com

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
# Hyperparameters
image_size = 28  # MNIST images are 28x28 pixels
patch_size = 7  # each patch is 7x7 pixels
num_patches = pow(image_size // patch_size, 2)  # 4 patches per side -> 16 patches
patch_dim = 1 * patch_size ** 2  # flattened patch length; 1 input channel (grayscale)
dim = 64  # token embedding width
depth = 6  # number of Transformer encoder layers
heads = 8  # number of attention heads
mlp_dim = 128  # hidden width of the per-layer MLP
num_classes = 10  # MNIST has ten digit classes
dropout = 0.1  # dropout probability
# Device selection: use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Patch embedding module
class PatchEmbedding(nn.Module):
    """Turn a batch of images into a sequence of embedded patch tokens.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches, each patch is flattened and linearly projected to ``dim``
    features, a learnable CLS token is prepended, a learnable position
    embedding is added, and dropout is applied.

    Args:
        image_size: Side length used to compute the number of patches,
            ``(image_size // patch_size) ** 2``.
        patch_size: Side length of each square patch.
        patch_dim: Length of one flattened patch (input size of the linear
            projection); expected to equal ``channels * patch_size ** 2``.
        dim: Embedding width of every output token.
        dropout: Dropout probability applied to the final token sequence.
    """

    def __init__(self, image_size, patch_size, patch_dim, dim, dropout):
        super().__init__()
        # Keep the patch size on the instance so forward() uses the value this
        # module was built with instead of a module-level global.
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        # Linear projection: flattened patch -> dim features
        self.proj = nn.Linear(patch_dim, dim)
        # Learnable position embedding, one row per patch plus one for the CLS token
        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, dim))
        # Learnable CLS token
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Embed ``x`` of shape (B, C, H, W) into (B, num_patches + 1, dim)."""
        B = x.shape[0]  # batch size
        p = self.patch_size
        # Slide a p x p window with stride p over height (dim 2) and width (dim 3)
        x = x.unfold(2, p, p).unfold(3, p, p)  # (B, C, H/p, W/p, p, p)
        # Move the patch-grid axes in front of the channel axis so that each
        # patch's pixels are contiguous before flattening.
        x = x.permute(0, 2, 3, 1, 4, 5).contiguous()  # (B, H/p, W/p, C, p, p)
        x = x.view(B, self.num_patches, -1)  # (B, num_patches, patch_dim)
        # Linear projection
        x = self.proj(x)  # (B, num_patches, dim)
        # Prepend the CLS token (expanded across the batch without copying)
        cls_tokens = self.cls_token.expand(B, -1, -1)  # (B, 1, dim)
        x = torch.cat((cls_tokens, x), dim=1)  # (B, num_patches + 1, dim)
        # Add the position embedding (broadcast over the batch)
        x = x + self.pos_embedding
        x = self.dropout(x)
        return x
# Multi-head self-attention module
class MultiHeadAttention(nn.Module):
    """Scaled dot-product self-attention over a token sequence.

    Args:
        dim: Feature width of the input and output tokens.
        heads: Number of attention heads; must be positive and divide ``dim``.
        dropout: Dropout probability, applied to the attention weights and to
            the projected output.

    Raises:
        ValueError: If ``heads`` is not positive or ``dim`` is not divisible
            by ``heads``.
    """

    def __init__(self, dim, heads, dropout):
        super().__init__()
        # Fail at construction time: forward() splits the feature axis into
        # `heads` equal slices and would otherwise fail with a reshape error.
        if heads <= 0 or dim % heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by a positive number of heads ({heads})"
            )
        self.heads = heads
        self.scale = (dim // heads) ** -0.5  # 1 / sqrt(per-head width)
        self.qkv = nn.Linear(dim, dim * 3, bias=False)  # joint query/key/value projection
        self.dropout = nn.Dropout(dropout)
        self.proj = nn.Linear(dim, dim)  # output projection

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, dim); shape is preserved."""
        B, N, C = x.shape
        # Project once, then split into (3, B, heads, N, C // heads)
        qkv = self.qkv(x).reshape(B, N, 3, self.heads, C // self.heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # each (B, heads, N, C // heads)
        # Attention weights
        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, heads, N, N)
        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        # Weighted sum of values, then merge the heads back into one feature axis
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)  # (B, N, dim)
        x = self.proj(x)
        x = self.dropout(x)
        return x
# Transformer encoder layer
class TransformerEncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention then an MLP, each with a residual.

    Args:
        dim: Token feature width.
        heads: Number of attention heads passed to ``MultiHeadAttention``.
        mlp_dim: Hidden width of the two-layer MLP.
        dropout: Dropout probability used in attention and in the MLP.
    """

    def __init__(self, dim, heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = MultiHeadAttention(dim, heads, dropout)
        self.norm2 = nn.LayerNorm(dim)
        # Expand to mlp_dim, apply GELU, project back to dim; dropout after each linear stage.
        feed_forward = [
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return ``x`` after the attention and MLP sub-layers; shape is unchanged."""
        attended = self.attn(self.norm1(x))
        x = x + attended  # residual connection around attention
        transformed = self.mlp(self.norm2(x))
        return x + transformed  # residual connection around the MLP
# Vision Transformer model
class ViT(nn.Module):
    """Vision Transformer classifier.

    Images are embedded into patch tokens, passed through ``depth`` encoder
    layers, layer-normalised, and the CLS token (position 0) is mapped to
    ``num_classes`` logits.
    """

    def __init__(self, image_size, patch_size, patch_dim, dim, depth, heads, mlp_dim, num_classes, dropout):
        super().__init__()
        self.patch_embed = PatchEmbedding(image_size, patch_size, patch_dim, dim, dropout)
        encoder_blocks = []
        for _ in range(depth):
            encoder_blocks.append(TransformerEncoderLayer(dim, heads, mlp_dim, dropout))
        self.layers = nn.ModuleList(encoder_blocks)
        self.norm = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for the image batch ``x``."""
        tokens = self.patch_embed(x)
        for block in self.layers:
            tokens = block(tokens)
        tokens = self.norm(tokens)
        # The CLS token sits at sequence position 0 and summarises the image.
        return self.head(tokens[:, 0])
# Data loading
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        # Normalise with the MNIST mean and standard deviation
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)
# Build the training split first, then the test split, with identical settings.
train_dataset, test_dataset = (
    torchvision.datasets.MNIST(root='./data', train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)
# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
# Create the model, the loss function and the optimizer
_vit_kwargs = {
    "image_size": image_size,
    "patch_size": patch_size,
    "patch_dim": patch_dim,
    "dim": dim,
    "depth": depth,
    "heads": heads,
    "mlp_dim": mlp_dim,
    "num_classes": num_classes,
    "dropout": dropout,
}
model = ViT(**_vit_kwargs)
model = model.to(device)  # Module.to moves the parameters and returns the module
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Training function
def train(model, train_loader, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Batches are moved to the device that holds the model's parameters, so the
    function does not depend on any module-level device variable.

    Args:
        model: Module to optimise; it is switched to training mode.
        train_loader: Iterable yielding ``(images, labels)`` tensor pairs.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        epoch: Zero-based epoch index, used only in the progress message.

    Returns:
        The mean per-batch loss over the pass, or ``0.0`` if the loader
        yielded no batches.
    """
    model.train()
    # Follow the model's own device; a parameter-free model leaves batches where they are.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device
    running_loss = 0.0  # loss accumulated since the last progress message
    epoch_loss = 0.0  # loss accumulated over the whole pass
    num_batches = 0
    for i, (images, labels) in enumerate(train_loader):
        if target_device is not None:
            images, labels = images.to(target_device), labels.to(target_device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1
        # Report the average loss of every block of 100 batches
        if i % 100 == 99:
            print(f'[Epoch {epoch+1}, Batch {i+1}] Loss: {running_loss / 100:.3f}')
            running_loss = 0.0
    return epoch_loss / num_batches if num_batches else 0.0
# Evaluation function
def test(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print/return the accuracy.

    Batches are moved to the device that holds the model's parameters, so the
    function does not depend on any module-level device variable.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        test_loader: Iterable yielding ``(images, labels)`` tensor pairs.

    Returns:
        Accuracy as a percentage in ``[0, 100]``; ``0.0`` when the loader
        yields no samples (instead of dividing by zero).
    """
    model.eval()
    # Follow the model's own device; a parameter-free model leaves batches where they are.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device
    correct = 0
    total = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for images, labels in test_loader:
            if target_device is not None:
                images, labels = images.to(target_device), labels.to(target_device)
            outputs = model(images)
            predicted = outputs.argmax(dim=1)  # index of the largest logit per sample
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total if total else 0.0
    print(f'Test Accuracy: {accuracy:.2f}%')
    return accuracy


# The name starts with "test", so pytest would try to collect this helper as a
# test case when the module is imported from a test file; opt out explicitly.
test.__test__ = False
# Main training loop: one training pass followed by one evaluation per epoch
num_epochs = 10
for epoch in range(num_epochs):
    train(
        model=model,
        train_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        epoch=epoch,
    )
    test(model=model, test_loader=test_loader)

# 复制代码