性能优化流程
优化后的性能(来自PyTorch网站)
1.我们使用与教程中使用的CIFAR10数据集具有相同属性和行为的假数据集。可以在这里找到此更改的动机。
2.我们初始化torch.profiler.schedule,将warmup(预热)标志设置为3,将repeat(重复)标志设置为1。我们发现,预热步骤数量的轻微增加提高了性能分析结果的稳定性。
import numpy as np
import torch
import torch.nn
import torch.optim
import torch.profiler
import torch.utils.data
import torchvision.datasets
import torchvision.models
import torchvision.transforms as T
from torchvision.datasets.vision import VisionDataset
from PIL import Image
class FakeCIFAR(VisionDataset):
    """Synthetic stand-in for CIFAR-10: random uint8 32x32 RGB images with
    random integer labels.

    Mimics torchvision's CIFAR10 item interface (PIL image + int label)
    without any disk or network I/O, so data loading cost is negligible.
    """

    def __init__(self, transform=None, num_samples=10000, num_classes=10):
        """
        Args:
            transform: optional callable applied to each PIL image.
            num_samples: number of fake samples (previously hard-coded 10000).
            num_classes: number of label classes (previously hard-coded 10);
                must fit in uint8, i.e. <= 256.
        """
        super().__init__(root=None, transform=transform)
        # HWC uint8 layout, matching how CIFAR10 stores images.
        self.data = np.random.randint(
            low=0, high=256, size=(num_samples, 32, 32, 3), dtype=np.uint8)
        self.targets = np.random.randint(
            low=0, high=num_classes, size=(num_samples,), dtype=np.uint8).tolist()

    def __getitem__(self, index):
        """Return (image, label); image is a PIL image, transformed if a
        transform was supplied."""
        img, target = self.data[index], self.targets[index]
        img = Image.fromarray(img)
        if self.transform is not None:
            img = self.transform(img)
        return img, target

    def __len__(self) -> int:
        return len(self.data)
# Input pipeline: upscale the 32x32 fake images to 224x224 (the size the
# tutorial feeds ResNet-18) and normalize each RGB channel to [-1, 1].
transform = T.Compose(
[T.Resize(224),
T.ToTensor(),
T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
train_set = FakeCIFAR(transform=transform)
# Baseline loader: single-process loading, no pinned memory.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32,shuffle=True)
device = torch.device("cuda:0")
# Pretrained ResNet-18 and the loss module both placed on the first CUDA device.
model = torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device)
criterion = torch.nn.CrossEntropyLoss().cuda(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
model.train()
# train step
def train(data):
    """Run one optimization step: forward pass, loss, backward, update.

    `data` is a (images, labels) batch from the DataLoader; both tensors are
    copied to the global CUDA device before the forward pass.
    """
    batch_x = data[0].to(device=device)
    batch_y = data[1].to(device=device)
    preds = model(batch_x)
    batch_loss = criterion(preds, batch_y)
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
# training loop wrapped with profiler object
with torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=1, warmup=4, active=3, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    # Run exactly enough batches to cover one full profiler cycle:
    # (wait + warmup + active) * repeat.
    step_budget = (1 + 4 + 3) * 1
    for step_idx, batch in enumerate(train_loader):
        if step_idx >= step_budget:
            break
        train(batch)
        # Advance the profiler schedule once per training step.
        prof.step()
TensorBoard Profiler Overview 选项卡中显示的基线性能结果
TensorBoard Profiler 跟踪视图选项卡中显示的基线性能结果
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32,
shuffle=True, num_workers=8)
TensorBoard Profiler Overview选项卡中加载多进程数据的结果
Trace View选项卡中的多进程数据加载结果
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32,
shuffle=True, num_workers=8, pin_memory=True)
inputs, labels = data[0].to(device=device, non_blocking=True), \
data[1].to(device=device, non_blocking=True)
TensorBoard Profiler Overview 选项卡中 Memory Pinning 的结果
TensorBoard Profiler中的内存视图
在TensorBoard Profiler Overview选项卡中增加批处理大小的结果
优化4:减少主机到设备的拷贝
您可能注意到了前面结果的饼图中表示主机到设备数据副本的红色部分。解决这种瓶颈最直接的方法是看看我们是否可以减少每个批量中的数据量。注意,在图像输入的情况下,我们将数据类型从8位无符号整数转换为32位浮点数,并在执行数据复制之前应用规范化。在下面的代码块中,我们建议对输入数据流进行更改,其中我们延迟数据类型转换和规范化,直到数据在GPU上:
# maintain the image input as an 8-bit uint8 tensor
transform = T.Compose(
[T.Resize(224),
T.PILToTensor()
])
train_set = FakeCIFAR(transform=transform)
# Optimized loader: large batch, multi-worker loading, pinned host memory
# so host-to-device copies can be asynchronous.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=1024, shuffle=True, num_workers=8, pin_memory=True)
device = torch.device("cuda:0")
# Graph-compile the GPU model; fullgraph=True raises on graph breaks instead
# of silently falling back to eager mode.
model = torch.compile(torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device), fullgraph=True)
criterion = torch.nn.CrossEntropyLoss().cuda(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
model.train()
# train step
def train(data):
    """One training step with asynchronous H2D copies.

    Dtype conversion and normalization are deferred until the data is already
    on the GPU, so only uint8 bytes cross the host-to-device boundary.
    """
    raw_images = data[0].to(device=device, non_blocking=True)
    gpu_labels = data[1].to(device=device, non_blocking=True)
    # convert to float32 and normalize
    normalized = (raw_images.to(torch.float32) / 255. - 0.5) / 0.5
    logits = model(normalized)
    batch_loss = criterion(logits, gpu_labels)
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
在TensorBoard Profiler Overview选项卡中减少CPU到GPU拷贝的结果
# set_to_none=True frees the gradient tensors instead of zeroing them in
# place, skipping one memset kernel per parameter.
optimizer.zero_grad(set_to_none=True)
TensorBoard Profiler中的内核视图
def train(data):
    """Run one mixed-precision (AMP) training step.

    The forward pass and loss are computed under float16 autocast; the
    backward pass and optimizer update run outside the autocast region.
    """
    inputs, labels = data[0].to(device=device, non_blocking=True), \
        data[1].to(device=device, non_blocking=True)
    # Deferred on-GPU conversion: uint8 -> float32, normalized to [-1, 1].
    inputs = (inputs.to(torch.float32) / 255. - 0.5) / 0.5
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    # Note - torch.cuda.amp.GradScaler() may be required to prevent float16
    # gradient underflow.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
从 TensorBoard Profiler 中的内核视图进行 AMP 优化的 Tensor 核心利用率
在TensorBoard Profiler Overview选项卡中AMP优化的结果
model = torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device)
model = torch.compile(model)
TensorBoard Profiler Overview选项卡中编译图形的结果
TensorBoard Profiler Trace View选项卡中编译图形的结果
性能结果摘要