import os
import os.path as osp

import torch
import torch.distributed as dist


def main():
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl')
    if not osp.exists(cfg.respth):
        os.makedirs(cfg.respth)
    setup_logger(f'{cfg.model_type}-{cfg.dataset.lower()}-train', cfg.respth)
    train()


if __name__ == "__main__":
    main()
This is distributed-training code. What do I need to change to train on a single GPU?
Setting local_rank = 0 just pins you to the GPU with id 0, but changing only this part probably won't get you far; there is still a DistributedDataParallel wrapper elsewhere.
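For reference, a minimal sketch of what main() could look like for single-GPU training: keep device 0, drop the process-group setup, and leave everything else as it is. This assumes cfg, setup_logger, and train are the project's own objects and that train() itself does not call any torch.distributed APIs.

import os
import os.path as osp

import torch


def main():
    # Single GPU: pin everything to device 0, no process group needed.
    torch.cuda.set_device(0)
    if not osp.exists(cfg.respth):
        os.makedirs(cfg.respth)
    setup_logger(f'{cfg.model_type}-{cfg.dataset.lower()}-train', cfg.respth)
    train()


if __name__ == "__main__":
    main()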
import os

import torch.nn as nn


def set_model_dist(net):
    local_rank = int(os.environ['LOCAL_RANK'])
    net = nn.parallel.DistributedDataParallel(
        net,
        device_ids=[local_rank, ],
        # find_unused_parameters=True,
        output_device=local_rank
    )
    return net
You mean this part? How should I change it?
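One possible way, as a sketch: for single-GPU training the DistributedDataParallel wrapper can simply be skipped and the model moved onto the GPU directly. This assumes the rest of the training script only touches the model through set_model_dist(net) and that the dataloader does not use a DistributedSampler (which would otherwise also need replacing).

def set_model_dist(net):
    # Single GPU: no DistributedDataParallel wrapper, just move the model
    # onto the current CUDA device (device 0 after torch.cuda.set_device(0)).
    return net.cuda()

Keeping the same function name means the call sites in the training loop do not have to change.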