问题描述:同一段代码,使用单显卡时没有问题,使用多张显卡时出现问题:
Traceback (most recent call last):
File "trainer.py", line 370, in <module>
trainer.train()
File "trainer.py", line 263, in train
self.x_tilde = self.G(self.z)
File "G:\anaconda\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "G:\anaconda\lib\site-packages\torch\nn\parallel\data_parallel.py", line 152, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "G:\anaconda\lib\site-packages\torch\nn\parallel\data_parallel.py", line 162, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "G:\anaconda\lib\site-packages\torch\nn\parallel\parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "G:\anaconda\lib\site-packages\torch\_utils.py", line 394, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
File "G:\anaconda\lib\site-packages\torch\nn\parallel\parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "G:\anaconda\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "E:\Work_place\pggan-pytorch-master的副本\network.py", line 181, in forward
x = self.model(x.view(x.size(0), -1, 1, 1))
File "G:\anaconda\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "G:\anaconda\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "G:\anaconda\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "G:\anaconda\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "G:\anaconda\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "E:\Work_place\pggan-pytorch-master的副本\custom_layers.py", line 113, in forward
x = self.conv(x.mul(self.scale))
File "G:\anaconda\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "G:\anaconda\lib\site-packages\torch\nn\modules\conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File "G:\anaconda\lib\site-packages\torch\nn\modules\conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
双显卡的型号为:
0号显卡GTX1660,
1号显卡GTX1060
两张显卡都是6G版本。
不知道这是哪里出问题了,求各路大神指点。
同问
Traceback (most recent call last):
File "main.py", line 292, in <module>
main()
File "main.py", line 91, in main
train_op(net, args)
File "main.py", line 157, in train_op
loss = net.deterministic_forward(data)
File "/mnt/lustre/dengandong/self-disentangle/model/network.py", line 63, in deterministic_forward
self.z_c, self.gap, self.reconstructed_gap = self.dCE(self.true) # ae
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/mnt/lustre/dengandong/self-disentangle/model/autoencoder/ae_3dcnn.py", line 64, in forward
content_code = self.encoder(reduce_frames)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/container.py", line 117, in forward
input = module(input)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/mnt/lustre/dengandong/self-disentangle/model/__init__.py", line 93, in forward
x = conv(x)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/mnt/lustre/dengandong/self-disentangle/model/__init__.py", line 30, in forward
x = self.conv(x)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 419, in forward
return self._conv_forward(input, self.weight)
File "/mnt/lustre/dengandong/anaconda3/envs/video_torch/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 416, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
解决了么?求问