CUDA programming confusion: cudaMemcpy

Under the hood, is the copy performed by the CPU or by the GPU? Copying data in and out takes so long, so how does the GPU actually speed up computation? What is the principle, and how should a program be designed?

cudaMemcpy is the result of the CPU and the GPU working together.
GPU programming follows a fixed, formulaic pattern of three steps:
1. Before starting the GPU computation, use cudaMemcpy to copy the input data from host (CPU) memory into GPU memory;
2. launch the GPU computation (the kernel);
3. after the computation finishes, use cudaMemcpy to copy the results from GPU memory back into host memory.
Since the data exchange goes over the PCI-E interface, as long as the data volume is not too large and the bus bandwidth is sufficient, the transfer time will not be significant. A minimal timed sketch of the pattern follows.
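
For reference, here is a minimal sketch of the same three-step pattern with the host-to-device copy timed through CUDA events, so the actual transfer cost can be measured on a given machine. The element count, kernel, and variable names are illustrative assumptions, not part of the original question.

// Minimal sketch: copy in, launch a kernel, copy out; time the H2D transfer.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void AddOne(float *d, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        d[i] += 1.0f;
}

int main(void)
{
    const int n = 1 << 20;                      // assumed problem size
    size_t bytes = n * sizeof(float);
    float *h = (float*)malloc(bytes);
    for (int i = 0; i < n; i++)
        h[i] = (float)i;

    float *d = 0;
    cudaMalloc((void**)&d, bytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Step 1: host -> device copy, timed with CUDA events
    cudaEventRecord(start, 0);
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("H2D copy: %.3f ms (%.2f GB/s)\n", ms, bytes / ms / 1e6);

    // Step 2: launch the kernel
    AddOne<<<(n + 255) / 256, 256>>>(d, n);

    // Step 3: device -> host copy of the result
    cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d);
    free(h);
    return 0;
}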
GPU acceleration is mainly suited to workloads with high parallelism, high computation volume, and a low memory footprint.
1. High parallelism: makes full use of the GPU's many processors; depending on the device, thousands of threads can run in parallel;
2. High computation volume: offsets the cost of the cudaMemcpy transfers;
3. Low memory footprint: use on-chip (local/shared) memory as much as possible and reduce accesses to global device memory, thereby reducing memory-access latency (see the shared-memory sketch after this list).
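
As an illustration of point 3, here is a hedged sketch of staging data in on-chip shared memory so each input element is read from global memory only once and then reused on-chip. The block-wise sum kernel is an assumed example and is not part of the sample program below.

// Sketch: each block stages its slice of the input in shared memory and
// reduces it there, so global memory is touched once per element.
#define THREADS 64

__global__ void BlockSum(const unsigned int *in, unsigned int *blockSums, int n)
{
    __shared__ unsigned int cache[THREADS];         // on-chip, low latency

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    cache[threadIdx.x] = (idx < n) ? in[idx] : 0;   // one global read per thread
    __syncthreads();

    // Tree reduction carried out entirely in shared memory
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if (threadIdx.x < stride)
            cache[threadIdx.x] += cache[threadIdx.x + stride];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        blockSums[blockIdx.x] = cache[0];           // one global write per block
}

// Hypothetical launch: BlockSum<<<numBlocks, THREADS>>>(d_in, d_blockSums, n);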
Below is a complete example:
CudaSample.h
#ifndef HEADER_THREADTEST_H
#define HEADER_THREADTEST_H
#ifdef __cplusplus
extern "C" {
#endif

//CUDA device property data structure
struct DevicdProp
{
    int deviceNo;
    char name[256];
    size_t totalGlobalMem;
    size_t sharedMemPerBlock;
    int regsPerBlock;
    int warpSize;
    size_t memPitch;
    int maxThreadsPerBlock;
    int maxThreadsDim[3];
    int maxGridSize[3];
    size_t totalConstMem;
    int major;
    int minor;
    int clockRate;
    size_t textureAlignment;
    int deviceOverlap;
    int multiProcessorCount;
};

// Threads per block (64, i.e. twice a multiprocessor's thread count)
#define ThreadNumPerBlock 64

#define BlockNum 4

#define TotalThread (BlockNum * ThreadNumPerBlock)

// Self-defined structure holding the input arrays B, C and the output array A
struct MYSTRUCT
{
    unsigned int A[TotalThread], B[TotalThread], C[TotalThread];
};

//Init CUDA
bool InitCUDA(DevicdProp *lpDevicdProp);

//Cuda Sample
void CudaSample(MYSTRUCT* lpHostStruct);

#ifdef __cplusplus
}
#endif
#endif

CudaCuda.cu
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

#include "CudaSample.h"

/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/
cudaDeviceProp deviceProp;

#if __DEVICE_EMULATION__

bool InitCUDA(DevicdProp *lpDevicdProp){return true;}

#else
bool InitCUDA(DevicdProp *lpDevicdProp)
{
int count = 0;
int i = 0;

//Set no CUDA device is selected
lpDevicdProp->deviceNo = -1;

cudaGetDeviceCount(&count);

if(count == 0)
{
    fprintf(stderr, "There is no device.\n");
    return false;
}
else
    printf("\n\nThere are maybe %d devices supporting CUDA\n", count);

for(i = 0; i < count; i++)
{
    if(cudaGetDeviceProperties(&deviceProp, i) != cudaSuccess)
    {
        printf("\nDevice %d: Property cannot be get.\n", i);
        continue;
    }

    // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
    if(deviceProp.major == 9999 && deviceProp.minor == 9999)
    {
        printf("\nDevice %d: Do not supporting CUDA.\n", i);
        continue;
    }

    if(lpDevicdProp->deviceNo == -1)
    {
        lpDevicdProp->deviceNo = i;                     
        memcpy(lpDevicdProp->name, deviceProp.name, 256);
        lpDevicdProp->totalGlobalMem        = deviceProp.totalGlobalMem; 
        lpDevicdProp->sharedMemPerBlock     = deviceProp.sharedMemPerBlock; 
        lpDevicdProp->regsPerBlock          = deviceProp.regsPerBlock;
        lpDevicdProp->warpSize              = deviceProp.warpSize;
        lpDevicdProp->memPitch              = deviceProp.memPitch;
        lpDevicdProp->maxThreadsPerBlock    = deviceProp.maxThreadsPerBlock; 
        lpDevicdProp->maxThreadsDim[0]      = deviceProp.maxThreadsDim[0]; 
        lpDevicdProp->maxThreadsDim[1]      = deviceProp.maxThreadsDim[1]; 
        lpDevicdProp->maxThreadsDim[2]      = deviceProp.maxThreadsDim[2]; 
        lpDevicdProp->maxGridSize[0]        = deviceProp.maxGridSize[0]; 
        lpDevicdProp->maxGridSize[1]        = deviceProp.maxGridSize[1]; 
        lpDevicdProp->maxGridSize[2]        = deviceProp.maxGridSize[2]; 
        lpDevicdProp->totalConstMem         = deviceProp.totalConstMem; 
        lpDevicdProp->major                 = deviceProp.major;
        lpDevicdProp->minor                 = deviceProp.minor;
        lpDevicdProp->clockRate             = deviceProp.clockRate;
        lpDevicdProp->textureAlignment      = deviceProp.textureAlignment; 
        lpDevicdProp->deviceOverlap         = deviceProp.deviceOverlap; 
        lpDevicdProp->multiProcessorCount   = deviceProp.multiProcessorCount;
    }

    printf("\nDevice %d: \"%s\"\n", i, deviceProp.name);
    printf("  CUDA Capability Major revision number:         %d\n", deviceProp.major);
    printf("  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);
    printf("  Total amount of global memory:                 %u bytes\n", (unsigned int)(deviceProp.totalGlobalMem));

#if CUDART_VERSION >= 2000
    printf("  Number of multiprocessors:                     %d\n", deviceProp.multiProcessorCount);
    printf("  Number of cores:                               %d\n", 8 * deviceProp.multiProcessorCount);
#endif
    printf("  Total amount of constant memory:               %u bytes\n", (unsigned int)(deviceProp.totalConstMem));
    printf("  Total amount of shared memory per block:       %u bytes\n", (unsigned int)(deviceProp.sharedMemPerBlock));
    printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n", deviceProp.warpSize);
    printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
    printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
           deviceProp.maxThreadsDim[0],
           deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
           deviceProp.maxGridSize[0],
           deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %u bytes\n", (unsigned int)(deviceProp.memPitch));
    printf("  Texture alignment:                             %u bytes\n", (unsigned int)(deviceProp.textureAlignment));
    printf("  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
    printf("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
    printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
    printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
    printf("  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
           "Default (multiple host threads can use this device simultaneously)" :
           deviceProp.computeMode == cudaComputeModeExclusive ?
           "Exclusive (only one host thread at a time can use this device)" :
           deviceProp.computeMode == cudaComputeModeProhibited ?
           "Prohibited (no host thread can use this device)" :
           "Unknown");
#endif
}

i = lpDevicdProp->deviceNo;                     
if(i == -1)
{
    fprintf(stderr, "There is no device supporting CUDA.\n");
    return false;
}
cudaSetDevice(i);
printf("CUDA Device No. used = %d.\n", i);

printf("CUDA initialized.\n");
return true;

}

/************************************************************************/
/* Example                                                              */
/************************************************************************/
__global__ static void CudaCalc(MYSTRUCT* lpMyStruct)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard against an unexpected launch configuration
    if(gridDim.x != BlockNum)
        return;
    if(blockDim.x != ThreadNumPerBlock)
        return;

    // One addition per thread
    if(idx < TotalThread)
        lpMyStruct->A[idx] = lpMyStruct->B[idx] + lpMyStruct->C[idx];
}

void CudaSample(MYSTRUCT* lpHostStruct)
{
    MYSTRUCT *lpDeviceStruct = 0;

    // Step 1: allocate device memory and copy the input from host to device
    cudaMalloc((void**) &lpDeviceStruct, sizeof(MYSTRUCT));
    cudaMemcpy(lpDeviceStruct, lpHostStruct, sizeof(MYSTRUCT), cudaMemcpyHostToDevice);

    // Step 2: launch the kernel
    CudaCalc<<<BlockNum, ThreadNumPerBlock, 0>>>(lpDeviceStruct);

    // CUT_CHECK_ERROR("Kernel execution failed\n");

    // Wait for the kernel to finish before reading back the result
    cudaThreadSynchronize();

    // Step 3: copy the result from device back to host and release device memory
    cudaMemcpy(lpHostStruct, lpDeviceStruct, sizeof(MYSTRUCT), cudaMemcpyDeviceToHost);

    cudaFree(lpDeviceStruct);
}

#endif

CudaSample.cpp

#include <stdio.h>
#include <stdlib.h>

#include "CudaSample.h"

int main(int argc, char* argv[])
{
    DevicdProp DevicdProps;
    MYSTRUCT   MyStruct;
    unsigned int I, J, N;

    if(!InitCUDA(&DevicdProps))
        return 1;

    // Prepare the input data on the host
    for(I = 0; I < TotalThread; I++)
    {
        MyStruct.B[I] = I;
        MyStruct.C[I] = I;
    }

    // Copy in, compute on the GPU, copy the result back
    CudaSample(&MyStruct);

    // Read back the results (A[I] now holds B[I] + C[I])
    for(I = 0; I < TotalThread; I++)
        J = MyStruct.A[I];

    return 0;
}

This example covers essentially all the necessary steps of GPU programming with CUDA (still missing: choosing the thread count according to the particular GPU, and using multiple CPU threads to drive several GPUs in parallel; a sketch of the former follows at the end).
The computation it performs is equivalent to:
for(I = 0; I < TotalThread; I++)
{
    MyStruct.B[I] = I;
    MyStruct.C[I] = I;
    MyStruct.A[I] = MyStruct.B[I] + MyStruct.C[I];
}
Of these, MyStruct.A[I] = MyStruct.B[I] + MyStruct.C[I] is computed on the GPU, one element per thread, for a total of 256 threads (4 blocks of 64 threads each).
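
For the part noted above as still missing (choosing a thread count suited to the particular GPU), a possible sketch is to derive the launch configuration from the DevicdProp structure that InitCUDA already fills in. The helper name and the heuristic are assumptions, not part of the original example.

// Sketch: pick a block size from the queried device limits instead of a fixed macro.
#include "CudaSample.h"

int ChooseThreadsPerBlock(const DevicdProp *p)
{
    // Use a small multiple of the warp size, capped by the device maximum.
    int threads = p->warpSize * 2;                  // e.g. 64 when warpSize is 32
    if (threads > p->maxThreadsPerBlock)
        threads = p->maxThreadsPerBlock;
    return threads;
}

// Hypothetical usage after InitCUDA(&DevicdProps) succeeds:
//   int threadsPerBlock = ChooseThreadsPerBlock(&DevicdProps);
//   int blocks = (totalElements + threadsPerBlock - 1) / threadsPerBlock;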
