How to implement a fast convolutional network

I implemented a simple convolutional network with NumPy, but training is roughly 10x slower than TensorFlow. Is there anything else I can optimize?

import numpy as np
import tensorflow as tf
import time

"""
将输入按卷积核拆分为N个卷积核大小的张量

Parameters
----------
A: 输入数据,形状为(样本数,宽,高,通道)
dim:当前卷积层的维度信息

Return
------
(输出宽 * 输出高, 样本数, 核宽 * 核高, 输入通道数)
"""
def split(A, dim):
    res = []
    num_datas = A.shape[0]
    width_A = A.shape[1]
    height_A = A.shape[2]
    channel_A = A.shape[3]
    width_kernel = dim['kernel'][0]
    height_kernel = dim['kernel'][1]
    stride_w = dim['strides'][0]
    stride_h = dim['strides'][1]
    for k in range(0, width_A - width_kernel + 1, stride_w):
        for j in range(0, height_A-height_kernel+1, stride_h):
            res.append(A[:, k:k+width_kernel, j:j+height_kernel, :].reshape(num_datas, width_kernel*height_kernel, channel_A))
    return np.array(res)

def split_pool(A, dim):
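    """Like split(), but return boolean masks marking, in each window, the positions equal to the window maximum (used in the max-pooling backward pass)."""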
    res = []
    width_A = A.shape[1]
    height_A = A.shape[2]
    width_kernel = dim['kernel'][0]
    height_kernel = dim['kernel'][1]
    stride_w = dim['strides'][0]
    stride_h = dim['strides'][1]
    for k in range(0, width_A - width_kernel + 1, stride_w):
        for j in range(0, height_A-height_kernel+1, stride_h):
            window = A[:, k:k + width_kernel, j:j + height_kernel, :]
            mask = (window == np.max(window, axis=(1, 2))[:, np.newaxis, np.newaxis, :])
            res.append(mask)
    return np.array(res)

def relu(Z):
    return np.maximum(0, Z)


def softmax(X):
    X_exp = np.exp(X)
    return X_exp / np.sum(X_exp, axis=0)


def loss_cross_entropy(Y_pre, Y_label):
    return -np.sum(Y_label * np.nan_to_num(np.log(Y_pre), nan=0))/Y_pre.shape[1]


class Conv2d:
    """
    Parameters
    ----------
    dims: per-layer specs: type (0: input, 1: convolution, 2: pooling, 3: fully connected), kernel (kernel shape), strides (stride)
    """
    def __init__(self, dims):
        self.dims = dims
        self.rng = np.random.default_rng(seed=2)
        self.outs = []
        self.lrate = .1
        for i in range(len(dims) - 1):
            dim = dims[i + 1]
            dim_bef = dims[i]
            if dim['type'] == 1:
                width_bef = dim_bef['size_out'][0]
                height_bef = dim_bef['size_out'][1]
                channel_bef = dim_bef['size_out'][2]
                width_kernel = dim['kernel'][0]
                height_kernel = dim['kernel'][1]
                channel_out = dim['kernel'][2]
                width_out = width_bef - width_kernel + 1
                height_out = height_bef - height_kernel + 1
                dim['size_out'] = (width_out, height_out, channel_out)
                dim['weight'] = tf.keras.initializers.RandomNormal(0, .03, seed=2)(shape=(width_kernel, height_kernel, channel_bef, channel_out)).numpy()
                # dim['weight'] = self.rng.normal(0, .03, (width_kernel, height_kernel, channel_bef, channel_out))
                dim['bias'] = np.zeros(channel_out)
            if dim['type'] == 2:
                width_bef = dim_bef['size_out'][0]
                height_bef = dim_bef['size_out'][1]
                channel_bef = dim_bef['size_out'][2]
                width_kernel = dim['kernel'][0]
                height_kernel = dim['kernel'][1]
                width_out = int((width_bef - width_kernel)/width_kernel) + 1
                height_out = int((height_bef - height_kernel)/height_kernel) + 1
                dim['size_out'] = (width_out, height_out, channel_bef)
            if dim['type'] == 3:
                size_out = dim['size_out']
                if dim_bef['type'] != 3:
                    width_bef = dim_bef['size_out'][0]
                    height_bef = dim_bef['size_out'][1]
                    channel_bef = dim_bef['size_out'][2]
                    size_in = width_bef * height_bef * channel_bef
                else:
                    size_in = dim_bef['size_out']
                dim['weight'] = tf.keras.initializers.RandomNormal(0, .03, seed=2)(shape=(size_in, size_out)).numpy().T
                #dim['weight'] = self.rng.normal(0, .03, (size_out, size_in))
                dim['bias'] = np.zeros(shape=(size_out, 1))
    def forward(self, X, Y):
        start_time = time.time()*1000
        cost_split = 0
        self.outs = [(X, X)]
        A = X
        for i in range(len(self.dims) - 1):
            dim = self.dims[i + 1]
            dim_bef = self.dims[i]
            type = dim['type']
            num_datas = A.shape[0]
            if type == 1:
                weight = dim['weight']
                bias = dim['bias']
                width_out = dim['size_out'][0]
                height_out = dim['size_out'][1]
                channel_out = dim['size_out'][2]
                # (out_w * out_h, num_samples, kernel_w * kernel_h, in_channels)
                start_split = time.time() * 1000
                A_split = split(A, dim)
                cost_split += time.time() * 1000 - start_split
                A_split = A_split.reshape(-1, A_split.shape[-2] * A_split.shape[-1])
                # (kernel_w * kernel_h * in_channels, out_channels)
                W = weight.reshape(-1, weight.shape[-1])
                # (out_w * out_h * num_samples, out_channels)
                Z = A_split.dot(W).reshape(width_out, height_out, num_datas, channel_out).transpose(2, 0, 1, 3) + bias
                A = relu(Z)
            if type == 2:
                width_out = dim['size_out'][0]
                height_out = dim['size_out'][1]
                channel_out = dim['size_out'][2]
                start_split = time.time() * 1000
                # (out_w * out_h, num_samples, kernel_w * kernel_h, in_channels)
                A_split = split(A, dim)
                cost_split += time.time() * 1000 - start_split
                A = np.max(A_split, axis=2).reshape(width_out, height_out, num_datas, channel_out).transpose(2, 0, 1, 3)
            if type == 3:
                weight = dim['weight']
                bias = dim['bias']
                if dim_bef['type'] != 3:
                    A = A.reshape(A.shape[0], A.shape[1] * A.shape[2] * A.shape[3]).T
                Z = weight.dot(A) + bias
                if i == len(self.dims) - 2:
                    A = softmax(Z)
                else:
                    A = relu(Z)
            self.outs.append((Z, A))
        loss = loss_cross_entropy(A, Y)
        end_time = time.time() * 1000
        cost_time = end_time - start_time
        return A, loss, int(cost_time), int(cost_split)
    def backward(self, Y):
        start_time = time.time() * 1000
        num_datas = Y.shape[1]
        Z, A = self.outs[-1]
        dZ = (A - Y)/Y.shape[1]
        for i in range(len(self.dims) - 1):
            dim = self.dims[-1-i]
            dim_bef = self.dims[-2-i]
            Z_bef, A_bef = self.outs[-2 - i]
            Z, A = self.outs[-1-i]
            if dim['type'] == 3:
                if i > 0:
                    dZ = dA * (Z > 0)
                if A_bef.ndim != 2:
                    A_bef = A_bef.reshape(A_bef.shape[0], -1).T
                dW = dZ.dot(A_bef.T)
                dB = np.sum(dZ, axis=1)[..., np.newaxis]
                dA = dim['weight'].T.dot(dZ)
                dim['weight'] -= self.lrate * dW
                dim['bias'] -= self.lrate * dB
            if dim['type'] == 1:
                width_kernel = dim['kernel'][0]
                height_kernel = dim['kernel'][1]
                width_out = dim['size_out'][0]
                height_out = dim['size_out'][1]
                channel_out = dim['size_out'][2]
                width_in = dim_bef['size_out'][0]
                height_in = dim_bef['size_out'][1]
                channel_in = dim_bef['size_out'][2]
                weight = dim['weight']
                if dA.ndim == 2:
                    dA = dA.T.reshape(num_datas, width_out, height_out, channel_out)
                dZ = dA * (Z > 0)
                dB = np.sum(dZ, axis=(0, 1, 2))
                # (out_w * out_h, num_samples, kernel_w * kernel_h, in_channels)
                A_split = split(A_bef, dim)
                # (kernel_w * kernel_h * in_channels, out_w * out_h * num_samples)
                A_split = A_split.reshape(-1, A_split.shape[-2] * A_split.shape[-1]).T
                # (out_w * out_h * num_samples, out_channels)
                dZ_flatten = dZ.transpose(1, 2, 0, 3).reshape(-1, dZ.shape[-1])
                # (kernel_w * kernel_h * in_channels, out_channels)
                dW = A_split.dot(dZ_flatten).reshape(width_kernel, height_kernel, channel_in, channel_out)
                width_pad = width_kernel - 1
                height_pad = height_kernel - 1
                dZ_pad = np.pad(dZ, ((0, 0), (width_pad,width_pad), (height_pad,height_pad), (0,0)), mode='constant')
                # (kernel_w * kernel_h * out_channels, in_channels)
                weight_rot = np.rot90(weight, k=2).transpose(0, 1, 3, 2).reshape(-1, weight.shape[2])
                # (in_w * in_h, num_samples, kernel_w * kernel_h, out_channels)
                dZ_split = split(dZ_pad, dim)
                dZ_split = dZ_split.reshape(-1, dZ_split.shape[-2] * dZ_split.shape[-1])
                # (in_w * in_h * num_samples, in_channels)
                dA = dZ_split.dot(weight_rot).reshape(width_in, height_in, num_datas, channel_in).transpose(2, 0, 1, 3)
                dim['weight'] -= self.lrate * dW
                dim['bias'] -= self.lrate * dB
            if dim['type'] == 2:
                width_out = dim['size_out'][0]
                height_out = dim['size_out'][1]
                channel_out = dim['size_out'][2]
                width_kernel = dim['kernel'][0]
                height_kernel = dim['kernel'][1]
                width_in = dim_bef['size_out'][0]
                height_in = dim_bef['size_out'][1]
                channel_in = dim_bef['size_out'][2]
                dZ = dA
                if dZ.ndim == 2:
                    dZ = dZ.T.reshape(num_datas, width_out, height_out, channel_out)
                # (out_w * out_h, num_samples, kernel_w, kernel_h, in_channels)
                A_split = split_pool(A_bef, dim)
                # (num_samples, out_w, out_h, kernel_w, kernel_h, in_channels)
                A_split = A_split.transpose(1, 0, 2, 3, 4).reshape(num_datas, width_out, height_out, width_kernel, height_kernel, channel_in)
                dA = A_split * dZ[:, :, :, np.newaxis, np.newaxis, :]
                dA = dA.transpose(0, 1, 3, 2, 4, 5).reshape(num_datas, width_out*width_kernel, height_out*height_kernel, channel_in)
                width_pad = width_in - width_out*width_kernel
                height_pad = height_in - height_out*height_kernel
                dA = np.pad(dA, ((0, 0), (0, width_pad), (0, height_pad), (0, 0)), mode='constant')
        end_time = time.time() * 1000
        cost_time = end_time - start_time
        return int(cost_time)

 

TensorFlow takes advantage of the CPU's AVX instruction set, multi-core parallelism, and GPU acceleration; a hand-rolled NumPy implementation obviously cannot use these, so being slower is only natural.

Thanks for sharing the code. Here are some suggestions for optimizing it:

  1. Use NumPy's vectorized operations to avoid Python-level loops. For example, np.dot handles matrix multiplication, and a convolution can be reduced to extracting all windows once followed by a single matrix multiplication (NumPy itself has no conv2d; scipy.signal.convolve2d exists for plain 2-D convolution). A sketch is shown right after this list.

  2. Compile the computation into a static graph, which makes it faster. After compilation, TensorFlow executes the graph in C++, which is faster than Python.

  3. Avoid excessive memory allocation. Pre-allocate arrays and reuse them to cut down on the number of allocations. If a buffer must grow dynamically, consider numpy.resize rather than appending to a Python list.

  4. Write efficient code. Avoid slow constructs such as Python lists and for loops; rely on NumPy arrays, slicing, broadcasting, and vectorized operations instead.
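
As a concrete illustration of points 1, 3, and 4: the split() function in the question builds a Python list of window slices in a double loop and then converts it with np.array. Below is a minimal loop-free sketch, assuming NumPy >= 1.20 for np.lib.stride_tricks.sliding_window_view and keeping the same (out_w * out_h, num_samples, kernel_w * kernel_h, in_channels) layout; split_vectorized is an illustrative name, not part of the original code.

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

def split_vectorized(A, dim):
    # illustrative sketch: extract every kernel-sized window in one call, no Python loops
    kw, kh = dim['kernel'][0], dim['kernel'][1]
    sw, sh = dim['strides'][0], dim['strides'][1]
    n, c = A.shape[0], A.shape[3]
    # windows: (num_samples, out_w, out_h, channels, kernel_w, kernel_h)
    windows = sliding_window_view(A, (kw, kh), axis=(1, 2))[:, ::sw, ::sh]
    out_w, out_h = windows.shape[1], windows.shape[2]
    # rearrange to (out_w * out_h, num_samples, kernel_w * kernel_h, channels) to match split()
    return (windows.transpose(1, 2, 0, 4, 5, 3)
            .reshape(out_w * out_h, n, kw * kh, c))

The final reshape still copies the data once, but the Python-level loops and the list of temporaries are gone, which is usually where most of split()'s time goes.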

Here are optimization examples for parts of the code:

  • Optimizing the loss_cross_entropy function

The original function makes several full passes over the data: it takes np.log(Y_pre), runs np.nan_to_num over the whole matrix, multiplies by Y_label element-wise, and only then sums, allocating a temporary array at every step. The whole loss can be computed as a single contraction instead. Note that a plain np.dot(Y_label, np.log(Y_pre).T) would also pick up cross terms between different classes here, so an element-wise contraction such as np.einsum is the right tool:

def loss_cross_entropy(Y_pre, Y_label):
    m = Y_pre.shape[1]  # number of samples
    # clip instead of nan_to_num to avoid log(0), then contract in a single pass
    logprobs = np.einsum('cm,cm->', Y_label, np.log(np.clip(Y_pre, 1e-12, 1.0)))
    loss = -logprobs / m
    return loss
  • Optimizing the convolution operation

NumPy's np.convolve only handles 1-D signals, so it cannot replace the two spatial loops directly; the usual approach is to extract all windows once (im2col) and turn the convolution into a single matrix multiplication. For reference, here is an explicit-loop implementation, with a loop-free sketch after it:

def conv_single_step(a_slice_prev, W, b):
    # element-wise product with the kernel, then sum; add the scalar bias once, not per element
    s = np.multiply(a_slice_prev, W)
    Z = np.sum(s) + b
    return Z

def conv_forward(A_prev, W, b, hparameters):
    stride = hparameters['stride']
    pad = hparameters['pad']

    # Pad the previous activation
    A_prev_pad = np.pad(A_prev, ((0,0), (pad,pad), (pad,pad), (0,0)), 'constant', constant_values=(0,))
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev_pad.shape
    (f, f, n_C_prev, n_C) = W.shape
    n_H = int((n_H_prev - f) / stride) + 1
    n_W = int((n_W_prev - f) / stride) + 1
    Z = np.zeros((m, n_H, n_W, n_C))
    for i in range(m):
        a_prev_pad = A_prev_pad[i,:,:,:]
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):
                    a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                    weights = W[:, :, :, c]
                    biases = b[0, 0, 0, c]
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, weights, biases)
    return Z
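
The explicit-loop version above is easy to follow, but with four nested Python loops it will actually be slower than the split()-based approach in the question. Below is a loop-free sketch of the same forward pass, assuming the NHWC layout and (f, f, n_C_prev, n_C) weight shape used by conv_forward, NumPy >= 1.20, and one bias value per output channel; conv_forward_vectorized is an illustrative name.

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

def conv_forward_vectorized(A_prev, W, b, hparameters):
    # illustrative sketch: same result as conv_forward above, without Python loops
    stride = hparameters['stride']
    pad = hparameters['pad']
    A_prev_pad = np.pad(A_prev, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant')
    f = W.shape[0]
    # windows: (m, n_H, n_W, n_C_prev, f, f)
    windows = sliding_window_view(A_prev_pad, (f, f), axis=(1, 2))[:, ::stride, ::stride]
    # contract the window and input-channel axes against W in a single einsum call
    Z = np.einsum('mxycij,ijcn->mxyn', windows, W) + b.reshape(1, 1, 1, -1)
    return Z

The einsum contraction runs in compiled code, so none of the per-pixel work happens in Python.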
  • Optimizing the pooling operation

Likewise, the for loops here can be replaced with vectorized NumPy operations (np.kron on its own does not implement max pooling; it is more useful for up-sampling the gradient in the pooling backward pass). For reference, here is an explicit-loop implementation, with a reshape-based sketch after it:

def create_mask_from_window(x):
    mask = x == np.max(x)
    return mask

def pool_forward(A_prev, hparameters, mode="max"):
    stride = hparameters['stride']
    f = hparameters['f']
    m, n_H_prev, n_W_prev, n_C_prev = A_prev.shape
    n_H = int((n_H_prev - f) / stride) + 1
    n_W = int((n_W_prev - f) / stride) + 1
    n_C = n_C_prev
    A = np.zeros((m, n_H, n_W, n_C))
    for i in range(m):
        for h in range(n_H):
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):
                horiz_start = w * stride
                horiz_end = horiz_start + f
                for c in range(n_C):
                    a_prev_slice = A_prev[i, vert_start:vert_end, horiz_start:horiz_end, c]
                    if mode == "max":
                        mask = create_mask_from_window(a_prev_slice)
                        A[i, h, w, c] = np.max(a_prev_slice)
                    elif mode == "average":
                        A[i, h, w, c] = np.mean(a_prev_slice)
    cache = (A_prev, hparameters)
    return A, cache
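
When the pooling window tiles the input exactly and the stride equals the window size (which is the case for the pooling layers in the question), the loops can be removed entirely with a reshape followed by a max reduction. A minimal sketch under those assumptions; pool_forward_vectorized is an illustrative name.

import numpy as np

def pool_forward_vectorized(A_prev, f):
    # illustrative sketch: max pooling with stride == f; assumes spatial dims are divisible by f
    m, n_H_prev, n_W_prev, n_C = A_prev.shape
    n_H, n_W = n_H_prev // f, n_W_prev // f
    # split each spatial axis into (blocks, f) and reduce over the f x f blocks
    A = A_prev.reshape(m, n_H, f, n_W, f, n_C).max(axis=(2, 4))
    return A

For the backward pass, np.kron(dA, np.ones((1, f, f, 1))) up-samples the incoming gradient back to the input resolution, after which the max mask can be applied; that is where kron is actually useful.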