Fatal error in MPI_Scatter: Invalid buffer pointer

随机生成方阵与向量并让他们相乘的代码，当矩阵维数大于int的限制（所以用了long）就会出现这个报错，为什么？
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include "mpi.h"
#include <math.h>
#include <stdio.h>
#include <time.h>
#include <malloc.h>
#include <stdlib.h>

//produce random number in a certain range
double frand(double fmin, double fmax)
{
    double f = (double)rand() / ((double)RAND_MAX + 1.0);
    return fmin + f * (fmax - fmin);
}

int main(int argc, char* argv[])
{
    int numprocs, myid, namelen, a;
    long i, j, k, l, N1 = 0, N = 0, N2 = 0, part = 0;
    char procs_name[MPI_MAX_PROCESSOR_NAME];
    double* matrix_A, * part_A, * vector_b, * answer, * vector_c;
    double time1, time2, time3, time4;
    double sqr_sum = 0, A = 0;//normalization factor
    double* norm_vector, * norm_c;
    FILE* fp = fopen("time.txt", "a");

    //MPI initialization
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Get_processor_name(procs_name, &namelen);

    fprintf(stdout, "Process %d of %d on %s\n", myid, numprocs, procs_name);
    fflush(stdout);

    //define matrix_A and vector_b
    if (myid == 0)
    {
        printf("Enter a vector dimension:");
        fflush(stdout);
        scanf_s("%ld", &N1);
        time1 = MPI_Wtime();
        a = N1 % numprocs;
        printf("%d\n", a);
        if (a > 0)
        {
            N = N1 + a;
        }
        else
        {
            N = N1;
        }
        N2 = N * N1;
        part = N / numprocs;
    }
    MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&N2, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&N1, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&part, 1, MPI_INT, 0, MPI_COMM_WORLD);

    //prepare for matrix_A and vectors
    matrix_A = (double*)malloc(sizeof(double) * N2);
    vector_b = (double*)malloc(sizeof(double) * N1);
    part_A = (double*)malloc(sizeof(double) * N2);
    answer = (double*)malloc(sizeof(double) * N);
    vector_c = (double*)malloc(sizeof(double) * N);
    if (myid == 0)
    {
        for (j = 0; j < N2; j++)
        {
            if (j < N1 * N1)
                matrix_A[j] = frand(0.0, 5.0);//save matrix_A as a one-dimension array
            else
                matrix_A[j] = 0;
        }
        for (k = 0; k < N1; k++)
        {
            vector_b[k] = frand(0.0, 5.0);
        }
        for (l = 0; l < N; l++)
        {
            answer[l] = 0;
        }
    }
    
    //distribution
    MPI_Bcast(vector_b, N1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(answer, N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Scatter(matrix_A, part * N1, MPI_DOUBLE, part_A, part * N1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (myid == 0)
        time2 = MPI_Wtime();

    //Multiplication
    for (i = myid * part * N1; i < (myid + 1) * part * N1; i++)
    {
        answer[i / N1] += part_A[i] * vector_b[i % N1];
    }
    MPI_Allreduce(answer, vector_c, N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (myid == 0)
        time3 = MPI_Wtime();

    //normalize vector_c
    for (i = myid; i < N; i += numprocs)
    {
        sqr_sum += vector_c[i] * vector_c[i];
    }
    MPI_Allreduce(&sqr_sum, &A, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    A = sqrt(A);//normalization factor
    norm_vector = (double*)malloc(sizeof(double) * N);
    norm_c = (double*)malloc(sizeof(double) * N);
    for (j = 0; j < N; j++)
    {
        norm_vector[j] = 0;
    }
    for (i = myid; i < N; i += numprocs)
    {
        norm_vector[i] = vector_c[i] / A;
    }
    MPI_Allreduce(norm_vector, norm_c, N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (myid == 0)
    {
        time4 = MPI_Wtime();
        printf("Time of computing and normalization is %f\t", time4 - time2);
        printf("\tTime of total process is %f\n", time4 - time1);
        printf("Number of processes is %d\n", numprocs);
        fprintf(fp, "%f\t%f\t%d\t%ld\n", time4 - time2, time4 - time1, numprocs, N1);
        fclose(fp);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    free(matrix_A);
    free(vector_b);
    free(vector_c);
    free(answer);
    free(norm_vector);
    free(norm_c);
    free(part_A);
    MPI_Finalize();
    return 0;
}