C语言:英文文章根据非停用词使用频度统计语句重要程度，不同排序方法在数据量过大时结果为什么存在差异？

当article.txt中的文章数据量过大时，第一段代码使用的堆排序与第二段代码的选择排序得到的结果出现不同，求助原因。（选择排序结果正确，主要想知道堆排序代码有没有什么遗漏，为什么会造成这种不同）感谢！

停用词为stopword.txt中的单词，如介词，连词等。

每一句按照问号，句号，或感叹号划分（）单个句号也算是一句。

（声明:两段代码只有最终对paslist[]排序部分存在差异）

第一段代码如下:

#include<stdio.h>

#include<stdlib.h>

#include<math.h>

#include<string.h>

struct stopword {

char word[50];

struct stopword *link;

};

struct stopword *stop_sy[128];//stop按字母索引

char temp[50];

struct vocab {

char word[50];

long long num; // 出现频度

struct vocab *link;

};

struct vocab *wordlist[3010]; //hash表，冲突处按照单词ascii码排序

struct passage {

char con[50000];

long long num; //频度和

};

struct passage *paslist[300010], *pastemp;

int tail;

int N;

void getstop()//制作停用词字典

{

FILE *in;

in = fopen("stopwords.txt", "r");

for(int i = 0; i < 128; i++)

{

stop_sy[i] = NULL;

}

while(fscanf(in, "%s", temp) != EOF)

{

struct stopword *p;

p = (struct stopword *)malloc(sizeof(struct stopword));

for(int i = 0; i <= strlen(temp); i++)

{

p->word[i] = temp[i];

}

p->link = NULL;

if(stop_sy[temp[0]] == NULL)

{

stop_sy[temp[0]] = p;

}

else if(strcmp(stop_sy[temp[0]]->word, p->word) > 0)

{

p->link = stop_sy[temp[0]];

stop_sy[temp[0]] = p;

}

else if(stop_sy[temp[0]]->link == NULL)

{

stop_sy[temp[0]]->link = p;

}

else

{

for(struct stopword *r = stop_sy[temp[0]], *q = stop_sy[temp[0]]->link; q != NULL; r = q, q = q->link)

{

if(strcmp(q->word, p->word) > 0)

{

r->link = p;

p->link = q;

break;

}

else if(q->link == NULL)

{

q->link = p;

break;

}

//for(struct stopword *p = stop_sy['s']; p != NULL; p = p->link)

//{

// printf("%s\n", p->word);

//}

fclose(in);

return;

}

int search()//找到返回1

{

for(struct stopword *p = stop_sy[temp[0]]; p != NULL; p = p->link)

{

if(strcmp(temp, p->word) == 0)return 1;

else if(strcmp(temp, p->word) < 0)break;

}

return 0;

}

void getwords()

{

FILE *in;

in = fopen("article.txt", "r");

int i, hash_temp;

for(i = 0; i < 3010; i++)

{

wordlist[i] = NULL;

}

i = 0;

while((temp[i] = fgetc(in)) != EOF)

{

if((temp[i] <= 'Z' && temp[i] >= 'A') || (temp[i] <= 'z' && temp[i] >= 'a'))

{

if(temp[i] <= 'Z' && temp[i] >= 'A')

{

temp[i] = temp[i] + 'a' - 'A';

}

i++;

}

else if(i > 0)

{

temp[i] = '\0';

if(search() == 1)

{

i = 0;

continue;

}

hash_temp = 0;

for(int j = 0; j < strlen(temp); j++)

{

hash_temp = hash_temp*37 + temp[j];//重要公式

}

hash_temp = abs(hash_temp % 3001);

if(wordlist[hash_temp] == NULL)

{

wordlist[hash_temp] = (struct vocab *)malloc(sizeof(struct vocab));

wordlist[hash_temp]->link = NULL;

wordlist[hash_temp]->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

wordlist[hash_temp]->word[j] = temp[j];

}

else if(strcmp(wordlist[hash_temp]->word, temp) == 0)

{

wordlist[hash_temp]->num++;

}

else if(strcmp(wordlist[hash_temp]->word, temp) > 0)

{

struct vocab *p;

p = (struct vocab *)malloc(sizeof(struct vocab));

p->link = NULL;

p->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

p->word[j] = temp[j];

}

p->link = wordlist[hash_temp];

wordlist[hash_temp] = p;

}

else if(wordlist[hash_temp]->link == NULL)

{

struct vocab *p;

p = (struct vocab *)malloc(sizeof(struct vocab));

p->link = NULL;

p->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

p->word[j] = temp[j];

}

wordlist[hash_temp]->link = p;

}

else

{

for(struct vocab *q = wordlist[hash_temp], *p = wordlist[hash_temp]->link; p != NULL; q = p, p = p->link)

{

if(strcmp(p->word, temp) == 0)

{

p->num++;

break;

}

else if(strcmp(p->word, temp) > 0)

{

struct vocab *r;

r = (struct vocab *)malloc(sizeof(struct vocab));

r->link = p;

r->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

r->word[j] = temp[j];

}

q->link = r;

break;

}

else if(p->link == NULL)

{

struct vocab *r;

r = (struct vocab *)malloc(sizeof(struct vocab));

r->link = NULL;

r->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

r->word[j] = temp[j];

}

p->link = r;

break;

}

i = 0;

}

fclose(in);

return;

}

void adjust(int i, int m)

{

int j;

pastemp = paslist[i];

j = 2*i + 1;

while(j < m)

{

if(j < m-1 && paslist[j]->num > paslist[j+1]->num)j++;

if(pastemp->num <= paslist[j]->num) break;

paslist[(j-1)/2] = paslist[j];

j = 2*j + 1;

}

paslist[(j-1)/2] = pastemp;

return;

}

void result()

{

int i;

for(i = tail/2; i >= 0; i--)

{

adjust(i, tail+1);

}

for(i = tail; i >= 1; i--)

{

pastemp = paslist[i];

paslist[i] = paslist[0];

paslist[0] = pastemp;

adjust(0, i);

}

for(i = 0; i < 5; i++)

{

printf("%lld %s\n", paslist[i]->num, paslist[i]->con);

}

FILE *out;

out = fopen("results.txt", "w") ;

for(i = 0; i < N; i++)

{

fprintf(out, "%lld %s\n", paslist[i]->num, paslist[i]->con);

}

fclose(out);

return;

}

void solve()

{

FILE *in;

in = fopen("article.txt", "r");

int i, flag = 0, hash_temp;

for(i = 0; i < 300010; i++)

{

paslist[i] = NULL;

}

i = 0;

while(1)

{

paslist[i] = (struct passage *)malloc(sizeof(struct passage));

paslist[i]->num = 0;

int j = 0;

while(1)

{

if((paslist[i]->con[j]=fgetc(in)) == EOF)

{

flag = 1;

break;

}

if(paslist[i]->con[j] == ' ' && j == 0)

{

continue;

}

else if(paslist[i]->con[j] == '.' || paslist[i]->con[j] == '!' || paslist[i]->con[j] == '?')

{

paslist[i]->con[j+1] = '\0';

int k = 0;

for(int r = 0; r <= j; r++)

{

temp[k] = paslist[i]->con[r];

if((temp[k] <= 'Z' && temp[k] >= 'A') || (temp[k] <= 'z' && temp[k] >= 'a'))

{

if(temp[k] <= 'Z' && temp[k] >= 'A')

{

temp[k] = temp[k] + 'a' - 'A';

}

k++;

}

else if(k > 0)

{

temp[k] = '\0';

if(search() == 1)

{

k = 0;

continue;

}

hash_temp = 0;

for(int l = 0; l < strlen(temp); l++)

{

hash_temp = hash_temp*37 + temp[l];//重要公式

}

hash_temp = abs(hash_temp % 3001);

k = 0;

for(struct vocab *p = wordlist[hash_temp]; p != NULL; p = p->link)

{

//printf("%lld %s %d\n", p->num, p->word, hash_temp);

if(strcmp(temp, p->word) == 0)

{

paslist[i]->num += p->num;

break;

}

break;

}

else j++;

}

if(flag == 1)break;

i++;

}

tail = i;

fclose(in);

//开始排序

result();

return;

}

void release() //释放空间

{

for(int i = 0; i < 128; i++)

{

if(stop_sy[i] != NULL)free(stop_sy[i]);

}

for(int i = 0; i < 3010; i++)

{

if(wordlist[i] != NULL)free(wordlist[i]);

}

for(int i = 0; i <= tail; i++)

{

if(paslist[i] != NULL)free(paslist[i]);

}

return;

}

int main()

{

scanf("%d", &N);

getstop();

getwords();

solve();

release();

return 0;

}

第二段代码如下:

#include<stdio.h>

#include<stdlib.h>

#include<math.h>

#include<string.h>

struct stopword {

char word[50];

struct stopword *link;

};

struct stopword *stop_sy[128];//stop按字母索引

char temp[50];

struct vocab {

char word[50];

long long num; // 出现频度

struct vocab *link;

};

struct vocab *wordlist[3010]; //hash表，冲突处按照单词ascii码排序

struct passage {

char con[50000];

long long num; //频度和

};

struct passage *paslist[300010], *pastemp;

int tail;

int N;

void getstop()//制作停用词字典

{

FILE *in;

in = fopen("stopwords.txt", "r");

for(int i = 0; i < 128; i++)

{

stop_sy[i] = NULL;

}

while(fscanf(in, "%s", temp) != EOF)

{

struct stopword *p;

p = (struct stopword *)malloc(sizeof(struct stopword));

for(int i = 0; i <= strlen(temp); i++)

{

p->word[i] = temp[i];

}

p->link = NULL;

if(stop_sy[temp[0]] == NULL)

{

stop_sy[temp[0]] = p;

}

else if(strcmp(stop_sy[temp[0]]->word, p->word) > 0)

{

p->link = stop_sy[temp[0]];

stop_sy[temp[0]] = p;

}

else if(stop_sy[temp[0]]->link == NULL)

{

stop_sy[temp[0]]->link = p;

}

else

{

for(struct stopword *r = stop_sy[temp[0]], *q = stop_sy[temp[0]]->link; q != NULL; r = q, q = q->link)

{

if(strcmp(q->word, p->word) > 0)

{

r->link = p;

p->link = q;

break;

}

else if(q->link == NULL)

{

q->link = p;

break;

}

//for(struct stopword *p = stop_sy['s']; p != NULL; p = p->link)

//{

// printf("%s\n", p->word);

//}

fclose(in);

return;

}

int search()//找到返回1

{

for(struct stopword *p = stop_sy[temp[0]]; p != NULL; p = p->link)

{

if(strcmp(temp, p->word) == 0)return 1;

else if(strcmp(temp, p->word) < 0)break;

}

return 0;

}

void getwords()

{

FILE *in;

in = fopen("article.txt", "r");

int i, hash_temp;

for(i = 0; i < 3010; i++)

{

wordlist[i] = NULL;

}

i = 0;

while((temp[i] = fgetc(in)) != EOF)

{

if((temp[i] <= 'Z' && temp[i] >= 'A') || (temp[i] <= 'z' && temp[i] >= 'a'))

{

if(temp[i] <= 'Z' && temp[i] >= 'A')

{

temp[i] = temp[i] + 'a' - 'A';

}

i++;

}

else if(i > 0)

{

temp[i] = '\0';

if(search() == 1)

{

i = 0;

continue;

}

hash_temp = 0;

for(int j = 0; j < strlen(temp); j++)

{

hash_temp = hash_temp*37 + temp[j];//重要公式

}

hash_temp = abs(hash_temp % 3001);

if(wordlist[hash_temp] == NULL)

{

wordlist[hash_temp] = (struct vocab *)malloc(sizeof(struct vocab));

wordlist[hash_temp]->link = NULL;

wordlist[hash_temp]->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

wordlist[hash_temp]->word[j] = temp[j];

}

else if(strcmp(wordlist[hash_temp]->word, temp) == 0)

{

wordlist[hash_temp]->num++;

}

else if(strcmp(wordlist[hash_temp]->word, temp) > 0)

{

struct vocab *p;

p = (struct vocab *)malloc(sizeof(struct vocab));

p->link = NULL;

p->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

p->word[j] = temp[j];

}

p->link = wordlist[hash_temp];

wordlist[hash_temp] = p;

}

else if(wordlist[hash_temp]->link == NULL)

{

struct vocab *p;

p = (struct vocab *)malloc(sizeof(struct vocab));

p->link = NULL;

p->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

p->word[j] = temp[j];

}

wordlist[hash_temp]->link = p;

}

else

{

for(struct vocab *q = wordlist[hash_temp], *p = wordlist[hash_temp]->link; p != NULL; q = p, p = p->link)

{

if(strcmp(p->word, temp) == 0)

{

p->num++;

break;

}

else if(strcmp(p->word, temp) > 0)

{

struct vocab *r;

r = (struct vocab *)malloc(sizeof(struct vocab));

r->link = p;

r->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

r->word[j] = temp[j];

}

q->link = r;

break;

}

else if(p->link == NULL)

{

struct vocab *r;

r = (struct vocab *)malloc(sizeof(struct vocab));

r->link = NULL;

r->num = 1;

for(int j = 0; j <= strlen(temp); j++)

{

r->word[j] = temp[j];

}

p->link = r;

break;

}

i = 0;

}

fclose(in);

return;

}

void solve()

{

FILE *in;

in = fopen("article.txt", "r");

int i, flag = 0, hash_temp;

for(i = 0; i < 300010; i++)

{

paslist[i] = NULL;

}

i = 0;

while(1)

{

paslist[i] = (struct passage *)malloc(sizeof(struct passage));

paslist[i]->num = 0;

int j = 0;

while(1)

{

if((paslist[i]->con[j]=fgetc(in)) == EOF)

{

flag = 1;

break;

}

if(paslist[i]->con[j] == ' ' && j == 0)

{

continue;

}

else if(paslist[i]->con[j] == '.' || paslist[i]->con[j] == '!' || paslist[i]->con[j] == '?')

{

paslist[i]->con[j+1] = '\0';

int k = 0;

for(int r = 0; r <= j; r++)

{

temp[k] = paslist[i]->con[r];

if((temp[k] <= 'Z' && temp[k] >= 'A') || (temp[k] <= 'z' && temp[k] >= 'a'))

{

if(temp[k] <= 'Z' && temp[k] >= 'A')

{

temp[k] = temp[k] + 'a' - 'A';

}

k++;

}

else if(k > 0)

{

temp[k] = '\0';

if(search() == 1)

{

k = 0;

continue;

}

hash_temp = 0;

for(int l = 0; l < strlen(temp); l++)

{

hash_temp = hash_temp*37 + temp[l];//重要公式

}

hash_temp = abs(hash_temp % 3001);

k = 0;

for(struct vocab *p = wordlist[hash_temp]; p != NULL; p = p->link)

{

//printf("%lld %s %d\n", p->num, p->word, hash_temp);

if(strcmp(temp, p->word) == 0)

{

paslist[i]->num += p->num;

break;

}

break;

}

else j++;

}

if(flag == 1)break;

i++;

}

tail = i;

fclose(in);

//开始排序

int j = 0, max;

for(i = 0; i < tail; i++)

{

max = paslist[i]->num;

flag = i;

for(j = i+1; j <= tail; j++)

{

if(paslist[j]->num > max)

{

max = paslist[j]->num;

flag = j;

}

pastemp = paslist[flag];

paslist[flag] = paslist[i];

paslist[i] = pastemp;

}

for(i = 0; i < 5; i++)

{

printf("%lld %s\n", paslist[i]->num, paslist[i]->con);

}

FILE *out;

out = fopen("results.txt", "w") ;

for(i = 0; i < N; i++)

{

fprintf(out, "%lld %s\n", paslist[i]->num, paslist[i]->con);

}

fclose(out);

return;

}

void release() //释放空间

{

for(int i = 0; i < 128; i++)

{

if(stop_sy[i] != NULL)free(stop_sy[i]);

}

for(int i = 0; i < 3010; i++)

{

if(wordlist[i] != NULL)free(wordlist[i]);

}

for(int i = 0; i <= tail; i++)

{

if(paslist[i] != NULL)free(paslist[i]);

}

return;

}

int main()

{

scanf("%d", &N);

getstop();

getwords();

solve();

release();

return 0;

}

不同排序方法在数据量很大的时候，性能相差很大是因为排序的时间复杂性决定的，这个你要了解一下常用的8种排序规则。

建议研究一下数据结构。

简单排序里面，快速排序是最快的，二叉树排序，如果数据量很大了那就要用到归并排序等。

找到问题了，是特征值相同时排序顺序的