// Count English word frequencies in a txt document.
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Capacity used both for the word table `linelist` and for the file read
// buffer declared in main.
#define max 5000
// Number of `linelist` entries currently in use: line_word() stores each new
// word at index k and then increments k.
int k = 0;
// Word structure: the word currently being assembled plus a marker for the
// character being examined. `fword` is the single global instance.
struct f_word
{
// Text of the word collected so far.
char *word;
// Letter marker: 1 when the examined character is a letter, 0 otherwise.
int flag;
}fword;
// Word-frequency entry: one distinct word and the number of times it occurred.
struct f_linelist
{
    char *word;     // NUL-terminated text of the word
    int frequency;  // how many times the word has been counted
};
// Allows the bare name `f_linelist` to be used as a type when the file is
// compiled as C; C++ already permits that and accepts this typedef unchanged.
typedef struct f_linelist f_linelist;
// Global table of distinct words. Spelled with the `struct` keyword so the
// declaration is valid C as well as C++.
struct f_linelist linelist[max];
// Record one occurrence of `word` in the global frequency table `linelist`.
//
// If an entry with the same text already exists among the first k entries,
// its frequency is incremented. Otherwise a private copy of the text is
// appended as a new entry with frequency 1 and k grows by one. The copy is
// owned by the table for the rest of the program's lifetime.
//
// Parameters:
//   word - NUL-terminated word to count. It is neither modified nor retained,
//          so the caller may reuse its buffer after the call.
// Returns:
//   the index of the word's entry in `linelist`, or -1 when `word` is NULL or
//   empty, the table already holds `max` entries, or allocation fails.
int line_word(char *word)
{
    if (word == NULL || word[0] == '\0')
    {
        return -1;
    }

    // Only the first k slots are populated; compare the text itself rather
    // than pointer identity so repeated words are actually merged.
    for (int i = 0; i < k; i++)
    {
        if (linelist[i].word != NULL && strcmp(linelist[i].word, word) == 0)
        {
            linelist[i].frequency += 1;
            return i;
        }
    }

    // Refuse to write past the end of the table.
    if (k >= max)
    {
        return -1;
    }

    size_t size = strlen(word) + 1;
    // The cast keeps this line valid when the file is built as C++ too.
    char *copy = (char *)malloc(size);
    if (copy == NULL)
    {
        return -1;
    }
    memcpy(copy, word, size);

    linelist[k].word = copy;
    linelist[k].frequency = 1;
    k += 1;
    return k - 1;
}
int main()
{
char ch, a[max]; int i = 0;
//打开文件
FILE* fpRead;
fopen_s(&fpRead, "C:/Users/Apple/Desktop/2.txt", "rb");
if (fpRead == NULL)
{
printf("未读到文件");
return 0;
}
while ((ch = fgetc(fpRead)) != EOF)
{
putchar(ch);
a[i++] = ch;
}
printf("\n--------------------\n");
char* word;//用于存放单词
int k=0, j, m;
for (k ; k<30; k++)
{
printf("%c %d\n", a[k],k);
if ((a[k] >= 'a' && a[k] <= 'z') || (a[k] >= 'A' && a[k] <= 'Z'))
{
fword.flag = 1;//是字母,标志位设置为1
//printf("%c是字母\n", a[k]);
}
else
{
fword.flag = 0;//非字母,标志位设置为0
//printf("%c不是字母\n", a[k]);
}
if (fword.flag == 0)//非字母
{
if (fword.word != "\0")//判断之前的word里面是否有单词
{
line_word(fword.word);//存放到数组里面
}
printf("这个非字母为:%c\n", fword.word);
fword.word = "\0";//清空word
}
else if (fword.flag == 1)//字母
{
//printf("到这里还在执行吗?\n");
if (fword.word == "\0")//word为空,放入第一个字母
{
fword.word[0] = a[k];
printf("到这里还在执行吗?\n");
}
else
{
fword.word += a[k];//word不为空,拼接字母组成单词
printf("到这里还在执行吗?\n");
}
printf("这个字母为:%c\n", linelist[k].word);
}
else printf("获取单词时发生未知错误");
}
printf("k的值为:%d\n", k);
//按照词典排序(冒泡排序)
f_linelist temp;
for (i = 0; i < k; i++)
{
m = i;
for (j = i + 1; j < k; j++)
{
if (linelist[j].word < linelist[m].word)//将单词转换成小写进行比较
m = j;
}
//交换原始单词
temp = linelist[i];
linelist[i] = linelist[m];
linelist[m] = temp;
}
//输出词频
printf("all word is\n");
for (i = 0; i < k; i++)
{
printf("%d %s", linelist[i].frequency, linelist[i].word);
}
return 0;
}