对英文文章的词频进行统计,输出统计结果
有下列英文段落:
Many dogs have forgotten or never learned how to properly greet other members of their own species. Dogs are mostly on leashes, behind fences, rushed into greetings by their well-meaning people and kept from situations that would otherwise help them learn canine greeting etiquette. Holding a dog on a tight leash can create frustration when your dog sees other dogs on the street, and that can cause body language in your dog that may look offensive to other dogs. Certain canine behavior, such as pulling on leash, a hard stare and running up to other dogs head-on, can turn greetings sour.
Before you set up a new-dog introduction, consider your own dog’s general attitude toward other dogs. Some dogs are uncomfortable with or reactive to new dogs and need very slow introductions. Be aware that dogs do not necessarily like every single dog they meet, and some do best as the only dog in the household. If your dog has a history of reacting negatively to the company of other dogs, it is probably best to just have one dog.
程序要求:
(1) 请编程统计该英文段落里面出现的单词种类和每个单词出现的次数
(2) 程序具有通用性,可以分析任何英文段落
(3) 提示:1)可以使用动态内存分配,请提前学习一下教材第九章;2)提取到的单词变成小写后再统计出现次数;3)需要去掉提取的单
代码:
#include <stdlib.h>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;
// One entry of the singly linked word list: the word text, how many times it
// has been counted, and a link to the next entry.
struct WordNode
{
    char word[40];         // NUL-terminated word; 40 bytes including the terminator
    int nmb;               // occurrence count
    struct WordNode* next; // next entry, or 0 at the end of the list

    // Every field starts in a defined state: empty word, zero count, no
    // successor. Leaving `nmb`/`word` indeterminate made reading a freshly
    // constructed node undefined behaviour.
    WordNode() : nmb(0), next(0) { word[0] = '\0'; }
};
// Owns a singly linked list of WordNode entries reachable from `head` and
// releases every linked node when it is destroyed.
class WordCount
{
public:
    WordCount() : head(0) {}

    // Walks the list and releases each node with `delete`, so nodes handed to
    // insertNode() must have been allocated with `new`.
    ~WordCount()
    {
        while (head)
        {
            WordNode* following = head->next;
            delete head;
            head = following;
        }
    }

    WordNode* findNode(char* p);      // look up a word
    void insertNode(WordNode* node);  // link a node into the list
    void addCount(WordNode* node);    // add 1 to a node's count
    void display();                   // print the list

private:
    // Copying is disabled (declared, never defined): a copy would share the
    // same nodes and both destructors would delete them.
    WordCount(const WordCount&);
    WordCount& operator=(const WordCount&);

    struct WordNode* head; // first node, or 0 while the list is empty
};
// Reports whether a string is blank.
// Returns 1 when no byte of `p` compares greater than 0x20 (this includes the
// empty string), and 0 as soon as one such byte is found.
int isEmpty(char* p)
{
    for (const char* cursor = p; *cursor != '\0'; ++cursor)
    {
        if (*cursor > 0x20)
        {
            return 0;
        }
    }
    return 1;
}
// Looks up a word in the list.
// Returns the first node whose `word` compares equal to `p` under strcmp,
// or 0 when no node matches.
WordNode* WordCount::findNode(char* p)
{
    for (WordNode* cur = head; cur != 0; cur = cur->next)
    {
        if (strcmp(p, cur->word) == 0)
            return cur;
    }
    return 0;
}
// Links `node` into the list, which is kept in descending strcmp order.
// The node goes in front of the first existing entry whose word compares
// less than the new word; if there is none it becomes the last entry.
// `node->next` is only written when the node is placed in front of an
// existing entry, so callers must pass a node whose `next` is already 0.
void WordCount::insertNode(struct WordNode* node)
{
    // `slot` always points at the link that would have to be rewritten to put
    // the new node at the current position (first `head`, then each `next`).
    struct WordNode** slot = &head;
    while (*slot != 0 && strcmp((*slot)->word, node->word) >= 0)
    {
        slot = &(*slot)->next;
    }

    if (*slot != 0)
    {
        node->next = *slot; // an entry follows: chain it behind the new node
    }
    *slot = node;
}
// Adds one to the occurrence count stored in `node`.
void WordCount::addCount(struct WordNode* node)
{
    ++node->nmb;
}
// Prints every entry in list order, one per line, as "<word>\t<count>".
void WordCount::display()
{
    for (struct WordNode* cur = head; cur != 0; cur = cur->next)
    {
        printf("%s\t%d\n", cur->word, cur->nmb);
    }
}
// Reports whether `ch` is an ASCII control character, i.e. 0x00-0x1F or 0x7F (DEL).
// The upper bound is inclusive: 0x1F (unit separator) is itself a control code.
// NOTE(review): where plain `char` is signed, bytes with the high bit set
// compare below 0x1F and are therefore also reported as control characters.
bool isctrl(char ch)
{
    return ch <= 0x1F || ch == 0x7F;
}
// Normalises a word in place: drops spaces and control characters (as judged
// by isctrl) and converts ASCII upper-case letters to lower case.
// `tt` must be a writable NUL-terminated string; the result is never longer
// than the input.
void toLower(char tt[])
{
    int dst = 0;
    for (int src = 0; tt[src] != '\0'; ++src)
    {
        const char ch = tt[src];
        if (ch == ' ' || isctrl(ch))
        {
            continue; // dropped characters simply are not copied forward
        }
        if (ch >= 'A' && ch <= 'Z')
            tt[dst++] = ch + ('a' - 'A');
        else
            tt[dst++] = ch;
    }
    // Re-terminate at the compacted length. Without this, whenever a character
    // was dropped the old tail of the string stayed visible after the result.
    tt[dst] = '\0';
}
// Reads a whole file into a freshly allocated, NUL-terminated buffer.
//   filename - path of the file, opened in text mode ("r")
//   lSize    - receives the number of bytes actually read; 0 on failure
// Returns the buffer, or 0 when the file cannot be opened or its length cannot
// be determined. The buffer is allocated with new[], so the caller must
// release it with delete[] (not free).
char* textFileRead(const char* filename, long* lSize)
{
    *lSize = 0;
    if (filename == 0)
    {
        return 0;
    }

    FILE* pf = fopen(filename, "r");
    if (pf == 0)
    {
        return 0; // missing or unreadable file: report failure instead of crashing
    }

    // Seek to the end to learn the length; both calls can fail.
    long length = -1;
    if (fseek(pf, 0, SEEK_END) == 0)
    {
        length = ftell(pf);
    }
    if (length < 0)
    {
        fclose(pf);
        return 0;
    }
    rewind(pf);

    char* buf = new char[length + 1];
    // fread may return fewer bytes than `length` (for example after text-mode
    // newline translation), so terminate at the count that was really read.
    *lSize = static_cast<long>(fread(buf, sizeof(char), static_cast<size_t>(length), pf));
    buf[*lSize] = '\0';

    fclose(pf);
    return buf;
}
int main()
{
long size = 0;
int i = 0, j = 0;
char text[50] = { 0 };
WordCount cc;
struct WordNode* node = 0;
char* buf = textFileRead("textfile.txt", &size); //读文件所有内容
if (size <= 0)
{
printf("文件打开失败,或者为空\n");
return 0;
}
while (i <= size)
{
if (i == size)
{
text[j] = '\0';
toLower(text);
if (isEmpty(text))
{
j = 0;
i++;
continue;
}
node = 0;
node = cc.findNode(text);
if (node)
{
cc.addCount(node);
}
else
{
node = (struct WordNode*)malloc(sizeof(WordNode));
strcpy(node->word, text);
node->nmb = 1;
node->next = 0;
cc.insertNode(node);
}
break;
}
else
{
//此处不考虑中间含有.的单词,入地名,缩写等等
if (buf[i] == ' ' || buf[i] == '\r' || buf[i] == ',' || buf[i] == '.' || buf[i] == '\n' || buf[i] == '!' || isctrl(buf[i]) || buf[i] == ';')
{
text[j] = '\0';
toLower(text);
if (isEmpty(text))
{
j = 0;
i++;
continue;
}
node = 0;
node = cc.findNode(text);
if (node)
{
//printf("find ,add count\n");
cc.addCount(node);
}
else
{
//printf("not find,create\n");
node = (struct WordNode*)malloc(sizeof(WordNode));//new WordNode;
strcpy(node->word, text);
node->nmb = 1;
node->next = 0;
cc.insertNode(node);
}
j = 0;
}
else
{
text[j++] = buf[i];
}
i++;
}
}
cc.display();
return 0;
}
请问你这个段落是在一个文件里,还是直接输入啊?
dog’s、dog、dog.各算一个单词吧?
单词种类意思是相同单词是一种这个意思
然后知道有多少种就可以?
mapreduce经典案例
#使用nltk库的stem算法。1英文小写.2,词干提取。3,词形还原
from nltk.stem import WordNetLemmatizer #词形还原
from nltk.stem.porter import PorterStemmer #词干提取 复数变单数
# Count word frequencies
def word_count(file_name):
    """Count how often each normalised word occurs in a UTF-8 text file.

    Each line is lower-cased and split on whitespace; surrounding ASCII
    punctuation is stripped from every token, and the token is then stemmed
    with PorterStemmer and passed through WordNetLemmatizer before counting.

    Args:
        file_name: path of the text file to read.

    Returns:
        A ``collections.defaultdict(int)`` mapping each normalised word to
        its number of occurrences.
    """
    # Imported locally because the snippet's own import lines only bring in nltk.
    import collections
    import string

    word_freq = collections.defaultdict(int)
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            # Work word by word: iterating over the line itself would count
            # single characters instead of words.
            for raw in line.lower().split():
                token = raw.strip(string.punctuation)
                if not token:
                    continue
                # Stem first, then lemmatise.
                word = wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token))
                word_freq[word] += 1
    return word_freq
问下单词种类是啥意思