#!/usr/bin/env python
import re, sys, collections
# Word-frequency script: prints the 25 most common non-stop words of the file
# named by the first command-line argument, one "word - count" line each.

# Load the comma-separated stop-word list; a set makes each membership test O(1).
with open('../stop_words.txt') as stop_file:
    stops = set(stop_file.read().split(','))

# Lowercase the whole text before matching so the [a-z] pattern also sees
# capitalised words; a word is a run of two or more ASCII letters.
with open(sys.argv[1]) as text_file:
    words = re.findall('[a-z]{2,}', text_file.read().lower())

counts = collections.Counter(w for w in words if w not in stops)

for (w, c) in counts.most_common(25):
    # %-formatting prints the same "word - count" text on Python 2 and Python 3.
    print('%s - %s' % (w, c))
上述示例代码是 Python 版本的词频统计,功能如下:
读入停用词表(stop words)和源文件,统计词频后输出出现次数最多的 25 个词。
问题:使用尽可能少的 Java 代码来实现相同的功能,并尽可能地调用现成的库函数。
下面的 Java 代码中,文件输入的功能需要你自己实现,其余部分已经完全改写。
import java.util.HashMap;
import java.util.regex.*;
import java.util.Map;
import java.util.*;
/**
 * Word-frequency demo: counts the words of a built-in sample sentence and prints them to
 * standard output, most frequent first.
 */
class HelloWorld {
    /** Built-in sample text analysed by {@link #main(String[])}. */
    private static final String SAMPLE_TEXT =
            "someone says: hello world! hello, this website hello the people this is the java language.";

    /** A word is a run of two or more ASCII lowercase letters; compiled once and reused. */
    private static final Pattern WORD = Pattern.compile("[a-z]{2,}");

    /**
     * Prints one {@code word - count} line per distinct word of the sample text, ordered by
     * descending count.
     *
     * @param args command-line arguments (ignored)
     */
    public static void main(String[] args) {
        for (Map.Entry<String, Integer> entry : sortByCountDescending(countWords(SAMPLE_TEXT))) {
            System.out.println(entry.getKey() + " - " + entry.getValue());
        }
    }

    /**
     * Counts how often each word occurs in {@code text}, ignoring letter case.
     *
     * @param text the text to analyse; must not be {@code null}
     * @return a mutable map from lowercase word to its number of occurrences
     * @throws NullPointerException if {@code text} is {@code null}
     */
    static Map<String, Integer> countWords(String text) {
        Map<String, Integer> counts = new HashMap<>();
        // Lowercase BEFORE matching: the pattern only accepts a-z, so matching the raw text
        // would cut "Hello" down to "ello" instead of counting it as "hello".
        Matcher matcher = WORD.matcher(text.toLowerCase(Locale.ROOT));
        while (matcher.find()) {
            // merge() inserts 1 for a new word and adds 1 to an existing count.
            counts.merge(matcher.group(), 1, Integer::sum);
        }
        return counts;
    }

    /**
     * Returns the entries of {@code counts} sorted by descending count.
     *
     * @param counts word-to-count map to sort; not modified
     * @return a new list of the map's entries, highest count first
     */
    private static List<Map.Entry<String, Integer>> sortByCountDescending(Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(counts.entrySet());
        // List.sort is stable, so words with equal counts keep the map's iteration order.
        entries.sort((left, right) -> right.getValue().compareTo(left.getValue()));
        return entries;
    }
}
hello - 3
the - 2
this - 2
website - 1
world - 1
java - 1
someone - 1
is - 1
language - 1
says - 1
people - 1