```python
# coding: utf-8
import math
import datetime
import jieba
import csv
import pandas as pd
import os
import unittest
class Struct(object):
def __init__(self, word, sentiment, pos, value, class_value):
self.word = word
self.sentiment = sentiment
self.pos = pos
self.value = value
self.class_value = class_value
class Result(object):
def __init__(self, score, score_words, not_word, degree_word):
self.score = score
self.score_words = score_words
self.not_word = not_word
self.degree_word = degree_word
class Score(object):
# 七个情感大类对应的小类简称: 尊敬
score_class = {'乐': ['PA', 'PE'],
'好': ['PD', 'PH', 'PG', 'PB', 'PK'],
'怒': ['NA'],
'哀': ['NB', 'NJ', 'NH', 'PF'],
'惧': ['NI', 'NC', 'NG'],
'恶': ['NE', 'ND', 'NN', 'NK', 'NL'],
'惊': ['PC']
}
# 大连理工大学 -> ICTPOS 3.0
POS_MAP = {
'noun': 'n',
'verb': 'v',
'adj': 'a',
'adv': 'd',
'nw': 'al', # 网络用语
'idiom': 'al',
'prep': 'p',
}
# 否定词 TODO 找否定语料库,从文件加载
NOT_DICT = set(['不是', '不大', '没', '非', '莫', '弗', '毋', '没有','勿', '未', '否', '别', '無', '休',
'缺乏', '缺少', '不', '甭', '勿', '别', '未', '反', '没', '否', '木有', '非', '无', '请勿', '无须', '并非', '毫无', '决不', '休想', '永不', '不要',
'未尝', '未曾', '毋', '莫', '从未', '从未有过', '尚未', '一无', '并未', '尚无', '从没', '绝非', '远非', '切莫', '绝不', '毫不', '禁止', '忌', '拒绝',
'杜绝', '弗'])
def __init__(self, sentiment_dict_path, degree_dict_path, stop_dict_path):
self.sentiment_struct, self.sentiment_dict = self.load_sentiment_dict(sentiment_dict_path)
self.degree_dict = self.load_degree_dict(degree_dict_path)
self.stop_words = self.load_stop_words(stop_dict_path)
def load_stop_words(self, stop_dict_path):
stop_words = [w.strip() for w in open(stop_dict_path,'r',encoding='utf-8').readlines()]
# print (stop_words[:100])
return stop_words
def remove_stopword(self, words):
words = [w.strip() for w in words if w not in self.stop_words]
return words
def load_degree_dict(self, dict_path):
"""读取程度副词词典
Args:
dict_path: 程度副词词典路径. 格式为 word\tdegree
所有的词可以分为6个级别,分别对应极其, 很, 较, 稍, 欠, 超
Returns:
返回 dict = {word: degree}
"""
degree_dict = {}
with open(degree_dict_path, 'r', encoding='UTF-8') as f:
for line in f:
line = line.strip()
word, degree = line.split('\t')
degree = float(degree)
degree_dict[word] = degree
return degree_dict
def load_sentiment_dict(self, dict_path):
"""读取情感词词典
Args:
dict_path: 情感词词典路径. 格式请看 README.md
Returns:
返回 dict = {(word, postag): 极性}
"""
sentiment_dict = {}
sentiment_struct = []
with open(sentiment_dict_path, 'r', encoding='UTF-8') as f:
# with gzip.open(dict_path) as f:
for index, line in enumerate(f):
if index == 0: # title,即第一行的标题
continue
items = line.split('\t')
word = items[0] #词语
pos = items[1] #词性种类
sentiment = items[4] #情感分类
intensity = items[5] # 1, 3, 5, 7, 9五档, 9表示强度最大, 1为强度最小.
polar = items[6] # 极性
# 将词性转为 ICTPOS 词性体系
pos = self.__class__.POS_MAP[pos]
intensity = int(intensity)
polar = int(polar)
# 转换情感倾向的表现形式, 负数为消极, 0 为中性, 正数为积极
# 数值绝对值大小表示极性的强度 // 分成3类,极性:褒(+1)、中(0)、贬(-1); 强度为权重值
value = None
if polar == 0: # neutral
value = 0
elif polar == 1: # positive
value = intensity
elif polar == 2: # negtive
value = -1 * intensity
else: # invalid
continue
# key = (word, pos, sentiment )
key = word
sentiment_dict[key] = value
# 找对应的大类
for item in self.score_class.items():
key = item[0]
values = item[1]
# print(key)
# print(value)
for x in values:
if (sentiment == x):
class_value = key # 如果values中包含,则获取key
sentiment_struct.append(Struct(word, sentiment, pos, value, class_value))
return sentiment_struct, sentiment_dict
def findword(self, text): # 查找文本中包含哪些情感词
word_list = []
for item in self.sentiment_struct:
if item.word in text:
word_list.append(item)
return word_list
def classify_words(self, words):
# 这3个键是词的序号(索引)
sen_word = {}
not_word = {}
degree_word = {}
# 找到对应的sent, not, degree; words 是分词后的列表
for index, word in enumerate(words):
if word in self.sentiment_dict and word not in self.__class__.NOT_DICT and word not in self.degree_dict:
sen_word[index] = self.sentiment_dict[word]
elif word in self.__class__.NOT_DICT and word not in self.degree_dict:
not_word[index] = -1
elif word in self.degree_dict:
degree_word[index] = self.degree_dict[word]
return sen_word, not_word, degree_word
def get2score_position(self, words):
sen_word, not_word, degree_word = self.classify_words(words) # 是字典
score = 0
start = 0
# 存所有情感词、否定词、程度副词的位置(索引、序号)的列表
sen_locs = sen_word.keys()
not_locs = not_word.keys()
degree_locs = degree_word.keys()
senloc = -1
# 遍历句子中所有的单词words,i为单词的绝对位置
for i in range(0, len(words)):
if i in sen_locs:
W = 1 # 情感词间权重重置
not_locs_index = 0
degree_locs_index = 0
# senloc为情感词位置列表的序号,之前的sen_locs是情感词再分词后列表中的位置序号
senloc += 1
# score += W * float(sen_word[i])
if (senloc == 0): # 第一个情感词,前面是否有否定词,程度词
start = 0
elif senloc < len(sen_locs): # 和前面一个情感词之间,是否有否定词,程度词
# j为绝对位置
start = previous_sen_locs
for j in range(start, i): # 词间的相对位置
# 如果有否定词
if j in not_locs:
W *= -1
not_locs_index = j
# 如果有程度副词
elif j in degree_locs:
W *= degree_word[j]
degree_locs_index = j
# 判断否定词和程度词的位置:1)否定词在前,程度词减半(加上正值);不是很 2)否定词在后,程度增强(不变),很不是
if ((not_locs_index > 0) and (degree_locs_index > 0)):
if (not_locs_index < degree_locs_index):
degree_reduce = (float(degree_word[degree_locs_index] / 2))
W += degree_reduce
# print (W)
score += W * float(sen_word[i]) # 直接添加该情感词分数
# print(score)
previous_sen_locs = i
return score
def getscore(self, text): # 所有情感的得分
word_list = self.findword(text) ##查找文本中包含哪些情感词
# 增加程度副词+否定词
not_w = 1
not_word = []
for notword in self.__class__.NOT_DICT: # 否定词
if notword in text:
not_w = not_w * -1
not_word.append(notword)
degree_word = []
degree = 0
for degreeword in self.degree_dict.keys():
if degreeword in text:
degree = self.degree_dict[degreeword]
# polar = polar + degree if polar > 0 else polar - degree
degree_word.append(degreeword)
# 7大类找对应感情大类的词语,分别统计分数= 词极性*词权重
result = []
for key in self.score_class.keys(): # 区分7大类
score = 0
score_words = []
for word in word_list:
if (key == word.class_value):
score = score + word.value
score_words.append(word.word)
if score > 0:
score = score + degree
elif score < 0:
score = score - degree # 看分数>0,程度更强; 分数<0,程度减弱?
score = score * not_w
x = '{}_score={}; word={}; nor_word={}; degree_word={};'.format(key, score, score_words, not_word,
degree_word)
# x='{}'.format(score)
# print(x)
result.append(score)
# key + '_score=%d; word=%s; nor_word=%s; degree_word=%s;'% (score, score_words,not_word, degree_word))
return result
#输入int数字,返回'2020-01-01'
def timeitr(smonth,sday,emonth,eday,year=2019): #遍历一定范围内的日期,返回日期字符串列表,闭区间
begin = datetime.date(year, smonth, sday)
end = datetime.date(year, emonth, eday)
outDaylst=[]
for i in range((end - begin).days + 1):
outday = begin + datetime.timedelta(days=i)
outDaylst.append(str(outday))
return outDaylst
#返回列表[最强情感的字符串 第二强情感的字符串]
def find_1st2nd_max(intlist_moodScore):
moods = ["乐", "好", "怒", "哀", "惧", "恶", "惊"]
max1st_index=intlist_moodScore.index(max(intlist_moodScore))
mood1st=moods[max1st_index]
min_index=intlist_moodScore.index(min(intlist_moodScore))#将最大值换为最小值去找第二强情感
intlist_moodScore[max1st_index]=intlist_moodScore[min_index]
max2nd_index=intlist_moodScore.index(max(intlist_moodScore))
mood2nd=moods[max2nd_index]
return [mood1st,mood2nd]
def jstvRead(csv_path):
csvFile = open(csv_path, "r",encoding='UTF-8')
reader = csv.reader(csvFile)
out=[]
for item in reader:
out.append(item[1])
#列表,每个元素是当日新闻正文
return out
def jstvSocre(smonth,sday,emonth,eday,year):
timeStage = timeitr(smonth, sday, emonth, eday, year) # 日期参数
for ymd in timeStage:
try:
jstv_csv_path = r"test_time.csv".format(ymd)#有些天会没有
out_path = r"jstvScore.csv".format(ymd)
comment_list = jstvRead(jstv_csv_path)
c = {
"newsContent": comment_list
}
jstvdf = pd.DataFrame(c)
# print(weibodf)
# 文件写入
outFile = open(out_path, 'a+', encoding='UTF-8')
#db = pymysql.connect(host="localhost", user="root", password="196811", database="db_student",charset="utf8")
# 写入表头
moodlist = ["polar", "乐", "好", "怒", "哀", "惧", "恶", "惊", "最强情感", "次强情感"]
for moodType_index in range(len(moodlist) - 1):
outFile.write(moodlist[moodType_index] + ',')
outFile.write(moodlist[-1] + '\n')
# 写入极性和每种情感的得分,一条评论有太多否定词会出现负分
for temp in jstvdf['newsContent']:
score = Score(sentiment_dict_path, degree_dict_path, stop_dict_path)
words = [x.strip() for x in jieba.cut(temp)] # 分词
words_noStopWords = score.remove_stopword(words)
commentLen = len(words_noStopWords)
# 分词->情感词间是否有否定词/程度词+前后顺序->分数累加
# polar分
result = score.get2score_position(words_noStopWords) # polar
polarScore = 0
if (commentLen): # 因爬虫原因,json的评论下可能没评论
polarScore = float(result) / math.log(commentLen)
outFile.write(str(polarScore) + ',')
# 乐,好,怒,哀,惧,恶,惊
emotionScore_list = score.getscore(words_noStopWords) # 6种情感
# 大连理工情感词典里表示好和恶的情感很多,消除情感字典情感词数量的影响
weight = [1967, 10640, 388, 2314, 1179, 10282, 288] # 每个情感词汇个数
for i in range(len(emotionScore_list)): # 除以对数评论字符串长度
emotionScore_list[i] = emotionScore_list[i] / math.log((commentLen + 2) / math.log(weight[i]))
#由于有负分,进行情感修正
emotionScore_list=changeScore(emotionScore_list)
for i in range(len(emotionScore_list)):
outFile.write(str(emotionScore_list[i]) + ',') # 写入情感分数
moods1st2nd = find_1st2nd_max(emotionScore_list)
outFile.write(moods1st2nd[0] + ',')
outFile.write(moods1st2nd[1] + '\n') # 写入最强,次强情感
except Exception as e:
print(e)
outFile.close()
def changeScore(scoreList):
'''
:param scoreList:
:return:
'''
key = [ "乐", "好", "怒", "哀", "惧", "恶", "惊"]
'''
乐 反义 0.4哀 0.1恶 0.5惧
好 反义 0.5惧 0.5恶
怒 反义 乐
哀 反义 0.4好 0.6乐
惧 反义 好
恶 反义 乐
惊 反义 0.6乐 0.4好
'''
anti_dict = {
# 乐 好 怒 哀 惧 恶 惊"
'乐': [0, 0, 0, 0.4, 0.5, 0.1, 0],
'好': [0, 0, 0, 0, 0.5, 0.5, 0],
'怒': [1, 0, 0, 0, 0, 0, 0],
'哀': [0.6, 0.4, 0, 0, 0, 0, 0],
'惧': [0, 1, 0, 0, 0, 0, 0],
'恶': [0, 1, 0, 0, 0, 0, 0],
'惊': [0.6, 0.4, 0, 0, 0, 0, 0]
}
initial_mood_score_dict = dict(zip(key, scoreList))
minusScore_dict = {} # {'惧': -1.7929313807730507}
for kv in initial_mood_score_dict.items():
if (kv[1] < -0.000000001 ):
minusScore_dict[kv[0]] = kv[1]
adj_scoreList_dict = dict(zip(key, scoreList))
for kv in minusScore_dict:
adj_scoreList_dict[kv[0]] = 0.0
# print(minusScore_dict,adj_scoreList_dict,end=' ')
for kv in minusScore_dict.items():
moodType = kv[0]
score = kv[1]
if (moodType == '乐'):
adj_scoreList_dict['哀'] += -0.5 * score
adj_scoreList_dict['恶'] += -0.1 * score
adj_scoreList_dict['惧'] += -0.4 * score
if (moodType == '好'):
adj_scoreList_dict['惧'] += -0.5 * score
adj_scoreList_dict['恶'] += -0.5 * score
if (moodType == '怒'):
adj_scoreList_dict['乐'] += -score
if (moodType == '哀'):
adj_scoreList_dict['好'] += -0.5 * score
adj_scoreList_dict['乐'] += -0.5 * score
if (moodType == '惧'):
adj_scoreList_dict['好'] += -score
if (moodType == '恶'):
adj_scoreList_dict['好'] += -0.7 * score
adj_scoreList_dict['乐'] += -0.3 * score
if (moodType == '惊'):
adj_scoreList_dict['乐'] += -0.4 * score
adj_scoreList_dict['好'] += -0.6 * score
# print(adj_scoreList_dict)
adjScore_list=list(adj_scoreList_dict.values())#添加修正后的分数
return adjScore_list
if __name__ == '__main__':
sentiment_dict_path = r"dalian_emotion.xlsx"
degree_dict_path = r"degree_dict.txt"
stop_dict_path = r"stopwords.txt"
# weibo_path=r"../source/2019-12-08info.json"
# out_path=r"../source/Score.csv"
#weiboScore(4,29,6,20,2020,5)
jstvSocre(11,11,11,17,2019)
```
这个最好发一下代码,这样好分析一点,这个要看你的数据原本是什么格式内容