def load_data(self):
reviews = pd.read_csv('tuniu.csv')
data = reviews['content']
#with open('r.json') as file:
#data = file.read()
#data = json.loads(data)
#print(type(data))
return data
def fenci_data(self):
text = []
data = self.load_data()
with open("stoplist.txt", 'r',encoding='UTF-8') as file:
stop_word_list = file.read()
for weibo_item in tqdm(data):
tmp = []
sentence=''.join(re.findall(r'[\u4e00-\u9fa5]+',weibo_item['content']))
for word in jieba.lcut(sentence):
if word not in stop_word_list:
tmp.append(word)
text.append(tmp)
return text
报错:
sentence=''.join(re.findall(r'[\u4e00-\u9fa5]+',weibo_item['content']))
TypeError: string indices must be integers
那就是只传 content 这一列而已。
sentence=''.join(re.findall(r'[\u4e00-\u9fa5]+',weibo_item['content']))
# 改为
sentence=''.join(re.findall(r'[\u4e00-\u9fa5]+',weibo_item))
看提示, 这个 weibo_item 的当前的值 是 字符串, 不是字典。
试试把
sentence=''.join(re.findall(r'[\u4e00-\u9fa5]+',weibo_item['content']))
改为
try:
sentence=''.join(re.findall(r'[\u4e00-\u9fa5]+',weibo_item['content']))
except Exception as e :
print("error",repr(e),'\n',weibo_item)
sentence = ""
你看提示
weibo_item 是字符串, 咋取 weibo_item['content'] ?
先把两个文件都截图贴一下,描述下你的程序逻辑。