在做文本分析时出现了 "Buffer has wrong number of dimensions (expected 1, got 2)" 的报错,但是找不到错误原因在哪里,求解答,非常感谢!
import os
import os.path
import codecs

# Collect every file under the corpus folder together with its UTF-8 text.
# filePaths[i] and fileContents[i] always describe the same file.
# One walk is enough: the path list and the contents are built side by side.
filePaths = []
fileContents = []
for root, dirs, files in os.walk(r"C:\Users\金\Desktop\高中数学辅导资料\数学"):
    for name in files:
        filePath = os.path.join(root, name)
        # The context manager closes the handle even if reading/decoding fails.
        with codecs.open(filePath, 'r', 'utf-8') as f:
            fileContent = f.read()
        # Append only after a successful read so the two lists stay parallel.
        filePaths.append(filePath)
        fileContents.append(fileContent)
import pandas;
# Corpus table: one row per file, pairing each path with the text read from it.
corpos = pandas.DataFrame(
    dict(
        filePath=filePaths,
        fileContent=fileContents,
    )
)
import jieba
# Tokenise every document with jieba and remember which file each token came
# from: segments[i] and filePaths[i] together form one (token, file) pair.
segments = []
filePaths = []  # fresh list; the per-file paths are already stored in corpos
# NOTE(review): nothing in the visible script writes to `output`; the `with`
# block only guarantees the handle opened on the file is closed again.
with open("储存.txt", "w", encoding="utf8") as output:
    # Walk the two columns in lockstep instead of building a row object per file.
    for filePath, fileContent in zip(corpos['filePath'], corpos['fileContent']):
        for seg in jieba.cut(fileContent):
            segments.append(seg)
            filePaths.append(filePath)
segmentDataFrame = pandas.DataFrame({
    'segment': segments,
    'filePath': filePaths,
})
import numpy
import pandas
import csv
# Word-frequency statistics: count the occurrences of each token and sort the
# result so the most frequent tokens come first.
segStat = segmentDataFrame.groupby(by=['segment'])['segment'].agg(count='count')
segStat = segStat.reset_index().sort_values(by=["count"], ascending=False)
# The entries at the top of segStat are mostly stop words, so they are
# filtered out below using the stop-word list.
quoting = csv.QUOTE_NONE
stopwords = pandas.read_csv(
    "stopwords.txt",
    sep="\t",
    header=None,
    quoting=quoting,
    encoding='utf-8',
    index_col=False,
)
# Frequency table without stop words.
# read_csv returns a DataFrame, so `stopwords.values` is a 2-D array and
# Series.isin() rejects it with "Buffer has wrong number of dimensions
# (expected 1, got 2)". Selecting the first column hands isin() the 1-D
# collection of words it expects.
fSegStat = segStat[~segStat['segment'].isin(stopwords.iloc[:, 0])]
运行结果及报错内容
File "pandas_libs\hashtable_func_helper.pxi", line 415, in pandas._libs.hashtable.ismember_object
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
想要文本词频统计去除停用词的结果,麻烦了
这是索引(维度)问题:传给 isin() 的 stopwords.values 是二维数组,而它需要的是一维数据。
比如下面的例子。
以下的步骤一般没有问题:
# Join df2 onto df1 using 'col1' as the key: df2 is first re-indexed by its own
# 'col1' so that join() can match df1's 'col1' column against that index.
df = df1.join(df2.set_index('col1'),on='col1')
但如果再加一步重命名列的操作,就会报错:
# NOTE: the doubled brackets assign a list that contains one list, i.e. a
# nested (2-level) structure rather than a flat list of names. This is the
# rename step after which the surrounding text reports the error.
# Presumably pandas builds a MultiIndex from the nested list; a flat list
# ['col1', 'col12'] is the usual way to rename columns — TODO confirm.
df2.columns =[['col1','col12']]