import os
imdb_dir = '/home/ubuntu/data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []
for label_type in ['neg', 'pos']:
dir_name = os.path.join(train_dir, label_type)
for fname in os.listdir(dir_name):
if fname[-4:] == '.txt':
f = open(os.path.join(dir_name, fname))
texts.append(f.read())
f.close()
if label_type == 'neg':
labels.append(0)
else:
labels.append(1)
在f = open(os.path.join(dir_name, fname))这句里,打开文件语句中指定编码方式。
添加 encoding='utf-8'或encoding='gbk'试试。