大家好,我目前正在做删除停用词的任务。下面这段代码可以运行,想请教如何把它改成循环语句,即循环处理文件夹内的所有文件来删除停用词,而不是一个一个地处理单个文件。应该是要改 “file1 = open(...)” 这个语句,但不知道如何改。谢谢大家!
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Strip English stop words from a single SEC filing and rewrite it in place.
stop_words = set(stopwords.words('english'))
file_path = r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1\20010102_10-K-A_edgar_data_1024302_0001092388-00-500453.txt"

# Read the whole filing; the context manager closes the handle even on error.
with open(file_path) as file1:
    line = file1.read()

words = word_tokenize(line)
# Drop stop-word tokens outright instead of swapping them for "" placeholders,
# which left runs of extra spaces in the joined text.
# NOTE(review): the membership test is case-sensitive; confirm whether
# capitalized stop words should be removed as well.
words_witout_stop_words = [word for word in words if word not in stop_words]
new_words = " ".join(words_witout_stop_words)

# Mode 'w' truncates the original file, so the filtered text replaces it.
with open(file_path, 'w') as appendFile:
    appendFile.write(new_words)
谢谢大家!
大概就是这样:把文件处理改成一个函数(过程),再加一个遍历文件夹的函数。
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
# Turn the per-file processing into a reusable function.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, loading it only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Loading the corpus list is the same for every file, so do it once
        # instead of on every file_work() call.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def file_work(file_name):
    """Remove English stop words from a text file, rewriting it in place.

    The file is read in full, tokenized with ``word_tokenize``, filtered
    against the English stop-word list, and the remaining tokens are written
    back to the same path joined by single spaces.

    Args:
        file_name: Path of the text file to clean. The file is overwritten.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    stop_words = _english_stop_words()

    # Context managers close both handles even if tokenizing or writing fails.
    with open(file_name) as source:
        text = source.read()

    # Filter stop words out entirely; substituting "" for them left runs of
    # extra spaces in the joined output.
    # NOTE(review): the membership test is case-sensitive; confirm whether
    # capitalized stop words should be removed as well.
    kept_words = [word for word in word_tokenize(text) if word not in stop_words]

    # Mode 'w' truncates the original, so the cleaned text replaces it.
    with open(file_name, 'w') as target:
        target.write(" ".join(kept_words))
# Example run: clean one single file.
single_file = r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)\2001\QTR1\20010102_10-K-A_edgar_data_1024302_0001092388-00-500453.txt"
file_work(single_file)
def check_all_files(check_path):
    """Recursively collect every ``.txt`` file under ``check_path``.

    Args:
        check_path: Directory to search. Sub-directories are searched too.

    Returns:
        A list of ``[file_name, full_path]`` pairs, one per file whose name
        ends in ``.txt`` (compared case-insensitively). Entries appear in
        the order ``os.listdir`` reports them, with each sub-directory's
        results inserted at that directory's position.
    """
    list_files = []
    # os.listdir returns both files and directories in the folder.
    for entry_name in os.listdir(check_path):
        file_path = os.path.join(check_path, entry_name)
        if os.path.isdir(file_path):
            list_files.extend(check_all_files(file_path))
        elif os.path.isfile(file_path):
            # Require the dot so that names merely ending in "txt"
            # (e.g. "notxt") are not mistaken for text files.
            if entry_name.lower().endswith('.txt'):
                list_files.append([entry_name, file_path])
    return list_files
# Walk the folder tree and clean every .txt file that is found.
root_dir = r"D:\1.1 SEC EDGAR年报源文件 (10Q_10KA_10QA)"
for name, path in check_all_files(root_dir):
    print("处理", name)
    file_work(path)