import re import requests import time headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36' } info_lists=[] f = open('C:/Users/xyh/Desktop/baike.txt', 'a+')#创文件 def judgment_sex(class_name):#判别性别 if class_name =='womenIcon': return '女' else: return '男' def get_info(url):#详细信息 res=requests.get(url) ids=re.findall('<h2>(.*?)<h2>',res.text,re.S) levels=re.findall('<div class="articleGender \D+Icon">(.*?)</div>',res.text,re.S) sexs=re.findall('<div class="articleGender (.*?)">',res.text,re.S) contents=re.findall('<div classs="content">.*?<span>(.*?)</span>',res.text,re.S) laughs=re.findall('<span class="stats-vote"><i class=""number>(\d+)</i>',res.text,re.S) comments=re.findall('<span class="stats-comments"><i class=""number>(\d+)</i>',res.text,re.S) for id,level,sex,content,laugh,comment in zip(ids,levels,sexs,contents,laughs,comments,): info={ 'id':id, 'level':level, 'sex':judgment_sex(sex), 'content':content, 'laugh':laugh, 'comment':comment } info_lists.append(info) if __name__ == '__main__': urls=['https://www.qiushibaike.com/text/page/{}/'.format(number) for number in range(1,11)] for url in urls: get_info(url) time.sleep(1) for info_list in info_lists:#写入到文件 try: f.write(info_list['id']+'\n') f.write(info_list['level']+'\n') f.write(info_list['sex']+'\n') f.write(info_list['content']+'\n') f.write(info_list['laugh']+'\n') f.write(info_list['comment']+'\n') f.close() except UnicodeError: pass
ids=re.findall('<h2>(.*?)<h2>',res.text,re.S) levels=re.findall('<div class="articleGender \D+Icon">(.*?)</div>',res.text,re.S) sexs=re.findall('<div class="articleGender (.*?)">',res.text,re.S) contents=re.findall('<div classs="content">.*?<span>(.*?)</span>',res.text,re.S) laughs=re.findall('<span class="stats-vote"><i class=""number>(\d+)</i>',res.text,re.S) comments=re.findall('<span class="stats-comments"><i class=""number>(\d+)</i>',res.text,re.S)
打印了下这几个变量都有内容了,你是后面的处理逻辑有问题。
for id,level,sex,content,laugh,comment in zip(ids,levels,sexs,contents,laughs,comments,): info={ 'id':id, 'level':level, 'sex':judgment_sex(sex), 'content':content, 'laugh':laugh, 'comment':comment } 这里我尝试把字典打印出来,可是不行,请问为什么呢?