使用您写的爬取知乎问答的爬虫,但是返回了Error
import requests
import csv
import time

# Scrape answers of Zhihu question 432119474 and append them to zhihu.csv.
# Use a context manager so the file is closed even if a request fails
# (the original leaked the handle when the loop raised KeyError).
with open('zhihu.csv', 'a+', encoding='utf-8', newline='') as csvf:
    fieldnames = ['author', 'id', 'text']
    writer = csv.DictWriter(csvf, fieldnames=fieldnames)
    writer.writeheader()

    # Browser-like User-Agent to pass the simplest anti-scraping checks.
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}

    # URL template of the answers API endpoint (limit=5 answers per request).
    template = '''https://www.zhihu.com/api/v4/questions/432119474/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics%3Bsettings.table_of_content.enabled%3B&offset={offset}&limit=5&sort_by=default&platform=desktop'''

    # Batch scrape: 1240+ pages exist; only the first 100 are fetched here.
    for page in range(1, 100):
        # BUG FIX: the API paginates by *item* offset, not page number.
        # With limit=5 per request, page N starts at item (N-1)*5; the
        # original passed the page number directly, re-fetching overlapping
        # items and skipping most of the data.
        url = template.format(offset=(page - 1) * 5)
        resp = requests.get(url, headers=headers)
        payload = resp.json()

        # BUG FIX: on failure the API returns {'error': {...}} with no
        # 'data' key (e.g. HTTP 403 "请求参数异常"), which crashed the
        # original with KeyError. Report the problem and stop instead.
        if 'data' not in payload:
            print(f'page {page}: request failed '
                  f'(HTTP {resp.status_code}): {payload}')
            break

        # Extract the fields of each answer on this page.
        for info in payload['data']:
            author = info['author']
            # 'author' is itself a dict in the API response; store the
            # display name rather than the dict's repr when available.
            if isinstance(author, dict):
                author = author.get('name', author)
            writer.writerow({'author': author,
                             'id': info['id'],
                             'text': info['excerpt']})

        # Throttle requests to be polite to the server.
        time.sleep(1)
Traceback (most recent call last):
File "E:\学习资料(主)\3.大三\舆情控制\venv\爬取知乎回答.py", line 30, in <module>
for info in resp.json()['data']:
KeyError: 'data'
Process finished with exit code 1
1、尝试对出错部分进行定位:
import requests

# Minimal reproduction: fetch a single page of answers and inspect the
# HTTP status. NOTE(review): the URL literal below was line-wrapped in the
# original paste, which made it a syntax error; it is reassembled here
# verbatim (identical to the template in the full scraper).
template = 'https://www.zhihu.com/api/v4/questions/432119474/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics%3Bsettings.table_of_content.enabled%3B&offset={offset}&limit=5&sort_by=default&platform=desktop'

url = template.format(offset=1)
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
resp = requests.get(url, headers=headers)

# BUG FIX: a bare `resp` expression only echoes its repr in a REPL; in a
# script it prints nothing. Print explicitly — a 403 here means the
# request was rejected before any JSON body could contain 'data'.
print(resp)              # e.g. <Response [403]>
print(resp.json())       # e.g. {'error': {'message': '...', 'code': 10003}}
得到结果:
<Response 403>
2、尝试对浏览器升级,但是并没有很好的效果
3、尝试使用IP池
能够正常爬取知乎问答,并进行数据分析
应该是知乎网站改版过了,这个要爬取数据的网站地址或参数已经变更,你这份代码调用的旧接口已经失效了。
print(resp.json())输出的json字典是
{'error': {'message': '请求参数异常,请升级客户端后重试', 'code': 10003}}
字典中没有data键
你要爬取什么数据, 需要从现在的知乎网站重新在有这个数据的页面中重新找到改版过后的数据地址和参数,
如果数据json字典的格式也变了,还需要重新分析json字典写对应的代码从json字典中提取数据