from bs4 import BeautifulSoup
import requests
import json

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'}
commentURL = 'https://comment.tie.163.com/H8DOPGM90519DTSV.html'

def creatURL(commentURL, offset, limit):
    s1 = 'https://gw.m.163.com/commons-user-main/api/v1/commons/user/pc/getUserByCookie?isbtn=false&trace_id=IuKWAHFuVvX1653835134118'
    s2 = '/comments/newList?offset='
    name = commentURL.split('/')[-1].split('.')[0]
    u = s1 + str(name) + s2 + str(offset) + '&limit=' + str(limit)
    return u

res = requests.get(url=creatURL(commentURL, 1, 40), headers=headers).content
data = json.loads(res.decode())
for key in data['comments'].keys():
    print(data['comments'][key]['comment'])
I'm scraping the comments from a NetEase news comment page and hit a problem: the line "for key in data['comments'].keys():" raises KeyError: 'comments'. Is it a length problem? Does "url=creatURL(commentURL,1,40)" mean "fetch 40 comments"? A single page doesn't show 40 comments, yet the total comment count is over 40.
The page URL is https://comment.tie.163.com/H8DOPGM90519DTSV.html
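For reference, the quickest way to see why 'comments' is missing is to print what the API actually returned before indexing into it; a minimal debugging sketch, reusing the creatURL and headers defined above:

import json
import requests

res = requests.get(url=creatURL(commentURL, 1, 40), headers=headers).content
data = json.loads(res.decode())
# Inspect the top-level keys before assuming 'comments' exists
print(data.keys())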
I didn't find a fix for your exact problem, but if all you want is the comment data, my approach below works around it, which I think is much simpler. I tested it here and saw no errors:
import requests
import re
import time

# The "latest comments" section has five pages in total; the key paging parameter is offset
for page in range(5):
    print(f'-------------Scraping page {page + 1}----------------')
    # Endpoint found by capturing requests in the browser dev tools;
    # each page returns 30 comments (limit=30), so offset advances in steps of 30
    url = f'https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/H8DOPGM90519DTSV/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset={page * 30}&callback=jsonp_1653882863959&_=1653882863960'
    headers = {
        'Referer': 'https://comment.tie.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'
    }
    resp = requests.get(url, headers=headers)
    commentId_list = re.findall('"commentId":(.*?),', resp.text)  # comment id list
    content_list = re.findall('"content":"(.*?)",', resp.text)    # comment text list
    for commentId, content in zip(commentId_list, content_list):
        print(commentId, content)
        time.sleep(0.5)
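A note on the design choice: regexing the raw JSONP payload works, but the matched strings keep their escape sequences. A more robust variant is a sketch under two assumptions I have not verified against this API: that dropping the callback parameter makes the endpoint return plain JSON, and that 'comments' is a dict keyed by comment id whose values carry a 'content' field:

import requests

# Assumption: without the callback parameter the endpoint returns plain JSON
url = ('https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856'
       '/threads/H8DOPGM90519DTSV/comments/newList?ibc=newspc&limit=30&offset=0')
headers = {
    'Referer': 'https://comment.tie.163.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'
}
data = requests.get(url, headers=headers).json()
# Assumption: 'comments' maps comment ids to objects with a 'content' field
for cid, item in data.get('comments', {}).items():
    print(cid, item.get('content'))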
"url=creatURL(commentURL,1,40)"
should mean the first page, with 40 records per page; the count probably includes replies as well.
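If that reading is right, paging through all comments would step offset by limit, following the usual offset/limit convention where offset is a record index rather than a page number (a sketch only, not verified against this particular API):

limit = 40
for page in range(3):        # first three pages
    offset = page * limit    # 0, 40, 80
    url = creatURL(commentURL, offset, limit)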
from bs4 import BeautifulSoup
import requests
import json

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53'}
commentURL = 'https://comment.tie.163.com/H8DOPGM90519DTSV.html'

def creatURL(commentURL, offset, limit):
    s1 = 'https://gw.m.163.com/commons-user-main/api/v1/commons/user/pc/getUserByCookie?isbtn=false&trace_id=IuKWAHFuVvX1653835134118'
    s2 = '/comments/newList?offset='
    name = commentURL.split('/')[-1].split('.')[0]
    u = s1 + str(name) + s2 + str(offset) + '&limit=' + str(limit)
    return u

res = requests.get(url=creatURL(commentURL, 1, 40), headers=headers).content
data = json.loads(res.decode())
if 'comments' in data:
    for key in data['comments'].keys():
        print(data['comments'][key]['comment'])
else:
    print('No comments in the response; not logged in to the site')
You're not logged in, so you can't get the comments.
The value of res is b'{"code":0,"message":"\xe6\x88\x90\xe5\x8a\x9f","data":{"userLogined":false}}' (the message bytes decode to "成功", i.e. "success").
Log in, grab a cookie, and put it in the headers.
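A minimal sketch of what that looks like, reusing creatURL and commentURL from the snippet above; the Cookie value is a placeholder you would copy from the browser dev tools after logging in:

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
    # Placeholder: paste the Cookie header from your logged-in browser session
    'Cookie': '<cookie-string-copied-from-browser>',
}
res = requests.get(creatURL(commentURL, 1, 40), headers=headers)
print(res.json())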