想爬取携程网上酒店的评论数据
已经找到了有完整数据的接口:
代码如下:
```python
import json
import sys

import requests
# Review-list endpoint that the POST request is sent to.
url = "https://m.ctrip.com/restapi/soa2/21881/json/GetReviewList"

# Request headers, filled in one key at a time (same insertion order as a
# single literal would give). The values were copied verbatim from the
# browser's request headers, so the Cookie is a captured session snapshot.
headers = {}
headers["Referer"] = "https://hotels.ctrip.com/"
headers["Content-Type"] = "application/json;charset=UTF-8"
# The cookie value contains double quotes, so it stays single-quoted.
headers['Cookie'] = 'MKT_CKID=1688817105762.0zj6i.qvxc; GUID=09031098310968664531; _RSG=3u4j5LzyOkEw9LPA_F2JvA; _RDG=2884d84600b17124263e2fe848d3829c5d; _RGUID=eb98f3d5-45b4-4e03-ae53-08ffca65527a; _bfaStatusPVSend=1; MKT_Pagesource=PC; manualclose=1; ibulanguage=CN; ibulocale=zh_cn; cookiePricesDisplayed=CNY; Session=smartlinkcode=U1535&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; _abtest_userid=6333cec9-a5fa-4d2d-b4c3-11ce006d8fe6; nfes_isSupportWebP=1; cticket=2AA3B5192CA808111F35AF5693827FD9EA62028F6D6E738BDBA9F687A47CCFED; login_type=0; login_uid=09E573D6D4D0872D67CF29854C3F2A404D75FD85FDFFB4FF1F1EA173009C0C97; DUID=u=60451AF81B9E70A8B2758FF945CCDE1A&v=0; IsNonUser=F; AHeadUserInfo=VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=0; intl_ht1=h4=2_433981,2_7739286,2_29471774,2_72926277,2_81163618,2_82034023; _RF1=112.36.86.88; Union=OUID=index&AllianceID=4897&SID=155952&SourceID=&createtime=1689216465&Expires=1689821264832; MKT_OrderClick=ASID=4897155952&AID=4897&CSID=155952&OUID=index&CT=1689216464834&CURL=https%3A%2F%2Fwww.ctrip.com%2F%3Fsid%3D155952%26allianceid%3D4897%26ouid%3Dindex&VAL={"pc_vid":"1688817105576.2k5yyg"}; _bfi=p1%3D102001%26p2%3D102003%26v1%3D1%26v2%3D2; _bfaStatus=success; librauuid=; _bfa=1.1688817105576.2k5yyg.1.1689216464611.1689388518803.16.1.102003; _ubtstatus=%7B%22vid%22%3A%221688817105576.2k5yyg%22%2C%22sid%22%3A16%2C%22pvid%22%3A1%2C%22pid%22%3A102003%7D; _jzqco=%7C%7C%7C%7C1689211525139%7C1.1185818776.1688817105760.1689216464846.1689388518870.1689216464846.1689388518870.0.0.0.65.65; __zpspc=9.17.1689388518.1689388518.1%232%7Cwww.baidu.com%7C%7C%7C%25E6%2590%25BA%25E7%25A8%258B%7C%23; MKT_CKID_LMT=1689388518883'
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
# Request payload for the review-list call. The nested objects are built
# first and then assembled, keeping the exact key order of the payload.
# Every value is a string, including the numeric- and boolean-looking ones.

# Value of the top-level "genKeyParam" field.
_gen_key_param = {"a": "433981", "d": "zh-cn", "e": "2"}

# Value of head["HotelExtension"].
_hotel_extension = {
    "WebpSupport": "true",
    "group": "CTRIP",
    "Qid": "null",
    "hasAidInUrl": "false",
    "hotelUuidKey": "aHcvljmgEDZiNlE8qeA5EFmIB7wacw73x8YoqElDyA3wsLYpFEOTx6ZYfjbY1bY8DjHzyf8ez0EO4jUZW5jSnR9dePYG9w1PIONiAMIpswzlvq0jOJ1GvdSwZQvagjtJohjO6w3XRn3joJ7Tvp3vD5YBhwdGjTmeDziazYzZIpnRP5EbYSAESXic6K5HyabjONvNAE6Xvn7WDNj4UIbarZDrOYaBRHQjbDRHaYgTiDzwOZRDqEmcWpFYkLvfFxLYMlYFoeN1iNFiZ4Y9bWXYalvB6Y3pjgZrUtRBpi4YameNEdQKQUvDNYaFy1NjB9vdXeDkYfhjk5ygJoFvGDYh3y9ZjQgv59esdY0zjdFyBJqXYGUvP4W8pWQJsZEFEhYHOjtqIGj1Xv9aeHsYn9i4XYfXW7zyPQrUYkfEOdiA9KcZEZly86InY7txtUr6bRSZrnAr5LKFYOTvg5JktxdLYNAilaiTHis5jP3YQFYXTYDYm0WcteA6RZOiBdw8GYqAwHNJHOvQfYO4wPnyk3WckekYHGvbZKlQwLBRhswSojFMvDoRoOJ56vldwZNwGHEPGwTpifTWPvboJspvBYttxZ0IA1iBPEg1r0aIZYQOJ1kYXQRnORtMwB8joDjMfjqfRtTyaqRBAWk9JQLJlSy0qwgpEsaYbGvXYs3xdQWZAEZBjL3wXUvk4jLPJl1xHy0YHv70ef1v1zRsPYBpjq0WbMebnyqTY38WSbrkfWOBw0YMoinpiAQWXcRdSYmaj7mWLDeoQwSgjZ6W6QY9gYMdeBYdSW99KFpWk8Rz5wsajHNvNoRf0JDPvafwzlJpFEzpJMmiNOE3grZHEcQeFYQ7KHcxDkjtFE0Tj5TWL1W5tWMqYgPYonY6zRNsYzoWsMYhpYXUY8LjNOeX5E0MWnFeZMw0Ue18jBkY01y3BEnFjonEpXrN9jgDw8hyPGy3qWa8eGYtoRdaWX0WHGW1zWHfYkYNqyA0EXLKtTvbPEHcWoBynZjsJqNv35EAgWMkyoHjk1YaXEhNKnYk0vULxpHeobEtZE39EtTRlTEUZwFXwThrFYb8v5hj3MRf1EdUElNEOTYT3YXsYUqYgoe36WNf",
}

# Value of head["Frontend"].
_frontend = {
    "vid": "1688817105576.2k5yyg",
    "sessionID": "13",
    "pvid": "1",
}

# Value of the top-level "head" field.
_head = {
    "Locale": "zh-CN",
    "Currency": "CNY",
    "Device": "PC",
    "UserIP": "112.36.86.88",
    "Group": "ctrip",
    "ReferenceID": "",
    "UserRegion": "CN",
    "AID": "4897",
    "SID": "155952",
    "Ticket": "",
    "UID": "",
    "IsQuickBooking": "",
    "ClientID": "09031098310968664531",
    "OUID": "index",
    "TimeZone": "8",
    "P": "75494579714",
    "PageID": "102003",
    "Version": "",
    "HotelExtension": _hotel_extension,
    "Frontend": _frontend,
}

data = {
    "MasterHotelId": "433981",
    "NeedFilter": "true",
    "PageNo": "1",
    "PageSize": "10",
    "ServerData": "",
    "ThemeId": "1",
    "ThemeType": "commonTheme",
    "UnUsefulPageNo": "1",
    "UnUsefulPageSize": "5",
    "genKeyParam": _gen_key_param,
    "genk": "true",
    "head": _head,
    "isHasFold": "false",
    "ssr": "false",
}
# Serialize the payload to JSON text: the Content-Type header declares a JSON
# body, so the dict must not be sent form-encoded.
data = json.dumps(data)

# requests applies no timeout by default, so a stalled connection would block
# the script forever; bound the wait explicitly.
response = requests.post(url, data=data, headers=headers, timeout=10)

# An empty body on its own says nothing about why the request failed. Report
# the HTTP status on stderr for non-OK responses so stdout still carries only
# the response body.
if not response.ok:
    print(f"HTTP {response.status_code} {response.reason}", file=sys.stderr)

print(response.text)
```
headers和data都是从浏览器请求的header和payload里直接复制的,但是运行后打印出来的response是空的:
进程已结束,退出代码0
这个返回结果就是网页里面的Response的内容:
可是唯独这里的response没有对应的结果:
用同样的方法爬取其他网站(如京东)的评论是OK的,为何携程是这样子?
求帮助!
原因很简单:爬虫不是改一下url就可以重复使用的,每个网站的请求格式都不一样。你这里拿不到数据,要么是被后台识别成了爬虫,要么是请求里没有携带它需要的数据。
我怎么觉得是你的这个post请求url还有参数没有带上的原因导致的呢?
原post请求url为:
https://m.ctrip.com/restapi/soa2/21881/json/GetReviewList?testab=53578b376b8e855fc23729857ceee8fd61dd3209892638724d15431c4c6fea11
上述只是例子
应该cookie问题,你可以看看每次请求是不是都一样。观察哪一个值经常变化。
每个网站检测的力度和加密的规则是不一样的
携程可能有反爬虫的机制,你看下接口请求里面,发送的 header 字段是不是都满足。
我试了下你的代码,requests在没有登录的情况下访问是获取不到数据的,你可以改用自动化模块selenium试试。
应该被反爬了,可能有一些必要的参数是动态获取的,没有传进去。
问题点: 爬取数据为空
分析思路:
请求参数缺少testab,请尝试在data中添加
{"testab": "53578b376b8e855fc23729857ceee8fd61dd3209892638724d15431c4c6fea11"}
携程有反爬的,需要js解密获取动态参数
可以参考下
https://blog.csdn.net/qq_36907160/article/details/117855974
https://blog.csdn.net/wenxuhonghe/article/details/102789096