爬虫爬取香港迪士尼网上商品详情页隐藏的请求头参数

网址


抓包的链接
https://www.hongkongdisneyland.com/zh-cn/tickets-api/promotion/ws

请求头中的参数有一个不同:X-Set-Cart
用代理抓包和浏览器开发者工具都没找到这个参数的来源;全局搜索并打断点,看到的也是空值。但是缺少这个参数,请求就不会成功。别人找到的值是 'X-Set-Entry': 'merchandise',不知道是怎么找到的。

img

img

# coding:utf-8
import re
import os
import json
import time
import execjs
import hashlib
import urllib
import requests
import threading
# import pandas as pd
from lxml import etree
# import my_fake_useragent as ua
from fake_useragent import UserAgent
# 禁用安全请求警告
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class Spider():
    """Small crawler for the Hong Kong Disneyland online-merchandise API.

    The spider POSTs a ``getPage`` request to the ``tickets-api/promotion/ws``
    endpoint through a ``requests.Session`` and reads the product list from
    the JSON response. It also carries a few generic extraction helpers
    (XPath, regex, nested-JSON lookup) that return ``None`` instead of
    raising when the requested data is missing.
    """

    # Browser-like request headers sent with every call.
    # - No Cookie header is set; the Session keeps whatever cookies the
    #   server hands back.
    # - Content-Length is intentionally absent: requests derives it from the
    #   request body, so a hard-coded value would only be misleading.
    # - 'X-Set-Entry' must be 'merchandise'; per the notes accompanying this
    #   script, the endpoint does not answer successfully without it.
    DEFAULT_HEADERS = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Host": "www.hongkongdisneyland.com",
        "Origin": "https://www.hongkongdisneyland.com",
        "Pragma": "no-cache",
        "Referer": "https://www.hongkongdisneyland.com/zh-cn/book/promotion/merchandise/online-merchandise-gm",
        "sec-ch-ua": '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
        "X-Set-Entry": "merchandise",
    }

    def __init__(self):
        """Pick a random User-Agent and prepare per-instance state."""
        self.user_agent = UserAgent().random
        # Copy so that per-instance tweaks never leak into the class default.
        self.headers = dict(self.DEFAULT_HEADERS)
        # Created by connect(); None until then.
        self.session = None

        self.datas = []
        self.urls = []

    def getData_by_xpath(self, html, xpath, is_first=True, need_bind=False):
        """Evaluate *xpath* on *html* and return the result, or None on failure.

        Args:
            html: Object exposing an ``xpath(expr)`` method.
            xpath: XPath expression to evaluate.
            is_first: When true, return only the first match.
            need_bind: Only used when ``is_first`` is false; when true, the
                matches are converted with ``str`` and concatenated into a
                single string.

        Returns:
            The first match, the concatenated string, or the raw result of
            ``html.xpath``; ``None`` if evaluation fails or there is no
            first match.
        """
        try:
            matches = html.xpath(xpath)
            if is_first:
                return matches[0]
            if need_bind:
                return ''.join(str(match) for match in matches)
            return matches
        except Exception:
            # Best-effort helper: missing data is reported as None.
            return None

    def getData_by_re(self, re_, txt, is_first=True):
        """Search *txt* with the regular expression *re_*.

        Args:
            re_: Pattern passed to ``re.findall``.
            txt: Text to search.
            is_first: When true, return only the first match; otherwise
                return the full list of matches.

        Returns:
            The first match, or the list of all matches; ``None`` if the
            pattern is invalid, the input is not searchable, or
            ``is_first`` is true and nothing matched.
        """
        try:
            matches = re.findall(re_, txt)
            if is_first:
                return matches[0]
            return matches
        except Exception:
            return None

    def getData_from_json(self, data, path, name):
        '''Look up a nested value in *data* using a subscript path string.

        Example:
            name=self.getData_from_json(info,"['data']['user']['screen_name']",'name')

        Args:
            data: Parsed JSON object (dicts/lists).
            path: Subscript expression appended to ``data``, for example
                ``"['data']['user']['screen_name']"``.
            name: Unused; kept so existing callers keep working.

        Returns:
            The value found at *path*, or ``None`` if the lookup fails.
        '''
        # NOTE(security): eval() executes *path* as Python code. Only pass
        # paths written in this code base, never text taken from a response
        # or from user input.
        try:
            return eval('data' + path)
        except Exception:
            return None

    def connect(self):
        """Create a fresh session carrying the headers and the chosen User-Agent."""
        self.session = requests.Session()
        self.headers['User-Agent'] = self.user_agent
        self.session.headers.update(self.headers)

    def crawl(self):
        """Request the merchandise page data and print its product list.

        Returns:
            The ``products`` entry of the first sub-category in the response.

        Raises:
            requests.RequestException: On connection problems, timeout, or a
                non-success HTTP status.
            KeyError, IndexError: If the response JSON lacks the expected
                ``response -> subCategories[0] -> products`` structure.
        """
        url = 'https://www.hongkongdisneyland.com/zh-cn/tickets-api/promotion/ws'
        payload = {
            "parameters": [
                {
                    "type": None,
                    "parameter": {
                        "pk": "c96af76f-5c7f-4006-bb6d-171ce7940eb7",
                        "uri": "online-merchandise-gm",
                    },
                }
            ],
            "payloadId": "7e3eb26b-4f6b-8192-cc84-fb38b7bd9628",
            "nameId": None,
            "serviceName": "storefront.ThemeParkTicket",
            "method": "getPage",
        }

        self.connect()
        # A timeout keeps the script from hanging forever on a stalled server.
        res = self.session.post(url, json=payload, timeout=30)
        # Fail with a clear HTTP error instead of a confusing JSON/Key error.
        res.raise_for_status()
        body = res.json()
        products = body['response']['subCategories'][0]['products']
        print(products)
        return products

    def parse_page(self, html, url):
        """Parse an HTML string into an lxml element tree.

        Args:
            html: HTML source text.
            url: Address the page came from; currently unused.

        Returns:
            The tree produced by ``etree.HTML``.
        """
        tree = etree.HTML(html)
        return tree

if __name__ == "__main__":
    # Run a single crawl, then show the cookies the session ended up with.
    crawler = Spider()
    crawler.crawl()
    print(crawler.session.cookies)