关于爬虫过程中可能遇到的反爬虫机制


# Question snippet: sends a POST to the swguancha mall-search endpoint using a
# large, hand-filled header set and a randomly chosen User-Agent.
# NOTE(review): no `import random` / `import requests` is visible before this
# point in the file — confirm they are imported before running this.

# Pool of browser User-Agent strings; one entry is picked at random below.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"
    ]
url = 'http://www.swguancha.com/api/client/v1/cstMallPublic/searchByPage'
# Request headers; presumably copied from a browser session — TODO confirm.
headers = {
    'Content-Type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept':'application/json, text/plain, */*',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'zh,en;q=0.9,zh-CN;q=0.8',
    # NOTE(review): Authorization and Cookie hold fixed session values;
    # presumably they expire — verify they are still valid.
    'Authorization':'3b84f95e150b4392b49517a2cb87cc4d',
    'Connection':'keep-alive',
    # NOTE(review): hard-coded length. It is not derived from the request
    # that is actually sent below, so it can disagree with the real body.
    'Content-Length':'219',
    'Cookie':'__51uvsct__JM3cZPZl99RugvzY=1; __51vcke__JM3cZPZl99RugvzY=c4520fa3-eec5-5baa-a977-99902e2abded; __51vuft__JM3cZPZl99RugvzY=1629424480737; Hm_lvt_35ebc17b75cf00cf4d6a10dcf972521b=1629424481; vue_admin_template_token=3b84f95e150b4392b49517a2cb87cc4d; __vtins__JM3cZPZl99RugvzY=%7B%22sid%22%3A%20%22f78af916-022a-5377-8949-0ce528022ae9%22%2C%20%22vd%22%3A%203%2C%20%22stt%22%3A%20147493%2C%20%22dr%22%3A%20125097%2C%20%22expires%22%3A%201629426428227%2C%20%22ct%22%3A%201629424628227%7D; sidebarStatus=1; Hm_lpvt_35ebc17b75cf00cf4d6a10dcf972521b=1629424628',
    'Host':'www.swguancha.com',
    'Origin':'http://www.swguancha.com',
    'Referer':'http://www.swguancha.com/home/mall-advanced-search?',
}
# Replace the fixed User-Agent above with a random entry from the pool.
headers['User-Agent'] = random.choice(user_agent_list)
# Search parameters.
# NOTE(review): 'null' here is the literal four-character string, not
# None / JSON null, and 'size' is the string '10' rather than an integer.
para = {
        'areaRange':'null',
        'center':"106.76242,29.6438",
        'cityId':'',
        'current':1,
        'isCountry':'',
        'keyword':'',
        'manageYearRange':'',
        'order':'',
        'orderType':'',
        'provinceId':'',
        'regionId':'',
        'size':'10',
        'turnoverRange':'null',
        'type':''
    }
# `params=` encodes `para` into the URL query string, so no request body is
# sent; `.json()` then parses the response body as JSON.
# NOTE(review): the headers declare an application/json body, so the endpoint
# presumably expects the parameters in a JSON body instead — confirm.
response = requests.post(url = url , params = para , headers = headers).json()

对某网站进行爬虫练习时,如headers里面只有user-agent将返回

img
参照网上的方式将headers补全后则出现了

img

请教各位,这是触发了反爬机制吗,如何解决呢?

首先,headers里不应该有Content-Length,它会根据body的长度自动计算,不需要手动指定
另外,既然是POST请求,参数真的应该通过params传递吗?params会把参数拼接到URL的查询串里,建议核对一下是否应该改为放在请求体中(json=或data=)
还有,Authorization和Cookie一般是用来鉴权的,检查一下是否已经失效,失效了要及时更新
其他看起来倒是问题不大

直接说那个网站。分分钟解决,给个关注采纳就好

采纳关注走一个吧,所有数据自己登录获取。


import requests

# Answer snippet: POST the search query as a raw JSON request body, sending
# only the Content-Type and User-Agent headers.
url = "http://www.swguancha.com/api/client/v1/cstMallPublic/searchByPage"
headers = {
    'Content-Type': 'application/json;charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# JSON text sent verbatim as the body; note the real JSON `null` values and
# the integer `size` / `current` fields.
payload = '''{"keyWord":"","center":"104.03108,30.66358",
"isCountry":"","provinceId":"","cityId":"","regionId":"",
"areaRange":null,"manageYearRange":null,"type":"",
"turnoverRange":null,"order":"","orderType":"",
"size":10,"current":1}'''
# requests applies no timeout by default, so an unresponsive server would
# block this script forever; with a timeout it raises
# requests.exceptions.Timeout instead.
response = requests.request(
    "POST", url, headers=headers, data=payload, timeout=10
)
print(response.text)
# Answer snippet: POST the search query as JSON via `json=`, so requests
# serializes `para` into the request body.
url = 'http://www.swguancha.com/api/client/v1/cstMallPublic/searchByPage'
# Only Content-Type and User-Agent are sent. The other browser headers
# (Accept, Accept-Encoding, Accept-Language, Content-Length, Cookie, Host,
# Origin, Proxy-Connection, Referer) were commented out in the original
# answer and are left out here.
headers = {
    "Content-Type": "application/json;charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}

# Python `None` is serialized as JSON `null` by `json=`; the string 'null'
# would be sent as a string instead.
para = {
    "keyWord": "",
    "center": "121.58131,31.1962",
    "isCountry": "",
    "provinceId": "",
    "cityId": "",
    "regionId": "",
    "areaRange": None,
    "manageYearRange": None,
    "type": "",
    "turnoverRange": None,
    "order": "",
    "orderType": "",
    "size": 10,
    "current": 1,
}

# requests applies no timeout by default, so an unresponsive server would
# block this script forever; with a timeout it raises
# requests.exceptions.Timeout instead.
response = requests.post(
    url=url,
    json=para,
    headers=headers,
    allow_redirects=True,
    timeout=10,
)
print(response.headers)
print(response.status_code)
print(response.text)

null值需要使用Python中的None来代替,参数缺一不可

img

如果有帮助,请点个采纳