# Python crawler: scrape 100 proxy IP addresses from the proxy-listing
# site 89ip.cn and save them to a txt file, one per line, in the form:
#   http://192.168.80.99:8081/
import requests
from lxml import etree
# Browser-like request headers so the site serves the normal HTML page
# instead of blocking the scraper as a bot.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
}
# Session cookies captured from a real browser visit; the WAF cookie is
# what lets the request through the site's anti-bot front end.
cookies = {
    "https_waf_cookie": "330c21d5-643c-4214c7be0b070c874422814d0c8ad1ef9128",
    "Hm_lvt_f9e56acddd5155c92b9b5499ff966848": "1668483002",
    "Hm_lpvt_f9e56acddd5155c92b9b5499ff966848": "1668483002",
}
def _extract_proxies(page_html):
    """Parse the 89ip.cn proxy table and return 'http://host:port' strings.

    Column 1 of each table row is the IP, column 2 the port; both carry
    surrounding whitespace in the raw page, hence the strip().
    """
    tree = etree.HTML(page_html)
    hosts = tree.xpath('//tbody/tr/td[1]/text()')
    ports = tree.xpath('//tbody/tr/td[2]/text()')
    return ['http://' + h.strip() + ':' + p.strip() for h, p in zip(hosts, ports)]


url = "https://www.89ip.cn/"
# timeout prevents the script from hanging forever on a dead connection;
# raise_for_status fails loudly instead of parsing an error page.
response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
response.raise_for_status()
proxies = _extract_proxies(response.content.decode())
# Explicit encoding so the output file is byte-stable across platforms.
with open('text.txt', 'w', encoding='utf-8') as f:
    for proxy in proxies:
        f.write(proxy + '\n')