如何在爬取网页数据时用多线程?(python)
可以使用concurrent.futures的ThreadPoolExecutor,用一个线程池执行异步调用。例:
import requests
from bs4 import BeautifulSoup as bs
import time
from concurrent.futures import ThreadPoolExecutor
# Session cookies captured from the browser's dev tools (placeholder values).
cookies = {
    '__cfduid': 'xxx',
    'PHPSESSID': 'xxx',
}
# Browser-like headers so the server treats the script as a normal client.
headers = {
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Referer': 'http://www.enagames.com/escape-game/top10newgames-rescue-the-tiger-2',
    'Accept-Language': 'en-US;q=0.6',
}
# SECURITY: verify=False disables TLS certificate verification — acceptable
# only for throwaway scraping of a known host, never for sensitive traffic.
response = requests.get('http://www.enagames.com/escape-game/top10newgames-rescue-the-tiger-2', headers=headers, cookies=cookies, verify=False)
soup = bs(response.text, 'html.parser')
# Set comprehension deduplicates the game links without building a temp list.
game_link = {x['href'] for x in soup.select(
    'div.home_game_image_featured center a')}
gl = list(game_link)
def get_swf(url):
    """Download the .swf flash file linked from one game page.

    :param url: game page URL; its last path segment names the output file
    """
    r = requests.get(url, headers=headers, cookies=cookies)
    s = bs(r.text, 'html.parser')
    # The flash file URL lives in the first <param> of the flash container.
    sl = s.select_one('div#flash_container1 object param')['value']
    try:
        req = requests.get(sl, headers=headers)
    except requests.RequestException:
        # Best-effort: skip this game on a failed download, but do NOT leave
        # an empty output file behind (the original opened the file first and
        # swallowed every exception with a bare except).
        return
    with open(f'./{url.split("/")[-1]}.swf', 'wb') as f:
        f.write(req.content)
# Fan the first five game links out over a small thread pool; leaving the
# context manager blocks until every submitted download has finished.
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(get_swf, link) for link in gl[:5]]
将你爬取的代码定义为一个函数,然后多线程执行这个函数,比如下面例子
def func(aa):
    """Worker body — replace with the actual scraping logic.

    :param aa: task index passed in from the spawning loop
    """
    pass

all_task = []
for i in range(10):
    # args must be a tuple: (i) is just the int i, which makes the thread
    # crash with "TypeError: ... argument ... is not iterable" when it starts.
    t = threading.Thread(target=func, args=(i,))
    t.daemon = True  # setDaemon() is deprecated since Python 3.10
    t.start()
    all_task.append(t)
# Make the main thread wait until every worker has finished.
for t in all_task:
    t.join()
print("in main: get page success")
给你写一个完整的例子
#模拟浏览器请求网页,获取网页数据
def get_html(html_url):
    """Fetch *html_url* with browser-like headers and return the response.

    :param html_url: page URL to request
    :return: the raw ``requests`` response object
    """
    return requests.get(url=html_url, headers=headers)
#解析网页数据
def get_par(html_data):
    """Wrap raw HTML text in a parsel Selector for CSS/XPath extraction.

    :param html_data: ``response.text`` of a fetched page
    :return: ``parsel.Selector`` built over *html_data*
    """
    return parsel.Selector(html_data)
#保存数据
def download(img_url, title):
    """Fetch one image and save it as a .jpg wallpaper.

    :param img_url: direct image URL
    :param title: file-name stem for the saved wallpaper
    """
    data = get_html(img_url).content
    # NOTE(review): assumes the 壁纸 directory already exists — TODO confirm.
    with open('壁纸\\' + title + '.jpg', mode='wb') as fp:
        fp.write(data)
    print('正在保存', title)
#主函数
def main(url):
    """Scrape one list page: follow every detail link, save each wallpaper.

    :param url: list-page URL
    """
    # fix: s_time was referenced below but never defined anywhere,
    # so the original raised NameError; start the timer here instead.
    s_time = time.time()
    html_data = get_html(url).text
    selector = get_par(html_data)
    lis = selector.css('.wb_listbox div dl dd a::attr(href)').getall()
    for li in lis:
        img_data = get_html(li).text
        img_selector = get_par(img_data)
        img_url = img_selector.css('.wb_showpic_main img::attr(src)').get()
        raw_title = img_selector.css('.wb_pictitle::text').get()
        if not img_url or not raw_title:
            # Skip malformed detail pages instead of crashing on None.strip().
            continue
        download(img_url, raw_title.strip())
    end_time = time.time() - s_time
    print(end_time)
#启动多线程
if __name__ == '__main__':
    threads = []
    # One worker thread per list page (pages 1..10).
    for page in range(1, 11):
        url = 'http://www.deskbizhi.com/min/list-{}.html'.format(page)
        worker = threading.Thread(target=main, args=(url,))
        worker.start()
        threads.append(worker)
    # fix: the original never joined its threads, so there was no way to
    # know when all pages had finished; wait for every worker here.
    for worker in threads:
        worker.join()
具体要看运行环境是 Windows 还是 Unix。Windows 下可参考以下文章:
https://www.cnblogs.com/johnyang/p/10885886.html
您好,我是有问必答小助手,您的问题已经有小伙伴解答了,您看下是否解决,可以追评进行沟通哦~
如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~
ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>>https://vip.csdn.net/askvip?utm_source=1146287632
非常感谢您使用有问必答服务,为了后续更快速的帮您解决问题,现诚邀您参与有问必答体验反馈。您的建议将会运用到我们的产品优化中,希望能得到您的支持与协助!
速戳参与调研>>>https://t.csdnimg.cn/Kf0y