How do I convert this Python crawler to multithreading?
```python
import parsel
import requests
import os

url = "****"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
}
response = requests.get(url=url, headers=headers)
html_str = response.text
selector = parsel.Selector(html_str)
lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
for li in lis:
    pic_title = li.xpath('.//h2/a/text()').get()
    pic_url = li.xpath('.//h2/a/@href').get()
    print('Downloading album:', pic_title)
    if not os.path.exists('img\\' + pic_title):
        os.mkdir('img\\' + pic_title)
    response_pic = requests.get(url=pic_url, headers=headers).text
    selector_2 = parsel.Selector(response_pic)
    pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
    # print(pic_url_list)
    for pic_url in pic_url_list:
        img_data = requests.get(url=pic_url, headers=headers).content
        file_name = pic_url.split('/')[-1]
        with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
            f.write(img_data)
        print('Saved:', file_name)
```
The thread-conversion approaches above won't work here, will they? The I/O in the OP's code happens inside the loops, so shouldn't the multithreading be applied to the loop itself?

## Changed one of the loops, give it a try:
```python
import parsel
import requests
import os
from multiprocessing.dummy import Pool  # dummy.Pool is a thread pool with the multiprocessing API

url = "****"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
}
response = requests.get(url=url, headers=headers)
html_str = response.text
selector = parsel.Selector(html_str)
lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
for li in lis:
    pic_title = li.xpath('.//h2/a/text()').get()
    pic_url = li.xpath('.//h2/a/@href').get()
    print('Downloading album:', pic_title)
    if not os.path.exists('img\\' + pic_title):
        os.mkdir('img\\' + pic_title)
    response_pic = requests.get(url=pic_url, headers=headers).text
    selector_2 = parsel.Selector(response_pic)
    pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
    # print(pic_url_list)

    def saveimg(pic_url):
        img_data = requests.get(url=pic_url, headers=headers).content
        file_name = pic_url.split('/')[-1]
        with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
            f.write(img_data)
        print('Saved:', file_name)

    # download this album's images with 10 worker threads
    p = Pool(10)
    result = p.map(saveimg, pic_url_list)
    p.close()
    p.join()
```
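A design note on the block above: `saveimg` is defined inside the `for` loop, so it is re-created for every album and reads `pic_title` through a closure. A hedged sketch of a variant (my code, not the answerer's) that hoists the worker out of the loop and passes the album title explicitly with each URL:

```python
from multiprocessing.dummy import Pool

def saveimg(task):
    # worker: download one image and save it under its album folder
    pic_title, pic_url = task  # unpack (album title, image URL)
    img_data = requests.get(url=pic_url, headers=headers).content
    file_name = pic_url.split('/')[-1]
    with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
        f.write(img_data)
    print('Saved:', file_name)

# inside the album loop: build (title, url) tasks, then map them onto the pool
p = Pool(10)
p.map(saveimg, [(pic_title, u) for u in pic_url_list])
p.close()
p.join()
```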
For reference:
```python
import threading

# at most 20 threads alive at once
thread_max = threading.BoundedSemaphore(20)

def muilt_begin(func, lis: list):
    thread_list = []
    for each in lis:
        thread_max.acquire()  # block until a thread slot is free
        m = MyThread(func, each)
        m.start()
        thread_list.append(m)
    for m in thread_list:
        m.join()

class MyThread(threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        try:
            self.func(self.args)
        finally:
            thread_max.release()  # free the slot even if func raises
```
Write the core part as a function, then call `muilt_begin` to run it.
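For example, a minimal sketch (my code; `download_img` is a hypothetical worker built from the inner loop of the original crawler, and `headers`, `pic_title`, and `pic_url_list` come from there):

```python
def download_img(pic_url):
    # hypothetical worker: the body of the original inner loop
    img_data = requests.get(url=pic_url, headers=headers).content
    file_name = pic_url.split('/')[-1]
    with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
        f.write(img_data)
    print('Saved:', file_name)

# run up to 20 downloads concurrently (limited by thread_max)
muilt_begin(download_img, pic_url_list)
```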
```python
import _thread
import time
import parsel
import requests
import os

# define a function for the threads to run
def spider(threadName):
    url = "****"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
    }
    response = requests.get(url=url, headers=headers)
    html_str = response.text
    selector = parsel.Selector(html_str)
    lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
    for li in lis:
        pic_title = li.xpath('.//h2/a/text()').get()
        pic_url = li.xpath('.//h2/a/@href').get()
        print('Downloading album:', pic_title)
        if not os.path.exists('img\\' + pic_title):
            os.mkdir('img\\' + pic_title)
        response_pic = requests.get(url=pic_url, headers=headers).text
        selector_2 = parsel.Selector(response_pic)
        pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
        # print(pic_url_list)
        for pic_url in pic_url_list:
            img_data = requests.get(url=pic_url, headers=headers).content
            file_name = pic_url.split('/')[-1]
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(img_data)
            print('Saved:', file_name)

# start two threads; note the args must be a tuple: ("Thread-1",), not ("Thread-1")
# note: both threads run the same spider over the same URL, so every image is downloaded twice
try:
    _thread.start_new_thread(spider, ("Thread-1",))
    _thread.start_new_thread(spider, ("Thread-2",))
except:
    print("Error: unable to start thread")

# keep the main thread alive, otherwise the process exits and kills the threads
while 1:
    time.sleep(1)
```
The perfect combination of coroutines and processes:
```python
import asyncio
import httpx
from aiomultiprocess import Pool

async def get(url):
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        return resp.text

async def main():
    urls = [url1, url2, url3]  # placeholder URLs
    async with Pool() as pool:
        async for result in pool.map(get, urls):
            print(result)  # the content returned by each URL

if __name__ == '__main__':
    asyncio.run(main())
```
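Applied to this crawler, the same idea would fetch all of an album's images concurrently with httpx and then write them to disk. A minimal sketch under that assumption (my adaptation; `fetch_bytes` and `download_album` are hypothetical names, and `pic_url_list`, `pic_title`, and `headers` come from the original code):

```python
import asyncio
import httpx

async def fetch_bytes(client, pic_url):
    # download one image's raw bytes
    resp = await client.get(pic_url)
    return pic_url, resp.content

async def download_album(pic_url_list, pic_title):
    async with httpx.AsyncClient(headers=headers) as client:
        # start all downloads concurrently, then write results as they arrive
        tasks = [fetch_bytes(client, u) for u in pic_url_list]
        for pic_url, img_data in await asyncio.gather(*tasks):
            file_name = pic_url.split('/')[-1]
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(img_data)
            print('Saved:', file_name)

# inside the album loop:
# asyncio.run(download_album(pic_url_list, pic_title))
```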
```python
import parsel
import requests
import os
from concurrent.futures import ThreadPoolExecutor

url = "****"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
}

def get_content(the_url: str):
    # fetch the raw bytes of one image
    img_data = requests.get(url=the_url, headers=headers).content
    return img_data

response = requests.get(url=url, headers=headers)
html_str = response.text
selector = parsel.Selector(html_str)
lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
for li in lis:
    pic_title = li.xpath('.//h2/a/text()').get()
    pic_url = li.xpath('.//h2/a/@href').get()
    print('Downloading album:', pic_title)
    if not os.path.exists('img\\' + pic_title):
        os.mkdir('img\\' + pic_title)
    response_pic = requests.get(url=pic_url, headers=headers).text
    selector_2 = parsel.Selector(response_pic)
    pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
    # print(pic_url_list)
    with ThreadPoolExecutor(50) as t:
        # submit all downloads first so they run concurrently
        futures = {pic_url: t.submit(get_content, pic_url) for pic_url in pic_url_list}
        # then write each result once its future completes
        for pic_url, future in futures.items():
            file_name = pic_url.split('/')[-1]
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(future.result())
            print('Saved:', file_name)
```
This opens a thread pool and wraps the statement that fetches the binary data into a function for the pool to call, with 50 threads here. Note that `submit()` returns a Future, so the bytes are read back with `future.result()` before writing, as shown above.
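An alternative sketch (my variant, not the answerer's code): let each worker both download and save, and drive everything with `executor.map` so no Future handling is needed. It assumes `headers`, `pic_title`, and `pic_url_list` from the code above:

```python
from concurrent.futures import ThreadPoolExecutor

def download_and_save(pic_url):
    # worker does the whole job: fetch bytes, then write to disk
    img_data = requests.get(url=pic_url, headers=headers).content
    file_name = pic_url.split('/')[-1]
    with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
        f.write(img_data)
    print('Saved:', file_name)

with ThreadPoolExecutor(50) as t:
    # map blocks until every URL in pic_url_list has been processed
    list(t.map(download_and_save, pic_url_list))
```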
The basic idea of multithreading is like this; rework your code along these lines:
```python
import parsel
import requests
import os
import threading
import time

def do_something():
    url = "****"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
    }
    response = requests.get(url=url, headers=headers)
    html_str = response.text
    selector = parsel.Selector(html_str)
    lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
    for li in lis:
        pic_title = li.xpath('.//h2/a/text()').get()
        pic_url = li.xpath('.//h2/a/@href').get()
        print('Downloading album:', pic_title)
        if not os.path.exists('img\\' + pic_title):
            os.mkdir('img\\' + pic_title)
        response_pic = requests.get(url=pic_url, headers=headers).text
        selector_2 = parsel.Selector(response_pic)
        pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
        # print(pic_url_list)
        for pic_url in pic_url_list:
            img_data = requests.get(url=pic_url, headers=headers).content
            file_name = pic_url.split('/')[-1]
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(img_data)
            print('Saved:', file_name)

start = time.perf_counter()  # time the whole run
thread1 = threading.Thread(target=do_something)
thread2 = threading.Thread(target=do_something)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
finish = time.perf_counter()
print(f'Finished in {finish - start:.2f} seconds')
```
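One caveat with the answers that start two threads over the same function: both threads crawl identical pages, so every image is downloaded twice instead of the work being split. A hedged sketch of actually dividing the albums between threads (my code; `split_spider` is a hypothetical variant of `do_something` that takes only a slice of the article list `lis`):

```python
import threading

def split_spider(articles):
    # hypothetical variant of do_something: process only the given articles
    for li in articles:
        ...  # same album/image download logic as above

# divide the article list into two halves, one per thread
mid = len(lis) // 2
threads = [
    threading.Thread(target=split_spider, args=(lis[:mid],)),
    threading.Thread(target=split_spider, args=(lis[mid:],)),
]
for th in threads:
    th.start()
for th in threads:
    th.join()
```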