How can I convert this Python crawler to multithreading?

```python
import parsel
import requests
import os


url = "****"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
}

response = requests.get(url=url, headers=headers)
html_str = response.text
selector = parsel.Selector(html_str)
lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
for li in lis:
    pic_title = li.xpath('.//h2/a/text()').get()
    pic_url = li.xpath('.//h2/a/@href').get()
    print('Downloading album:', pic_title)

    if not os.path.exists('img\\' + pic_title):
        os.mkdir('img\\' + pic_title)

    response_pic = requests.get(url=pic_url, headers=headers).text

    selector_2 = parsel.Selector(response_pic)
    pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
    # print(pic_url_list)
    for pic_url in pic_url_list:
        img_data = requests.get(url=pic_url, headers=headers).content

        file_name = pic_url.split('/')[-1]
        with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
            f.write(img_data)
            print('Saved:', file_name)
```

The threading refactors in the replies above won't really work here. The I/O in the OP's code happens inside the loop, so shouldn't the multithreading be applied to the loop itself?
## Reworked one of the loops, give it a try


```python

import parsel
import requests
import os
from multiprocessing.dummy import Pool


url = "****"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
}

response = requests.get(url=url, headers=headers)
html_str = response.text
selector = parsel.Selector(html_str)
lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
for li in lis:
    pic_title = li.xpath('.//h2/a/text()').get()
    pic_url = li.xpath('.//h2/a/@href').get()
    print('Downloading album:', pic_title)

    if not os.path.exists('img\\' + pic_title):
        os.mkdir('img\\' + pic_title)

    response_pic = requests.get(url=pic_url, headers=headers).text

    selector_2 = parsel.Selector(response_pic)
    pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]

    # Download this album's images with a pool of 10 threads instead of a serial loop.
    def saveimg(pic_url):
        img_data = requests.get(url=pic_url, headers=headers).content
        file_name = pic_url.split('/')[-1]
        with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
            f.write(img_data)
            print('Saved:', file_name)

    p = Pool(10)
    result = p.map(saveimg, pic_url_list)
    p.close()
    p.join()

```
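
For context: `multiprocessing.dummy.Pool` is a thread pool that mirrors the `multiprocessing.Pool` API, so `p.map(saveimg, pic_url_list)` fans one album's downloads out over 10 threads. Since a fresh pool is built for every album, hoisting a single pool above the outer loop would avoid the repeated setup.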

You can refer to this:

```python
import threading

# Cap the number of concurrently running threads at 20.
thread_max = threading.BoundedSemaphore(20)


def muilt_begin(func, lis: list):
    thread_list = []
    for each in lis:
        thread_max.acquire()  # blocks once 20 threads are already running
        m = MyThread(func, each)
        m.start()
        thread_list.append(m)
    for m in thread_list:
        m.join()


class MyThread(threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        self.func(self.args)
        thread_max.release()  # free a slot for the next thread
```

Write the core logic as a function, then pass that function to `muilt_begin` to run it, as in the sketch below.
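
A minimal usage sketch, assuming the helper above is in scope; `download_img` and `url_list` are illustrative names, not part of the original answer:

```python
def download_img(pic_url):
    # Core logic for one item: fetch the image and write it to disk,
    # as in the inner loop of the original code.
    print('would download:', pic_url)

url_list = ["****", "****"]  # placeholder URLs
muilt_begin(download_img, url_list)  # one thread per URL, at most 20 running at once
```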

```python
import _thread
import time
import parsel
import requests
import os

# Define a function for the threads to run
def spider(threadName):
    url = "****"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
    }

    response = requests.get(url=url, headers=headers)
    html_str = response.text
    selector = parsel.Selector(html_str)
    lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
    for li in lis:
        pic_title = li.xpath('.//h2/a/text()').get()
        pic_url = li.xpath('.//h2/a/@href').get()
        print('Downloading album:', pic_title)

        if not os.path.exists('img\\' + pic_title):
            os.mkdir('img\\' + pic_title)

        response_pic = requests.get(url=pic_url, headers=headers).text

        selector_2 = parsel.Selector(response_pic)
        pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
        for pic_url in pic_url_list:
            img_data = requests.get(url=pic_url, headers=headers).content

            file_name = pic_url.split('/')[-1]
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(img_data)
                print('Saved:', file_name)

# Create two threads. The args must be a tuple, hence the trailing comma.
# Note: as written, both threads crawl the same page, so the work is duplicated
# rather than split; normally each thread would get a different slice of URLs.
try:
    _thread.start_new_thread(spider, ("Thread-1",))
    _thread.start_new_thread(spider, ("Thread-2",))
except Exception:
    print("Error: unable to start thread")

# Keep the main thread alive so the worker threads can finish.
while 1:
    time.sleep(1)
```

The perfect combination of coroutines and processes:

```python
import asyncio
import httpx
from aiomultiprocess import Pool

async def get(url):
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        return resp.text


async def main():
    urls = ["****", "****", "****"]  # fill in the pages you want to fetch
    async with Pool() as pool:
        async for result in pool.map(get, urls):
            print(result)  # the body returned for each URL

if __name__ == '__main__':
    asyncio.run(main())
```
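
Applied to this thread's task, the same coroutine-plus-process pattern could fetch raw image bytes instead of page text. A minimal sketch under that assumption; `IMG_URLS` and `fetch_bytes` are illustrative names, and the real URL list would come from the parsel parsing step above:

```python
import asyncio
import os
import httpx
from aiomultiprocess import Pool

IMG_URLS = ["****"]  # placeholder: the image URLs scraped earlier

async def fetch_bytes(url):
    # Runs inside a worker process's event loop; returns (file name, raw bytes).
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        return url.split('/')[-1], resp.content

async def main():
    os.makedirs('img', exist_ok=True)
    async with Pool() as pool:
        async for file_name, data in pool.map(fetch_bytes, IMG_URLS):
            with open(os.path.join('img', file_name), 'wb') as f:
                f.write(data)
            print('Saved:', file_name)

if __name__ == '__main__':
    asyncio.run(main())
```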


```python
import parsel
import requests
import os
from concurrent.futures import ThreadPoolExecutor

url = "****"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
}


def get_content(the_url: str):
    # Fetch one image and return its raw bytes.
    img_data = requests.get(url=the_url, headers=headers).content
    return img_data


response = requests.get(url=url, headers=headers)
html_str = response.text
selector = parsel.Selector(html_str)
lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
for li in lis:
    pic_title = li.xpath('.//h2/a/text()').get()
    pic_url = li.xpath('.//h2/a/@href').get()
    print('Downloading album:', pic_title)

    if not os.path.exists('img\\' + pic_title):
        os.mkdir('img\\' + pic_title)

    response_pic = requests.get(url=pic_url, headers=headers).text

    selector_2 = parsel.Selector(response_pic)
    pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]

    with ThreadPoolExecutor(50) as t:
        # submit() returns a Future, not the bytes; submit everything first,
        # then collect the results so the downloads actually run in parallel.
        futures = []
        for pic_url in pic_url_list:
            file_name = pic_url.split('/')[-1]
            futures.append((file_name, t.submit(get_content, pic_url)))
        for file_name, future in futures:
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(future.result())
                print('Saved:', file_name)
```

This opens a thread pool and wraps the statement that fetches the binary data into a function for the pool to call; here it's 50 threads. (Note that `submit` returns a `Future`, so the bytes are retrieved with `.result()` before writing.)
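
For reference, `ThreadPoolExecutor.map` can express the same submit-then-collect flow more compactly. A minimal sketch, assuming the same `headers`, `pic_title`, and `pic_url_list` from the answer above are in scope; `download_one` is an illustrative name:

```python
import requests
from concurrent.futures import ThreadPoolExecutor

def download_one(pic_url):
    # Fetch one image, write it to the album folder, and return the file name.
    img_data = requests.get(url=pic_url, headers=headers).content
    file_name = pic_url.split('/')[-1]
    with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
        f.write(img_data)
    return file_name

with ThreadPoolExecutor(50) as t:
    # map() yields results in input order as the worker threads complete them.
    for file_name in t.map(download_one, pic_url_list):
        print('Saved:', file_name)
```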

The basic idea of multithreading works like this; rework your method along the same lines, as in the sketch below.
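
A minimal sketch of that basic pattern, assuming `pic_url_list` holds the image URLs scraped above (`save_img` is an illustrative name): move the per-image work into a function, start one `threading.Thread` per item, then join them all.

```python
import threading

pic_url_list = ["****", "****"]  # placeholder URLs

def save_img(pic_url):
    # Per-item work goes here: fetch the image and write it to disk,
    # exactly like the body of the original inner for-loop.
    print('would download:', pic_url)

# One thread per URL; start them all, then wait for them all.
threads = [threading.Thread(target=save_img, args=(u,)) for u in pic_url_list]
for t in threads:
    t.start()
for t in threads:
    t.join()
```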

Refactored code; use this as a reference:



```python
import parsel
import requests
import os
import threading
import time


def do_something():
    url = "****"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39 '
    }

    response = requests.get(url=url, headers=headers)
    html_str = response.text
    selector = parsel.Selector(html_str)
    lis = selector.xpath('//div[@id="primary"]/main/article')[1:]
    for li in lis:
        pic_title = li.xpath('.//h2/a/text()').get()
        pic_url = li.xpath('.//h2/a/@href').get()
        print('Downloading album:', pic_title)

        if not os.path.exists('img\\' + pic_title):
            os.mkdir('img\\' + pic_title)

        response_pic = requests.get(url=pic_url, headers=headers).text

        selector_2 = parsel.Selector(response_pic)
        pic_url_list = selector_2.xpath('//div[@class="entry-content"]//img/@src').getall()[1:-1]
        for pic_url in pic_url_list:
            img_data = requests.get(url=pic_url, headers=headers).content

            file_name = pic_url.split('/')[-1]
            with open(f'img\\{pic_title}\\{file_name}', mode='wb') as f:
                f.write(img_data)
                print('Saved:', file_name)


# Note: both threads run the identical crawl, so the work is duplicated rather
# than split; normally each thread would be given a different share of the URLs.
start = time.perf_counter()
thread1 = threading.Thread(target=do_something)
thread2 = threading.Thread(target=do_something)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
finish = time.perf_counter()
print(f'Finished in {finish - start:.2f} seconds')
```