Why is my Python coroutine (asyncio) crawler running much slower than expected?

The problem and background

I've recently been learning Python web scraping and am currently trying to collect movie information from Dianying Tiantang (dytt8.net). I wanted to compare a multithreaded crawler with a coroutine-based one, each scraping three pages of movie listings. The multithreaded version performs as expected, but the coroutine version is far slower than I anticipated. From the material I found online, a coroutine crawler should even outperform a multithreaded one, yet in my tests the coroutine version is about as slow as a plain synchronous crawler. I'd like to know whether there is a mistake in how I wrote the coroutine code, or whether something else is causing this.
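
To make sure I understand what asyncio is supposed to give me, here is a minimal standalone sketch I put together (it has nothing to do with the crawler itself; non_blocking_task and blocking_task are names I made up for the sketch). Three coroutines that await asyncio.sleep(1) overlap and finish in about one second, while three coroutines that call the blocking time.sleep(1) run one after another and take about three seconds:

import asyncio
import time


async def non_blocking_task():
    await asyncio.sleep(1)    # yields to the event loop, so the three tasks overlap


async def blocking_task():
    time.sleep(1)             # never yields, so the tasks run one after another


async def run_three(make_task, label):
    t0 = time.time()
    await asyncio.gather(*(make_task() for _ in range(3)))
    print(f"{label}: {time.time() - t0:.1f}s")


async def demo():
    await run_three(non_blocking_task, "await asyncio.sleep")   # ~1 s
    await run_three(blocking_task, "blocking time.sleep")       # ~3 s


if __name__ == "__main__":
    asyncio.run(demo())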

Relevant code

# Coroutine crawler

import datetime
import time
import aiohttp
import httpx
import requests
import asyncio
import csv
from bs4 import BeautifulSoup
from datetime import datetime


def find_id(movie_href: str):  # extract the numeric ID from the movie's URL
    char1 = '.'
    char2 = '/'
    npos1 = movie_href.rfind(char1)
    npos2 = movie_href.rfind(char2)
    ID = movie_href[npos2 + 1:npos1]
    return ID


head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]
# domain = "https://m.dytt8.net/index2.htm"  # Dianying Tiantang home page
# url = "https://m.dytt8.net/html/gndy/dyzz/index.html"  # latest movies listing
# url = "https://m.dytt8.net/html/gndy/dyzz/list_23_1.html"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 '
                  'Safari/537.36',
    'Connection': 'close',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'referer': ''}
httpx.DEFAULT_RETRIES = 5  # meant to allow 5 retries, but httpx does not read this attribute, so it has no effect
data_list = []
client = httpx.AsyncClient(timeout=None,)  # shared async HTTP client, no timeout
count = 0


async def craw_one_page(i):

    url = "https://m.dytt8.net/html/gndy/dyzz/list_23_" + str(i) + ".html"
    print(f"开始爬取第{i}页,时间:{start}")
    flag = True
    # while flag:
    #     try:
    #         response = await client.get(url, headers=headers)
    #         # await asyncio.sleep(0.01)
    #         flag = False
    #     except Exception as e:
    #         print("频繁访问!"+str(e))
    #         flag = True
    response = await client.get(url, headers=headers)
    await fetch_content(response)


async def fetch_content(response):
    global count
    # response = await client.get(url, headers=headers)
    response.encoding = 'gb2312'  # the site serves gb2312-encoded pages
    homepage = response.text  # listing page HTML
    soup = BeautifulSoup(homepage, "lxml")
    movies_list = soup.find_all('a', class_='ulink')  # all movie links on the current page

    for movie in movies_list:
        name = movie.text
        href = "https://m.dytt8.net" + movie['href']
        ID = find_id(href)
        resp = requests.get(href, headers=headers)  # detail page fetched with the blocking requests library
        resp.encoding = 'gb2312'
        soup2 = BeautifulSoup(resp.text, "lxml")
        Magnet_URI = soup2.select_one('#Zoom > td > a')['href']
        updated_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        movie_dict = {
            "ID": ID,
            "电影名称": name,
            "网页": href,
            "磁链": Magnet_URI,
            "更新时间": updated_time}
        # data_list.append(movie_dict)
        print(movie_dict)
        count += 1


async def main():
    arr = [1, 2, 3]
    task = [craw_one_page(i) for i in arr]
    await asyncio.gather(*task)

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())
    # print(data_list)
    print(f"耗时{(time.time()-start)}秒")
    print(f"爬取电影共计{count}条")


# Multithreaded crawler

import datetime
import time
import requests
import threading
import csv
from bs4 import BeautifulSoup
from datetime import datetime

count = [0, 0, 0]


def find_id(movie_href: str):  # extract the numeric ID from the movie's URL
    char1 = '.'
    char2 = '/'
    npos1 = movie_href.rfind(char1)
    npos2 = movie_href.rfind(char2)
    ID = movie_href[npos2 + 1:npos1]
    return ID


head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]
# domain = "https://m.dytt8.net/index2.htm"  # Dianying Tiantang home page
# url = "https://m.dytt8.net/html/gndy/dyzz/index.html"  # latest movies listing
# url = "https://m.dytt8.net/html/gndy/dyzz/list_23_1.html"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 '
                  'Safari/537.36',
    'Connection': 'close',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'referer': ''}
requests.DEFAULT_RETRIES = 5  # meant to allow 5 retries, but requests does not read this attribute, so it has no effect
data_list = []


def craw_one_page(i):
    global count
    # for i in range(1, 2):
    url = "https://m.dytt8.net/html/gndy/dyzz/list_23_" + str(i) + ".html"
    start = time.time()
    print(f"开始爬取第{i}页,时间:{start}")
    response = requests.get(url, headers=headers)
    time.sleep(0.3)
    response.encoding = 'gb2312'  # the site serves gb2312-encoded pages
    homepage = response.text  # listing page HTML
    soup = BeautifulSoup(homepage, "lxml")
    movies_list = soup.find_all('a', class_='ulink')  # all movie links on the current page

    for movie in movies_list:
        name = movie.text
        href = "https://m.dytt8.net" + movie['href']
        ID = find_id(href)
        resp = requests.get(href, headers=headers)
        resp.encoding = 'gb2312'
        soup2 = BeautifulSoup(resp.text, "lxml")
        Magnet_URI = soup2.select_one('#Zoom > td > a')['href']
        updated_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        movie_dict = {
            "ID": ID,
            "电影名称": movie.text,
            "网页": href,
            "磁链": Magnet_URI,
            "更新时间": updated_time}
        print(movie_dict)
        count[i - 1] += 1
        data_list.append(movie_dict)
    f = open('test.csv', mode='w', encoding="gb2312", newline="")  # each thread reopens test.csv in 'w' mode and rewrites everything collected so far
    with f:
        w = csv.DictWriter(f, head_list)
        w.writeheader()
        w.writerows(data_list)
    over = time.time()
    print(f"第{i}页爬取完毕,{count[i-1]}条数据,耗时{(over-start)}")


if __name__ == '__main__':
    Sum = 0
    t1 = threading.Thread(target=craw_one_page, args=(1,))
    t2 = threading.Thread(target=craw_one_page, args=(2,))
    t3 = threading.Thread(target=craw_one_page, args=(3,))
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()
    for i in range(len(count)):
        Sum += count[i]
    print(f"总共{Sum}条数据")

#         data_list.append(movie_dict)
# f = open('test.csv', mode='w', encoding="gb2312", newline="")
# with f:
#     w = csv.DictWriter(f, head_list)
#     w.writeheader()
#     w.writerows(data_list)
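
One more thing I'm unsure about in the multithreaded version: every thread opens test.csv in 'w' mode and rewrites the whole data_list while the other threads may still be appending to it, so the file ends up with whatever the last thread to finish happened to see. Here is a sketch of how I think the write could be pulled out of the worker threads; add_rows and write_csv_once are names made up for this sketch, not part of the code above.

import csv
import threading

head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]  # same header row as above
data_list = []  # rows collected by all threads
data_lock = threading.Lock()


def add_rows(rows):
    # worker threads only append their rows, protected by a lock
    with data_lock:
        data_list.extend(rows)


def write_csv_once(path='test.csv'):
    # the main thread writes the file a single time, after every join()
    with open(path, mode='w', encoding="gb2312", newline="") as f:
        w = csv.DictWriter(f, head_list)
        w.writeheader()
        w.writerows(data_list)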

Results and error output

The coroutine crawler takes roughly 70-90 seconds, which is essentially the same as a plain synchronous crawler, so the coroutines aren't buying me anything, while the multithreaded crawler finishes in roughly 20-30 seconds. On top of that, the coroutine crawler sometimes raises errors after it has been running for a while.

My approach and what I've tried

I've posted both the coroutine and the multithreaded crawler code above, and I'd appreciate someone looking at whether the problem is in the code itself.
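
About the intermittent errors mentioned above: the commented-out while/try block in craw_one_page was my attempt to retry when a request fails. Below is a cleaner sketch of what I was trying to do, assuming the failures are raised by the request call itself; get_with_retry is a name made up for this sketch, not part of my code.

import asyncio

import httpx


async def get_with_retry(client: httpx.AsyncClient, url: str, headers: dict, retries: int = 5):
    # retry a GET a few times, backing off briefly between attempts
    for attempt in range(1, retries + 1):
        try:
            return await client.get(url, headers=headers)
        except httpx.HTTPError as e:
            print(f"request to {url} failed ({e!r}), attempt {attempt}/{retries}")
            await asyncio.sleep(1)
    raise RuntimeError(f"giving up on {url} after {retries} attempts")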

The result I want

I'd like the coroutine crawler to run about as fast as, or faster than, the multithreaded one, since from the material I've read online that's what should be expected.