问题遇到的现象和发生背景
我是通过asyncio异步爬取m3u8文件,总共684条,运行过程到680条左右就不动了,过一会就报asyncio.exceptions.TimeoutError超时错误
问题相关代码,请勿粘贴截图
import requests
import re
import asyncio
import aiohttp
import aiofiles
import time
from bs4 import BeautifulSoup
import os
def get_first_m3u8_url(url):
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
div = page.find('div', attrs={'class': "stui-player__video clearfix"})
div = str(div)
obj = re.compile(r'url=(?P.*?)" ' , re.S)
first_m3u8_url = obj.search(div).group("url")
return first_m3u8_url
def get_second_m3u8_url(url):
resp = requests.get(url)
m3u8_domain = url.replace('index.m3u8', '')
second_m3u8_url = resp.text.split('AUdT3fzc/')[1].strip()
second_m3u8_url = m3u8_domain + second_m3u8_url
return second_m3u8_url
def download_m3u8_file(url):
resp = requests.get(url)
with open('斗破苍穹.m3u8', mode='wb') as f:
f.write(resp.content)
print('下载完毕')
async def download_ts(url,session,n,sem):
n = str(n)
n = n.zfill(4)
async with sem:
async with session.get(url) as resp:
async with aiofiles.open(f"斗破苍穹动漫/{n}" + '.ts', mode='wb') as f:
await f.write(await resp.content.read())
print(f"{url}下载完毕")
async def dowmload_ts_file():
tasks = []
n = 1
# timeout = aiohttp.ClientTimeout(total=1000)
sem = asyncio.Semaphore(5)
# connector = aiohttp.TCPConnector(limit=10)
async with aiohttp.ClientSession() as session:
async with aiofiles.open('斗破苍穹.m3u8', mode='r') as f:
async for line in f:
line = line.strip()
if line.startswith('#'):
continue
task = asyncio.create_task(download_ts(line, session, n, sem))
tasks.append(task)
n += 1
await asyncio.wait(tasks)
def main(url):
first_m3u8_url =get_first_m3u8_url(url)
second_m3u8_url = get_second_m3u8_url(first_m3u8_url)
download_m3u8_file(second_m3u8_url)
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)
# loop.run_until_complete(dowmload_ts_file())
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(dowmload_ts_file())
if __name__ == '__main__':
url = 'https://www.9y9y.com/vodplay/19137-1-1.html'
main(url)
运行结果及报错内容
asyncio.exceptions.TimeoutError
我的解答思路和尝试过的方法
尝试过Semaphore控制并发数,还有TCPConnector限制连接数,还有超时时间设置都不行
我想要达到的结果
成功运行