用异步协程方式爬取云播TV某部电视剧(仅用于个人学习),一共1194个ts文件,总有十几二十个ts文件无法下载,报错显示主机中软件终止了一个已建立的连接。
import asyncio
import requests
import aiohttp
import aiofiles
import time
import re
def main():
head = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
"Referer": "https://player.yunbtv.net/"
}
m3u8_url = get_m3u8_url()
# "https://vod8.wenshibaowenbei.com/20210917/l7VYklhA/index.m3u8"
get_m3u8_file(m3u8_url)
asyncio.run(get_video())
def get_m3u8_url():
com = re.compile(r'"link_pre":"","url":"(?P<m3u8_url>.*?)","url_next', re.S)
url = "https://www.yunbtv.net/vodplay/youyuyouxi-1-1.html"
requ_1 = requests.get(url)
m3u8_url = com.search(requ_1.text).group("m3u8_url").replace("\\", "")
return m3u8_url
def get_m3u8_file(m3u8_url):
requ = requests.get(m3u8_url)
with open("鱿鱼游戏/m3u8/1.m3u8", mode="wb") as f_1:
f_1.write(requ.content)
requ.close()
with open("鱿鱼游戏/m3u8/1.m3u8", mode="r", encoding="utf-8") as f_2:
for line in f_2:
line = line.strip()
if line.startswith("#"):
continue
index_url = m3u8_url.split("/20210917")[0]+line # /20210917/l7VYklhA//1000kb/hls/index.m3u8
# "https://vod8.wenshibaowenbei.com/20210917/l7VYklhA//1000kb/hls/index.m3u8"
with open("鱿鱼游戏/m3u8/1.m3u8", mode="wb") as f_3:
f_3.write(requests.get(index_url).content)
async def get_video():
tasks = []
i = 1
# 将默认并发量由100降低至50
connector = aiohttp.TCPConnector(limit=50)
# 将默认超时时间提高至600秒
timeout = aiohttp.ClientTimeout(total=600)
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
async with aiofiles.open("鱿鱼游戏/m3u8/1.m3u8", mode="r", encoding="utf-8") as f_1:
async for line in f_1:
line = line.strip()
if line.startswith("#"):
continue
else:
task = asyncio.create_task(get_download_ts(session, line, i))
tasks.append(task)
i = i+1
await asyncio.wait(tasks)
async def get_download_ts(session, line, i):
# 将默认并发量由100降低至50
connector = aiohttp.TCPConnector(limit=50)
# 将默认超时时间提高至600秒
timeout = aiohttp.ClientTimeout(total=600)
# async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
requ = await session.get(line)
async with aiofiles.open("鱿鱼游戏/ts/第1集/{}.ts".format(i), mode="wb") as f:
await f.write(await requ.content.read())
print("第1集/{}.ts下载完成---".format(i))
time.sleep(0.3)
if __name__ == "__main__":
main()
已有以前多个ts文件可以正常下载,总有十几二十个ts文件无法下载并报错如下:
Task exception was never retrieved
future: <Task finished coro=<get_download_ts() done, defined at C:/python/Practice/爬虫第一阶段/08爬取云播TV鱿鱼游戏/08爬取云播TV鱿鱼游戏学习版.py:78> exception=ClientOSError(10053, '你的主机中的软件中止了一个已建立的连接。', None, 10053, None)>
Traceback (most recent call last):
File "C:/python/Practice/爬虫第一阶段/08爬取云播TV鱿鱼游戏/08爬取云播TV鱿鱼游戏学习版.py", line 84, in get_download_ts
requ = await session.get(line)
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\client.py", line 559, in _request
await resp.start(conn)
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\client_reqrep.py", line 898, in start
message, payload = await protocol.read() # type: ignore[union-attr]
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\streams.py", line 616, in read
await self._waiter
aiohttp.client_exceptions.ClientOSError: [WinError 10053] 你的主机中的软件中止了一个已建立的连接。
Task exception was never retrieved
future: <Task finished coro=<get_download_ts() done, defined at C:/python/Practice/爬虫第一阶段/08爬取云播TV鱿鱼游戏/08爬取云播TV鱿鱼游戏学习版.py:78> exception=ServerDisconnectedError('Server disconnected')>
Traceback (most recent call last):
File "C:/python/Practice/爬虫第一阶段/08爬取云播TV鱿鱼游戏/08爬取云播TV鱿鱼游戏学习版.py", line 84, in get_download_ts
requ = await session.get(line)
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\client.py", line 559, in _request
await resp.start(conn)
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\client_reqrep.py", line 898, in start
message, payload = await protocol.read() # type: ignore[union-attr]
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\streams.py", line 616, in read
await self._waiter
aiohttp.client_exceptions.ServerDisconnectedError: Server disconnected
Task exception was never retrieved
future: <Task finished coro=<get_download_ts() done, defined at C:/python/Practice/爬虫第一阶段/08爬取云播TV鱿鱼游戏/08爬取云播TV鱿鱼游戏学习版.py:78> exception=TimeoutError()>
Traceback (most recent call last):
File "C:/python/Practice/爬虫第一阶段/08爬取云播TV鱿鱼游戏/08爬取云播TV鱿鱼游戏学习版.py", line 86, in get_download_ts
await f.write(await requ.content.read())
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\streams.py", line 375, in read
block = await self.readany()
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\streams.py", line 397, in readany
await self._wait("readany")
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\streams.py", line 304, in _wait
await waiter
File "C:\python\Practice\爬虫第一阶段\venv\lib\site-packages\aiohttp\helpers.py", line 721, in __exit__
raise asyncio.TimeoutError from None
concurrent.futures._base.TimeoutError
1、降低默认并发量
2、提高默认超时时间
3、time.sleep(1),休眠
4、采用线程池已圆满完成
1、用异步协程的方式解决这个问题
2、搞清楚协程方式下载出现问题的原因,解决的方法,类似的问题
3、指出我在使用协程时出现的问题
3、请不要建议使用多线程了!
在学习协程相关的内容,感谢指教!
将并发量降低到5试试,你是用同一个IP并发,并发太多会被反爬