When downloading a large amount of data with pagination in Python on a server, the download was interrupted. How can the download be resumed from where it left off instead of starting over?
import re
import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_batch(batch_url):
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)

url = 'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_name%3Afungi%29%20AND%20%28length%3A%5B401%20TO%20600%5D%29%20AND%20%28annotation_score%3A3%29&size=500'
progress = 0
with open('text.fasta', 'w') as f:
    for batch, total in get_batch(url):
        lines = batch.text.splitlines()
        if not progress:
            print(lines[0], file=f)
        for line in lines[1:]:
            print(line, file=f)
        progress += len(lines[1:])
        print(f'{progress} / {total}')
import re
import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')
re_search_total = re.compile(r'total=(\d+)')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_search_total(headers):
    # Prefer the X-Total-Results header; otherwise parse the total out of Content-Range.
    if "X-Total-Results" in headers:
        return int(headers["X-Total-Results"])
    else:
        match = re_search_total.search(headers["Content-Range"])
        if match:
            return int(match.group(1))

def get_batch(batch_url, start=0):
    # When resuming, ask the server to start from the given offset via a Range header.
    if start != 0:
        headers = {"Range": f"entries={start}-"}
    while batch_url:
        if start != 0:
            response = session.get(batch_url, headers=headers)
        else:
            response = session.get(batch_url)
        response.raise_for_status()
        total = get_search_total(response.headers)
        yield response, total
        next_link = get_next_link(response.headers)
        if next_link:
            start += total
            batch_url = next_link
        else:
            batch_url = None

def download_data(url):
    progress = 0
    # Append mode, so a rerun adds to the file instead of overwriting it.
    with open('text.fasta', 'a') as f:
        for batch, total in get_batch(url, start=progress):
            lines = batch.text.splitlines()
            if not progress:
                print(lines[0], file=f)
            for line in lines[1:]:
                print(line, file=f)
            progress += len(lines[1:])
            print(f'{progress} / {total}')
A new function, get_search_total, has been added to obtain the total number of search results. In the get_batch function, a Range header is used to fetch the data in batches, so that we do not request such a large amount of data at once that the request gets rejected. If the download is interrupted, we only need to call the download_data function again and it will continue from where the previous download stopped.
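For reference, a minimal usage sketch for this answer (the URL is the one from the question; note that download_data as written initialises progress to 0, so the Range header only takes effect if a caller passes its own saved offset to get_batch, and whether the server honours a Range header of the form entries=N- is an assumption of this answer that I have not verified):

# Hypothetical invocation of the answer's download_data; text.fasta is opened
# in append mode inside the function, so rerunning after an interruption adds
# to the existing file instead of overwriting it.
url = 'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_name%3Afungi%29%20AND%20%28length%3A%5B401%20TO%20600%5D%29%20AND%20%28annotation_score%3A3%29&size=500'
download_data(url)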
Based on GPT and my own thinking:
If the amount of data is large and the download is interrupted partway, you need to continue on top of what has already been downloaded. You can solve this by rerunning the code from the point where the previous download was interrupted. Concretely, before each iteration check whether the data up to the previous interruption has already been downloaded; if so, continue from that position, otherwise start from the beginning. Keep a variable that records how much has been downloaded and accumulate it inside the loop. When the program is interrupted, save that variable's value to a file so the next run can read it and continue the download. The modified code is as follows:
import re
import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_batch(batch_url):
    # Plain generator: follow the Link header from page to page.
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)

url = 'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_name%3Afungi%29%20AND%20%28length%3A%5B401%20TO%20600%5D%29%20AND%20%28annotation_score%3A3%29&size=500'

# Read how much data was downloaded in the previous run
try:
    with open('progress.txt', 'r') as p:
        progress = int(p.readline().strip())
except FileNotFoundError:
    progress = 0

# Continue the download: pages are fetched again from the start, but lines
# that were already written before the interruption are skipped.
seen = 0
with open('text.fasta', 'a') as f:
    for batch, total in get_batch(url):
        lines = batch.text.splitlines()
        if not progress:
            print(lines[0], file=f)
        for line in lines[1:]:
            seen += 1
            if seen <= progress:
                continue  # already written in a previous run
            print(line, file=f)
        progress = max(progress, seen)
        # Save the amount downloaded so far so the next run can resume from here
        with open('progress.txt', 'w') as p:
            p.write(str(progress))
        print(f'{progress} / {total}')
When the program is run, if some data has already been downloaded it detects this and continues from the previous position instead of starting from scratch. At the same time, it keeps writing the amount downloaded so far to progress.txt so that the next run can pick up the download again.
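One extra detail worth noting (my own addition, not part of the answers above): text.fasta and progress.txt are written separately, so a hard kill between the two writes can leave the counter slightly behind the file. A minimal sketch of a more careful checkpoint write, assuming the same file names as above, flushes the FASTA output before recording the new resume point and then replaces progress.txt in one step:

import os

def save_progress(f, progress, path='progress.txt'):
    # Make sure everything already written to text.fasta is on disk
    # before recording the new resume point.
    f.flush()
    os.fsync(f.fileno())
    # Write to a temporary file first, then rename it over progress.txt;
    # on POSIX the rename is atomic, so the file holds either the old
    # value or the new one, never a half-written number.
    tmp = path + '.tmp'
    with open(tmp, 'w') as p:
        p.write(str(progress))
    os.replace(tmp, path)

This helper could stand in for the block that opens progress.txt inside the download loop above.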