Linux / Python: how to resume a paginated data download from where it left off after an interruption

When a Python script on a server is downloading a large amount of data via pagination and the download is interrupted, how can it continue from where the previous download stopped instead of starting over?


import re
import requests
from requests.adapters import HTTPAdapter, Retry

# Regex that extracts the URL of the next page from the "Link" response header.
re_next_link = re.compile(r'<(.+)>; rel="next"')
# Retry transient server errors instead of aborting the whole download.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_next_link(headers):
    # The API advertises the next page as: Link: <url>; rel="next"
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_batch(batch_url):
    # Follow the pagination links until there is no next page.
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)


url = 'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_name%3Afungi%29%20AND%20%28length%3A%5B401%20TO%20600%5D%29%20AND%20%28annotation_score%3A3%29&size=500'
progress = 0
with open('text.fasta', 'w') as f:
    for batch, total in get_batch(url):
        lines = batch.text.splitlines()
        # The first line is written only for the first batch; later batches skip it.
        if not progress:
            print(lines[0], file=f)
        for line in lines[1:]:
            print(line, file=f)
        progress += len(lines[1:])
        print(f'{progress} / {total}')
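
For reference, the API reports the next page through the HTTP "Link" response header, which get_next_link parses with the regex above. A minimal check of that function (the cursor value in the URL below is made up for illustration):

# Illustrative "Link" header in the shape the regex above expects;
# the cursor parameter value is invented for this example.
headers = {
    "Link": '<https://rest.uniprot.org/uniprotkb/search?cursor=abc123&format=fasta&size=500>; rel="next"'
}
print(get_next_link(headers))
# -> https://rest.uniprot.org/uniprotkb/search?cursor=abc123&format=fasta&size=500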

A modified version that adds resume support (explained below):

import re
import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')
re_search_total = re.compile(r'total=(\d+)')

retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_search_total(headers):
    # Prefer the X-Total-Results header; otherwise fall back to parsing Content-Range.
    if "X-Total-Results" in headers:
        return int(headers["X-Total-Results"])
    else:
        match = re_search_total.search(headers["Content-Range"])
        if match:
            return int(match.group(1))

def get_batch(batch_url, start=0):
    # When resuming, ask the server to start at the given entry offset.
    if start != 0:
        headers = {"Range": f"entries={start}-"}
    while batch_url:
        if start != 0:
            response = session.get(batch_url, headers=headers)
        else:
            response = session.get(batch_url)
        response.raise_for_status()
        total = get_search_total(response.headers)
        yield response, total
        next_link = get_next_link(response.headers)
        if next_link:
            start += total
            batch_url = next_link
        else:
            batch_url = None

def download_data(url):
    progress = 0
    # Append mode: a rerun adds to the existing file instead of overwriting it.
    with open('text.fasta', 'a') as f:
        for batch, total in get_batch(url, start=progress):
            lines = batch.text.splitlines()
            if not progress:
                print(lines[0], file=f)
            for line in lines[1:]:
                print(line, file=f)
            progress += len(lines[1:])
            print(f'{progress} / {total}')

This version adds a new function, get_search_total, to obtain the total number of search results. In get_batch, the Range header is used to fetch the data in batches, so that a request is not rejected for trying to download too much data at once. If the download is interrupted, simply call download_data again and it will continue from where the previous download stopped.
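
A minimal usage sketch of the above, reusing the same search URL as the first script (text.fasta is the file download_data appends to):

# Same UniProt search URL as in the first script above.
url = 'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_name%3Afungi%29%20AND%20%28length%3A%5B401%20TO%20600%5D%29%20AND%20%28annotation_score%3A3%29&size=500'

# First run, or a rerun after an interruption: download_data opens text.fasta
# in append mode, so previously downloaded content is kept.
download_data(url)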

Drawing on GPT and my own approach:

If the amount of data is large and the download is interrupted partway, you need to continue on top of what has already been downloaded. One way to do this is to rerun the code from the position where the previous download stopped. Concretely, add a check before each loop iteration to decide whether the previously downloaded data is already there; if so, continue from that position, otherwise start from the beginning. Use a variable to record how much data has been downloaded and accumulate it inside the loop. When the program is interrupted, save the value of this variable to a file so that the next run can read it back and continue the download. The modified code looks like this:


import re
import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_batch(batch_url, progress=0):
    # Writes each batch to the module-level file object f and yields the
    # running progress so the caller can persist it after every batch.
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]

        lines = response.text.splitlines()
        if not progress:
            print(lines[0], file=f)
        for line in lines[1:]:
            print(line, file=f)
        progress += len(lines[1:])
        print(f'{progress} / {total}')

        yield progress, total
        batch_url = get_next_link(response.headers)


url = 'https://rest.uniprot.org/uniprotkb/search?format=fasta&query=%28taxonomy_name%3Afungi%29%20AND%20%28length%3A%5B401%20TO%20600%5D%29%20AND%20%28annotation_score%3A3%29&size=500'

with open('text.fasta', 'a') as f:
    # Read how much data was downloaded previously
    try:
        with open('progress.txt', 'r') as p:
            progress = int(p.readline().strip())
    except FileNotFoundError:
        progress = 0

    # Continue the download
    for progress, total in get_batch(url, progress):
        # Save the amount downloaded so far so the next run can resume from it
        with open('progress.txt', 'w') as p:
            p.write(str(progress))

When the program runs, if data was already downloaded before, it continues from the previous position rather than starting from scratch. At the same time, it keeps writing the amount of data downloaded so far to the progress.txt file, so that the next run can pick up the download from there.
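
As a complementary sketch of the idea above (saving the counter when the program is interrupted), the driver loop can also catch Ctrl-C and flush the progress value before exiting. load_progress and save_progress are hypothetical helper names, not part of the original code, and get_batch here is the version just above, which writes to the module-level file object f.

def load_progress(path='progress.txt'):
    # Return the previously saved count, or 0 on the first run.
    try:
        with open(path) as p:
            return int(p.read().strip())
    except (FileNotFoundError, ValueError):
        return 0

def save_progress(value, path='progress.txt'):
    # Persist the number of data lines written so far.
    with open(path, 'w') as p:
        p.write(str(value))

progress = load_progress()
try:
    with open('text.fasta', 'a') as f:
        for progress, total in get_batch(url, progress):
            save_progress(progress)
except KeyboardInterrupt:
    # On Ctrl-C, keep the count of the last fully written batch before exiting.
    save_progress(progress)
    raise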