My Python scraper can no longer save all of the data, even though it could still save everything this morning


import requests
from jsonpath import jsonpath
import time

import xlrd

import openpyxl
# from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor

wb = openpyxl.load_workbook('userdata2.xlsx')
wa = wb.active
# Write the header row (column titles stay in Chinese to match the sheet).
wa.cell(row=1, column=1, value='用户名')
wa.cell(row=1, column=2, value='ip')
wa.cell(row=1, column=3, value='用户圈子数')
wa.cell(row=1, column=4, value='用户关注')
wa.cell(row=1, column=5, value='用户粉丝')
wa.cell(row=1, column=6, value='用户等级')
wa.cell(row=1, column=7, value='用户牌子')
for n in range(10):
    wa.cell(row=1, column=3 * n + 8, value=f'用户发帖时间{n}')
    wa.cell(row=1, column=3 * n + 9, value=f'用户发帖内容{n}')
    wa.cell(row=1, column=3 * n + 10, value=f'用户帖子评论{n}')
def user(url_list, url_page, x):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    # proxies = {'HTTPS':'96.113.165.182:3128',
    #            'HTTP':'96.113.165.182:3128'}


    time.sleep(0.5)  # crude per-request throttle
    resp_list = requests.get(url=url_list, headers=header, timeout=5).json()
    resp_page = requests.get(url=url_page, headers=header, timeout=5).json()

    user_sum = jsonpath(resp_list, '$.entity.records.*') or []
    # jsonpath() returns False when nothing matches; guard before joining,
    # otherwise ''.join(False) raises TypeError and the thread dies silently.
    user_name = ''.join(jsonpath(resp_list, '$..0.author.name') or [])
    user_ip = ''.join(jsonpath(resp_page, '$.entity.ipRegion') or [])
    user_board = jsonpath(resp_page, '$.entity.boardCnt')[0]
    user_followee = jsonpath(resp_page, '$.entity.followeeCnt')[0]
    user_follower = jsonpath(resp_page, '$.entity.followerCnt')[0]
    user_level = jsonpath(resp_page, '$.entity.userGrowLevelInfo.level')[0]
    user_title = jsonpath(resp_page, '$.entity.userGrowLevelInfo.title')[0]
    dic_user = {
        'user_name': user_name,
        'user_ip': user_ip,
        'user_board': user_board,
        'user_followee': user_followee,
        'user_follower': user_follower,
        'user_level': user_level,
        'user_title': user_title
    }
    for n in range(len(user_sum)):
        post_summary = jsonpath(resp_list, f'$..{n}.summary')
        post_time_cc = jsonpath(resp_list, f'$..{n}.createTime')[0]
        post_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post_time_cc / 1000))
        post_comm_id = ''.join(jsonpath(resp_list, f'$.entity.records.{n}.id') or [])
        # Total comment count for this post (includes replies to comments).
        com_summ = jsonpath(resp_list, f'$.entity.records.{n}.commentCnt')[0]
        dic_comm = {}
        post_comm_comm = 0
        if com_summ != 0:
            # Fetch this post's comments once, instead of once per comment.
            url_comm = f'https://api.vip.miui.com/mtop/planet/vip/content/comments?ref=&pathname=/mio/detail&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=WgRuWRyOPot0i7YibHrmsw==&limit={com_summ}&postId={post_comm_id}&after=&sortType=1'
            resp_comm = requests.get(url=url_comm, headers=header, timeout=5).json()
            # Count the replies nested under each top-level comment. The
            # original path was missing its leading '$.', so it never matched.
            for comm_nb in range(com_summ):
                post_comm_reply_a = jsonpath(resp_comm, f'$.entity.records.{comm_nb}.count')
                if post_comm_reply_a:  # jsonpath() returns False on no match
                    post_comm_comm += post_comm_reply_a[0]
        # commentCnt includes replies, so subtract them to get the number
        # of top-level comment records.
        com_summ -= post_comm_comm
        for comm_nb_a in range(com_summ):
            # Reuse resp_comm fetched above; the original re-requested the
            # same URL on every single iteration, which hammered the API.
            post_comm_user = jsonpath(resp_comm, f'$.entity.records.{comm_nb_a}.author.name')
            post_comm_text = jsonpath(resp_comm, f'$.entity.records.{comm_nb_a}.text')
            dic_comm[f'post_comm_user{comm_nb_a}'] = post_comm_user
            dic_comm[f'post_comm_text{comm_nb_a}'] = post_comm_text
        dic_user.update({
            f'post_time{n}': post_time,
            f'post_summary{n}': str(post_summary),
            f'post_comm{n}': str(dic_comm),
        })

    print(dic_user['user_name'])
    # Write one row of values into the sheet (row index x comes from the caller).

    wa.cell(row=x + 1, column=1, value=dic_user['user_name'])
    wa.cell(row=x + 1, column=2, value=dic_user['user_ip'])
    wa.cell(row=x + 1, column=3, value=dic_user['user_board'])
    wa.cell(row=x + 1, column=4, value=dic_user['user_followee'])
    # Bug fix: column 5 previously wrote user_followee a second time.
    wa.cell(row=x + 1, column=5, value=dic_user['user_follower'])
    wa.cell(row=x + 1, column=6, value=dic_user['user_level'])
    wa.cell(row=x + 1, column=7, value=dic_user['user_title'])

    for n in range(len(user_sum)):
        wa.cell(row=x + 1, column=3 * n + 8, value=dic_user[f'post_time{n}'])
        wa.cell(row=x + 1, column=3 * n + 9, value=dic_user[f'post_summary{n}'])
        wa.cell(row=x + 1, column=3 * n + 10, value=dic_user[f'post_comm{n}'])
    print(dic_user['user_name'], 'over')


# Read the 100 uid values from column 12 (index 11) of user.xlsx.
# Note: xlrd 2.x dropped .xlsx support, so this line needs xlrd 1.2.0
# (or read this file with openpyxl as well).
fp = xlrd.open_workbook('user.xlsx')
book = fp.sheet_by_index(0)
intt = [int(v) for v in book.col_values(colx=11)[:100]]
url_list = 'https://api.vip.miui.com/api/community/user/announce/list?ref=&pathname=/mio/homePage&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=g/umA5p+n6Z2g+HrA2Mfkg==&uid={}&after=&limit=10'
url_page = 'https://api.vip.miui.com/api/community/user/home/page?ref=&pathname=/mio/homePage&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=V03pII8jNAVl8OF1z7yhOA==&uid={}&encodeUserId='
x = 1
if __name__ == "__main__":
    # Two caveats with this block: openpyxl is not thread-safe, so 50 threads
    # writing to the same worksheet can lose rows, and submit() swallows any
    # exception raised inside user() unless the futures are inspected.
    # See the sketches after the questions below for both fixes.
    with ThreadPoolExecutor(50) as t:
        for i in range(len(intt)):
            t.submit(user, url_list.format(intt[i]), url_page.format(intt[i]), x)
            x += 1
    wb.save('./userdata2.xlsx')

I am already using a proxy, so why do I still fail to save all of the data every time, when the very same script could read and save everything this morning?
Is it because the requests are too frequent and my IP keeps getting banned? (A retry sketch for that case follows.)
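
If the afternoon failures really are rate-limiting, retrying with backoff and failing loudly makes that visible. A minimal sketch, assuming the API answers rate-limited clients with HTTP 429/5xx (the actual ban behaviour of api.vip.miui.com is an assumption here); fetch_json is a hypothetical helper, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry transient failures with exponential backoff instead of dying silently.
retry = Retry(
    total=3,
    backoff_factor=2,  # waits 2s, 4s, 8s between attempts
    status_forcelist=[429, 500, 502, 503],  # assumed rate-limit/server errors
)
session.mount('https://', HTTPAdapter(max_retries=retry))

def fetch_json(url, headers):
    resp = session.get(url, headers=headers, timeout=5)
    resp.raise_for_status()  # surface bans/blocks as exceptions
    return resp.json()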
Or is something wrong with the thread pool?
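
More likely culprits than the proxy are two things in the script itself: submit() swallows any exception raised inside user() unless the returned futures are inspected, so a single timeout or TypeError (e.g. ''.join(False) when jsonpath finds nothing) silently kills that user's row, and openpyxl worksheets are not thread-safe, so 50 threads writing cells concurrently can corrupt or lose rows. A minimal sketch of both fixes, reusing the script's user, url_list, url_page, intt and wb names:

import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

sheet_lock = threading.Lock()  # serializes writes; openpyxl is not thread-safe

# Inside user(), wrap every block of wa.cell(...) writes like this:
#     with sheet_lock:
#         wa.cell(row=x + 1, column=1, value=dic_user['user_name'])
#         ...

if __name__ == "__main__":
    futures = {}
    with ThreadPoolExecutor(10) as t:  # fewer workers also means fewer bans
        for row, uid in enumerate(intt, start=1):
            futures[t.submit(user, url_list.format(uid), url_page.format(uid), row)] = uid
    # result() re-raises anything that happened inside the worker thread,
    # so failed users are reported instead of vanishing without a trace.
    for fut in as_completed(futures):
        try:
            fut.result()
        except Exception as e:
            print('uid', futures[fut], 'failed:', e)
    wb.save('./userdata2.xlsx')

With the exceptions visible, it becomes clear whether the afternoon failures are bans (HTTP errors, empty JSON) or crashes in the parsing code.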
