import requests
from jsonpath import jsonpath
import time
import xlrd
import openpyxl
# from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor
# Open the target workbook and write the header row into the active sheet.
wb = openpyxl.load_workbook('userdata2.xlsx')
wa = wb.active

# Fixed profile columns (1-7): name, ip, board count, followees, followers, level, title.
profile_headers = ['用户名', 'ip', '用户圈子数', '用户关注', '用户粉丝', '用户等级', '用户牌子']
for col, title in enumerate(profile_headers, start=1):
    wa.cell(row=1, column=col, value=title)

# Three columns per post (time / content / comments), for up to 10 posts, starting at column 8.
for n in range(10):
    base = 3 * n + 8
    wa.cell(row=1, column=base, value=f'用户发帖时间{n}')
    wa.cell(row=1, column=base + 1, value=f'用户发帖内容{n}')
    wa.cell(row=1, column=base + 2, value=f'用户帖子评论{n}')

# NOTE(review): this module-level dict looks unused — user() builds its own
# local dic_user; kept for behavioral parity.
dic_user = {}
def user(url_list, url_page, x):
    """Fetch one user's profile and recent posts from the MIUI community API
    and write them into row x+1 of the module-level worksheet ``wa``.

    Parameters:
        url_list: announce-list API URL for this uid (the user's posts).
        url_page: home-page API URL for this uid (the user's profile).
        x: 1-based index; data is written to spreadsheet row x+1.

    Side effects: writes cells into the shared worksheet ``wa`` and prints
    progress. NOTE(review): openpyxl worksheets are not documented as
    thread-safe, yet this runs from a 50-worker pool — confirm before scaling.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    # Throttle slightly to reduce the chance of the API rate-limiting the IP.
    time.sleep(0.5)
    resp_list = requests.get(url=url_list, headers=header, timeout=5).json()
    resp_page = requests.get(url=url_page, headers=header, timeout=5).json()

    # Profile fields. jsonpath() returns a list of matches (or False on no match).
    user_sum = jsonpath(resp_list, '$.entity.records.*')
    user_name = ''.join(jsonpath(resp_list, '$..0.author.name'))
    user_ip = ''.join(jsonpath(resp_page, "$.entity.ipRegion"))
    user_board = jsonpath(resp_page, '$.entity.boardCnt')[0]
    user_followee = jsonpath(resp_page, '$.entity.followeeCnt')[0]
    user_follower = jsonpath(resp_page, '$.entity.followerCnt')[0]
    user_level = jsonpath(resp_page, '$.entity.userGrowLevelInfo.level')[0]
    user_title = jsonpath(resp_page, '$.entity.userGrowLevelInfo.title')[0]
    dic_user = {
        'user_name': user_name,
        'user_ip': user_ip,
        'user_board': user_board,
        'user_followee': user_followee,
        'user_follower': user_follower,
        'user_level': user_level,
        'user_title': user_title
    }

    for n in range(len(user_sum)):
        post_summary = jsonpath(resp_list, f'$..{n}.summary')
        post_time_cc = jsonpath(resp_list, f'$..{n}.createTime')[0]
        # API timestamps are in milliseconds.
        post_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post_time_cc / 1000))
        post_comm_id = ''.join(jsonpath(resp_list, f'$.entity.records.{n}.id'))
        # Total comment count for this post (appears to include replies to comments).
        com_summ = jsonpath(resp_list, f'$.entity.records.{n}.commentCnt')[0]

        dic_comm = {}
        post_comm_comm = 0
        if com_summ != 0:
            url_comm = f'https://api.vip.miui.com/mtop/planet/vip/content/comments?ref=&pathname=/mio/detail&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=WgRuWRyOPot0i7YibHrmsw==&limit={com_summ}&postId={post_comm_id}&after=&sortType=1'
            resp_comm = requests.get(url=url_comm, headers=header).json()
            # Sum the per-comment reply counts.
            # BUGFIX: the query previously lacked the '$.' root prefix
            # ('entity.records...'), so it never matched and the bare
            # `except: pass` silently hid the failure.
            for comm_nb in range(com_summ):
                reply_cnt = jsonpath(resp_comm, f'$.entity.records.{comm_nb}.count')
                if reply_cnt:  # jsonpath returns False when nothing matches
                    post_comm_comm += reply_cnt[0]

        # Remaining count = top-level comments only (total minus replies).
        com_summ -= post_comm_comm
        if com_summ > 0:
            url_comm = f'https://api.vip.miui.com/mtop/planet/vip/content/comments?ref=&pathname=/mio/detail&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=WgRuWRyOPot0i7YibHrmsw==&limit={com_summ}&postId={post_comm_id}&after=&sortType=1'
            # BUGFIX/perf: this request was previously issued once per comment
            # inside the loop although the URL is loop-invariant; fetch once.
            resp_comm = requests.get(url=url_comm, headers=header).json()
            for comm_nb_a in range(com_summ):
                post_comm_user = jsonpath(resp_comm, f'$.entity.records.{comm_nb_a}.author.name')
                post_comm_text = jsonpath(resp_comm, f'$.entity.records.{comm_nb_a}.text')
                dic_comm[f'post_comm_user{comm_nb_a}'] = post_comm_user
                dic_comm[f'post_comm_text{comm_nb_a}'] = post_comm_text

        dic_user.update({
            f'post_time{n}': post_time,
            f'post_summary{n}': str(post_summary),
            f'post_comm{n}': str(dic_comm),
        })

    print(dic_user['user_name'])
    # Write the profile columns (1-7) for this user's row.
    wa.cell(row=x + 1, column=1, value=dic_user['user_name'])
    wa.cell(row=x + 1, column=2, value=dic_user['user_ip'])
    wa.cell(row=x + 1, column=3, value=dic_user['user_board'])
    wa.cell(row=x + 1, column=4, value=dic_user['user_followee'])
    # BUGFIX: column 5 (用户粉丝/followers) previously wrote user_followee again.
    wa.cell(row=x + 1, column=5, value=dic_user['user_follower'])
    wa.cell(row=x + 1, column=6, value=dic_user['user_level'])
    wa.cell(row=x + 1, column=7, value=dic_user['user_title'])
    # Three columns per post, matching the header layout (columns 8+).
    for n in range(len(user_sum)):
        wa.cell(row=x + 1, column=3 * n + 8, value=dic_user[f'post_time{n}'])
        wa.cell(row=x + 1, column=3 * n + 9, value=dic_user[f'post_summary{n}'])
        wa.cell(row=x + 1, column=3 * n + 10, value=dic_user[f'post_comm{n}'])
    print(dic_user['user_name'], 'over')
# Read the uid list (column index 11) from the source workbook.
fp = xlrd.open_workbook('user.xlsx')
book = fp.sheet_by_index(0)
# BUGFIX/perf: col_values() was previously called inside the loop, re-reading
# the entire column once per row (100x). Read it once; slicing to 100 also
# avoids an IndexError when the sheet has fewer than 100 rows.
uid_column = book.col_values(colx=11)
intt = [int(v) for v in uid_column[:100]]

# API URL templates; {} is filled with the uid per request.
url_list = 'https://api.vip.miui.com/api/community/user/announce/list?ref=&pathname=/mio/homePage&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=g/umA5p+n6Z2g+HrA2Mfkg==&uid={}&after=&limit=10'
url_page = 'https://api.vip.miui.com/api/community/user/home/page?ref=&pathname=/mio/homePage&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=V03pII8jNAVl8OF1z7yhOA==&uid={}&encodeUserId='
# Output row counter; user() writes row x+1, so rows start below the header.
x = 1
if __name__ == "__main__":
    # NOTE(review): 50 workers all write the shared openpyxl worksheet `wa`,
    # which is not documented as thread-safe — confirm or serialize the writes.
    futures = []
    with ThreadPoolExecutor(50) as t:
        for uid in intt:
            futures.append(t.submit(user, url_list.format(uid), url_page.format(uid), x))
            x += 1
    # BUGFIX: submit() stores worker exceptions in the Future; they were never
    # retrieved, so failed requests (timeouts, bans) silently dropped rows.
    # Surface them here so missing data is visible.
    for f in futures:
        try:
            f.result()
        except Exception as e:
            print('task failed:', e)
    # The `with` block waits for all workers, so every finished row is present
    # before saving.
    wb.save('./userdata2.xlsx')
我已经使用了代理,为什么每次还是无法保存全部数据,明明在上午的时候还是可以读取并保存全部数据。
是因为请求太频繁,ip不断被封吗?
还是因为线程池出问题了?
文件1(data.csv)是月份总表,只有开始时间和结束时间。多个文件(12021-11-15.csv、22021-11-15.csv、32021-11-15.csv...)是日期表,只有一天的开始时间、结束时间、对应时刻速度。
每个日期表中含多条跑道的数据,格式如下(日期格式有两种):
时间 | 1.data |
2021-11-15 14:28:28 | 2 |
2021-11-15 14:38:28 | 3 |
时间 | 2.date |
2021/11/15 14:28:28 | 2.5 |
2021/11/15 14:38:28 | 3.5 |
那么这十分钟内,跑道1的平均速度是2.5,跑道2的平均速度是3,总跑道的平均速度是2.75。