import requests
from jsonpath import jsonpath
import time
import xlrd
import openpyxl
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor
m=0
wb =openpyxl.load_workbook('userdata2.xlsx')
wa = wb.active
b1=wa.cell(row=1,column=1,value='用户名')
b2=wa.cell(row=1,column=2,value='ip')
b3=wa.cell(row=1,column=3,value='用户圈子数')
b4=wa.cell(row=1,column=4,value='用户关注')
b5=wa.cell(row=1,column=5,value='用户粉丝')
b6=wa.cell(row=1,column=6,value='用户等级')
b7=wa.cell(row=1,column=7,value='用户牌子')
for n in range(10):
cn = wa.cell(row=1, column=3*n+8, value=f'用户发帖时间{n}')
dn = wa.cell(row=1, column=3*n+9, value=f'用户发帖内容{n}')
en = wa.cell(row=1, column=3*n+10, value=f'用户帖子评论{n}')
x=0
dic_user={}
def user(url_list,url_page):
us =UserAgent()
header = {
'User-Agent': us.browsers[2],
'Cookie':''
}
proxies = {'HTTPS': '157.90.121.60:8080'}
# time.sleep(0.5)
resp_list = requests.get(url=url_list, headers=header,proxies=proxies,timeout=5).json()
resp_page = requests.get(url=url_page, headers=header,proxies=proxies,timeout=5).json()
user_sum = jsonpath(resp_list, '$.entity.records.*')
# self.user_sum = user_sum
user_name = ''.join(jsonpath(resp_list, '$..0.author.name'))
user_ip = ''.join(jsonpath(resp_page, "$.entity.ipRegion"))
user_board = jsonpath(resp_page, '$.entity.boardCnt')[0]
user_followee = jsonpath(resp_page, '$.entity.followeeCnt')[0]
user_follower = jsonpath(resp_page, '$.entity.followerCnt')[0]
user_level = jsonpath(resp_page, '$.entity.userGrowLevelInfo.level')[0]
user_title = jsonpath(resp_page, '$.entity.userGrowLevelInfo.title')[0]
dic_user = {
'user_name': user_name,
'user_ip': user_ip,
'user_board': user_board,
'user_followee': user_followee,
'user_follower': user_follower,
'user_level': user_level,
'user_title': user_title
}
# print(dic_user['user_name'])
for n in range(len(user_sum)):
# post_title = jsonpath(resp_list, f'$..{n}.title')
post_summary = jsonpath(resp_list, f'$..{n}.summary')
post_time_cc = jsonpath(resp_list, f'$..{n}.createTime')[0]
post_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post_time_cc / 1000))
post_comm_id = ''.join(jsonpath(resp_list, f'$.entity.records.{n}.id'))
# 每个帖子评论数量
com_summ = jsonpath(resp_list, f'$.entity.records.{n}.commentCnt')[0]
# post_comm_time_sum =com_summ
dic_comm = {}
if True:
post_comm_comm = 0
if com_summ == 0:
pass
else:
# 单个帖子评论数量
url_comm = f'https://api.vip.miui.com/mtop/planet/vip/content/comments?ref=&pathname=/mio/detail&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=WgRuWRyOPot0i7YibHrmsw==&limit={com_summ}&postId={post_comm_id}&after=&sortType=1'
resp_comm = requests.get(url=url_comm, headers=header).json()
for comm_nb in range(com_summ):
post_comm_reply_a = jsonpath(resp_comm, f'entity.records.{comm_nb}.count')
try:
if post_comm_reply_a != None:
post_comm_reply_a = post_comm_reply_a[0]
# post_comm_comm +=post_comm_reply_a
post_comm_comm += post_comm_reply_a
except:
pass
com_summ -= post_comm_comm
for comm_nb_a in range(com_summ):
url_comm = f'https://api.vip.miui.com/mtop/planet/vip/content/comments?ref=&pathname=/mio/detail&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=WgRuWRyOPot0i7YibHrmsw==&limit={com_summ}&postId={post_comm_id}&after=&sortType=1'
resp_comm = requests.get(url=url_comm, headers=header).json()
dic_comm_b = {}
post_comm_user = jsonpath(resp_comm, f'$.entity.records.{comm_nb_a}.author.name')
post_comm_text = jsonpath(resp_comm, f'$.entity.records.{comm_nb_a}.text')
# post_comm_time_sum = post_comm_time_sum -post_comm_comm
# for time_nb in range(post_comm_time_sum):
# post_comm_time_cc = jsonpath(resp_comm,f'$.entity.records.{time_nb}.time')[0]
# post_comm_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post_comm_time_cc / 1000))
# print(post_comm_time)
dic_com = {
f'post_comm_user{comm_nb_a}': post_comm_user,
f'post_comm_text{comm_nb_a}': post_comm_text,
# f'post_comm_time{comm_nb}':post_comm_time
}
dic_comm_b.update(dic_com)
# print(dic_comm)
dic_comm.update(dic_comm_b)
dic = {
f'post_time{n}': post_time,
f'post_summary{n}': str(post_summary),
f'post_comm{n}': str(dic_comm)
}
dic_user.update(dic)
dic_user.update(dic_user)
# self.dic_user= dic_user
# print(dic_user['user_name'],'爬取结束')
# 给字段中加值,考虑循环使用
q1 = wa.cell(row=x+1, column=m+1, value=dic_user['user_name'])
q2 = wa.cell(row=x+1, column=m+2, value=dic_user['user_ip'])
q3 = wa.cell(row=x+1, column=3+m, value=dic_user['user_board'])
q4 = wa.cell(row=x+1, column=4+m, value=dic_user['user_followee'])
q5 = wa.cell(row=x+1, column=5+m, value=dic_user['user_followee'])
q6 = wa.cell(row=x+1, column=6+m, value=dic_user['user_level'])
q7 = wa.cell(row=x+1, column=7+m, value=dic_user['user_title'])
for n in range(len(user_sum)):
wn = wa.cell(row=x+ 1, column=m+3 * n + 8, value=dic_user[f'post_time{n}'])
rn = wa.cell(row=x+ 1, column=m+3 * n + 9, value=dic_user[f'post_summary{n}'])
vn = wa.cell(row=x+ 1, column=m+3 * n + 10, value=dic_user[f'post_comm{n}'])
print(dic_user)
fp = xlrd.open_workbook('user.xlsx')
book = fp.sheet_by_index(0)
intt = []
for j in range(1,3):
intt.append(int(book.col_values(colx=3)[j]))
url_list = 'https://api.vip.miui.com/api/community/user/announce/list?ref=&pathname=/mio/homePage&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=g/umA5p+n6Z2g+HrA2Mfkg==&uid={}&after=&limit=10'
url_page = 'https://api.vip.miui.com/api/community/user/home/page?ref=&pathname=/mio/homePage&version=dev.230112&oaid=false&device=&restrict_imei=&miui_big_version=&model=&miuiBigVersion=&miui_vip_ph=V03pII8jNAVl8OF1z7yhOA==&uid={}&encodeUserId='
if __name__ =="__main__":
with ThreadPoolExecutor(2)as t:
for i in range(len(intt)):
t.submit(user,url_list.format(intt[i]),url_page.format(intt[i]))
x += 1
m += 17
wb.save('./userdata2.xlsx')
刚开始学,代码写的有点烂请见谅
我想请问,为什么使用线程池下载保存的数据只能保存最后一条,是我写的哪里有错误吗?
spring boot 2
@Bean
public MappingJackson2HttpMessageConverter jackson2HttpMessageConverter() {
MappingJackson2HttpMessageConverter converter = new MappingJackson2HttpMessageConverter();
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
mapper.setDateFormat(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"));
converter.setObjectMapper(mapper);
return converter;
}
@Override
public void configureMessageConverters(List<HttpMessageConverter<?>> converters) {
//将我们定义的时间格式转换器添加到转换器列表中,
//这样jackson格式化时候但凡遇到Date类型就会转换成我们定义的格式
converters.add(jackson2HttpMessageConverter());
}