需求是:用 requests 请求一个链接,抓取其中的某类数据(可能只有一个,也可能有成百上千个);再用这些数据构造新的链接,新链接返回的结果是 JSON,最后把结果保存下来。
现在的情况是:请求 100 个左右的新链接,耗时达到 7 秒多(Linux 下 3 秒多);500 个的话就接近 1 分钟了。
下边是我的代码
import requests, json, time, re
import pandas as pd
import aiohttp
import asyncio
from aiomultiprocess import Pool
# Maps the vehicle "use nature" code found in the JSON payload to its
# human-readable label.
useNatureCode_list = {
    "82": "非营业货运",
    "83": "非营业企业",
    "84": "非营业机关",
    "85": "非营业个人",
    "9A": "营业出租租赁",
    "9B": "营业城市公交",
    "9C": "营业公路客运",
    "9D": "营业货运",
}

# Maps the vehicle type code found in the JSON payload to its label.
catType = {
    "A": "客车",
    "G": "挂车",
    "H": "货车",
}

# Column titles of the exported sheet; rows are keyed by these strings,
# addressed by position (head[0] .. head[25]).
head = [
    '归属协议号',
    '保单号',
    '投保单号',
    '业务归属员',
    '代理人',
    '出单日期',
    '险种始期',
    '险种止期',
    '车牌号',
    '车架号',
    '使用性质',
    '品牌型号',
    '初登日期',
    '发动机号',
    '车辆种类',
    '座位',
    '姓名',
    '身份证',
    '手机号',
    '住址',
    '总保费',
    '总净保费',
    '总税额',
    '总折扣',
    '车船税',
    '险种',
]
# Headers for the form-encoded list query.
# NOTE(review): the Cookie value embeds live session identifiers; consider
# loading it from configuration instead of keeping it in the source.
headers = {
    'Cookie': 'BIGipServercar_core_pool=100797100.17951.0000; BIGipServercar_qcar_poool=1191316140.16671.0000; chinainsuranceJSESSIONID=ZzYGfgmQmwDm9XYXJTq6jNhPk4GySWmTBtgp72BcbhlZQ8VpZpjp!1474618526; QCARJSESSIONID=1RmQfhwFpWz2L238Qkt4nnpC7ZzHKGTQT5PJnpcJv5dv2XzJ12mF!1323047638',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
}
url = 'https://xxx.do'
# Pre-encoded form body. The second field must read '&currentPage=1'; the
# pasted text had it mangled into a currency sign because '&curren' was
# decoded as an HTML entity.
data = 'pageNumber=100&currentPage=1&preProposalNo=&operatorCode=&licenseNo=&frameNo=&insuredName=&operateStartDate=2020-06-26&operateEndDate=2020-07-26&startStartDate=&startEndDate=&queryType=query&underwriteflag=6&validStatus=1&appliName=&proposalNo=&policyNo=&engineNo=&preProposalNoSign=%3D&operatorCodeSign=%3D&appliNameSign=*&insuredNameSign=*&operateDateSign=%3A&startDateSign=%3A&licenseNoSign=*&frameNoSign=*&proposalNoSign=%3D&policyNoSign=%3D&engineNoSign=*&partnerNameSign=*&partnerName=&printflag=&comCode=++&batchNo=++&riskCode=++&riskCodeSign=%3D++'

# The context manager closes the session even when the POST raises.
with requests.Session() as session:
    resp = session.post(url=url, headers=headers, data=data).text

# Every distinct 27-character run of 'Z'/digits in the response is treated
# as one proposal number; set() removes duplicates (order is not preserved).
div = list(set(re.findall('[Z0-9]{27}', resp)))
length = len(div)
print(length)
# Headers sent with every per-proposal GET request (no Content-Type, since
# those requests carry no body).
headers2 = {
    'Cookie': (
        'BIGipServercar_core_pool=100797100.17951.0000; '
        'BIGipServercar_qcar_poool=1191316140.16671.0000; '
        'chinainsuranceJSESSIONID=ZzYGfgmQmwDm9XYXJTq6jNhPk4GySWmTBtgp72BcbhlZQ8VpZpjp!1474618526; '
        'QCARJSESSIONID=1RmQfhwFpWz2L238Qkt4nnpC7ZzHKGTQT5PJnpcJv5dv2XzJ12mF!1323047638'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.132 Safari/537.36'
    ),
}

# Module-level accumulators: two independent, initially empty lists and a
# counter starting at 1.
datas, datas2 = [], []
Count = 1
async def get_div():
    """Remove and return the last element of the module-level ``div`` list.

    Returns:
        The popped element. If the pop fails for any reason (for example
        the list is already empty), the caught exception *instance* is
        returned instead of being raised.
    """
    try:
        proposal_no = div.pop()
    except Exception as error:
        return error
    else:
        return proposal_no
async def get(request_ulr, session=None):
    """Send a GET request with ``headers2`` and return the body as text.

    Args:
        request_ulr: URL to fetch.
        session: optional, already-open ``aiohttp.ClientSession`` to reuse.
            When omitted, a throw-away session with certificate
            verification disabled is created for this single call and
            closed again afterwards.

    Returns:
        The response body decoded by ``response.text()``.
    """
    # NOTE (from the author): profiling showed this function takes the
    # longest, and that it gets slower and slower over a run.
    if session is not None:
        async with session.get(url=request_ulr, headers=headers2) as response:
            return await response.text()

    # ``ssl=False`` replaces the deprecated ``verify_ssl=False`` spelling.
    connector = aiohttp.TCPConnector(ssl=False)
    # ``async with`` already closes the session on exit, so no explicit
    # ``close()`` call is needed.
    async with aiohttp.ClientSession(connector=connector) as own_session:
        async with own_session.get(url=request_ulr, headers=headers2) as response:
            return await response.text()
async def reques():
    """Fetch the detail document for the next proposal number.

    Obtains one value from ``get_div()``, embeds it in the detail URL and
    returns whatever ``get()`` returns for that URL.
    """
    proposal_no = await get_div()
    return await get(f'https://xxx?data={proposal_no}')
async def json_loads():
    """Await ``reques()`` and return its result parsed with ``json.loads``."""
    body = await reques()
    return json.loads(body)
async def get_data(datas, path=r'C:\Users\Administrator\Desktop\my.xls'):
    """Write the collected rows to an Excel workbook.

    Args:
        datas: the rows to export; handed unchanged to ``pandas.DataFrame``.
        path: destination file. Defaults to the location that used to be
            hard-coded, so existing callers behave exactly as before.

    Note:
        ``DataFrame.to_excel`` is an ordinary synchronous call, so nothing
        else on the event loop runs while the file is being written.
    """
    pd.DataFrame(datas).to_excel(path)
# Keys of result['base'] that describe one cover, always in the order:
# policy no, proposal no, start date, end date, premium, net premium,
# VAT amount, discount.
_COMPULSORY_FIELDS = (
    'compulsoryPolicyNo', 'compulsoryProposalNo',
    'startDateCI', 'endDateCI',
    'compulsoryPremium', 'compulsoryTniPremium',
    'compulsoryVatPremium', 'discountCI',
)
_COMMERCIAL_FIELDS = (
    'commercialPolicyNo', 'commercialProposalNo',
    'startDate', 'endDate',
    'commercialPremium', 'commercialTniPremium',
    'commercialVatPremium', 'discountBI',
)


def _shared_fields(result):
    """Collect the values that are identical for every row of one policy."""
    base = result['base']
    car = result['car']
    person = result['persons'][0]
    return {
        'comCode': base['comCode'],
        'handler': base['handler'],
        'agencyName': result['agentComm']['agencyName'],
        'inputDate': base['inputDate'],
        'vinNo': car['vinNo'],
        # Codes are translated to labels; an unknown code raises KeyError.
        'useNatureCode': useNatureCode_list[car['useNatureCode']],
        'modelName': car['modelName'],
        'enrollDate': car['enrollDate'],
        'engineNo': car['engineNo'],
        'carType1': catType[car['carType1']],
        'seatCount': car['seatCount'],
        'appliName': person['insuredName'],
        'identifyNumber': person['identifyNumber'],
        'mobile': person['mobile'],
        # Optional values: None when the key is absent.
        'licenseNo': car.get('licenseNo'),
        'tax': result['tax'].get('tax'),
        'insuredAddress': person.get('insuredAddress'),
    }


def _policy_row(shared, base, field_names, risk_label):
    """Build one export row (keyed by ``head``) for a single cover."""
    (policy_no, proposal_no, start, end,
     premium, net_premium, vat, discount) = (base[name] for name in field_names)
    values = (
        shared['comCode'], policy_no, proposal_no,
        shared['handler'], shared['agencyName'], shared['inputDate'],
        start, end,
        shared['licenseNo'], shared['vinNo'], shared['useNatureCode'],
        shared['modelName'], shared['enrollDate'], shared['engineNo'],
        shared['carType1'], shared['seatCount'], shared['appliName'],
        shared['identifyNumber'], shared['mobile'], shared['insuredAddress'],
        premium, net_premium, vat, discount,
        shared['tax'], risk_label,
    )
    # Values are listed in column order, so pairing them with ``head``
    # yields the keys head[0] .. head[25].
    return dict(zip(head, values))


async def parse():
    """Fetch one policy document and append its row(s) to ``datas``.

    The document comes from ``json_loads()``. Depending on which policy
    numbers ``result['base']`` contains, this appends:

    * both 'compulsoryPolicyNo' and 'commercialPolicyNo': a compulsory row
      followed by a commercial row;
    * only one of them: a single row for that cover;
    * neither: nothing, and ``get_data`` is not called.

    Whenever rows were appended, ``get_data(datas)`` is awaited afterwards,
    i.e. the complete ``datas`` list is exported again after every policy.

    Raises:
        KeyError: if a mandatory key is missing from the document or a
            code is absent from ``useNatureCode_list`` / ``catType``.
    """
    result = await json_loads()
    shared = _shared_fields(result)
    base = result['base']

    has_compulsory = 'compulsoryPolicyNo' in base
    has_commercial = 'commercialPolicyNo' in base

    # The last column carries a placeholder label; the exact spellings
    # (including the upper-case variant for compulsory-only policies) are
    # kept as they were.
    if has_compulsory and has_commercial:
        rows = [
            _policy_row(shared, base, _COMPULSORY_FIELDS, 'xxx'),
            _policy_row(shared, base, _COMMERCIAL_FIELDS, 'xxx'),
        ]
    elif has_compulsory:
        rows = [_policy_row(shared, base, _COMPULSORY_FIELDS, 'XXX')]
    elif has_commercial:
        rows = [_policy_row(shared, base, _COMMERCIAL_FIELDS, 'xxx')]
    else:
        return

    datas.extend(rows)
    await get_data(datas)
async def save():
    """Run the pipeline for one proposal number by awaiting ``parse()``.

    Returns nothing; all results are side effects of ``parse()``.
    """
    await parse()
async def _run_all(count):
    """Run ``count`` ``save()`` pipelines concurrently.

    Returns:
        A list with one entry per pipeline: its return value, or the
        exception it raised (collected instead of propagated, so one
        failing request does not abort the others). Empty when ``count``
        is 0.
    """
    if count == 0:
        return []
    return await asyncio.gather(
        *(save() for _ in range(count)),
        return_exceptions=True,
    )


if __name__ == '__main__':
    start = time.time()
    # An explicitly created loop avoids relying on the implicit "current
    # loop", which is deprecated outside of a running coroutine.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # One pipeline per proposal number found by the list query.
        outcomes = loop.run_until_complete(_run_all(len(div)))
    finally:
        loop.close()
    # Report failures instead of letting them vanish inside the tasks.
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            print('Task failed:', repr(outcome))
    end2 = time.time()
    print('Cost time2:', end2 - start)
下边是get()函数100次请求的时间
谢谢大佬们了
https://blog.csdn.net/someby/article/details/105056151
问题在于异步编程的方式不正确。asyncio 有一个要求:如果只是在主线程内运行协程,应该先在一个方法中把所有协程任务加入计划,然后再启动 event loop。就你的代码而言,应该在 save() 方法或 main 代码块中创建异步任务,再用 gather() 方法收集所有任务的 future 对象,然后并发执行。也就是说,你的代码实际上并没有并发执行,速度当然快不起来。