# -*- coding: utf-8 -*-
"""CNKI (中国知网) patent crawler.

For every (month, navigation-code) pair this script registers a search on the
CNKI SCPD (Chinese patent) database, walks the paginated result list, fetches
each patent's detail page and prints the extracted metadata fields.  The
MySQL persistence code is kept commented out, exactly as in the original.
"""
import requests
import re
import time
import random
import pymysql  # only needed once the commented-out MySQL persistence below is enabled
from lxml import etree
from fake_useragent import UserAgent
from multiprocessing import Pool  # kept from the original file (parallel-crawl hook, currently unused)
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# The CNKI endpoints trip urllib3 certificate-verification warnings; silence them.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

SEARCH_URL = 'https://epub.cnki.net/kns/request/SearchHandler.ashx'
BRIEF_URL = 'https://epub.cnki.net/kns/brief/brief.aspx'
DETAIL_URL_PREFIX = 'https://kns.cnki.net/kcms/detail/detail.aspx?'

# CNKI industry navigation codes; each entry is one comma-joined NaviCode filter
# and produces one search per month.
navicode_list = [
    'A001,A002,A003,A004,A005,A006,A007', 'A008,A009,A010,A011,A012,A013',
    'F,G,H,J', 'B014_1,B014_2',
    'B014_31,B014_32,B014_33,B014_34,B014_35,B014_36,B014_37',
    'B014_38,B014_39,B014_3A', 'B014_4',
    'B014_5,B014_6,B014_7,B014_8',
    'B015_1,B015_3,B015_4,B015_5,B015_6,B015_7,B015_8',
    'B015_2', 'B016_11', 'B016_12',
    'B016_3,B016_5,B016_6,B016_7,B016_8', 'B016_4',
    'B016_21,B016_22,B016_23,B016_24,B016_26,B016_27,B016_28,B016_29',
    'B016_25', 'B017,B018,B019', 'B020_1',
    'B020_2,B020_3,B020_4,B020_5,B020_6,B020_7,B020_8,B020_9,B020_A,B020_B,B020_C',
    'B021,B023,B025', 'B022_1,B022_2,B022_3,B022_4,B022_5',
    'B022_6,B022_7', 'B022_8,B022_B,B022_C', 'B022_9',
    'B022_A', 'B024_3', 'B024_7',
    'B024_1,B024_2,B024_4,B024_5,B024_6', 'B024_A',
    'B024_B,B024_E', 'B024_C', 'B024_D', 'B024_8,B024_9', 'B026',
    'B027_1,B027_2,B027_3,B027_4', 'B027_5,B027_6',
    'C028_1,C028_2,C028_4,C028_5,C028_6,C028_7,C028_8', 'C028_9', 'C028_38',
    'C028_31,C028_32,C028_33,C028_34,C028_35',
    'C028_36,C028_37,C028_39,C028_3A,C028_3B,C028_3C',
    'C029_1,C029_2,C029_3,C029_4,C029_6,C029_7',
    'C029_8,C029_9,C029_B,C029_C,C029_D',
    'C029_51,C029_52,C029_53,C029_54,C029_55',
    'C029_56,C029_57,C029_58,C029_59',
    'C029_A1,C029_A3', 'C029_A2',
    'C030_1,C030_2,C030_3,C030_4,C030_5',
    'C030_6,C030_7,C030_8,C030_9,C030_A',
    'C030_B,C030_C,C030_D,C030_E,C030_F,C030_G,C030_H,C030_I',
    'C031,C032,C033,C034',
    'C035_1,C035_2,C035_3,C035_4,C035_5,C035_6,C035_7,C035_8',
    'C035_9,C035_A,C035_B,C035_C,C035_D,C035_E',
    'C036,C037,C040,C041',
    'C038_1,C038_21,C038_22,C038_23,C038_24,C038_3',
    'C038_25,C038_26,C038_27,C038_28,C038_29',
    'C039', 'C042',
    'D043,D044,D045,D046,D047', 'D044,D045,D046,D047',
    'I135_1,I135_2,I135_3,I135_4,I135_7,I135_8', 'I135_6',
    'I135_522,I135_523,I135_524', 'I135_521',
    'I138_1,I138_2,I138_3,I138_4,I138_5,I138_6,I138_7,I138_8,I138_9,I138_A,I138_B',
    'I138_C12,I138_C13,I138_C14,I138_C2', 'I138_C11',
    'I136_87,I136_88', 'I136_84,I136_85,I136_86',
    'I136_81,I136_82,I136_83',
    'I137_3,I137_4,I137_5', 'I137_1,I137_2',
    'I139,I140,I141,I142,I143,I144',
    'I136_1,I136_2,I136_3,I136_4,I136_5,I136_6,I136_7',
    'I136_9,I136_A,I136_B,I136_C,I136_D,I136_E,I136_F,I136_G',
]

# Publication months to crawl, format 'YYYY-MM'.  The original kept every
# month from '2014-01' through '2021-04' commented out here — re-add entries
# to widen the crawl window.
time_list = [
    '2020-04',
]

# db = pymysql.connect(host='', user='root', password='', database='industrynav')
# cursor = db.cursor()

# Browser-style timestamp the search endpoint expects in its '__' parameter,
# e.g. 'Mon Apr 05 2021 12:00:00 GMT+0800 (中国标准时间)'.
times = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)'

# XPath to the "浏览 <current>/<total>" span on the result-list page.
PAGE_INFO_XPATH = '//*[@id="J_ORDER"]/tr[2]/td/table/tr/td[2]/div/span[1]'
MAX_PAGE_RE = re.compile(r'浏览.*/(.*)')    # total page count
NOW_PAGE_RE = re.compile(r'浏览(.*?)/')     # current page number
DETAIL_HREF_RE = re.compile(r"<a class='fz14' href='/kns/detail/detail.aspx(.*?)'")


def build_search_params(navicode, month):
    """Build the query tuple that registers an SCPD search on the server.

    :param navicode: comma-joined NaviCode category filter (筛选的类别)
    :param month: publication-date filter, 'YYYY-MM' (筛选日期)
    """
    return (
        ('action', ''),
        ('NaviCode', navicode),
        ('ua', '1.21'),
        ('PageName', 'ASP.brief_result_aspx'),
        ('DbPrefix', 'SCPD'),
        ('DbCatalog', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('ConfigFile', 'SCPD.xml'),
        ('db_opt', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('db_value', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('date_gkr_from', month),
        ('date_gkr_to', month),
        ('his', '0'),
        ('__', times),
    )


def _brief_params(curpage):
    """Query tuple for one result-list page; ``curpage`` may be '' (probe) or an int."""
    return (
        ('curpage', curpage),
        ('RecordsPerPage', '50'),
        ('QueryID', '20'),
        ('ID', ''),
        ('turnpage', '1'),
        ('tpagemode', 'L'),
        ('dbPrefix', 'SCPD'),
        ('Fields', ''),
        ('DisplayMode', 'listmode'),
        ('SortType', "(公开日, 'DATE')desc"),
        ('PageName', 'ASP.brief_result_aspx'),
    )


def get_cookie(headers, params1):
    """Register the search and return a Session carrying the result cookie.

    Returns None when the handshake request fails (the original silently
    returned None via a bare except; here the failure is explicit).
    """
    session = requests.session()
    try:
        session.get(SEARCH_URL, headers=headers, params=params1)
        return session
    except requests.RequestException:
        return None


def max11(headers, params1):
    """Return the total number of result pages for the current search.

    Returns 0 on any failure so callers can safely do ``range(1, max1 + 1)``
    (the original returned None here, which crashed the page loop).
    """
    session = get_cookie(headers, params1)
    if session is None:
        return 0
    try:
        response = session.get(BRIEF_URL, headers=headers, params=_brief_params(''))
        selector = etree.HTML(response.text)
        page_info = selector.xpath(PAGE_INFO_XPATH)[0].text
        return int(MAX_PAGE_RE.findall(page_info)[0])
    except (requests.RequestException, IndexError, TypeError, ValueError):
        return 0


def parse_detail(html_text):
    """Extract patent metadata from a CNKI detail page.

    Reproduces the original normalisation: split on spaces, stringify the
    resulting list, then strip spaces/commas/quotes — which is why the tag
    markers below look fused ('<pclass=', literal '\\r\\n').

    Returns a 15-tuple (title, leixing, gb_id, gb_time, sq_id, sq_time,
    sq_person, addr, cl, flh, gsdm, abstracts, patent_agency, agent, fmr);
    missing fields come back as empty strings instead of raising IndexError.
    """
    a = str(str(html_text).split(' ')).replace(' ', '').replace(',', '') \
        .replace('\r\n', '').replace('\'', '')
    b = a.replace('</span><pclass="funds">', '') \
         .replace('</h5>\\r\\n<divclass="abstract-text">', '')

    def first(pattern, flags=0):
        # ''.join(findall(...)) == '' when the field is absent.
        return ''.join(re.findall(pattern, b, flags))

    title = first(r'<title>(.*?)-中国知网')          # 专利名称
    leixing = first(r'>专利类型:(.*?)<')             # 类型
    sqgb = first(r'>申请公布号:(.*?)<')              # 申请公布
    gb_id = sqgb if sqgb else first(r'授权公布号:(.*?)<')
    gkr = first(r'>公开公告日:(.*?)<')               # 公开日
    gb_time = gkr if gkr else first(r'授权公告日:(.*?)<')
    sq_id = first(r'>申请\(专利\)号:(.*?)<')         # 申请号
    sq_time = first(r'>申请日:(.*?)<')               # 申请日
    sqr = first(r'申请人:.*?">(.*?)<')               # 申请人
    sq_person = sqr if sqr else first(r'申请人:(.*?)</p>')
    addr = first(r'>地址:(.*?)<')                    # 地址
    cl = first(r'>主分类号:(.*?)<')                  # 主分类号
    flh = first(r'>分类号:(.*?)<')                   # 分类号
    gsdm = first(r'>国省代码:(.*?)<')                # 国省代码
    abstracts = first(r'>摘要:(.*?)<')               # 摘要
    the_inventor = first(r'>发明人:(.*?)<', re.S)    # 发明人
    # Fallback: inventor names sometimes appear only as 'au<名字><digits>' tokens.
    fmr = the_inventor if the_inventor else first(r'au([\u4e00-\u9fa5a-zA-Z. ]+)\d+', re.S)
    patent_agency = first(r'>代理机构:(.*?)<')       # 专利代理机构
    agent = first(r'">代理人:(.*?)<')                # 代理人
    return (title, leixing, gb_id, gb_time, sq_id, sq_time, sq_person, addr,
            cl, flh, gsdm, abstracts, patent_agency, agent, fmr)


def get_list_info(headers, params1, navicode, month):
    """Crawl every result page of one search and print each patent's fields."""
    max1 = max11(headers, params1)
    for i in range(1, max1 + 1):
        session = get_cookie(headers, params1)
        if session is None:
            continue  # handshake failed; skip this page rather than crash
        try:
            response = session.get(BRIEF_URL, headers=headers, params=_brief_params(i))
        except requests.RequestException:
            continue
        # Extract detail-page links BEFORE the progress parse so a parse
        # failure no longer leaves urls_info undefined (original NameError).
        urls_info = DETAIL_HREF_RE.findall(response.text)
        try:
            selector = etree.HTML(response.text)
            page_info = selector.xpath(PAGE_INFO_XPATH)[0].text
            print(page_info)
            now_page = int(NOW_PAGE_RE.findall(page_info)[0])
            print("当前获取第{}页数据".format(now_page), "数目", len(urls_info))
            print(navicode, month)
        except (IndexError, TypeError, ValueError):
            pass  # progress info is cosmetic; still process the links below
        for url in urls_info:
            detail_url = DETAIL_URL_PREFIX + url  # 详情页地址
            print(detail_url)
            try:
                response = requests.get(url=detail_url, headers=headers)
            except requests.RequestException:
                time.sleep(5)  # brief back-off, then one retry
                try:
                    response = requests.get(url=detail_url, headers=headers)
                except requests.RequestException:
                    continue  # original crashed here on a second failure
            record = parse_detail(response.text)
            print(*record)
            # NOTE(review): if the persistence below is re-enabled, use a
            # parameterized query (cursor.execute(sql, record)) — the original
            # str.format() SQL is injectable by crafted page content.
            # sql = "insert into patent(title, leixing, gb_id, gb_time, sq_id," \
            #       " sq_time, sq_person, addr, cl, flh, gsdm, abstracts," \
            #       " patent_agency, agent, fmr) values " \
            #       "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            # cursor.execute(sql, record)
            # db.commit()


def main():
    """Crawl every month × navigation-code combination."""
    ua = UserAgent().random  # one random User-Agent for the whole run
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': ua,
        'Referer': 'https://epub.cnki.net/kns/brief/result.aspx?dbprefix=SCPD',
    }
    for time1 in time_list:
        for LB in navicode_list:
            print(time1, LB)
            params1 = build_search_params(LB, time1)
            get_list_info(headers, params1, LB, time1)
    # db.close()


if __name__ == '__main__':
    main()
多线程的思路是把同一个任务拆分给不同的线程去执行。比如一个任务要循环10次,开5条线程,每条线程各循环2次——只要把任务细分好就可以了。
您好,我是有问必答小助手,您的问题已经有小伙伴解答了,您看下是否解决,可以追评进行沟通哦~
如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~
ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>>https://vip.csdn.net/askvip?utm_source=1146287632