import csv
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
#import pandas as pd
#from xlwt import *
#import time
# Shared output file and CSV writer used by the scraper function below.
# newline='' lets the csv module control line endings itself.
f = open("data.csv", mode="w", encoding="utf-8", newline='')
csvwriter = csv.writer(f)

# Header row: project name, date, URL.
header_row = ["项目名称", "日期", "网址"]
csvwriter.writerow(header_row)
# Scraper: fetch one listing page and append its rows to the shared CSV.
# The function is submitted to a thread pool and every call writes through the
# same module-level csvwriter, so writes are serialized with this lock to keep
# the rows of different pages from interleaving.
_csv_lock = threading.Lock()


def download_wuhu_page(url, timeout=30):
    """Download one listing page and append its entries to the CSV.

    Every ``//table/tbody`` element of the page is scanned. The link texts of
    the second cells, the texts of the third cells and the link targets of the
    second cells are paired up by position and written as
    (name, date, absolute URL) rows through the module-level ``csvwriter``.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread forever.

    Returns:
        None. A failed request is reported on stdout and the page is skipped.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure here: the caller never inspects the future, so an
        # exception escaping this function would disappear silently.
        print(url, "下载失败:", exc)
        return

    html = etree.HTML(resp.text)
    site_root = "http://whsggzy.wuhu.gov.cn"
    rows = []
    for tbody in html.xpath("//table/tbody"):
        names = tbody.xpath(".//td[2]/a/text()")
        dates = tbody.xpath(".//td[3]/text()")
        links = tbody.xpath(".//td[2]/a/@href")
        # Positional pairing, truncated to the shortest list; this matches the
        # original i == j == k index loops without the cubic scan.
        # NOTE(review): assumes every table row yields exactly one name, date
        # and link; a row missing one of them would shift the pairing.
        rows.extend(
            (name, date, site_root + link)
            for name, date, link in zip(names, dates, links)
        )

    # One locked write per page keeps a page's rows contiguous in the file.
    with _csv_lock:
        csvwriter.writerows(rows)
    print(url, "提取完毕")
# Entry point: scrape every listing page, then convert the CSV to Excel.
if __name__ == '__main__':
    # pandas is only needed for the final CSV -> Excel conversion.
    from pandas.io.excel import ExcelWriter
    import pandas

    # Sub-category codes of each section; sections without an entry of their
    # own ('08', '09', '11', '19') use the default list.
    default_codes = ['01', '02', '03', '04']
    codes_by_section = {
        '01': ['01', '02', '03', '04', '05'],
        '02': ['01', '03', '04', '05', '08', '16'],
        '07': ['01', '02'],
    }

    with ThreadPoolExecutor(100) as t:
        for x in ('01', '02', '07', '08', '09', '11', '19'):
            for z1 in codes_by_section.get(x, default_codes):
                folder = f"http://whsggzy.wuhu.gov.cn/jyxx/0050{x}/0050{x}0{z1}"
                for page in range(1, 3):
                    # The first page has a fixed file name; later pages are
                    # named after their page number.
                    page_file = "moreinfo_listjy.html" if page == 1 else f"{page}.html"
                    t.submit(download_wuhu_page, f"{folder}/{page_file}")

    # Leaving the with-block waits for every submitted task. Close the CSV now
    # so buffered rows are flushed to disk before pandas reads the file;
    # otherwise the Excel output can be missing data.
    f.close()
    print('全部下载完毕。')

    csv_file = 'data.csv'
    with ExcelWriter(r'C:\Users\Administrator\Desktop\测试.xlsx') as ew:
        pandas.read_csv(csv_file).to_excel(ew, sheet_name=csv_file)
原因应该是文件一直处于打开状态,缓冲区中的部分数据还没有写入磁盘,所以后面读取 data.csv 时内容不完整。解决办法是在 if __name__ == '__main__': 中、线程池的 with 块结束之后,添加一条关闭文件句柄的语句 f.close() 即可:
if __name__ == '__main__':
    # Sub-category codes per section; any section not listed here uses the
    # fallback list.
    fallback_codes = ['01', '02', '03', '04']
    section_codes = {
        '01': ['01', '02', '03', '04', '05'],
        '02': ['01', '03', '04', '05', '08', '16'],
        '07': ['01', '02'],
    }
    with ThreadPoolExecutor(100) as t:
        for x in ('01', '02', '07', '08', '09', '11', '19'):
            for z1 in section_codes.get(x, fallback_codes):
                for i in range(1, 3):
                    # Page 1 has a fixed file name; page 2 is named by number.
                    page_url = (
                        f"http://whsggzy.wuhu.gov.cn/jyxx/0050{x}/0050{x}0{z1}/moreinfo_listjy.html"
                        if i == 1
                        else f"http://whsggzy.wuhu.gov.cn/jyxx/0050{x}/0050{x}0{z1}/{i}.html"
                    )
                    t.submit(download_wuhu_page, page_url)
    # The pool has finished all tasks once the with-block exits; closing the
    # file handle here flushes the rows still held in its buffer.
    f.close()
如对你有帮助,请点击采纳按钮。
您好,我是有问必答小助手,您的问题已经有小伙伴帮您解答,感谢您对有问必答的支持与关注!