from bs4 import BeautifulSoup  # HTML parsing
import urllib.error, urllib.request  # building requests and fetching pages
import xlwt  # writing .xls workbooks
from lxml import etree  # XPath extraction
import time
from tenacity import retry, wait_fixed, stop_after_attempt
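# Scraper for the Chengdu 12345 government hotline site
# (http://12345.chengdu.gov.cn): for each of five complaint categories it
# walks the paginated listings, follows every call's detail page, and saves
# the extracted fields to an .xls workbook.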
def main():
    baseurl = "http://12345.chengdu.gov.cn/moreTelByClass?TelType="
    # 1. Crawl the pages (parsing happens inside getData as well)
    datalist = getData(baseurl)
    # 2. Output path (xlwt only writes the legacy .xls format)
    savepath = "D:/Pycharm/MyProject/exp_data_2020083021.xls"
    # 3. Save the data
    saveData(datalist, savepath)
def first_text(nodes, default=''):
    # xpath() returns a list of text nodes; take the first one, stripped
    return nodes[0].strip() if nodes else default

# Crawl the listing pages
def getData(baseurl):
    print("Start crawling...")
    # Category code -> category name (written as the first field of each record)
    type_names = {
        '1101': 'City Management',
        '1104': 'Traffic Management',
        '1107': 'Environmental Protection',
        '1122': 'Urban and Rural Housing',
        '1137': 'Agriculture, Forestry, Husbandry and Fishery',
    }
    datalist = []
    for h in range(1, 280):  # listing pages 1..279
        for k, x in type_names.items():
            url = baseurl + k + "&page=" + str(h)
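            # Resulting request URL looks like:
            # http://12345.chengdu.gov.cn/moreTelByClass?TelType=1101&page=1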
            html_1 = askURL(url)  # raw HTML of the listing page
            # Parse every record on the page
            soup = BeautifulSoup(html_1, "html.parser")
            for item in soup.find_all('li', class_="f12px"):  # one <li> per call
                data = [x]  # all fields of one record, starting with the category
                itree = etree.HTML(str(item))
                # Call title
                data.append(first_text(itree.xpath('//div[@class="listTit7"]/text()')))
                # Handling unit
                data.append(first_text(itree.xpath('//div[@class="listTit2"][1]/text()')))
                # Status
                data.append(first_text(itree.xpath('//div[@class="listTit8"][1]/text()')))
                # Type label shown on the page
                data.append(first_text(itree.xpath('//div[@class="listTit2"][2]/text()')))
                # Link to the call's detail page
                link = "http://12345.chengdu.gov.cn/" + first_text(itree.xpath('//li[@class="f12px"][1]/a/@href'))
                data.append(link)
                # Page views
                data.append(first_text(itree.xpath('//div[@class="listTit8"][2]/text()')))
                # Call date
                data.append(first_text(itree.xpath('//div[@class="listTit8"][3]/text()')))
                # Fetch the detail page for content, department, and result
                html_temp = askURL(link)
                soup_temp = BeautifulSoup(html_temp, "html.parser")
                cont, proc = '', ''
                result = 'In progress'  # default while no result has been posted
                for i in soup_temp.find_all('table', class_="tb"):
                    tree = etree.HTML(str(i))
                    # Call content
                    cont = first_text(tree.xpath('//td[@id="FmContent"]/text()'), cont)
                    # Handling department(s); join in case there are several
                    proc_nodes = tree.xpath('//td[@class="td31 f12pxgrey"]/text()')
                    if proc_nodes:
                        proc = ", ".join(t.strip() for t in proc_nodes)
                    # Handling result
                    result = first_text(tree.xpath('//td[@id="DOverDesc"]/text()'), result)
                # Always append all three so every record has exactly 11 fields
                data.extend([cont, proc, result])
                datalist.append(data)
                print(data)
        print("Page %d finished" % h)
    return datalist
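# Each record has 11 fields, matching the header row written by saveData:
# category, title, handling unit, status, type, link, page views, call date,
# call content, handling department, handling result.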
# Fetch the HTML of a single URL, retrying on failure
@retry(wait=wait_fixed(3), stop=stop_after_attempt(5))
def askURL(url):
    # Browser-like headers so the server does not reject the request
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.26"}
    request = urllib.request.Request(url, headers=head)
    try:
        time.sleep(0.1)  # brief pause between requests
        response = urllib.request.urlopen(request)
        return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        print("Request failed: %s" % e)
        raise  # re-raise so @retry can try again; returning None would crash the parser
def saveData(datalist, savepath):
    book = xlwt.Workbook(encoding="utf-8")  # create the workbook
    sheet = book.add_sheet('exp_data_2020083021')  # create the worksheet
    col = ("Call category", "Call title", "Handling unit", "Status", "Type",
           "Link", "Page views", "Call date", "Call content",
           "Handling department", "Handling result")
    for i in range(0, 11):
        sheet.write(0, i, col[i])  # header row
    for i in range(0, len(datalist)):
        try:
            print("Record %d" % (i + 1))
            data = datalist[i]
            for j in range(0, 11):
                sheet.write(i + 1, j, data[j])
        except Exception as e:
            print("!!! Record %d failed: %s" % (i + 1, e))
    print("Done")
    book.save(savepath)
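# Note: the legacy .xls format written by xlwt holds at most 65,536 rows per
# sheet; a much larger crawl would need the output split across sheets or a
# writer that produces .xlsx.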
if __name__ == "__main__":
    main()