首页
编程
java
php
前端
首页
编程
java
php
前端
使用Anaconda3中的jupyternotebook写爬虫爬取结果重复
import requests
from lxml import etree
import re
import csv
# Scrape article title/author/date from listing pages of www.trzy.edu.cn.
# (Reconstructed: the original paste had the URL quotes mangled as %22 and
# "404 Not Found" text spliced into the string literals.)

# Build directory-page URLs for pages 936..939.
# Renamed from `list`, which shadowed the builtin.
page_urls = []
for page_no in range(936, 940):
    page_urls.append("http://www.trzy.edu.cn/html/" + str(page_no) + "/")
# NOTE(review): page_urls is built but never used below — only testUrl is
# scraped. Presumably the outer fetch was meant to loop over page_urls; kept
# as-is to preserve the original single-page behavior. TODO confirm intent.

testUrl = "http://www.trzy.edu.cn/html/936/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.8031 SLBChan/30"}

r = requests.get(testUrl, headers=headers, timeout=5)
r.raise_for_status()  # fail fast instead of silently parsing an error page

# Collect the per-article links from the listing page.
html = etree.HTML(r.text)
li = html.xpath('.//div[@class="gov-main"]/div[@class="content"]/ul/li')
baseUrl = "http://www.trzy.edu.cn"
dirUrl = []
for item in li:
    # hrefs on the listing page are site-relative; prepend the host
    dirUrl.append(baseUrl + item.xpath("./a/@href")[0])

# Fetch each article page and print its metadata.
for mburl in dirUrl:
    r1 = requests.get(mburl, headers=headers, timeout=5)
    r1.encoding = "utf8"
    html0 = etree.HTML(r1.text)
    # BUG FIX (the duplicated-results problem asked about): the original
    # wrapped the three xpath calls in `for html in html0:`, which iterates
    # over every child element of the document root and re-runs the same
    # absolute-path queries each time — printing the same article metadata
    # once per child element. The queries must run exactly once per page.
    title = html0.xpath('//div[@class="layout_txtcontent_title"]/text()')[0]
    author = html0.xpath('//div[@class="layout_txtcontent_info"]/text()')[0]
    time = html0.xpath('//div[@class="layout_txtcontent_info"]/text()')[1]
    print(title, author, time)
点击展开全文