import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
try:
r=request.get(url,timeout=30)
r.raise_for_status()
r.encoding=r.apparent_encoding
return r.text
except:
return ""
return ""
def fillUnivList(ulist,html):
soup=BeautifulSoup(html,"html.parser")
for tr in soup.find('tbody').children:
if isinstance(tr,bs4.element.Tag):
tds=th('td')
ulist.append([tds[0].string,tds[1].string,tds[2].string])
pass
def printUnivList(ulist,num):
print("{:^10}\t{:^6}\t{:^10}".format("排名","学校名称","省市"))
for i in range(num):
u=ulist[i]
print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))
def main():
uinfo=[]
url="https://www.dxsbb.com/news/44368.html"
html =getHTMLText(url)
fillUnivList(uinfo,html)
printUnivList(uinfo,30)
main()
代码问题:1.request少写了s。
2. fillUnivList(ulist,html)函数没有return。
3.printUnivList(ulist,num)缺少异常处理。
4.main()没有调用,应该放到main()函数外。
改成这样就行了。
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
try:
r=requests.get(url,timeout=30)
r.raise_for_status()
r.encoding=r.apparent_encoding
return r.text
except:
return ""
return ""
def fillUnivList(ulist,html):
soup=BeautifulSoup(html,"html.parser")
for tr in soup.find('tbody').children:
if isinstance(tr,bs4.element.Tag):
tds=tr('td')
ulist.append([tds[0].string,tds[1].string,tds[2].string])
return ulist
def printUnivList(ulist,num):
print("{:^10}\t{:^6}\t{:^10}".format("排名","学校名称","省市"))
for i in range(num):
try:
u=ulist[i]
print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2]))
except:
pass
def main():
uinfo=[]
url="https://www.dxsbb.com/news/44368.html"
html =getHTMLText(url)
ulist=fillUnivList(uinfo,html)
printUnivList(ulist,30)
if __name__=='__main__':
main()
try:
r=request.get(url,timeout=30)
r.raise_for_status()
r.encoding=r.apparent_encoding
return r.text
except:
return ""
没报错是因为你try 捕获异常了,
你的请求 requests 请求 少个 s
您好,我是有问必答小助手,你的问题已经有小伙伴为您解答了问题,您看下是否解决了您的问题,可以追评进行沟通哦~
如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~
ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>>https://vip.csdn.net/askvip?utm_source=1146287632
#!/usr/bin/env python
from lxml import etree
import requests
import time
headers = {
'Host': 'bang.dangdang.com',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}
data = requests.get(url,headers=headers).text
s = etree.HTML(data)
with open('dangdang.csv','w') as outputfile:
items = s.xpath('//*[@id="sortRanking"]/div')
for item in items:
book_url=item.xpath('./a/@href')
item_name=item.xpath('./a/text()')
if len(book_url)>0:
href=book_url[0]
item_title=item_name[0]
a=href[41:46]
print(item_title)
for page in range(1,26):
per_url= 'http://bang.dangdang.com/books/fivestars/{}.00.00.00.00-all-0-0-1-{}'.format(a,page)
data2=requests.get(per_url).text
f=etree.HTML(data2)
try:
file=f.xpath('//ul[@class="bang_list clearfix bang_list_mode"]/li')
print('正在打印{}第{}页…………'.format(item_title,page))
time.sleep(2)
for book in file:
title=book.xpath('./div[@class="name"]/a/@title')[0]
author=book.xpath('string(./div[@class="publisher_info"][1])')
pinglun=book.xpath('./div[@class="star"]/a/text()')[0].strip('条评论')
wuxing=book.xpath('./div[@class="biaosheng"]/span/text()')[0].strip('次')
price_now=book.xpath('./div[@class="price"]/p/span[1]/text()')[0]
price_before=book.xpath('./div[@class="price"]/p/span[2]/text()')[0]
price_sale=book.xpath('./div[@class="price"]/p/span[3]/text()')[0]
try:
date=book.xpath('./div[@class="publisher_info"]/span/text()')[0]
except:
date='出版时间不详'
try:
company=book.xpath('./div[@class="publisher_info"][2]/a/text()')[0]
except:
company='出版社不详'
try:
price_e=book.xpath('./div[@class="price"]/p[@class="price_e"]/span/text()')[0]
except:
price_e="没有电子书"
outputfile.write('{},{},{},{},{},{},{},{},{},{}'.format(title,author,date,company,pinglun,wuxing,price_now,price_before,price_sale,price_e))
except:
pass