如标题所述,除最底层的评论外,其余内容都可以正常爬取。网页源代码我也检查过,没有发现问题。
import re
import urllib.request
import xlwt
from bs4 import BeautifulSoup
import time
# time.sleep(5)
def main():
    """Script entry point: run ``gat`` on the hard-coded Tieba hot-topic URL."""
    # Adjacent string literals are joined at compile time, so this is a single
    # URL; it is split only to keep the query parameters readable.
    topic_url = (
        'http://tieba.baidu.com/hottopic/browse/hottopic'
        '?topic_id=865930'
        '&topic_name=LOL%E6%89%8B%E6%B8%B8%E4%BD%93%E9%AA%8C%E6%84%9F%E5%8F%97'
        '&red_tag=p3027039361'
    )
    # The result is bound but not used any further here.
    scraped = gat(topic_url)
# Captures the inner text of each <p class="content">...</p> element.
# DOTALL lets "." match newlines, so paragraphs spanning several lines match too.
message = re.compile(r'<p class="content">(.*?)</p>', flags=re.DOTALL)
def gat(url1):
    """Download a page and collect the ``<p class="content">`` texts it contains.

    The page is fetched with ``breal``, parsed with BeautifulSoup, and every
    ``<div class="center">`` element is searched with the module-level
    ``message`` pattern.

    Args:
        url1: Address of the page to scrape.

    Returns:
        A list with one entry per ``<div class="center">`` found; each entry
        is the list of strings captured by ``message`` inside that div
        (possibly empty). The outer list is empty when no such div exists,
        e.g. when ``breal`` returned an empty page.
    """
    html = breal(url1)
    # Parse with the stdlib-backed 'html.parser' (the original author marked
    # this step as one to pay attention to).
    soup = BeautifulSoup(html, 'html.parser')
    save = []
    # Calling the soup object directly is shorthand for soup.find_all(...).
    for item in soup('div', class_="center"):
        itrm = str(item)
        print(itrm)  # debug output: shows whether the page content was fetched
        save.append(message.findall(itrm))
    # Previously the collected matches were built but never returned, so the
    # caller always received None.
    return save
# for i in soup('li', class_="thread-item"):
# print(i)
def breal(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-Chrome ``User-Agent`` header.

    Args:
        url: Address to download.

    Returns:
        The decoded response body, or ``''`` when the request raised
        ``urllib.error.URLError``; in that case the error's ``code`` and/or
        ``reason`` attributes are printed, whichever are present.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8 (not caught here).
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        # The context manager closes the response even if read()/decode()
        # raises, so the underlying connection is not leaked.
        with urllib.request.urlopen(request) as reopen:
            html = reopen.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()