from urllib.request import Request, urlopen
import ssl
from bs4 import BeautifulSoup
import pandas as pd
# Address of the page to download.
url = 'https://www.abc.net.au/news/justin'

# Send a browser-style User-Agent header along with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macinstosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
req = Request(url, headers=headers)

# NOTE(review): this context does not verify the server's TLS certificate and
# is built with a private helper of the ssl module. Confirm this is
# intentional; ssl.create_default_context() is the verifying alternative.
context = ssl._create_unverified_context()

# The with-block closes the response even when read() raises, so the
# connection is not leaked on a failed download.
with urlopen(req, context=context) as uClient:
    html = uClient.read()
#################################################
#################################################
# Parse the downloaded markup and collect one {'title', 'url'} record for each
# link found inside the <div class="JustInPaginationList"> container.
soup = BeautifulSoup(html, 'html.parser')
maindiv = soup.find('div', class_="JustInPaginationList")
dataset = []

# find() returns None when no matching <div> exists; treat that as "no links"
# so the loop simply does not run instead of failing with a TypeError.
anchors = maindiv.find_all('a') if maindiv is not None else []

for item in anchors:
    heading = item.find('p')
    # Named `link` (not `url`) so the page address stored in `url` is not
    # overwritten by the last scraped href.
    link = item.get('href')
    # Skip anchors that lack a <p> title or an href attribute; the original
    # code raised AttributeError / KeyError on them.
    if heading is None or link is None:
        continue
    title = heading.get_text()
    print(title)
    print(link)
    print()
    dataset.append({'title': title, 'url': link})
# maindiv should be written as: maindiv = soup.find_all('div', class_="JustInPaginationList")
# and the for loop should be written as:
# Expects maindiv to be the ResultSet returned by soup.find_all(...), as the
# note above this loop describes.
for items in maindiv:
    # First <a> element inside this container, or None when it has none.
    item = items.find('a')
    if item is None:
        continue
    heading = item.find('p')
    link = item.get('href')
    # Skip links without a <p> title or an href attribute instead of raising.
    if heading is None or link is None:
        continue
    title = heading.text
    print(title)
    print(link)