import requests,re
from openpyxl import Workbook
# wb=Workbook()
# ws =wb.active
# ws.append(["详情介绍"])
# Scrape the paginated listing on hjutv.cn: for every list item, print its
# title and save its thumbnail image as ./data/<title>.jpg.

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
# Example detail page: https://www.hjutv.cn/detail_6119.html
page_url = "https://www.hjutv.cn/show_2________{}___.html"
detail_url = "https://www.hjutv.cn/detail_{}.html"

# None of the patterns depend on the page being processed, so they are
# compiled once here instead of once per page.
li_pattern = re.compile(r'<li class="vodlist_item .*?">(.*?)</li>', re.S)
href_pattern = re.compile(r'detail_(\d+)', re.S)
# Not used in this block; presumably intended for the detail pages - TODO confirm.
detail_pattern = re.compile(r'<div class="content_desc context clearfix"><span>([^<]+)', re.S)
name_pattern = re.compile(r'<p class="vodlist_title"><a .*?>(.*?)</a></p>', re.S)
# Other attributes (e.g. href) sit between class and data-original, so skip
# them lazily and capture only the quoted URL. re.S lets the skip cross a
# line break in case the tag is wrapped over several lines.
img_pattern = re.compile(r'<a class="vodlist_thumb lazyload".*?data-original="([^"]+)"', re.S)

for i in range(1, 28):
    url = page_url.format(i)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    html = response.text
    li_list = li_pattern.findall(html)
    for li in li_list:
        href_match = href_pattern.search(li)
        name_match = name_pattern.search(li)
        img_match = img_pattern.search(li)
        if not (href_match and name_match and img_match):
            # Skip items missing a field instead of crashing with IndexError.
            continue
        href = detail_url.format(href_match.group(1))
        # print(href)
        name = name_match.group(1)
        print(name)
        img = img_match.group(1)
        # Fetch the image bytes; the file is written from this response body.
        img_data = requests.get(url=img, headers=headers, timeout=10).content
        # Replace characters that are not allowed in file names so a title
        # containing e.g. "/" or ":" cannot break the output path.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", name)
        # NOTE: the ./data directory must already exist.
        with open("./data/{}.jpg".format(safe_name), "wb") as f:
            f.write(img_data)
<a class="vodlist_thumb lazyload" href="/index.php/detail_6091.html"
data-original="https.jpg" ></a>
不需要正则:先用 lxml 解析页面,再直接用 xpath 即可获取,例如 img = li.xpath('.//a/@data-original')[0](这里的 li 必须是 lxml 解析得到的元素,而不是正则匹配出的字符串)。
https://tva1.sinaimg.cn/large/c0e0f216gy1gwqtznc9oxj207i0aot92.jpg
https://tva1.sinaimg.cn/large/003wZMYmgy1gv3ve3dopkj609o0d7jru02.jpg
https://tva1.sinaimg.cn/large/c0e0f216gy1gwb2x8dy4cj207i07iaa6.jpg
https://tva1.sinaimg.cn/large/c0e0f216gy1gw34sisxu2j207i0b9dg8.jpg
# Corrected thumbnail pattern: lazily skip whatever attributes sit between
# class and data-original (e.g. href), then capture only the quoted URL.
img_pattern =re.compile(r'<a class="vodlist_thumb lazyload".*?data-original="([^"]+)"')
你题目的解答代码如下:
import requests,re
from openpyxl import Workbook
# wb=Workbook()
# ws =wb.active
# ws.append(["详情介绍"])
# Walk the paginated listing on hjutv.cn and print, for every list item,
# its title and the URL of its thumbnail image.

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
# Example detail page: https://www.hjutv.cn/detail_6119.html
page_url = "https://www.hjutv.cn/show_2________{}___.html"
detail_url = "https://www.hjutv.cn/detail_{}.html"

# The patterns are the same for every page, so compile them once up front
# rather than on each loop iteration.
li_pattern = re.compile(r'<li class="vodlist_item .*?">(.*?)</li>', re.S)
href_pattern = re.compile(r'detail_(\d+)', re.S)
# Not used in this block; presumably intended for the detail pages - TODO confirm.
detail_pattern = re.compile(r'<div class="content_desc context clearfix"><span>([^<]+)', re.S)
name_pattern = re.compile(r'<p class="vodlist_title"><a .*?>(.*?)</a></p>', re.S)
# Lazily skip the attributes between class and data-original, then capture
# only the quoted image URL.
img_pattern = re.compile(r'<a class="vodlist_thumb lazyload".*?data-original="([^"]+)"')

for i in range(1, 28):
    url = page_url.format(i)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    html = response.text
    li_list = li_pattern.findall(html)
    for li in li_list:
        href_match = href_pattern.search(li)
        name_match = name_pattern.search(li)
        img_match = img_pattern.search(li)
        if href_match is None or name_match is None or img_match is None:
            # An item without all three fields is skipped rather than
            # aborting the run with an IndexError.
            continue
        href = detail_url.format(href_match.group(1))
        # print(href)
        name = name_match.group(1)
        print(name)
        img = img_match.group(1)
        print(img)
如有帮助,望采纳!谢谢!