import requests,re
from openpyxl import Workbook
# wb=Workbook()
# ws =wb.active
# ws.append(["详情介绍"])
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
# Example of a real detail page: https://www.hjutv.cn/detail_6119.html
detail_url = "https://www.hjutv.cn/detail_{}.html"

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Compiled once instead of on every loop iteration.  re.S lets "." span
# newlines so a description broken over several lines is still captured.
detail_pattern = re.compile(
    r'<div class="content_desc context clearfix" style="display: none;"><span>(.*?)</span></div>',
    re.S,
)


def extract_details(html):
    """Return every description captured by ``detail_pattern`` in *html*.

    Args:
        html: Page source as text.

    Returns:
        A list of captured strings; empty when the page has no matching block.
    """
    return detail_pattern.findall(html)


def main():
    """Fetch detail pages 1-10 and print each URL with the descriptions found."""
    for i in range(1, 11):
        url = detail_url.format(i)
        print(url)
        try:
            response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining ones.
            print("request failed:", exc)
            continue
        print(extract_details(response.text))


if __name__ == "__main__":
    main()
这样?
import requests,re
from openpyxl import Workbook
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
page_url = "https://www.hjutv.cn/show_2________{}___.html"
detail_url = "https://www.hjutv.cn/detail_{}.html"

FIRST_PAGE = 1
LAST_PAGE = 1  # raise this to scrape more listing pages
# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10
# Written to the sheet when a title or introduction cannot be found.
PLACEHOLDER = "---"

# All patterns are compiled once here rather than once per listing page.
# One <li> per show on a listing page.
tvplay_pattern = re.compile(r'<li class="vodlist_item .*?">(.*?)</li>', re.S)
# Show title.
name_pattern = re.compile(r'<p class="vodlist_title"><a[^>]+>([^<]+)', re.S)
# The <p> holding all actor <a> tags, then the text of each actor link.
actor_name_p_pattern = re.compile(r'<p class="vodlist_sub">(.*?)</p>', re.S)
actor_name_pattern = re.compile(r'<a[^>]+>([^<]+)', re.S)
# Numeric id of the show's detail page.
href_pattern = re.compile(r'detail_(\d+)', re.S)
# Introduction text on a detail page.
intro_pattern = re.compile(r'<div class="content_desc context clearfix"><span>([^<]+)', re.S)


def parse_listing(html):
    """Extract one ``(name, actors, detail_id)`` tuple per show in *html*.

    Args:
        html: Source text of a listing page.

    Returns:
        A list of tuples.  ``name`` is ``PLACEHOLDER`` when no title matches,
        ``actors`` is a comma-joined string (empty when the actor paragraph is
        missing) and ``detail_id`` is the id string from the first
        ``detail_<digits>`` occurrence, or ``None`` when the item has none.
    """
    entries = []
    for item in tvplay_pattern.findall(html):
        names = name_pattern.findall(item)
        name = names[0] if names else PLACEHOLDER

        actor_blocks = actor_name_p_pattern.findall(item)
        actors = ",".join(actor_name_pattern.findall(actor_blocks[0])) if actor_blocks else ""

        # An item without a detail link used to raise IndexError here.
        ids = href_pattern.findall(item)
        detail_id = ids[0] if ids else None

        entries.append((name, actors, detail_id))
    return entries


def parse_intro(html):
    """Return the introduction text in a detail page, or ``PLACEHOLDER``."""
    found = intro_pattern.findall(html)
    return found[0] if found else PLACEHOLDER


def fetch_text(url):
    """Return the body text of *url*, or ``None`` when the request fails."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Report and carry on so one bad page does not lose the whole sheet.
        print("request failed:", url, exc)
        return None
    return response.text


def main():
    """Scrape the configured listing pages and save the rows to 韩剧.xlsx."""
    wb = Workbook()
    ws = wb.active
    ws.append(["韩剧名", "主演", "详情介绍"])

    # The former request to the unformatted page_url template (literal "{}")
    # was dropped: it hit a meaningless URL and its result was never used.
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        listing_html = fetch_text(page_url.format(page))
        if listing_html is None:
            continue
        for name, actors, detail_id in parse_listing(listing_html):
            detail = PLACEHOLDER
            if detail_id is not None:
                detail_html = fetch_text(detail_url.format(detail_id))
                if detail_html is not None:
                    detail = parse_intro(detail_html)
            print(name, actors, detail)
            ws.append([name, actors, detail])

    wb.save("韩剧.xlsx")


if __name__ == "__main__":
    main()
首先,你获取的网址有问题,例如 https://www.hjutv.cn/detail_1.html 这个网址是没有数据的,也就是说这个网址本身就不正确。
我在你的代码基础上简单地修改了一下,其实还有很大的优化空间。
import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/93.0.4577.63 Safari/537.36"
}
detail_url = "https://www.hjutv.cn/detail_{}.html"

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Text nodes of every <span> under the description <div>.
DETAIL_XPATH = '//div[@class="content_desc context clearfix"]//span/text()'


def main():
    """Fetch detail pages 1-10 and print each URL with the description text found."""
    for i in range(1, 11):
        url = detail_url.format(i)
        print(url)
        try:
            response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining ones.
            print("request failed:", exc)
            continue
        tree = etree.HTML(response.text)
        # NOTE(review): etree.HTML appears to return None for a body with no
        # parseable content; guard so .xpath is not called on None.
        detail = tree.xpath(DETAIL_XPATH) if tree is not None else []
        print(detail)


if __name__ == "__main__":
    main()