爬取详情内容的代码写得不对,应该怎么改?

import requests,re
from openpyxl import Workbook
# wb=Workbook()
# ws =wb.active
# ws.append(["详情介绍"])
# Browser-like User-Agent sent with every request.
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
# Example of a detail page: https://www.hjutv.cn/detail_6119.html
detail_url="https://www.hjutv.cn/detail_{}.html"
# Compiled once, outside the loop.
#  - re.S lets `.` cross newlines, so a description spread over several
#    lines is still captured.
#  - `[^>]*` accepts the div with or without extra attributes such as
#    style="display: none;", and `\s*` tolerates whitespace between tags.
detail_pattern = re.compile(
    r'<div class="content_desc context clearfix"[^>]*>\s*<span>(.*?)</span>\s*</div>',
    re.S,
)
# NOTE(review): ids 1..10 may not correspond to real detail pages (the
# example above uses 6119) -- confirm the id range before relying on output.
for i in range(1,11):
    url = detail_url.format(i)
    print(url)
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    html = response.text
    # List of every description found on the page (empty when none match).
    detail = detail_pattern.findall(html)
    print(detail)

这样?

img

import requests,re
from openpyxl import Workbook
# Crawl the drama list pages, follow each entry to its detail page, and
# store title / cast / introduction as rows of an Excel workbook.
wb = Workbook()
ws = wb.active
ws.append(["韩剧名","主演","详情介绍"])
# Browser-like User-Agent sent with every request.
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
page_url = "https://www.hjutv.cn/show_2________{}___.html"
detail_url = "https://www.hjutv.cn/detail_{}.html"

# All patterns are compiled once, outside the loops.
# One <li> per drama on a list page.
tvplay_pattern = re.compile(r'<li class="vodlist_item .*?">(.*?)</li>',re.S)
# Drama title.
name_pattern = re.compile(r'<p class="vodlist_title"><a[^>]+>([^<]+)',re.S)
# The <p> holding all actor links, then each actor name inside it.
actor_name_p_pattern = re.compile(r'<p class="vodlist_sub">(.*?)</p>',re.S)
actor_name_pattern = re.compile(r'<a[^>]+>([^<]+)',re.S)
# Numeric id of the detail page linked from the list item.
href_pattern = re.compile(r'detail_(\d+)',re.S)
# Introduction text on the detail page.
intro_pattern = re.compile(r'<div class="content_desc context clearfix"><span>([^<]+)',re.S)

# The request for the unformatted template URL that used to sit here was
# removed: it fetched a URL containing a literal "{}" and its result was
# never used.
for i in range(1,2):  # widen this range to crawl more list pages
    url = page_url.format(i)
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=10)
    html = response.text
    tvplay_list = tvplay_pattern.findall(html)
    for tvplay in tvplay_list:
        # Title, or a placeholder when the item has none.
        name = name_pattern.findall(tvplay)
        name = name[0] if name else "---"

        # Comma-separated actor names; empty string when the block is absent.
        actor_name = ""
        actor_name_p = actor_name_p_pattern.findall(tvplay)
        if actor_name_p:
            actor_name = ','.join(actor_name_pattern.findall(actor_name_p[0]))

        # Introduction from the detail page. Falls back to the placeholder
        # when the item has no detail link (previously an IndexError) or the
        # detail page has no matching introduction.
        detail = "---"
        href = href_pattern.findall(tvplay)
        if href:
            detail_html = requests.get(
                detail_url.format(href[0]), headers=headers, timeout=10
            ).text
            intro = intro_pattern.findall(detail_html)
            if intro:
                detail = intro[0]

        print(name,actor_name,detail)
        ws.append([name,actor_name,detail])
wb.save("韩剧.xlsx")
 

首先,你生成的网址有问题:例如 https://www.hjutv.cn/detail_1.html 这个网址是没有数据的,也就是说这个网址本身就不正确。
我是在你的代码基础上简单地修改了一下,其实还有很大的优化空间。

import requests
from lxml import etree

# Request headers: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
        " Chrome/93.0.4577.63 Safari/537.36"
    ),
}
detail_url = "https://www.hjutv.cn/detail_{}.html"
# Text nodes of every <span> nested inside the description <div>.
desc_xpath = '//div[@class="content_desc context clearfix"]//span/text()'

# Visit detail pages 1..10, printing each URL followed by the text found.
for page_id in range(1, 11):
    page_link = detail_url.format(page_id)
    print(page_link)
    page_source = requests.get(url=page_link, headers=headers).text
    tree = etree.HTML(page_source)
    print(tree.xpath(desc_xpath))