想抓取韩剧网站多个页面中的韩剧详情页链接和图片,下面的代码目前运行不出结果,应该怎么修改才能正常运行?

import requests
from lxml import etree
from openpyxl import Workbook
# Script from the question: walks the listing pages and prints, for every item,
# the title, actors, update status and detail-page link.
wb = Workbook()# Create the workbook
ws = wb.active# Use the active worksheet
ws.append(["剧名","演员","更新状态","详情页链接"]) # Add the header row
# Prepare the URL templates and the request headers
headers={
  "User-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Mobile Safari/537.36"
}
# Listing-page URL template; the {} placeholder receives the page number.
all_url ="https://www.hjutv.cn/show_2________{}___.html"
# NOTE(review): this value is prepended to every detail href below, so the
# result starts with ".../type_2_.html"; presumably the site root was intended
# (see the commented-out alternative at the end of the line).
base_url="https://www.hjutv.cn/type_2_.html"   #base_url="https://www.hjutv.cn/"
# Iterate over page numbers 1 through 27.
for i in range(1,28):
    url = all_url.format(i)
    response = requests.get(url=url, headers=headers)
    html =etree.HTML (response.text)
    # Select the <li> children of the listing <ul>; each one is presumably a
    # single drama entry. The class attribute must match this string exactly.
    li_list = html.xpath('//ul[@class="vodlist vodlist_wi author*qq3626/95/000 clearfix"]/li')
    print(len(li_list))
    for li in li_list:
         # 1. Title (raises IndexError when the XPath matches nothing)
         name = li.xpath('.//p[@class="vodlist_title"]/a/text()')[0]
         # 2. Actors (a list of anchor texts, possibly empty)
         actor = li.xpath('.//p[@class="vodlist_sub"]//a/text()')

         # Use "--" as a placeholder when no actor is listed, otherwise join
         # the names with commas.
         if len(actor)==0:
            actor="--"
         else:
            actor=','.join(actor)
         # 3. Update status
         updatestatus=li.xpath('.//span[@class="pic_text text_right"]/text()')[0]
         # 4. Detail-page link
         # NOTE(review): `li` is already an <li> element and this XPath looks
         # for another <li> nested inside it; when none exists the result is
         # empty and [0] raises IndexError. The accompanying answer reports
         # this XPath as incorrect.
         detaillink=li.xpath('.//li[@class="vodlist_item num_1"]/a/@href')[0]
         href=base_url+detaillink

         print(name,actor,updatestatus,href)

# NOTE(review): writing the rows and saving the workbook are commented out, so
# this version only prints and never creates the .xlsx file.
#          ws.append([name,actor,updatestatus])
# wb.save("韩剧数据表.xlsx")

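请求的 headers 和 url 都要写正确:在请求头中添加 referer,并把请求头的键名改为 "user-agent";base_url 应改为站点根地址。原代码中 detaillink 的 xpath 不正确——循环变量 li 本身已经是 li 节点,再在它内部查找 li[@class="vodlist_item num_1"] 得到的是空列表,取 [0] 时会抛出 IndexError。另外,多页请求时要添加延时。测试代码如下,可获取数据。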

import requests
from lxml import etree
from openpyxl import Workbook
import time
from urllib.parse import urljoin

# Request headers: a desktop browser user-agent plus a referer that points at
# the listing section of the site.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43",
    'referer': 'https://www.hjutv.cn/show_2___________.html',
}
# Listing-page URL template; the {} placeholder receives the page number.
all_url = "https://www.hjutv.cn/show_2________{}___.html"
# Site root that relative detail links are resolved against.
base_url = "https://www.hjutv.cn/"
# Seconds to wait for a response before giving up on a page, so that a stalled
# connection cannot hang the whole crawl.
request_timeout = 10
# Seconds to pause between listing pages.
page_delay = 1


def first_or_default(values, default="--"):
    """Return the first element of *values*, or *default* when it is empty.

    XPath queries return a list; indexing ``[0]`` on an empty result raises
    IndexError, which would abort the crawl on a single incomplete item.
    """
    return values[0] if values else default


def absolute_url(href, base=base_url):
    """Resolve *href* against *base*.

    ``urljoin`` copes with root-relative ("/x.html"), relative ("x.html") and
    already absolute links, so no slash is doubled or lost.
    """
    return urljoin(base, href)


def parse_item(li):
    """Extract one spreadsheet row from a listing ``<li>`` element.

    Args:
        li: lxml element for one entry of the listing.

    Returns:
        ``[name, actor, updatestatus, href]``; any field that is missing from
        the markup is reported as ``"--"``.
    """
    # 1. Title
    name = first_or_default(li.xpath('.//p[@class="vodlist_title"]/a/text()'))
    # 2. Actors, comma separated
    actors = li.xpath('.//p[@class="vodlist_sub"]//a/text()')
    actor = ','.join(actors) if actors else "--"
    # 3. Update status
    updatestatus = first_or_default(
        li.xpath('.//span[@class="pic_text text_right"]/text()'))
    # 4. Detail-page link: prefer the title anchor, fall back to the first
    #    anchor of the item.
    links = (li.xpath('.//p[@class="vodlist_title"]/a/@href')
             or li.xpath('.//a/@href'))
    href = absolute_url(links[0]) if links else "--"
    return [name, actor, updatestatus, href]


def main(first_page=1, last_page=2):
    """Scrape the listing pages and save the rows to an .xlsx workbook.

    Args:
        first_page: first page number to request (inclusive).
        last_page: last page number to request (inclusive). The defaults
            reproduce the two-page test run of the original script.
    """
    wb = Workbook()  # create the workbook
    ws = wb.active  # use the active worksheet
    ws.append(["剧名","演员","更新状态","详情页链接"])  # header row

    for page in range(first_page, last_page + 1):
        url = all_url.format(page)
        try:
            response = requests.get(url=url, headers=headers,
                                    timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Skip a failing page instead of losing the rows collected so far.
            print(f"skipping {url}: {exc}")
            time.sleep(page_delay)
            continue

        html = etree.HTML(response.text)
        # The class attribute must match this string exactly.
        li_list = html.xpath(
            '//ul[@class="vodlist vodlist_wi author*qq3626/95/000 clearfix"]/li')
        print(len(li_list))
        for li in li_list:
            row = parse_item(li)
            print(*row)
            ws.append(row)
        # Pause between pages to avoid hammering the site.
        time.sleep(page_delay)

    wb.save("韩剧数据表.xlsx")


if __name__ == "__main__":
    main()

如有帮助,请点采纳。

# Fragment, presumably meant to replace the two matching assignments in the
# script above: the base URL is written without a trailing slash and the detail
# link is read from the title anchor of the current item.
# NOTE(review): `li` is not defined in this fragment; it presumably refers to
# the per-item loop variable of that script. [0] raises IndexError when the
# XPath matches nothing.
base_url = "https://www.hjutv.cn"
detaillink = li.xpath('.//p[@class="vodlist_title"]/a/@href')[0]