import time
from urllib.parse import urljoin

import requests
from lxml import etree
from openpyxl import Workbook
# Scrape the Korean-drama listing pages of hjutv.cn and store one row per
# show (title, actors, update status, detail-page URL) in an Excel workbook.

wb = Workbook()  # create a new workbook
ws = wb.active  # write into the workbook's active sheet
ws.append(["剧名", "演员", "更新状态", "详情页链接"])  # header row

# Request headers and URL templates.
headers = {
    "User-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Mobile Safari/537.36",
    # NOTE(review): presumably the site expects a referer from its own
    # listing page before it serves the list markup -- confirm.
    "referer": "https://www.hjutv.cn/show_2___________.html",
}
all_url = "https://www.hjutv.cn/show_2________{}___.html"
# Site root. Detail-page hrefs are resolved against this instead of being
# concatenated onto the category page URL, which produced invalid links.
base_url = "https://www.hjutv.cn/"

try:
    for i in range(1, 28):
        url = all_url.format(i)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        li_list = html.xpath(
            '//ul[@class="vodlist vodlist_wi author*qq3626/95/000 clearfix"]/li'
        )
        print(len(li_list))
        for li in li_list:
            # 1. Title.
            names = li.xpath('.//p[@class="vodlist_title"]/a/text()')
            # 4. Detail-page link. `li` is already the list item, so the link
            #    is looked up on the title anchor inside it rather than on a
            #    nested <li>, which never matched.
            links = li.xpath('.//p[@class="vodlist_title"]/a/@href')
            if not names or not links:
                # Skip items that carry no title or link instead of crashing
                # with IndexError.
                continue
            name = names[0]
            # 2. Actors: joined with commas, or "--" when none are listed.
            actors = li.xpath('.//p[@class="vodlist_sub"]//a/text()')
            actor = ",".join(actors) if actors else "--"
            # 3. Update status, with the same "--" placeholder when absent.
            statuses = li.xpath('.//span[@class="pic_text text_right"]/text()')
            updatestatus = statuses[0] if statuses else "--"
            # urljoin copes with both relative and absolute hrefs.
            href = urljoin(base_url, links[0])
            print(name, actor, updatestatus, href)
            ws.append([name, actor, updatestatus, href])
        # Pause between pages so the site is not hit with back-to-back requests.
        time.sleep(1)
finally:
    # Save whatever was collected, even if a request failed part-way through.
    wb.save("韩剧数据表.xlsx")
请求的 headers 和 url 都要写正确:在请求头中添加 referer,并把键名写成 "user-agent";原代码中 detaillink 的 xpath 不正确(循环变量 li 本身就是列表项节点,再用 .//li 去查找其后代中的 li 一般匹配不到,取 [0] 时会报 IndexError);多页请求时还应添加延时。测试代码如下,可获取数据。
import requests
from lxml import etree
from openpyxl import Workbook
import time
# Scrape the first listing pages of hjutv.cn and write one row per show
# (title, actors, update status, detail-page URL) to an Excel workbook.

wb = Workbook()  # create a new workbook
ws = wb.active  # write into the workbook's active sheet
ws.append(["剧名", "演员", "更新状态", "详情页链接"])  # header row

# Request headers and URL templates.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43",
    'referer': 'https://www.hjutv.cn/show_2___________.html',
}
all_url = "https://www.hjutv.cn/show_2________{}___.html"
base_url = "https://www.hjutv.cn/"

try:
    for i in range(1, 3):
        url = all_url.format(i)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        # Browser-copied XPath of the first item, kept for reference:
        # //*[@id="show_page"]/div[2]/div/div[2]/ul[1]/li[1]
        li_list = html.xpath(
            '//ul[@class="vodlist vodlist_wi author*qq3626/95/000 clearfix"]/li'
        )
        print(len(li_list))
        for li in li_list:
            # 1. Title.
            names = li.xpath('.//p[@class="vodlist_title"]/a/text()')
            # 4. Detail-page link: the first anchor inside the list item.
            links = li.xpath('.//a/@href')
            if not names or not links:
                # Skip items that carry no title or link instead of crashing
                # with IndexError.
                continue
            name = names[0]
            # 2. Actors: joined with commas, or "--" when none are listed.
            actors = li.xpath('.//p[@class="vodlist_sub"]//a/text()')
            actor = ",".join(actors) if actors else "--"
            # 3. Update status, with the same "--" placeholder when absent.
            statuses = li.xpath('.//span[@class="pic_text text_right"]/text()')
            updatestatus = statuses[0] if statuses else "--"
            # urljoin avoids the "//" that plain concatenation yields when the
            # href already starts with a slash, and leaves absolute URLs intact.
            href = urljoin(base_url, links[0])
            print(name, actor, updatestatus, href)
            ws.append([name, actor, updatestatus, href])
        # Pause between pages so the site is not hit with back-to-back requests.
        time.sleep(1)
finally:
    # Save whatever was collected, even if a request failed part-way through.
    wb.save("韩剧数据表.xlsx")
如有帮助,请点采纳。
# Site root without a trailing slash; presumably meant to be prefixed to
# root-relative hrefs (ones starting with "/") -- confirm against the page.
base_url = "https://www.hjutv.cn"
# Take the href of the title anchor inside the current list item. `li` is
# expected to be the per-item loop variable of the scraper; it is not
# defined in this fragment. Raises IndexError when no such anchor exists.
detaillink = li.xpath('.//p[@class="vodlist_title"]/a/@href')[0]