有偿帮忙看一下Python爬取文档信息所出现的问题
import requests
from lxml import etree
import csv
BEST_detail='https://www.ygdy8.net'
url='https://www.ygdy8.net/html/gndy/dyzz/20230630/63864.html'
headers={
'User_Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
}
def get_detail_url(url):
response=requests.get(url,headers=headers)
text=response.content.decode('gbk')
html=etree.HTML(text)
## print(text)
detail_url=html.xpath("table[@class='tbspan']//a/@herf")
## print(detail_url)
return detail_url
## get_detail_url(url)
def parse_detail_page(url):
## print(url)
movie={}
response=requests.get(url,headers=headers)
text=response.content.decode('gbk')
## print(text)
html=etree.HTML(text)
title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
## print(title),加【0】获取列表里的元素
movie['title']=title
zoomE=html.xpath("//div[@id='Zoom']")[0]
## print(zoomE)
cover=zoomE.xpath(".//img/@src")[0]
## print(cover) 获取海报图片链接地址
movie['cover']=cover
movie['grade']=0
information=zoomE.xpath(".//text()")
## print(information),解析内容,是个列表,需要对内容解析
for index,info in enumerate(information):
## print(info) 更简洁
## print(index) 列表序号下标
## 先获得译名
if info.startswith("◎译 名"):
info=info.replace("◎译 名","").strip()
##print(info)
movie['translate']=info
if info.startswith("◎年 代"):
info=info.replace("◎年 代","").strip()
##print(info)
movie['year']=info
if info.startswith("◎产 地"):
info=info.replace("◎产 地","").strip()
##print(info)
movie['area']=info
if info.startswith("◎类 别"):
info=info.replace("◎类 别","").strip()
##print(info)
movie['class']=info
if info.startswith("◎语 言"):
info=info.replace("◎语 言","").strip()
##print(info)
movie['language']=info
if info.startswith("◎字 幕"):
info=info.replace("◎字 幕","").strip()
##print(info)
movie['zimu']=info
if info.startswith("◎上映日期"):
info=info.replace("◎上映日期","").strip()
##print(info)
movie['data']=info
if info.startswith("◎豆瓣评分"):
info=info.replace("◎豆瓣评分","").strip()
##print(info)
movie['grade']=info
if info.startswith("◎片 长"):
info=info.replace("◎片 长","").strip()
##print(info)
movie['time']=info
if info.startswith("◎导 演"):
info=info.replace("◎导 演","").strip()
##print(info)
movie['director']=info
if info.startswith("◎编 剧"):
info=info.replace("◎编 剧","").strip()
##print(info)
movie['writer']=info
if info.startswith("◎主 演"):
info=info.replace("◎主 演","").strip()
actors=[info]
for x in range(index+1,1000):
actor=information[x].strip()
if actor.startswith("◎"):
break
actors.append(actor)
##print(actors)
movie['actors']=actors
if info.startswith("◎演 员"):
info=info.replace("◎演 员","").strip()
actors=[info]
for x in range(index+1,1000):
actor=information[x].strip()
if actor.startswith("◎"):
break
actors.append(actor)
##print(actors)
movie['actors']=actors
if info.startswith("◎简 介"):
info=info.replace("◎简 介","").strip()
profile=information[index+1]
##print(profile)
movie['profile']=profile
return movie
## parse_detail_page(url)##上面这个函数没有return时,若要单独输出,必须在最后面加parse_detail_page(url)
def spider():
base_url='https://www.ygdy8.net/html/gndy/china/list_4_{}.html'
movies=[]
for x in range(3,4):
url=base_url.format(x)
detail_urls=get_detail_url(url)
for detail_url in detail_urls:
detail_url='https://www.ygdy8.net'+detail_url
print(detail_url)
movie=parse_detail_page(detail_url)
movies.append(movie)
return movies
if __name__=='__main__':
movie=spider()
keys=movie[0].keys()
print(keys)
with open('movie_info.csv','w',nemline='') as output_file:
dict_writer=csv.DictWriter(output_file,keys)
dict_writer.writeheader()
dict_writer.writerows(movie)
##运行结果
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_63540/316892137.py in <module>
138 if __name__=='__main__':
139 movie=spider()
--> 140 keys=movie[0].keys()
141 print(keys)
142 with open('movie_info.csv','w',nemline='') as output_file:
IndexError: list index out of range
XPath中链接应该是 href 而不是 herf
不知道你这个问题是否已经解决, 如果还没有解决的话:例如:要查找最后一个明星的名字‘Hannah Quinlivan’,Index输入为4。在Python中,第四个元素的索引为3,因为索引是从0开始的。如果想要找最后一个,但是不知道索引为几的时候改怎么办呢?
star_names = ['Jay Chou', 'JJ Lin', 'Jolin Tsai', 'Hannah Quinlivan']
print(star_names[4])
报错情况:
基于new bing部分指引作答:
在你的代码中,出现了"list index out of range"错误,这意味着你试图访问一个超出列表范围的索引位置。该错误发生在以下代码行:
keys=movie[0].keys()
该错误的原因是,你的movie列表是空的,没有任何元素。这可能是由于在执行parse_detail_page函数时出现问题导致的。请检查parse_detail_page函数的实现,确保它能够正确地解析页面并返回有效的电影信息。你可以添加一些调试语句来输出中间结果,以便更好地理解代码的执行过程。
此外,你的 get_detail_url 函数中存在一个拼写错误。在以下行中:
detail_url=html.xpath("table[@class='tbspan']//a/@herf")
应将 @herf 更正为 @href
,以正确提取链接地址。这可能会导致你在解析详细页面时得到空的 detail_url 列表。
请修复这些问题并重新运行你的代码。
大哥 你爬取这个真实网站有点意思啊
所有的报错都给你处理了
import requests
from lxml import etree
import csv
BEST_detail = 'https://www.ygdy8.net'
url = 'https://www.ygdy8.net/html/gndy/dyzz/20230630/63864.html'
headers = {
'User_Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
}
def get_detail_url(url):
response = requests.get(url, headers=headers)
text = response.content.decode('gbk')
html = etree.HTML(text)
detail_url = html.xpath("table[@class='tbspan']//a/@href")
return detail_url
def parse_detail_page(url):
movie = {}
response = requests.get(url, headers=headers)
text = response.content.decode('gbk')
html = etree.HTML(text)
title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
movie['title'] = title
zoomE = html.xpath("//div[@id='Zoom']")[0]
cover = BEST_detail + zoomE.xpath(".//img/@src")[0]
movie['cover'] = cover
movie['grade'] = 0
information = zoomE.xpath(".//text()")
for index, info in enumerate(information):
if info.startswith("◎译 名"):
info = info.replace("◎译 名", "").strip()
movie['translate'] = info
if info.startswith("◎年 代"):
info = info.replace("◎年 代", "").strip()
movie['year'] = info
if info.startswith("◎产 地"):
info = info.replace("◎产 地", "").strip()
movie['area'] = info
if info.startswith("◎类 别"):
info = info.replace("◎类 别", "").strip()
movie['class'] = info
if info.startswith("◎语 言"):
info = info.replace("◎语 言", "").strip()
movie['language'] = info
if info.startswith("◎字 幕"):
info = info.replace("◎字 幕", "").strip()
movie['zimu'] = info
if info.startswith("◎上映日期"):
info = info.replace("◎上映日期", "").strip()
movie['data'] = info
if info.startswith("◎豆瓣评分"):
info = info.replace("◎豆瓣评分", "").strip()
movie['grade'] = info
if info.startswith("◎片 长"):
info = info.replace("◎片 长", "").strip()
movie['time'] = info
if info.startswith("◎导 演"):
info = info.replace("◎导 演", "").strip()
movie['director'] = info
if info.startswith("◎编 剧"):
info = info.replace("◎编 剧", "").strip()
movie['writer'] = info
if info.startswith("◎主 演"):
info = info.replace("◎主 演", "").strip()
actors = [info]
for x in range(index + 1, 1000):
actor = information[x].strip()
if actor.startswith("◎"):
break
actors.append(actor)
movie['actors'] = actors
if info.startswith("◎演 员"):
info = info.replace("◎演 员", "").strip()
actors = [info]
for x in range(index + 1, 1000):
actor = information[x].strip()
if actor.startswith("◎"):
break
actors.append(actor)
movie['actors'] = actors
if info.startswith("◎简 介"):
info = info.replace("◎简 介", "").strip()
profile = information[index + 1]
movie['profile'] = profile
return movie
def spider():
base_url = 'https://www.ygdy8.net/html/gndy/china/list_4_{}.html'
movies = []
for x in range(1, 4):
url = base_url.format(x)
detail_urls = get_detail_url(url)
for detail_url in detail_urls:
detail_url = 'https://www.ygdy8.net' + detail_url
print(detail_url)
movie = parse_detail_page(detail_url)
movies.append(movie)
return movies
if __name__ == '__main__':
movie = spider()
if movie:
keys = movie[0].keys()
print(keys)
with open('movie_info.csv', 'w', nemline='') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(movie)
在你提供的代码中,出现了一个错误:list index out of range
。这个错误通常表示你正在尝试访问列表中不存在的索引位置。
根据你提供的代码,问题可能出现在以下几行代码中:
title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
cover=zoomE.xpath(".//img/@src")[0]
当使用索引 [0]
访问 XPath 返回的结果时,如果结果为空列表,就会导致 list index out of range
错误。
为了避免这个错误,你可以在访问结果之前先检查列表的长度,确保它不为空。例如,你可以使用以下方式修改代码:
titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
if titles:
title = titles[0]
covers = zoomE.xpath(".//img/@src")
if covers:
cover = covers[0]
通过这种方式,即使结果为空列表,也不会触发 list index out of range
错误。
已解决,就是前面的代码出现了一些问题,感谢回答
```python
detail_url=html.xpath("//table[@class='tbspan']//a/@href")
n_detail_url = [x for i, x in enumerate(detail_url) if i % 2 == 1]
## print(n_detail_url)
return n_detail_url
## 还有
with open('movie_info.csv','w',newline='') as output_file:
```
你原有的代码 列表超出索引范围了
简单的话就是没拿到想要的东西
if __name__ == '__main__':
movie = spider()
keys = movie[0.keys()
print(keys)
with open('movie_info.csv', 'w', nemline='') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(movie)
** keys = movie[0.keys()**
这个有问题