刚学爬虫,学着别人的代码写了一个爬虫,但是报错了,显示indexerror: list index out of range索引错误:列表超出索引范围,但是那个要获取的值只有一个,应该不会超出范围,希望各位大神能帮忙解决一下问题!
findLink = re.compile(r'<a href="(.*?)" onclick.*?>') #图书详情链接 创建正则表达式对象,标售规则 影片详情链接的规则
findImgSrc = re.compile(r'<img src="(.*?)".*?>', re.S) #图书海报
findTitle = re.compile(r'<a.*?title="(.*?)">') #书名
findRating = re.compile(r'<span class="rating_nums">(.*?)</span>')#图书评分
findJudge = re.compile(r'<span class="pl">(\d*)人评价</span>')#评价人数
findInq = re.compile(r'<span class="inq">(.*)</span>') #概述
findBd = re.compile(r'<p class="pl">(.*?)</p>') #相关信息
#<span class="pl">\((.*?)人评价.*?
#<span class="p1">(\d*)人评价</span>
def main():
baseurl = "https://book.douban.com/top250?start="
datalist = getData(baseurl) # 爬数函数,见第四步
savepath = "豆瓣图书Top250.xls" # 将爬取的信息保存到当前目录下的Excel表中
saveData(datalist, savepath) # 保存数据函数,见第六步
def getData(baseurl):
datalist = [] # 用于存放所有书的信息
for i in range(11): # 调用获取页面信息的函数,一共10页,根据豆瓣图书的链接规律得到
# 1、生成访问链接
#baseurl = "https://book.douban.com/top250?start="
url = baseurl + str(i * 25)
html = askURL(url) # 爬取网页函数,见第五步
# 2、解析数据
soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all('tr', class_='item'):
# 根据html网页可以看到每本书是一个item
item = str(item)
data = [] # 用于存放一本书的所有信息
link = re.findall(findLink, item)[0] # 通过正则表达式查找
data.append(link)
imgSrc = re.findall(findImgSrc, item)[0]
data.append(imgSrc)
titles = re.findall(findTitle, item)
if (len(titles) == 1): # 修改
ctitle = titles[0]
data.append(ctitle)
#otitle = titles[1].replace("/", "") # 消除转义字符
#data.append(otitle)
else:
data.append(titles[0])
data.append(' ')
rating = re.findall(findRating, item)[0]
data.append(rating)
judgeNum = re.findall(findJudge, item)[0]
data.append(judgeNum)
inq = re.findall(findInq, item)
if len(inq) != 0:
inq = inq[0].replace("。", "")
data.append(inq)
else:
data.append(" ")
bd = re.findall(findBd, item)[0]
bd = re.sub('<br(\s+)?/>(\s+)?', "", bd)
bd = re.sub('/', "", bd)
data.append(bd.strip())
datalist.append(data)
return datalist
def askURL(url):
head = { # 伪装浏览器,向豆瓣服务器发送消息
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
request = urllib.request.Request(url, headers=head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
#print(html)
except urllib.error.URLError as e:
if hasattr(e, "code"):
print(e.code)
if hasattr(e, "reason"):
print(e.reason)
return html
def saveData(datalist, savepath):
print("save.......")
# 1、创建workbook对象
book = xlwt.Workbook(encoding="utf-8", style_compression=0)
# 2、sheet表信息
sheet = book.add_sheet('豆瓣图书TOP250', cell_overwrite_ok=True) # 创建工作表
# 3、写入数据
col = ('图书详情链接', "图书海报", "图书书名", "图书评分", "图书评价人数", "概况", "图书相关信息")
for i in range(11):
sheet.write(0, i, col[i]) # 列名
for i in range(250):
data = datalist[i]
for j in range(11):
sheet.write(i+1, j, data[j])
book.save(savepath) # 保存数据表
def remove(string):
pattern = re.compile(r'\s+')
return re.sub(pattern, '', string)
元组也超出范围是哪的原因呢