Python爬虫问题

问题遇到的现象和发生背景

#爬虫的代码,在143,40,91分别出现了错误。

img

C:\Users\Administrator\PycharmProjects\pythonProject4\demo3day3\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/pythonProject4/douban/spider.py
Traceback (most recent call last):
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 143, in 
    main()
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 40, in main
    datalist = getData(baseurl)
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 91, in getData
    data.append(titles[0])
IndexError: list index out of range

Process finished with exit code 1
用代码块功能插入代码,请勿粘贴截图
# -*- codeing = utf-8 -*-
# @Time : 2022/9/11 0011 16:51
# @Author : Madara
# @File : spider.py
# @Software: PyCharm

# def main():     #ctrl+/注释多行
#     print("hello")
# main()
#
# if __name__ == "__main__":    #当程序执行时
#     #调用函数
#     main()


# def main(a):
#     print("hello",a)
# main(1)
#
# if __name__ == "__main__":    #当程序执行时。主程序执行的入口,开始阶段
#     #调用函数
#     main(2)

import urllib.parse as urillib
import urllib.request as urllib2
import sys
import os
import urllib.request
import bs4  #网页解析,获取数据
import sqlite3  #进行SQLite数据库操作
import re #正则表达式,进行文字匹配
import urllib  #指定URL,获取网页数据
import xlwt #进行Excel操作
from bs4 import BeautifulSoup


def main():
    baseurl = "https://movie.douban.com/top250?start="
    #1.爬取网页
    datalist = getData(baseurl)
    savepath = ".\\豆瓣电影Top250.xls"    #./保存在当前文件夹,.\\保存在文件
 #3.保存数据
    #saveData(savepath)
   #askURL("https://movie.douban.com/top250?start=0")

#影片详情链接的规则
# findLink = re.compile(r'  ') #创建正则表达式对象,表示规则(字符串的模式)
#影片图片的链接
findImgSrc = re.compile(r', re.S)  #re.S让换行符包含在字符中
#影片片名
findTitle = re.compile(r' (.*)  ')
#影片评分
findRating = re.compile(r' (.*) ')
#评价人数
findJudge = re.compile(r' (\d*)人评价 ')
#找到概况
findInq = re.compile(r' (.*) ')
#找到影片的相关内容
findBd = re.compile(r' 

(.*?)

, re.S '
) #爬取网页 def getData(baseurl): datalist = [] for i in range(0,10): #左闭右开. 调用获取页面信息的函数,10次。 url = baseurl + str(i * 25) html = askURL(url) #保存获取到的网页源码 #2.逐一解析数据 soup = BeautifulSoup(html, "html.parser") for item in soup.find_all("div", class_="item"): #查找符合要求的字符串,形成列表。class要加下划线。 #print(item) #测试,查看电影item全部信息 data = [] #保存一部电影的所有信息 item = str(item) #获取到影片的超链接,影片详情的链接 # link = re.findall(findLink,item)[0] #re库通过正则表达式来查找指定的字符串 # print(link) #添加链接 imgSrc = re.findall(findImgSrc,item)[0] #添加图片 data.append(imgSrc) titles = re.findall(findTitle,item) #片名可能只有一个中文名,没有外文名 if len(titles) == 2: ctitle = titles[0] #添加中文名 data.append(ctitle) otitle = titles[1].replace("/","") #去掉无关的符号 data.append(otitle) #添加外国名 else: data.append(titles[0]) data.append(' ') #外国名留空。每一行每一列也要占位置 rating =re.findall(findRating,item)[0] data.append(rating) #添加评分 judgeNum = re.findall(findJudge,item)[0] data.append(judgeNum) #添加评价人数 inq = re.findall(findInq,item) if len(inq) != 0: inq = inq[0].replace("。", "") #去掉句号 data.append(inq) # 添加概述 else: data.append(" ") #留空 bd = re.findall(findBd,item)[0] bd = re.sub( ' (\s+)?', " ", bd) #去掉
,三个位置,第一个是标准,第二是替换的字符,第三个要操作的字符串
bd = re.sub('/'," ", bd) #替换/ data.append(bd.strip()) #去掉前后的空格 datalist.append(data) #把处理好的一部电影信息放入datalist #print(datalist) return datalist #得到指定一个URL的网页内容 def askURL(url): head = { #模拟浏览器头部信息,向豆瓣服务器发送消息 "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 105.0.0.0Safari / 537.36" } #User-Agent很重要,要保证大小写,而且不能有空格,与网页端保持一致 #用户代理,表示告诉豆瓣服务器,我们是什么类型的机器,浏览器(本质上是告诉浏览器我们可以接收什么水平的文件内容) request = urllib.request.Request(url, headers = head) html = " " try: response = urllib.request.urlopen(request) html = response.read().decode("utf-8") #print(html) except urllib.error.URLerror as e: if hasattr(e,"code"): print(e.code) if hasattr(e,"reason"): print(e.reason) return html #3.保存数据 def saveData(savepath): print("save...") if __name__ == "__main__": main()
运行结果及报错内容
C:\Users\Administrator\PycharmProjects\pythonProject4\demo3day3\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/pythonProject4/douban/spider.py
Traceback (most recent call last):
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 143, in 
    main()
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 40, in main
    datalist = getData(baseurl)
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 91, in getData
    data.append(titles[0])
IndexError: list index out of range

Process finished with exit code 1C:\Users\Administrator\PycharmProjects\pythonProject4\demo3day3\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/pythonProject4/douban/spider.py
Traceback (most recent call last):
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 143, in 
    main()
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 40, in main
    datalist = getData(baseurl)
  File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 91, in getData
    data.append(titles[0])
IndexError: list index out of range

Process finished with exit code 1

我的解答思路和尝试过的方法

尝试过修改可引入库,不知道是不是网站的问题

我想要达到的结果

流畅运行

检查一下titles这个列表是不是空的叭,根据报错信息来看的话应该是因为它是空的

title 是个空列表,所以空列表取0报错,可以取值之前先做一次判断,如果不为空再取值