#爬虫的代码,在143,40,91分别出现了错误。
C:\Users\Administrator\PycharmProjects\pythonProject4\demo3day3\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/pythonProject4/douban/spider.py
Traceback (most recent call last):
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 143, in
main()
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 40, in main
datalist = getData(baseurl)
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 91, in getData
data.append(titles[0])
IndexError: list index out of range
Process finished with exit code 1
# -*- codeing = utf-8 -*-
# @Time : 2022/9/11 0011 16:51
# @Author : Madara
# @File : spider.py
# @Software: PyCharm
# def main(): #ctrl+/注释多行
# print("hello")
# main()
#
# if __name__ == "__main__": #当程序执行时
# #调用函数
# main()
# def main(a):
# print("hello",a)
# main(1)
#
# if __name__ == "__main__": #当程序执行时。主程序执行的入口,开始阶段
# #调用函数
# main(2)
import urllib.parse as urillib
import urllib.request as urllib2
import sys
import os
import urllib.request
import bs4 #网页解析,获取数据
import sqlite3 #进行SQLite数据库操作
import re #正则表达式,进行文字匹配
import urllib #指定URL,获取网页数据
import xlwt #进行Excel操作
from bs4 import BeautifulSoup
def main():
baseurl = "https://movie.douban.com/top250?start="
#1.爬取网页
datalist = getData(baseurl)
savepath = ".\\豆瓣电影Top250.xls" #./保存在当前文件夹,.\\保存在文件
#3.保存数据
#saveData(savepath)
#askURL("https://movie.douban.com/top250?start=0")
#影片详情链接的规则
# findLink = re.compile(r' ') #创建正则表达式对象,表示规则(字符串的模式)
#影片图片的链接
findImgSrc = re.compile(r', re.S) #re.S让换行符包含在字符中
#影片片名
findTitle = re.compile(r' (.*) ')
#影片评分
findRating = re.compile(r' (.*) ')
#评价人数
findJudge = re.compile(r' (\d*)人评价 ')
#找到概况
findInq = re.compile(r' (.*) ')
#找到影片的相关内容
findBd = re.compile(r' (.*?)
, re.S ')
#爬取网页
def getData(baseurl):
datalist = []
for i in range(0,10): #左闭右开. 调用获取页面信息的函数,10次。
url = baseurl + str(i * 25)
html = askURL(url) #保存获取到的网页源码
#2.逐一解析数据
soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all("div", class_="item"): #查找符合要求的字符串,形成列表。class要加下划线。
#print(item) #测试,查看电影item全部信息
data = [] #保存一部电影的所有信息
item = str(item)
#获取到影片的超链接,影片详情的链接
# link = re.findall(findLink,item)[0] #re库通过正则表达式来查找指定的字符串
# print(link) #添加链接
imgSrc = re.findall(findImgSrc,item)[0] #添加图片
data.append(imgSrc)
titles = re.findall(findTitle,item) #片名可能只有一个中文名,没有外文名
if len(titles) == 2:
ctitle = titles[0] #添加中文名
data.append(ctitle)
otitle = titles[1].replace("/","") #去掉无关的符号
data.append(otitle) #添加外国名
else:
data.append(titles[0])
data.append(' ') #外国名留空。每一行每一列也要占位置
rating =re.findall(findRating,item)[0]
data.append(rating) #添加评分
judgeNum = re.findall(findJudge,item)[0]
data.append(judgeNum) #添加评价人数
inq = re.findall(findInq,item)
if len(inq) != 0:
inq = inq[0].replace("。", "") #去掉句号
data.append(inq) # 添加概述
else:
data.append(" ") #留空
bd = re.findall(findBd,item)[0]
bd = re.sub( '
(\s+)?', " ", bd) #去掉
,三个位置,第一个是标准,第二是替换的字符,第三个要操作的字符串
bd = re.sub('/'," ", bd) #替换/
data.append(bd.strip()) #去掉前后的空格
datalist.append(data) #把处理好的一部电影信息放入datalist
#print(datalist)
return datalist
#得到指定一个URL的网页内容
def askURL(url):
head = { #模拟浏览器头部信息,向豆瓣服务器发送消息
"User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 105.0.0.0Safari / 537.36"
} #User-Agent很重要,要保证大小写,而且不能有空格,与网页端保持一致
#用户代理,表示告诉豆瓣服务器,我们是什么类型的机器,浏览器(本质上是告诉浏览器我们可以接收什么水平的文件内容)
request = urllib.request.Request(url, headers = head)
html = " "
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
#print(html)
except urllib.error.URLerror as e:
if hasattr(e,"code"):
print(e.code)
if hasattr(e,"reason"):
print(e.reason)
return html
#3.保存数据
def saveData(savepath):
print("save...")
if __name__ == "__main__":
main()
C:\Users\Administrator\PycharmProjects\pythonProject4\demo3day3\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/pythonProject4/douban/spider.py
Traceback (most recent call last):
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 143, in
main()
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 40, in main
datalist = getData(baseurl)
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 91, in getData
data.append(titles[0])
IndexError: list index out of range
Process finished with exit code 1C:\Users\Administrator\PycharmProjects\pythonProject4\demo3day3\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/pythonProject4/douban/spider.py
Traceback (most recent call last):
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 143, in
main()
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 40, in main
datalist = getData(baseurl)
File "C:\Users\Administrator\PycharmProjects\pythonProject4\douban\spider.py", line 91, in getData
data.append(titles[0])
IndexError: list index out of range
Process finished with exit code 1
尝试过修改可引入库,不知道是不是网站的问题
流畅运行
检查一下titles这个列表是不是空的叭,根据报错信息来看的话应该是因为它是空的
title 是个空列表,所以空列表取0报错,可以取值之前先做一次判断,如果不为空再取值