问题已在代码相应位置用注释标出。
import requests
from lxml import etree
import time
import csv
import codecs
import unicodedata
# 获得每一页的html,一共10页:
def get_source(url):
    """Download one listing page and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The root element produced by ``etree.HTML`` from the response text.
    """
    # Send a desktop Chrome User-Agent string with the request.
    browser_headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }
    page = requests.get(url, headers=browser_headers)
    # Decode the body as UTF-8 regardless of the encoding requests guessed.
    page.encoding = 'utf-8'
    return etree.HTML(page.text)
# 把每一页的需要的电影名 评分 导演 名句等获取,然后形成一个列表里面是字典的形式返回:[{dict1},{dict2}...]
def get_date(html):
    """Collect title, details, rating and quote for every movie on one page.

    Args:
        html: Parsed page tree exposing ``xpath`` (the script passes the
            result of ``get_source``).

    Returns:
        A list with one dict per ``<li>`` entry, keyed by
        '标题', '电影详情', '评分' and '名言'.
    """
    movielist = []
    for entry in html.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        first_title = entry.xpath('div/div[2]/div[1]/a/span/text()')[0]
        score = entry.xpath('div/div[2]/div[2]/div/span[2]/text()')[0]
        # Question 1: why does adding [0] to this lookup make the run fail
        # after "保存完成" has been printed six times?
        quote_nodes = entry.xpath('div//span[@class="inq"]/text()')
        # Keep only the part of the first info text that precedes the first
        # "/", then apply NFKC normalization to it.
        info_text = entry.xpath('div/div[2]/div[2]/p[1]/text()')[0]
        details = unicodedata.normalize(
            'NFKC', info_text.strip().split("/")[0])
        movielist.append({
            '标题': first_title,
            '电影详情': details,
            '评分': score,
            '名言': quote_nodes,  # the whole xpath result list, not a string
        })
    return movielist
    time.sleep(5)  # never runs: it is placed after the return statement
# 保存每一个返回的列表,存到csv文件里去
def sverfile(movielist):
    """Write the given movie dicts to the CSV file and report completion.

    The file is opened in 'w' mode on every call, so each call starts the
    file over with a header row followed by the rows of ``movielist``.

    Args:
        movielist: Iterable of dicts keyed by the four CSV column names.
    """
    columns = ['标题', '电影详情', '评分', '名言']
    with codecs.open('.豆瓣top250信息.csv', 'w', 'utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()  # CSV header row
        writer.writerows(movielist)
    print("保存完成")
# Visit the 10 listing pages (start=0, 25, ..., 225); each page is fetched,
# parsed, printed and handed to sverfile(), with a 5-second pause afterwards.
for start in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={}&filter=".format(start)
    movielist1 = get_date(get_source(url))
    print(movielist1)
    sverfile(movielist1)
    time.sleep(5)
# 问题2:为什么保存后用Excel打开文件全是乱码,而且只保存了26行?
第一个问题:inq=li.xpath('div//span[@class="inq"]/text()')这句用来获取名言,但有的电影没有名言,此时xpath返回的是空列表[](而不是None),对空列表取索引[0]就会抛出IndexError。用try...except处理这个异常,或者先判断列表是否为空即可。
第二个问题:csv文件是以utf-8编码(不带BOM)保存的,而Excel默认按系统的中文编码(如GBK)打开,所以出现乱码。解决方法:用记事本打开csv,选择ANSI编码另存,再用Excel打开就能正常显示中文;或者把写入时的编码由utf-8改为utf-8-sig。
第三个问题:代码中csv文件的打开方式是w,每次调用都会清空文件重新写入,后一页会覆盖前一页的内容,所以最后只剩最后一页的数据(1行表头加25行电影,共26行)。可以将打开方式改成"a"追加方式;更好的做法是在循环前用w方式只打开一次文件,再在循环中逐行写入。修改的代码如下:
import requests
from lxml import etree
import time
import csv
import codecs
import unicodedata
def get_source(url):
    """Download one listing page and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The root element produced by ``etree.HTML`` from the response text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }
    # requests has no default timeout; without one an unresponsive server
    # would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page and
    # silently producing an empty movie list.
    response.raise_for_status()
    # Decode the body as UTF-8 regardless of the encoding requests guessed.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)
def get_date(html):
    """Collect title, details, rating and quote for every movie on one page.

    Args:
        html: Parsed page tree exposing ``xpath`` (the script passes the
            result of ``get_source``).

    Returns:
        A list with one dict per ``<li>`` entry, keyed by
        '标题', '电影详情', '评分' and '名言'. '名言' is the empty string
        for entries that have no quote element.

    Raises:
        IndexError: If an entry lacks a title, rating or info text.
    """
    movielist = []
    for li in html.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        title = li.xpath('div/div[2]/div[1]/a/span/text()')[0]
        rate_num = li.xpath('div/div[2]/div[2]/div/span[2]/text()')[0]
        # Some entries have no quote: xpath then returns an empty list, so
        # test for that explicitly instead of hiding every error behind a
        # bare ``except``.
        quotes = li.xpath('div//span[@class="inq"]/text()')
        inq = quotes[0] if quotes else ''
        # Keep only the part of the first info text that precedes the first
        # "/", then apply NFKC normalization to it.
        info_text = li.xpath('div/div[2]/div[2]/p[1]/text()')[0]
        movieinfo = unicodedata.normalize(
            'NFKC', info_text.strip().split("/")[0])
        movielist.append({
            '标题': title,
            '电影详情': movieinfo,
            '评分': rate_num,
            '名言': inq,
        })
    return movielist
def sverfile(movielist, writer=None):
    """Write one page of movie rows to an already-open CSV writer.

    Args:
        movielist: Iterable of dicts keyed by the writer's field names.
        writer: ``csv.DictWriter`` (or any object with ``writerows``) to
            write to. When omitted, the module-level ``filehaed`` writer
            created by the script is used, exactly as before.

    Raises:
        NameError: If ``writer`` is omitted and ``filehaed`` has not been
            created yet.
    """
    if writer is None:
        # Backward-compatible default: fall back to the script's global.
        writer = filehaed
    writer.writerows(movielist)
    print("保存完成")
# Open the output file once, before the loop, so all 10 pages end up in the
# same file. 'utf-8-sig' prefixes the file with a BOM, which lets Excel
# recognise it as UTF-8 instead of showing garbled Chinese text; newline=''
# is what the csv module requires of files it writes to.
with open('豆瓣top250信息.csv', 'w', encoding='utf-8-sig', newline='') as f:
    # sverfile() finds this writer through its module-level name, so the
    # name must stay ``filehaed``.
    filehaed = csv.DictWriter(f, fieldnames=['标题', '电影详情', '评分', '名言'])
    filehaed.writeheader()  # CSV header row, written once
    for i in range(10):
        url = "https://movie.douban.com/top250?start={}&filter=".format(i * 25)
        html = get_source(url)
        movielist1 = get_date(html)
        sverfile(movielist1)
        # Pause between page requests.
        time.sleep(5)
如有帮助,请采纳。点击我回答右上角【采纳】按钮。
csv文件写入方式的问题:
w改为a(追加)
utf-8改为utf-8-sig