本意是爬取豆瓣top250有关信息,但遇到了两个问题,困惑两天了,求解求解,对各位来说应该非常简单吧

问题已在代码相应位置


import requests
from lxml import  etree
import time
import csv
import codecs
import unicodedata



# 获得每一页的html,一共10页:
def get_source(url):
    headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
            }

    respose=requests.get(url,headers=headers)

    # print(respose.text.encode('utf-8'))
    respose.encoding='utf-8'
    # print(respose.text)
    html=etree.HTML(respose.text)
    # print(html)
    return (html)


#  把每一页的需要的电影名 评分  导演 名句等获取,然后形成一个列表里面是字典的形式返回:[{dict1},{dict2}...]
def get_date(html):
    all_li=html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    # print(all_li)
    movielist=[]
    for li in all_li:
        dict={}
        title=li.xpath('div/div[2]/div[1]/a/span/text()')[0]
        # print(title)
        # print(title[0]+'---'+title[1])
        rate_num=li.xpath('div/div[2]/div[2]/div/span[2]/text()')[0]
        # print(rate_num)
        inq=li.xpath('div//span[@class="inq"]/text()')   ####  问题1:为什么这里写了[0]后,就会运行到保存完成第六次后报错????
        # print(inq)
        movieinfo1=li.xpath('div/div[2]/div[2]/p[1]/text()')[0].strip().split("/")[0]
        movieinfo=unicodedata.normalize('NFKC', movieinfo1)
        # print(movieinfo)
        dict['标题']=title
        dict['电影详情']=movieinfo
        dict['评分']=rate_num
        dict['名言']=inq
        # print(dict)
        movielist.append(dict)
    # print(movielist)
    return movielist
    time.sleep(5)
        # return (title,rate_num,inq,mioveinfo)


# 保存每一个返回的列表,存到csv文件里去
def sverfile(movielist):
    with codecs.open('.豆瓣top250信息.csv','w','utf-8') as f:
        filehaed=csv.DictWriter(f,fieldnames=['标题','电影详情','评分','名言'])
        filehaed.writeheader()  ###写入csv表头
        for eval in movielist:
            # print(eval)
            filehaed.writerow(eval)
        print("保存完成")



for i in  range(10):
    movielist1=[]
    url="https://movie.douban.com/top250?start={}&filter=".format(i*25)
    html = get_source(url)
    movielist1=get_date(html)
    print(movielist1)
    sverfile(movielist1)
    time.sleep(5)
# 问题2:为什么保存后打开excle文件后全是乱码,而且只保存了26行



第一个问题。因为inq=li.xpath('div//span[@class="inq"]/text()')这句获取名言,有的节点没有数据,值为None,用索引会报错。用try...except进行异常处理一下即可。
第二个问题,是因为csv是utf-8编码,而excel是默认系统的中文编码,所以出现乱码。解决方法:用记事本打开csv,选取ansi编码格式另存,再用excel打开就能正常显示中文了。或者编码改utf-8为utf-8-sig。
第三个问题,代码中csv写入方式,是w,最后写入会覆盖前面写入的内容,所以只有一页的数据,可将写入方式改成"a"追加方式。最好是在循环前打开文件,用w方式遍历每行写入。修改的代码如下:

import requests
from lxml import etree
import time
import csv
import codecs
import unicodedata

def get_source(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }
    respose = requests.get(url, headers=headers)
    # print(respose.text.encode('utf-8'))
    respose.encoding = 'utf-8'
    # print(respose.text)
    html = etree.HTML(respose.text)
    # print(html)
    return (html)

def get_date(html):
    all_li = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    # print(all_li)
    movielist = []
    for li in all_li:
        dict = {}
        title = li.xpath('div/div[2]/div[1]/a/span/text()')[0]
        # print(title)
        # print(title[0]+'---'+title[1])
        rate_num = li.xpath('div/div[2]/div[2]/div/span[2]/text()')[0]
        # print(rate_num)
        try:
            inq = li.xpath('div//span[@class="inq"]/text()')[0]
        except:
            inq=''
        # print(inq)
        movieinfo1 = li.xpath(
            'div/div[2]/div[2]/p[1]/text()')[0].strip().split("/")[0]
        movieinfo = unicodedata.normalize('NFKC', movieinfo1)
        # print(movieinfo)
        dict['标题'] = title
        dict['电影详情'] = movieinfo
        dict['评分'] = rate_num
        dict['名言'] = inq
        # print(dict)
        movielist.append(dict)
    # print(movielist)
    return movielist
    time.sleep(5)
    # return (title,rate_num,inq,mioveinfo)
    
def sverfile(movielist):
    for eval in movielist:
        # print(eval)
        filehaed.writerow(eval)
    print("保存完成")

with codecs.open('豆瓣top250信息.csv', 'w', 'utf-8') as f:
    filehaed = csv.DictWriter(f, fieldnames=['标题', '电影详情', '评分', '名言'])
    filehaed.writeheader()  # 写入csv表头
    for i in range(10):
        movielist1 = []
        url = "https://movie.douban.com/top250?start={}&filter=".format(i*25)
        html = get_source(url)
        movielist1 = get_date(html)
        #print(movielist1)
        sverfile(movielist1)
        time.sleep(5)

如有帮助,请采纳。点击我回答右上角【采纳】按钮。

pandas写入的错误
w改为a(追加)
utf-8改为utf-8-sig