What should I do when my scraper gets an empty list and raises an error?

Observed behavior and background

The scraper fails with "list index out of range".

import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365'
    }

url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴'  # setting rtt=4 in the URL sorts by time; the default rtt=1 sorts by relevance (also covered in Section 3.4.1)
res = requests.get(url, headers=headers)  # headers tell the site the request comes from a browser
res.encoding = 'utf-8'
res = res.text
# print(res)  # no need to print this

def baidu(company):
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company  # rtt=4 sorts by time; the default rtt=1 sorts by relevance
    res = requests.get(url, headers=headers).text
    # print(res)

    p_href = '<a href="(.*?)"'
    href = re.findall(p_href, res, re.S)
    p_title = '<h3 class="news-title_1YtI1 ">.*?>(.*?)</a>'
    title = re.findall(p_title, res, re.S)
    p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'
    date = re.findall(p_date, res)
    p_source = '<span class="c-color-gray".*?>(.*?)</span>'
    source = re.findall(p_source, res)

    for i in range(len(title)):  # len(title) is known to be 10 here, so this could also be written as range(10)
        title[i] = title[i].strip()  # strip() removes newlines and spaces from both ends; probably unnecessary here
        title[i] = re.sub('<.*?>', '', title[i])  # the key step: re.sub() strips out the unwanted HTML tags
        print(str(i + 1) + '.' + title[i] + '(' + date[i] + '-' + source[i] + ')')
        print(href[i])

companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']
for i in companys:  # i is just a loop variable and could be named anything
    baidu(i)
    print(i + '百度新闻爬取成功')  # "Baidu news scraped successfully"

Runtime output and error message

IndexError: list index out of range

What I want to achieve

I'd like to know how to fix the code.

Your date list comes back with only 9 entries while the other lists have 10 per page, so the last article never gets a date, and indexing past the end of the list is exactly what raises the IndexError. Either the tenth article genuinely has no timestamp, or the last item's date is marked up differently and your regex misses it. In the revised code below I added exception handling, so the last item still prints its other fields with the date shown as "时间未知" (time unknown). One caveat: with a date missing, you must verify that the entries in the date list line up one-to-one with the titles and sources; otherwise every date after the gap gets attached to the wrong article. Keep analyzing the page source to confirm which articles the dates actually belong to!
The code has been revised; if it helps, please accept the answer!

import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365'
    }

url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴'  # setting rtt=4 in the URL sorts by time; the default rtt=1 sorts by relevance (also covered in Section 3.4.1)
res = requests.get(url, headers=headers)  # headers tell the site the request comes from a browser
res.encoding = 'utf-8'
res = res.text


# print(res)  # no need to print this
def baidu(company):
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company  # rtt=4 sorts by time; the default rtt=1 sorts by relevance
    res = requests.get(url, headers=headers).text
    # print(res)
    p_href = '<a href="(.*?)"'

    href = re.findall(p_href, res, re.S)
    p_title = '<h3 class="news-title_1YtI1 ">.*?>(.*?)</a>'
    title = re.findall(p_title, res, re.S)
    p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'
    date = re.findall(p_date, res)
    p_source = '<span class="c-color-gray".*?>(.*?)</span>'
    source = re.findall(p_source, res)
    print(title)
    print(href)
    print(date)
    print(source)

    for i in range(len(title)):  # len(title) is known to be 10 here, so this could also be written as range(10)
        title[i] = title[i].strip()  # strip() removes newlines and spaces from both ends; probably unnecessary here
        title[i] = re.sub('<.*?>', '', title[i])  # the key step: re.sub() strips out the unwanted HTML tags
        try:
            print(str(i + 1) + '.' + title[i] + '(' + date[i] + '-' + source[i] + ')')
        except IndexError:  # the date list can be one entry short; fall back to a placeholder
            print(str(i + 1) + '.' + title[i] + '(' + "时间未知" + '-' + source[i] + ')')
        print(href[i])

companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']
for i in companys:  # i is just a loop variable and could be named anything
    baidu(i)
    print(i + '百度新闻爬取成功')  # "Baidu news scraped successfully"
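Beyond the try/except patch, the alignment caveat can be removed at the source: cut the page into one HTML chunk per article and search for the date inside each chunk, so a missing date stays attached to the correct article instead of shifting the rest of the list. Below is a hedged sketch only; the "result-op" container class and the chunk pattern are assumptions that must be verified against the actual page source.

def baidu_per_item(company):
    # Sketch: reuses the headers defined above; the container pattern is an assumption.
    res = requests.get('https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company,
                       headers=headers).text
    items = re.findall('<div class="result-op.*?</h3>.*?</div>', res, re.S)  # one chunk per article (assumed markup)
    for i, item in enumerate(items, start=1):
        title_m = re.search('<h3.*?>(.*?)</h3>', item, re.S)
        date_m = re.search('<span class="c-color-gray2.*?>(.*?)</span>', item, re.S)
        t = re.sub('<.*?>', '', title_m.group(1)).strip() if title_m else ''
        d = date_m.group(1) if date_m else '时间未知'  # this article simply has no date
        print(str(i) + '.' + t + '(' + d + ')')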



import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365'
    }

url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴'  # setting rtt=4 in the URL sorts by time; the default rtt=1 sorts by relevance (also covered in Section 3.4.1)
res = requests.get(url, headers=headers)  # headers tell the site the request comes from a browser
res.encoding = 'utf-8'
res = res.text


# print(res)  # no need to print this
def baidu(company):
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company  # rtt=4 sorts by time; the default rtt=1 sorts by relevance
    res = requests.get(url, headers=headers).text
    # print(res)

    p_href = '<a href="(.*?)"'
    href = re.findall(p_href, res, re.S)
    p_title = '<h3 class="news-title_1YtI1 ">.*?>(.*?)</a>'
    title = re.findall(p_title, res, re.S)
    p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'
    date = re.findall(p_date, res)
    p_source = '<span class="c-color-gray".*?>(.*?)</span>'
    source = re.findall(p_source, res)
    print(title)
    print(href)
    print(date)
    print(source)

    for i in range(len(title)):  # len(title) is known to be 10 here, so this could also be written as range(10)
        title[i] = title[i].strip()  # strip() removes newlines and spaces from both ends; probably unnecessary here
        title[i] = re.sub('<.*?>', '', title[i])  # the key step: re.sub() strips out the unwanted HTML tags
        print(str(i + 1) + '.' + title[i] + '(' + (date[i] if i < len(date) else '') + '-' + (source[i] if i < len(source) else '') + ')')  # bounds-checked: fall back to '' when date or source is shorter
        print(href[i])


companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']
for i in companys:  # i is just a loop variable and could be named anything
    baidu(i)
    print(i + '百度新闻爬取成功')  # "Baidu news scraped successfully"
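This second version drops the try/except in favour of inline bounds checks: date[i] if i < len(date) else '' substitutes an empty string whenever the date (or source) list is shorter than the title list. The same guard can be factored into a small helper, sketched here with a hypothetical name:

def safe_get(lst, i, default=''):
    # Return lst[i] when the index is in range, otherwise the placeholder.
    return lst[i] if i < len(lst) else default

# Usage inside the loop:
#   print(str(i + 1) + '.' + title[i] + '(' + safe_get(date, i, '时间未知') + '-' + safe_get(source, i) + ')')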

 

Before indexing into the list in question, first check its length with something like if len(list): if the element isn't there, skip the lookup; only index when it exists.

Or wrap the indexing code in an exception handler; a simple try/except will do.
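Applied to the loop in the code above, both suggestions look roughly like this (a sketch; '时间未知' is an illustrative placeholder, and only one of the two options is needed):

for i in range(len(title)):
    # Option 1: check the length before indexing
    d = date[i] if i < len(date) else '时间未知'
    # Option 2: catch the IndexError instead
    try:
        d = date[i]
    except IndexError:
        d = '时间未知'
    print(str(i + 1) + '.' + title[i] + '(' + d + '-' + (source[i] if i < len(source) else '') + ')')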