爬虫报错list index out of range
import requests import re headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36', 'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365' } url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴' # 把链接中rtt参数换成4即是按时间排序,默认为1按焦点排序,3.4.1小节也有讲到 res = requests.get(url, headers=headers) # 加上headers用来告诉网站这是通过一个浏览器进行的访问 res.encoding = 'utf-8' res = res.text #print(res) 不必输出 def baidu(company): url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company # 把rtt参数换成4即是按时间排序,默认为1按焦点排序 res = requests.get(url, headers=headers).text # print(res) p_href = '
# range(len(title)),这里因为知道len(title) = 10,所以也可以写成for i in range(10) title[i] = title[i].strip() # strip()函数用来取消字符串两端的换行或者空格,不过这里好像不太需要了 title[i] = re.sub('<.*?>', '', title[i]) # 核心,用re.sub()函数来替换不重要的内容 print(str(i + 1) + '.' + title[i] + '(' + date[i] + '-' + source[i] + ')') print(href[i]) companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东'] for i in companys: # 这个i只是个代号,可以换成其他内容 baidu(i) print(i + '百度新闻爬取成功')
IndexError: list index out of range
想知道该怎么改代码
因为你的时间只有9条,而你的数据每次有10条,所以最后一条总是没有时间,肯定会数组超出索引。要么就是这第十条新闻本身就没有时间,要么就是最后一条的re匹配规则不一样,你给漏了。代码上我做了异常处理,最后一条会输出其他数据,时间未知。这里要说一个弊端,是缺少一条时间,你要确定你数组里的时间和其它数据要一一对应,否则这样匹配过去时间是不对应的。还要你继续分析网站确定时间和其他数据对应哦!
代码已经修改,有帮助的话采纳一下哦!
import requests
import re
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365'
}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴' # 把链接中rtt参数换成4即是按时间排序,默认为1按焦点排序,3.4.1小节也有讲到
res = requests.get(url, headers=headers) # 加上headers用来告诉网站这是通过一个浏览器进行的访问
res.encoding = 'utf-8'
res = res.text
# print(res) 不必输出
def baidu(company):
url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company # 把rtt参数换成4即是按时间排序,默认为1按焦点排序
res = requests.get(url, headers=headers).text
# print(res)
p_href = '<a href="(.*?)"'
href = re.findall(p_href, res, re.S)
p_title = '<h3 class="news-title_1YtI1 ">.*?>(.*?)</a>'
title = re.findall(p_title, res, re.S)
p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'
date = re.findall(p_date, res)
p_source = '<span class="c-color-gray".*?>(.*?)</span>'
source = re.findall(p_source, res)
print(title)
print(href)
print(date)
print(source)
for i in range(len(title)): # range(len(title)),这里因为知道len(title) = 10,所以也可以写成for i in range(10)
title[i] = title[i].strip() # strip()函数用来取消字符串两端的换行或者空格,不过这里好像不太需要了
title[i] = re.sub('<.*?>', '', title[i]) # 核心,用re.sub()函数来替换不重要的内容
try:
print(str(i + 1) + '.' + title[i] + '(' + date[i] + '-' + source[i] + ')')
except:
print(str(i + 1) + '.' + title[i] + '(' + "时间未知" + '-' + source[i] + ')')
print(href[i])
companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']
for i in companys: # 这个i只是个代号,可以换成其他内容
baidu(i)
print(i + '百度新闻爬取成功')
import requests
import re
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365'
}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴' # 把链接中rtt参数换成4即是按时间排序,默认为1按焦点排序,3.4.1小节也有讲到
res = requests.get(url, headers=headers) # 加上headers用来告诉网站这是通过一个浏览器进行的访问
res.encoding = 'utf-8'
res = res.text
# print(res) 不必输出
def baidu(company):
url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company # 把rtt参数换成4即是按时间排序,默认为1按焦点排序
res = requests.get(url, headers=headers).text
# print(res)
p_href = '<a href="(.*?)"'
href = re.findall(p_href, res, re.S)
p_title = '<h3 class="news-title_1YtI1 ">.*?>(.*?)</a>'
title = re.findall(p_title, res, re.S)
p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'
date = re.findall(p_date, res)
p_source = '<span class="c-color-gray".*?>(.*?)</span>'
source = re.findall(p_source, res)
print(title)
print(href)
print(date)
print(source)
for i in range(len(title)): # range(len(title)),这里因为知道len(title) = 10,所以也可以写成for i in range(10)
title[i] = title[i].strip() # strip()函数用来取消字符串两端的换行或者空格,不过这里好像不太需要了
title[i] = re.sub('<.*?>', '', title[i]) # 核心,用re.sub()函数来替换不重要的内容
print(str(i + 1) + '.' + title[i] + '(' + (date[i] if i < len(date) else '') + '-' + (source[i] if i < len(source) else '') + ')')
print(href[i])
companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']
for i in companys: # 这个i只是个代号,可以换成其他内容
baidu(i)
print(i + '百度新闻爬取成功')
import requests
import re
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Cookie': 'BIDUPSID=9068950520CEA6010C221948BDE6A434; PSTM=1663678590; BAIDUID=9068950520CEA601AE2F5D4896206B58:FG=1; BD_HOME=1; delPer=0; BD_CK_SAM=1; PSINO=1; BD_UPN=12314753; BA_HECTOR=00258l250h8l040420a1ravi1hije3v19; BAIDUID_BFESS=9068950520CEA601AE2F5D4896206B58:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ZFY=ZUQtemxS15C0VGbzh17QhYXvb:BPJS4FA3oqh9TuZrE4:C; COOKIE_SESSION=0_0_0_0_2_0_1_0_0_0_0_0_29_0_31_0_1663678622_0_1663678591%7C1%230_0_1663678591%7C1; H_PS_645EC=6741X6h9cYcveIfCCsqIU5v8kkYt8Nbov9N%2FZPY7yd2Dc3nZi%2F9GT8vyq9M; BAIDU_WISE_UID=wapp_1663678969100_340; BDUSS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; BDUSS_BFESS=kxmUEJEaS1LUFN1ZWFRfnhEM29uRk5iUGV0MUlvMkJsWXNSYUZDNTVDazNSMUZqRVFBQUFBJCQAAAAAAAAAAAEAAAA0Vogxyb22q7aryb0wMDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADe6KWM3uiljM; ab_sr=1.0.1_MzRhODNhZGQ0M2QyMjNiN2U3NTFlMWFiZmRlMjE1MjE2ZTA3NzQyMWY2Mjk5ZGQ1YjI4MTcwY2JmNDZhNmRmNTcxYTE2ODhjMDVjYzUwNjNjZGM2YTljMTczYmUzZTUxZWNjNzNlMjcyMGUzYWRiNGNlODAwMWVjMTk0YzY0YTI1YmZkNThjYmFjY2VhZjJlYWMwYmNkZGY0ZTA0NDY2NQ==; H_PS_PSSID=37153_36552_37353_37299_36884_34812_37402_37273_37345_36789_37261_26350_37365'
}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd=阿里巴巴' # 把链接中rtt参数换成4即是按时间排序,默认为1按焦点排序,3.4.1小节也有讲到
res = requests.get(url, headers=headers) # 加上headers用来告诉网站这是通过一个浏览器进行的访问
res.encoding = 'utf-8'
res = res.text
# print(res) 不必输出
def baidu(company):
url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=' + company # 把rtt参数换成4即是按时间排序,默认为1按焦点排序
res = requests.get(url, headers=headers).text
# print(res)
p_href = '<a href="(.*?)"'
href = re.findall(p_href, res, re.S)
p_title = '<h3 class="news-title_1YtI1 ">.*?>(.*?)</a>'
title = re.findall(p_title, res, re.S)
p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'
date = re.findall(p_date, res)
p_source = '<span class="c-color-gray".*?>(.*?)</span>'
source = re.findall(p_source, res)
print(title)
print(href)
print(date)
print(source)
for i in range(len(title)): # range(len(title)),这里因为知道len(title) = 10,所以也可以写成for i in range(10)
title[i] = title[i].strip() # strip()函数用来取消字符串两端的换行或者空格,不过这里好像不太需要了
title[i] = re.sub('<.*?>', '', title[i]) # 核心,用re.sub()函数来替换不重要的内容
print(str(i + 1) + '.' + title[i] + '(' + (date[i] if i < len(date) else '') + '-' + (source[i] if i < len(source) else '') + ')')
print(href[i])
companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']
for i in companys: # 这个i只是个代号,可以换成其他内容
baidu(i)
print(i + '百度新闻爬取成功')
在对应的list那里先判断一下if len(list)
如果为空,就不取,不为空再取
或者在你要取list的代码那里加个异常捕获,try except就好了