I've been writing a web scraper. At first I was able to scrape the data; here is the code:
import re
import requests
from bs4 import BeautifulSoup

# Scrape the thread-list information
def getData(baseurl):
    datalist = []
    # the last page is 2985
    for i in range(0, 1):
        # thread-list page address
        url = baseurl + str(i * 50)
        soup = askURL(url)
        print(soup)
        # parse the items one by one
        for item in soup.find_all('li', class_="j_thread_list clearfix thread_item_box"):
            # holds the data of a single thread
            data = []
            item = str(item)
            # findReplyNum / findTitle / findLink are pre-compiled regexes defined elsewhere
            replyNum = re.findall(findReplyNum, item)[0]
            data.append(replyNum)   # reply count
            title = re.findall(findTitle, item)[0]
            data.append(title)      # thread title
            link = re.findall(findLink, item)[0]
            link = "https://tieba.baidu.com/" + link  # build the full link
            data.append(link)       # link
            datalist.append(data)
    return datalist

# Fetch the page content of a given URL
def askURL(url):
    html = requests.get(url, verify=False)
    soup = BeautifulSoup(html.content, 'html.parser')
    return soup
But later my IP apparently got blocked: when I run it now, the soup I get back is the "网络不给力" (network not available) page.
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8"/>
<title>百度安全验证</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="black" name="apple-mobile-web-app-status-bar-style"/>
<meta content="width=device-width, user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0" name="viewport"/>
<meta content="telephone=no, email=no" name="format-detection"/>
<link href="https://www.baidu.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="https://www.baidu.com/img/baidu.svg" mask="" rel="icon" sizes="any"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="upgrade-insecure-requests" http-equiv="Content-Security-Policy"/>
<link href="https://ppui-static-wap.cdn.bcebos.com/static/touch/css/api/mkdjump_aac6df1.css" rel="stylesheet"/>
</head>
<body>
<div class="timeout hide">
<div class="timeout-img"></div>
<div class="timeout-title">网络不给力,请稍后重试</div>
<button class="timeout-button" type="button">返回首页</button>
</div>
<div class="timeout-feedback hide">
<div class="timeout-feedback-icon"></div>
<p class="timeout-feedback-title">问题反馈</p>
</div>
<script src="https://wappass.baidu.com/static/machine/js/api/mkd.js"></script>
<script src="https://ppui-static-wap.cdn.bcebos.com/static/touch/js/mkdjump_db105ab.js"></script>
</body>
</html>
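For reference, one way to spot this blocked page in code, instead of eyeballing the printed soup, is to check the <title>; a minimal sketch of such a check inside askURL, assuming the blocked response always carries 百度安全验证 in its title:

def askURL(url):
    html = requests.get(url, verify=False)
    soup = BeautifulSoup(html.content, 'html.parser')
    # Assumption: the verification/blocked page always has "百度安全验证" in its <title>
    if soup.title and "百度安全验证" in soup.title.get_text():
        print("Blocked: got the verification page instead of the thread list:", url)
    return soup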
So I faked a request header:

head = {
    "User-Agent": ...,
    "Cookie": ...
}
html = requests.get(url, verify=False, headers=head)

Now the soup contains the complete page content, but soup.find_all returns an empty list.
Does anyone know how to fix this? And why does soup.find_all work fine when I don't add headers?
That means the value you are matching on, j_thread_list clearfix thread_item_box, doesn't fit the page you are actually receiving: the response that now parses to an empty list apparently doesn't contain it, while the earlier one did. I suggest re-locating the element, and also searching the fetched page content to check whether j_thread_list clearfix thread_item_box really appears in it. What you download is sometimes different from what you see in the browser. Hope you'll accept this answer.
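A quick way to run that check, as a minimal sketch (url and head are the ones defined in the question; the comment-stripping step reflects a commonly reported Tieba quirk where the thread list is wrapped inside HTML comments, so verify it against your own response before relying on it):

import requests
from bs4 import BeautifulSoup

raw = requests.get(url, verify=False, headers=head).text  # url / head as in the question
# Does the class string appear anywhere in the raw HTML?
print("j_thread_list" in raw)
# Commonly reported Tieba quirk (check your own page source): the thread list is
# sometimes wrapped in <!-- ... --> comments, which find_all cannot see through.
# Stripping the comment markers before parsing is one workaround.
cleaned = raw.replace("<!--", "").replace("-->", "")
soup = BeautifulSoup(cleaned, 'html.parser')
print(len(soup.find_all('li', class_="j_thread_list clearfix thread_item_box")))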
import requests
from lxml import etree

name = 'python'
url = f'https://tieba.baidu.com/f?ie=utf-8&kw={name}&fr=search'
url2 = 'https://tieba.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
dd = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
html = etree.HTML(dd.text)
titles = html.xpath('//*[@class="threadlist_title pull_left j_th_tit "]/a')
reply = html.xpath('//*[@title="回复"]//text()')
data = []
for n, i in enumerate(titles):
    title = i.xpath('.//text()')[0].strip()
    href = i.xpath('./@href')[0]
    df = {'title': title, 'href': f"{url2}{href}", '回复': reply[n + 1]}
    data.append(df)
    print(f"{n+1}.标题:{title}\n  链接:{url2}{href}\n  回复:{reply[n+1]}")
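If you want to stay with BeautifulSoup rather than switch to lxml, matching a single class avoids depending on the exact full class string, which find_all with class_="j_thread_list clearfix thread_item_box" only matches when the class attribute is exactly that value in that order. A minimal sketch along the same lines (the choice of j_thread_list and j_th_tit as the stable classes is an assumption, so check them against your own page source):

import requests
from bs4 import BeautifulSoup

resp = requests.get(url, headers=headers)  # url / headers as defined above
soup = BeautifulSoup(resp.text, 'html.parser')
# Loose, single-class match instead of the full multi-class string
for li in soup.select('li.j_thread_list'):
    a = li.select_one('a.j_th_tit')
    if a is not None:
        print(a.get_text(strip=True), "https://tieba.baidu.com" + a.get('href', ''))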