import requests
from lxml import etree
# NOTE(review): "#p{page}" is a URL *fragment* — fragments are handled by the
# browser and are never sent to the server, so all 50 of these URLs request
# exactly the same resource. That is why craw() always returns page-1 HTML:
# cnblogs.com loads pages 2+ client-side through a background API.
urls =[
f"https://www.cnblogs.com/#p{page}"
for page in range(1,50+1)
]
# Browser-like request headers so the server does not reject us as an obvious bot.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
"referer": "https://www.cnblogs.com/"
}
def craw(u1):
    """Fetch the page at *u1* and return its HTML text.

    Improvements over the original:
    - ``timeout=10`` so a stalled connection cannot hang the script forever.
    - ``raise_for_status()`` so a 4xx/5xx response raises ``requests.HTTPError``
      instead of silently returning an error page's body.
    """
    rep = requests.get(u1, headers=headers, timeout=10)
    rep.raise_for_status()
    return rep.text
# title ='//article[@class='post-item']//a/text()'
# href_u = '//article[@class='post-item']//a/@href'
def Processors(txt):
    """Parse a cnblogs listing page and return ``[(title, href), ...]``.

    Fixes over the original:
    - The href XPath now selects the same ``a.post-item-title`` anchors as the
      title XPath. The original ``//article[@class='post-item']//a/@href``
      matched *every* link in each article (author, comments, ...), so the
      title and href lists had different lengths and could not be paired.
    - The extracted data was computed and then discarded while the raw HTML
      was printed; now the paired results are printed and returned.
    """
    title_xpath = "//article[@class='post-item']//a[@class='post-item-title']/text()"
    href_xpath = "//article[@class='post-item']//a[@class='post-item-title']/@href"
    html = etree.HTML(txt)
    titles = html.xpath(title_xpath)
    hrefs = html.xpath(href_xpath)
    results = list(zip(titles, hrefs))
    for post_title, post_href in results:
        print(post_title, post_href)
    return results
# Processors(craw(urls[2])) # fetch "page 3" -- still returns page-1 HTML
print(craw("https://www.cnblogs.com/#p3")) # fetch "page 3" -- server returns page-1 HTML (the fragment is ignored)
print("------------------------------------------------------------------")
print(craw("https://www.cnblogs.com/#p5")) # fetch "page 5" -- server returns page-1 HTML (the fragment is ignored)
# -------------
# Observed: every URL requested through craw() comes back with the page-1 HTML.
# Processors() has been verified repeatedly and is not the problem.
# Explanation: this site's pagination data is loaded through a background API
# endpoint; only the first page is rendered directly in the HTML, and the
# "#p{n}" fragment is never sent to the server.