"""Download Baidu Tieba listing pages for a keyword and save each as HTML."""
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from fake_useragent import UserAgent


def get_html(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a Chrome ``User-Agent`` header taken from
    ``fake_useragent``. The body is also echoed to stdout.

    Args:
        url: Fully formatted URL to request.

    Returns:
        The undecoded response body.
    """
    headers = {"User-Agent": UserAgent().chrome}
    request = Request(url, headers=headers)
    # Read the body exactly once: the response is a stream, so a second
    # read() returns b"" (which previously caused empty files to be saved).
    # The context manager closes the connection when done.
    with urlopen(request) as response:
        html_bytes = response.read()
    # The echo is diagnostic only, so undecodable bytes must not abort
    # the download before the page has been saved.
    print(html_bytes.decode(errors="replace"))
    return html_bytes


def save_html(fliename, html_bytes):
    """Write *html_bytes* to the file at path *fliename* in binary mode.

    Args:
        fliename: Destination path (parameter spelling kept for
            backward compatibility with keyword callers).
        html_bytes: Raw bytes to write; any existing file is overwritten.
    """
    with open(fliename, "wb") as f:
        f.write(html_bytes)


def main():
    """Prompt for a keyword and a page count, then download each page.

    Page ``i`` (1-based) is requested with ``pn=(i - 1) * 50`` and
    ``kw=<keyword>`` and saved to ``第<i>页.html`` in the current directory.
    """
    content = input('请输入要下载的内容:')
    num = input('请输入要下载多少页:')
    base_url = "https://tieba.baidu.com/f?ie=utf-8&{}"
    for pn in range(int(num)):
        # The 'pn' offset advances in steps of 50 per page
        # (presumably the number of entries per listing page; confirm).
        args = urlencode({'pn': pn * 50, 'kw': content})
        filename = '第' + str(pn + 1) + "页.html"
        print('正在下载' + filename)
        html_bytes = get_html(base_url.format(args))
        save_html(filename, html_bytes)


if __name__ == '__main__':
    main()
查看网页源代码可以发现，页面内容是动态加载的，通过这种方式根本爬取不到信息，可以考虑改用 Selenium 模块试试。
在网页上点击右键，选择“查看网页源代码”，就可以看到爬取到的数据内容了。