#-*- codeing = utf-8 -*- from bs4 import BeautifulSoup #网页解析,获取数据 import re #正则表达式,进行文字匹配 import urllib.request,urllib.error #制定URL,获取网页数据 import xlwt #进行excel操作 import sqlite3 #进行sqlite数据库操作 def main(): baseurl="https://xueshu.baidu.com/s?wd=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3" #爬取网页 datalise = getData(baseurl) savepath =r".\\日本企业管理模式对我国企业发展的启示.xls" #保存数据 saveData(savepath) askURL("https://xueshu.baidu.com/s?wd=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3") #爬取网页 def getData(baseurl): datalist = [] for i in range(0,10): url =baseurl + str(i*25) html = askURL(url) # 2.逐一解析数据 return datalist #得到指定一个url的网页内容 def askURL(url): #模拟浏览器头部信息,想服务区发送消息 伪装用 head = {"User-Agent:Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"} request = urllib.request.Request(url,headers = head) html = "" try: response = urllib.request.urlopen(request) html = response.read().decode("uf-8") print(html) except urllib.error.URLError as e: if hasattr(e,"code"): print(e.code) if hasattr(e,"reason"): print(e.reason) return html #3.保存数据 def saveData(savepath): print("savepath") if __name__ =="__main__": #当程序执行时 #调用函数 main()
我测试了一下 askURL函数中 的伪装头文件有问题
head = { # 模拟浏览器头部信息
"User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, like; Gecko) Chrome / 86.0; .4240; .198; Safari / 537.36"
}
希望采纳 已运行
#head 改成这个
head = {"User-Agent":"Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"}
##这句是utf-8
html = response.read().decode("utf-8")
##然后可以跑通了,但你只是打印了网页源码,并没有写处理逻辑
我直接在getData的循环中打印的 可以打印出来 你这边可以试试 加油
#-*- codeing = utf-8 -*-
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行sqlite数据库操作
def main():
baseurl="https://xueshu.baidu.com/s?wd=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3"
#爬取网页
datalise = getData(baseurl)
savepath =r".\日本企业管理模式对我国企业发展的启示.xls"
#保存数据
saveData(savepath)
askURL("https://xueshu.baidu.com/s?wd=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&tn=SE_baiduxueshu_c1gjeupa&cl=3&ie=utf-8&bs=%E6%97%A5%E6%9C%AC%E4%BC%81%E4%B8%9A%E7%AE%A1%E7%90%86%E6%A8%A1%E5%BC%8F%E5%AF%B9%E6%88%91%E5%9B%BD%E4%BC%81%E4%B8%9A%E5%8F%91%E5%B1%95%E7%9A%84%E5%90%AF%E7%A4%BA&f=8&rsv_bp=1&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&rsv_spt=3")
#爬取网页
def getData(baseurl):
datalist = []
for i in range(0,10):
url =baseurl + str(i*25)
html = askURL(url)
return datalist
#得到指定一个url的网页内容
def askURL(url):
#模拟浏览器头部信息,想服务区发送消息 伪装用
head = {"User-Agent" : "Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"}
request = urllib.request.Request(url,headers = head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
print(html)
except urllib.error.URLError as e:
if hasattr(e,"code"):
print(e.code)
if hasattr(e,"reason"):
print(e.reason)
return html
#3.保存数据
def saveData(savepath):
print("savepath")
if __name__ =="__main__": #当程序执行时
#调用函数
main()
1 字符串前加了r 就不用写双 \ 了
savepath =r".\日本企业管理模式对我国企业发展的启示.xls"
2 url =baseurl + str(i*25)
你确定 str(i*25)要加在 baseurl 后面??
3 head = {"User-Agent" : "Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"}
字典键名和键值要分成两个字符串
4 html = response.read().decode("utf-8")
你utf-8写成了uf-8