While scraping a website with Python I ran into:
UnicodeEncodeError: 'ascii' codec can't encode character '\xb6' in position 38: ordinal not in range(128)
I don't know why it happens. The full output is:
File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 118, in <module>
main()
File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 26, in main
datalist = getData(baseurl)
File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 74, in getData
html = askURL(link)
File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 102, in askURL
response = urllib.request.urlopen(request)
File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 1362, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 1319, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1252, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1263, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1118, in putrequest
self._output(self._encode_request(request))
File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1198, in _encode_request
return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode character '\xb6' in position 38: ordinal not in range(128)
My code is:
import bs4  # parse the page and extract data
import sys
import re  # regular expressions for text matching
import urllib.request, urllib.error  # build the URL request and fetch the page
import xlwt  # Excel operations
from bs4 import BeautifulSoup
import pandas as pd

def main():
    # crawl the pages and parse the data one by one
    baseurl = r"the site's URL (left out so as not to disturb the site)"
    datalist = getData(baseurl)

# crawl the pages
def getData(baseurl):
    datalist = []
    html = askURL(baseurl)
    #html = etree.HTML(html)
    #result = etree.tostring(html)
    #print(result.decode("utf-8"))
    soup = BeautifulSoup(html, "html.parser")
    total = soup.find_all('a')
    hrefs = []
    for item in total:
        x = item['href']
        hrefs.append(x)
    # drop the hrefs that do not point to the data pages we are after
    n = len(hrefs)
    hrefs = hrefs[5:n-1]
    n = len(hrefs)
    print(f"There are {n} pages of the latest data in total")
    get_num = input('Enter the number of latest data pages to fetch: ')
    get_num = int(get_num)
    # check that the user's input is valid
    while get_num > len(hrefs):
        print('The value must not exceed the total number of pages, please enter it again!')
        get_num = input('Enter the number of latest data pages to download: ')
        get_num = int(get_num)
    else:
        # slice the original link list according to the user's input
        url_list = hrefs[:get_num]
    # visit each biologics batch-release summary page in turn to fetch and parse its data
    # a scraped link looks like 'search.do?formAction=listGsxq&o...'; prefixing
    # the base URL makes it directly accessible
    Link = []
    for link in url_list:
        link = "https://bio.nifdc.org.cn/pqf/" + str(link)
        Link.append(link)
    # the tables are small and contain merged cells, so Excel is fine for saving;
    # switch to CSV if the data volume grows
    # counter used for naming the output files
    i = 1
    # walk through the links and scrape the content behind each Excel source address
    # since the data is not very uniform, each update is written to its own Excel file for now
    total_data = pd.DataFrame()  # start from an empty DataFrame so pd.concat works
    for link in Link:
        html = askURL(link)
        # scrape the HTML table with pandas; files are saved under this script's
        # directory, named "institute + number", encoded as Chinese GB18030
        html_data = pd.read_html(html, encoding='utf-8')
        #print(html_data)
        #print(type(html_data))
        table_data = pd.DataFrame(html_data[1])
        #print(table_data)
        #print(type(table_data))
        #table_data[1].to_csv(f'批签发汇总-{i}.csv', index=0, encoding='gb18030')
        total_data = pd.concat([total_data, table_data])
        i += 1
        # show scraping progress on the console
        print(f'Total {len(Link)} pages, {i-1} fetched, {len(Link)-i+1} remaining')
    total_data.to_excel("批签发汇总.xls", encoding='gb18030')
    # everything fetched; print a completion message
    print('Batch-release data downloaded!')

# fetch the page content at the given URL
def askURL(url):
    # the User-Agent tells the server what kind of client we are (essentially,
    # what kind of content we can accept); these headers mimic a browser
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

if __name__ == "__main__":
    # run the crawler
    main()
    print("Scraping finished")
I wonder if any fellow enthusiast could help me solve this. Thanks!!!!
The fetched data contains Chinese characters, so the encoding is wrong. Try setting it to UTF-8.
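In fact the traceback already shows the exact spot: http.client builds the request line with request.encode('ascii'), so the URL passed to urlopen must be pure ASCII, and one of the scraped hrefs contains a raw '\xb6' character. A minimal sketch of a fix, assuming the stray character sits in the link's path or query string, is to percent-encode the URL inside askURL before opening it (the trimmed User-Agent below is just for brevity):

import urllib.parse
import urllib.request

def askURL(url):
    # the HTTP request line must be ASCII, so percent-encode any non-ASCII
    # characters while keeping the URL's structural characters intact
    safe_url = urllib.parse.quote(url, safe=':/?&=%')
    head = {"User-Agent": "Mozilla/5.0"}
    request = urllib.request.Request(safe_url, headers=head)
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")

If the quoted link then returns a 404, the href was probably mis-decoded when the listing page was read, and the decode("utf-8") in askURL may need the page's real charset instead.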
You could also first check the encoding of your code, and try the approach in this article:
https://www.cnblogs.com/yhl-yh/p/6728567.html
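For what it's worth, a quick check covers the source-file encoding declaration and the interpreter default; this is a minimal sketch (Python 3 already treats source files as UTF-8, so the declaration only matters if your editor saves in another encoding):

# -*- coding: utf-8 -*-
# the line above declares the source file's encoding; it must appear on the
# first or second line of the script to take effect
import sys
print(sys.getdefaultencoding())  # prints 'utf-8' on Python 3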