'ascii' codec can't encode character '\xb6' error

While using Python to scrape data from a website, I got:

UnicodeEncodeError: 'ascii' codec can't encode character '\xb6' in position 38: ordinal not in range(128)

I don't know why. The full traceback is:

 File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 118, in <module>
    main()

  File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 26, in main
    datalist = getData(baseurl)

  File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 74, in getData
    html = askURL(link)

  File "C:\Users\13304\Desktop\work\代码\test\myvac.py", line 102, in askURL
    response = urllib.request.urlopen(request)

  File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)

  File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 525, in open
    response = self._open(req, data)

  File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 543, in _open
    '_open', req)

  File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)

  File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 1362, in https_open
    context=self._context, check_hostname=self._check_hostname)

  File "C:\Users\13304\Anaconda3\lib\urllib\request.py", line 1319, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))

  File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1252, in request
    self._send_request(method, url, body, headers, encode_chunked)

  File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1263, in _send_request
    self.putrequest(method, url, **skips)

  File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1118, in putrequest
    self._output(self._encode_request(request))

  File "C:\Users\13304\Anaconda3\lib\http\client.py", line 1198, in _encode_request
    return request.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode character '\xb6' in position 38: ordinal not in range(128)

My code is:

import sys
import re                # regular expressions for text matching
import urllib.request    # build the request and fetch the page for a URL
import urllib.error      # exceptions raised by urllib.request
import xlwt              # Excel writing
from bs4 import BeautifulSoup  # HTML parsing, data extraction
import pandas as pd

def main():
    # scrape the pages and parse the data one by one
    baseurl = r"the site's URL (omitted here so as not to affect the site)"
    datalist = getData(baseurl)


# scrape the pages
def getData(baseurl):
    datalist = []
    html = askURL(baseurl)
    #html = etree.HTML(html)
    #result = etree.tostring(html)
    #print(result.decode("utf-8"))
    
    soup = BeautifulSoup(html,"html.parser")
    total = soup.find_all('a')
    hrefs = []  
    for item in total:
        x = item['href']
        hrefs.append(x)
    # drop the hrefs that don't point to the data pages we want
    n = len(hrefs)
    hrefs = hrefs[5:n-1]
    n = len(hrefs)
    print(f"一共含有最新的数据页面{n}页")
    get_num = input('请输入想要获取的最新数据页数:')
    get_num = int(get_num)
    #判断用户输入数值是否合法
    while get_num > len(hrefs):
        print('输入的数值不得大于文件总数,请重新输入!')
        get_num = input('请输入需下载最新数据页面数量:')
        get_num = int(get_num)
    else:
        #根据用户输入数量,对原超链接列表进行切片
        url_list = hrefs[:get_num]
    #依次进入生物制品批签发签发产品情况汇总表获取并解析数据
    #爬取的link是'search.do?formAction=listGsxq&o……这种形式,给它加个开头就可以直接访问了
    Link = []
    for link in url_list:
        link = "https://bio.nifdc.org.cn/pqf/"+str(link)
        Link.append(link)
   
    # the table is small and has merged cells, so Excel is fine; for larger data, switch to CSV
    # counter used to number the output files
    i = 1
    # iterate over the links and scrape each page's content

    # the data is not very uniform, so for now each update is written to its own Excel file

    total_data = pd.DataFrame()  # must start as a DataFrame: pd.concat() rejects a plain list as an element
    for link in Link:
        html = askURL(link)
        # use pandas to grab the table from the page; the result is a list of DataFrames,
        # later saved next to this script, encoded in GB18030 for Chinese
        html_data = pd.read_html(html, encoding='utf-8')
        #print(html_data)
        #print(type(html_data))
        table_data = pd.DataFrame(html_data[1])
        #print(table_data)
        #print(type(table_data))
        # the table is small and has merged cells, so Excel is fine; for larger data, switch to CSV
        #table_data[1].to_csv(f'批签发汇总-{i}.csv', index=0, encoding='gb18030')
        total_data = pd.concat([total_data, table_data])
        i += 1
        # show progress in the console
        print(f'{len(Link)} pages total, {i-1} fetched, {len(Link)-i+1} remaining')
    total_data.to_excel("批签发汇总.xls", encoding='gb18030')
    # all pages fetched; print a completion message
    print('Batch release data download complete!')
    
# fetch the page content for a given URL
def askURL(url):
    # the User-Agent header tells the server what kind of client we are (a browser),
    # i.e. what kind of content we can accept; the headers mimic a real browser
    # when sending the request
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
    return html



if __name__ == "__main__":
    # call the main function
    main()
    print("Scraping finished")

Can anyone help me figure this out? Thanks!!

The data you're fetching contains Chinese characters, so the encoding goes wrong. Note the last traceback frame, request.encode('ascii'): the failure happens while sending the request, which means the URL itself contains a non-ASCII character (the '\xb6' at position 38). Try percent-encoding the URL as UTF-8 before requesting it.
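A minimal sketch of that idea, assuming the stray character sits in the query string of a scraped href (urllib.parse.quote replaces everything outside the safe set with %XX escapes, so the request line becomes pure ASCII):

import urllib.parse
import urllib.request

def askURL(url):
    # percent-encode non-ASCII characters, keeping URL delimiters and any
    # existing %XX escapes intact; if the site expects GBK-encoded URLs
    # rather than UTF-8, adding encoding='gbk' may be worth trying
    url = urllib.parse.quote(url, safe=':/?&=%')
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56"}
    request = urllib.request.Request(url, headers=head)
    response = urllib.request.urlopen(request)
    return response.read().decode("utf-8")

With the default UTF-8 encoding, quote() turns '\xb6' into '%C2%B6', which http.client can then encode as ASCII.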

You could also check the encoding of your source file and data first; this article may help:

https://www.cnblogs.com/yhl-yh/p/6728567.html
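To pin down exactly which character trips the ASCII encoder, a quick hypothetical check (link being whichever URL askURL was called with when it failed):

# print the position and repr of every non-ASCII character in the URL;
# per the traceback above, this should report '\xb6' at position 38
for i, ch in enumerate(link):
    if ord(ch) > 127:
        print(i, repr(ch))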
