使用python的requests库爬取百度中的数据
使用response.encoding = 'utf8'转换后
response.text仍然乱码
结果:
代码如下:
```python
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
# 模拟百度搜索
def baiduAPI(params, timeout=10):
    """Send a Baidu search request and return the response.

    Args:
        params: Already URL-encoded query string, e.g. the output of
            ``urllib.parse.urlencode({"wd": "..."})``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        The ``requests.Response`` for the search page. The decoded page text
        is also printed to stdout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        # NOTE(review): hard-coded cookie copied from a browser session; it
        # looks like it carries login tokens -- confirm and avoid publishing it.
        "Cookie": "HMACCOUNT_BFESS=BD73DB1D35AF871A; BDUSS_BFESS=VJJdkswYnVrT0JHUmFGTFRpTWhUUn5wLVlLbkJvTzA1WTBDcnZwQTU2UU9pYlZqRVFBQUFBJCQAAAAAAAAAAAEAAABcJyX5wO7KzcH6NwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA78jWMO~I1jc; BAIDUID_BFESS=2B61874755F437B5EC6BEF21B3134DF7:FG=1; ZFY=K9QfSyPCgzUKQk2s8CZ4eAp9owXYz:BJRF7ftEin2pJ8:C; H_PS_PSSID=36543_37552_38092_38052_37990_37796_36803_37930_38088_38041_26350_38009_37881; ab_sr=1.0.1_NzYyZWM1ZDU3OGQ2NGM2ODMxYmQ3MTZmNjE4YWI0OGY2NzIzYWY2YzQ3MjI3YTg5M2U2ODljYWFjZWY2NTg0ODE5NWY2YjhiZDk2YTQ2ZjU0NzZhM2MyZmY2YzFlOGM0YWRmZGU3ZDNkN2VjZjY4MjYwYzQ5MzNhYWQzOTdkYTExOWZkMDhhZTRlMzI0MWZkZGJkNDU3ZTk5YjY2N2ZhZDM3NjAwNDZhMGMxNjg4N2U4N2UwMzU4MTg3ZTFhNTY3",
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        # Do not advertise "br": requests can only decompress Brotli when an
        # optional brotli package is installed. Without it the body stays
        # compressed and response.text is garbage no matter which encoding
        # is set.
        "Accept-Encoding": "gzip, deflate",
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    url = "https://www.baidu.com/s?" + params
    response = requests.get(url, headers=headers, timeout=timeout)
    # Override the charset guessed from the response headers.
    response.encoding = 'utf8'
    print(response.text)
    # To save the page, write the raw bytes; mode 'wb' needs bytes, not str:
    # with open("zhongguo.html", 'wb') as f:
    #     f.write(response.content)
    return response
if __name__ == "__main__":
    # Percent-encode the query so the non-ASCII keyword is safe inside a URL.
    wd = {"wd": "中国"}
    params = urllib.parse.urlencode(wd)
    print(params)  # prints 'wd=%E4%B8%AD%E5%9B%BD'
    response = baiduAPI(params)
    # print(response.content)
```
望采纳!!!
设置一下编码集即可。
# Fetch the page, then decode the raw body bytes explicitly as UTF-8 instead
# of relying on the charset that requests picks for response.text.
response = requests.get(url=url, headers=headers)
raw_body = response.content
content = raw_body.decode('utf8')
print(content)
# Use the requests module to send a request and fetch the page source.
import requests
# Send the request and get the response.
response = requests.get('http://www.baidu.com')
# Read the response data.
print(response.text)  # prints mojibake: the bytes are decoded as a Western European charset
# Show the charset that requests uses for decoding.
print(response.encoding)  # prints ISO-8859-1; requests infers it from the response headers, which is wrong for this page -- encoding and decoding must use the same charset
# There are two ways to fix it.
# Option 1: set response.encoding to the charset the page really uses, so response.text decodes correctly.
response.encoding = 'utf8'  # UTF-8 is what most sites use today
print(response.text)  # now prints readable text
# Option 2 (recommended, common): take the response's raw bytes and decode them yourself.
print(response.content.decode())  # decode() with no argument defaults to UTF-8