import requests #请求网页数据
from bs4 import BeautifulSoup #美味汤解析数据
import pandas as pd
import time
from tqdm import trange #获取爬取速度
def get_bilibili_url(start, end, oid=141367679):
    """Build the danmu-history API URL for every day in [start, end].

    Parameters
    ----------
    start, end : str
        Date strings accepted by ``pandas.date_range`` (e.g. '9/24/2020').
    oid : int, optional
        Video cid whose danmu history is requested (default keeps the
        original hard-coded value, so existing callers are unaffected).

    Returns
    -------
    list[str]
        One history-API URL per day, dates formatted YYYY-MM-DD.
    """
    # strftime on the DatetimeIndex already yields the date strings;
    # a single comprehension replaces the loop-and-append pattern.
    return [
        f"https://api.bilibili.com/x/v2/dm/history?type=1&oid={oid}&date={date}"
        for date in pd.date_range(start, end).strftime('%Y-%m-%d')
    ]
def get_bilibili_danmu(url_list):
    """Download danmu (bullet comments) from each daily history URL and
    write them, one comment per line, to ``bilibili_danmu.txt``.

    Parameters
    ----------
    url_list : list[str]
        URLs produced by ``get_bilibili_url``.

    Side effects
    ------------
    Creates/overwrites ``bilibili_danmu.txt`` (UTF-8) in the working
    directory; sleeps 3 seconds between requests to be polite to the API.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        # BUG FIX: HTTP header values must be latin-1 encodable. The old
        # Chinese placeholder text here is exactly what raised
        # "UnicodeEncodeError: 'latin-1' codec can't encode characters".
        # Paste your real cookie (copied from the browser's request
        # headers) in place of this ASCII placeholder.
        "cookie": "PASTE_YOUR_COOKIE_HERE",
    }
    # Explicit encoding='utf-8' prevents a UnicodeEncodeError when writing
    # Chinese danmu on platforms whose default encoding is gbk/latin-1;
    # the `with` block guarantees the file is closed even on error.
    with open("bilibili_danmu.txt", "w", encoding="utf-8") as file:
        for i in trange(len(url_list)):
            response = requests.get(url_list[i], headers=headers)
            # The API serves XML; force UTF-8 before parsing.
            response.encoding = "utf-8"
            # Name the parser explicitly to avoid bs4's "no parser
            # specified" warning; danmu live in <d> tags.
            soup = BeautifulSoup(response.text, "html.parser")
            for d in soup.find_all("d"):
                file.write(d.text)
                file.write("\n")
            time.sleep(3)  # rate-limit between daily requests
# BUG FIX: the dunder underscores were stripped (likely by forum
# markdown); `if name == "main":` raises NameError at import time.
if __name__ == "__main__":
    start = '9/24/2020'  # first day of danmu history to fetch
    end = '9/26/2020'    # last day to fetch (inclusive)
    url_list = get_bilibili_url(start, end)
    get_bilibili_danmu(url_list)
    print("弹幕爬取完成")
以上是相关代码
报错 UnicodeEncodeError: 'latin-1' codec can't encode characters in position 0-3: ordinal not in range(256)。该错误源于 requests 要求 HTTP 请求头的值必须能用 latin-1 编码,而 headers 中 cookie 的中文占位符“你自己的”无法编码。
报错后我不小心点到了最后一个报错项的 py 文件(requests 库的内部源码文件)内,并将第 1100 行左右改为了 encoding='utf-8'
于是再次运行出现这样的错误,我想能找到被误改的文件并修正!如果能解决原来的报错问题就更好了!望大家提出建议!