问题遇到的现象和发生背景
问题相关代码,请勿粘贴截图
运行结果及报错内容
我的解答思路和尝试过的方法
我想要达到的结果
```python
import requests, re, time
from lxml import etree
from openpyxl import Workbook

# Scrape Maoyan's yearly box-office rankings plus each movie's detail page
# and write one row per movie into an Excel workbook.
wb = Workbook()
ws = wb.active
ws.append(["年份", "排名", "电影名", "日期", "票房", "详情页url", "id", "类型", "地点","评分", "导演们", "主演们"])
# Pretend to be a real browser session.  The Token/Cookie pair expires after a
# while, which is exactly what triggers the JSONDecodeError mid-run.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': '_lxsdk_cuid=172574eb0c1c8-089b1c7aa92fd5-f7d1d38-144000-172574eb0c18d; _lxsdk=B55ECA50A04411EA8E3317C78304F5C137BF739A17864686BAFDCDCF192B28E8; isid=6411AA68003B448DE53F6EFCABD36751; token=azddXq07vt8BZhHqP67X5vusIloAAAAAygoAAKp_5OrZiZgDiGsXdLQ3DGZr6a3OKVmSxuPH1jFcLs709YEdYhzfq8_FU-cPpvRm9A; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1590602756,1590602770,1591330351,1591423061; __mta=251575993.1590602756744.1590602770565.1591423063062.3; __mta=217833001.1590602674639.1591421273339.1591445015235.18; _lx_utm=utm_source%3Dgoogle%26utm_medium%3Dorganic',
    'Host': 'piaofang.maoyan.com',
    'Referer': 'https://piaofang.maoyan.com/rankings/year',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Token': 'azddXq07vt8BZhHqP67X5vusIloAAAAAygoAAKp_5OrZiZgDiGsXdLQ3DGZr6a3OKVmSxuPH1jFcLs709YEdYhzfq8_FU-cPpvRm9A',
    'Uid': 'c9cd1cfb8f95cd68972376145781dd3937103067',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}
# The rankings page loads its data via an ajax request; year and tab are
# filled in per year (tab is simply the position in the `years` list).
temp_url = 'https://piaofang.maoyan.com/rankings/year?token=azddXq07vt8BZhHqP67X5vusIloAAAAAygoAAKp_5OrZiZgDiGsXdLQ3DGZr6a3OKVmSxuPH1jFcLs709YEdYhzfq8_FU-cPpvRm9A&year={}&limit=100&tab={}'

years = [2021, 2020, 2019, 2018, 2017]
for index, year in enumerate(years):  # enumerate replaces years.index(year)
    url = temp_url.format(year, index)
    resp = requests.get(url, headers=headers)
    # FIX: guard the json() call — when the token/cookie expires the server
    # answers with HTML and resp.json() raises JSONDecodeError (ValueError).
    try:
        js = resp.json()
    except ValueError:
        print('年份', year, '列表请求未返回 JSON,已跳过;请刷新猫眼页面更新 token/cookie')
        continue
    content = js['yearList']  # HTML fragment, see F12 > Network > Preview
    html = etree.HTML(content)
    uls = html.xpath("//ul")  # one <ul class="row"> per movie
    for ul in uls:
        item = {}  # NOTE: insertion order must match the header row above
        item['year'] = year
        item['paiming'] = ul.xpath("./li[1]/text()")[0]
        item['name'] = ul.xpath("./li[2]/p[1]/text()")[0]
        item['date'] = ul.xpath("./li[2]/p[2]/text()")[0]
        item['pf'] = ul.xpath("./li[3]/text()")[0]
        # Each row carries its movie id: data-com="hrefTo,href:'/movie/1211270'"
        h = re.sub("hrefTo,href:'|'", "", ul.xpath("./@data-com")[0])
        item['href'] = 'https://piaofang.maoyan.com' + h
        item['id'] = h.split("/")[-1]
        # Detail page: movie type / place / rating.
        resp1 = requests.get(item['href'], headers=headers)
        de_html = etree.HTML(resp1.text)
        # FIX: the original chained four `except` clauses after a single
        # `try` — only the first ever ran and the alternate-layout fallback
        # was dead code.  Try the primary layout, then the fallback layout.
        try:
            item['type'] = de_html.xpath("//div[@class='detail-list']/div/p[1]/text()")[0].strip()
        except IndexError:
            try:
                item['type'] = de_html.xpath("//div[@class='info-list']/p[1]/text()")[0].strip()
            except IndexError:
                item['type'] = ''
        try:
            item['address'] = re.sub(r"\s|/", "",
                                     de_html.xpath("//div[@class='detail-list']/div/div[1]/div/p/text()")[0])
        except IndexError:
            item['address'] = ''
        try:
            item['rating'] = re.sub(r"\s|/", "",
                                    de_html.xpath("//div[@class='score-detail']/div[1]/span[1]/text()")[0])
        except IndexError:
            item['rating'] = ''
        # Directors / actors come from a second ajax endpoint.
        deUrl = f"https://piaofang.maoyan.com/movie/{item['id']}/moresections?token=azddXq07vt8BZhHqP67X5vusIloAAAAAygoAAKp_5OrZiZgDiGsXdLQ3DGZr6a3OKVmSxuPH1jFcLs709YEdYhzfq8_FU-cPpvRm9A"
        resp2 = requests.get(deUrl, headers=headers)
        # Pre-set both keys so the row stays aligned even when the request
        # fails.  FIX: the original wrote the misspelled key 'derectors' in
        # its except branch, leaving 'directors' unset and shifting columns.
        item['directors'] = ''
        item['actors'] = ''
        try:
            js1 = resp2.json()
        except ValueError:
            # This was the crash the traceback shows; keep going instead.
            print('电影', item['id'], '详情请求未返回 JSON,导演/主演留空')
        else:
            de_html2 = etree.HTML(js1['sectionHTMLs']['celebritySection']['html'])
            try:
                item['directors'] = ' '.join([re.sub(r"\s", "", i) for i in
                                              de_html2.xpath("//div[@class='hc-layout']/div[1]/div[2]/a//text()")]).strip()
            except Exception:
                pass
            try:
                item['actors'] = ' '.join([re.sub(r"\s", "", i).strip() for i in
                                           de_html2.xpath("//div[@class='hc-layout']/div[2]/div[2]/a//text()") if
                                           i.strip() != '']).strip()
            except Exception:
                pass
        print(item)
        ws.append([str(i) for i in item.values()])
        time.sleep(2)  # be polite, and reduce the chance of being rate-limited
    # Checkpoint after every year so a crash does not lose everything.
    wb.save(r'数据采集.xlsx')
wb.save(r'数据采集.xlsx')
```

报错
Traceback (most recent call last):
File "C:\Users\btod\PycharmProjects\crawl\venv\lib\site-packages\requests\models.py", line 910, in json
return complexjson.loads(self.text, **kwargs)
File "C:\anaconda\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\anaconda\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\anaconda\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/btod/PycharmProjects/crawl/maoyan.py", line 99, in <module>
js1 = resp2.json()
File "C:\Users\btod\PycharmProjects\crawl\venv\lib\site-packages\requests\models.py", line 917, in json
raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)
requests.exceptions.JSONDecodeError: [Errno Expecting value] Not Found: 0
Process finished with exit code 1
一般遇到这种情况需要重新刷新猫眼页面以获取新的 token,但经常在成功爬取两百条左右数据之后又会再次出现这个报错。
import requests, re, time
from lxml import etree
import openpyxl
def run(year, tab):
    """Fetch the Maoyan yearly ranking page for *year* (page tab *tab*)
    and append one dict per movie row to the global ``dataList``.

    FIX: every ``xpath(...)[0]`` in the original raised IndexError on a
    missing node and aborted the whole scrape; missing cells now become ''.
    """
    def first(nodes, default=''):
        # xpath() returns a list; tolerate rows with missing cells.
        return nodes[0] if nodes else default

    url = "https://piaofang.maoyan.com/rankings/year?year={}&limit=100&tab={}".format(year, tab)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_lxsdk_cuid=18048322de5c8-0d61a0565efa25-1734337f-1fa400-18048322de5c8; _lxsdk=18048322de5c8-0d61a0565efa25-1734337f-1fa400-18048322de5c8; theme=moviepro; _lxsdk_s=18048322de6-c1c-8e2-b99%7C%7C14',
        'DNT': '1',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    response = requests.request("GET", url, headers=headers).text
    dom = etree.HTML(response)
    ranks_list = dom.xpath('//*[@id="ranks-list"]/ul')
    global dataList
    for row in ranks_list:
        item = {}
        item['year'] = year                                          # 年份
        item['rank'] = first(row.xpath('./li[1]//text()'))           # 排名
        item['name'] = first(row.xpath('./li[2]/p[1]/text()'))       # 电影名
        item['date'] = str(first(row.xpath('./li[2]/p[2]/text()')))[0:10]  # 日期
        item['pf'] = first(row.xpath('./li[3]/text()'))              # 票房
        item['avg'] = first(row.xpath('./li[4]/text()'))             # 平均票价
        item['avg_count'] = first(row.xpath('./li[5]/text()'))       # 场均人次
        dataList.append(item)
    print(year, len(dataList), dataList)
    time.sleep(1)  # throttle between page requests
def write_excel(dataList, filename):
    """Write the scraped rows to ``<filename>.xlsx`` with a Chinese header row.

    :param dataList: list of dicts with keys year/rank/name/date/pf/avg/avg_count
    :param filename: output file name without the .xlsx extension
    """
    label_array = ["年份", "排名", "片名", "日期", "票房(万元)", "平均票价", "场均人次"]
    book = openpyxl.Workbook()
    sheet = book.active
    sheet.append(label_array)
    # Column order must match label_array above.
    keys = ("year", "rank", "name", "date", "pf", "avg", "avg_count")
    for item in dataList:
        sheet.append([item[k] for k in keys])
    # FIX: the original saved to a hard-coded name and ignored `filename`.
    book.save(f"{filename}.xlsx")
if __name__ == '__main__':
    dataList = []
    # Tabs 1..11 of the rankings page correspond to years 2012..2022.
    start_year = 2012
    tab_count = 12
    for tab in range(1, tab_count):
        run(start_year + tab - 1, tab)
    write_excel(dataList, "猫眼专业版")