需求:只采集当天的价格(以最新发布日期为准),数据每天都要更新,希望做成一个程序,运行后自动下载并生成文件。
采集网址:http://www.xinfadi.com.cn/priceDetail.html
import requests
import openpyxl
# Request headers carrying a desktop-browser User-Agent string; passed to the
# POST request issued by Spider.get_data.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
}
def write_data(json: dict, file_name: str):
    """
    Dump the records stored under ``json['list']`` into ``<file_name>.xlsx``.

    The first sheet row is a fixed header; every following row holds the
    fields of one record in the same order as the header. A completion
    message is printed once the workbook has been saved.

    :param json: decoded response; must contain a ``list`` key whose items
        provide every field key read below
    :param file_name: output file name without the ``.xlsx`` extension
    :return: None
    """
    records = json['list']
    # Record keys, listed in the same order as the header columns.
    field_keys = (
        "prodName",
        "lowPrice",
        "avgPrice",
        "highPrice",
        "specInfo",
        "place",
        "unitInfo",
        "pubDate",
    )
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.append(["品名", "最低价", "平均价", "最高价", "规格", "产地", "单位", "发布日期"])
    for record in records:
        worksheet.append([record[key] for key in field_keys])
    workbook.save(f"{file_name}.xlsx")
    print(f"{file_name}.xlsx 下载完成!")
class Spider:
    """Interactive downloader: the user picks a category, its data is fetched
    over HTTP and written to an ``.xlsx`` file named after the category."""

    def __init__(self):
        self.url = "http://www.xinfadi.com.cn/getCat.html"
        # Category display name -> id that is posted as ``prodCatid``.
        self.json = {"蔬菜": 1186, "水果": 1187, "肉禽蛋": 1189, "水产": 1190, "粮油": 1188, "豆制品": 1203, "调料": 1204}
        self.session = requests.session()

    @staticmethod
    def _parse_index(raw: str, count: int) -> int:
        """Convert the text typed by the user into a menu index in ``[0, count)``.

        The text is parsed strictly as an integer. It is never evaluated as
        Python code, so arbitrary expressions typed at the prompt are rejected
        instead of being executed.

        :param raw: text entered at the prompt
        :param count: number of menu entries
        :return: the selected index
        :raises ValueError: if ``raw`` is not an integer
        :raises IndexError: if the integer is outside the printed menu range
        """
        try:
            index = int(raw.strip())
        except ValueError:
            raise ValueError(f"not a valid integer index: {raw!r}") from None
        if not 0 <= index < count:
            raise IndexError(f"index {index} is outside the range 0..{count - 1}")
        return index

    def get_data(self, num_id: int):
        """
        POST the category id to ``self.url`` and return the decoded JSON body.

        :param num_id: category id sent as the ``prodCatid`` form field
        :return: the response body decoded from JSON
        """
        data = {
            "prodCatid": num_id
        }
        # A timeout keeps the script from hanging forever on a stalled server.
        req = self.session.post(url=self.url, data=data, headers=headers, timeout=30)
        try:
            return req.json()
        finally:
            # Release the connection even when JSON decoding fails.
            req.close()

    def download(self):
        """Print the category menu, read the user's choice, then fetch the
        chosen category and save it through ``write_data``."""
        keys = list(self.json)
        for i, key in enumerate(keys):
            print(i, key)
        num = input("输入下标:")
        chosen = keys[self._parse_index(num, len(keys))]
        the_json = self.get_data(self.json[chosen])
        write_data(json=the_json, file_name=chosen)
if __name__ == '__main__':
    # Script entry point: build a spider and run one interactive download.
    Spider().download()
运行程序后,会在当前目录生成一个 .xlsx 文件,这就是你需要的数据文件。
运行前需要先安装 openpyxl 和 requests(pip install openpyxl requests)。
有用的话请点一下采纳。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author: Roc-xb
"""
import requests
import openpyxl
import time
def write_excel(dataList, filename):
    """Save price records to ``<filename>.xlsx`` beneath a fixed header row.

    :param dataList: iterable of mappings; each one must provide every record
        key listed in ``columns``
    :param filename: output file name without the ``.xlsx`` extension
    """
    # (header title, record key) pairs in column order.
    columns = (
        ("品名", "prodName"),
        ("最低价", "lowPrice"),
        ("平均价", "avgPrice"),
        ("最高价", "highPrice"),
        ("规格", "specInfo"),
        ("产地", "place"),
        ("单位", "unitInfo"),
        ("发布日期", "pubDate"),
    )
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.append([title for title, _ in columns])
    for entry in dataList:
        worksheet.append([entry[key] for _, key in columns])
    workbook.save(f"{filename}.xlsx")
if __name__ == '__main__':
    # Sample the clock once so the query date and the output file name can
    # never disagree when the script runs across midnight.
    now = time.localtime()
    today = time.strftime("%Y/%m/%d", now)
    filename = time.strftime("%Y年%m月%d日", now)
    print("今日是:", today)
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    # The form is passed as a dict so that requests builds and URL-encodes the
    # body. The previous hand-written string had its "&current=1" separator
    # corrupted into "¤t=1" (HTML-entity decoding of "&curren"), which merged
    # the "limit" and "current" fields into one malformed value.
    payload = {
        "limit": 1000,
        "current": 1,
        "pubDateStartTime": today,
        "pubDateEndTime": today,
        "prodPcatid": "",
        "prodCatid": "",
        "prodName": "",
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
    }
    # A timeout prevents an indefinite hang; raise_for_status reports an HTTP
    # error directly instead of failing later while decoding the body.
    reply = requests.post(url, headers=headers, data=payload, timeout=30)
    reply.raise_for_status()
    response = reply.json()
    dataList = response['list']
    print("今日查询结果条数:", len(dataList))
    write_excel(dataList, filename)
直接调用接口获取数据,处理后用 xlwt 库把数据写入 Excel。
看页面抓取分析也挺简单的:直接请求页面里的 API,解析 JSON 就能拿到数据源,剩下的就是转成 Excel 了。
请参考:
股票和菜是一样的
https://blog.csdn.net/qq285679784/article/details/109229295