[](链接:https://pan.baidu.com/s/1NI7yeiBcLpo2XSJvIUyBpw?pwd=1234
提取码:1234
--来自百度网盘超级会员V3的分享)
利用python对链接中三个excle表格进行数据处理,包括 数据收集(数据读入,数据集成合并),数据预处理,数据分析统计(例:在会时间小于70分钟为旷课),数据以图表方式展现。
兄弟,你问题没说明白,建议excel 重命名为abc , 然后说出 a做啥b做啥c做啥, 以及你要实现啥
# -*- coding: utf-8 -*-
from urllib import request, parse
import requests, time, re, xlrd, json, pymssql, datetime
from bs4 import BeautifulSoup
from lxml import etree
from xlutils.copy import copy
from datetime import timedelta
from gevent import monkey
class Amazon_books(object):
def __init__(self):
self.url = "https://www.amazon.com/gp/bestsellers/boost/12035215011/ref=pd_zg_hrsr_boost_1_3_last"
self.base_url = "http://fy.iciba.com/ajax.php?a=fy" # 翻译url
self.index = 1 # 记录图书序号
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
self.workbook = xlrd.open_workbook(r'D:\PycharmProjects\reptile\亚马逊\自拍杆美国市场分析表 .xls') # 文件路径
self.xlsx = copy(self.workbook)
self.shtc = self.xlsx.get_sheet(0)
self.num = 1
def get_page(self):
"""获取前20个商品信息"""
reqeons = requests.get(self.url, headers=self.headers).text
soup = BeautifulSoup(reqeons, "lxml")
li_list = soup.select('li[class="zg-item-immersion"]')
for li in li_list[:20]:
commodity_url = li.select('a')[0].attrs['href'] # 商品url
commodity_url = "https://www.amazon.com" + commodity_url
self.shtc.write(self.num, 2, commodity_url)
self.xlsx.save('自拍杆美国市场分析表 .xls')
self.handle_page(commodity_url)
img_url = li.select('img')[0].attrs['src'] # 图片url
self.shtc.write(self.num, 1, img_url)
self.xlsx.save('自拍杆美国市场分析表 .xls')
price = li.select('span[class="p13n-sc-price"]')[0].get_text() # 商品价格
self.shtc.write(self.num, 5, price)
self.xlsx.save('自拍杆美国市场分析表 .xls')
try:
grade = li.select('span[class="a-icon-alt"]')[0].get_text() # 评分
grade = "".join(re.findall(r"\d.\d", grade))
self.shtc.write(self.num, 8, grade)
self.xlsx.save('自拍杆美国市场分析表 .xls')
except Exception as e:
grade = 0
self.shtc.write(self.num, 8, grade)
self.xlsx.save('自拍杆美国市场分析表 .xls')
try:
comment_amounts = li.select('a[class="a-size-small a-link-normal"]')[0].get_text() # 总评论数
self.shtc.write(self.num, 6, comment_amounts)
self.xlsx.save('自拍杆美国市场分析表 .xls')
except Exception as e:
comment_amounts = 0
self.shtc.write(self.num, 6, comment_amounts)
self.xlsx.save('自拍杆美国市场分析表 .xls')
try:
comment_url1 = li.select('a[class="a-size-small a-link-normal"]')[0].attrs['href'] # 评论url
comment_url = "https://www.amazon.com" + comment_url1 + "&sortBy=recent&pageNumber=1&reviewerType=avp_only_reviews&filterByStar=one_star" # 一星评论url
self.one_star_comment(comment_url)
comment_amount_url = "https://www.amazon.com" + comment_url1 + "&sortBy=recent&pageNumber=1&reviewerType=avp_only_reviews&filterByStar=positive" # 实际评论数
response = requests.get(comment_amount_url, headers=self.headers).text
soup = etree.HTML(response)
time.sleep(1)
comment_amount = "".join(soup.xpath('//*[@id="filter-info-section"]/span[1]/text()'))
comment_amount1 = re.findall(r"of (\d,\d+)", comment_amount)
if not comment_amount1:
comment_amount = re.findall(r"of (\d+)", comment_amount)
self.shtc.write(self.num, 7, comment_amount)
self.xlsx.save('自拍杆美国市场分析表 .xls')
self.num += 1
else:
print(comment_amount1)
self.shtc.write(self.num, 7, comment_amount1)
self.xlsx.save('自拍杆美国市场分析表 .xls')
self.num += 1
except Exception as e:
comment_amount = 0
self.shtc.write(self.num, 7, comment_amount)
self.xlsx.save('自拍杆美国市场分析表 .xls')
self.num += 1
def handle_page(self, commodity_url):
"""解析商品详情"""
response = requests.get(commodity_url, headers=self.headers).text
soup = etree.HTML(response)
trademark = "".join(soup.xpath('//*[@id="bylineInfo"]/text()')) # 品牌
self.shtc.write(self.num, 3, trademark)
self.xlsx.save('自拍杆美国市场分析表 .xls')
trait = soup.xpath('//*[@id="feature-bullets"]/ul/li[position()>1]//span/text()') # 卖点
num = 9
for i in trait:
i = i.replace("\t", "")
i = i.replace("\n", "")
self.translate(i, num)
num += 1
soup1 = BeautifulSoup(response, "lxml")
li_list = soup1.find(attrs={"id": "productDetails_detailBullets_sections1"})
try:
li_list = li_list.find_all("tr")
data = {}
for i in li_list:
key = i.find(attrs={"class": "a-color-secondary a-size-base prodDetSectionEntry"})
key = key.text
try:
key = key.replace("\n", "")
key = key.replace("\r", "")
key = key.replace(" ", "")
key = key.replace("\t", "")
except Exception as e:
pass
try:
value = i.find('td', class_="a-size-base").get_text()
except Exception as e:
value = None
try:
value = value.replace("\n", "")
value = value.replace("\r", "")
value = value.replace("\t", "")
value = value.replace(" ", "")
except Exception as e:
pass
data.update({key: value})
items = data.get('DatefirstlistedonAmazon')
if items is None:
items = data.get('DateFirstAvailable')
num = 4
self.translate(items, num)
else:
num = 4
self.translate(items, num)
except Exception as e:
num = 4
current_content = "2008/08/08"
self.shtc.write(self.num, num, current_content)
self.xlsx.save('自拍杆美国市场分析表 .xls')
def page_img(self, img_url):
"""下载图片"""
img_url1 = requests.get(img_url, headers=self.headers).content
with open(r'./图片/' + str(self.index) + ".jpg", 'wb') as file:
file.write(img_url1)
self.index += 1
# with open('图片.txt', 'a', encoding='utf8') as f:
# # for data in img_url:
# json.dump(img_url, f, ensure_ascii=False)
# f.write('\n')
def one_star_comment(self, comment_url):
"""解析一星评论"""
response = requests.get(comment_url, headers=self.headers).text
soup = etree.HTML(response)
one_comment = soup.xpath('//*[@data-hook="review"]//span[@data-hook="review-body"]//span/text()')
num = 14
for i in one_comment[:5]:
try:
self.translate(i, num)
num += 1
except Exception as e:
pass
def translate(self, i, num):
"""翻译"""
data = {
"f": "auto",
"t": "auto",
"w": i
}
data_str = parse.urlencode(data)
# 封装
response = request.Request(url=self.base_url, headers=self.headers, data=bytes(data_str, encoding="utf-8"))
req = json.loads(request.urlopen(response).read().decode("utf-8"))
current_state = req["status"]
# 在这里判断中译英
if current_state == 1:
current_content = req["content"]
current_content = current_content["out"]
# 否则则汉译英
else:
current_content = req["content"]
current_content = current_content["word_mean"]
try:
current_content = current_content.replace("年", "/")
current_content = current_content.replace("月", "/")
current_content = current_content.replace("日", "")
current_content = current_content.replace("。", "")
except Exception as e:
pass
self.shtc.write(self.num, num, current_content)
self.xlsx.save('自拍杆美国市场分析表 .xls')
def run(self):
self.get_page()
time.sleep(50)
if __name__ == '__main__':
while True:
now = datetime.datetime.now()
print(now.hour, now.minute)
amazon = Amazon_books()
amazon.run()
# 每隔60秒检测一次
time.sleep(50)
我可以提供一些提示和指导来帮助你完成这个任务。这个任务主要包含以下步骤:
具体的解决方案如下:
我们可以使用Python中的pandas库来读取Excel表格并进行数据合并和处理。
代码如下:
import pandas as pd
# 读取三个Excel表格
df1 = pd.read_excel('腾讯会议数据_20210830(上午场).xlsx')
df2 = pd.read_excel('腾讯会议数据_20210830(下午场).xlsx')
df3 = pd.read_excel('腾讯会议数据_20210831.xlsx')
# 合并三个表格
df = pd.concat([df1, df2, df3], ignore_index=True)
# 查看合并后的表格信息
print(df.info())
这里使用了pandas库中的read_excel函数来读取Excel表格,concat函数来进行数据合并。ignore_index=True参数用于忽略已有的序号,重新设置序号。
将读取的表格进行数据处理并筛选出符合条件的数据:
代码如下:
# 筛选出会议时间小于70分钟的参会者
# 首先读取时间列和姓名列
start_time_col = df['会议开始时间']
end_time_col = df['会议结束时间']
name_col = df['姓名']
# 计算参会时间
duration_col = (end_time_col - start_time_col).astype('timedelta64[m]')
# 将计算结果添加到表格中
df['参会时间'] = duration_col
# 筛选出参会时间小于70分钟的参会者
absentees = df[df['参会时间'] < 70]['姓名']
# 将结果保存到Excel文件
absentees.to_excel('旷课名单.xlsx', index=False)
这里首先使用pandas的astype函数计算了参会时间,并将计算结果添加到表格中,紧接着筛选出了参会时间小于70分钟的参会者,并保存在了一个名为"旷课名单.xlsx"的Excel文件中。
我们可以使用Python中的matplotlib库对数据分析结果进行可视化,来展示筛选出的参会者结果。
代码如下:
import matplotlib.pyplot as plt
# 打开"旷课名单.xlsx"文件并读取
absentees = pd.read_excel('旷课名单.xlsx')
# 统计旷课人数并绘制饼图
absentees_count = len(absentees)
presentees_count = len(df) - absentees_count
counts = [absentees_count, presentees_count]
labels = ['旷课', '正常参加']
plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('参会状态统计')
plt.show()
这里使用了pandas的read_excel函数读取"旷课名单.xlsx"文件,并统计了旷课人数和正常参加人数,最后使用matplotlib的pie函数绘制了饼图。
import os
import xlrd2 #xlrd: 对Excel进行读相关操作
import xlwt #xlwt: 对Excel进行写相关操作,且只能创建一个全新的Excel然后进行写入和保存。
import numpy
import matplotlib
from prettytable import PrettyTable #PrettyTable 是python中的一个第三方库,可用来生成美观的ASCII格式的表格
from matplotlib import pyplot as plt
def get_files_name():
"""
用于获取文件名
:return: 返回值为文件名组成的列表
"""
file_list = os.listdir('./data')
return file_list
#保存生产excel表
def load_data(file_list):
"""
用于读取指定的文件并保存至字典数据结构中
:param file_list: 需要加载的文件列表
:return: 保存了文件内容的字典
"""
dictory = {}
for file in file_list:
# 获取表格文件
book = xlrd2.open_workbook('./data/'+file)
# 获取表格中的所有sheet
names = book.sheet_names()
# 获取第一个sheet
sheet = book.sheet_by_index(0)
# 获取当前表格的行数
rows = sheet.nrows
# 获取当前表格的列数
cols = sheet.ncols
# 获取表头文件,即表格第一行
head = sheet.row_values(0)
for row in range(rows-1):
# 如果当前字典中没有该城市则创建一个
if not sheet.cell_value(row+1, 0) in dictory.keys():
dictory[sheet.cell_value(row+1, 0)] = {}
for col in range(cols-1):
dictory[sheet.cell_value(row+1, 0)][head[col+1]] = float(sheet.cell_value(row+1, col+1))
return dictory
#数据排行
def sort_data(dictory, key, reverse):
"""
根据key和reverse对数据进行排序
:param dictory: 传入的字典对象
:param key: 需要排序的关键字,即那一列
:param reverse: 是否从大到小排序,false即为从小到大排序
:return:
"""
data = dictory
if not reverse:
data = dict(sorted(data.items(), key=lambda d: d[1][key], reverse=False)) #字典的升序
else:
data = dict(sorted(data.items(), key=lambda d: d[1][key], reverse=True)) #字典的降序
return data
def get_specified_cols(dictory, col_name_list):
"""
筛选出指定的列
:param dictory:原始字典
:param col_name_list: 需要筛选出的列名,城市名默认出现
:return: 筛选之后的字典
"""
new_dict = {}
for key in dictory.keys():
new_dict[key] = {}
for col_name in col_name_list:
new_dict[key][col_name] = dictory[key][col_name]
return new_dict
def get_specified_data(dictory, operation, col_name, value):
"""
根据输入的操作符、列名以及指定的value进行筛选,比如筛选出人均GDP大于5000的,则operation = '>', col_name = '人均GDP', value = 5000
:param dictory: 原始数据
:param operation: 操作符
:param col_name: 需要比较的列
:param value: 需要比较的值
:return: 筛选之后的字典
"""
new_dict = {}
if col_name != "城市":
for key in dictory.keys():
# flag用于标记是否需要添加该行
value = float(value)
flag = 0
if operation == '>':
if dictory[key][col_name] > value:
flag = 1
elif operation == '>=':
if dictory[key][col_name] >= value:
flag = 1
elif operation == '=':
if dictory[key][col_name] == value:
flag = 1
elif operation == '<=':
if dictory[key][col_name] <= value:
flag = 1
elif operation == '<':
if dictory[key][col_name] < value:
flag = 1
else:
flag = 0
if flag == 1:
new_dict[key] = {}
new_dict[key] = dictory[key]
else:
for key in dictory.keys():
# flag用于标记是否需要添加该行
flag = 0
if operation == '>':
if key > value:
flag = 1
elif operation == '>=':
if key >= value:
flag = 1
elif operation == '=':
if key == value:
flag = 1
elif operation == '<=':
if key <= value:
flag = 1
elif operation == '<':
if key < value:
flag = 1
else:
flag = 0
if flag == 1:
new_dict[key] = {}
new_dict[key] = dictory[key]
return new_dict
#制作图表
def draw_plot(dictory):
font = {'family': 'MicroSoft Yahei', 'weight': 'bold', 'size': 7}
matplotlib.rc('font', **font) #设置中文字体
# 定义三个颜色
index = numpy.arange(len(dictory.keys()))
color = [(256 / 256, 0 / 256, 0 / 256, 1),
(0 / 256, 0 / 256, 256 / 256, 1),
(0 / 256, 256 / 256, 0 / 256, 1),
(0 / 256, 0 / 256, 0 / 256, 1)]
first_key = list(dictory.keys())
first_key = first_key[0]
cols = list(dictory[first_key].keys())
data = []
for i in range(len(cols)):
data.append([])
for key in dictory.keys():
for col in range(len(cols)):
data[col].append(dictory[key][cols[col]])
offset = -1/4
for i in range(len(cols)):
plt.bar(index+offset, data[i], color=color[i], width=1 / 5) #通过bar函数可以用柱状图来表达一些变量的统计分布
offset += 1/4
plt.xticks(index, dictory.keys())#表示刻度
plt.legend(cols)#给图像加上图例
plt.show()
def show_data(dictory):
try:
keys = list(dictory[list(dictory.keys())[0]].keys())
except:
print("当前数据为空")
return
head = ['城市']
head.extend(keys)
table = PrettyTable(head)
for key in dictory.keys():
line = [key]
for key_2 in keys:
line.append(dictory[key][key_2])
table.add_row(line)
print(table)
def save(dictory):
name = input("请输入文件名(无需加后缀):")
book = xlwt.Workbook()
sheet = book.add_sheet('Sheet1', cell_overwrite_ok=True)
keys = list(data[list(data.keys())[0]].keys())
head = ["城市"]
head.extend(keys)
for h in range(len(head)):
sheet.write(0, h, head[h])
cities = list(dictory.keys())
for city in range(len(cities)):
sheet.write(city+1, 0, cities[city])
for key in range(len(keys)):
sheet.write(city+1, key+1, dictory[cities[city]][keys[key]])
book.save('./data/'+name+'.xls')
print("保存成功")
# 打印菜单
def menu():
print(" ----------Excel 数据分析师----------")
print("{:<30}".format(" ==============功能菜单============== "))
print("{:<30}".format(" 1. 显示当前数据 "))
print("{:<30}".format(" 2. 以柱状图展示当前数据 "))
print("{:<30}".format(" 3. 提起指定列 "))
print("{:<30}".format(" 4. 定向筛选指定元素 "))
print("{:<30}".format(" 5. 数据排行 "))
print("{:<30}".format(" 6. 重新加载数据 "))
print("{:<30}".format(" 7. 保存当前数据 "))
print("{:<30}".format(" 0. 退出程序 "))
print("{:<30}".format(" ==================================== "))
print("{:<30}".format(" 说明:输入相应数字后按下回车选择指定功能 "))
print('\n')
if __name__ == "__main__":
# 导入文件
files = get_files_name()
data = {}
print("当前data文件夹下的文件如下:")
num = 1
for file in files:
print(num, file)
num += 1
while(1):
index_str = input("请选择需要导入的文件序号(多个文件导入时用空格分开, 输入0则导入所有文件,输入多文件则自动合并):")
index_list = index_str.split(' ')
try:
index_list.remove('')
except:
pass
choice_file_list = []
if index_list[0] == '0':
choice_file_list = files
break
else:
try:
for item in index_list:
choice_file_list.append(files[int(item)-1])
except:
print("输入序号有误")
continue
if choice_file_list:
break
else:
print("输入序号有误")
data = load_data(choice_file_list)
print("导入数据成功\n")
# 调用函数,打印菜单
menu()
while 1:
choice = input("请选择指定功能:")
if choice == '0':
print("\n退出程序\n")
exit()
elif choice == '1':
print("当前功能:显示当前数据")
show_data(data)
input('\n按下回车返回菜单')
menu()
elif choice == '2':
print("当前功能:以柱状图显示数据")
draw_plot(data)
input('\n按下回车返回菜单')
menu()
elif choice == '3':
print("当前功能:筛选指定列")
keys = list(data[list(data.keys())[0]].keys())
print("当前表格中的列如下:")
num = 1
for key in keys:
print(num, key)
num += 1
choice_col_list = []
while (1):
index_str = input("请选择需要筛选出的列序号(多列之间用空格分开,0代表所有列):")
index_list = index_str.split(' ')
try:
index_list.remove('')
except:
pass
choice_file_list = []
if index_list[0] == '0':
choice_col_list = keys
break
else:
try:
for item in index_list:
choice_col_list.append(keys[int(item) - 1])
except:
print("输入序号有误")
continue
if choice_col_list:
break
else:
print("输入序号有误")
data = get_specified_cols(data, choice_col_list)
print("筛选成功")
input('\n按下回车返回菜单')
menu()
elif choice == '4':
print("当前功能:筛选指定行")
keys = list(data[list(data.keys())[0]].keys())
print("当前表格中的列如下:")
num = 1
print(num, "城市")
num += 1
for key in keys:
print(num, key)
num += 1
col = int(input("请输入需要进行筛选的数据所在的列:"))-2
if col == -1:
col = '城市'
else:
col = keys[col]
op_list = ['<', '<=', '=', '>=', '>']
print("比较操作符如下:")
num = 1
for op in op_list:
print(num, op)
num += 1
operation = int(input("请输入比较操作符前的序号:"))-1
operation = op_list[operation]
value = input("请输入需要筛选的值:")
data = get_specified_data(data, operation, col, value)
print("筛选成功")
input('\n按下回车返回菜单')
menu()
elif choice == '5':
print("当前功能:数据排序")
keys = list(data[list(data.keys())[0]].keys())
print("当前表格中的列如下:")
num = 1
for key in keys:
print(num, key) #显示当前表格中的所有的列
num += 1
col = int(input("请输入需要进行排序的数据所在的列:")) - 1
col = keys[col]
reverse = input("排序方式:\n1 从大到小排序\n2 从小到大排序\n")
if reverse == '1':
data = sort_data(data, col, True)
elif reverse == '2':
data = sort_data(data, col, False)
else:
print("输入有误")
input('\n按下回车返回菜单')
menu()
elif choice == '6':
# 导入文件
files = get_files_name()
data = {}
print("当前文件夹下的文件如下:")
num = 1
for file in files:
print(num, file)
num += 1
while (1):
index_str = input("请选择需要导入的文件序号(多个文件导入时用空格分开, 输入0则导入所有文件,输入多文件则自动合并):")
index_list = index_str.split(' ')
try:
index_list.remove('')
except:
pass
choice_file_list = []
if index_list[0] == '0':
choice_file_list = files
break
else:
try:
for item in index_list:
choice_file_list.append(files[int(item) - 1])
except:
print("输入序号有误")
continue
if choice_file_list:
break
else:
print("输入序号有误")
data = load_data(choice_file_list)
print("导入数据成功\n")
# 打印菜单
menu()
elif choice == '7':
print("当前功能:保存数据")
save(data)
input('\n按下回车返回菜单')
menu()
else:
print("请输入正确的数字")
input('\n按下回车返回菜单')
menu()
是怎么统计。这么漫无目的吗
要使用Python对Excel中的数据进行统计分析和预测,可以使用以下库:
以下是一个简单的Python程序,用于读取Excel文件并进行统计分析和预测:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# 读取Excel文件
df = pd.read_excel('data.xlsx')
# 数据统计分析
mean = np.mean(df['column_name'])
median = np.median(df['column_name'])
std = np.std(df['column_name'])
# 数据可视化
plt.hist(df['column_name'])
plt.show()
# 预测分析
X = df['column1'].values.reshape(-1, 1)
y = df['column2'].values.reshape(-1, 1)
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
plt.scatter(X, y)
plt.plot(X, y_pred, color='red')
plt.show()
在这个例子中,使用Pandas库读取Excel文件,并使用NumPy库计算数据的平均值,中位数和标准差。然后,我们使用Matplotlib库绘制直方图来可视化数据。最后,我们使用Scikit-learn库中的线性回归模型进行预测分析,并使用Matplotlib库绘制散点图和回归线。
和上面的答友一样,建议下次表达题意得时候,能够明确你得目的需求,大家才好针对性得帮助你,目前只能提供一个相似得优秀实例供你参考:利用Python对Excel数据进行处理,链接:https://blog.csdn.net/weixin_54500974/article/details/118557886
1.这三个文件用户名基本上都是重复的,怎么把三个合并在一起。需要保留什么?
2.你这个只有参会时间,你需要分析什么,或者这个文件能够分析什么?
3.结果用哪些图哪些表展示哪些分析数据?
你就发个excel,啥也不说,问作业也不是你这样的问啊