如何用python的pandas将以下的代码建立三维数组以方面提取不同维度的信息?用两个for loop运行太慢了,可不可以将行业名称,时间,及相关行业信息直接整合到一个dataframe中?感谢!
IndustryTotal=w.wset("sectorconstituent","date=2023-05-25;sectorid=a39901012d000000",usedf=True)[1]#########Wind行业########
secname=IndustryTotal.wind_code ##所有行业
Time=['2010-03-31','2010-06-30','2010-09-30','2010-12-31',
'2011-03-31','2011-06-30','2011-09-30','2011-12-31',
'2012-03-31','2012-06-30','2012-09-30','2012-12-31',
'2013-03-31','2013-06-30','2013-09-30','2013-12-31',
'2014-03-31','2014-06-30','2014-09-30','2014-12-31',
'2015-03-31','2015-06-30','2015-09-30','2015-12-31',
'2016-03-31','2016-06-30','2016-09-30','2016-12-31',
'2017-03-31','2017-06-30','2017-09-30','2017-12-31',
'2018-03-31','2018-06-30','2018-09-30','2018-12-31',
'2019-03-31','2019-06-30','2019-09-30','2019-12-31',
'2020-03-31','2020-06-30','2020-09-30','2020-12-31',
'2021-03-31','2021-06-30','2021-09-30','2021-12-31',
'2022-03-31','2022-06-30','2022-09-30','2022-12-31',
'2023-03-31'] ###时间维度
SectorInfo=pd.DataFrame(columns=('Date','Sector','Capital_Cost','Operate_Income','Profitmargin')) ##行业空数据集
SectorInfoS=pd.DataFrame(columns=('Date','Sector','Capital_Cost','Operate_Income','Profitmargin'))##行业下一时间点空数据集
SectorPercent=pd.DataFrame(columns=('Date','Sector','Capital_Cost','Operate_Income','Profitmargin'))##行业经处理后数据集
for i in range(len(Time)-1):###时间维度
print(Time[i])
for j in range(len(secname)): ##行业维度
res=w.wss(secname[j],"cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin","unit=1;rptDate={};rptType=1".format(Time[i]),usedf=True)[1]
resS=w.wss(secname[j],"cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin","unit=1;rptDate={};rptType=1".format(Time[i+1]),usedf=True)[1]
SectorInfo=SectorInfo.append(pd.DataFrame({'Date':[Time[i]],'Sector':[secname[j]],'Capital_Cost':[res['CASH_PAY_ACQ_CONST_FIOLTA'][0]],'Operate_Income':[res['OPER_REV'][0]],'Profitmargin':[res['GROSSPROFITMARGIN'][0]]}),ignore_index=True)
SectorInfoS=SectorInfoS.append(pd.DataFrame({'Date':[Time[i+1]],'Sector':[secname[j]],'Capital_Cost':[resS['CASH_PAY_ACQ_CONST_FIOLTA'][0]],'Operate_Income':[resS['OPER_REV'][0]],'Profitmargin':[resS['GROSSPROFITMARGIN'][0]]}),ignore_index=True)
# print(SectorInfo)
SectorInfoS['Capital_Cost%']=(SectorInfoS['Capital_Cost']-SectorInfo['Capital_Cost'])/SectorInfo['Capital_Cost']
SectorInfoS['Operate_Income%']=(SectorInfoS['Operate_Income']-SectorInfo['Operate_Income'])/SectorInfo['Operate_Income']
SectorInfoS['Profitmargin%']=(SectorInfoS['Profitmargin']-SectorInfo['Profitmargin'])/SectorInfo['Profitmargin']
CapitalPercent=stats.percentileofscore(SectorInfoS['Capital_Cost%'].tolist(),SectorInfoS[SectorInfoS['Sector']=='886017.WI']['Capital_Cost%'].iloc[0]) ##分位数计算
OperatePercent=stats.percentileofscore(SectorInfoS['Operate_Income%'].tolist(),SectorInfoS[SectorInfoS['Sector']=='886017.WI']['Operate_Income%'].iloc[0]) ##分位数计算
ProfitPercent=stats.percentileofscore(SectorInfoS['Profitmargin%'].tolist(),SectorInfoS[SectorInfoS['Sector']=='886017.WI']['Profitmargin%'].iloc[0]) ##分位数计算
SectorPercent=SectorPercent.append(pd.DataFrame({'Date':[Time[i+1]],'Sector':['886017.WI'],'Capital_Cost':[CapitalPercent],'Operate_Income':[OperatePercent],'Profitmargin':[ProfitPercent]}),ignore_index=True)
SectorPercent
引用chatgpt部分指引作答:
要将行业名称、时间和相关行业信息整合到一个DataFrame中,可以使用pandas的MultiIndex功能来创建一个三维索引。以下是修改后的代码示例:
import pandas as pd
IndustryTotal = w.wset("sectorconstituent", "date=2023-05-25;sectorid=a39901012d000000", usedf=True)[1]
secname = IndustryTotal.wind_code # 所有行业
Time = [
'2010-03-31', '2010-06-30', '2010-09-30', '2010-12-31',
'2011-03-31', '2011-06-30', '2011-09-30', '2011-12-31',
'2012-03-31', '2012-06-30', '2012-09-30', '2012-12-31',
'2013-03-31', '2013-06-30', '2013-09-30', '2013-12-31',
'2014-03-31', '2014-06-30', '2014-09-30', '2014-12-31',
'2015-03-31', '2015-06-30', '2015-09-30', '2015-12-31',
'2016-03-31', '2016-06-30', '2016-09-30', '2016-12-31',
'2017-03-31', '2017-06-30', '2017-09-30', '2017-12-31',
'2018-03-31', '2018-06-30', '2018-09-30', '2018-12-31',
'2019-03-31', '2019-06-30', '2019-09-30', '2019-12-31',
'2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',
'2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
'2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31',
'2023-03-31'
]
SectorInfo = pd.DataFrame(columns=('Date', 'Sector', 'Capital_Cost', 'Operate_Income', 'Profitmargin'))
SectorInfoS = pd.DataFrame(columns=('Date', 'Sector', 'Capital_Cost', 'Operate_Income', 'Profitmargin'))
SectorPercent = pd.DataFrame(columns=('Date', 'Sector', 'Capital_Cost', 'Operate_Income', 'Profitmargin'))
for i in range(len(Time) - 1):
print(Time[i])
for j in range(len(secname)):
res = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin",
"unit=1;rptDate={};rptType=1".format(Time[i]), usedf=True)[1]
resS = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin",
"unit=1;rptDate={};rptType=1".format(Time[i + 1]), usedf=True)[1]
SectorInfo = SectorInfo.append(pd.DataFrame({'Date': [Time[i]], 'Sector': [secname[j]],
'Capital_Cost': [res['CASH_PAY_ACQ_CONST_FIOLTA'][0]],
'Operate_Income': [res['OPER_REV'][0]],
'Profitmargin': [res['GROSSPROFITMARGIN'][0]]}),
ignore_index=True)
SectorInfoS = SectorInfoS.append(pd.DataFrame({'Date': [Time[i + 1]], 'Sector': [secname[j]],
'Capital_Cost': [resS['CASH_PAY_ACQ_CONST_FIOLTA'][0]],
'Operate_Income': [resS['OPER_REV'][0]],
'Profitmargin': [resS['GROSSPROFITMARGIN'][0]]}),
ignore_index=True)
SectorInfoS['Capital_Cost%'] = (SectorInfoS['Capital_Cost'] - SectorInfo['Capital_Cost']) / SectorInfo[
'Capital_Cost']
SectorInfoS['Operate_Income%'] = (SectorInfoS['Operate_Income'] - SectorInfo['Operate_Income']) / SectorInfo[
'Operate_Income']
SectorInfoS['Profitmargin%'] = (SectorInfoS['Profitmargin'] - SectorInfo['Profitmargin']) / SectorInfo[
'Profitmargin']
CapitalPercent = stats.percentileofscore(SectorInfoS['Capital_Cost%'].tolist(),
SectorInfoS[SectorInfoS['Sector'] == '886017.WI']['Capital_Cost%'].iloc[
0])
OperatePercent = stats.percentileofscore(SectorInfoS['Operate_Income%'].tolist(),
SectorInfoS[SectorInfoS['Sector'] == '886017.WI'][
'Operate_Income%'].iloc[0])
ProfitPercent = stats.percentileofscore(SectorInfoS['Profitmargin%'].tolist(),
SectorInfoS[SectorInfoS['Sector'] == '886017.WI']['Profitmargin%'].iloc[
0])
SectorPercent = SectorPercent.append(
pd.DataFrame({'Date': [Time[i + 1]], 'Sector': ['886017.WI'], 'Capital_Cost': [CapitalPercent],
'Operate_Income': [OperatePercent], 'Profitmargin': [ProfitPercent]}), ignore_index=True)
# 创建三维索引
index = pd.MultiIndex.from_arrays([SectorInfo['Date'], SectorInfo['Sector']], names=['Date', 'Sector'])
SectorInfo.set_index(index, inplace=True)
# 将SectorInfoS数据合并到SectorInfo中
SectorInfo = SectorInfo.merge(SectorInfoS.drop(columns=['Date', 'Sector']),
how='left',
left_index=True,
right_on=['Date', 'Sector'],
suffixes=('', '_next'))
# 将SectorPercent数据合并到SectorInfo中
SectorInfo = SectorInfo.merge(SectorPercent.drop(columns='Sector'),
how='left',
left_on=['Date', 'Sector'],
right_on=['Date', 'Sector'],
suffixes=('', '_percent'))
# 重新设置索引为默认索引
SectorInfo.reset_index(drop=True, inplace=True)
print(SectorInfo)
这段代码将行业名称、时间和相关行业信息整合到了一个名为SectorInfo的DataFrame中。使用MultiIndex作为索引,其中Date和Sector构成了一个二维索引,方便根据不同维度提取信息。另外,将SectorInfoS和SectorPercent的数据合并到SectorInfo中,通过左连接关联到相应的日期和行业。最后,重新设置索引为默认索引。
上述代码是基于给出的代码进行修改的,并没有运行测试,因此可能需要根据实际情况进行调整和调试。
可以使用pandas
的MultiIndex
功能来建立三维数组,其中第一维为行业名称,第二维为时间,第三维为相关行业信息。具体实现如下:
首先,将行业名称和时间作为索引,创建一个空的DataFrame
:
import pandas as pd
IndustryTotal = w.wset("sectorconstituent", "date=2023-05-25;sectorid=a39901012d000000", usedf=True)[1]
secname = IndustryTotal.wind_code
Time = ['2010-03-31', '2010-06-30', '2010-09-30', '2010-12-31',
'2011-03-31', '2011-06-30', '2011-09-30', '2011-12-31',
'2012-03-31', '2012-06-30', '2012-09-30', '2012-12-31',
'2013-03-31', '2013-06-30', '2013-09-30', '2013-12-31',
'2014-03-31', '2014-06-30', '2014-09-30', '2014-12-31',
'2015-03-31', '2015-06-30', '2015-09-30', '2015-12-31',
'2016-03-31', '2016-06-30', '2016-09-30', '2016-12-31',
'2017-03-31', '2017-06-30', '2017-09-30', '2017-12-31',
'2018-03-31', '2018-06-30', '2018-09-30', '2018-12-31',
'2019-03-31', '2019-06-30', '2019-09-30', '2019-12-31',
'2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',
'2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
'2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31',
'2023-03-31']
SectorInfo = pd.DataFrame(index=pd.MultiIndex.from_product([secname, Time]), columns=['Capital_Cost', 'Operate_Income', 'Profitmargin'])
然后,使用两个嵌套的for
循环,遍历所有行业和时间,将相关行业信息填充到DataFrame
中:
for i in range(len(Time)-1):
print(Time[i])
for j in range(len(secname)):
res = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin", "unit=1;rptDate={};rptType=1".format(Time[i]), usedf=True)[1]
resS = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin", "unit=1;rptDate={};rptType=1".format(Time[i+1]), usedf=True)[1]
SectorInfo.loc[(secname[j], Time[i]), 'Capital_Cost'] = res['CASH_PAY_ACQ_CONST_FIOLTA'][0]
SectorInfo.loc[(secname[j], Time[i]), 'Operate_Income'] = res['OPER_REV'][0]
SectorInfo.loc[(secname[j], Time[i]), 'Profitmargin'] = res['GROSSPROFITMARGIN'][0]
SectorInfo.loc[(secname[j], Time[i+1]), 'Capital_Cost'] = resS['CASH_PAY_ACQ_CONST_FIOLTA'][0]
SectorInfo.loc[(secname[j], Time[i+1]), 'Operate_Income'] = resS['OPER_REV'][0]
SectorInfo.loc[(secname[j], Time[i+1]), 'Profitmargin'] = resS['GROSSPROFITMARGIN'][0]
这样就可以通过SectorInfo.loc[(sector, time), column]
的方式来提取不同维度的信息了。
开启多个线程跑下试试pool = Pool(5) # 设置5个进程