pandas 转化至 三维数组

如何用python的pandas将以下的代码建立三维数组以方面提取不同维度的信息?用两个for loop运行太慢了,可不可以将行业名称,时间,及相关行业信息直接整合到一个dataframe中?感谢!

IndustryTotal=w.wset("sectorconstituent","date=2023-05-25;sectorid=a39901012d000000",usedf=True)[1]#########Wind行业########
secname=IndustryTotal.wind_code ##所有行业

Time=['2010-03-31','2010-06-30','2010-09-30','2010-12-31',
      '2011-03-31','2011-06-30','2011-09-30','2011-12-31',
      '2012-03-31','2012-06-30','2012-09-30','2012-12-31',
      '2013-03-31','2013-06-30','2013-09-30','2013-12-31',
      '2014-03-31','2014-06-30','2014-09-30','2014-12-31',
      '2015-03-31','2015-06-30','2015-09-30','2015-12-31',
      '2016-03-31','2016-06-30','2016-09-30','2016-12-31',
      '2017-03-31','2017-06-30','2017-09-30','2017-12-31',
      '2018-03-31','2018-06-30','2018-09-30','2018-12-31',
      '2019-03-31','2019-06-30','2019-09-30','2019-12-31',
      '2020-03-31','2020-06-30','2020-09-30','2020-12-31',
      '2021-03-31','2021-06-30','2021-09-30','2021-12-31',
      '2022-03-31','2022-06-30','2022-09-30','2022-12-31',
      '2023-03-31'] ###时间维度

SectorInfo=pd.DataFrame(columns=('Date','Sector','Capital_Cost','Operate_Income','Profitmargin')) ##行业空数据集
SectorInfoS=pd.DataFrame(columns=('Date','Sector','Capital_Cost','Operate_Income','Profitmargin'))##行业下一时间点空数据集
SectorPercent=pd.DataFrame(columns=('Date','Sector','Capital_Cost','Operate_Income','Profitmargin'))##行业经处理后数据集

    
for i in range(len(Time)-1):###时间维度
    print(Time[i])
    for j in range(len(secname)): ##行业维度
        res=w.wss(secname[j],"cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin","unit=1;rptDate={};rptType=1".format(Time[i]),usedf=True)[1]
        resS=w.wss(secname[j],"cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin","unit=1;rptDate={};rptType=1".format(Time[i+1]),usedf=True)[1]
        SectorInfo=SectorInfo.append(pd.DataFrame({'Date':[Time[i]],'Sector':[secname[j]],'Capital_Cost':[res['CASH_PAY_ACQ_CONST_FIOLTA'][0]],'Operate_Income':[res['OPER_REV'][0]],'Profitmargin':[res['GROSSPROFITMARGIN'][0]]}),ignore_index=True)
        SectorInfoS=SectorInfoS.append(pd.DataFrame({'Date':[Time[i+1]],'Sector':[secname[j]],'Capital_Cost':[resS['CASH_PAY_ACQ_CONST_FIOLTA'][0]],'Operate_Income':[resS['OPER_REV'][0]],'Profitmargin':[resS['GROSSPROFITMARGIN'][0]]}),ignore_index=True)
#     print(SectorInfo)
    SectorInfoS['Capital_Cost%']=(SectorInfoS['Capital_Cost']-SectorInfo['Capital_Cost'])/SectorInfo['Capital_Cost']
    SectorInfoS['Operate_Income%']=(SectorInfoS['Operate_Income']-SectorInfo['Operate_Income'])/SectorInfo['Operate_Income']
    SectorInfoS['Profitmargin%']=(SectorInfoS['Profitmargin']-SectorInfo['Profitmargin'])/SectorInfo['Profitmargin']
    CapitalPercent=stats.percentileofscore(SectorInfoS['Capital_Cost%'].tolist(),SectorInfoS[SectorInfoS['Sector']=='886017.WI']['Capital_Cost%'].iloc[0]) ##分位数计算
    OperatePercent=stats.percentileofscore(SectorInfoS['Operate_Income%'].tolist(),SectorInfoS[SectorInfoS['Sector']=='886017.WI']['Operate_Income%'].iloc[0]) ##分位数计算
    ProfitPercent=stats.percentileofscore(SectorInfoS['Profitmargin%'].tolist(),SectorInfoS[SectorInfoS['Sector']=='886017.WI']['Profitmargin%'].iloc[0]) ##分位数计算
    SectorPercent=SectorPercent.append(pd.DataFrame({'Date':[Time[i+1]],'Sector':['886017.WI'],'Capital_Cost':[CapitalPercent],'Operate_Income':[OperatePercent],'Profitmargin':[ProfitPercent]}),ignore_index=True)

SectorPercent

引用chatgpt部分指引作答:
要将行业名称、时间和相关行业信息整合到一个DataFrame中,可以使用pandas的MultiIndex功能来创建一个三维索引。以下是修改后的代码示例:

import pandas as pd

IndustryTotal = w.wset("sectorconstituent", "date=2023-05-25;sectorid=a39901012d000000", usedf=True)[1]
secname = IndustryTotal.wind_code  # 所有行业

Time = [
    '2010-03-31', '2010-06-30', '2010-09-30', '2010-12-31',
    '2011-03-31', '2011-06-30', '2011-09-30', '2011-12-31',
    '2012-03-31', '2012-06-30', '2012-09-30', '2012-12-31',
    '2013-03-31', '2013-06-30', '2013-09-30', '2013-12-31',
    '2014-03-31', '2014-06-30', '2014-09-30', '2014-12-31',
    '2015-03-31', '2015-06-30', '2015-09-30', '2015-12-31',
    '2016-03-31', '2016-06-30', '2016-09-30', '2016-12-31',
    '2017-03-31', '2017-06-30', '2017-09-30', '2017-12-31',
    '2018-03-31', '2018-06-30', '2018-09-30', '2018-12-31',
    '2019-03-31', '2019-06-30', '2019-09-30', '2019-12-31',
    '2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',
    '2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
    '2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31',
    '2023-03-31'
]

SectorInfo = pd.DataFrame(columns=('Date', 'Sector', 'Capital_Cost', 'Operate_Income', 'Profitmargin'))
SectorInfoS = pd.DataFrame(columns=('Date', 'Sector', 'Capital_Cost', 'Operate_Income', 'Profitmargin'))
SectorPercent = pd.DataFrame(columns=('Date', 'Sector', 'Capital_Cost', 'Operate_Income', 'Profitmargin'))

for i in range(len(Time) - 1):
    print(Time[i])
    for j in range(len(secname)):
        res = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin",
                    "unit=1;rptDate={};rptType=1".format(Time[i]), usedf=True)[1]
        resS = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin",
                     "unit=1;rptDate={};rptType=1".format(Time[i + 1]), usedf=True)[1]
        SectorInfo = SectorInfo.append(pd.DataFrame({'Date': [Time[i]], 'Sector': [secname[j]],
                                                     'Capital_Cost': [res['CASH_PAY_ACQ_CONST_FIOLTA'][0]],
                                                     'Operate_Income': [res['OPER_REV'][0]],
                                                     'Profitmargin': [res['GROSSPROFITMARGIN'][0]]}),
                                       ignore_index=True)
        SectorInfoS = SectorInfoS.append(pd.DataFrame({'Date': [Time[i + 1]], 'Sector': [secname[j]],
                                                       'Capital_Cost': [resS['CASH_PAY_ACQ_CONST_FIOLTA'][0]],
                                                       'Operate_Income': [resS['OPER_REV'][0]],
                                                       'Profitmargin': [resS['GROSSPROFITMARGIN'][0]]}),
                                         ignore_index=True)

    SectorInfoS['Capital_Cost%'] = (SectorInfoS['Capital_Cost'] - SectorInfo['Capital_Cost']) / SectorInfo[
        'Capital_Cost']
    SectorInfoS['Operate_Income%'] = (SectorInfoS['Operate_Income'] - SectorInfo['Operate_Income']) / SectorInfo[
        'Operate_Income']
    SectorInfoS['Profitmargin%'] = (SectorInfoS['Profitmargin'] - SectorInfo['Profitmargin']) / SectorInfo[
        'Profitmargin']
    CapitalPercent = stats.percentileofscore(SectorInfoS['Capital_Cost%'].tolist(),
                                             SectorInfoS[SectorInfoS['Sector'] == '886017.WI']['Capital_Cost%'].iloc[
                                                 0])
    OperatePercent = stats.percentileofscore(SectorInfoS['Operate_Income%'].tolist(),
                                             SectorInfoS[SectorInfoS['Sector'] == '886017.WI'][
                                                 'Operate_Income%'].iloc[0])
    ProfitPercent = stats.percentileofscore(SectorInfoS['Profitmargin%'].tolist(),
                                            SectorInfoS[SectorInfoS['Sector'] == '886017.WI']['Profitmargin%'].iloc[
                                                0])
    SectorPercent = SectorPercent.append(
        pd.DataFrame({'Date': [Time[i + 1]], 'Sector': ['886017.WI'], 'Capital_Cost': [CapitalPercent],
                      'Operate_Income': [OperatePercent], 'Profitmargin': [ProfitPercent]}), ignore_index=True)

# 创建三维索引
index = pd.MultiIndex.from_arrays([SectorInfo['Date'], SectorInfo['Sector']], names=['Date', 'Sector'])
SectorInfo.set_index(index, inplace=True)

# 将SectorInfoS数据合并到SectorInfoSectorInfo = SectorInfo.merge(SectorInfoS.drop(columns=['Date', 'Sector']),
                              how='left',
                              left_index=True,
                              right_on=['Date', 'Sector'],
                              suffixes=('', '_next'))

# 将SectorPercent数据合并到SectorInfoSectorInfo = SectorInfo.merge(SectorPercent.drop(columns='Sector'),
                              how='left',
                              left_on=['Date', 'Sector'],
                              right_on=['Date', 'Sector'],
                              suffixes=('', '_percent'))

# 重新设置索引为默认索引
SectorInfo.reset_index(drop=True, inplace=True)

print(SectorInfo)

这段代码将行业名称、时间和相关行业信息整合到了一个名为SectorInfo的DataFrame中。使用MultiIndex作为索引,其中Date和Sector构成了一个二维索引,方便根据不同维度提取信息。另外,将SectorInfoS和SectorPercent的数据合并到SectorInfo中,通过左连接关联到相应的日期和行业。最后,重新设置索引为默认索引。

上述代码是基于给出的代码进行修改的,并没有运行测试,因此可能需要根据实际情况进行调整和调试。

可以使用pandasMultiIndex功能来建立三维数组,其中第一维为行业名称,第二维为时间,第三维为相关行业信息。具体实现如下:

首先,将行业名称和时间作为索引,创建一个空的DataFrame

import pandas as pd

IndustryTotal = w.wset("sectorconstituent", "date=2023-05-25;sectorid=a39901012d000000", usedf=True)[1]
secname = IndustryTotal.wind_code
Time = ['2010-03-31', '2010-06-30', '2010-09-30', '2010-12-31',
        '2011-03-31', '2011-06-30', '2011-09-30', '2011-12-31',
        '2012-03-31', '2012-06-30', '2012-09-30', '2012-12-31',
        '2013-03-31', '2013-06-30', '2013-09-30', '2013-12-31',
        '2014-03-31', '2014-06-30', '2014-09-30', '2014-12-31',
        '2015-03-31', '2015-06-30', '2015-09-30', '2015-12-31',
        '2016-03-31', '2016-06-30', '2016-09-30', '2016-12-31',
        '2017-03-31', '2017-06-30', '2017-09-30', '2017-12-31',
        '2018-03-31', '2018-06-30', '2018-09-30', '2018-12-31',
        '2019-03-31', '2019-06-30', '2019-09-30', '2019-12-31',
        '2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',
        '2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
        '2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31',
        '2023-03-31']
SectorInfo = pd.DataFrame(index=pd.MultiIndex.from_product([secname, Time]), columns=['Capital_Cost', 'Operate_Income', 'Profitmargin'])

然后,使用两个嵌套的for循环,遍历所有行业和时间,将相关行业信息填充到DataFrame中:

for i in range(len(Time)-1):
    print(Time[i])
    for j in range(len(secname)):
        res = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin", "unit=1;rptDate={};rptType=1".format(Time[i]), usedf=True)[1]
        resS = w.wss(secname[j], "cash_pay_acq_const_fiolta,oper_rev,grossprofitmargin", "unit=1;rptDate={};rptType=1".format(Time[i+1]), usedf=True)[1]
        SectorInfo.loc[(secname[j], Time[i]), 'Capital_Cost'] = res['CASH_PAY_ACQ_CONST_FIOLTA'][0]
        SectorInfo.loc[(secname[j], Time[i]), 'Operate_Income'] = res['OPER_REV'][0]
        SectorInfo.loc[(secname[j], Time[i]), 'Profitmargin'] = res['GROSSPROFITMARGIN'][0]
        SectorInfo.loc[(secname[j], Time[i+1]), 'Capital_Cost'] = resS['CASH_PAY_ACQ_CONST_FIOLTA'][0]
        SectorInfo.loc[(secname[j], Time[i+1]), 'Operate_Income'] = resS['OPER_REV'][0]
        SectorInfo.loc[(secname[j], Time[i+1]), 'Profitmargin'] = resS['GROSSPROFITMARGIN'][0]

这样就可以通过SectorInfo.loc[(sector, time), column]的方式来提取不同维度的信息了。

开启多个线程跑下试试pool = Pool(5) # 设置5个进程