dataframe 分类排序问题2

我之前问过类似的问题,见链接: https://ask.csdn.net/questions/7409644

现在碰到更复杂的问题,我需要对dataframe按照类分类,每类按照该类data2的值最大的时候的data1值过滤该类,然后排序,请问该如何优化?

我把整数改成浮点数,更精确一点 ,代码如下:

import numpy as np
import pandas as pd

df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.rand(len(df))*100
df['data2'] = np.random.rand(len(df))*100

a = df[df['category'] == 'A']
c = a[a['data2'] == a.data2.max()].data1.max()
a = a[a['data1'] <= c]
a = a.sort_values(by='data2', ascending=False).head(4)

b = df[df['category'] == 'B']
c = b[b['data2'] == b.data2.max()].data1.max()
b = b[b['data1'] <= c]
b = b.sort_values(by='data2', ascending=False).head(4)

df = pd.concat([a, b]).sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)
print(df)

结果为:

  category      data1      data2
0        A  77.453241  98.628388
1        A  54.786469  97.470081
2        A  19.618200  96.261181
3        A   9.031004  97.067451
4        B  50.751809  99.219009
5        B  47.546003  96.488705
6        B  32.735357  98.565826
7        B  14.092039  95.359450

 

使用groupby方法。

import numpy as np
import pandas as pd
 
df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.randint(1, 100, len(df))
df['data2'] = np.random.randint(1, 100, len(df))
print(df)

rowIndex = pd.Series()
for name, group in df.groupby('category'):
	heading = group[group['data1']<group['data2'].max()].sort_values(by='data2', ascending=False).head(4)
	rowIndex = pd.concat([rowIndex, pd.Series(heading.index)])
out = df.loc[rowIndex]
print(out)
    category  data1  data2
71         A     23     99
115        A     58     96
167        A     87     95
174        A     12     95
119        B     12     99
135        B     12     98
117        B     90     98
88         B     19     97

 

貌似结果有点不一样

import numpy as np
import pandas as pd

df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.randint(1, 100, len(df))
df['data2'] = np.random.randint(1, 100, len(df))

a = df[df['category'] == 'A']
c = a[a['data2'] == a.data2.max()].data1.max()
a = a[a['data1'] <= c]
a = a.sort_values(by='data2', ascending=False).head(4)

b = df[df['category'] == 'B']
c = b[b['data2'] == b.data2.max()].data1.max()
b = b[b['data1'] <= c]
b = b.sort_values(by='data2', ascending=False).head(4)

df2 = pd.concat([a, b]).sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)

rowIndex = pd.Series()

for name, group in df.groupby('category'):
    heading = group[group['data1'] < group['data2'].max()].sort_values(by='data2', ascending=False).head(4)

    rowIndex = pd.concat([rowIndex, pd.Series(heading.index)])

out = df.loc[rowIndex].sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)


print((df2.data1-out.data1).max())
print((df2.data2-out.data2).max())

结果为:

0
1

而且还有warning

 DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  rowIndex = pd.Series()

 

最优方法:

import numpy as np
import pandas as pd

df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.rand(len(df)) * 100
df['data2'] = np.random.rand(len(df)) * 100

a = df[df['category'] == 'A']

c = a[a['data2'] == a.data2.max()].data1.max()
a = a[a['data1'] <= c]
a = a.sort_values(by='data1', ascending=False).head(4)

b = df[df['category'] == 'B']
c = b[b['data2'] == b.data2.max()].data1.max()
b = b[b['data1'] <= c]
b = b.sort_values(by='data1', ascending=False).head(4)

df2 = pd.concat([a, b]).sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)
s = (df.sort_values('data2').drop_duplicates('category', keep='last').set_index('category')['data1'])
df = df[df['data1'] <= df['category'].map(s)]
df3 = (df.sort_values(by=['category', 'data1'], ascending=[True, False])
       .groupby('category')
       .head(4)
       .reset_index(drop=True))
print((df2[['data1', 'data2']] - df3[['data1', 'data2']]).max())
print(df2)

结果

data1    0.0
data2    0.0
dtype: float64
  category      data1      data2
0        A  26.585410  97.971666
1        A  26.151227  75.417312
2        A  25.359553   7.111630
3        A  23.917786  63.567091
4        B  71.620900  98.862876
5        B  71.226793  50.661953
6        B  70.802247  31.320274
7        B  70.737850  36.903517