In [11]: df = pd.DataFrame({'cat': pd.Categorical(['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C']),
....: # literal values instead of unseeded np.random.randint, so the outputs shown are reproducible
....: 'v1': [79, 97, 81, 75, 43, 27, 47, 23, 53],
....: 'v2': [7, 5, 9, 3, 7, 8, 6, 9, 0]})
In [12]: df
Out[12]:
cat v1 v2
0 A 79 7
1 A 97 5
2 A 81 9
3 B 75 3
4 B 43 7
5 B 27 8
6 C 47 6
7 C 23 9
8 C 53 0
In [13]:
In [13]: # top n per group (with the ascending sort used here: the n rows with the smallest v1, ties broken by v2)
In [14]: # 1) sort by the "top n" column(s); ascending by default, pass ascending=False to rank the largest values first
In [15]: df.sort_values(['v1', 'v2'])
Out[15]:
cat v1 v2
7 C 23 9
5 B 27 8
4 B 43 7
6 C 47 6
8 C 53 0
3 B 75 3
0 A 79 7
2 A 81 9
1 A 97 5
In [16]: # 2) group by column of your choice
In [17]: # 3) select the top n of it; head(n) keeps the first n rows of each group, in the sorted row order
In [18]: df.sort_values(['v1', 'v2']).groupby('cat').head(2)
Out[18]:
cat v1 v2
7 C 23 9
5 B 27 8
4 B 43 7
6 C 47 6
0 A 79 7
2 A 81 9
In [19]: # 4) optionally sort the output nicely (by group first, then by the ranking columns)
In [20]: df.sort_values(['v1', 'v2']).groupby('cat').head(2).sort_values(['cat', 'v1', 'v2'])
Out[20]:
cat v1 v2
0 A 79 7
2 A 81 9
5 B 27 8
4 B 43 7
7 C 23 9
6 C 47 6
In [21]:
No comments:
Post a Comment