print(df) # key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 1 a two 0.188876 0.217776 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238 # 4 a one -0.652499 -1.099947
# 调用grouped对象的sum方法 print(grouped.sum()) # key1 # a -0.639824 # b 0.794300 # Name: data1, dtype: float64
# 调用grouped对象的count方法 print(grouped.count()) # key1 # a 3 # b 2 # Name: data1, dtype: int64
# ⼀次传⼊多个数组的列表,通过两个键对数据进⾏了分组 means = df['data1'].groupby([df['key1'], df['key2']]).mean() print(means) # key1 key2 # a one -0.414350 # two 0.188876 # b one 0.826747 # two -0.032447 # Name: data1, dtype: float64
print(means.unstack()) # key2 one two # key1 # a -0.414350 0.188876 # b 0.826747 -0.032447
# 还可以将列名(可以是字符串、数字或其他Python对象)⽤作分组键 # 因为key2是分类型变量,无法计算平均值,就直接忽略 print(df.groupby('key1').mean()) # data1 data2 # key1 # a -0.213275 -0.32917 # b 0.397150 0.34376
# 对比上面,分类型变量无法计算平均值,但是可以计算次数,所以这里保留了key2列 print(df.groupby('key1').count()) # key2 data1 data2 # key1 # a 3 3 3 # b 2 2 2
print(df.groupby(['key1', 'key2']).mean()) # data1 data2 # key1 key2 # a one -0.414350 -0.602643 # two 0.188876 0.217776 # b one 0.826747 0.587282 # two -0.032447 0.100238
# size方法返回⼀个含有分组⼤⼩的Series: print(df.groupby(['key1', 'key2']).size()) # key1 key2 # a one 2 # two 1 # b one 1 # two 1 # dtype: int64
print(df) # key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 1 a two 0.188876 0.217776 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238 # 4 a one -0.652499 -1.099947
# 返回 分组名:name 数据块group for name, group in df.groupby('key1'): print(name,'\n') print(group) # a
# key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 1 a two 0.188876 0.217776 # 4 a one -0.652499 -1.099947
# b
# key1 key2 data1 data2 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238
# 多重键的分组: # 返回 分组名:(k1, k2) 数据块group for (k1, k2), group in df.groupby(['key1', 'key2']): print((k1, k2),'\n') print(group) print('--------------')
# ('a', 'one')
# key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 4 a one -0.652499 -1.099947 # -------------- # ('a', 'two')
# key1 key2 data1 data2 # 1 a two 0.188876 0.217776 # -------------- # ('b', 'one')
# key1 key2 data1 data2 # 2 b one 0.826747 0.587282 # -------------- # ('b', 'two')
# key1 key2 data1 data2 # 3 b two -0.032447 0.100238 # --------------
print(list(df.groupby('key1'))) # [('a', key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 1 a two 0.188876 0.217776 # 4 a one -0.652499 -1.099947), ('b', key1 key2 data1 data2 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238)]
# 制作成字典,后续可用 pieces = dict(list(df.groupby('key1'))) print(pieces['b']) # key1 key2 data1 data2 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238
print(df) # key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 1 a two 0.188876 0.217776 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238 # 4 a one -0.652499 -1.099947
people.iloc[2, [1, 2]] = np.nan print(people) # a b c d e # Joe 0 1.0 2.0 3 4 # Steve 5 6.0 7.0 8 9 # Wes 10 NaN NaN 13 14 # Jim 15 16.0 17.0 18 19 # Travis 20 21.0 22.0 23 24
people.iloc[2, [1, 2]] = np.nan print(people) # a b c d e # Joe 0 1.0 2.0 3 4 # Steve 5 6.0 7.0 8 9 # Wes 10 NaN NaN 13 14 # Jim 15 16.0 17.0 18 19 # Travis 20 21.0 22.0 23 24
print(people.groupby(len).sum()) # a b c d e # 3 25 17.0 19.0 34 37 # 5 5 6.0 7.0 8 9 # 6 20 21.0 22.0 23 24
# 将函数跟数组、列表、字典、Series混合使⽤ key_list = ['one', 'one', 'one', 'two', 'two'] print(people.groupby([len, key_list]).min()) # a b c d e # 3 one 0 1.0 2.0 3 4 # two 15 16.0 17.0 18 19 # 5 one 5 6.0 7.0 8 9 # 6 two 20 21.0 22.0 23 24
print(df) # key1 key2 data1 data2 # 0 a one -0.176201 -0.105339 # 1 a two 0.188876 0.217776 # 2 b one 0.826747 0.587282 # 3 b two -0.032447 0.100238 # 4 a one -0.652499 -1.099947
grouped = df.groupby('key1')
# 调用Series或DataFrame的样本分位数聚合函数 print(grouped['data1'].quantile(0.9)) # key1 # a 0.115861 # b 0.740828 # Name: data1, dtype: float64
tips['tip_pct'] = tips['tip'] / tips['total_bill'] print(tips[:6]) # total_bill tip smoker day time size tip_pct # 0 16.99 1.01 No Sun Dinner 2 0.059447 # 1 10.34 1.66 No Sun Dinner 3 0.160542 # 2 21.01 3.50 No Sun Dinner 3 0.166587 # 3 23.68 3.31 No Sun Dinner 2 0.139780 # 4 24.59 3.61 No Sun Dinner 4 0.146808 # 5 25.29 4.71 No Sun Dinner 4 0.186240
grouped = tips.groupby(['day', 'smoker'])
grouped_pct = grouped['tip_pct']
print(grouped_pct.agg('mean')) # day smoker # Fri No 0.151650 # Yes 0.174783 # Sat No 0.158048 # Yes 0.147906 # Sun No 0.160113 # Yes 0.187250 # Thur No 0.160298 # Yes 0.163863 # Name: tip_pct, dtype: float64
tips['tip_pct'] = tips['tip'] / tips['total_bill'] print(tips[:6]) # total_bill tip smoker day time size tip_pct # 0 16.99 1.01 No Sun Dinner 2 0.059447 # 1 10.34 1.66 No Sun Dinner 3 0.160542 # 2 21.01 3.50 No Sun Dinner 3 0.166587 # 3 23.68 3.31 No Sun Dinner 2 0.139780 # 4 24.59 3.61 No Sun Dinner 4 0.146808 # 5 25.29 4.71 No Sun Dinner 4 0.186240
print(df) # category data weights # 0 a -0.176201 0.132953 # 1 a 0.188876 0.533449 # 2 a 0.826747 0.899478 # 3 a -0.032447 0.248365 # 4 b -0.652499 0.030172 # 5 b -0.105339 0.072447 # 6 b 0.217776 0.874164 # 7 b 0.587282 0.558430
tips['tip_pct'] = tips['tip'] / tips['total_bill'] print(tips[:6]) # total_bill tip smoker day time size tip_pct # 0 16.99 1.01 No Sun Dinner 2 0.059447 # 1 10.34 1.66 No Sun Dinner 3 0.160542 # 2 21.01 3.50 No Sun Dinner 3 0.166587 # 3 23.68 3.31 No Sun Dinner 2 0.139780 # 4 24.59 3.61 No Sun Dinner 4 0.146808 # 5 25.29 4.71 No Sun Dinner 4 0.186240
# 根据day和smoker计算分组平均数 # 将day和smoker放到⾏,单元格的值为各个分组的平均数 print(tips.pivot_table(index=['day', 'smoker'])) # size tip total_bill # day smoker # Fri No 2.250000 2.812500 18.420000 # Yes 2.066667 2.714000 16.813333 # Sat No 2.555556 3.102889 19.661778 # Yes 2.476190 2.875476 21.276667 # Sun No 2.929825 3.167895 20.506667 # Yes 2.578947 3.516842 24.120000 # Thur No 2.488889 2.673778 17.113111 # Yes 2.352941 3.030000 19.190588
# 根据time和day进⾏分组,聚合tip_pct和size # 将smoker放到列上,把day放到⾏上: print(tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],columns='smoker')) # size tip_pct # smoker No Yes No Yes # time day # Dinner Fri 2.000000 2.222222 0.139622 0.165347 # Sat 2.555556 2.476190 0.158048 0.147906 # Sun 2.929825 2.578947 0.160113 0.187250 # Thur 2.000000 NaN 0.159744 NaN # Lunch Fri 3.000000 1.833333 0.187735 0.188937 # Thur 2.500000 2.352941 0.160311 0.163863
# margins:添加标签为All的⾏和列 print(tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],columns='smoker', margins=True)) # size tip_pct # smoker No Yes All No Yes All # time day # Dinner Fri 2.000000 2.222222 2.166667 0.139622 0.165347 0.158916 # Sat 2.555556 2.476190 2.517241 0.158048 0.147906 0.153152 # Sun 2.929825 2.578947 2.842105 0.160113 0.187250 0.166897 # Thur 2.000000 NaN 2.000000 0.159744 NaN 0.159744 # Lunch Fri 3.000000 1.833333 2.000000 0.187735 0.188937 0.188765 # Thur 2.500000 2.352941 2.459016 0.160311 0.163863 0.161301 # All 2.668874 2.408602 2.569672 0.159328 0.163196 0.160803
# 传入len,得到有关分组大小的交叉表 print(tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',aggfunc=len, margins=True)) # day Fri Sat Sun Thur All # time smoker # Dinner No 3.0 45.0 57.0 1.0 106.0 # Yes 9.0 42.0 19.0 NaN 70.0 # Lunch No 1.0 NaN NaN 44.0 45.0 # Yes 6.0 NaN NaN 17.0 23.0 # All 19.0 87.0 76.0 62.0 244.0
# fill_value:替换掉NA print(tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],columns='day', aggfunc='mean', fill_value=0)) # day Fri Sat Sun Thur # time size smoker # Dinner 1 No 0.000000 0.137931 0.000000 0.000000 # Yes 0.000000 0.325733 0.000000 0.000000 # 2 No 0.139622 0.162705 0.168859 0.159744 # Yes 0.171297 0.148668 0.207893 0.000000 # 3 No 0.000000 0.154661 0.152663 0.000000 # Yes 0.000000 0.144995 0.152660 0.000000 # 4 No 0.000000 0.150096 0.148143 0.000000 # Yes 0.117750 0.124515 0.193370 0.000000 # 5 No 0.000000 0.000000 0.206928 0.000000 # Yes 0.000000 0.106572 0.065660 0.000000 # 6 No 0.000000 0.000000 0.103799 0.000000 # Lunch 1 No 0.000000 0.000000 0.000000 0.181728 # Yes 0.223776 0.000000 0.000000 0.000000 # 2 No 0.000000 0.000000 0.000000 0.166005 # Yes 0.181969 0.000000 0.000000 0.158843 # 3 No 0.187735 0.000000 0.000000 0.084246 # Yes 0.000000 0.000000 0.000000 0.204952 # 4 No 0.000000 0.000000 0.000000 0.138919 # Yes 0.000000 0.000000 0.000000 0.155410 # 5 No 0.000000 0.000000 0.000000 0.121389 # 6 No 0.000000 0.000000 0.000000 0.173706
import numpy as np import pandas as pd from io import StringIO
data = """\ Sample Nationality Handedness 1 USA Right-handed 2 Japan Left-handed 3 USA Right-handed 4 Japan Right-handed 5 Japan Left-handed 6 Japan Right-handed 7 USA Right-handed 8 USA Left-handed 9 Japan Right-handed 10 USA Right-handed"""
data = pd.read_table(StringIO(data), sep='\s+') print(data) # Sample Nationality Handedness # 0 1 USA Right-handed # 1 2 Japan Left-handed # 2 3 USA Right-handed # 3 4 Japan Right-handed # 4 5 Japan Left-handed # 5 6 Japan Right-handed # 6 7 USA Right-handed # 7 8 USA Left-handed # 8 9 Japan Right-handed # 9 10 USA Right-handed
# 根据Nationality分组,对Handedness进行频率统计 print(pd.crosstab(data.Nationality, data.Handedness, margins=True)) # Handedness Left-handed Right-handed All # Nationality # Japan 2 3 5 # USA 1 4 5 # All 3 7 10