frame = DataFrame(data, columns=['year', 'state','debt']) print(frame) # year state debt # 0 2000 Ohio NaN # 1 2001 Ohio NaN # 2 2002 Ohio NaN # 3 2001 Nevada NaN # 4 2002 Nevada NaN # 5 2003 Nevada NaN
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four','five', 'six']) print(frame2) # year state pop debt # one 2000 Ohio 1.5 NaN # two 2001 Ohio 1.7 NaN # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 NaN # five 2002 Nevada 2.9 NaN # six 2003 Nevada 3.2 NaN
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four','five', 'six']) print(frame2) # year state pop debt # one 2000 Ohio 1.5 NaN # two 2001 Ohio 1.7 NaN # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 NaN # five 2002 Nevada 2.9 NaN # six 2003 Nevada 3.2 NaN
print(frame2.loc['three']) # year 2002 # state Ohio # pop 3.6 # debt NaN # Name: three, dtype: object
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four','five', 'six']) print(frame2) # year state pop debt # one 2000 Ohio 1.5 NaN # two 2001 Ohio 1.7 NaN # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 NaN # five 2002 Nevada 2.9 NaN # six 2003 Nevada 3.2 NaN
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four','five', 'six']) print(frame2) # year state pop debt # one 2000 Ohio 1.5 NaN # two 2001 Ohio 1.7 NaN # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 NaN # five 2002 Nevada 2.9 NaN # six 2003 Nevada 3.2 NaN
# 修改列 frame2['debt'] = 16.5 print(frame2) # year state pop debt # one 2000 Ohio 1.5 16.5 # two 2001 Ohio 1.7 16.5 # three 2002 Ohio 3.6 16.5 # four 2001 Nevada 2.4 16.5 # five 2002 Nevada 2.9 16.5 # six 2003 Nevada 3.2 16.5
frame2['debt'] = np.arange(6.) print(frame2) # year state pop debt # one 2000 Ohio 1.5 0.0 # two 2001 Ohio 1.7 1.0 # three 2002 Ohio 3.6 2.0 # four 2001 Nevada 2.4 3.0 # five 2002 Nevada 2.9 4.0 # six 2003 Nevada 3.2 5.0
# 修改行 aDict = {'year':9999,'state':'9999','pop':9999,'debt':9999} row = pd.Series(aDict) frame2.loc['one'] = row print(frame2) # year state pop debt # one 9999 9999 9999.0 9999.0 # two 2001 Ohio 1.7 1.0 # three 2002 Ohio 3.6 2.0 # four 2001 Nevada 2.4 3.0 # five 2002 Nevada 2.9 4.0 # six 2003 Nevada 3.2 5.0
# 使用index精确匹配某行. # 如果长度和DataFrame的⻓度不一致,那么所有的空行都将被填上缺失值 val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five']) frame2['debt'] = val print(frame2) # year state pop debt # one 9999 9999 9999.0 NaN # two 2001 Ohio 1.7 -1.2 # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 -1.5 # five 2002 Nevada 2.9 -1.7 # six 2003 Nevada 3.2 NaN
# 为不存在的列赋值会创建出一个新列 frame2['eastern'] = (frame2.state == 'Ohio') print(frame2) # year state pop debt eastern # one 9999 9999 9999.0 NaN False # two 2001 Ohio 1.7 -1.2 True # three 2002 Ohio 3.6 NaN True # four 2001 Nevada 2.4 -1.5 False # five 2002 Nevada 2.9 -1.7 False # six 2003 Nevada 3.2 NaN False
# 使用del方法删除列 del frame2['eastern'] print(frame2) # year state pop debt # one 9999 9999 9999.0 NaN # two 2001 Ohio 1.7 -1.2 # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 -1.5 # five 2002 Nevada 2.9 -1.7 # six 2003 Nevada 3.2 NaN
# 修改行索引 frame2.index = ['(1)', '(2)', '(3)', '(4)','(5)', '(6)'] print(frame2) # year state pop debt # (1) 9999 9999 9999.0 NaN # (2) 2001 Ohio 1.7 -1.2 # (3) 2002 Ohio 3.6 NaN # (4) 2001 Nevada 2.4 -1.5 # (5) 2002 Nevada 2.9 -1.7 # (6) 2003 Nevada 3.2 NaN
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four','five', 'six']) print(frame2) # year state pop debt # one 2000 Ohio 1.5 NaN # two 2001 Ohio 1.7 NaN # three 2002 Ohio 3.6 NaN # four 2001 Nevada 2.4 NaN # five 2002 Nevada 2.9 NaN # six 2003 Nevada 3.2 NaN
# 快速创建DataFrame的方式 frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California']) print(frame) # Ohio Texas California # a 0 1 2 # c 3 4 5 # d 6 7 8
frame2 = frame.reindex(['a', 'b', 'c', 'd']) print(frame2) # Ohio Texas California # a 0.0 1.0 2.0 # b NaN NaN NaN # c 3.0 4.0 5.0 # d 6.0 7.0 8.0
states = ['Texas', 'Utah', 'California'] frame3 = frame.reindex(columns=states) print(frame3) # Texas Utah California # a 1 NaN 2 # c 4 NaN 5 # d 7 NaN 8
print(frame) # b d e # Utah -1.176057 -0.783786 -0.543606 # Ohio 0.777264 1.151499 -1.428496 # Texas 0.036316 0.648344 -2.312486 # Oregon 1.286525 -1.263399 1.165449
deff(x): return pd.Series([x.min(), x.max()], index=['min', 'max']) print(frame.apply(f)) # b d e # min -1.176057 -1.263399 -2.312486 # max 1.286525 1.151499 1.165449
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c']) print(frame) # d a b c # three 0 1 2 3 # one 4 5 6 7
# 对索引进行排序,默认axis=0 print(frame.sort_index()) # d a b c # one 4 5 6 7 # three 0 1 2 3
print(frame.sort_index(axis=1)) # a b c d # three 1 2 3 0 # one 5 6 7 4
# 默认升序,可以使用ascending=False实现降序 print(frame.sort_index(axis=1, ascending=False)) # d c b a # three 0 3 2 1 # one 4 7 6 5
# 对于⾮数值型数据,describe会产⽣另外⼀种汇总统计: obj = pd.Series(['a', 'a', 'b', 'c'] * 3) print(obj) # 0 a # 1 a # 2 b # 3 c # 4 a # 5 a # 6 b # 7 c # 8 a # 9 a # 10 b # 11 c # dtype: object
print(obj.describe()) # count 12 # unique 3 # top a # freq 6 # dtype: object
import pandas as pd import numpy as np import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
# 获取DataFrame对象 price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()}) # 获取DataFrame对象 volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()})
data = pd.DataFrame({'Qu1': ['a','b','e','d','e'], 'Qu2': ['e','b','b','d','e'], 'Qu3': ['c','c','c','d','e']})
print(data) # Qu1 Qu2 Qu3 # 0 a e c # 1 b b c # 2 e b c # 3 d d d # 4 e e e
# 左侧为出现的全部元素,矩阵为出现次数 result1 = data.apply(pd.value_counts) print(result1) # Qu1 Qu2 Qu3 # a 1.0 NaN NaN # b 1.0 2.0 NaN # c NaN NaN 3.0 # d 1.0 1.0 1.0 # e 2.0 2.0 1.0
# 得到每一个元素在每一列中出现的次数,对有的列中没有出现的数值的频率为NA的值设置为0 result2 = data.apply(pd.value_counts).fillna(0) print(result2) # Qu1 Qu2 Qu3 # a 1.0 0.0 0.0 # b 1.0 2.0 0.0 # c 0.0 0.0 3.0 # d 1.0 1.0 1.0 # e 2.0 2.0 1.0