In [181]:
%matplotlib inline
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [182]:
# Base series of ten yearly counts; every other column is derived from it
# arithmetically, so the columns are exact functions of one another.
words_freq = np.array([200, 300, 400, 350, 390, 600, 900, 400, 300, 120])
freq_dict = {
    'boy': words_freq,
    'girl': words_freq * 2,
    'children': words_freq + 100,
    'child': words_freq + 300,
}
# Presumably the total number of words per year (inferred from the name);
# it is not used by any cell visible in this part of the notebook.
total_words_freq = [12345000, 23456000, 22333000, 45632000, 11144000,
                    65433000, 44444000, 55555000, 34522000, 55566000]
# Ten year-end timestamps, 2006-12-31 .. 2015-12-31.  An explicit offset object
# is used instead of the string alias 'A', which recent pandas versions
# deprecate; the resulting index is the same.
years = pd.date_range('2006', periods=10, freq=pd.offsets.YearEnd())
In [183]:
# Build the frame from the dict: keys become column labels and, with no index
# given, rows get the default integer index.
df = DataFrame(freq_dict)
df
Out[183]:
In [184]:
# Rebuild the frame with `years` as the row index; this rebinds `df`, and all
# later cells operate on this date-indexed version.
df = DataFrame(freq_dict, index = years)
df
Out[184]:
In [185]:
# Default plot kind is a line chart: one line per column, index on the x-axis.
df.plot()
Out[185]:
In [186]:
# Vertical bar chart: one group of bars per row, one bar per column.
df.plot(kind='bar')
Out[186]:
In [187]:
# Same data as the bar chart, drawn with horizontal bars.
df.plot(kind='barh')
Out[187]:
In [188]:
# Area chart; pandas stacks the columns on top of each other by default.
df.plot(kind = 'area')
Out[188]:
In [189]:
# Box plot: one box per column summarising that column's distribution.
df.plot(kind='box')
Out[189]:
In [190]:
# Seaborn box plot of the wide-form frame (one box per column).  The frame is
# passed by keyword because the first positional parameter of `boxplot` has not
# been the same across seaborn releases (`x` in older ones, `data` in newer
# ones); `data=` is unambiguous in all of them.
sns.boxplot(data=df)
Out[190]:
In [191]:
# Scatter plot of the 'girl' column against the 'boy' column.
df.plot(kind='scatter', x='boy',y='girl')
Out[191]:
`boy` 与 `girl` 两组数据之间是一种线性关系。
In [192]:
# Same scatter plot, with each marker's size taken from the 'child' column.
df.plot(kind='scatter', x='boy',y='girl',s=df['child'].values)
Out[192]:
In [193]:
# Sum of each column (default axis=0 reduces over the rows).
df.sum()
Out[193]:
In [194]:
# Sum across the columns, giving one total per row (per year).
df.sum(axis=1)
Out[194]:
In [195]:
# Minimum of each column.
df.min()
Out[195]:
In [196]:
# Arithmetic mean of each column.
df.mean()
Out[196]:
In [197]:
# Variance of each column (pandas defaults to the sample variance, ddof=1).
df.var()
Out[197]:
In [198]:
# Standard deviation of each column (sample standard deviation, ddof=1).
df.std()
Out[198]:
In [199]:
# Coefficient of variation per column: standard deviation relative to the mean.
df.std()/df.mean()
Out[199]:
In [200]:
# Median of each column.
df.median()
Out[200]:
In [201]:
# Running total down the rows of each column; the result has the same shape as df.
df.cumsum()
Out[201]:
In [202]:
# Kurtosis of each column (pandas uses Fisher's definition: normal == 0.0).
df.kurt()
Out[202]:
In [203]:
# Skewness of each column.
df.skew()
Out[203]:
In [204]:
# Summary table per column: count, mean, std, min, quartiles and max.
df.describe()
Out[204]:
In [205]:
# First rows of the frame (five by default).
df.head()
Out[205]:
In [206]:
# Last four rows of the frame.
df.tail(4)
Out[206]:
In [207]:
# Row labels of the frame (the `years` index it was built with).
df.index
Out[207]:
In [208]:
# The frame's data as a plain NumPy array, without row or column labels.
df.values
Out[208]:
In [209]:
# Column labels of the frame (the keys of freq_dict).
df.columns
Out[209]:
列索引(`df.columns`)的 dtype 为 `object`;在 pandas 中,非数字类型一般均为 `object` 类型。
In [210]:
# Transpose: the former column labels become the rows and the dates become the columns.
dft = df.T
dft
Out[210]:
In [211]:
# After transposing, the row index holds what used to be df's column labels.
dft.index
Out[211]:
In [212]:
# After transposing, the columns hold what used to be df's row index (the dates).
dft.columns
Out[212]:
In [213]:
# Sort the rows by index label, newest date first; returns a new frame, df is unchanged.
df.sort_index(ascending = False)
Out[213]:
设置 `ascending = False`,使之降序排列。
In [214]:
# axis=1 sorts the columns by their labels instead of the rows, here in descending order.
df.sort_index(axis = 1, ascending = False)
Out[214]:
In [215]:
# Sort the rows by the values in the 'boy' column (ascending by default).
df.sort_values(by='boy')
Out[215]:
In [216]:
# A single label inside [] selects one column and returns it as a Series.
df['boy']
Out[216]:
也可以通过 `df.boy` 来访问,效果相同。
In [217]:
# A list of labels inside [] returns a DataFrame, even when the list has one entry.
df[['boy']]
Out[217]:
In [218]:
# Select several columns by label; the result keeps the order given in the list.
df[['boy','children']]
Out[218]:
In [219]:
# Select the first and third columns by position.  Plain `df[[0, 2]]` looks the
# integers up as column *labels*; the columns here are strings, so that lookup
# fails with a KeyError on current pandas.  `iloc` is the positional indexer.
df.iloc[:, [0, 2]]
Out[219]:
In [220]:
# loc[rows, columns] selects by label: all rows, the 'boy' column, returned as a Series.
df.loc[:,'boy']
Out[220]:
`:` 即为全部选取。
In [221]:
# Passing the column label in a list makes loc return a one-column DataFrame.
df.loc[:,['boy']]
Out[221]:
In [222]:
# All rows, the two listed columns, as a DataFrame.
df.loc[:,['boy', 'girl']]
Out[222]:
In [223]:
# Label slice over the columns: with loc both end points are included, and the
# columns returned are those lying between 'boy' and 'girl' in the frame's column order.
df.loc[:,'boy': 'girl']
Out[223]:
In [224]:
# iloc selects by integer position: all rows of the first column, as a Series.
df.iloc[:,0]
Out[224]:
`:` 即为全部选取。
In [225]:
# A list of positions returns a DataFrame: here only the second column.
df.iloc[:,[1]]
Out[225]:
In [226]:
# The second and third columns, chosen by position.
df.iloc[:,[1,2]]
Out[226]:
In [227]:
# Positional slice over the columns; unlike loc, the end position is excluded,
# so this returns columns 0 and 1.
df.iloc[:,0:2]
Out[227]:
4.2 行选择
In [228]:
# A single row label returns that row as a Series indexed by the column names.
df.loc[years[0]]
Out[228]:
In [229]:
# Wrapping the label in a list returns a one-row DataFrame instead of a Series.
df.loc[[years[0]]]
Out[229]:
In [230]:
# Select the first and third years by label.
df.loc[[years[0], years[2]]]
Out[230]:
In [231]:
# Label slice over the rows; both end points are included, so three rows are returned.
df.loc[years[0]: years[2]]
Out[231]:
In [232]:
# First row by position, returned as a Series.
df.iloc[0]
Out[232]:
In [233]:
# First row by position, returned as a one-row DataFrame.
df.iloc[[0]]
Out[233]:
In [234]:
# The first three rows, chosen by an explicit list of positions.
df.iloc[[0,1,2]]
Out[234]:
In [235]:
# Positional row slice with the end excluded: rows at positions 1 and 2.
df.iloc[1:3]
Out[235]:
In [236]:
# A slice inside plain [] selects rows (not columns): here the first three rows by position.
df[0:3]
Out[236]:
4.3 选择区块
4.3.1 利用loc
In [237]:
# One row label and one column label return a single scalar value.
df.loc[years[0],'boy']
Out[237]:
In [238]:
# Scalar row label with a list of columns: a Series holding just the 'boy' entry of that row.
df.loc[years[0],['boy']]
Out[238]:
In [239]:
# Lists on both axes always return a DataFrame, here with one row and one column.
df.loc[[years[0]],['boy']]
Out[239]:
In [240]:
# The 'boy' and 'girl' values of the first year, as a Series.
df.loc[years[0],['boy', 'girl']]
Out[240]:
In [241]:
# Scalar row label with a column label slice (end points included): returns a Series.
df.loc[years[0],'boy':'girl']
Out[241]:
In [242]:
# Same selection with the row label in a list, so the result is a one-row DataFrame.
df.loc[[years[0]],'boy':'girl']
Out[242]:
In [243]:
# Label slices on both axes select a rectangular block; all end points are included.
df.loc[years[0]:years[2],'boy':'girl']
Out[243]:
4.3.2 利用iloc
利用iloc进行区域选择的方式与loc基本类似,只是利用位置而非标签信息进行选取,请参照前面理解
In [244]:
# Scalar at the first row and first column, addressed by position.
df.iloc[0,0]
Out[244]:
In [245]:
# First row, first three columns: returned as a Series.
df.iloc[0,[0,1,2]]
Out[245]:
In [246]:
# First three rows of the first column: returned as a Series.
df.iloc[[0,1,2],0]
Out[246]:
In [247]:
# Rows 0-2 and columns 1-2 by position: a 3x2 DataFrame.
df.iloc[[0,1,2],[1, 2]]
Out[247]:
In [248]:
# Row slice (end excluded, so rows 0 and 1) combined with a list of column positions.
df.iloc[0:2,[1, 2]]
Out[248]:
In [249]:
# Positional slices on both axes: the first five rows of the first three columns.
df.iloc[0:5,0:3]
Out[249]:
4.3.3 利用iat选择单个元素
In [250]:
# iat accesses exactly one cell by position: second row, second column.
df.iat[1,1]
Out[250]:
In [251]:
# Element-wise comparison yields a boolean Series with the same index as df.
df.boy > 400
Out[251]:
In [252]:
# A boolean Series inside [] keeps only the rows where the mask is True.
df[df.boy > 400]
Out[252]:
In [253]:
# Combine two conditions element-wise with &; the parentheses are required
# because & binds more tightly than the comparison operators.
df[(df.boy >300) & (df.girl > 900)]
Out[253]:
`and` 操作用 `&`,`or` 用 `|`,`not` 用 `~`。
In [254]:
# 'girl' values for the rows where boy > 300, as a Series.  The row mask and the
# column label are passed to a single .loc call rather than chaining two []
# lookups; the result is the same and no intermediate object is created.
df.loc[df.boy > 300, 'girl']
Out[254]:
In [255]:
# Same filter, but the column is given as a list so the result is a one-column
# DataFrame.  A single .loc[rows, columns] call replaces the chained lookups.
df.loc[df.boy > 300, ['girl']]
Out[255]:
In [256]:
# Rows satisfying both conditions, restricted to the 'girl' and 'child' columns.
# A single .loc[rows, columns] call replaces the chained lookups.
df.loc[(df.boy > 300) & (df.girl > 900), ['girl', 'child']]
Out[256]:
In [257]:
# Boolean Series: True where the 'girl' value equals one of the listed values.
df['girl'].isin([700,800])
Out[257]:
`isin()` 函数用于判断该 Series 中的 values 是否在给定的数据列表中。
In [258]:
# Keep only the rows whose 'girl' value is 700 or 800.
df[df['girl'].isin([700,800])]
Out[258]:
In [259]:
# Rows whose 'girl' value is 700 or 800, restricted to two columns.  A single
# .loc[rows, columns] call replaces the chained lookups.
df.loc[df['girl'].isin([700, 800]), ['girl', 'children']]
Out[259]:
In [260]:
# Compare one row against a threshold: a boolean Series indexed by the column labels.
df.loc[years[1]]>500
Out[260]:
In [261]:
# Values of the first year's row, kept only for the columns whose value in the
# second year exceeds 500.  .loc accepts the boolean column mask as its second
# argument, so the row lookup and the filter happen in one call.
df.loc[years[0], df.loc[years[1]] > 500]
Out[261]:
`对象[行条件]` 的方式进行条件数据选取,个人认为是 pandas 设计没有考虑周全的地方,虽然布尔索引选择行数据的应用场景较少。
In [262]:
# First two years' rows, restricted to the columns whose value in the third year
# exceeds 500.  .loc takes the row labels and the boolean column mask directly,
# which gives the same frame as transposing, filtering and transposing back.
df.loc[[years[0], years[1]], df.loc[years[2]] > 500]
Out[262]: