In [1]:
import numpy as np
import pandas as pd
In [2]:
# Print NumPy floating-point values with two digits of precision.
np.set_printoptions(precision=2)
Считываем данные из файла
In [3]:
# Load the semicolon-separated CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv('../data/beauty.csv', sep=';')
In [4]:
# Check which type read_csv returned.
type(data)
Out[4]:
Смотрим на первые 5 строк
In [5]:
# head() with no argument shows the first five rows.
data.head()
Out[5]:
In [6]:
# (number of rows, number of columns)
data.shape
Out[6]:
Краткая статистика – info и describe
In [7]:
# Prints the index range, each column's dtype and non-null count, and memory usage.
data.info()
In [8]:
# Summary statistics per numeric column: count, mean, std, min, quartiles, max.
data.describe()
Out[8]:
Индексация
In [9]:
# Selecting a single column by name yields a Series; preview its first five values.
data['exper'].head()
Out[9]:
loc и iloc
In [10]:
# .loc selects by label, and label slices include BOTH endpoints,
# so 0:5 covers the index labels 0 through 5.
data.loc[0:5, ['wage', 'female']]
Out[10]:
In [11]:
# .iloc selects by integer position with the usual end-exclusive slicing:
# all rows, columns at positions 2 and 3.
data.iloc[:,2:4].head()
Out[11]:
Логическая индексация
In [12]:
# Mean wage for rows with female == 1 and for rows with female == 0, as a tuple.
# .loc[mask, column] filters rows and picks the column in a single indexing step.
(
    data.loc[data['female'] == 1, 'wage'].mean(),
    data.loc[data['female'] == 0, 'wage'].mean(),
)
Out[12]:
In [13]:
# Median wage among rows with female == 0, split by married == 1 vs married == 0.
# Each condition needs its own parentheses because & binds tighter than ==.
(
    data.loc[(data['female'] == 0) & (data['married'] == 1), 'wage'].median(),
    data.loc[(data['female'] == 0) & (data['married'] == 0), 'wage'].median(),
)
Out[13]:
Groupby
In [14]:
# Iterating over a groupby yields (group key, sub-DataFrame) pairs,
# one pair per distinct value of 'looks'.
for look, sub_df in data.groupby('looks'):
    print(look)
    # any per-group computation can go here
    print(sub_df['goodhlth'].mean())
In [15]:
# Median wage and experience for each 'looks' group.
# The aggregation is passed by name: handing NumPy callables such as np.median
# to agg() is deprecated in recent pandas releases (it emits a FutureWarning),
# while the string 'median' selects the built-in groupby median on every version.
data.groupby('looks')[['wage', 'exper']].agg('median')
Out[15]:
Сводная таблица
In [16]:
# Frequency table: rows are the values of 'female', columns the values of
# 'married', and each cell counts the rows with that combination.
pd.crosstab(data['female'], data['married'])
Out[16]:
In [17]:
# Same kind of frequency table, counting rows per ('female', 'looks') combination.
pd.crosstab(data['female'], data['looks'])
Out[17]:
Добавление столбцов (построение признаков)
In [18]:
# New binary column: 1 where the wage is strictly greater than the
# 0.75 quantile of all wages, 0 otherwise.
data['is_rich'] = (
    data['wage']
    .gt(data['wage'].quantile(.75))
    .astype('int64')
)
In [19]:
# Preview the frame again to see the newly added 'is_rich' column.
data.head()
Out[19]:
In [20]:
# Columns combine element-wise with ordinary arithmetic; the result is stored
# as a new column. NOTE(review): the name suggests a throwaway demo feature —
# the coefficients do not appear to carry any meaning; confirm.
data['rubbish'] = .56 * data['wage'] + 0.32 * data['exper']
map и apply
In [21]:
def string_gender(female):
    """Translate a gender flag into a readable label.

    Args:
        female: Any value; only its truthiness is used (e.g. 1/0 or True/False).

    Returns:
        'female' when the flag is truthy, 'male' otherwise.
    """
    return 'female' if female else 'male'
In [22]:
# Lookup table translating the 1/0 codes into readable labels.
d = {1: 'union', 0: 'non-union'}
In [23]:
# map() with a dict replaces each value by its dictionary entry;
# values that are not keys of the dict would become NaN.
data['union'].map(d).head()
Out[23]:
In [24]:
# apply() calls the function once per element of the Series; here every
# truthy flag becomes 'female' and every falsy one 'male'.
(
    data['female']
    .apply(lambda flag: 'female' if flag else 'male')
    .head()
)
Out[24]: