pandas-lecture-01-intro



In [55]:
import pandas as pd
import numpy as np

In [68]:
# Print NumPy floating-point values with two digits of precision.
# NOTE(review): this looks like it only affects NumPy array printing, not pandas display — confirm.
np.set_printoptions(precision=2)

In [57]:
# Load the dataset into a DataFrame; the file uses ';' as its field separator.
data = pd.read_csv("../data/beauty.csv", sep=";")

In [58]:
# Preview the first rows to check that the columns parsed as expected.
data.head()


Out[58]:
wage exper union goodhlth black female married service educ looks
0 5.73 30 0 1 0 1 1 1 14 4
1 4.28 28 0 1 0 1 1 0 12 3
2 7.96 35 0 1 0 1 0 0 10 4
3 11.57 38 0 1 0 0 1 1 16 3
4 11.42 27 0 1 0 0 1 0 16 3

In [59]:
# Summarize the frame: column dtypes, non-null counts and memory usage.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 10 columns):
wage        1260 non-null float64
exper       1260 non-null int64
union       1260 non-null int64
goodhlth    1260 non-null int64
black       1260 non-null int64
female      1260 non-null int64
married     1260 non-null int64
service     1260 non-null int64
educ        1260 non-null int64
looks       1260 non-null int64
dtypes: float64(1), int64(9)
memory usage: 98.5 KB

In [60]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns.
data.describe()


Out[60]:
wage exper union goodhlth black female married service educ looks
count 1260.000000 1260.000000 1260.000000 1260.000000 1260.000000 1260.000000 1260.000000 1260.000000 1260.000000 1260.000000
mean 6.306690 18.206349 0.272222 0.933333 0.073810 0.346032 0.691270 0.273810 12.563492 3.185714
std 4.660639 11.963485 0.445280 0.249543 0.261564 0.475892 0.462153 0.446089 2.624489 0.684877
min 1.020000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000 1.000000
25% 3.707500 8.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 12.000000 3.000000
50% 5.300000 15.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 12.000000 3.000000
75% 7.695000 27.000000 1.000000 1.000000 0.000000 1.000000 1.000000 1.000000 13.000000 4.000000
max 77.720000 48.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 17.000000 5.000000

In [61]:
# Select a single column by name and show its first rows.
data['exper'].head()


Out[61]:
0    30
1    28
2    35
3    38
4    27
Name: exper, dtype: int64

In [62]:
# Confirm that selecting a single column yields a pandas Series.
type(data['exper'])


Out[62]:
pandas.core.series.Series

`.loc` and `.iloc`: label-based and position-based indexing


In [63]:
# Label-based selection: rows labelled 0 through 5 (the end label is included)
# and the two named columns.
data.loc[0:5, ['wage', 'female']]


Out[63]:
wage female
0 5.73 1
1 4.28 1
2 7.96 1
3 11.57 0
4 11.42 0
5 3.91 1

In [64]:
# Position-based selection: all rows, columns at positions 2 and 3
# (the end position is excluded); show the first rows only.
data.iloc[:,2:4].head()


Out[64]:
union goodhlth
0 0 1
1 0 1
2 0 1
3 0 1
4 0 1

In [65]:
# Median wage for women and for men, shown as a (female, male) tuple.
(
    data.loc[data['female'] == 1, 'wage'].median(),
    data.loc[data['female'] == 0, 'wage'].median(),
)


Out[65]:
(3.75, 6.41)

In [72]:
# Median wage among men, shown as a (married, not married) tuple.
(
    data.loc[(data['female'] == 0) & (data['married'] == 1), 'wage'].median(),
    data.loc[(data['female'] == 0) & (data['married'] != 1), 'wage'].median(),
)


Out[72]:
(6.710000000000001, 5.0649999999999995)

In [82]:
# Share of women within each looks score, iterating over the groups one by one.
# Grouping by the bare column name (not a one-element list) keeps every group
# key a plain scalar; with a list, pandas >= 2.0 yields 1-tuples such as (1,).
for look, sub_df in data.groupby('looks'):
    print(look, sub_df['female'].mean())


(1, 0.38461538461538464)
(2, 0.38028169014084506)
(3, 0.32271468144044324)
(4, 0.37362637362637363)
(5, 0.42105263157894735)

In [83]:
# Mean wage per looks score, computed with a single grouped aggregation.
data.groupby('looks')['wage'].mean()


Out[83]:
looks
1    4.621538
2    5.328803
3    6.504598
4    6.299341
5    7.388421
Name: wage, dtype: float64

In [84]:
# Mean wage inside every (looks, female) combination; with two grouping
# columns each group key is a 2-tuple.
for group_key, group_df in data.groupby(['looks', 'female']):
    print(group_key, group_df['wage'].mean())


((1, 0), 6.16375)
((1, 1), 2.154)
((2, 0), 6.249090909090909)
((2, 1), 3.8290740740740743)
((3, 0), 7.598957055214724)
((3, 1), 4.207854077253218)
((4, 0), 7.226447368421053)
((4, 1), 4.745073529411764)
((5, 0), 9.923636363636364)
((5, 1), 3.9025)

In [90]:
# Median wage and experience per looks score. The built-in reduction avoids
# handing a NumPy callable to .agg(), which recent pandas releases warn about.
data.groupby('looks')[['wage', 'exper']].median()


Out[90]:
wage exper
looks
1 3.460 32.0
2 4.595 18.0
3 5.635 18.0
4 5.240 12.5
5 4.810 8.0

In [94]:
# Contingency table of counts: rows are the female flag, columns the married flag.
pd.crosstab(data['female'], data['married'])


Out[94]:
married 0 1
female
0 166 658
1 223 213

In [97]:
# Contingency table of counts: rows are the female flag, columns the looks score.
pd.crosstab(data['female'], data['looks'])


Out[97]:
looks 1 2 3 4 5
female
0 8 88 489 228 11
1 5 54 233 136 8

In [111]:
# Flag the top wage quartile: 1 when the wage is strictly above the
# 75th percentile, 0 otherwise.
data['is_rich'] = (data['wage'] > data['wage'].quantile(.75)).astype('int64')

# A bare last expression gives the notebook's rich table display instead of
# the wrapped plain text that print() produces for a wide frame.
data.head()


    wage  exper  union  goodhlth  black  female  married  service  educ  \
0   5.73     30      0         1      0       1        1        1    14   
1   4.28     28      0         1      0       1        1        0    12   
2   7.96     35      0         1      0       1        0        0    10   
3  11.57     38      0         1      0       0        1        1    16   
4  11.42     27      0         1      0       0        1        0    16   

   looks  is_rich  
0      4        0  
1      3        0  
2      4        1  
3      3        1  
4      3        1  

In [112]:
# Column-wise means. The built-in reduction is the direct way to do this and
# does not route every column through apply() with a NumPy callable.
data.mean()


Out[112]:
wage         6.306690
exper       18.206349
union        0.272222
goodhlth     0.933333
black        0.073810
female       0.346032
married      0.691270
service      0.273810
educ        12.563492
looks        3.185714
is_rich      0.250000
dtype: float64

In [114]:
def string_gender(female):
    """Translate a female flag into a readable gender label.

    Args:
        female: Any value; only its truthiness is inspected.

    Returns:
        'female' when the flag is truthy, otherwise 'male'.
    """
    if female:
        return 'female'
    return 'male'

In [118]:
# Label each row's gender by reusing the string_gender helper defined above,
# rather than repeating its logic in a lambda; show the first ten labels.
data['female'].apply(string_gender)[:10]


Out[118]:
0    female
1    female
2    female
3      male
4      male
5    female
6      male
7      male
8    female
9    female
Name: female, dtype: object

In [120]:
# Lookup table translating the 0/1 union flag into a readable label.
d = {1: 'union', 0: 'nonunion'}

In [121]:
# Replace each union flag with its label via the lookup dict; show the first five.
data['union'].map(d)[:5]


Out[121]:
0    nonunion
1    nonunion
2    nonunion
3    nonunion
4    nonunion
Name: union, dtype: object