In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

In [2]:
# Load the tips data into the DataFrame `dat`. The path is relative, so
# 'tips.csv' must be in the directory the notebook kernel runs from.
dat = pd.read_csv('tips.csv')

In [3]:
# Preview the first five rows to confirm the columns parsed as expected.
dat.head(n=5)


Out[3]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

In [4]:
# Contingency table: number of rows for each sex x smoker combination.
# pd.crosstab counts co-occurrences directly, so the result does not depend on
# how pivot_table(aggfunc=len) treats a frame that has no value columns left
# once 'sex' and 'smoker' are both used as grouping keys.
pd.crosstab(dat['sex'], dat['smoker'])


Out[4]:
smoker No Yes
sex
Female 54 33
Male 97 60

In [22]:
# Grouped bar chart of the sex x smoker row counts (one bar group per sex,
# one bar per smoker level). The counts come from pd.crosstab rather than
# pivot_table(aggfunc=len) on a frame with no value columns, and the no-op
# color=None argument is dropped. A title is set so the figure stands alone.
pd.crosstab(dat['sex'], dat['smoker']).plot.bar(title='Row count by sex and smoker')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x11734b940>

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments only the numeric columns are summarised; the recorded output
# covers total_bill, tip and size.
dat.describe()


Out[4]:
total_bill tip size
count 244.000000 244.000000 244.000000
mean 19.785943 2.998279 2.569672
std 8.902412 1.383638 0.951100
min 3.070000 1.000000 1.000000
25% 13.347500 2.000000 2.000000
50% 17.795000 2.900000 2.000000
75% 24.127500 3.562500 3.000000
max 50.810000 10.000000 6.000000

In [5]:
# Frequency of each level of 'sex'; dropna=False would list missing values
# as their own row if there were any.
dat['sex'].value_counts(dropna=False)


Out[5]:
Male      157
Female     87
dtype: int64

In [6]:
# Frequency of each level of 'smoker'; dropna=False would list missing
# values as their own row if there were any.
dat['smoker'].value_counts(dropna=False)


Out[6]:
No     151
Yes     93
dtype: int64

In [7]:
# Frequency of each level of 'day'; dropna=False would list missing values
# as their own row if there were any.
dat['day'].value_counts(dropna=False)


Out[7]:
Sat     87
Sun     76
Thur    62
Fri     19
dtype: int64

In [8]:
# Frequency of each level of 'time'; dropna=False would list missing values
# as their own row if there were any.
dat['time'].value_counts(dropna=False)


Out[8]:
Dinner    176
Lunch      68
dtype: int64

In [9]:
# Show the dtype pandas assigned to each column; this is what the
# dtype-based column selection further down operates on.
dat.dtypes


Out[9]:
total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [12]:
# Keep only the floating-point columns of `dat`.
# `np.float` was just an alias for the builtin `float`; NumPy deprecated it in
# 1.20 and removed it in 1.24, where it raises AttributeError. Passing the
# builtin selects the same columns and runs on current NumPy.
dat_float = dat.select_dtypes(include=[float])

In [13]:
# Preview the first five rows of the float-only subset.
dat_float.head(n=5)


Out[13]:
total_bill tip
0 16.99 1.01
1 10.34 1.66
2 21.01 3.50
3 23.68 3.31
4 24.59 3.61

In [15]:
# Column-wise median of the float columns; the result is a Series indexed by
# column name.
dat_float_median = dat_float.median(axis=0)

In [17]:
# Display the per-column medians held in dat_float_median.
dat_float_median


Out[17]:
total_bill    17.795
tip            2.900
dtype: float64

In [18]:
# Use the median Series' index (the float column names) to pull the same
# columns back out of the full frame, and preview the first rows.
dat[dat_float_median.index].head()


Out[18]:
total_bill tip
0 16.99 1.01
1 10.34 1.66
2 21.01 3.50
3 23.68 3.31
4 24.59 3.61

In [9]:
# One-hot encode the four categorical columns. drop_first=True omits the
# first level of each column so the indicators are not redundant.
# dtype is pinned to uint8 (the default before pandas 2.0) so the result is
# 0/1 integers on every pandas version; pandas 2.0 changed the default to bool.
pd.get_dummies(dat.loc[:, ['sex', 'smoker', 'day', 'time']], drop_first=True, dtype=np.uint8).head()


Out[9]:
sex_Male smoker_Yes day_Sat day_Sun day_Thur time_Lunch
0 0 0 0 1 0 0
1 1 0 0 1 0 0
2 1 0 0 1 0 0
3 1 0 0 1 0 0
4 0 0 0 1 0 0

In [ ]: