In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')

In [2]:
# set random seed, for reproducibility
np.random.seed(12345)

Download the Hep C replication archive from http://ghdx.healthdata.org/record/hepatitis-c-prevalence-1990-and-2005-all-gbd-regions, and extract input_data.csv

Or, since the H: drive is preventing me from loading that into Sage Cloud, let's look at the good old weather data from Week 1 of class:


In [3]:
df = pd.read_csv('weather-numeric.csv')
df


Out[3]:
outlook temperature humidity windy play
0 sunny 85 85 False no
1 sunny 80 90 True no
2 overcast 83 86 False yes
3 rainy 70 96 False yes
4 rainy 68 80 False yes
5 rainy 65 70 True no
6 overcast 64 65 True yes
7 sunny 72 95 False no
8 sunny 69 70 False yes
9 rainy 75 80 False yes
10 sunny 75 70 True yes
11 overcast 72 90 True yes
12 overcast 81 75 False yes
13 rainy 71 91 True no

Mapping from categorical data to dummy variables

A.k.a. one-hot encoding (http://en.wikipedia.org/wiki/One-hot ):


In [4]:
df.outlook.value_counts()


Out[4]:
rainy       5
sunny       5
overcast    4
dtype: int64

In [5]:
X = np.array(df.filter(['outlook', 'temperature']))
y = np.array(df.play)

In [6]:
import sklearn.svm

In [7]:
clf = sklearn.svm.SVC()
clf.fit(X, y)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-1c49827b6ff0> in <module>()
      1 clf = sklearn.svm.SVC()
----> 2 clf.fit(X, y)

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
    135         self._sparse = sparse and not callable(self.kernel)
    136 
--> 137         X = atleast2d_or_csr(X, dtype=np.float64, order='C')
    138         y = self._validate_targets(y)
    139 

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/utils/validation.py in atleast2d_or_csr(X, dtype, order, copy, force_all_finite)
    163     return _atleast2d_or_sparse(X, dtype, order, copy, sp.csr_matrix,
    164                                 "tocsr", sp.isspmatrix_csr,
--> 165                                 force_all_finite)
    166 
    167 

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/utils/validation.py in _atleast2d_or_sparse(X, dtype, order, copy, sparse_class, convmethod, check_same_type, force_all_finite)
    140     else:
    141         X = array2d(X, dtype=dtype, order=order, copy=copy,
--> 142                     force_all_finite=force_all_finite)
    143     return X
    144 

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/utils/validation.py in array2d(X, dtype, order, copy, force_all_finite)
    118         raise TypeError('A sparse matrix was passed, but dense data '
    119                         'is required. Use X.toarray() to convert to dense.')
--> 120     X_2d = np.asarray(np.atleast_2d(X), dtype=dtype, order=order)
    121     if force_all_finite:
    122         _assert_all_finite(X_2d)

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    458 
    459     """
--> 460     return array(a, dtype, copy=False, order=order)
    461 
    462 def asanyarray(a, dtype=None, order=None):

ValueError: could not convert string to float: rainy

What's the problem?


In [8]:
# SVC requires numeric features; it cannot handle strings

The solution: one-hot encoding.


In [9]:
# can do this manually:
for val in df.outlook.unique():
    print 'adding column', val
    df[val] = (df.outlook == val)


adding column sunny
adding column overcast
adding column rainy
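For the record, pandas can do this encoding in one call. A minimal sketch, written for current pandas/Python 3; the toy frame below mimics the weather data's outlook column:

```python
import pandas as pd

# toy frame with the same outlook categories as the weather data
df = pd.DataFrame({'outlook': ['sunny', 'overcast', 'rainy', 'sunny'],
                   'temperature': [85, 83, 70, 80]})

# one-hot encode the categorical column; numeric columns pass through unchanged
encoded = pd.get_dummies(df, columns=['outlook'])
print(encoded.columns.tolist())
```

`pd.get_dummies` also handles multiple categorical columns at once and prefixes each dummy with the source column name, which avoids collisions when two columns share a category label.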

In [10]:
X = np.array(df.filter(['sunny', 'overcast', 'rainy', 'temperature']))
y = np.array(df.play)

clf = sklearn.svm.SVC()
clf.fit(X, y)


Out[10]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [11]:
y_pred = clf.predict(X)
np.mean(y_pred == y)


Out[11]:
0.8571428571428571

Impressed?


In [12]:
# NO, this is accuracy on the training data, not an out-of-sample (oos) estimate
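To get an out-of-sample number, hold out data the model never saw during fitting. A minimal sketch using synthetic data (the weather set's 14 rows would make any split extremely noisy); written for current scikit-learn, and the feature/noise setup here is made up for illustration:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 4))                                  # synthetic features
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)  # label driven by feature 0

# hold out 30% of the rows; the model never sees them during fit
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = SVC().fit(X_train, y_train)
print('train accuracy:', clf.score(X_train, y_train))
print('test accuracy: ', clf.score(X_test, y_test))   # this is the out-of-sample number
```

The gap between the two scores is the point: training accuracy rewards memorization, while test accuracy estimates performance on new data.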

Shifting and scaling numeric variables


In [13]:
df.temperature


Out[13]:
0     85
1     80
2     83
3     70
4     68
5     65
6     64
7     72
8     69
9     75
10    75
11    72
12    81
13    71
Name: temperature, dtype: int64

In [14]:
df.temperature.mean()


Out[14]:
73.571428571428569

In [15]:
df.temperature.std()


Out[15]:
6.5716674586297126

In [16]:
df['normalized_temp'] = (df.temperature - df.temperature.mean()) / df.temperature.std()

In [17]:
sns.distplot(df.normalized_temp)


Out[17]:
<matplotlib.axes.AxesSubplot at 0x7f024b3c10d0>
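The same z-scoring is available in scikit-learn as `StandardScaler`. A sketch on the temperature values above; one caveat to be aware of: `pd.Series.std` divides by n-1 (ddof=1) while sklearn uses the population std (ddof=0), so the two normalizations differ slightly on small samples:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# the 14 temperature values from the weather data
temps = np.array([85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71], dtype=float)

scaler = StandardScaler()
z = scaler.fit_transform(temps.reshape(-1, 1)).ravel()
print(z.mean(), z.std())   # ~0 and 1 by construction
```

The advantage of the scaler object over the one-line formula is that `fit` on training data and `transform` on test data keeps the normalization out-of-sample-honest.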

Interactions


In [18]:
df['hot_and_humid'] = df.temperature * df.humidity

In [19]:
sns.distplot(df.hot_and_humid)


Out[19]:
<matplotlib.axes.AxesSubplot at 0x7f024b2bd810>

Should we have normalized that somehow? We could do it before or after multiplying...


In [20]:
df.hot_and_humid = (df.hot_and_humid - df.hot_and_humid.mean()) / df.hot_and_humid.std()

sns.distplot(df.hot_and_humid)


Out[20]:
<matplotlib.axes.AxesSubplot at 0x7f024b2058d0>

OR


In [21]:
df['normalized_humidity'] = (df.humidity - df.humidity.mean()) / df.humidity.std()

In [22]:
df.hot_and_humid = df.normalized_temp * df.normalized_humidity
sns.distplot(df.hot_and_humid)


Out[22]:
<matplotlib.axes.AxesSubplot at 0x7f024b0e25d0>
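Pairwise products like `hot_and_humid` can also be generated systematically instead of by hand. A sketch using scikit-learn's `PolynomialFeatures` with `interaction_only=True` (the two-column input here is just the first few temperature/humidity pairs from the weather data):

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[85., 85.], [80., 90.], [70., 96.]])   # temperature, humidity

# degree=2 with interaction_only=True yields the original columns
# plus every pairwise product, and no squared terms
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_int = poly.fit_transform(X)
print(X_int[0])   # temperature, humidity, temperature*humidity
```

With many columns this generates all pairwise interactions at once, which is convenient but can blow up the feature count quadratically.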

There are fancier things you can consider, too. And we will perhaps return to them in the Data Transformations week. If you need one for your project, start with Box-Cox transform: http://en.wikipedia.org/wiki/Power_transform#Box.E2.80.93Cox_transformation
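As a concrete starting point, SciPy implements Box-Cox directly. A sketch on synthetic right-skewed data; note that `scipy.stats.boxcox` requires strictly positive input, and when no λ is supplied it fits one by maximum likelihood and returns it:

```python
import numpy as np
from scipy import stats

rng = np.random.RandomState(12345)
skewed = rng.lognormal(mean=0.0, sigma=1.0, size=1000)   # right-skewed, strictly positive

# boxcox returns the transformed data and the fitted lambda
transformed, lam = stats.boxcox(skewed)
print('fitted lambda:', lam)   # near 0 for lognormal data, i.e. roughly a log transform
```

λ = 1 means no transformation was needed; λ near 0 corresponds to taking logs, which is what lognormal data should produce.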

Let's have a look at input_data.csv now


In [23]:
df = pd.read_csv('input_data.csv')
df.head()


Out[23]:
index age_end age_start age_weights area data_type effective_sample_size lower_ci sex standard_error upper_ci value year_end year_start z_bias
0 0 65 18 NaN ARG p 12891 NaN total -99 NaN 0.0071 1992 1991 1
1 1 89 3 NaN ARG p 196 NaN total -99 NaN 0.0000 1993 1993 1
2 2 24 20 NaN AUS p 1348 NaN total -99 NaN 0.0530 1998 1996 1
3 3 39 30 NaN AUS p 1452 NaN total -99 NaN 0.0110 1998 1996 1
4 4 49 15 NaN AUS p 1537 NaN female -99 NaN 0.0111 1995 1995 1

Other fun features you could consider adding, based on the special case you are dealing with:


In [24]:
# age interval contains specific ages:
for a in np.arange(0,81,5):
    df['includes_age_'+str(a)] = 1. * ((df.age_start <= a) & (df.age_end >= a))

In [25]:
df.filter(like='age').head()


Out[25]:
age_end age_start age_weights includes_age_0 includes_age_5 includes_age_10 includes_age_15 includes_age_20 includes_age_25 includes_age_30 includes_age_35 includes_age_40 includes_age_45 includes_age_50 includes_age_55 includes_age_60 includes_age_65 includes_age_70 includes_age_75 includes_age_80
0 65 18 NaN 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0
1 89 3 NaN 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2 24 20 NaN 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
3 39 30 NaN 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
4 49 15 NaN 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0
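A quick sanity check of the interval-containment logic on a toy frame; a value `a` lies in an interval exactly when `age_start <= a <= age_end`:

```python
import numpy as np
import pandas as pd

# first three age intervals from input_data.csv
toy = pd.DataFrame({'age_start': [18, 3, 20], 'age_end': [65, 89, 24]})

for a in np.arange(0, 81, 5):
    toy['includes_age_' + str(a)] = 1. * ((toy.age_start <= a) & (toy.age_end >= a))

# the 20-24 interval should contain exactly one of the probe ages (a=20)
print(toy.loc[2].filter(like='includes').sum())
```

Checking a boundary case like this is cheap insurance: an off-by-one in the comparison (e.g. `<` instead of `>=` on `age_end`) silently inverts the meaning of every dummy column.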

In [26]:
# geographic hierarchy, dummy coded:
import json, networkx as nx

In [27]:
hierarchy = json.load(open('hierarchy.json'))
type(hierarchy)
type(hierarchy)


Out[27]:
dict

In [28]:
G = nx.DiGraph()
for n, n_props in hierarchy['nodes']:
    G.add_node(n)

In [29]:
for u, v, edge_props in hierarchy['edges']:
    G.add_edge(u,v)

In [30]:
def region_containing(country):
    parents = G.predecessors(country)
    assert len(parents) == 1
    return parents[0]

df['region'] = df.area.map(region_containing)

In [31]:
df['super-region'] = df.region.map(region_containing)

In [32]:
df['super-region']


Out[32]:
0     super-region_0
1     super-region_0
2     super-region_0
3     super-region_0
4     super-region_0
5     super-region_2
6     super-region_0
7     super-region_0
8     super-region_0
9     super-region_0
10    super-region_1
11    super-region_1
12    super-region_1
13    super-region_1
14    super-region_1
...
2569    all
2570    all
2571    all
2572    all
2573    all
2574    all
2575    all
2576    all
2577    all
2578    all
2579    all
2580    all
2581    all
2582    all
2583    all
Name: super-region, Length: 2584, dtype: object
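The parent-lookup idea above can be reproduced on a self-contained toy hierarchy (the node names below are made up; also note that in networkx 2.x `G.predecessors` returns an iterator rather than a list, so it needs a `list()` wrapper):

```python
import networkx as nx

# toy geographic hierarchy: edges point from parent to child
G = nx.DiGraph()
G.add_edges_from([('all', 'super-region_0'),
                  ('super-region_0', 'region_a'),
                  ('region_a', 'ARG')])

def region_containing(node):
    parents = list(G.predecessors(node))   # list() for networkx 2.x compatibility
    assert len(parents) == 1               # a tree: every node has exactly one parent
    return parents[0]

print(region_containing('ARG'))                      # the region containing the country
print(region_containing(region_containing('ARG')))   # climb one more level up
```

The assertion is doing real work here: it guarantees the hierarchy is a tree, so mapping `region_containing` over a column is unambiguous.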

Challenge: do a one-hot encoding of region and super-region.


In [35]:
for r in df.region.unique():
    print 'adding column', r
    df[r] = (df.region == r)


adding column latin_america_southern
adding column australasia
adding column sub-saharan_africa_east
adding column europe_western
adding column europe_central
adding column latin_america_tropical
adding column asia_south
adding column sub-saharan_africa_central
adding column north_america_high_income
adding column asia_east
adding column sub-saharan_africa_west
adding column north_africa_middle_east
adding column caribbean
adding column asia_southeast
adding column asia_pacific_high_income
adding column asia_central
adding column latin_america_central
adding column oceania
adding column europe_eastern
adding column sub-saharan_africa_southern
adding column super-region_0
adding column super-region_5
adding column super-region_6
adding column super-region_1
adding column super-region_2
adding column super-region_4
adding column super-region_3

In [37]:
for r in df['super-region'].unique():
    print 'adding column', r
    df[r] = (df['super-region'] == r)


adding column super-region_0
adding column super-region_2
adding column super-region_1
adding column super-region_6
adding column super-region_4
adding column super-region_5
adding column super-region_3
adding column all

This "hierarchical-hot" might be good for ICD codes, too. Someone should check...
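The challenge above can also be solved without loops; a sketch with `pd.get_dummies` on both hierarchy levels at once (toy labels standing in for the real region names):

```python
import pandas as pd

# toy stand-in for the region / super-region columns built above
df = pd.DataFrame({'region': ['australasia', 'caribbean', 'australasia'],
                   'super-region': ['super-region_0', 'super-region_1', 'super-region_0']})

# one-hot both hierarchy levels; the column-name prefixes keep the levels distinguishable
dummies = pd.get_dummies(df, columns=['region', 'super-region'])
print(dummies.columns.tolist())
```

Encoding every level of the hierarchy this way is exactly the "hierarchical-hot" idea: a row gets one active dummy per level, so a model can pool strength at whichever level of the hierarchy the data supports.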