In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')

In [2]:
# set random seed, for reproducibility
np.random.seed(12345)

Download the Hep C replication archive from http://ghdx.healthdata.org/record/hepatitis-c-prevalence-1990-and-2005-all-gbd-regions and extract input_data.csv.

Or, since the H: drive is preventing me from loading that into Sage Cloud, let's look at the good old weather data from Week 1 of class:


In [4]:
df = pd.read_csv('weather-numeric.csv')
df


Out[4]:
outlook temperature humidity windy play
0 sunny 85 85 False no
1 sunny 80 90 True no
2 overcast 83 86 False yes
3 rainy 70 96 False yes
4 rainy 68 80 False yes
5 rainy 65 70 True no
6 overcast 64 65 True yes
7 sunny 72 95 False no
8 sunny 69 70 False yes
9 rainy 75 80 False yes
10 sunny 75 70 True yes
11 overcast 72 90 True yes
12 overcast 81 75 False yes
13 rainy 71 91 True no

Mapping from categorical data to dummy variables

A.k.a. One-hot encoding (http://en.wikipedia.org/wiki/One-hot):


In [5]:
df.outlook.value_counts()


Out[5]:
rainy       5
sunny       5
overcast    4
dtype: int64

In [6]:
X = np.array(df.filter(['outlook', 'temperature']))
y = np.array(df.play)

In [7]:
import sklearn.svm

In [8]:
clf = sklearn.svm.SVC()
clf.fit(X, y)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-1c49827b6ff0> in <module>()
      1 clf = sklearn.svm.SVC()
----> 2 clf.fit(X, y)

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
    135         self._sparse = sparse and not callable(self.kernel)
    136 
--> 137         X = atleast2d_or_csr(X, dtype=np.float64, order='C')
    138         y = self._validate_targets(y)
    139 

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/utils/validation.py in atleast2d_or_csr(X, dtype, order, copy, force_all_finite)
    163     return _atleast2d_or_sparse(X, dtype, order, copy, sp.csr_matrix,
    164                                 "tocsr", sp.isspmatrix_csr,
--> 165                                 force_all_finite)
    166 
    167 

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/utils/validation.py in _atleast2d_or_sparse(X, dtype, order, copy, sparse_class, convmethod, check_same_type, force_all_finite)
    140     else:
    141         X = array2d(X, dtype=dtype, order=order, copy=copy,
--> 142                     force_all_finite=force_all_finite)
    143     return X
    144 

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/sklearn/utils/validation.py in array2d(X, dtype, order, copy, force_all_finite)
    118         raise TypeError('A sparse matrix was passed, but dense data '
    119                         'is required. Use X.toarray() to convert to dense.')
--> 120     X_2d = np.asarray(np.atleast_2d(X), dtype=dtype, order=order)
    121     if force_all_finite:
    122         _assert_all_finite(X_2d)

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    458 
    459     """
--> 460     return array(a, dtype, copy=False, order=order)
    461 
    462 def asanyarray(a, dtype=None, order=None):

ValueError: could not convert string to float: rainy

What's the problem?


In [ ]:
# SVC is not smart enough to handle strings; it needs all-numeric features

The solution: one-hot encoding.


In [9]:
# can do this manually:
for val in df.outlook.unique():
    print 'adding column', val
    df[val] = (df.outlook == val)


adding column sunny
adding column overcast
adding column rainy
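
pandas can also build these columns in one call. A minimal sketch, assuming the same df as above; pd.get_dummies makes one 0/1 column per category, and the prefix keeps the new names (outlook_sunny, etc.) from clashing with the columns we just made by hand:


In [ ]:
# alternative: let pandas construct the dummy columns
outlook_dummies = pd.get_dummies(df.outlook, prefix='outlook')
df = df.join(outlook_dummies)
df.filter(like='outlook_').head()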

In [11]:
X = np.array(df.filter(['sunny', 'overcast', 'rainy', 'temperature']))
y = np.array(df.play)

clf = sklearn.svm.SVC()
clf.fit(X, y)


Out[11]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
y_pred = clf.predict(X)
np.mean(y_pred == y)


Out[13]:
0.8571428571428571

Impressed?


In [ ]:
# No: that was in-sample (training) accuracy, not out-of-sample (oos) performance
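
To see how well the classifier does on data it was not trained on, we could cross-validate instead. A minimal sketch; it assumes this sklearn version still has the cross_validation module (newer versions moved cross_val_score to sklearn.model_selection):


In [ ]:
# estimate out-of-sample accuracy by holding out part of the data in each fold
import sklearn.cross_validation

scores = sklearn.cross_validation.cross_val_score(sklearn.svm.SVC(), X, y, cv=3)
print 'accuracy per fold:', scores
print 'mean accuracy:', scores.mean()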

Shifting and scaling numeric variables


In [ ]:
df.temperature

In [ ]:
df.temperature.mean()

In [ ]:
df.temperature.std()

In [ ]:
df['normalized_temp'] = (df.temperature - df.temperature.mean()) / df.temperature.std()

In [ ]:
sns.distplot(df.normalized_temp)
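
scikit-learn has this shift-and-scale operation built in, which is handy because the fitted scaler remembers the training mean and standard deviation and can apply the same transformation to new data later. A minimal sketch, assuming sklearn.preprocessing.StandardScaler is available (the column name normalized_temp_skl is just for illustration; the scaler divides by the population standard deviation, so the values differ slightly from the pandas version above):


In [ ]:
# same idea with scikit-learn's built-in scaler
import sklearn.preprocessing

temp = df.temperature.values.astype(float).reshape(-1, 1)  # sklearn expects a 2-d array
scaler = sklearn.preprocessing.StandardScaler()
df['normalized_temp_skl'] = scaler.fit_transform(temp).ravel()
df[['temperature', 'normalized_temp_skl']].head()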

Interactions


In [14]:
df['hot_and_humid'] = df.temperature * df.humidity

In [15]:
sns.distplot(df.hot_and_humid)


Out[15]:
<matplotlib.axes.AxesSubplot at 0x7f064d9f9e50>

Should we have normalized that somehow? We could do it before or after multiplying...


In [16]:
df.hot_and_humid = (df.hot_and_humid - df.hot_and_humid.mean()) / df.hot_and_humid.std()

sns.distplot(df.hot_and_humid)


Out[16]:
<matplotlib.axes.AxesSubplot at 0x7f064d8eea90>

OR


In [17]:
df['normalized_humidity'] = (df.humidity - df.humidity.mean()) / df.humidity.std()

In [18]:
df.hot_and_humid = df.normalized_temp * df.normalized_humidity
sns.distplot(df.hot_and_humid)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-c0d8cc61c5b0> in <module>()
----> 1 df.hot_and_humid = df.normalized_temp * df.normalized_humidity
      2 sns.distplot(df.hot_and_humid)

/usr/local/sage/sage-6.4/local/lib/python2.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   1934                 return self[name]
   1935             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1936                                  (type(self).__name__, name))
   1937 
   1938     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'normalized_temp'

(The normalized_temp cells above were never executed, so that column doesn't exist yet; run them first, then re-run this cell.)

There are fancier transformations you can consider, too, and we will perhaps return to them in the Data Transformations week. If you need one for your project, start with the Box-Cox transform: http://en.wikipedia.org/wiki/Power_transform#Box.E2.80.93Cox_transformation
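
For example, scipy provides one implementation. A minimal sketch, assuming a scipy version that includes scipy.stats.boxcox, applied to the humidity column (which is strictly positive, as Box-Cox requires):


In [ ]:
# Box-Cox: estimate a power transform that makes the data more normal-looking
import scipy.stats

transformed, lam = scipy.stats.boxcox(df.humidity.astype(float))
print 'estimated lambda:', lam
sns.distplot(transformed)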

Let's have a look at input_data.csv now


In [19]:
df = pd.read_csv('input_data.csv')
df.head()


Out[19]:
index age_end age_start age_weights area data_type effective_sample_size lower_ci sex standard_error upper_ci value year_end year_start z_bias
0 0 65 18 NaN ARG p 12891 NaN total -99 NaN 0.0071 1992 1991 1
1 1 89 3 NaN ARG p 196 NaN total -99 NaN 0.0000 1993 1993 1
2 2 24 20 NaN AUS p 1348 NaN total -99 NaN 0.0530 1998 1996 1
3 3 39 30 NaN AUS p 1452 NaN total -99 NaN 0.0110 1998 1996 1
4 4 49 15 NaN AUS p 1537 NaN female -99 NaN 0.0111 1995 1995 1

Other fun features you could consider adding, based on the special case you are dealing with:


In [20]:
# age interval contains specific ages:
for a in np.arange(0,81,5):
    df['includes_age_'+str(a)] = 1. * ((df.age_start <= a) & (df.age_end >= a))

In [ ]:
df.filter(like='age').head()

In [ ]:
# geographic hierarchy, dummy coded:
import json, networkx as nx

In [ ]:
hierarchy = json.load(open('hierarchy.json'))
type(hierarchy)

In [ ]:
G = nx.DiGraph()
for n, n_props in hierarchy['nodes']:
    G.add_node(n)

In [ ]:
for u, v, edge_props in hierarchy['edges']:
    G.add_edge(u,v)

In [ ]:
def region_containing(country):
    parents = G.predecessors(country)
    assert len(parents) == 1
    return parents[0]

df['region'] = df.area.map(region_containing)

In [ ]:
df['super-region'] = df.region.map(region_containing)

In [ ]:
df['super-region']

In [ ]:
# challenge: do a one-hot encoding of region and super-region
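
One possible answer to the challenge, as a sketch (assuming the region and super-region columns were filled in successfully above); the prefixes are just to keep the new column names readable:


In [ ]:
# one-hot encode both levels of the geographic hierarchy
region_dummies = pd.get_dummies(df.region, prefix='region')
super_region_dummies = pd.get_dummies(df['super-region'], prefix='super_region')
df = df.join(region_dummies).join(super_region_dummies)
df.filter(like='region_').head()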

This "hierarchical-hot" might be good for ICD codes, too. Someone should check...