In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline
sns.set('notebook')
sns.set_style('whitegrid')

Import GRIMM data

Convert epoch to date-times and visualize relationships in data


In [5]:
# Column names applied to the GRIMM export, in file order.
grimm_columns = ['EpochTime', 'Count', 'PM1', 'PM10', 'PM2.5']

# `header` must be an int or None: current pandas raises TypeError for a bool.
# header=0 keeps what older pandas did with header=False (False == 0): the first
# line of the file is consumed as a header row and replaced by `names`.
# NOTE(review): if Grimm.csv has no header line, use header=None instead -- confirm against the file.
grimm = pd.read_csv('data/humexp/Grimm.csv', index_col='EpochTime', header=0, names=grimm_columns)

# EpochTime holds seconds since the Unix epoch; let pandas do the unit conversion
# instead of scaling to nanoseconds through floats and a platform-dependent int cast.
# `.values` drops the index name so the frame displays exactly as before.
grimm.index = pd.to_datetime(grimm.index.values, unit='s')

In [6]:
# Preview the first five GRIMM rows to check the parsed columns and timestamps.
grimm.head(n=5)


Out[6]:
Count PM1 PM10 PM2.5
2015-06-30 15:24:21 130450 10.5 10.5 10.5
2015-06-30 15:24:27 128050 10.0 17.2 10.6
2015-06-30 15:24:33 126550 10.0 14.9 11.5
2015-06-30 15:24:39 124700 10.0 10.7 10.7
2015-06-30 15:24:45 126700 9.7 15.2 10.5

In [7]:
# Scatter-matrix of every pair of GRIMM columns.
sns.pairplot(data=grimm)


Out[7]:
<seaborn.axisgrid.PairGrid at 0x109c72590>

Import Speck data

Drop extra rows and convert epoch timestamps to date-times


In [8]:
# Column names applied to both Speck exports, in file order.
SPECK_COLUMNS = ['EpochTime', 'Humidity', 'Concentration', 'Count', 'Raw', 'Temp']


def _load_speck(path, skip_rows):
    """Read one Speck CSV export and index it by timestamp.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    skip_rows : int
        Number of leading data rows to discard after parsing.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, indexed by a DatetimeIndex built from EpochTime.
    """
    # `header` must be an int or None: current pandas raises TypeError for a bool.
    # header=0 keeps what older pandas did with header=False (False == 0): the
    # first line is consumed as a header row and replaced by `names`.
    # NOTE(review): if the exports have no header line, use header=None -- confirm against the files.
    frame = pd.read_csv(path, index_col='EpochTime', header=0, names=SPECK_COLUMNS)
    frame = frame.iloc[skip_rows:]
    # EpochTime holds seconds since the Unix epoch; `.values` drops the index name.
    frame.index = pd.to_datetime(frame.index.values, unit='s')
    return frame


# The two files need a different number of leading rows removed.
# NOTE(review): the reason for 2 vs 1 is not recorded in this notebook.
speck1 = _load_speck('data/humexp/Speck1.csv', skip_rows=2)
speck2 = _load_speck('data/humexp/Speck2.csv', skip_rows=1)

In [9]:
# Preview the first five Speck 1 rows to check the parsed columns and timestamps.
speck1.head(n=5)


Out[9]:
Humidity Concentration Count Raw Temp
2015-06-30 15:24:24 93 1.9 49 11 0.1
2015-06-30 15:25:24 81 2.6 66 19 3.1
2015-06-30 15:26:24 73 4.1 102 59 5.5
2015-06-30 15:27:24 67 5.3 133 4 7.3
2015-06-30 15:28:24 63 5.1 127 20 8.6

Resample data to common interval of 1 minute


In [10]:
# Average every instrument into 1-minute bins, then drop bins with missing values.
# Current pandas returns a lazy Resampler from .resample(), so the aggregation
# (the mean, which older pandas applied implicitly) has to be requested explicitly
# before dropna() can be called.
speck1 = speck1.resample('1min').mean().dropna()
speck2 = speck2.resample('1min').mean().dropna()
grimm = grimm.resample('1min').mean().dropna()

In [11]:
# Pairwise comparison of the three concentration series.
# Each entry: (x label, x series, y label, y series).
concentration_pairs = [
    ('Speck 1 concentration', speck1['Concentration'], 'Speck 2 concentration', speck2['Concentration']),
    ('Speck 1 concentration', speck1['Concentration'], 'Grimm PM2.5', grimm['PM2.5']),
    ('Speck 2 concentration', speck2['Concentration'], 'Grimm PM2.5', grimm['PM2.5']),
]

for x_label, x_series, y_label, y_series in concentration_pairs:
    # x and y are passed by keyword: current seaborn no longer accepts them positionally.
    # The series are compared by row position (.values), as in the original analysis.
    joint = sns.jointplot(x=x_series.values, y=y_series.values)
    # .values strips the column names, so label the axes explicitly.
    joint.set_axis_labels(x_label, y_label)


Out[11]:
<seaborn.axisgrid.JointGrid at 0x10aa27bd0>

In [12]:
# Two side-by-side panels: particle readings on the left, humidity on the right.
fig, (ax_pm, ax_humidity) = plt.subplots(1, 2)

# Left panel: reference instrument plus both Specks, labelled so the traces can be told apart.
ax_pm.plot(grimm['PM2.5'], label='Grimm PM2.5')
ax_pm.plot(speck1['Concentration'], alpha=0.8, label='Speck 1 concentration')
ax_pm.plot(speck2['Concentration'], alpha=0.8, label='Speck 2 concentration')
ax_pm.set_title('Particle readings')
ax_pm.legend()

# Right panel: humidity reported by each Speck.
ax_humidity.plot(speck1['Humidity'], label='Speck 1')
ax_humidity.plot(speck2['Humidity'], label='Speck 2')
ax_humidity.set_title('Humidity')
ax_humidity.legend()

# Slant the timestamp tick labels so they do not overlap.
fig.autofmt_xdate()


Out[12]:
[<matplotlib.lines.Line2D at 0x11166dd50>]

Learning a better fit to PM2.5


In [13]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, LinearRegression

Compare two predictors: the SVM may overfit the training data, whereas ridge regression should not be able to overfit when $d \ll n$ (far fewer features $d$ than training samples $n$).


In [14]:
# Candidate regressors, keyed by the label used in plots and score printouts.
# Both pipelines standardise the inputs before fitting.
predictors = {
    # Ridge regression on degree-2 polynomial combinations of the standardised features.
    'Ridge': make_pipeline(StandardScaler(), PolynomialFeatures(2), Ridge()),
    # Support vector regression with an RBF kernel. `degree` is left out on purpose:
    # it only applies to the polynomial kernel and has no effect with 'rbf'.
    'RBF SVM': make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1e4, epsilon=1)),
}

# Note: the RBF parameters were not tuned with a validation set, but with the test set.
# This is more of an exploration and is not suitable for publication.

In [15]:
# Pair each Speck 1 row with the GRIMM PM2.5 reading from the same timestamp.
# An inner join on the index keeps only timestamps present in both frames, so
# features and targets cannot drift out of step if dropna() removed different
# bins from each instrument. When the two indexes already match, this yields
# exactly the rows the positional slicing used before.
paired = speck1.join(grimm['PM2.5'], how='inner')
features = paired[speck1.columns]
target = paired['PM2.5']

# Chronological split: the first n_train rows train the models, the rest test them.
# (An interleaved even/odd split is the obvious alternative.)
n_train = 500
X = features.iloc[:n_train].values
y = target.iloc[:n_train]
testX = features.iloc[n_train:].values
testy = target.iloc[n_train:]

# Fit every candidate on the training rows and keep its predictions for the test rows.
results = {}
for label, regressor in predictors.items():
    regressor.fit(X, y)
    results[label] = regressor.predict(testX)

In [16]:
plt.subplot(111)

# Plot the reference and every prediction against the same timestamps. Passing
# only the y-values would put the Series on a datetime axis and the raw
# prediction arrays on a 0..n-1 axis under current matplotlib.
test_times = testy.index
plt.plot(test_times, testy.values, label='Grimm')
for label in results:
    plt.plot(test_times, results[label], label=label, alpha=0.7)
plt.legend()


Out[16]:
<matplotlib.legend.Legend at 0x11172b0d0>

In [ ]:
# Report each model's score() on the rows it was actually fitted on (X, y),
# so the printed heading matches the data being scored.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('Training data fit scores')
for label in predictors:
    print(label + ' ' + str(predictors[label].score(X, y)))

For each feature (polynomial combination of the input features), what is the respective weight in the ridge regressor?


In [ ]:
# Pipeline layout for 'Ridge': step 0 scaler, step 1 polynomial expansion, step 2 ridge.
ridge_pipeline = predictors['Ridge']
_, poly_step = ridge_pipeline.steps[1]
_, ridge_step = ridge_pipeline.steps[2]

# Input column order; each powers_ row lists one exponent per input column.
print(speck1.columns)
# Pair every polynomial term with its fitted ridge weight. list() materialises
# the pairs so they are printed on Python 3 as well, where zip() is lazy.
print(list(zip(poly_step.powers_, ridge_step.coef_)))