In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [18]:
"""
Read in train and test as pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
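
In [ ]:
"""
Optional: while iterating on features it can be faster to work with a
subsample; nrows is a standard pandas read_csv parameter. Fit the final
models on the full files.
"""
#df_train = pd.read_csv("train.csv", nrows=100000)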

In [19]:
df_train.head()


Out[19]:
smiles feat_001 feat_002 feat_003 feat_004 feat_005 feat_006 feat_007 feat_008 feat_009 ... feat_248 feat_249 feat_250 feat_251 feat_252 feat_253 feat_254 feat_255 feat_256 gap
0 c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n... 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.19
1 C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si... 1 0 0 0 1 0 1 0 0 ... 1 0 0 1 0 0 0 0 0 1.60
2 [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 1 0 0 0 0 1.49
3 [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 1 0 0 0 0 1.36
4 c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.98

5 rows × 258 columns


In [20]:
df_test.head()


Out[20]:
Id smiles feat_001 feat_002 feat_003 feat_004 feat_005 feat_006 feat_007 feat_008 ... feat_247 feat_248 feat_249 feat_250 feat_251 feat_252 feat_253 feat_254 feat_255 feat_256
0 1 c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n... 0 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
1 2 [nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1 0 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
2 3 [nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4... 1 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
3 4 [nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c... 1 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
4 5 c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c... 0 0 0 0 1 0 1 0 ... 0 1 0 0 0 0 0 0 0 0

5 rows × 258 columns


In [21]:
#store gap values
Y_train = df_train.gap.values
#number of training rows: marks where the test examples start once train and test are concatenated
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [50]:
#Combine train and test into a single DataFrame so feature engineering can be applied to both at once
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()


Out[50]:
smiles feat_001 feat_002 feat_003 feat_004 feat_005 feat_006 feat_007 feat_008 feat_009 ... feat_247 feat_248 feat_249 feat_250 feat_251 feat_252 feat_253 feat_254 feat_255 feat_256
0 c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n... 0 0 0 0 1 0 1 0 0 ... 0 1 0 0 0 0 0 0 0 0
1 C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si... 1 0 0 0 1 0 1 0 0 ... 0 1 0 0 1 0 0 0 0 0
2 [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2... 1 0 0 0 1 1 1 0 0 ... 0 1 0 0 0 1 0 0 0 0
3 [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13... 1 0 0 0 1 1 1 0 0 ... 0 1 0 0 0 1 0 0 0 0
4 c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1 0 0 0 0 1 0 1 0 0 ... 0 1 0 0 0 0 0 0 0 0

5 rows × 257 columns
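
In [ ]:
"""
Note: concat keeps each frame's original index, so df_all now has
duplicate row labels. The positional slicing with test_idx below is
unaffected, but index-aligned assignments can misbehave, so resetting
the index is a cheap guard.
"""
#df_all = df_all.reset_index(drop=True)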


In [45]:
"""
Example Feature Engineering

This calculates the length of each SMILES string and adds a feature column with those lengths.
Note: this is NOT a good feature and will result in a lower score!
"""
#using .str.len() keeps the result aligned with df_all's own index (the
#concatenated frame has duplicate labels, so index-aligned assignment of
#a freshly-built DataFrame would misalign)
#df_all['smiles_len'] = df_all.smiles.astype(str).str.len()
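
In [ ]:
"""
A sketch of richer feature engineering using RDKit (a separate
dependency, not imported above): parse each SMILES string into a
molecule and derive simple descriptors. Untested here, and slow over
~1.8M rows, so treat it as a starting point rather than a drop-in step.
"""
#from rdkit import Chem
#from rdkit.Chem import Descriptors
#
#mols = df_all.smiles.apply(Chem.MolFromSmiles)
#df_all['mol_wt'] = [Descriptors.MolWt(m) if m is not None else 0.0 for m in mols]
#df_all['num_atoms'] = [m.GetNumAtoms() if m is not None else 0 for m in mols]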

In [51]:
#Drop the 'smiles' column
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape
print "Test features:", X_test.shape


Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)

In [38]:
#linear regression baseline on the 256 features
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

In [47]:
#random forest with default hyperparameters; n_estimators and n_jobs are worth tuning at this scale
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)
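
In [ ]:
"""
Optional: estimate generalization error with cross-validation before
writing out predictions. A minimal sketch on a random subsample (the
full 1M-row training set makes this slow, especially for the forest);
cross_val_score is standard scikit-learn.
"""
#from sklearn.model_selection import cross_val_score
#
#rng = np.random.RandomState(0)
#idx = rng.choice(X_train.shape[0], 10000, replace=False)
#for name, model in [("LR", LinearRegression()), ("RF", RandomForestRegressor())]:
#    mse = -cross_val_score(model, X_train[idx], Y_train[idx],
#                           scoring="neg_mean_squared_error", cv=5)
#    print(name, "CV RMSE:", np.sqrt(mse).mean())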

In [40]:
def write_to_file(filename, predictions):
    #write predictions in the "Id,Prediction" submission format, with 1-indexed Ids
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i, p in enumerate(predictions):
            f.write(str(i + 1) + "," + str(p) + "\n")
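
In [ ]:
"""
An equivalent writer using pandas, for reference; write_to_file_pandas
is just an illustrative name, and to_csv handles the formatting.
"""
#def write_to_file_pandas(filename, predictions):
#    out = pd.DataFrame({"Id": np.arange(1, len(predictions) + 1),
#                        "Prediction": predictions})
#    out.to_csv(filename, index=False)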

In [48]:
write_to_file("sample1.csv", LR_pred)
write_to_file("sample2.csv", RF_pred)