In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn import linear_model

# Input/output file paths (relative to the notebook's working directory).
# NOTE(review): Out[2]/Out[10] show these are molecule-feature tables with a
# 'gap' regression target and a Kaggle-style Id/Prediction submission format.
train_filename = 'train.csv'
test_filename = 'test.csv'
pred_filename = 'regression.csv'


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-08e94d8e4ad4> in <module>()
      1 import csv
----> 2 import numpy as np
      3 import pandas as pd
      4 from sklearn import linear_model
      5 

ImportError: No module named numpy

In [2]:
# read in the data
df = pd.read_csv(train_filename, header = 0)
df.head(10)


Out[2]:
smiles feat_001 feat_002 feat_003 feat_004 feat_005 feat_006 feat_007 feat_008 feat_009 ... feat_248 feat_249 feat_250 feat_251 feat_252 feat_253 feat_254 feat_255 feat_256 gap
0 c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n... 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.19
1 C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si... 1 0 0 0 1 0 1 0 0 ... 1 0 0 1 0 0 0 0 0 1.60
2 [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 1 0 0 0 0 1.49
3 [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 1 0 0 0 0 1.36
4 c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.98
5 C1=Cc2cnc3cc4cc(-c5scc6[nH]ccc56)c5ccccc5c4cc3... 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.81
6 c1ncc(s1)-c1cnc2c(c1)oc1c2ccc2ccccc12 0 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 2.91
7 c1sc(-c2ccc3c(c2)sc2c3c3=CCC=c3c3cccnc23)c2[se... 1 0 0 0 1 0 1 0 0 ... 1 0 0 0 0 0 0 0 0 2.17
8 c1ccc(o1)-c1cc2cc3cc4c5c[nH]cc5ccc4cc3cc2o1 0 0 0 0 1 1 1 0 0 ... 1 0 0 0 0 0 0 0 0 2.19
9 [nH]1ccc2c3c[nH]cc3c3cc(-c4cncs4)c4=CCC=c4c3c12 1 0 0 0 1 1 1 0 0 ... 1 0 0 0 0 0 0 0 0 1.71

10 rows × 258 columns


In [3]:
# subset it so we only work with positive variance variables
posVarColNames = [colName for colName in df.columns.values if colName != "smiles" and np.var(df[colName]) > 0]
df = df.reindex(columns=posVarColNames)

In [4]:
# Separate the regression target from the feature matrix.
t = df.gap
# Reassign instead of inplace=True: no performance benefit to inplace, and
# reassignment avoids pandas' hidden-state/inplace pitfalls. Resulting 'df'
# (features only, no 'gap') is identical either way for downstream cells.
df = df.drop('gap', axis=1)
df.head(10)


Out[4]:
feat_001 feat_005 feat_006 feat_007 feat_025 feat_037 feat_044 feat_068 feat_069 feat_072 ... feat_199 feat_200 feat_208 feat_218 feat_225 feat_226 feat_243 feat_248 feat_251 feat_252
0 0 1 0 1 0 0 0 1 0 1 ... 0 0 0 0 0 1 0 1 0 0
1 1 1 0 1 1 1 0 0 1 1 ... 0 0 0 1 0 1 0 1 1 0
2 1 1 1 1 0 1 0 1 1 1 ... 0 0 0 0 0 1 1 1 0 1
3 1 1 1 1 1 0 0 1 0 0 ... 0 0 0 1 0 1 1 1 0 1
4 0 1 0 1 0 0 0 0 0 1 ... 0 0 1 0 0 1 0 1 0 0
5 1 1 1 1 0 1 0 1 1 1 ... 0 0 0 0 1 1 1 1 0 0
6 0 1 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 0 0
7 1 1 0 1 1 0 0 1 0 1 ... 0 0 0 1 0 1 0 1 0 0
8 0 1 1 1 0 0 0 1 0 1 ... 0 0 0 0 1 1 1 1 0 0
9 1 1 1 1 1 0 0 0 0 0 ... 0 0 0 1 1 1 1 1 0 0

10 rows × 31 columns


In [5]:
# NOTE(review): 'training' is an alias for 'df', not a copy — mutating one
# mutates the other. Use df.copy() here if independent objects are needed.
training = df
training.head(10)


Out[5]:
feat_001 feat_005 feat_006 feat_007 feat_025 feat_037 feat_044 feat_068 feat_069 feat_072 ... feat_199 feat_200 feat_208 feat_218 feat_225 feat_226 feat_243 feat_248 feat_251 feat_252
0 0 1 0 1 0 0 0 1 0 1 ... 0 0 0 0 0 1 0 1 0 0
1 1 1 0 1 1 1 0 0 1 1 ... 0 0 0 1 0 1 0 1 1 0
2 1 1 1 1 0 1 0 1 1 1 ... 0 0 0 0 0 1 1 1 0 1
3 1 1 1 1 1 0 0 1 0 0 ... 0 0 0 1 0 1 1 1 0 1
4 0 1 0 1 0 0 0 0 0 1 ... 0 0 1 0 0 1 0 1 0 0
5 1 1 1 1 0 1 0 1 1 1 ... 0 0 0 0 1 1 1 1 0 0
6 0 1 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 0 0
7 1 1 0 1 1 0 0 1 0 1 ... 0 0 0 1 0 1 0 1 0 0
8 0 1 1 1 0 0 0 1 0 1 ... 0 0 0 0 1 1 1 1 0 0
9 1 1 1 1 1 0 0 0 0 0 ... 0 0 0 1 1 1 1 1 0 0

10 rows × 31 columns


In [6]:
# trying out a Lasso Regression
from sklearn import linear_model

alpha = 0.001 # our initial guess for an Alpha Value
fit_intercept = False # the data is NOT normalized
positive = False # force the coefficients to be positive

model = linear_model.Lasso(alpha=alpha, fit_intercept=fit_intercept, positive=positive)

In [7]:
model.fit(training, t)


Out[7]:
Lasso(alpha=0.001, copy_X=True, fit_intercept=False, max_iter=1000,
   normalize=False, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)

In [8]:
# Inspect the fitted model. In the original cell only the last expression
# (model.coef_) was rendered — the intercept and params were evaluated and
# silently discarded. Print them so all three are actually visible.
print('intercept:', model.intercept_)
print('params:', model.get_params())
model.coef_


Out[8]:
array([ 0.        ,  2.26156039, -0.        ,  0.01639207, -0.07901493,
       -0.        ,  0.11430998, -0.20341926, -0.01440025, -0.00440086,
       -0.04695871, -0.0702192 , -0.34950767, -0.41398594, -0.0184991 ,
        0.02382125,  0.03495369, -0.        , -0.        , -0.07496057,
       -0.17149325,  0.        ,  0.        ,  0.16853748, -0.        ,
       -0.12063423,  0.17092502, -0.        , -0.03176   , -0.25877695,
       -0.00619749])

In [9]:
model.score(training, t)


Out[9]:
0.45501084526935043

In [10]:
# read in the test data
td = pd.read_csv(test_filename, header = 0)
td.head(10)


Out[10]:
Id smiles feat_001 feat_002 feat_003 feat_004 feat_005 feat_006 feat_007 feat_008 ... feat_247 feat_248 feat_249 feat_250 feat_251 feat_252 feat_253 feat_254 feat_255 feat_256
0 1 c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n... 0 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
1 2 [nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1 0 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
2 3 [nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4... 1 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
3 4 [nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c... 1 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
4 5 c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c... 0 0 0 0 1 0 1 0 ... 0 1 0 0 0 0 0 0 0 0
5 6 c1cc2sc3c4[SiH2]C(=Cc4ncc3c2s1)c1cccnc1 1 0 0 0 1 0 1 0 ... 0 1 0 0 1 0 0 0 0 0
6 7 [nH]1cccc1-c1cc2c3c[nH]cc3c3c4CC=Cc4[nH]c3c2c2... 1 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0
7 8 C1=CC=C([SiH2]1)c1cc2ncc3c4cnccc4c4=CCC=c4c3c2... 1 0 0 0 1 0 1 0 ... 0 1 0 0 1 0 0 0 0 0
8 9 c1sc(-c2sc(-c3ccc(cc3)-c3scc4nccnc34)c3cc[se]c... 0 0 0 0 1 0 1 0 ... 0 1 0 0 0 0 0 0 0 0
9 10 [nH]1c(cc2c3c[nH]cc3c3c4occc4ncc3c12)-c1scc2[n... 0 0 0 0 1 1 1 0 ... 0 1 0 0 0 0 0 0 0 0

10 rows × 258 columns


In [11]:
# we only keep the features we care about here
td = td.reindex(columns=posVarColNames)

In [12]:
# Remove the all-NaN 'gap' column that reindexing introduced (the test set has
# no target). Reassign rather than using inplace=True — same result, but avoids
# the pandas inplace-mutation anti-pattern.
td = td.drop('gap', axis=1)
td.head(10)


Out[12]:
feat_001 feat_005 feat_006 feat_007 feat_025 feat_037 feat_044 feat_068 feat_069 feat_072 ... feat_199 feat_200 feat_208 feat_218 feat_225 feat_226 feat_243 feat_248 feat_251 feat_252
0 0 1 1 1 0 0 0 1 0 1 ... 0 0 0 0 1 1 1 1 0 0
1 0 1 1 1 0 0 0 1 0 1 ... 0 0 0 0 1 1 1 1 0 0
2 1 1 1 1 1 0 0 0 0 1 ... 0 0 0 1 0 1 1 1 0 0
3 1 1 1 1 1 0 0 1 0 1 ... 0 0 0 1 0 1 1 1 0 0
4 0 1 0 1 0 0 0 1 0 1 ... 0 0 0 0 0 1 0 1 0 0
5 1 1 0 1 0 1 0 0 1 1 ... 0 0 0 0 0 1 0 1 1 0
6 1 1 1 1 0 1 0 0 1 0 ... 0 0 0 0 1 1 1 1 0 0
7 1 1 0 1 1 0 0 0 0 1 ... 0 0 0 1 0 1 0 1 1 0
8 0 1 0 1 0 0 0 1 0 1 ... 0 0 0 0 0 1 0 1 0 0
9 0 1 1 1 0 0 0 1 0 1 ... 0 0 0 0 1 1 1 1 0 0

10 rows × 31 columns


In [13]:
# Predict 'gap' for every test molecule; len() confirms one prediction per row.
predictions = model.predict(td)
len(predictions)


Out[13]:
824230

In [14]:
len(np.unique(predictions))


Out[14]:
3599

In [15]:
# write prediction file
with open(pred_filename, 'w') as pred_fh:

    # Produce a CSV file.
    pred_csv = csv.writer(pred_fh, delimiter=',', quotechar='"')

    # Write the header row.
    pred_csv.writerow(['Id', 'Prediction'])

    i = 1
    for prediction in predictions:
        pred_csv.writerow([i, prediction])
        i += 1

In [16]:
pd_file = pd.read_csv(pred_filename, header = 0)

In [17]:
len(predictions)


Out[17]:
824230

In [86]:
pd_file.head(10)


Out[86]:
Id Prediction
0 NaN NaN
1 1 1.839849
2 NaN NaN
3 2 1.839849
4 NaN NaN
5 3 1.569478
6 NaN NaN
7 4 1.800124
8 NaN NaN
9 5 2.100033

In [ ]:
import rdkit