In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn import linear_model
train_filename = 'train.csv'
test_filename = 'test.csv'
pred_filename = 'regression.csv'
In [2]:
# read in the data
df = pd.read_csv(train_filename, header=0)
df.head(10)
Out[2]:
In [3]:
# subset it so we only work with positive-variance variables
posVarColNames = [colName for colName in df.columns.values if colName != "smiles" and np.var(df[colName]) > 0]
df = df.reindex(columns=posVarColNames)
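The same filtering can also be expressed with scikit-learn's VarianceThreshold selector. A minimal sketch follows (it re-reads the raw training file, since df has already been reindexed above; the names raw, numeric, selector, and kept_cols are illustrative only):
from sklearn.feature_selection import VarianceThreshold

raw = pd.read_csv(train_filename, header=0)
numeric = raw.drop('smiles', axis=1)                 # VarianceThreshold needs numeric data
selector = VarianceThreshold(threshold=0.0)          # drops constant (zero-variance) columns
selector.fit(numeric)
kept_cols = numeric.columns[selector.get_support()]  # names of the surviving features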
In [4]:
t = df.gap
df.drop('gap', axis=1, inplace=True)
df.head(10)
Out[4]:
In [5]:
training = df
training.head(10)
Out[5]:
In [6]:
# trying out a Lasso Regression
from sklearn import linear_model
alpha = 0.001  # our initial guess for the regularization strength
fit_intercept = False  # do not fit an intercept term
positive = False  # do not force the coefficients to be positive
model = linear_model.Lasso(alpha=alpha, fit_intercept=fit_intercept, positive=positive)
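Since the alpha above is only a guess, one option is to let scikit-learn choose it by cross-validation with LassoCV. This is a sketch only, not the model actually fit below; the alpha grid and the fold count are arbitrary illustrative choices:
from sklearn.linear_model import LassoCV

# Search a log-spaced grid of alphas with 5-fold cross-validation.
cv_model = LassoCV(alphas=np.logspace(-4, 0, 20), fit_intercept=False, cv=5)
cv_model.fit(training, t)
cv_model.alpha_  # the alpha selected by cross-validation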
In [7]:
model.fit(training, t)
Out[7]:
In [8]:
model.intercept_  # 0.0 here, since fit_intercept=False
model.get_params()
model.coef_  # only this last expression is echoed by the notebook
Out[8]:
In [9]:
model.score(training, t)
Out[9]:
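Lasso.score returns the coefficient of determination (R squared) on the training data. A complementary sanity check is the training RMSE; a minimal sketch (train_preds is an illustrative name):
# Root-mean-squared error of the fitted model on the training set.
train_preds = model.predict(training)
np.sqrt(np.mean((train_preds - t) ** 2))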
In [10]:
# read in the test data
td = pd.read_csv(test_filename, header=0)
td.head(10)
Out[10]:
In [11]:
# keep only the positive-variance features selected from the training data
td = td.reindex(columns=posVarColNames)
In [12]:
td.drop('gap', axis=1, inplace=True)
td.head(10)
Out[12]:
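Before predicting, it is worth confirming that the test frame now has exactly the same columns, in the same order, as the training frame; a quick sketch:
# Sanity check: the fitted model expects the test columns to match the training columns.
assert list(td.columns) == list(training.columns), "train/test feature mismatch"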
In [13]:
predictions = model.predict(td)
len(predictions)
Out[13]:
In [14]:
len(np.unique(predictions))
Out[14]:
In [15]:
# write prediction file
with open(pred_filename, 'w') as pred_fh:
    # Produce a CSV file.
    pred_csv = csv.writer(pred_fh, delimiter=',', quotechar='"')
    # Write the header row.
    pred_csv.writerow(['Id', 'Prediction'])
    i = 1
    for prediction in predictions:
        pred_csv.writerow([i, prediction])
        i += 1
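The same file can be written more compactly with pandas; an equivalent sketch using the same 1-based Id convention (pred_df is an illustrative name):
# Equivalent using pandas: build a DataFrame and let to_csv handle the formatting.
pred_df = pd.DataFrame({'Id': np.arange(1, len(predictions) + 1),
                        'Prediction': predictions})
pred_df.to_csv(pred_filename, index=False)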
In [16]:
pd_file = pd.read_csv(pred_filename, header=0)
In [17]:
len(predictions)
Out[17]:
In [86]:
pd_file.head(10)
Out[86]:
In [ ]:
import rdkit
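This final, unexecuted cell imports rdkit, presumably to build richer features from the smiles strings dropped above. A minimal sketch of one common approach, Morgan fingerprints, is below; rdkit must be installed, and the fingerprint radius, bit-vector size, and the helper name smiles_to_fingerprint are arbitrary illustrative choices, not values used in this notebook:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def smiles_to_fingerprint(smiles, radius=2, n_bits=256):
    """Convert a SMILES string to a fixed-length Morgan fingerprint vector."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # unparseable SMILES -> all-zero vector
        return np.zeros(n_bits, dtype=int)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Example: featurize every molecule in the raw training file.
# raw = pd.read_csv(train_filename, header=0)
# X_fp = np.vstack([smiles_to_fingerprint(s) for s in raw['smiles']])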