In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
In [3]:
data = pd.read_csv('HD_data.csv')
In [4]:
data.head(5)
Out[4]:
In [5]:
data.count()
Out[5]:
In [6]:
# Take columns 2..308 (positional) of the loaded table as the compound feature matrix.
# NOTE(review): presumably these are the ligand descriptors — confirm against the CSV header.
compound = data.iloc[:, 2:309]
In [7]:
compound.count()
Out[7]:
In [8]:
from sklearn.decomposition import PCA

# Two-component PCA of the compound features; `fit` hands back the fitted
# estimator, so construction and fitting are chained into one expression.
pca = PCA(n_components=2).fit(compound)

# Variance captured by each component, then the component loadings.
print(pca.explained_variance_)
print(pca.components_)
In [9]:
# Line plot of the explained-variance ratio of each fitted PCA component.
# The values plotted are ratios as reported by `explained_variance_ratio_`.
plt.title("Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
# Trailing semicolon suppresses the textual repr of the returned line objects.
plt.plot(pca.explained_variance_ratio_);
In [10]:
# Running total of the explained-variance ratios across the fitted components.
plt.title("Cumulated Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
# np.cumsum accumulates the per-component ratios; the semicolon hides the repr.
plt.plot(np.cumsum(pca.explained_variance_ratio_));
In [11]:
# The first column of the table is used as the classification target.
# (`compound` is already defined above; the former `compound = compound`
# self-assignment did nothing and has been dropped.)
activity = data.iloc[:, 0]
In [12]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only when running on a very old installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
compound_train, compound_test, activity_train, activity_test = train_test_split(
    compound, activity, test_size = 0.20, random_state = 0)
print("train data shape: %r, train target shape: %r"
      % (compound_train.shape, activity_train.shape))
print("test data shape: %r, test target shape: %r"
      % (compound_test.shape, activity_test.shape))
In [13]:
# Fit a logistic-regression classifier (C=1) on the compound training split.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
# As the last expression of the cell, the fitted estimator's repr is displayed.
logreg.fit(compound_train, activity_train)
Out[13]:
In [14]:
activity_predicted = logreg.predict(compound_test)
In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(activity_test, activity_predicted)
Out[15]:
In [16]:
compound_descriptors = compound.columns
compound_descriptors
Out[16]:
In [17]:
logreg.coef_
Out[17]:
In [18]:
# One bar per compound descriptor showing its logistic-regression coefficient.
x = np.arange(len(compound_descriptors))
# Centre the bars on their x position explicitly so that the tick labels line
# up with the bars regardless of the matplotlib version's default alignment
# (the previous `x + 0.5` offset only approximated one particular default).
plt.bar(x, logreg.coef_.ravel(), align='center')
plt.xticks(x, compound_descriptors, rotation = 30);
In [19]:
# Confusion matrix of the held-out labels against the model's predictions,
# printed as a raw count array.
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(activity_test, activity_predicted)
print(cm)
In [20]:
def plot_confusion(cm, activity_names = ['Active', 'Inactive'],
                   title = 'Confusion matrix'):
    """Draw a confusion matrix as a blue heat map with a colour bar.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix handed to ``plt.imshow``.
    activity_names : sequence of str
        Tick labels used on both axes, one per class.
    title : str
        Figure title.
    """
    # One tick position per class name, shared by both axes.
    positions = np.arange(len(activity_names))

    plt.imshow(cm, cmap=plt.cm.Blues, interpolation='nearest')
    plt.title(title)
    plt.colorbar()

    plt.yticks(positions, activity_names)
    plt.xticks(positions, activity_names, rotation=60)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.tight_layout()


plot_confusion(cm)
In [21]:
print(cm)
In [22]:
cm.sum(axis =1)
Out[22]:
In [23]:
# Divide every row by its own total; `keepdims=True` keeps the row sums as a
# column vector so the division broadcasts across each row.
cm_normalized = cm.astype(np.float64) / cm.sum(axis=1, keepdims=True)
print(cm_normalized)
In [24]:
plot_confusion(cm_normalized, title = "Normalized confusion matrix")
In [25]:
from sklearn.metrics import classification_report
# `target_names` are matched positionally to the sorted class labels, so they
# must be listed in that order. 'Active' sorts before 'Inactive'; the previous
# ['Inactive', 'Active'] order swapped the row captions of the report.
# NOTE(review): assumes the target column holds exactly these two string labels.
print(classification_report(activity_test, activity_predicted,
                            target_names = ['Active', 'Inactive']))
In [26]:
# Per-class probability estimates for the compound test split.
activity_predicted_proba = logreg.predict_proba(compound_test)
# Show only the first five rows rather than the whole array.
activity_predicted_proba[:5]
Out[26]:
In [73]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
def plot_roc_curve(activity_test, activity_predicted_proba,
                   pos_label = 'Active', pos_column = 0):
    """Plot the ROC curve of a binary classifier and report its AUC in the legend.

    Parameters
    ----------
    activity_test : array-like
        True labels of the evaluated samples.
    activity_predicted_proba : 2-D array
        Output of ``predict_proba``; one column per class.
    pos_label : label, default 'Active'
        Label treated as the positive class.
    pos_column : int, default 0
        Column of ``activity_predicted_proba`` holding the probability of
        ``pos_label``. scikit-learn orders these columns like the estimator's
        sorted ``classes_``, and 'Active' sorts before 'Inactive', so the
        positive class is column 0. The previous code scored with column 1
        (the other class) while declaring 'Active' positive, which mirrors
        the curve and reports 1 - AUC.
        NOTE(review): assumes the labels are the strings 'Active'/'Inactive';
        pass ``list(model.classes_).index(pos_label)`` if unsure.
    """
    fpr, tpr, _ = roc_curve(activity_test,
                            activity_predicted_proba[:, pos_column],
                            pos_label = pos_label)
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve together with the chance diagonal for reference.
    plt.plot(fpr, tpr, label = 'Area Under Curve = %0.3f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specificity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operation Characteristic for Histone Acetylase')
    plt.legend(loc = 'lower right')
In [28]:
# The score passed to roc_curve must be the probability of the positive class.
# predict_proba columns follow `logreg.classes_`, so look the column up instead
# of hard-coding index 1 (which is not the 'Active' column when the classes
# are sorted alphabetically).
active_column = list(logreg.classes_).index('Active')
fpr, tpr, thresholds = roc_curve(activity_test,
                                 activity_predicted_proba[:, active_column],
                                 pos_label = 'Active')
In [29]:
plot_roc_curve(activity_test, activity_predicted_proba)
In [30]:
# Take columns 311..936 (positional) of the loaded table as the protein feature matrix.
# NOTE(review): presumably these are the protein descriptors — confirm against the CSV header.
protein = data.iloc[:, 311:937]
In [31]:
from sklearn.decomposition import PCA

# Same two-component PCA as before, now on the protein features. This rebinds
# the name `pca`; `fit` returns the estimator, hence the chained call.
pca = PCA(n_components=2).fit(protein)

print(pca.explained_variance_)
print(pca.components_)
In [32]:
# Explained-variance ratio of each PCA component fitted on the protein features.
plt.title("Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
# Trailing semicolon suppresses the textual repr of the returned line objects.
plt.plot(pca.explained_variance_ratio_);
In [33]:
# Running total of the explained-variance ratios for the protein PCA.
# Title typo ("VAriance") corrected so this figure matches the compound one.
plt.title("Cumulated Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
plt.plot(np.cumsum(pca.explained_variance_ratio_));
In [34]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only when running on a very old installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Same 80/20 split and seed as used for the compound features, so the two
# models are evaluated on the same rows. This rebinds activity_train/test.
protein_train, protein_test, activity_train, activity_test = train_test_split(
    protein, activity, test_size = 0.20, random_state = 0)
# Message typo ("trian") corrected.
print("train data shape: %r, train target shape: %r"
      % (protein_train.shape, activity_train.shape))
print("test data shape: %r, test target shape: %r"
      % (protein_test.shape, activity_test.shape))
In [35]:
# Fit a fresh logistic-regression classifier (C = 1) on the protein training
# split. This rebinds `logreg`, replacing the compound-feature model.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression (C = 1)
logreg.fit(protein_train, activity_train)
Out[35]:
In [36]:
activity_predicted = logreg.predict(protein_test)
In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(activity_test, activity_predicted)
Out[37]:
In [38]:
protein_descriptors = protein.columns
protein_descriptors
Out[38]:
In [39]:
# One bar per protein descriptor showing its logistic-regression coefficient.
x = np.arange(len(protein_descriptors))
plt.bar(x, logreg.coef_.ravel(), align='center')
# Ticks go at the bar positions themselves. The previous `x + 20` shifted every
# label 20 bars to the right, attaching descriptor names to the wrong bars.
plt.xticks(x, protein_descriptors, rotation = 30);
In [40]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(activity_test, activity_predicted)
print(cm)
In [41]:
plot_confusion(cm)
In [42]:
# Row-normalise the confusion matrix: each row is divided by its own total,
# kept as a column vector via `keepdims=True` so the division broadcasts.
cm_normalized = cm.astype(np.float64) / cm.sum(axis=1, keepdims=True)
print(cm_normalized)
In [43]:
plot_confusion(cm_normalized, title = "Normalized confusion matrix")
In [44]:
# Per-class precision / recall / F1 for the protein-feature model.
# NOTE(review): target_names are applied positionally to the class labels —
# presumably sorted order, where 'Active' precedes 'Inactive'; confirm against the data.
from sklearn.metrics import classification_report
print(classification_report(activity_test, activity_predicted,
target_names = ['Active', 'Inactive']))
In [45]:
activity_predicted_proba = logreg.predict_proba(protein_test)
activity_predicted_proba[:5]
Out[45]:
In [46]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
plot_roc_curve(activity_test, activity_predicted_proba)
In [61]:
# Rebind `protein` and `compound` to much narrower column slices
# (9 and 8 columns respectively) for the cross-term experiment below.
# NOTE: this replaces the wider frames defined earlier under the same names.
protein = data.iloc[:, 311:320]
compound = data.iloc[:, 2:10]
print(protein.shape)
print(compound.shape)
def cross_terms_ligand_protein(ligand, header_ligand, protein, header_protein):
    """Build all pairwise ligand x protein product features.

    Parameters
    ----------
    ligand : array-like, shape (R, Cl)
        Ligand descriptor matrix, one row per sample.
    header_ligand : sequence
        Names of the ``Cl`` ligand columns.
    protein : array-like, shape (R, Cp)
        Protein descriptor matrix with the same number of rows as ``ligand``.
    header_protein : sequence
        Names of the ``Cp`` protein columns.

    Returns
    -------
    cross : numpy.ndarray of float, shape (R, Cl * Cp)
        Column ``j * Cp + jj`` holds ``ligand[:, j] * protein[:, jj]``.
    headers : list of str
        ``"<ligand name>*<protein name>"`` for each column of ``cross``,
        in the same order.

    Raises
    ------
    ValueError
        If either input is not 2-D or the row counts differ.

    Notes
    -----
    The earlier implementation pre-allocated ``Cl * Cp`` zero columns,
    appended the real products after them and removed only the first column,
    so the result carried ``Cl * Cp - 1`` spurious zero columns and no longer
    matched the header list. The products are now computed in one
    broadcasted multiplication.
    """
    ligand = np.asarray(ligand, dtype=float)
    protein = np.asarray(protein, dtype=float)
    if ligand.ndim != 2 or protein.ndim != 2:
        raise ValueError("ligand and protein must both be 2-D")

    n_rows, n_ligand = ligand.shape
    if protein.shape[0] != n_rows:
        raise ValueError("ligand and protein must have the same number of rows")
    n_protein = protein.shape[1]

    # (R, Cl, 1) * (R, 1, Cp) -> (R, Cl, Cp); flattening the last two axes in
    # C order puts the ligand index outermost, matching the header order below.
    products = ligand[:, :, np.newaxis] * protein[:, np.newaxis, :]
    cross = products.reshape(n_rows, n_ligand * n_protein)

    headers = [str(header_ligand[j]) + '*' + str(header_protein[jj])
               for j in range(n_ligand)
               for jj in range(n_protein)]
    return cross, headers
In [60]:
# Scratch check of the dimensions used by the cross-term builder.
# `ligand` is only created in a later cell, so derive the shape from
# `compound` to keep this cell runnable top-to-bottom on a fresh kernel.
R, Cl = np.shape(compound)
Cross_lp = np.zeros((R, 1))
# Only the last expression of a cell is displayed; the intermediate bare
# expressions had no effect and were dropped.
Cl
Out[60]:
In [63]:
# Column names must be taken from the DataFrames before converting to plain
# arrays; previously `header_ligand` / `header_protein` were never defined
# (their assignments were commented out), so this cell raised NameError on a
# fresh kernel.
header_ligand = list(compound.columns)
header_protein = list(protein.columns)

# Plain NumPy views of the two feature tables. `protein` itself is left as a
# DataFrame so that re-running this cell keeps working.
ligand = np.array(compound)
protein_values = np.array(protein)

cross_terms, header_cross_terms = cross_terms_ligand_protein(
    ligand, header_ligand, protein_values, header_protein)
print(cross_terms.shape)
In [64]:
print(cross_terms.shape)
print(ligand.shape)
# Convert the target to a plain array so it matches the NumPy feature matrix.
activity = np.array(activity)
print(activity.shape)

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only when running on a very old installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Same 80/20 split and seed as the earlier sections.
cross_terms_train, cross_terms_test, activity_train, activity_test = train_test_split(
    cross_terms, activity, test_size = 0.20, random_state = 0)
print("train data shape: %r, train target shape: %r"
      % (cross_terms_train.shape, activity_train.shape))
print("test data shape: %r, test target shape: %r"
      % (cross_terms_test.shape, activity_test.shape))
In [107]:
# Fit a fresh logistic-regression classifier (C = 1) on the cross-term
# training split. This rebinds `logreg`, replacing the previous model.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression (C = 1)
logreg.fit(cross_terms_train, activity_train)
Out[107]:
In [108]:
activity_predicted = logreg.predict(cross_terms_test)
In [109]:
from sklearn.metrics import accuracy_score
accuracy_score(activity_test, activity_predicted)
Out[109]:
In [110]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(activity_test, activity_predicted)
In [111]:
print(cm)
In [112]:
plot_confusion(cm)
In [113]:
activity_predicted_proba = logreg.predict_proba(cross_terms_test)
activity_predicted_proba[:5]
Out[113]:
In [74]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
plot_roc_curve(activity_test, activity_predicted_proba)
In [53]:
len(ligand[0])
Out[53]:
In [51]:
protein
Out[51]:
In [54]:
protein.shape
Out[54]:
In [65]:
print(data)
In [67]:
# pandas DataFrames have no `summary` attribute (that is R's `summary()`);
# `describe()` is the pandas equivalent and returns per-column statistics.
data.describe()
In [68]:
data.count()
Out[68]:
In [68]:
data.plot()
Out[68]:
In [69]:
data.std()
Out[69]:
In [71]:
print(data.std() < 0.1)
In [72]:
%time
data = pd.read_csv('HD_data.csv')
In [ ]: