In [1]:
import warnings
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load the train/test target files (produced earlier in the project).
ytrain = pd.read_csv("ytrain23.csv")
# NOTE(review): test targets come from an Excel file — confirm sheet/columns
# match the CSV layout of the training targets.
ytest = pd.read_excel("ytest23.xlsx")
In [3]:
# Display the raw training targets.
ytrain
Out[3]:
In [4]:
# Display the raw test targets.
ytest
Out[4]:
In [3]:
# Collapse the 3-valued readmission label to binary by merging class 2 into 0.
# Presumably: 0 = not readmitted, 1 = readmitted <30 days, 2 = readmitted
# >30 days, so the binary task is "<30-day readmission vs everything else"
# — TODO confirm the original encoding.
ytrain2 = ytrain.readmitted_3.replace(2,0)
ytest2 = ytest.readmit_true_3.replace(2,0)
In [7]:
# Persist the binarized targets.
# NOTE(review): no index=False, so the pandas row index is written as an
# extra unnamed column — confirm downstream readers expect that.
ytrain2.to_csv("Send/ytrain.csv")
ytest2.to_csv("Send/ytest.csv")
In [4]:
# Load the full feature matrix (before one-hot encoding).
data = pd.read_csv("full data.csv")
In [5]:
# Report the dataset dimensions: row count, then column count.
n_rows, n_cols = data.shape
print(n_rows)
print(n_cols)
In [6]:
# Distribution of the primary diagnosis category (to pick an imputation value).
data.diag_1.value_counts()
Out[6]:
In [7]:
# Impute missing primary diagnoses with 'Neoplasms' — presumably the most
# frequent category from the value_counts above (TODO confirm).
# Fixed: the original wrapped the Series in pd.DataFrame() before fillna,
# which is redundant (Series.fillna works directly) and assigned a one-column
# DataFrame back into a column.
data.diag_1 = data.diag_1.fillna('Neoplasms')
data.diag_1.value_counts()
Out[7]:
In [8]:
# Impute diag_2/diag_3 the same way as diag_1, then verify no nulls remain
# (both prints should show 0).
# Fixed: removed the redundant pd.DataFrame() wrap around each Series.
data.diag_2 = data.diag_2.fillna('Neoplasms')
data.diag_3 = data.diag_3.fillna('Neoplasms')
print(data.diag_2.isnull().sum())
print(data.diag_3.isnull().sum())
In [9]:
# One-hot encode every categorical column into indicator features.
alldata = pd.get_dummies(data)
In [10]:
# Report the encoded matrix dimensions: rows, then columns.
n_rows, n_cols = alldata.shape
print(n_rows)
print(n_cols)
In [11]:
# Persist the one-hot-encoded matrix.
# Fixed: `alldata` is already a DataFrame — the pd.DataFrame() wrapper was a no-op.
alldata.to_csv("alldata.csv")
In [12]:
# Positional 50,000 / 12,937 train-test split of the 62,937 rows
# (feature set "A" = all 136 one-hot features).
# NOTE(review): a positional split assumes the rows were already shuffled
# or ordered appropriately — confirm.
xtrainA = alldata[0:50000]
xtestA = alldata[50000:62937]
In [13]:
# Inspect the training split.
xtrainA
Out[13]:
In [22]:
# Inspect the test split.
xtestA
Out[22]:
In [23]:
# Persist feature-set A splits (index column included — see note on the
# target exports above).
xtrainA.to_csv("xtrainA.csv")
xtestA.to_csv("xtestA.csv")
In [15]:
# Feature set "B": a pre-selected 26-feature subset, split at the same
# 50,000-row boundary as set A.
fsel=pd.read_csv("features selected.csv")
xtrainB=fsel[0:50000]
xtestB=fsel[50000:62937]
In [18]:
# Persist feature-set B splits.
xtrainB.to_csv("xtrainB.csv")
xtestB.to_csv("xtestB.csv")
In [16]:
#Age changed
# Feature set "C": a variant dataset with the age column recoded.
# Unknown race values ('?') are replaced with 'Caucasian' — presumably the
# majority class per the first value_counts print (TODO confirm).
data = pd.read_excel("data.xlsx")
print(data.race.value_counts())
data.race = data.race.replace('?','Caucasian')
print(data.race.value_counts())
In [17]:
# One-hot encode the variant dataset (129 features) and split/persist it at
# the same 50,000-row boundary as sets A and B.
Data = pd.get_dummies(data)
print(Data.shape[0])
print(Data.shape[1])
xtrainC = Data[0:50000]
xtestC = Data[50000:62937]
xtrainC.to_csv("xtrainC.csv")
xtestC.to_csv("xtestC.csv")
In [28]:
#SGD on 136 features xtrainA and xtestA with 2 readmit values with log
# Logistic-loss SGD with elastic-net penalty on the 136-feature set.
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Fixed: `n_iter` was removed in scikit-learn 0.21; `max_iter` is the replacement.
# NOTE(review): loss="log" was renamed "log_loss" in sklearn 1.1 and removed
# in 1.3 — update if running on a recent version.
reg = SGDClassifier(loss="log", penalty="elasticnet", max_iter=70, random_state=0)
reg.fit(xtrainA, ytrain2)
# Train accuracy. (The original also printed reg.score, which is the
# identical number — duplicate removed.)
print(accuracy_score(ytrain2, reg.predict(xtrainA)))
sgd136 = reg.predict(xtestA)
print("model fit done")
print(sgd136)
pd.DataFrame(sgd136).to_csv('ytestsgd136.csv')
print(np.sum(sgd136))
# Test accuracy against the binarized labels.
print('Accuracy for 2 clusters: ', accuracy_score(ytest2, sgd136))
In [34]:
#SGD on 136 features xtrainA and xtestA with 2 readmit values with log
# Logistic-loss SGD with L1 penalty on the 136-feature set.
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Fixed: added random_state for reproducibility, matching the elastic-net cells
# (the original omitted it, so results changed between runs).
# NOTE(review): loss="log" was renamed "log_loss" in sklearn 1.1 / removed in 1.3.
reg = SGDClassifier(loss="log", penalty="l1", random_state=0)
reg.fit(xtrainA, ytrain2)
# Train accuracy (reg.score printed the identical number; duplicate removed).
print(accuracy_score(ytrain2, reg.predict(xtrainA)))
sgd136 = reg.predict(xtestA)
print("model fit done")
print(sgd136)
pd.DataFrame(sgd136).to_csv('ytestsgd136.csv')
print(np.sum(sgd136))
print('Accuracy for 2 clusters: ', accuracy_score(ytest2, sgd136))
In [35]:
#SGD on 26 features xtrainB and xtestB with 2 readmit values with log
# Logistic-loss SGD with L1 penalty on the 26-feature subset.
# (Comment fixed: this cell uses xtrainB/xtestB, not xtrainA/xtestA.)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Fixed: added random_state for reproducibility, matching the elastic-net cells.
# NOTE(review): loss="log" was renamed "log_loss" in sklearn 1.1 / removed in 1.3.
reg = SGDClassifier(loss="log", penalty="l1", random_state=0)
reg.fit(xtrainB, ytrain2)
# Train accuracy (reg.score printed the identical number; duplicate removed).
print(accuracy_score(ytrain2, reg.predict(xtrainB)))
sgd26 = reg.predict(xtestB)
print("model fit done")
print(sgd26)
pd.DataFrame(sgd26).to_csv('ytestsgd26.csv')
print(np.sum(sgd26))
print('Accuracy for 2 clusters: ', accuracy_score(ytest2, sgd26))
In [33]:
#SGD on 129 features xtrainC and xtestC with 2 readmit values with log
# Logistic-loss SGD with L1 penalty on the 129-feature variant set.
# (Comment fixed: this cell uses xtrainC/xtestC, not xtrainA/xtestA.)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Fixed: added random_state for reproducibility, matching the elastic-net cells.
# NOTE(review): loss="log" was renamed "log_loss" in sklearn 1.1 / removed in 1.3.
reg = SGDClassifier(loss="log", penalty="l1", random_state=0)
reg.fit(xtrainC, ytrain2)
# Train accuracy (reg.score printed the identical number; duplicate removed).
print(accuracy_score(ytrain2, reg.predict(xtrainC)))
sgd129 = reg.predict(xtestC)
print("model fit done")
print(sgd129)
pd.DataFrame(sgd129).to_csv('ytestsgd129.csv')
print(np.sum(sgd129))
print('Accuracy for 2 clusters: ', accuracy_score(ytest2, sgd129))
In [36]:
#SGD on 26 features xtrainB and xtestB with 2 readmit values with log
# Logistic-loss SGD with elastic-net penalty on the 26-feature subset.
# (Comment fixed: this cell uses xtrainB/xtestB, not xtrainA/xtestA.)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Fixed: `n_iter` was removed in scikit-learn 0.21; `max_iter` is the replacement.
# NOTE(review): loss="log" was renamed "log_loss" in sklearn 1.1 / removed in 1.3.
reg = SGDClassifier(loss="log", penalty="elasticnet", max_iter=70, random_state=0)
reg.fit(xtrainB, ytrain2)
# Train accuracy (reg.score printed the identical number; duplicate removed).
print(accuracy_score(ytrain2, reg.predict(xtrainB)))
sgd26 = reg.predict(xtestB)
print("model fit done")
print(sgd26)
# CSV export intentionally disabled in the original cell.
print('Accuracy for 2 clusters: ', accuracy_score(ytest2, sgd26))
In [37]:
#SGD on 129 features xtrainC and xtestC with 2 readmit values with log
# Logistic-loss SGD with elastic-net penalty on the 129-feature variant set.
# (Comment fixed: this cell uses xtrainC/xtestC, not xtrainA/xtestA.)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Fixed: `n_iter` was removed in scikit-learn 0.21; `max_iter` is the replacement.
# NOTE(review): loss="log" was renamed "log_loss" in sklearn 1.1 / removed in 1.3.
reg = SGDClassifier(loss="log", penalty="elasticnet", max_iter=70, random_state=0)
reg.fit(xtrainC, ytrain2)
# Train accuracy (reg.score printed the identical number; duplicate removed).
print(accuracy_score(ytrain2, reg.predict(xtrainC)))
sgd129 = reg.predict(xtestC)
print("model fit done")
print(sgd129)
# CSV export intentionally disabled in the original cell.
print('Accuracy for 2 clusters: ', accuracy_score(ytest2, sgd129))
In [39]:
#GaussianNB
# Gaussian Naive Bayes on the 136-feature set.
from sklearn.naive_bayes import GaussianNB
clfA = GaussianNB()
clfA.fit(xtrainA, ytrainA2 if False else ytrain2)
# Fixed: removed the stray `GaussianNB(priors=None)` line — it constructed
# and discarded an unused estimator (a pasted repr, not real code).
# Fixed: predict once and reuse instead of predicting the test set twice.
pred = clfA.predict(xtestA)
print(pred)
print('Accuracy for 2 clusters: ', np.sum(pred == ytest2) / len(ytest2))
In [40]:
#GaussianNB
# Gaussian Naive Bayes on the 26-feature subset.
from sklearn.naive_bayes import GaussianNB
clfB = GaussianNB()
clfB.fit(xtrainB, ytrain2)
# Fixed: removed the stray no-op `GaussianNB(priors=None)` line and the
# duplicate predict() call on the test set.
pred = clfB.predict(xtestB)
print(pred)
print('Accuracy for 2 clusters: ', np.sum(pred == ytest2) / len(ytest2))
In [41]:
#GaussianNB
# Gaussian Naive Bayes on the 129-feature variant set.
from sklearn.naive_bayes import GaussianNB
clfC = GaussianNB()
clfC.fit(xtrainC, ytrain2)
# Fixed: removed the stray no-op `GaussianNB(priors=None)` line and the
# duplicate predict() call on the test set.
pred = clfC.predict(xtestC)
print(pred)
print('Accuracy for 2 clusters: ', np.sum(pred == ytest2) / len(ytest2))
In [43]:
# k-means on 136 features for 2 clusters (NO & >30 = 0, <30 = 1)
from sklearn.cluster import KMeans
# Input
# Unsupervised 2-cluster fit; cluster ids are arbitrary, so they must be
# mapped to the real labels by inspecting positive counts below.
kmeansA = KMeans(n_clusters=2, random_state=0).fit(xtrainA)
# Gives labels of training points after classification into 2 clusters
print(kmeansA.labels_)
ytrainApred = pd.DataFrame(kmeansA.labels_)
#ytraind2Apred.to_csv('with diag/ytrain2Apred.csv')
c1A=ytrain2[kmeansA.labels_==0]
#saves the ytrain2 values in c1 at indices only pertaining to the label 0
#assume kmeans_labels = {0,1,0,0,1,0} & ytrain2.readmitted = {A,B,C,D,E,F}, then c1 = {A,C,D,F}
c2A=ytrain2[kmeansA.labels_==1]
#Observations of label 0
print("Observations of label 0: ",c1A.shape[0])
#Observations of label 1
print("Observations of label 1: ",c2A.shape[0])
# Counting positives per cluster reveals which cluster id corresponds to YES.
print("Number of YES observations in label 0: ",sum(c1A))
print("Number of YES observations in label 1: ",sum(c2A))
In [44]:
#Hence, label 0 corresponds to YES. But initially YES was mapped as 1, therefore swap 0 and 1 in ytest2
# (Inspect the unswapped labels before building the swapped version below.)
ytest2
Out[44]:
In [51]:
#SWAP 0 & 1 in ytest2
ytest2s = ytest2.replace(0,'temp')
ytest2s = ytest2s.replace(1,0)
ytest2s = ytest2s.replace('temp',1)
ytest2s
Out[51]:
In [52]:
# Predicts labels ytest2
# Predict cluster ids for the test rows and score them against the SWAPPED
# labels (ytest2s): cluster 0 == YES per the analysis above, but the original
# compared against the unswapped ytest2, which reports (1 - accuracy).
ytestApred = kmeansA.predict(xtestA)
print("Predicted labels for ftest:",ytestApred)
# Cluster center co-ordinates
print("Cluster Centers:\n",kmeansA.cluster_centers_)
print('Accuracy for 2 clusters: ',np.sum(ytestApred==ytest2s)/len(ytest2s))
In [53]:
# k-means on 26 features for 2 clusters (NO & >30 = 0, <30 = 1)
from sklearn.cluster import KMeans
# Unsupervised 2-cluster fit on the 26-feature subset.
kmeansB = KMeans(n_clusters=2, random_state=0).fit(xtrainB)
# Cluster id assigned to each training row.
print(kmeansB.labels_)
ytrainBpred = pd.DataFrame(kmeansB.labels_)
# True labels of the rows that k-means placed in cluster 0
# (e.g. labels {0,1,0,0,1,0} on targets {A,B,C,D,E,F} select {A,C,D,F}).
c1B = ytrain2[kmeansB.labels_ == 0]
# BUG FIX: the original indexed with kmeansA.labels_ here (copy-paste from
# the 136-feature cell), so c2B mixed model A's clustering into model B's
# analysis and the label-0/label-1 counts did not sum to the dataset size.
c2B = ytrain2[kmeansB.labels_ == 1]
print("Observations of label 0: ", c1B.shape[0])
print("Observations of label 1: ", c2B.shape[0])
# Counting positives per cluster reveals which cluster id corresponds to YES.
print("Number of YES observations in label 0: ", sum(c1B))
print("Number of YES observations in label 1: ", sum(c2B))
In [54]:
#Hence, label 0 corresponds to YES. But initially YES was mapped as 1, therefore swap 0 and 1 in ytest2
# (Re-inspect the unswapped labels for the 26-feature clustering.)
ytest2
Out[54]:
In [55]:
# Inspect the swapped labels built earlier.
ytest2s
Out[55]:
In [56]:
# Predicts labels ytest2
# Predict cluster ids for the test rows and score against the SWAPPED labels
# (ytest2s): the narrative above says cluster 0 == YES for this model too,
# but the original compared against the unswapped ytest2 — TODO confirm the
# cluster-to-label mapping for model B from its positive counts.
ytestBpred = kmeansB.predict(xtestB)
print("Predicted labels for ftest:",ytestBpred)
# Cluster center co-ordinates
print("Cluster Centers:\n",kmeansB.cluster_centers_)
print('Accuracy for 2 clusters: ',np.sum(ytestBpred==ytest2s)/len(ytest2s))
In [57]:
# k-means on 129 features for 2 clusters (NO & >30 = 0, <30 = 1)
from sklearn.cluster import KMeans
# Input
# Unsupervised 2-cluster fit on the 129-feature variant set.
kmeansC = KMeans(n_clusters=2, random_state=0).fit(xtrainC)
# Gives labels of training points after classification into 2 clusters
print(kmeansC.labels_)
ytrainCpred = pd.DataFrame(kmeansC.labels_)
#ytraind2Apred.to_csv('with diag/ytrain2Apred.csv')
c1C=ytrain2[kmeansC.labels_==0]
#saves the ytrain2 values in c1 at indices only pertaining to the label 0
#assume kmeans_labels = {0,1,0,0,1,0} & ytrain2.readmitted = {A,B,C,D,E,F}, then c1 = {A,C,D,F}
c2C=ytrain2[kmeansC.labels_==1]
#Observations of label 0
print("Observations of label 0: ",c1C.shape[0])
#Observations of label 1
print("Observations of label 1: ",c2C.shape[0])
# Counting positives per cluster reveals which cluster id corresponds to YES.
print("Number of YES observations in label 0: ",sum(c1C))
print("Number of YES observations in label 1: ",sum(c2C))
In [58]:
# Predicts labels ytest2
# Score model C's test predictions against the unswapped labels.
# NOTE(review): unlike models A and B, no label swap is applied here —
# confirm from the positive counts above that cluster 1 == YES for model C;
# otherwise this should compare against ytest2s.
ytestCpred = kmeansC.predict(xtestC)
print("Predicted labels for ftest:",ytestCpred)
#write to CSV
#pd.DataFrame(ytest2Apred).to_csv('with diag/ytest2Apred.csv')
# Cluster center co-ordinates
print("Cluster Centers:\n",kmeansC.cluster_centers_)
print('Accuracy for 2 clusters: ',np.sum(ytestCpred==ytest2)/len(ytest2))
In [23]:
#DecisionTreeClassifier136
# Decision tree on the 136-feature set (fixed seed for reproducibility).
from sklearn.tree import DecisionTreeClassifier
dtcA = DecisionTreeClassifier(random_state=0)
dtcA.fit(xtrainA, ytrain2)
# Fixed: predict once and reuse — the original called predict() twice on
# the same test set, doubling the work.
predA = dtcA.predict(xtestA)
print(predA)
print('Accuracy for 136 features: ', np.sum(predA == ytest2) / len(ytest2))
In [24]:
#DecisionTreeClassifier26
# Decision tree on the 26-feature subset (fixed seed for reproducibility).
from sklearn.tree import DecisionTreeClassifier
dtcB = DecisionTreeClassifier(random_state=0)
dtcB.fit(xtrainB, ytrain2)
# Fixed: predict once and reuse instead of calling predict() twice.
predB = dtcB.predict(xtestB)
print(predB)
print('Accuracy for 26 features: ', np.sum(predB == ytest2) / len(ytest2))
In [25]:
#DecisionTreeClassifier129
# Decision tree on the 129-feature variant set (fixed seed for reproducibility).
from sklearn.tree import DecisionTreeClassifier
dtcC = DecisionTreeClassifier(random_state=0)
dtcC.fit(xtrainC, ytrain2)
# Fixed: predict once and reuse instead of calling predict() twice.
predC = dtcC.predict(xtestC)
print(predC)
print('Accuracy for 129 features: ', np.sum(predC == ytest2) / len(ytest2))
In [3]:
# Compare per-model scores across the three feature sets.
# (matplotlib is already imported at the top of the notebook; this re-import
# is harmless but unnecessary.)
import matplotlib.pyplot as plt
# Four scores per feature set — presumably SGD, k-means, GaussianNB and
# decision tree accuracies from the cells above (TODO confirm ordering).
c136 = [0.906856304, 0.104274561, 0.552601067, 0.833500812]
c26 = [0.863183118, 0.882971323, 0.552446471, 0.824998068]
c129 = [0.869598825, 0.099636701, 0.552601067, 0.832805133]
plt.hist(c136, label='c136', color='r')
plt.hist(c26, label='c26', color='b')
plt.hist(c129, label='c129', color='g')
plt.xlabel('accuracy')
plt.ylabel('count')
# Fixed: the original title said "Scatter Plot" although these are
# histograms, and the legend call was commented out despite labels being set.
plt.title('Accuracy distributions: c136 vs c26 vs c129')
plt.legend()
plt.show()
In [ ]: