In [1]:
import pandas as pd
import numpy as np
In [2]:
two_party_words = pd.read_csv("../data/two.csv")
two_party_words.head()
Out[2]:
In [3]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
#make it so that we only show first 4 decimals for floats
np.set_printoptions(precision=4,suppress=True)
# visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
In [4]:
party_dummies = pd.get_dummies(two_party_words.party_x).astype(int)
party_dummies = party_dummies[["R"]]
party_dummies.head()
capitol_words = party_dummies.merge(two_party_words, right_index=True, left_index=True)
#del capitol_words['Unnamed: 0']
capitol_words.head()
Out[4]:
In [5]:
sns.pairplot(capitol_words[["zimbabwe","zinc"]])
Out[5]:
In [6]:
word_columns = capitol_words.columns[807:]
capitol_words[word_columns]
capitol_words.R.head()
X_words = capitol_words[word_columns]
y_words = capitol_words["R"]
X_train,X_test,y_train,y_test = train_test_split(X_words,y_words,test_size=0.4)
from sklearn.tree import DecisionTreeClassifier
words_tree = DecisionTreeClassifier(max_depth=3, random_state=1)
words_tree.fit(X_train, y_train)
Out[6]:
In [7]:
words_tree.feature_importances_
features = pd.DataFrame({'feature':word_columns, 'importance':words_tree.feature_importances_}).sort_values(by='importance',ascending=False)
In [8]:
features.head()
Out[8]:
In [9]:
print capitol_words.columns[15:]
capitol_words.crp_id
capitol_words.ix[:,:25].head()
Out[9]:
In [10]:
def my_mask(df,column,condition,value):
    new_data = []
    if condition == "==":
        new_data = df[df[column] == value]
    elif condition == "<=":
        new_data = df[df[column] <= value]
    elif condition == "!=":
        new_data = df[df[column] != value]
    elif condition == ">=":
        new_data = df[df[column] >= value]
    elif condition == ">":
        new_data = df[df[column] > value]
    elif condition == "<":
        new_data = df[df[column] < value]
    else:
        print "arguments needed-column,condition,value-:"
    return new_data
In [11]:
my_first_mask = my_mask(capitol_words,"firstname","==","Neil")
In [12]:
def subset(df,column):
    subsets = {}
    subs = df[column].unique()
    for element in subs:
        subsets[element] = my_mask(df,column,"==",element)
    print "New available dictionary of dataframes is:\n subsets_of ",subs
    return subsets
In [13]:
states = subset(capitol_words,"state_x")
states['AK'].head()
parties = subset(capitol_words, "party_x")
parties['D'].head()
Out[13]:
In [14]:
str(my_first_mask)
globals()[capitol_words.state_x.unique()[0]+"lala"] = {}
print capitol_words.state_x.unique()[1]
globals()['variable{}'.format(capitol_words.state_x.unique()[1])] = 0
In [15]:
def clean_sparse_irrelevant(df):
    # drop float columns whose sum is 0 (words that never appear in this subset)
    cols = df.columns
    deleted = 0
    for c in cols:
        x = df[c]
        if x.dtype == "float64":
            if x.sum() == 0:
                del df[c]
                deleted += 1
    print "DELETED:", deleted
    return df
In [16]:
clean_sparse_irrelevant(states['AK'])
Out[16]:
In [17]:
sns.pairplot(states['AK'][["veterans","wildlife","wilderness","villages"]])
Out[17]:
In [18]:
word_columns = states['AK'].columns
word_columns[40:]
Out[18]:
In [19]:
y = word_columns[35:][0]
x = "..."
def reporter(x):
    try:
        return float(x)
    except ValueError:
        return "no"
print reporter(x)
reporter(y)
def word_finder(words, start):
    # return the index of the first element that starts with "a",
    # i.e. where the word columns begin
    for index, element in enumerate(words, start):
        if element[0] == "a":
            first = index
            break
    return first
x = word_columns[29:].tolist()
print word_finder(x,29)
word_columns[word_finder(x,29):]
Out[19]:
In [20]:
my_mask(capitol_words,"whaling",">",0)
Out[20]:
With the sklearn.preprocessing package: centering sparse data would destroy the sparseness structure of the data, but MaxAbsScaler and maxabs_scale were specifically designed for scaling sparse data, especially when the features are on different scales. scale and StandardScaler can also accept scipy.sparse matrices as input, as long as with_mean=False is passed (more about this in the scikit-learn preprocessing documentation).
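As a quick illustration of that point (a minimal sketch on toy data, not part of the original analysis): MaxAbsScaler scales each feature by its maximum absolute value and keeps a scipy.sparse input sparse.
In [ ]:
# toy example: MaxAbsScaler keeps sparse input sparse while scaling each column to [-1, 1]
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler
toy = sparse.csr_matrix(np.array([[0., 2., 0.],
                                  [4., 0., 1.],
                                  [0., 8., 0.]]))
toy_scaled = MaxAbsScaler().fit_transform(toy)
print(type(toy_scaled))        # still a sparse matrix
print(toy_scaled.toarray())    # each column divided by its max absolute value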
In [21]:
alaska = states["AK"]
alaska
Out[21]:
In [22]:
from sklearn.preprocessing import maxabs_scale
print maxabs_scale(alaska.ix[:,43:], axis=0, copy=False)
alaska.ix[:,43:] = maxabs_scale(alaska.ix[:,43:], axis=0, copy=False)
print alaska.ix[:,43:].head()
In [23]:
alaska
Out[23]:
I decided to build a small helper method. It doesn't have any game-changing logic to it; I just want to save myself some lines of code. What this method does is select the tail of the DataFrame where the word sparse matrix lives, i.e. from where the word columns start to the end, and scale it.
In [24]:
#select the words from the data frame: the words should be the last part of it. [ex:]
#index, where the data frame becomes the sparse matrix
#word_finder(dataFrame)
#from sklearn.preprocessing import maxabs_scale
#scaler = preprocessing.StandardScaler().fit(X)
def word_maxabsscaler(dataFrame,index):
    # scale the word columns (from where the words start to the end of the frame) to [-1, 1]
    dataFrame.ix[:,word_finder(dataFrame,index):] = maxabs_scale(dataFrame.ix[:,word_finder(dataFrame,index):], axis=0, copy=False)
In [25]:
NY = states["NY"]#.ix[:,30:]
NY.head()
Out[25]:
In [26]:
clean_sparse_irrelevant(NY)
word_maxabsscaler(NY,30)
In [27]:
word_maxabsscaler(capitol_words,30)
clean_sparse_irrelevant(capitol_words)
capitol_words.head()
Out[27]:
In [28]:
print "Commented method crashed when the dictionary is too long"
# def subset_sparse_reg(frameDict, ):
# for key in states:
# clean_sparse_irrelevant(frameDict[key])
# word_maxabsscaler(frameDict[key],30)
# print key
# try:
# frameDict[key]["villages"][0]
# except KeyError:
# print "no villages in ", key
# subset_sparse_reg(states)
In [29]:
len(states)
Out[29]:
Just to compare results between these models on this particular data set. This is an example of when it's a good idea to reduce the number of columns: there are more than 14,000 of them (the result of taking the most frequently used words and computing their tf-idf counts). Far too many columns are being used to predict the target variable, which is Republican or Democrat, and one of the risks of that is overfitting the model.
In [30]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, train_test_split
import scipy.stats as stats
# visualization
%matplotlib inline
import seaborn as sns
In [31]:
capitol_words.head()
Out[31]:
After normalization, some words had a global weight that was so small in the tf-idf matrix that their column.sum() was zero. I will not feed those to my model, because a column filled with zeros adds no variance to a sparse matrix. Also, index 30 is where the sparse matrix got attached to the original data set.
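For reference, an equivalent vectorized way to do that cleanup in one pass (a sketch only; capitol_words was already cleaned above, so on this frame it should find little or nothing left to drop):
In [ ]:
# vectorized version of clean_sparse_irrelevant: find float columns whose sum is 0 and drop them
float_cols = capitol_words.select_dtypes(include=["float64"]).columns
zero_cols = [c for c in float_cols if capitol_words[c].sum() == 0]
print("all-zero float columns remaining: %d" % len(zero_cols))
capitol_words_nonzero = capitol_words.drop(zero_cols, axis=1)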
In [32]:
print "where the words start with a column sum that is different to 0, some words are said so little that their td-idf count could be 0, index position", word_finder(capitol_words,30)
capitol_words.ix[:,836:].head()
Out[32]:
In [33]:
global_correlations = capitol_words.ix[:,836:].corr()
global_correlations.head()
Out[33]:
In [34]:
sns.plt.figure(figsize=(24,20))
sns.heatmap(capitol_words.ix[:,836:].transpose().corr().values)
Out[34]:
We can see that some variables are highly correlated. This will cause trouble for PCA, because some words are strongly correlated with one another, some because they are synonyms, others because they are used in very similar contexts, so I expect it to return an ill-conditioned matrix of principal components. This is done only to get a look at, or some kind of description of, the columns.
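To make that concrete, a quick sketch (my addition, using the global_correlations frame computed above) that lists the most strongly correlated word pairs:
In [ ]:
# rank word pairs by absolute correlation; each pair shows up twice, as (A, B) and (B, A)
corr = global_correlations.copy()
np.fill_diagonal(corr.values, 0)   # ignore self-correlation
top_pairs = corr.abs().unstack().sort_values(ascending=False)
print(top_pairs.head(10))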
In [35]:
pca = PCA()
transformed_pca_x = pca.fit_transform(capitol_words.ix[:,836:])
component_names = ["component_"+str(comp) for comp in range(1, len(pca.explained_variance_)+1)]
#generate new component dataframe
transformed_pca_x = pd.DataFrame(transformed_pca_x,columns=component_names)
print "CCOMPONENT MATRIX:"
transformed_pca_x.head()
Out[35]:
In [36]:
#generate component loadings on original features
component_matrix = pd.DataFrame(pca.components_,index=component_names)
In [37]:
#add additional columns to describe each component
# component_matrix["explained_variance_ratio"] = pca.explained_variance_ratio_
#component_matrix["eigenvalue"] = pca.explained_variance_
# figure = sns.plt.figure(figsize=(18,6))
In [38]:
#add 3 subplots one at a time
#first the component matrix
# figure.add_subplot(131)
# sns.heatmap(component_matrix.ix[:,:-2])
# #then the eigenvalues
# figure.add_subplot(132)
# sns.plt.plot(range(1,component_matrix.shape[0]+1), component_matrix.eigenvalue)
# sns.plt.xlabel("component number")
# sns.plt.ylabel("variance explained")
# #then the explained variance ratio
# figure.add_subplot(133)
# sns.plt.plot(range(1,component_matrix.shape[0]+1), component_matrix.explained_variance_ratio)
# sns.plt.xlabel("component number")
# sns.plt.ylabel("eigenvalue")
The problem with this is that PCA expects features with little to no correlation, and in this case, with words, if I built a model based on eliminating similar or correlated words, it would only accomplish being overfitted and would not do well at all at predicting a real example. Let's say one of the components was based on the word "small", and "small" is correlated with "little", but I just deleted "little". Unless I have another way to capture semantic similarity, I can't get rid of those words just yet.
In [39]:
component_matrix.head()
Out[39]:
First, try logistic regression, and compare it with the failed PCA.
In [158]:
X = transformed_pca_x.ix[:,:500]
y = capitol_words["R"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
lr = LogisticRegression(C=1e9, penalty='l1')
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)
print "Test set accuracy of LR model: ",metrics.accuracy_score(y_test, y_test_pred)
In [52]:
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn import metrics
import scipy.stats as stats
In [118]:
X = capitol_words[capitol_words.columns.tolist()[836:]]
y = capitol_words["R"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
lr = LogisticRegression(C=1e9, penalty='l2')
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)
print "Test set accuracy of LR model: ",metrics.accuracy_score(y_test, y_test_pred)
This is the accuracy the model got on data it had never seen before.
In [119]:
# null accuracy?
# compute null accuracy manually
print "Null accuracy on the test set: ",y_test.mean()
This is the fraction of samples in the test set that belong to class 1 (Republican), so it tells us the class balance of the test set. Even with multiple categories, you would still look at the counts of the categories and pick the most frequent one. In this case the data set is pretty balanced, because there are almost as many Republicans as Democrats, so the regression is clearly capturing some real information beyond this baseline. The null accuracy is what a dummy classifier would score by always predicting the most frequent (most over-represented) category. The same baseline idea is also useful for a multiclass problem.
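A quick way to see that baseline directly from the class counts (a small sketch; it generalizes to more than two classes by taking the largest class share):
In [ ]:
# class shares in the test set; the largest share is the null accuracy
print(y_test.value_counts(normalize=True))
print("Null accuracy from class counts: %.4f" % y_test.value_counts(normalize=True).max())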
In [120]:
from sklearn.dummy import DummyClassifier
dumb_model = DummyClassifier(strategy='most_frequent')
dumb_model.fit(X_train, y_train)
y_dumb_class = dumb_model.predict(X_test)
print "Most frequent class dummy classifier test accuracy: ",metrics.accuracy_score(y_test, y_dumb_class)
We would have to see whether the means are statistically different, as long as the standard deviation is not huge; if it is, we might not be able to say much. You want the real model's scores to sit to the right of the dummy's, with not too much variance, so that the means clearly separate on average across many folds. The cross-validated scores vary from fold to fold.
In [122]:
dumb_model = DummyClassifier(strategy='most_frequent')
dummy_scores = cross_val_score(dumb_model, X, y, cv=30)
real_scores = cross_val_score(LogisticRegression(),X , y,cv=30)
sns.plt.hist(dummy_scores)
sns.plt.hist(real_scores)
#we could use cv=StratifiedKFold when the classes are really unbalanced
#real_scores = cross_val_score(LogisticRegression(),X , y,cv=30)
print np.mean(dummy_scores)
print np.mean(real_scores)
print np.std(real_scores)
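To follow up on the point above about whether the means are statistically different, a hedged sketch of a paired test on the per-fold scores (paired because both models are scored on the same 30 folds):
In [ ]:
# paired t-test between the dummy and logistic regression fold scores
t_stat, p_value = stats.ttest_rel(real_scores, dummy_scores)
print("t statistic: %.3f, p-value: %.6f" % (t_stat, p_value))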
With accuracy alone we don't know what we are classifying wrong. Accuracy is a difficult metric when you have unbalanced classes; it really depends on which categories you are capturing well and which ones badly, especially the under-represented ones. In terms of classification, we can decompose the errors and the hits into a confusion matrix.
True positives, true negatives, and then the bad things: false negatives and false positives. Any classifier will make errors, and those errors fall into either false negatives or false positives. A false negative can be awful (say, missing a cancer diagnosis); it's called a miss, and a false positive is a false alarm, though the terminology depends on the context. There are several ways of talking about it: hit, correct rejection, miss, false alarm. What we want to do is minimize the misclassifications; accuracy only tells you the fraction of correct cells in the matrix. Maybe you want to weight some errors over others, for example missing people that have cancer in an unbalanced case, or maybe you want more predicted positives in order to capture more of the true positives. What is left to do is tuning the threshold of the classifier. The confusion matrix is a measure of how accurate the model is; in this case the true negative rate is much higher than the true positive rate.
False negative rates can have a secondary effect on accuracy, and I haven't decided which rate I want to optimize yet. The diagonal holds the correctly classified class 0 and class 1 counts. In this case it's only a 2x2 matrix, and the diagonal is what I want to maximize, so that the mispredicted class 0 and class 1 counts stay small. There is no obvious visual difference in the confusion matrix.
In [123]:
# confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_pred)
print cm
sns.heatmap(cm)
Out[123]:
What fraction of the positive labels in the unseen data did we correctly call positive? That is the TPR (sensitivity/recall). Specificity is the true negative rate: what fraction of the class 0s did we get right. These metrics are run on the test set; what I'd really want to do is compute them across all of the cross-validation folds and take the average over the unseen data. TODO! Precision (PPV) is a metric of how much you can trust the model: when it says Republican, how many times was it actually a Republican. The true positive rate is pretty good: I captured 92% of the actual Republicans, so recall is well above chance. The PPV is 89%, which is the believability of the classifier: when it says Republican, it is actually a Republican about 89% of the time. With precision on the order of 90%, I can be fairly certain that text flagged as Republican really is Republican text.
The two most important numbers are precision and recall: precision is how believable the classifier is, recall is how much of the positive class it captures. The F1 score combines precision and recall.
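A possible way to address the TODO above (a sketch, not run in the original notebook): let cross_val_score compute each metric per fold and average them.
In [ ]:
# cross-validated precision/recall/F1 instead of a single train/test split
print("10-fold precision: %.4f" % np.mean(cross_val_score(LogisticRegression(), X, y, cv=10, scoring="precision")))
print("10-fold recall:    %.4f" % np.mean(cross_val_score(LogisticRegression(), X, y, cv=10, scoring="recall")))
print("10-fold F1:        %.4f" % np.mean(cross_val_score(LogisticRegression(), X, y, cv=10, scoring="f1")))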
In [124]:
# calculate each metric by hand
print "Sensitivity/Recall (TPR): ",cm[1,1] / float(cm[1,1] + cm[1,0])
print "Specificity (TNR): ", cm[0,0] / float(cm[0,0] + cm[0,1])
print "Precision (PPV): ", cm[1,1] / float(cm[1,1]+cm[0,1])
print "NPV: ", cm[0,0] / float(cm[0,0]+cm[1,0])
print "Accuracy: ", (cm[1,1]+cm[0,0]) / float(cm.sum())
print "F1:", metrics.f1_score(y_test,y_test_pred)
In [125]:
# calculate some of these metrics using sklearn and the test set samples
print "Sensitivity/Recall (TPR): ",metrics.recall_score(y_test,y_test_pred)
print "Precision (PPV): ", metrics.precision_score(y_test,y_test_pred)
print "Accuracy: ", metrics.accuracy_score(y_test,y_test_pred)
print "F1:", metrics.f1_score(y_test,y_test_pred)
In [126]:
print "Classification Report:\n", metrics.classification_report(y_test,y_test_pred)
The classifier sets the threshold at 0.5, so any probability higher than 0.5 gets the positive label by default. If we alter that threshold we get a different precision. Out of the box it just uses this default cutoff, but we can systematically look at what happens as we vary the threshold over every possible level.
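For example, a small sketch (my addition) sweeping a few thresholds over the logistic regression's class 1 probabilities and watching precision and recall trade off:
In [ ]:
# precision/recall at a range of classification thresholds
probs = lr.predict_proba(X_test)[:, 1]
for t in np.arange(0.1, 1.0, 0.1):
    preds_t = (probs > t).astype(int)
    print("threshold %.1f  precision %.3f  recall %.3f" % (
        t, metrics.precision_score(y_test, preds_t), metrics.recall_score(y_test, preds_t)))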
In [128]:
#lr probabilities per category for first five samples
predicted_probs_lr = lr.predict_proba(X_test).round(3)
predictions_lr = lr.predict(X_test)
print "Logistic Regression predicted probabilities for first five samples in test set:\n",predicted_probs_lr[:5]
print "Logistic Regression predictions for first five samples in test set:\n",predictions_lr[:5]
y_test_lr_df = pd.DataFrame(
np.concatenate((
predicted_probs_lr,predictions_lr.reshape((predictions_lr.shape[0],-1)),
y_test.reshape((y_test.shape[0],-1))),axis=1
),
columns = ["class_0","class_1","predicted","actual"])
y_test_lr_df.head()
Out[128]:
There are samples I still need to investigate: in some cases the misclassification does not happen at a probability close to chance, and in a portion of cases the model is confidently wrong. These cases need to be flagged and manually reviewed; the bulk of the predictions could still be correct, but those cases need a second look. We can look at which ones they are in the test set and isolate the examples:
There could be some other dimension we are not capturing; these samples could be mislabeled, or the speaker simply talks like a Republican. Not sure yet.
In [133]:
bad_y_class_0 = y_test_lr_df[np.logical_and(y_test_lr_df.class_0>.9, y_test_lr_df.actual==1.0)]
print bad_y_class_0
bad_y_class_1 = y_test_lr_df[np.logical_and(y_test_lr_df.class_1>.9, y_test_lr_df.actual==0.0)]
print bad_y_class_1
In [ ]:
In [78]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
predicted_probs_rf = rf.predict_proba(X_test)
predictions_rf = rf.predict(X_test)
y_test_rf_df = pd.DataFrame(
np.concatenate((
predicted_probs_rf,predictions_rf.reshape((predictions_rf.shape[0],-1)),
y_test.reshape((y_test.shape[0],-1))),axis=1
),
columns = ["class_0","class_1","predicted","actual"])
y_test_rf_df.head()
Out[78]:
Class probability changes: the random forest did not help; it is even more confident in the opposite (wrong) direction on these cases. If you want essentially no false positives, something around 0.78 works as the threshold. To compare the two curves: the ROC curve illustrates the performance of a binary classifier as the threshold is systematically changed.
In [134]:
#generate lr model false positive and true positive rates
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(y_test, predicted_probs_lr[:,1])
#generate same for random forest model
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, predicted_probs_rf[:,1])
# plot LR and RF model ROC curves
sns.plt.plot(fpr_lr, tpr_lr,label="lr")
sns.plt.plot(fpr_rf, tpr_rf,label="rf")
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.legend(loc="lower right")
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity)')
Out[134]:
The area under the curve is the AUC; it tells you how good your classifier is regardless of the threshold, so here logistic regression is better. If the classifier were no better than a coin flip, the curve would be the diagonal line. It's also a way to compare classifiers against each other: in the context of this problem, with the parameters I used, the logistic regression is actually better than the random forest at all false positive rates. The AUC is very high, and logistic regression consistently outperforms the random forest.
In [135]:
# calculate AUC for lr and rf
print "LR model AUC: ",metrics.roc_auc_score(y_test, predicted_probs_lr[:,1])
print "RF model AUC: ",metrics.roc_auc_score(y_test, predicted_probs_rf[:,1])
Probabilities (thresholds) are green and the ROC curve is blue; on the x axis in both cases we have the false positive rate, and on the y axis the true positive rate (or the class 1 threshold). Around a false positive rate of 0.1 there are not many drops, the curve looks very smooth, and there are not many gaps in the data.
In [136]:
# plot LR and RF model ROC curves
sns.plt.plot(fpr_lr, tpr_lr,label="lr")
sns.plt.plot(fpr_lr,thresholds_lr, label="lr_thresh")
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.legend(loc="center")
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity) or Class 1 Threshold Probability')
Out[136]:
At a false positive rate of 12% we still have a classifier that is about as good; the threshold needs to be a little over 0.7. It's an exchange of a bit of accuracy for significantly fewer false positives, and the choice should be based on what I decide to care about later. Note that you have to provide the probabilities, not the class labels.
In [144]:
y_test_lr_df["predicted_075"] = (y_test_lr_df.class_1 > 0.72).astype(float)
print y_test_lr_df.head()
print "Confusion matrix at original 0.5 threshold:\n",metrics.confusion_matrix(y_test_lr_df.actual,
y_test_lr_df.predicted),"\n"
print "Classification Report at original 0.5 threshold:\n", metrics.classification_report(y_test_lr_df.actual,
y_test_lr_df.predicted),"\n"
print "Confusion matrix at 0.72 threshold:\n",metrics.confusion_matrix(y_test_lr_df.actual,
y_test_lr_df.predicted_07),"\n"
print "Classification Report at 0.72 threshold:\n", metrics.classification_report(y_test_lr_df.actual,
y_test_lr_df.predicted_07)
In [138]:
# calculate AUC using y_pred_class (producing incorrect results)
print "Wrong way to calculate LR model AUC: ",metrics.roc_auc_score(y_test, predictions_lr)
print "Wrong way to calculate RF model AUC: ",metrics.roc_auc_score(y_test, predictions_rf)
Histograms of predicted probabilities grouped by the actual categories. The mass in the middle is what gets misclassified, and you want it to be as flat as possible. The two classifiers have different distributions, so they get things wrong in different ways; the top pair of plots is the logistic regression and the bottom pair is the random forest.
In [139]:
# histogram of predicted probabilities grouped by actual response value for LR
y_test_lr_df.class_1.hist(by= y_test_lr_df.actual, sharex=True, sharey=True)
#same for RF
y_test_rf_df.class_1.hist(by= y_test_rf_df.actual, sharex=True, sharey=True)
Out[139]:
In [ ]:
The ROC does not change if you double the probabilities, because the relative ordering of the samples does not change; the ROC only cares about the ranking.
In [140]:
#convert outcome into binary 0/1 attribute
le = LabelEncoder()
#create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
#create logistic regression object
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)
print "Test set accuracy of default 0.5 threshold LR model: ",metrics.accuracy_score(y_test, y_test_pred)
In [141]:
# calculate predicted probabilities for class 1
y_pred_prob1 = lr.predict_proba(X_test)[:, 1]
# show predicted probabilities in a histogram
sns.plt.hist(y_pred_prob1)
Out[141]:
In [145]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob1)
Out[145]:
In [146]:
# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob1)
sns.plt.plot(fpr, tpr)
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity)')
Out[146]:
The probabilities are only probabilities in the context of this data set. A monotonic transformation preserves a certain order: you are ordering the samples, so the first one might be the most Republican-sounding. After the transformation it is not really a probability anymore; the samples are never going to become Democrats, they are just ordered in a certain way.
In [ ]:
In [147]:
# change the predicted probabilities
y_pred_prob2 = np.sqrt(y_pred_prob1)
# here are the old ones (y_pred_prob1)
print "Old predicted probs:\n",y_pred_prob1[:10].round(3)
In [148]:
# here are the new ones (y_pred_prob2)
print "New predicted probs:\n",y_pred_prob2[:10].round(3)
In [149]:
# you can see the histogram changed
figure = sns.plt.figure(figsize=(12,8))
figure.add_subplot(121)
sns.plt.title("Original histogram of predicted probabilities")
sns.plt.hist(y_pred_prob1)
figure.add_subplot(122)
sns.plt.title("Histogram of square root predicted probabilities")
sns.plt.hist(y_pred_prob2)
Out[149]:
In [150]:
# the AUC did not change
print "Old AUC: ",metrics.roc_auc_score(y_test, y_pred_prob1)
print "New AUC: ",metrics.roc_auc_score(y_test, y_pred_prob2)
In [151]:
# the ROC curve did not change
fpr2, tpr2, thresholds2 = metrics.roc_curve(y_test, y_pred_prob2)
figure = sns.plt.figure(figsize=(12,8))
figure.add_subplot(121)
sns.plt.plot(fpr, tpr)
sns.plt.title("Original ROC Curve")
figure.add_subplot(122)
sns.plt.title("ROC Curve of sqrt probabilities")
sns.plt.plot(fpr2, tpr2)
Out[151]:
In [154]:
#create a logistic regression and check 10-fold RMSE
lr = LogisticRegression(C=1e9, penalty='l1')
cross_val_scores = np.abs(cross_val_score(lr,X,y,scoring = "mean_squared_error", cv=10))
rmse_cross_val_scores = map(np.sqrt, cross_val_scores)
print "Mean 10-fold rmse: ", np.mean(rmse_cross_val_scores)
print "Std 10-fold rmse: ", np.std(rmse_cross_val_scores)
Cross-validation just allows you to split your data however you want.
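As a small illustration of that (a sketch using the older sklearn.cross_validation.KFold API imported above, which takes the number of samples rather than the data itself):
In [ ]:
# build 5 folds by hand and look at their sizes
kf = KFold(len(y), n_folds=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf, 1):
    print("fold %d: %d train rows, %d test rows" % (fold, len(train_idx), len(test_idx)))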
The class below is for splitting the data frame into column groups; the categorical columns should come first and the numerical ones last.
In [155]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples, i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to sklearn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample (e.g. a
    list of dicts). If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
Categorical variables absolutely need to be encoded, and the encoders have to be applied only to the categorical columns, so in this case I need to tweak it. LabelEncoder is a transformer: it takes strings and turns them into numbers.
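A tiny sketch of that (my addition, using made-up party labels) showing LabelEncoder turning strings into integer codes:
In [ ]:
# LabelEncoder maps each distinct string to an integer (classes are sorted alphabetically)
le_demo = LabelEncoder()
print(le_demo.fit_transform(["R", "D", "D", "R", "I"]))   # -> [2 0 0 2 1]
print(le_demo.classes_)                                   # -> ['D' 'I' 'R']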
In [ ]:
# from sklearn.pipeline import FeatureUnion, Pipeline
# from sklearn.preprocessing import OneHotEncoder
# #encode the categorical column from strings to ints
# le = LabelEncoder()
# abalone_data["sex_encoded"] = abalone_data[[categorical_columns]].apply(le.fit_transform)
# #extract the y
# y = abalone_data.age
# #create the feature union for the features
# X_transformed_pipe = FeatureUnion(
# transformer_list=[
# # Pipeline for one hot encoding categorical column
# ('sexes', Pipeline([
# ('selector', ItemSelector(key=["sex_encoded"])),
# ('encoder', OneHotEncoder())
# ])),
# # Pipeline for pulling out numeric features and scaling them
# ('numeric', Pipeline([
# ('selector', ItemSelector(key=numeric_columns)),
# #('polyfeatures', PolynomialFeatures(degree=2,interaction_only=True)),
# ('scaler', StandardScaler()),
# ]))])
# #create the full final pipeline
# full_pipeline = Pipeline([("all_features",X_transformed_pipe),("rf_regressor",RandomForestRegressor(n_estimators=100))])
A pipeline is a list of transformations. Each entry in the list is a tuple that defines two things: a string name for the transformation and the actual transformer to apply. The transformers can themselves be pipelines, and if you have two pipelines whose results you want to combine, you use a FeatureUnion, another class that takes the list of pipelines you want to put together. Starting from the original matrix, we pull out the numeric and the categorical columns and they receive two different treatments: the numeric columns get scaled and the categorical ones get turned into numbers. The first stage of each sub-pipeline is selecting its columns (in the commented example above, the 'sexes' selector), so there are two sub-pipelines. Encoding the categorical column works like pd.get_dummies; OneHotEncoder is simply the scikit-learn implementation of this (note that, unlike dummy coding with a dropped reference column, it keeps one column per category). The FeatureUnion's transformer_list takes the list of sub-pipelines, and the feature union itself gets put into another pipeline: the final pipeline's first stage runs the feature union and its second stage runs the random forest regressor with the chosen number of trees. At the end you pass that whole nested pipeline to cross-validation and feed it the data.
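Here is a runnable toy version of that structure (my sketch with made-up columns, not the abalone code above): two ItemSelector sub-pipelines combined with a FeatureUnion and fed into a final estimator.
In [ ]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder
# toy frame with one already label-encoded categorical column and two numeric columns
toy_df = pd.DataFrame({"cat_encoded": [0, 1, 0, 1, 2, 2],
                       "num_1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                       "num_2": [10.0, 9.0, 8.0, 7.0, 6.0, 5.0]})
toy_y = np.array([0, 1, 0, 1, 1, 0])
toy_features = FeatureUnion(transformer_list=[
    # one-hot encode the categorical column
    ("cats", Pipeline([("selector", ItemSelector(key=["cat_encoded"])),
                       ("encoder", OneHotEncoder())])),
    # scale the numeric columns
    ("nums", Pipeline([("selector", ItemSelector(key=["num_1", "num_2"])),
                       ("scaler", StandardScaler())])),
])
toy_pipeline = Pipeline([("all_features", toy_features),
                         ("classifier", LogisticRegression())])
toy_pipeline.fit(toy_df, toy_y)
print(toy_pipeline.predict(toy_df))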
In the kidney data set, LabelEncoder in scikit-learn converts values into classes starting at 0, and the 0 label ended up corresponding to NaN. Missing categorical values can be filled with the Imputer class: we tell it what counts as a missing value and set the strategy to most_frequent (with a custom transformer you can also pass your own fancy function). That imputer becomes a transformer object in a new pipeline, where the selector's key first picks the column you care about and the next step produces the column you are creating.
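A hedged sketch of the Imputer usage described above (this is the older sklearn.preprocessing.Imputer; with strategy="most_frequent" it fills missing values with the most common value in each column):
In [ ]:
from sklearn.preprocessing import Imputer
with_missing = np.array([[1.0, np.nan],
                         [1.0, 3.0],
                         [2.0, 3.0],
                         [np.nan, 3.0]])
imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
print(imp.fit_transform(with_missing))   # NaNs replaced by 1.0 and 3.0 respectively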
Every pipeline has an attribute called steps, and when you inspect it, it lists the stages in order with their indices. You can access the steps like an array, e.g. [0] or [1], and each step is a (name, transformer) tuple. It can get very nested: every sequence can contain another sequence.
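Continuing the toy example above, a quick look at that nested steps access (sketch):
In [ ]:
print(toy_pipeline.steps)                                  # list of (name, transformer) tuples
print(toy_pipeline.steps[0][0])                            # "all_features"
print(toy_pipeline.steps[0][1].transformer_list[0][0])     # "cats", one level deeper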
At the very end you would pickle the pipeline and the grid search, so you would end up with a basic script that can reload and reuse them.
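A minimal sketch of that last step, assuming joblib (the serializer usually recommended for scikit-learn objects) and the toy pipeline from above; the filename is just an example:
In [ ]:
from sklearn.externals import joblib
joblib.dump(toy_pipeline, "toy_pipeline.pkl")
reloaded = joblib.load("toy_pipeline.pkl")
print(reloaded.predict(toy_df))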