In [153]:
%matplotlib inline
import math
import locale
from locale import atof
import warnings

import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import xlrd

import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, RFE, RFECV, f_regression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (cross_val_score, cross_val_predict,
                                     KFold, StratifiedKFold)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, RidgeCV
from sklearn.ensemble import RandomForestClassifier

import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from IPython.display import display
In [154]:
# Load data from the Excel spreadsheet into pandas.
xls_file = pd.ExcelFile('D:\\Users\\Borja.gonzalez\\Desktop\\Thinkful-DataScience-Borja\\resultados_personality_insights_v0.2.xlsx')
# View the Excel file's sheet names.
#xls_file.sheet_names
# Load the 'Raw Data' sheet as a dataframe.
person = xls_file.parse('Raw Data')
person.head()
Out[154]:
In [155]:
# Drop the last 9 rows of the sheet.
person = person[:-9]
person.tail()
Out[155]:
In [156]:
person.folder.unique()
Out[156]:
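As a quick sanity check before modelling, the number of documents per folder label can be listed directly; a minimal sketch:
In [ ]:
# Number of rows per folder label (sketch to check class balance).
person['folder'].value_counts()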
In [157]:
# Check data types and missing values.
person.info()
In [158]:
# Keep the folder label, word count, and the five raw Big Five scores.
person1 = person[['folder', 'word_count', 'big5_openness_raw_score', 'big5_conscientiousness_raw_score',
                  'big5_extraversion_raw_score', 'big5_agreeableness_raw_score', 'big5_neuroticism_raw_score']]
# Display the selected features.
person1
Out[158]:
In [159]:
#Scaling all features
# Select only numeric variables to scale.
df = person1.select_dtypes(include=[np.number]).dropna()
# Save the column names.
names = df.columns
# Scale, then turn the resulting numpy array back into a data frame with the correct column names.
df_scaled = pd.DataFrame(preprocessing.scale(df), columns=names)
# All columns should now have mean 0 and standard deviation 1.
df_scaled.head()
Out[159]:
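A quick way to confirm the scaling worked: every scaled column should show a mean of roughly 0 and a standard deviation of roughly 1. A minimal sketch:
In [ ]:
# Means should be ~0 and standard deviations ~1 after scaling (sanity-check sketch).
df_scaled.describe().loc[['mean', 'std']]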
In [160]:
person2 = df_scaled.rename(columns={'big5_openness_raw_score': 'big5_openness_scaled',
                                    'big5_conscientiousness_raw_score': 'big5_conscientiousness_scaled',
                                    'big5_extraversion_raw_score': 'big5_extraversion_scaled',
                                    'big5_agreeableness_raw_score': 'big5_agreeableness_scaled',
                                    'big5_neuroticism_raw_score': 'big5_neuroticism_scaled'})
person2.head()
Out[160]:
In [161]:
# Make the correlation matrix.
correlation_matrix = person2.corr()
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))
# Draw the heatmap using seaborn.
sns.heatmap(correlation_matrix, vmax=.8, square=True)
plt.show()
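To read the heatmap numerically, the off-diagonal correlations can be ranked by absolute value straight from the matrix; a minimal sketch (each pair appears twice because the matrix is symmetric):
In [ ]:
# Rank off-diagonal correlations by absolute strength (sketch).
corr_pairs = correlation_matrix.unstack()
corr_pairs = corr_pairs[corr_pairs < 1.0]  # drop the diagonal entries (exactly 1.0)
corr_pairs.abs().sort_values(ascending=False).head(10)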
In [162]:
# Eigenvectors & eigenvalues of the correlation matrix.
eig_vals, eig_vecs = np.linalg.eig(correlation_matrix)

# Inspect the eigenvalues and eigenvectors.
for i in range(len(eig_vals)):
    eigvecs = eig_vecs[:, i].reshape(1, len(person2.columns)).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvecs))
    print('Eigenvalue {}: {}'.format(i + 1, eig_vals[i]))
    print(40 * '-')

sklearn_pca = PCA(n_components=len(person2.columns))
Y_sklearn = sklearn_pca.fit_transform(correlation_matrix)
print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)
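To judge how many components are worth keeping, the cumulative explained variance is often easier to read than the individual ratios; a minimal sketch using the PCA fitted above:
In [ ]:
# Cumulative share of variance explained by the first k components (sketch).
np.cumsum(sklearn_pca.explained_variance_ratio_)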
In [163]:
# Scree plot of the eigenvalues.
plt.plot(eig_vals)
plt.show()
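One rough rule of thumb for reading the scree plot is the Kaiser criterion: retain components whose eigenvalue is greater than 1. A minimal sketch:
In [ ]:
# Kaiser criterion: count components with eigenvalue > 1 (rough heuristic, sketch).
print('Components with eigenvalue > 1:', np.sum(eig_vals > 1))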
In [164]:
y = person1['folder']
X = person2[['word_count', 'big5_openness_scaled', 'big5_conscientiousness_scaled',
             'big5_extraversion_scaled', 'big5_agreeableness_scaled', 'big5_neuroticism_scaled']]
In [165]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
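Note that test_size=0.8 holds out 80% of the rows and trains on only 20%. If the intent is the more common 80/20 train/test split, and the folder classes are imbalanced, a stratified split is a safer default; a sketch with assumed settings (the 0.2 test size and stratification are not the original choices):
In [ ]:
# Alternative: 80% train / 20% test, stratified on the target (assumed settings, sketch).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)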
In [166]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
In [167]:
print ("Multinomial Logistic regression Train Accuracy :: ", metrics.accuracy_score(y_train, mul_lr.predict(X_train)))
print ("Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(y_test, mul_lr.predict(X_test)))
In [168]:
# Integer-encode the training labels (note: this overwrites y and is not used by the classifiers below).
y = pd.factorize(y_train)[0]
Out[168]:
In [169]:
# Fit a random forest classifier on the training data (n_jobs=11 runs 11 parallel jobs).
clf = RandomForestClassifier(n_jobs=11, random_state=0)
clf.fit(X_train, y_train)
Out[169]:
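The forest is fitted but not yet evaluated; a short sketch of its test accuracy and per-feature importances, reusing the metrics helpers already imported:
In [ ]:
# Random forest test accuracy and feature importances (sketch).
print('Random Forest Test Accuracy :: ', metrics.accuracy_score(y_test, clf.predict(X_test)))
pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)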