In [153]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestClassifier

In [154]:
#Load data form excel spreadsheet into pandas
xls_file = pd.ExcelFile('D:\\Users\\Borja.gonzalez\\Desktop\\Thinkful-DataScience-Borja\\resultados_personality_insights_v0.2.xlsx')

# View the excel file's sheet names
#xls_file.sheet_names

# Load the xls file's 14tbl08ny as a dataframe
person = xls_file.parse('Raw Data')
person.head()


Out[154]:
folder file word_count processed_language big5_openness_raw_score big5_openness__facet_adventurousness_raw_score big5_openness__facet_artistic_interests_raw_score big5_openness__facet_emotionality_raw_score big5_openness__facet_imagination_raw_score big5_openness__facet_intellect_raw_score ... need_love_raw_score need_practicality_raw_score need_self_expression_raw_score need_stability_raw_score need_structure_raw_score value_conservation_raw_score value_openness_to_change_raw_score value_hedonism_raw_score value_self_enhancement_raw_score value_self_transcendence_raw_score
0 BCDM BCDM Essay #1.docx 1037.0 en 0.784474 0.552340 0.723263 0.688233 0.744498 0.731954 ... 0.722490 0.688515 0.621080 0.732718 0.730330 0.668267 0.796639 0.632937 0.674912 0.834175
1 BCDM BCDM Essay #10.docx 944.0 en 0.771121 0.545828 0.779554 0.739122 0.784990 0.769389 ... 0.810111 0.683229 0.637034 0.732972 0.701467 0.622341 0.799327 0.661351 0.679099 0.849262
2 BCDM BCDM Essay #11.docx 1109.0 en 0.821584 0.526137 0.769487 0.762700 0.805796 0.766390 ... 0.802418 0.696390 0.683699 0.741340 0.702538 0.608368 0.819925 0.644431 0.634194 0.849675
3 BCDM BCDM Essay #12.docx 1398.0 en 0.765794 0.448368 0.683062 0.729064 0.732648 0.675948 ... 0.812411 0.674062 0.632017 0.762876 0.698028 0.643144 0.742352 0.674282 0.632661 0.842662
4 BCDM BCDM Essay #13.docx 985.0 en 0.814402 0.545682 0.742526 0.688421 0.772085 0.770653 ... 0.721496 0.696977 0.645380 0.684512 0.699831 0.569484 0.817540 0.647659 0.648155 0.836181

5 rows × 56 columns


In [155]:
person = person[:-9]
person.tail()


Out[155]:
folder file word_count processed_language big5_openness_raw_score big5_openness__facet_adventurousness_raw_score big5_openness__facet_artistic_interests_raw_score big5_openness__facet_emotionality_raw_score big5_openness__facet_imagination_raw_score big5_openness__facet_intellect_raw_score ... need_love_raw_score need_practicality_raw_score need_self_expression_raw_score need_stability_raw_score need_structure_raw_score value_conservation_raw_score value_openness_to_change_raw_score value_hedonism_raw_score value_self_enhancement_raw_score value_self_transcendence_raw_score
212 TDHR TDHR Essay #5.docx 1164.0 en 0.784142 0.571794 0.654346 0.621485 0.723117 0.737616 ... 0.743371 0.713913 0.616218 0.736226 0.749601 0.617243 0.803430 0.626427 0.674647 0.833964
213 TDHR TDHR Essay #6.docx 1841.0 en 0.797916 0.531801 0.740805 0.702944 0.731774 0.763092 ... 0.722096 0.669997 0.601237 0.736552 0.726431 0.631269 0.766601 0.587490 0.648672 0.839953
214 TDHR TDHR Essay #7.pdf 599.0 en 0.801086 0.562731 0.614591 0.556978 0.674139 0.688987 ... 0.717358 0.720633 0.607210 0.713225 0.711345 0.583818 0.794607 0.615938 0.681913 0.802872
215 TDHR TDHR Essay #8.docx 1083.0 en 0.828846 0.555455 0.763287 0.690920 0.803990 0.773501 ... 0.767748 0.704486 0.669404 0.693312 0.692027 0.594143 0.821815 0.626897 0.662800 0.837475
216 TDHR TDHR Essay #9.docx 518.0 en 0.860767 0.520779 0.808049 0.638710 0.830613 0.768699 ... 0.645866 0.674787 0.590121 0.680510 0.692115 0.567373 0.792367 0.587199 0.635510 0.807051

5 rows × 56 columns


In [156]:
person.folder.unique()


Out[156]:
array(['BCDM', 'BIS', 'EMCC', 'EXMPLS', 'GMBD', 'MBD', 'MCC', 'MCS',
       'MCXI', 'MRCB', 'MVDM', 'TDHR'], dtype=object)

In [157]:
# Check quality of the information

person.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 56 columns):
folder                                                          217 non-null object
file                                                            217 non-null object
word_count                                                      217 non-null float64
processed_language                                              217 non-null object
big5_openness_raw_score                                         217 non-null float64
big5_openness__facet_adventurousness_raw_score                  217 non-null float64
big5_openness__facet_artistic_interests_raw_score               217 non-null float64
big5_openness__facet_emotionality_raw_score                     217 non-null float64
big5_openness__facet_imagination_raw_score                      217 non-null float64
big5_openness__facet_intellect_raw_score                        217 non-null float64
big5_openness__facet_liberalism_raw_score                       217 non-null float64
big5_conscientiousness_raw_score                                217 non-null float64
big5_conscientiousness__facet_achievement_striving_raw_score    217 non-null float64
big5_conscientiousness__facet_cautiousness_raw_score            217 non-null float64
big5_conscientiousness__facet_dutifulness_raw_score             217 non-null float64
big5_conscientiousness__facet_orderliness_raw_score             217 non-null float64
big5_conscientiousness__facet_self_discipline_raw_score         217 non-null float64
big5_conscientiousness__facet_self_efficacy_raw_score           217 non-null float64
big5_extraversion_raw_score                                     217 non-null float64
big5_extraversion__facet_activity_level_raw_score               217 non-null float64
big5_extraversion__facet_assertiveness_raw_score                217 non-null float64
big5_extraversion__facet_cheerfulness_raw_score                 217 non-null float64
big5_extraversion__facet_excitement_seeking_raw_score           217 non-null float64
big5_extraversion__facet_friendliness_raw_score                 217 non-null float64
big5_extraversion__facet_gregariousness_raw_score               217 non-null float64
big5_agreeableness_raw_score                                    217 non-null float64
big5_agreeableness__facet_altruism_raw_score                    217 non-null float64
big5_agreeableness__facet_cooperation_raw_score                 217 non-null float64
big5_agreeableness__facet_modesty_raw_score                     217 non-null float64
big5_agreeableness__facet_morality_raw_score                    217 non-null float64
big5_agreeableness__facet_sympathy_raw_score                    217 non-null float64
big5_agreeableness__facet_trust_raw_score                       217 non-null float64
big5_neuroticism_raw_score                                      217 non-null float64
big5_neuroticism__facet_anger_raw_score                         217 non-null float64
big5_neuroticism__facet_anxiety_raw_score                       217 non-null float64
big5_neuroticism__facet_depression_raw_score                    217 non-null float64
big5_neuroticism__facet_immoderation_raw_score                  217 non-null float64
big5_neuroticism__facet_self_consciousness_raw_score            217 non-null float64
big5_neuroticism__facet_vulnerability_raw_score                 217 non-null float64
need_challenge_raw_score                                        217 non-null float64
need_closeness_raw_score                                        217 non-null float64
need_curiosity_raw_score                                        217 non-null float64
need_excitement_raw_score                                       217 non-null float64
need_harmony_raw_score                                          217 non-null float64
need_ideal_raw_score                                            217 non-null float64
need_liberty_raw_score                                          217 non-null float64
need_love_raw_score                                             217 non-null float64
need_practicality_raw_score                                     217 non-null float64
need_self_expression_raw_score                                  217 non-null float64
need_stability_raw_score                                        217 non-null float64
need_structure_raw_score                                        217 non-null float64
value_conservation_raw_score                                    217 non-null float64
value_openness_to_change_raw_score                              217 non-null float64
value_hedonism_raw_score                                        217 non-null float64
value_self_enhancement_raw_score                                217 non-null float64
value_self_transcendence_raw_score                              217 non-null float64
dtypes: float64(53), object(3)
memory usage: 95.0+ KB

In [158]:
person1 = person[['folder','word_count','big5_openness_raw_score','big5_conscientiousness_raw_score','big5_extraversion_raw_score','big5_agreeableness_raw_score','big5_neuroticism_raw_score']]
#Describe the dataset
person1


Out[158]:
folder word_count big5_openness_raw_score big5_conscientiousness_raw_score big5_extraversion_raw_score big5_agreeableness_raw_score big5_neuroticism_raw_score
0 BCDM 1037.0 0.784474 0.647777 0.507292 0.705867 0.472964
1 BCDM 944.0 0.771121 0.574128 0.501644 0.741771 0.422941
2 BCDM 1109.0 0.821584 0.598315 0.494727 0.750020 0.401171
3 BCDM 1398.0 0.765794 0.591388 0.512185 0.772154 0.445388
4 BCDM 985.0 0.814402 0.616285 0.501854 0.716395 0.464840
5 BCDM 650.0 0.830572 0.578275 0.489811 0.678850 0.371156
6 BCDM 956.0 0.784986 0.657518 0.467775 0.670573 0.471771
7 BCDM 555.0 0.853094 0.655600 0.522446 0.703827 0.493167
8 BCDM 702.0 0.793103 0.614618 0.549806 0.747433 0.482147
9 BCDM 568.0 0.826836 0.576928 0.509567 0.751530 0.439642
10 BCDM 1937.0 0.831228 0.639953 0.532695 0.654130 0.508496
11 BCDM 506.0 0.796406 0.627902 0.517048 0.683698 0.482519
12 BCDM 1078.0 0.831727 0.666666 0.503420 0.689208 0.534743
13 BCDM 1546.0 0.796767 0.569959 0.485702 0.734860 0.432378
14 BCDM 611.0 0.767014 0.571563 0.498915 0.715347 0.454412
15 BIS 1262.0 0.833760 0.653384 0.489032 0.651833 0.549215
16 BIS 627.0 0.814350 0.626922 0.452273 0.664134 0.475572
17 BIS 1041.0 0.814281 0.649249 0.506349 0.729125 0.487553
18 BIS 914.0 0.786332 0.569781 0.513221 0.758894 0.374587
19 BIS 445.0 0.793603 0.643588 0.512760 0.717842 0.464486
20 BIS 497.0 0.768328 0.638858 0.523778 0.684490 0.474741
21 BIS 1136.0 0.802678 0.620585 0.523026 0.692387 0.452739
22 BIS 1408.0 0.788466 0.596585 0.541857 0.721299 0.431155
23 BIS 761.0 0.827433 0.602352 0.558148 0.628755 0.485206
24 BIS 917.0 0.799459 0.659157 0.500673 0.688529 0.529178
25 BIS 925.0 0.774660 0.635339 0.499117 0.702165 0.493690
26 BIS 1050.0 0.822442 0.640226 0.516685 0.671679 0.479070
27 BIS 630.0 0.795939 0.589845 0.486373 0.714871 0.423696
28 BIS 559.0 0.850150 0.604707 0.467417 0.640840 0.526028
29 BIS 896.0 0.782523 0.543350 0.485366 0.689979 0.419266
... ... ... ... ... ... ... ...
187 MVDM 710.0 0.821373 0.633548 0.561368 0.686419 0.467188
188 MVDM 1858.0 0.801350 0.639905 0.553201 0.701872 0.467313
189 MVDM 1663.0 0.811635 0.618572 0.538370 0.650232 0.464092
190 MVDM 970.0 0.817362 0.638886 0.576353 0.668079 0.519775
191 MVDM 1095.0 0.814708 0.627567 0.507300 0.655108 0.506072
192 MVDM 1606.0 0.811679 0.599690 0.514282 0.655292 0.461818
193 MVDM 1588.0 0.808115 0.605791 0.525639 0.640698 0.469702
194 MVDM 1128.0 0.784938 0.628240 0.523701 0.700249 0.465118
195 MVDM 589.0 0.836460 0.685382 0.573893 0.682619 0.551122
196 MVDM 1142.0 0.811224 0.623289 0.532908 0.661197 0.460595
197 MVDM 1195.0 0.811526 0.653412 0.554200 0.702209 0.468103
198 MVDM 1626.0 0.804423 0.624118 0.566358 0.674623 0.505498
199 MVDM 597.0 0.780467 0.579961 0.502402 0.731105 0.445790
200 TDHR 1143.0 0.790548 0.679955 0.536264 0.691009 0.518097
201 TDHR 1372.0 0.799601 0.639468 0.531169 0.679276 0.483912
202 TDHR 1025.0 0.798398 0.641255 0.567299 0.677446 0.487078
203 TDHR 1669.0 0.798977 0.670610 0.540362 0.738324 0.490344
204 TDHR 263.0 0.834513 0.653579 0.609476 0.695142 0.512266
205 TDHR 642.0 0.809884 0.627719 0.529245 0.656981 0.479647
206 TDHR 629.0 0.817642 0.614205 0.605506 0.656003 0.500044
207 TDHR 1744.0 0.829489 0.636871 0.567968 0.681831 0.497909
208 TDHR 1043.0 0.777484 0.617336 0.527547 0.672747 0.467975
209 TDHR 1194.0 0.784093 0.658437 0.544969 0.665350 0.511934
210 TDHR 562.0 0.810377 0.604879 0.567782 0.678530 0.442344
211 TDHR 567.0 0.805316 0.612327 0.536813 0.695724 0.432867
212 TDHR 1164.0 0.784142 0.651490 0.528617 0.674021 0.481337
213 TDHR 1841.0 0.797916 0.682389 0.491711 0.716173 0.477386
214 TDHR 599.0 0.801086 0.633979 0.576884 0.642123 0.539169
215 TDHR 1083.0 0.828846 0.615844 0.588596 0.713022 0.443045
216 TDHR 518.0 0.860767 0.624725 0.527515 0.617228 0.525838

217 rows × 7 columns


In [159]:
#Scaling all features

# Select only numeric variables to scale.
df = person1.select_dtypes(include=[np.number]).dropna()

# Save the column names.
names = df.columns

# Scale, then turn the resulting numpy array back into a data frame with the correct column names.
df_scaled = pd.DataFrame(preprocessing.scale(df), columns=names)

# Lookit all those matching means and standard deviations!
df_scaled.head()


Out[159]:
word_count big5_openness_raw_score big5_conscientiousness_raw_score big5_extraversion_raw_score big5_agreeableness_raw_score big5_neuroticism_raw_score
0 0.050025 -1.189772 0.653134 -1.138976 0.753642 -0.437386
1 -0.134604 -1.848076 -2.354997 -1.317098 1.987142 -1.946366
2 0.192963 0.639677 -1.367108 -1.535213 2.270529 -2.603088
3 0.766700 -2.110693 -1.650030 -0.984659 3.030965 -1.269245
4 -0.053208 0.285630 -0.633140 -1.310452 1.115322 -0.682455

In [160]:
person2= df_scaled.rename(columns={'big5_openness_raw_score': 'big5_openness_scaled',
                                 'big5_conscientiousness_raw_score': 'big5_conscientiousness_scaled',
                                 'big5_extraversion_raw_score':'big5_extraversion_scaled',
                                 'big5_agreeableness_raw_score': 'big5_agreeableness_scaled',
                                 'big5_neuroticism_raw_score':'big5_neuroticism_scaled'
                                })
person2.head()


Out[160]:
word_count big5_openness_scaled big5_conscientiousness_scaled big5_extraversion_scaled big5_agreeableness_scaled big5_neuroticism_scaled
0 0.050025 -1.189772 0.653134 -1.138976 0.753642 -0.437386
1 -0.134604 -1.848076 -2.354997 -1.317098 1.987142 -1.946366
2 0.192963 0.639677 -1.367108 -1.535213 2.270529 -2.603088
3 0.766700 -2.110693 -1.650030 -0.984659 3.030965 -1.269245
4 -0.053208 0.285630 -0.633140 -1.310452 1.115322 -0.682455

In [161]:
# Make the correlation matrix.
correlation_matrix = person2.corr()

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn.
sns.heatmap(correlation_matrix, vmax=.8, square=True)
plt.show()



In [162]:
#Eigenvectores & Eigenvalues

eig_vals, eig_vecs = np.linalg.eig(correlation_matrix)

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_vals)):
    eigvecs = eig_vecs[:, i].reshape(1, len(person2.columns)).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvecs))
    print('Eigenvalue {}: {}'.format(i + 1, eig_vals[i]))
    print(40 * '-')


sklearn_pca = PCA(n_components=len(person2.columns))
Y_sklearn = sklearn_pca.fit_transform(correlation_matrix)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)


Eigenvector 1: 
[[-0.07910804]
 [ 0.42973135]
 [ 0.43986724]
 [ 0.37821307]
 [-0.37474999]
 [ 0.57628727]]
Eigenvalue 1: 2.3277045474505345
----------------------------------------
Eigenvector 2: 
[[ 0.10681784]
 [-0.01398469]
 [-0.54387073]
 [-0.01824947]
 [ 0.41424768]
 [ 0.72157108]]
Eigenvalue 2: 0.21289857044447305
----------------------------------------
Eigenvector 3: 
[[-0.2530512 ]
 [-0.83910785]
 [ 0.05236615]
 [ 0.41463868]
 [-0.16966073]
 [ 0.16855532]]
Eigenvalue 3: 0.665670550115972
----------------------------------------
Eigenvector 4: 
[[ 0.94401633]
 [-0.22181736]
 [ 0.22631285]
 [ 0.05766409]
 [-0.04607966]
 [ 0.05444487]]
Eigenvalue 4: 1.0273766555370096
----------------------------------------
Eigenvector 5: 
[[-0.13793165]
 [-0.08300486]
 [ 0.64247332]
 [-0.05973448]
 [ 0.74305045]
 [ 0.07497273]]
Eigenvalue 5: 0.961292341936451
----------------------------------------
Eigenvector 6: 
[[-0.09002774]
 [-0.23439425]
 [ 0.20973733]
 [-0.82328807]
 [-0.32392783]
 [ 0.33201222]]
Eigenvalue 6: 0.8050573345155562
----------------------------------------
The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [  6.28201539e-01   1.59072557e-01   1.17277504e-01   7.66064020e-02
   1.88419981e-02   2.57603426e-34]

In [163]:
#From the Scree plot.

plt.plot(eig_vals)
plt.show()



In [164]:
y = person1['folder']
X = person2[['word_count','big5_openness_scaled','big5_conscientiousness_scaled','big5_extraversion_scaled','big5_agreeableness_scaled',
             'big5_neuroticism_scaled']]

In [165]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)

In [166]:
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)

In [167]:
print ("Multinomial Logistic regression Train Accuracy :: ", metrics.accuracy_score(y_train, mul_lr.predict(X_train)))
print ("Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(y_test, mul_lr.predict(X_test)))


Multinomial Logistic regression Train Accuracy ::  0.53488372093
Multinomial Logistic regression Test Accuracy ::  0.0977011494253

In [168]:
y = pd.factorize(y_train)[0]


Out[168]:
array([ 0,  1,  2,  1,  0,  3,  4,  1,  4,  5,  3,  6,  7,  3,  7,  6,  4,
        8,  7,  7,  5,  3,  7,  8,  3,  8,  0,  1,  9, 10,  6,  7,  6,  0,
        5, 11,  6,  1,  7,  1,  2,  0,  3], dtype=int64)

In [169]:
clf = RandomForestClassifier(n_jobs=11, random_state=0)

clf.fit(X_train, y_train)


Out[169]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=11,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [172]:



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-172-5cf38150a878> in <module>()
      1 # Apply the Classifier we trained to the test data (which, remember, it has never seen before)
----> 2 clf.predict(X_test).prob()

AttributeError: 'numpy.ndarray' object has no attribute 'prob'

In [ ]:


In [ ]: