notebook.community

Edit and run



In [153]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestClassifier



In [154]:

    
#Load data form excel spreadsheet into pandas
xls_file = pd.ExcelFile('D:\\Users\\Borja.gonzalez\\Desktop\\Thinkful-DataScience-Borja\\resultados_personality_insights_v0.2.xlsx')

# View the excel file's sheet names
#xls_file.sheet_names

# Load the xls file's 14tbl08ny as a dataframe
person = xls_file.parse('Raw Data')
person.head()









    Out[154]:







  
    
      
      folder
      file
      word_count
      processed_language
      big5_openness_raw_score
      big5_openness__facet_adventurousness_raw_score
      big5_openness__facet_artistic_interests_raw_score
      big5_openness__facet_emotionality_raw_score
      big5_openness__facet_imagination_raw_score
      big5_openness__facet_intellect_raw_score
      ...
      need_love_raw_score
      need_practicality_raw_score
      need_self_expression_raw_score
      need_stability_raw_score
      need_structure_raw_score
      value_conservation_raw_score
      value_openness_to_change_raw_score
      value_hedonism_raw_score
      value_self_enhancement_raw_score
      value_self_transcendence_raw_score
    
  
  
    
      0
      BCDM
      BCDM Essay #1.docx
      1037.0
      en
      0.784474
      0.552340
      0.723263
      0.688233
      0.744498
      0.731954
      ...
      0.722490
      0.688515
      0.621080
      0.732718
      0.730330
      0.668267
      0.796639
      0.632937
      0.674912
      0.834175
    
    
      1
      BCDM
      BCDM Essay #10.docx
      944.0
      en
      0.771121
      0.545828
      0.779554
      0.739122
      0.784990
      0.769389
      ...
      0.810111
      0.683229
      0.637034
      0.732972
      0.701467
      0.622341
      0.799327
      0.661351
      0.679099
      0.849262
    
    
      2
      BCDM
      BCDM Essay #11.docx
      1109.0
      en
      0.821584
      0.526137
      0.769487
      0.762700
      0.805796
      0.766390
      ...
      0.802418
      0.696390
      0.683699
      0.741340
      0.702538
      0.608368
      0.819925
      0.644431
      0.634194
      0.849675
    
    
      3
      BCDM
      BCDM Essay #12.docx
      1398.0
      en
      0.765794
      0.448368
      0.683062
      0.729064
      0.732648
      0.675948
      ...
      0.812411
      0.674062
      0.632017
      0.762876
      0.698028
      0.643144
      0.742352
      0.674282
      0.632661
      0.842662
    
    
      4
      BCDM
      BCDM Essay #13.docx
      985.0
      en
      0.814402
      0.545682
      0.742526
      0.688421
      0.772085
      0.770653
      ...
      0.721496
      0.696977
      0.645380
      0.684512
      0.699831
      0.569484
      0.817540
      0.647659
      0.648155
      0.836181
    
  

5 rows × 56 columns



In [155]:

    
person = person[:-9]
person.tail()









    Out[155]:







  
    
      
      folder
      file
      word_count
      processed_language
      big5_openness_raw_score
      big5_openness__facet_adventurousness_raw_score
      big5_openness__facet_artistic_interests_raw_score
      big5_openness__facet_emotionality_raw_score
      big5_openness__facet_imagination_raw_score
      big5_openness__facet_intellect_raw_score
      ...
      need_love_raw_score
      need_practicality_raw_score
      need_self_expression_raw_score
      need_stability_raw_score
      need_structure_raw_score
      value_conservation_raw_score
      value_openness_to_change_raw_score
      value_hedonism_raw_score
      value_self_enhancement_raw_score
      value_self_transcendence_raw_score
    
  
  
    
      212
      TDHR
      TDHR Essay #5.docx
      1164.0
      en
      0.784142
      0.571794
      0.654346
      0.621485
      0.723117
      0.737616
      ...
      0.743371
      0.713913
      0.616218
      0.736226
      0.749601
      0.617243
      0.803430
      0.626427
      0.674647
      0.833964
    
    
      213
      TDHR
      TDHR Essay #6.docx
      1841.0
      en
      0.797916
      0.531801
      0.740805
      0.702944
      0.731774
      0.763092
      ...
      0.722096
      0.669997
      0.601237
      0.736552
      0.726431
      0.631269
      0.766601
      0.587490
      0.648672
      0.839953
    
    
      214
      TDHR
      TDHR Essay #7.pdf
      599.0
      en
      0.801086
      0.562731
      0.614591
      0.556978
      0.674139
      0.688987
      ...
      0.717358
      0.720633
      0.607210
      0.713225
      0.711345
      0.583818
      0.794607
      0.615938
      0.681913
      0.802872
    
    
      215
      TDHR
      TDHR Essay #8.docx
      1083.0
      en
      0.828846
      0.555455
      0.763287
      0.690920
      0.803990
      0.773501
      ...
      0.767748
      0.704486
      0.669404
      0.693312
      0.692027
      0.594143
      0.821815
      0.626897
      0.662800
      0.837475
    
    
      216
      TDHR
      TDHR Essay #9.docx
      518.0
      en
      0.860767
      0.520779
      0.808049
      0.638710
      0.830613
      0.768699
      ...
      0.645866
      0.674787
      0.590121
      0.680510
      0.692115
      0.567373
      0.792367
      0.587199
      0.635510
      0.807051
    
  

5 rows × 56 columns



In [156]:

    
person.folder.unique()









    Out[156]:





array(['BCDM', 'BIS', 'EMCC', 'EXMPLS', 'GMBD', 'MBD', 'MCC', 'MCS',
       'MCXI', 'MRCB', 'MVDM', 'TDHR'], dtype=object)



In [157]:

    
# Check quality of the information

person.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 56 columns):
folder                                                          217 non-null object
file                                                            217 non-null object
word_count                                                      217 non-null float64
processed_language                                              217 non-null object
big5_openness_raw_score                                         217 non-null float64
big5_openness__facet_adventurousness_raw_score                  217 non-null float64
big5_openness__facet_artistic_interests_raw_score               217 non-null float64
big5_openness__facet_emotionality_raw_score                     217 non-null float64
big5_openness__facet_imagination_raw_score                      217 non-null float64
big5_openness__facet_intellect_raw_score                        217 non-null float64
big5_openness__facet_liberalism_raw_score                       217 non-null float64
big5_conscientiousness_raw_score                                217 non-null float64
big5_conscientiousness__facet_achievement_striving_raw_score    217 non-null float64
big5_conscientiousness__facet_cautiousness_raw_score            217 non-null float64
big5_conscientiousness__facet_dutifulness_raw_score             217 non-null float64
big5_conscientiousness__facet_orderliness_raw_score             217 non-null float64
big5_conscientiousness__facet_self_discipline_raw_score         217 non-null float64
big5_conscientiousness__facet_self_efficacy_raw_score           217 non-null float64
big5_extraversion_raw_score                                     217 non-null float64
big5_extraversion__facet_activity_level_raw_score               217 non-null float64
big5_extraversion__facet_assertiveness_raw_score                217 non-null float64
big5_extraversion__facet_cheerfulness_raw_score                 217 non-null float64
big5_extraversion__facet_excitement_seeking_raw_score           217 non-null float64
big5_extraversion__facet_friendliness_raw_score                 217 non-null float64
big5_extraversion__facet_gregariousness_raw_score               217 non-null float64
big5_agreeableness_raw_score                                    217 non-null float64
big5_agreeableness__facet_altruism_raw_score                    217 non-null float64
big5_agreeableness__facet_cooperation_raw_score                 217 non-null float64
big5_agreeableness__facet_modesty_raw_score                     217 non-null float64
big5_agreeableness__facet_morality_raw_score                    217 non-null float64
big5_agreeableness__facet_sympathy_raw_score                    217 non-null float64
big5_agreeableness__facet_trust_raw_score                       217 non-null float64
big5_neuroticism_raw_score                                      217 non-null float64
big5_neuroticism__facet_anger_raw_score                         217 non-null float64
big5_neuroticism__facet_anxiety_raw_score                       217 non-null float64
big5_neuroticism__facet_depression_raw_score                    217 non-null float64
big5_neuroticism__facet_immoderation_raw_score                  217 non-null float64
big5_neuroticism__facet_self_consciousness_raw_score            217 non-null float64
big5_neuroticism__facet_vulnerability_raw_score                 217 non-null float64
need_challenge_raw_score                                        217 non-null float64
need_closeness_raw_score                                        217 non-null float64
need_curiosity_raw_score                                        217 non-null float64
need_excitement_raw_score                                       217 non-null float64
need_harmony_raw_score                                          217 non-null float64
need_ideal_raw_score                                            217 non-null float64
need_liberty_raw_score                                          217 non-null float64
need_love_raw_score                                             217 non-null float64
need_practicality_raw_score                                     217 non-null float64
need_self_expression_raw_score                                  217 non-null float64
need_stability_raw_score                                        217 non-null float64
need_structure_raw_score                                        217 non-null float64
value_conservation_raw_score                                    217 non-null float64
value_openness_to_change_raw_score                              217 non-null float64
value_hedonism_raw_score                                        217 non-null float64
value_self_enhancement_raw_score                                217 non-null float64
value_self_transcendence_raw_score                              217 non-null float64
dtypes: float64(53), object(3)
memory usage: 95.0+ KB



In [158]:

    
person1 = person[['folder','word_count','big5_openness_raw_score','big5_conscientiousness_raw_score','big5_extraversion_raw_score','big5_agreeableness_raw_score','big5_neuroticism_raw_score']]
#Describe the dataset
person1









    Out[158]:







  
    
      
      folder
      word_count
      big5_openness_raw_score
      big5_conscientiousness_raw_score
      big5_extraversion_raw_score
      big5_agreeableness_raw_score
      big5_neuroticism_raw_score
    
  
  
    
      0
      BCDM
      1037.0
      0.784474
      0.647777
      0.507292
      0.705867
      0.472964
    
    
      1
      BCDM
      944.0
      0.771121
      0.574128
      0.501644
      0.741771
      0.422941
    
    
      2
      BCDM
      1109.0
      0.821584
      0.598315
      0.494727
      0.750020
      0.401171
    
    
      3
      BCDM
      1398.0
      0.765794
      0.591388
      0.512185
      0.772154
      0.445388
    
    
      4
      BCDM
      985.0
      0.814402
      0.616285
      0.501854
      0.716395
      0.464840
    
    
      5
      BCDM
      650.0
      0.830572
      0.578275
      0.489811
      0.678850
      0.371156
    
    
      6
      BCDM
      956.0
      0.784986
      0.657518
      0.467775
      0.670573
      0.471771
    
    
      7
      BCDM
      555.0
      0.853094
      0.655600
      0.522446
      0.703827
      0.493167
    
    
      8
      BCDM
      702.0
      0.793103
      0.614618
      0.549806
      0.747433
      0.482147
    
    
      9
      BCDM
      568.0
      0.826836
      0.576928
      0.509567
      0.751530
      0.439642
    
    
      10
      BCDM
      1937.0
      0.831228
      0.639953
      0.532695
      0.654130
      0.508496
    
    
      11
      BCDM
      506.0
      0.796406
      0.627902
      0.517048
      0.683698
      0.482519
    
    
      12
      BCDM
      1078.0
      0.831727
      0.666666
      0.503420
      0.689208
      0.534743
    
    
      13
      BCDM
      1546.0
      0.796767
      0.569959
      0.485702
      0.734860
      0.432378
    
    
      14
      BCDM
      611.0
      0.767014
      0.571563
      0.498915
      0.715347
      0.454412
    
    
      15
      BIS
      1262.0
      0.833760
      0.653384
      0.489032
      0.651833
      0.549215
    
    
      16
      BIS
      627.0
      0.814350
      0.626922
      0.452273
      0.664134
      0.475572
    
    
      17
      BIS
      1041.0
      0.814281
      0.649249
      0.506349
      0.729125
      0.487553
    
    
      18
      BIS
      914.0
      0.786332
      0.569781
      0.513221
      0.758894
      0.374587
    
    
      19
      BIS
      445.0
      0.793603
      0.643588
      0.512760
      0.717842
      0.464486
    
    
      20
      BIS
      497.0
      0.768328
      0.638858
      0.523778
      0.684490
      0.474741
    
    
      21
      BIS
      1136.0
      0.802678
      0.620585
      0.523026
      0.692387
      0.452739
    
    
      22
      BIS
      1408.0
      0.788466
      0.596585
      0.541857
      0.721299
      0.431155
    
    
      23
      BIS
      761.0
      0.827433
      0.602352
      0.558148
      0.628755
      0.485206
    
    
      24
      BIS
      917.0
      0.799459
      0.659157
      0.500673
      0.688529
      0.529178
    
    
      25
      BIS
      925.0
      0.774660
      0.635339
      0.499117
      0.702165
      0.493690
    
    
      26
      BIS
      1050.0
      0.822442
      0.640226
      0.516685
      0.671679
      0.479070
    
    
      27
      BIS
      630.0
      0.795939
      0.589845
      0.486373
      0.714871
      0.423696
    
    
      28
      BIS
      559.0
      0.850150
      0.604707
      0.467417
      0.640840
      0.526028
    
    
      29
      BIS
      896.0
      0.782523
      0.543350
      0.485366
      0.689979
      0.419266
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      187
      MVDM
      710.0
      0.821373
      0.633548
      0.561368
      0.686419
      0.467188
    
    
      188
      MVDM
      1858.0
      0.801350
      0.639905
      0.553201
      0.701872
      0.467313
    
    
      189
      MVDM
      1663.0
      0.811635
      0.618572
      0.538370
      0.650232
      0.464092
    
    
      190
      MVDM
      970.0
      0.817362
      0.638886
      0.576353
      0.668079
      0.519775
    
    
      191
      MVDM
      1095.0
      0.814708
      0.627567
      0.507300
      0.655108
      0.506072
    
    
      192
      MVDM
      1606.0
      0.811679
      0.599690
      0.514282
      0.655292
      0.461818
    
    
      193
      MVDM
      1588.0
      0.808115
      0.605791
      0.525639
      0.640698
      0.469702
    
    
      194
      MVDM
      1128.0
      0.784938
      0.628240
      0.523701
      0.700249
      0.465118
    
    
      195
      MVDM
      589.0
      0.836460
      0.685382
      0.573893
      0.682619
      0.551122
    
    
      196
      MVDM
      1142.0
      0.811224
      0.623289
      0.532908
      0.661197
      0.460595
    
    
      197
      MVDM
      1195.0
      0.811526
      0.653412
      0.554200
      0.702209
      0.468103
    
    
      198
      MVDM
      1626.0
      0.804423
      0.624118
      0.566358
      0.674623
      0.505498
    
    
      199
      MVDM
      597.0
      0.780467
      0.579961
      0.502402
      0.731105
      0.445790
    
    
      200
      TDHR
      1143.0
      0.790548
      0.679955
      0.536264
      0.691009
      0.518097
    
    
      201
      TDHR
      1372.0
      0.799601
      0.639468
      0.531169
      0.679276
      0.483912
    
    
      202
      TDHR
      1025.0
      0.798398
      0.641255
      0.567299
      0.677446
      0.487078
    
    
      203
      TDHR
      1669.0
      0.798977
      0.670610
      0.540362
      0.738324
      0.490344
    
    
      204
      TDHR
      263.0
      0.834513
      0.653579
      0.609476
      0.695142
      0.512266
    
    
      205
      TDHR
      642.0
      0.809884
      0.627719
      0.529245
      0.656981
      0.479647
    
    
      206
      TDHR
      629.0
      0.817642
      0.614205
      0.605506
      0.656003
      0.500044
    
    
      207
      TDHR
      1744.0
      0.829489
      0.636871
      0.567968
      0.681831
      0.497909
    
    
      208
      TDHR
      1043.0
      0.777484
      0.617336
      0.527547
      0.672747
      0.467975
    
    
      209
      TDHR
      1194.0
      0.784093
      0.658437
      0.544969
      0.665350
      0.511934
    
    
      210
      TDHR
      562.0
      0.810377
      0.604879
      0.567782
      0.678530
      0.442344
    
    
      211
      TDHR
      567.0
      0.805316
      0.612327
      0.536813
      0.695724
      0.432867
    
    
      212
      TDHR
      1164.0
      0.784142
      0.651490
      0.528617
      0.674021
      0.481337
    
    
      213
      TDHR
      1841.0
      0.797916
      0.682389
      0.491711
      0.716173
      0.477386
    
    
      214
      TDHR
      599.0
      0.801086
      0.633979
      0.576884
      0.642123
      0.539169
    
    
      215
      TDHR
      1083.0
      0.828846
      0.615844
      0.588596
      0.713022
      0.443045
    
    
      216
      TDHR
      518.0
      0.860767
      0.624725
      0.527515
      0.617228
      0.525838
    
  

217 rows × 7 columns



In [159]:

    
#Scaling all features

# Select only numeric variables to scale.
df = person1.select_dtypes(include=[np.number]).dropna()

# Save the column names.
names = df.columns

# Scale, then turn the resulting numpy array back into a data frame with the correct column names.
df_scaled = pd.DataFrame(preprocessing.scale(df), columns=names)

# Lookit all those matching means and standard deviations!
df_scaled.head()









    Out[159]:







  
    
      
      word_count
      big5_openness_raw_score
      big5_conscientiousness_raw_score
      big5_extraversion_raw_score
      big5_agreeableness_raw_score
      big5_neuroticism_raw_score
    
  
  
    
      0
      0.050025
      -1.189772
      0.653134
      -1.138976
      0.753642
      -0.437386
    
    
      1
      -0.134604
      -1.848076
      -2.354997
      -1.317098
      1.987142
      -1.946366
    
    
      2
      0.192963
      0.639677
      -1.367108
      -1.535213
      2.270529
      -2.603088
    
    
      3
      0.766700
      -2.110693
      -1.650030
      -0.984659
      3.030965
      -1.269245
    
    
      4
      -0.053208
      0.285630
      -0.633140
      -1.310452
      1.115322
      -0.682455



In [160]:

    
person2= df_scaled.rename(columns={'big5_openness_raw_score': 'big5_openness_scaled',
                                 'big5_conscientiousness_raw_score': 'big5_conscientiousness_scaled',
                                 'big5_extraversion_raw_score':'big5_extraversion_scaled',
                                 'big5_agreeableness_raw_score': 'big5_agreeableness_scaled',
                                 'big5_neuroticism_raw_score':'big5_neuroticism_scaled'
                                })
person2.head()









    Out[160]:







  
    
      
      word_count
      big5_openness_scaled
      big5_conscientiousness_scaled
      big5_extraversion_scaled
      big5_agreeableness_scaled
      big5_neuroticism_scaled
    
  
  
    
      0
      0.050025
      -1.189772
      0.653134
      -1.138976
      0.753642
      -0.437386
    
    
      1
      -0.134604
      -1.848076
      -2.354997
      -1.317098
      1.987142
      -1.946366
    
    
      2
      0.192963
      0.639677
      -1.367108
      -1.535213
      2.270529
      -2.603088
    
    
      3
      0.766700
      -2.110693
      -1.650030
      -0.984659
      3.030965
      -1.269245
    
    
      4
      -0.053208
      0.285630
      -0.633140
      -1.310452
      1.115322
      -0.682455



In [161]:

    
# Make the correlation matrix.
correlation_matrix = person2.corr()

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn.
sns.heatmap(correlation_matrix, vmax=.8, square=True)
plt.show()



In [162]:

    
#Eigenvectores & Eigenvalues

eig_vals, eig_vecs = np.linalg.eig(correlation_matrix)

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_vals)):
    eigvecs = eig_vecs[:, i].reshape(1, len(person2.columns)).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvecs))
    print('Eigenvalue {}: {}'.format(i + 1, eig_vals[i]))
    print(40 * '-')


sklearn_pca = PCA(n_components=len(person2.columns))
Y_sklearn = sklearn_pca.fit_transform(correlation_matrix)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)









    



Eigenvector 1: 
[[-0.07910804]
 [ 0.42973135]
 [ 0.43986724]
 [ 0.37821307]
 [-0.37474999]
 [ 0.57628727]]
Eigenvalue 1: 2.3277045474505345
----------------------------------------
Eigenvector 2: 
[[ 0.10681784]
 [-0.01398469]
 [-0.54387073]
 [-0.01824947]
 [ 0.41424768]
 [ 0.72157108]]
Eigenvalue 2: 0.21289857044447305
----------------------------------------
Eigenvector 3: 
[[-0.2530512 ]
 [-0.83910785]
 [ 0.05236615]
 [ 0.41463868]
 [-0.16966073]
 [ 0.16855532]]
Eigenvalue 3: 0.665670550115972
----------------------------------------
Eigenvector 4: 
[[ 0.94401633]
 [-0.22181736]
 [ 0.22631285]
 [ 0.05766409]
 [-0.04607966]
 [ 0.05444487]]
Eigenvalue 4: 1.0273766555370096
----------------------------------------
Eigenvector 5: 
[[-0.13793165]
 [-0.08300486]
 [ 0.64247332]
 [-0.05973448]
 [ 0.74305045]
 [ 0.07497273]]
Eigenvalue 5: 0.961292341936451
----------------------------------------
Eigenvector 6: 
[[-0.09002774]
 [-0.23439425]
 [ 0.20973733]
 [-0.82328807]
 [-0.32392783]
 [ 0.33201222]]
Eigenvalue 6: 0.8050573345155562
----------------------------------------
The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [  6.28201539e-01   1.59072557e-01   1.17277504e-01   7.66064020e-02
   1.88419981e-02   2.57603426e-34]



In [163]:

    
#From the Scree plot.

plt.plot(eig_vals)
plt.show()



In [164]:

    
y = person1['folder']
X = person2[['word_count','big5_openness_scaled','big5_conscientiousness_scaled','big5_extraversion_scaled','big5_agreeableness_scaled',
             'big5_neuroticism_scaled']]



In [165]:

    
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)



In [166]:

    
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)



In [167]:

    
print ("Multinomial Logistic regression Train Accuracy :: ", metrics.accuracy_score(y_train, mul_lr.predict(X_train)))
print ("Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(y_test, mul_lr.predict(X_test)))









    



Multinomial Logistic regression Train Accuracy ::  0.53488372093
Multinomial Logistic regression Test Accuracy ::  0.0977011494253



In [168]:

    
y = pd.factorize(y_train)[0]









    Out[168]:





array([ 0,  1,  2,  1,  0,  3,  4,  1,  4,  5,  3,  6,  7,  3,  7,  6,  4,
        8,  7,  7,  5,  3,  7,  8,  3,  8,  0,  1,  9, 10,  6,  7,  6,  0,
        5, 11,  6,  1,  7,  1,  2,  0,  3], dtype=int64)



In [169]:

    
clf = RandomForestClassifier(n_jobs=11, random_state=0)

clf.fit(X_train, y_train)









    Out[169]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=11,
            oob_score=False, random_state=0, verbose=0, warm_start=False)



In [172]:









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-172-5cf38150a878> in <module>()
      1 # Apply the Classifier we trained to the test data (which, remember, it has never seen before)
----> 2 clf.predict(X_test).prob()

AttributeError: 'numpy.ndarray' object has no attribute 'prob'



In [ ]:



In [ ]:

	folder	file	word_count	processed_language	big5_openness_raw_score	big5_openness__facet_adventurousness_raw_score	big5_openness__facet_artistic_interests_raw_score	big5_openness__facet_emotionality_raw_score	big5_openness__facet_imagination_raw_score	big5_openness__facet_intellect_raw_score	...	need_love_raw_score	need_practicality_raw_score	need_self_expression_raw_score	need_stability_raw_score	need_structure_raw_score	value_conservation_raw_score	value_openness_to_change_raw_score	value_hedonism_raw_score	value_self_enhancement_raw_score	value_self_transcendence_raw_score
0	BCDM	BCDM Essay #1.docx	1037.0	en	0.784474	0.552340	0.723263	0.688233	0.744498	0.731954	...	0.722490	0.688515	0.621080	0.732718	0.730330	0.668267	0.796639	0.632937	0.674912	0.834175
1	BCDM	BCDM Essay #10.docx	944.0	en	0.771121	0.545828	0.779554	0.739122	0.784990	0.769389	...	0.810111	0.683229	0.637034	0.732972	0.701467	0.622341	0.799327	0.661351	0.679099	0.849262
2	BCDM	BCDM Essay #11.docx	1109.0	en	0.821584	0.526137	0.769487	0.762700	0.805796	0.766390	...	0.802418	0.696390	0.683699	0.741340	0.702538	0.608368	0.819925	0.644431	0.634194	0.849675
3	BCDM	BCDM Essay #12.docx	1398.0	en	0.765794	0.448368	0.683062	0.729064	0.732648	0.675948	...	0.812411	0.674062	0.632017	0.762876	0.698028	0.643144	0.742352	0.674282	0.632661	0.842662
4	BCDM	BCDM Essay #13.docx	985.0	en	0.814402	0.545682	0.742526	0.688421	0.772085	0.770653	...	0.721496	0.696977	0.645380	0.684512	0.699831	0.569484	0.817540	0.647659	0.648155	0.836181

	folder	file	word_count	processed_language	big5_openness_raw_score	big5_openness__facet_adventurousness_raw_score	big5_openness__facet_artistic_interests_raw_score	big5_openness__facet_emotionality_raw_score	big5_openness__facet_imagination_raw_score	big5_openness__facet_intellect_raw_score	...	need_love_raw_score	need_practicality_raw_score	need_self_expression_raw_score	need_stability_raw_score	need_structure_raw_score	value_conservation_raw_score	value_openness_to_change_raw_score	value_hedonism_raw_score	value_self_enhancement_raw_score	value_self_transcendence_raw_score
212	TDHR	TDHR Essay #5.docx	1164.0	en	0.784142	0.571794	0.654346	0.621485	0.723117	0.737616	...	0.743371	0.713913	0.616218	0.736226	0.749601	0.617243	0.803430	0.626427	0.674647	0.833964
213	TDHR	TDHR Essay #6.docx	1841.0	en	0.797916	0.531801	0.740805	0.702944	0.731774	0.763092	...	0.722096	0.669997	0.601237	0.736552	0.726431	0.631269	0.766601	0.587490	0.648672	0.839953
214	TDHR	TDHR Essay #7.pdf	599.0	en	0.801086	0.562731	0.614591	0.556978	0.674139	0.688987	...	0.717358	0.720633	0.607210	0.713225	0.711345	0.583818	0.794607	0.615938	0.681913	0.802872
215	TDHR	TDHR Essay #8.docx	1083.0	en	0.828846	0.555455	0.763287	0.690920	0.803990	0.773501	...	0.767748	0.704486	0.669404	0.693312	0.692027	0.594143	0.821815	0.626897	0.662800	0.837475
216	TDHR	TDHR Essay #9.docx	518.0	en	0.860767	0.520779	0.808049	0.638710	0.830613	0.768699	...	0.645866	0.674787	0.590121	0.680510	0.692115	0.567373	0.792367	0.587199	0.635510	0.807051

	word_count	big5_openness_raw_score	big5_conscientiousness_raw_score	big5_extraversion_raw_score	big5_agreeableness_raw_score	big5_neuroticism_raw_score
0	0.050025	-1.189772	0.653134	-1.138976	0.753642	-0.437386
1	-0.134604	-1.848076	-2.354997	-1.317098	1.987142	-1.946366
2	0.192963	0.639677	-1.367108	-1.535213	2.270529	-2.603088
3	0.766700	-2.110693	-1.650030	-0.984659	3.030965	-1.269245
4	-0.053208	0.285630	-0.633140	-1.310452	1.115322	-0.682455