In [1]:
%matplotlib inline

In [113]:
import numpy as np
import pandas as pd

In [114]:
data=pd.read_csv("final\OCDEfinal4.csv")

In [115]:
data.head(10)


Out[115]:
country location public_spendingPRY_NTRY2013 public_spendingPRY_TRY2013 public_spendingTRY2013 private_spendingPRY_NTRY2012 private_spendingPRY_NTRY2013 private_spendingPRY_NTRY2014 private_spendingPRY_TRY2012 private_spendingPRY_TRY2013 ... vT021 vT022 vT023 vT024 vT025 vT026 vT027 vT028 valueLOWSRY_AVGAGE valueLOWSRY_WORK
0 Australia AUS 3.4 4.7 1.3 4.0 3.9 NaN 5.6 5.6 ... 5.1 7.0 14.5 78.1 81.3 78.4 38.5 90.0 43.4 16.7
1 Austria AUT 3.2 5.0 1.8 3.1 3.2 NaN 4.9 5.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Belgium BEL 4.3 5.8 1.4 4.3 4.4 NaN 5.7 5.8 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Brazil BRA 4.4 5.5 1.1 4.3 4.3 NaN 5.1 5.2 ... 5.7 12.2 19.8 66.7 94.8 95.1 12.6 87.0 39.2 13.6
4 Canada CAN NaN NaN NaN 3.6 NaN NaN 6.1 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 Switzerland CHE 3.5 4.8 1.3 3.8 NaN NaN 5.1 5.1 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 Chile CHL 2.7 3.8 1.2 3.6 3.4 3.1 6.1 5.8 ... 4.1 10.8 15.3 73.1 91.0 90.2 33.6 94.6 41.3 15.1
7 Colombia COL NaN NaN NaN 0.7 4.3 NaN 2.6 6.6 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 Czech Republic CZE 2.5 3.4 0.9 2.8 2.7 NaN 4.2 4.0 ... 4.5 6.6 8.8 84.0 39.0 51.8 12.2 88.6 44.2 17.7
9 Germany DEU 2.9 4.2 1.3 3.1 3.1 NaN 4.4 4.3 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

10 rows × 279 columns


In [116]:
cols = data.columns

In [117]:
# boolean mask marking every column whose name contains 'GINI'
mask_GINI = cols.str.contains('GINI')

In [118]:
cols_GINI = cols[mask_GINI]

In [119]:
# initialise the new column with NaN for every row (avoids hard-coding the row count)
data['last_GINI'] = np.nan

In [120]:
def last_not_nan(nparray):
    """Return the last non-NaN value of a 1-D array, or NaN if all values are missing."""
    for i in nparray[::-1]:
        if not np.isnan(i):
            return i
    return np.nan

In [121]:
# fill last_GINI with the most recent non-missing GINI value for each country
for idx in data.index:
    data.loc[idx, 'last_GINI'] = last_not_nan(data.loc[idx, cols_GINI].values)
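
The same "most recent non-missing GINI" column can also be built without an explicit Python loop. A minimal sketch, assuming the columns in cols_GINI are ordered from the oldest to the most recent year:

# forward-fill across the GINI columns, then keep the right-most column,
# which now holds the latest non-missing value for each country
data['last_GINI'] = data[cols_GINI].ffill(axis=1).iloc[:, -1]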

In [122]:
data['last_GINI']


Out[122]:
0     0.337
1     0.280
2     0.268
3       NaN
4     0.322
5     0.295
6     0.465
7       NaN
8     0.262
9     0.292
10    0.254
11    0.346
12    0.361
13    0.257
14    0.294
15    0.358
16    0.343
17    0.288
18      NaN
19    0.309
20    0.244
21    0.365
22    0.325
23    0.330
24    0.302
25    0.281
26    0.352
27    0.459
28    0.283
29    0.252
30    0.333
31      NaN
32    0.300
33    0.342
34      NaN
35    0.269
36    0.255
37    0.281
38    0.393
39    0.394
Name: last_GINI, dtype: float64

In [123]:
data.to_csv('OCDE_final_withGINI.csv')
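
Note that to_csv writes the DataFrame index as an extra unnamed column by default; if that column is not wanted in the exported file, one option is:

data.to_csv('OCDE_final_withGINI.csv', index=False)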

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

DB = pd.read_csv("OCDEfinal4_newcols.csv", index_col=0, sep=";")
DB.index = DB['location']
# the TALIS indicators occupy the 31st- to 3rd-from-last columns of the file
# (.iloc replaces the deprecated .ix accessor)
DBTalis = DB.iloc[:, len(DB.columns)-31:len(DB.columns)-2]
DBTalis.head(1)

In [3]:
DBPisa=DB['reading2015']

plt.plot(DBTalis['vTBelieveHelpStudentsThinkCritic'],DB['science2015'], "bo")
plt.show()



In [4]:
plt.plot(DBTalis['vTFB2'],DB['science2015'], "bo")


Out[4]:
[<matplotlib.lines.Line2D at 0x252d96c8cc0>]

In [5]:
fig, ax = plt.subplots()
ax.plot(DBTalis['vTFB2'], DB['science2015'], "bo")
# label each point with its country code
for i, loc in enumerate(DBTalis['vTFB2'].index):
    ax.annotate(loc, (DBTalis['vTFB2'].iloc[i], DB['science2015'].iloc[i]))



In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
import numpy as np

In [7]:
X = DBTalis
X= X.dropna()
X_std = StandardScaler().fit_transform(X)
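
dropna() keeps only the countries with a complete set of TALIS indicators, so it is worth checking which rows actually enter the PCA. A quick sanity check (not part of the original analysis):

print(X.shape)        # (countries kept, number of TALIS indicators)
print(list(X.index))  # country codes that survive the dropna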

In [27]:
sklearn_pca = sklearnPCA(n_components=3)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [28]:
maxDBPisa=DBPisa.max()
minDBPisa=DBPisa.min()
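
maxDBPisa and minDBPisa are presumably meant for scaling the PISA scores into a colour range. A minimal sketch of a PC1 vs PC2 scatter coloured by the scaled reading2015 score, assuming every country kept in X has a reading score and that the rows of Y_sklearn still line up with X.index:

pisa = DBPisa.loc[X.index]                       # align PISA scores with the PCA rows
colours = (pisa - minDBPisa) / (maxDBPisa - minDBPisa)
plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1], c=colours, cmap='viridis')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar(label='reading2015 (scaled)')
plt.show()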

In [65]:
vectors = sklearn_pca.components_
print(vectors)


[[-0.24333792 -0.32558188 -0.23585826 -0.12969598 -0.20148294  0.05177964
  -0.234052    0.15999069 -0.01269383  0.22659865  0.33211095  0.17454673
   0.05107937  0.04654602  0.26627067  0.14917021  0.21626985  0.11013657
   0.11074444  0.11119976  0.03735587  0.01534012  0.27938409  0.24386697
  -0.28281166 -0.04998291 -0.09667796  0.16359195 -0.0848518 ]
 [ 0.13913872 -0.05979151 -0.05382999 -0.23942087  0.2778574  -0.25665812
   0.15347208 -0.17473873  0.0511267  -0.14862022 -0.05798402 -0.25304269
  -0.07835626  0.05507113  0.2005652   0.19896721  0.10924276 -0.13329419
  -0.09617877  0.31157654 -0.11704404  0.07218594  0.28185214  0.14193587
  -0.21938053  0.27883602  0.30756721 -0.14771612  0.20438997]
 [-0.15939229  0.03945046 -0.08857257 -0.07118459 -0.12837888 -0.00330997
  -0.24247278 -0.28604569 -0.2714068  -0.07497999 -0.05546848 -0.25048514
  -0.30657528  0.22643526  0.13355599 -0.32826437 -0.13309404 -0.14783675
  -0.24574931 -0.18252035 -0.04046091  0.03017883 -0.01637873  0.25436311
  -0.17063065 -0.17375682 -0.1681021  -0.28542467 -0.13127933]]

In [79]:
# order the features by decreasing absolute loading on principal component index 1
argsort_v1 = np.argsort(-np.abs(vectors[1, :]))

In [80]:
colnames = np.array([
    'vTPercentFem', 'vTAge', 'vTYExp', 'vTPercentCompleteTrain',
    'vTPercentFemPrincipals', 'vTAgePrincipals', 'vTYExpPrincipals',
    'vTBelieveProfValInSocPrincipals', 'vTSatisfPrincipals', 'vTFI',
    'vTMentor', 'vTMentorToOther', 'vTProfDev', 'vTNeedICT', 'vTNeedSN',
    'vTFB1', 'vTFB2', 'vTEvalSchool', 'vTEvalClass', 'vTtimepW',
    'vTtimePrep', 'vTTime1', 'vTTime2', 'vTTime3', 'vtTimeTeaching',
    'vTBelieveHelpStudentsValueLearning', 'vTBelieveHelpStudentsThinkCritic',
    'vTBelieveProfValInSoc', 'vTSatisf'])

In [81]:
# display the 10 most important features (by absolute loading) of principal component index 1
for i in range(10):
    print("Feature ", i, "of vector 1: ", colnames[argsort_v1[i]], ", importance coeff =", vectors[1, argsort_v1[i]])


Feature  0 of vector 1:  vTtimepW , importance coeff = 0.311576539472
Feature  1 of vector 1:  vTBelieveHelpStudentsThinkCritic , importance coeff = 0.307567210217
Feature  2 of vector 1:  vTTime2 , importance coeff = 0.281852140385
Feature  3 of vector 1:  vTBelieveHelpStudentsValueLearning , importance coeff = 0.278836016558
Feature  4 of vector 1:  vTPercentFemPrincipals , importance coeff = 0.277857395358
Feature  5 of vector 1:  vTAgePrincipals , importance coeff = -0.256658124708
Feature  6 of vector 1:  vTMentorToOther , importance coeff = -0.253042688615
Feature  7 of vector 1:  vTPercentCompleteTrain , importance coeff = -0.239420871761
Feature  8 of vector 1:  vtTimeTeaching , importance coeff = -0.219380532865
Feature  9 of vector 1:  vTSatisf , importance coeff = 0.204389970317

In [83]:
# order the features by decreasing absolute loading on principal component index 0
argsort_v0 = np.argsort(-np.abs(vectors[0, :]))
# display the 10 most important features of component index 0
for i in range(10):
    print("Feature ", i, "of vector 0: ", colnames[argsort_v0[i]], ", importance coeff =", vectors[0, argsort_v0[i]])


Feature  0 of vector 0:  vTMentor , importance coeff = 0.33211095087
Feature  1 of vector 0:  vTAge , importance coeff = -0.325581878297
Feature  2 of vector 0:  vtTimeTeaching , importance coeff = -0.282811662957
Feature  3 of vector 0:  vTTime2 , importance coeff = 0.279384089803
Feature  4 of vector 0:  vTNeedSN , importance coeff = 0.266270673853
Feature  5 of vector 0:  vTTime3 , importance coeff = 0.243866966742
Feature  6 of vector 0:  vTPercentFem , importance coeff = -0.243337919504
Feature  7 of vector 0:  vTYExp , importance coeff = -0.235858256883
Feature  8 of vector 0:  vTYExpPrincipals , importance coeff = -0.234051997076
Feature  9 of vector 0:  vTFI , importance coeff = 0.226598648615
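
The two loops above differ only in the component index; a small helper avoids the duplication. This is just a sketch built on the existing vectors and colnames arrays (top_features is not a name used elsewhere in this notebook):

def top_features(component, k=10):
    """Print the k features with the largest absolute loading on one PCA component."""
    order = np.argsort(-np.abs(vectors[component, :]))
    for rank, j in enumerate(order[:k]):
        print("Feature ", rank, "of vector", component, ": ", colnames[j],
              ", importance coeff =", vectors[component, j])

top_features(0)
top_features(1)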

In [87]:
sklearn_pca.explained_variance_


Out[87]:
array([ 6.21614016,  4.74133369,  3.68113903])

In [89]:
# rough share of the total variance (29 standardised features) explained by the second component
4.741/29


Out[89]:
0.16348275862068964
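
The hand computation above assumes the 29 standardised features carry exactly one unit of variance each; the ratio scikit-learn actually uses is available directly from the fitted estimator:

# per-component share of the total variance
print(sklearn_pca.explained_variance_ratio_)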

In [32]:
sklearn_pca


Out[32]:
PCA(copy=True, n_components=3, whiten=False)

In [33]:
help(sklearn_pca)


Help on PCA in module sklearn.decomposition.pca object:

class PCA(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Principal component analysis (PCA)
 |  
 |  Linear dimensionality reduction using Singular Value Decomposition of the
 |  data and keeping only the most significant singular vectors to project the
 |  data to a lower dimensional space.
 |  
 |  [full scikit-learn 0.17.1 PCA docstring truncated]

In [37]:
import sklearn
print(sklearn.__version__)


0.17.1

In [96]:
X['vTtimepW']


Out[96]:
location
AUS     18.6
BRA     25.4
CHL     26.7
CZE     17.8
DNK     18.9
ESP     18.6
EST     20.9
FIN     20.6
FRA     18.6
ISL     19.0
ISR     18.3
ITA     17.3
JPN     17.7
KOR     18.8
LVA     19.2
MEX     22.7
NLD     16.9
NOR     15.0
NZL     18.0
OAVG    19.2
POL     18.6
PRT     20.8
RUS     23.6
SVK     19.9
SWE     17.6
Name: vTtimepW, dtype: float64
