Working with data 2017. Class 8

Contact

Javier Garcia-Bernardo garcia@uva.nl

1. Clustering

2. Data imputation

3. Dimensionality reduction


In [2]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell

#Print the plots in this screen
%matplotlib inline 

#Be able to plot images saved in the hard drive
from IPython.display import Image 

#Make the notebook wider
from IPython.core.display import display, HTML 
display(HTML("<style>.container { width:90% !important; }</style>"))

import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
import scipy.stats

import statsmodels.formula.api as smf


1. Clustering


In [34]:
#Som elibraries
from sklearn import preprocessing
from sklearn.cluster import DBSCAN, KMeans

In [58]:
#Read teh data, dropna, get sample
df = pd.read_csv("data/big3_position.csv",sep="\t").dropna()
df["Revenue"] = np.log10(df["Revenue"])
df["Assets"] = np.log10(df["Assets"])
df["Employees"] = np.log10(df["Employees"])
df["MarketCap"] = np.log10(df["MarketCap"])
df = df.replace([np.inf,-np.inf],np.nan).dropna().sample(300)
df.head(2)


/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: RuntimeWarning: divide by zero encountered in log10
  from ipykernel import kernelapp as app
/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: RuntimeWarning: divide by zero encountered in log10
Out[58]:
Company_name Company_ID Big3Share Position Revenue Assets Employees MarketCap Exchange TypeEnt
3130 APPLIED OPTOELECTRONICS, INC. US760533927 9.76 2 5.278532 5.436918 3.400192 5.240180 NASDAQ National Market Industrial company
755 MONOTYPE IMAGING HOLDINGS INC. US203289482 19.78 1 5.284248 5.593050 2.693727 5.982111 NASDAQ National Market Industrial company

In [60]:
#Scale variables to give all of them the same weight
X = df.loc[:,["Revenue","Assets","Employees","MarketCap"]]
X = preprocessing.scale(X)
print(X.sum(0))
print(X.std(0))
X


[ -1.11910481e-13   1.85934601e-13   2.19824159e-14   2.79554158e-13]
[ 1.  1.  1.  1.]
Out[60]:
array([[-0.36876713, -0.68220535,  0.26763352, -0.80639115],
       [-0.36325862, -0.49958626, -0.50401999,  0.04420174],
       [-0.59109167,  0.53105902, -0.3752977 , -0.48330142],
       ..., 
       [ 0.51807468, -0.21091708,  0.77356348,  0.14753705],
       [ 1.07606207,  1.14466777,  0.6561359 ,  1.11508633],
       [ 0.85340652,  0.21902024,  1.55356623,  0.58284111]])

1a. Clustering with K-means

  • k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster. This results in a partitioning of the data space into Voronoi cells.
  • Other methods: http://scikit-learn.org/stable/modules/clustering.html

In [69]:
#Get labels of each row and add a new column with the labels
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
labels = kmeans.labels_
df["kmeans_labels"] = labels
sns.lmplot(x="MarketCap",y="Assets",hue="kmeans_labels",fit_reg=False,data=df)


Out[69]:
<seaborn.axisgrid.FacetGrid at 0x7f16052a84a8>

1b. Clustering with DBSCAN

  • The DBSCAN algorithm views clusters as areas of high density separated by areas of low density. Due to this rather generic view, clusters found by DBSCAN can be any shape, as oppos

In [66]:
#Get labels of each row and add a new column with the labels
db = DBSCAN(eps=1, min_samples=10).fit(X)
labels = db.labels_
df["dbscan_labels"] = labels
sns.lmplot(x="MarketCap",y="Assets",hue="dbscan_labels",fit_reg=False,data=df)


Out[66]:
<seaborn.axisgrid.FacetGrid at 0x7f1605614240>

In [70]:
Image(url="http://scikit-learn.org/stable/_images/sphx_glr_plot_cluster_comparison_0011.png")


Out[70]:

1c. Hierarchical clustering

  • Keeps aggreagating from a point

In [78]:
import scipy
import pylab
import scipy.cluster.hierarchy as sch

# Generate distance matrix based on the difference between rows
D = np.zeros([4,4])
for i in range(4):
    for j in range(4):
        D[i,j] = np.sum(np.abs(X[:,i]-X[:,j])) #Euclidean distance or mutual information are also common
        
print(D)

#Create the linkage and plot
Y = sch.linkage(D, method='centroid') #many methods, single, complete...
Z1 = sch.dendrogram(Y, orientation='right',labels=["Revenue","Assets","Employees","MarketCap"])


[[   0.          150.3373409   120.10595294  155.10171976]
 [ 150.3373409     0.          193.3547258   140.18218283]
 [ 120.10595294  193.3547258     0.          190.80318619]
 [ 155.10171976  140.18218283  190.80318619    0.        ]]

2. Imputation of missing data (fancy)


In [ ]:
#Required libraries
!conda install tensorflow -y
!pip install fancyimpute
!pip install pydot_ng

In [5]:
import sklearn.preprocessing
import sklearn

In [10]:
#Read the data again but do not 
df = pd.read_csv("data/big3_position.csv",sep="\t")
df["Revenue"] = np.log10(df["Revenue"])
df["Assets"] = np.log10(df["Assets"])
df["Employees"] = np.log10(df["Employees"])
df["MarketCap"] = np.log10(df["MarketCap"])


le = sklearn.preprocessing.LabelEncoder()
labels = le.fit_transform(df["TypeEnt"])
df["TypeEnt_int"] = labels

print(le.classes_)

df = df.replace([np.inf,-np.inf],np.nan).sample(300)
df.head(2)


['Bank' 'Financial company' 'Foundation/Research institute'
 'Industrial company' 'Insurance company' 'Venture capital']
/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: RuntimeWarning: divide by zero encountered in log10
  app.launch_new_instance()
/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: RuntimeWarning: invalid value encountered in log10
  app.launch_new_instance()
/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:6: RuntimeWarning: divide by zero encountered in log10
Out[10]:
Company_name Company_ID Big3Share Position Revenue Assets Employees MarketCap Exchange TypeEnt TypeEnt_int
1142 MAGNEGAS CORP US260250418 0.18 8 3.385785 4.249467 NaN 4.575361 NASDAQ National Market Industrial company 3
878 DS HEALTHCARE GROUP INC US208380461 1.37 5 4.243038 NaN 1.544068 4.179293 NASDAQ National Market Industrial company 3

In [11]:
X = df.loc[:,["Revenue","Assets","Employees","MarketCap","TypeEnt_int"]].values
X


Out[11]:
array([[ 3.38578496,  4.24946741,         nan,  4.57536107,  3.        ],
       [ 4.24303805,         nan,  1.54406804,  4.17929321,  3.        ],
       [ 5.94375215,  7.28090249,  3.58319877,  6.45869238,  0.        ],
       ..., 
       [ 3.40823997,  5.33826321,  1.94939001,  5.79339851,  3.        ],
       [ 6.80444892,  6.54323141,  4.64200921,  6.28966007,  3.        ],
       [ 4.59920601,  4.39970833,  2.09691001,  3.86946641,  3.        ]])

In [ ]:
df.describe()

In [12]:
from fancyimpute import KNN

# X is the complete data matrix
# X_incomplete has the same values as X except a subset have been replace with NaN

# Use 10 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=10).complete(X)
df.loc[:,cols] = X_filled_knn

df.describe()


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-12-f364ba1a43b7> in <module>()
----> 1 from fancyimpute import KNN
      2 
      3 # X is the complete data matrix
      4 # X_incomplete has the same values as X except a subset have been replace with NaN
      5 

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/fancyimpute/__init__.py in <module>()
      2 
      3 from .solver import Solver
----> 4 from .nuclear_norm_minimization import NuclearNormMinimization
      5 from .bayesian_ridge_regression import BayesianRidgeRegression
      6 from .mice import MICE

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/fancyimpute/nuclear_norm_minimization.py in <module>()
     13 from __future__ import absolute_import, print_function, division
     14 
---> 15 import cvxpy
     16 
     17 from .solver import Solver

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/cvxpy/__init__.py in <module>()
     19 
     20 __version__ = "0.4.8"
---> 21 from cvxpy.atoms import *
     22 from cvxpy.expressions.variables import (Variable, Semidef, Symmetric, Bool,
     23                                          Int, NonNegative)

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/cvxpy/atoms/__init__.py in <module>()
     18 """
     19 
---> 20 from cvxpy.atoms.affine_prod import affine_prod
     21 from cvxpy.atoms.geo_mean import geo_mean
     22 from cvxpy.atoms.harmonic_mean import harmonic_mean

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/cvxpy/atoms/affine_prod.py in <module>()
     18 """
     19 
---> 20 from cvxpy.atoms.atom import Atom
     21 import cvxpy.utilities as u
     22 import numpy as np

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/cvxpy/atoms/atom.py in <module>()
     19 
     20 
---> 21 from .. import utilities as u
     22 from .. import interface as intf
     23 from ..expressions.constants import Constant, CallbackParam

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/cvxpy/utilities/__init__.py in <module>()
     22 from . import shape
     23 from . import sign
---> 24 from .quadratic import QuadCoeffExtractor

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/cvxpy/utilities/quadratic.py in <module>()
     22 import numpy as np
     23 import scipy.sparse as sp
---> 24 import canonInterface
     25 import cvxpy.lin_ops.lin_utils as lu
     26 from numpy import linalg as LA

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/canonInterface.py in <module>()
     14 #    along with CVXcanon.  If not, see <http:#www.gnu.org/licenses/>.
     15 
---> 16 import CVXcanon
     17 import numpy as np
     18 import scipy.sparse

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/CVXcanon.py in <module>()
     26                 fp.close()
     27             return _mod
---> 28     _CVXcanon = swig_import_helper()
     29     del swig_import_helper
     30 else:

/home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/CVXcanon.py in swig_import_helper()
     22         if fp is not None:
     23             try:
---> 24                 _mod = imp.load_module('_CVXcanon', fp, pathname, description)
     25             finally:
     26                 fp.close()

/home/javiergb/Programs/anaconda3/lib/python3.5/imp.py in load_module(name, file, filename, details)
    240                 return load_dynamic(name, filename, opened_file)
    241         else:
--> 242             return load_dynamic(name, filename, file)
    243     elif type_ == PKG_DIRECTORY:
    244         return load_package(name, filename)

/home/javiergb/Programs/anaconda3/lib/python3.5/imp.py in load_dynamic(name, path, file)
    340         spec = importlib.machinery.ModuleSpec(
    341             name=name, loader=loader, origin=path)
--> 342         return _load(spec)
    343 
    344 else:

ImportError: /home/javiergb/Programs/anaconda3/lib/python3.5/site-packages/_CVXcanon.cpython-35m-x86_64-linux-gnu.so: undefined symbol: _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED1Ev