In [2]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.formula.api as smf
In [34]:
#Som elibraries
from sklearn import preprocessing
from sklearn.cluster import DBSCAN, KMeans
In [58]:
#Read teh data, dropna, get sample
df = pd.read_csv("data/big3_position.csv",sep="\t").dropna()
df["Revenue"] = np.log10(df["Revenue"])
df["Assets"] = np.log10(df["Assets"])
df["Employees"] = np.log10(df["Employees"])
df["MarketCap"] = np.log10(df["MarketCap"])
df = df.replace([np.inf,-np.inf],np.nan).dropna().sample(300)
df.head(2)
Out[58]:
In [60]:
#Scale variables to give all of them the same weight
X = df.loc[:,["Revenue","Assets","Employees","MarketCap"]]
X = preprocessing.scale(X)
print(X.sum(0))
print(X.std(0))
X
Out[60]:
In [69]:
#Get labels of each row and add a new column with the labels
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
labels = kmeans.labels_
df["kmeans_labels"] = labels
sns.lmplot(x="MarketCap",y="Assets",hue="kmeans_labels",fit_reg=False,data=df)
Out[69]:
In [66]:
#Get labels of each row and add a new column with the labels
db = DBSCAN(eps=1, min_samples=10).fit(X)
labels = db.labels_
df["dbscan_labels"] = labels
sns.lmplot(x="MarketCap",y="Assets",hue="dbscan_labels",fit_reg=False,data=df)
Out[66]:
In [70]:
Image(url="http://scikit-learn.org/stable/_images/sphx_glr_plot_cluster_comparison_0011.png")
Out[70]:
In [78]:
import scipy
import pylab
import scipy.cluster.hierarchy as sch
# Generate distance matrix based on the difference between rows
D = np.zeros([4,4])
for i in range(4):
for j in range(4):
D[i,j] = np.sum(np.abs(X[:,i]-X[:,j])) #Euclidean distance or mutual information are also common
print(D)
#Create the linkage and plot
Y = sch.linkage(D, method='centroid') #many methods, single, complete...
Z1 = sch.dendrogram(Y, orientation='right',labels=["Revenue","Assets","Employees","MarketCap"])
In [ ]:
#Required libraries
!conda install tensorflow -y
!pip install fancyimpute
!pip install pydot_ng
In [5]:
import sklearn.preprocessing
import sklearn
In [10]:
#Read the data again but do not
df = pd.read_csv("data/big3_position.csv",sep="\t")
df["Revenue"] = np.log10(df["Revenue"])
df["Assets"] = np.log10(df["Assets"])
df["Employees"] = np.log10(df["Employees"])
df["MarketCap"] = np.log10(df["MarketCap"])
le = sklearn.preprocessing.LabelEncoder()
labels = le.fit_transform(df["TypeEnt"])
df["TypeEnt_int"] = labels
print(le.classes_)
df = df.replace([np.inf,-np.inf],np.nan).sample(300)
df.head(2)
Out[10]:
In [11]:
X = df.loc[:,["Revenue","Assets","Employees","MarketCap","TypeEnt_int"]].values
X
Out[11]:
In [ ]:
df.describe()
In [12]:
from fancyimpute import KNN
# X is the complete data matrix
# X_incomplete has the same values as X except a subset have been replace with NaN
# Use 10 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=10).complete(X)
df.loc[:,cols] = X_filled_knn
df.describe()