In [11]:
%pylab inline
import pandas as pd
import mca
In [12]:
# Load the tab-separated data matrix; the first line of the file supplies
# the column names (header=0).
data = pd.read_csv(
    './cookieclassifier_data_matrix.tsv',
    sep='\t',
    header=0,
)
In [13]:
# Rename only the first column to 'category'; every other column keeps the
# name it was loaded with.
data.columns = ['category'] + list(data.columns[1:])
In [14]:
# Store the label column as a pandas categorical, then relabel its categories
# with consecutive integers starting at 1.
# The number of labels is taken from the data instead of being hard-coded as
# [1, 2, 3]: rename_categories raises when the list length differs from the
# number of categories, so the fixed list only worked for exactly three classes.
data['category'] = data['category'].astype('category')
n_categories = len(data['category'].cat.categories)
data['category'] = data['category'].cat.rename_categories(
    list(range(1, n_categories + 1)))
In [15]:
# Everything except the 'category' label column is the input to MCA.
# (A leftover bare `data` expression used to sit at the top of this cell; it
# displayed nothing because it was not the last statement, so it was removed.)
X = data.drop('category', axis=1)

# Two MCA objects over the same matrix: one with the library defaults and one
# with benzecri=False.
# NOTE(review): the `mca_ben` name suggests the default applies the Benzecri
# correction - confirm against the mca package documentation.
mca_ben = mca.MCA(X)
mca_ind = mca.MCA(X, benzecri=False)
In [16]:
# Echo the `mca_ben` object as this cell's result (the exported output is empty).
mca_ben
Out[16]:
In [18]:
# Labels for the three groups of rows in the summary tables. These names are
# reused by later cells.
# NOTE: `cos` shadows the NumPy `cos` that `%pylab` injects into the namespace.
fs = 'Factor score'
cos = 'Squared cosines'
cont = 'Contributions x 1000'

# Row-side summary: one column per entry of X.index, and for each group above
# one row per factor (1 and 2, matching N=2 in the calls below).
table3_rows = pd.MultiIndex.from_product([[fs, cos, cont], range(1, 3)])
table3 = pd.DataFrame(index=table3_rows, columns=X.index)

# Each result is transposed before being written into its two-row group.
# NOTE(review): this presumes every call returns one row per entry of X.index
# and N columns - confirm against the mca package documentation.
table3.loc[fs, :] = mca_ben.fs_r(N=2).T
table3.loc[cos, :] = mca_ben.cos_r(N=2).T
table3.loc[cont, :] = mca_ben.cont_r(N=2).T * 1000
In [19]:
# Show the whole row-side summary table as this cell's result.
table3
Out[19]:
In [28]:
import matplotlib.pyplot as plt

# The 'Factor score' group of table3 holds one row per factor; its columns are
# the entries of X.index, which double as point labels.
points = table3.loc[fs].values
labels = table3.columns.values
# NOTE(review): this palette is defined but not referenced below; the scatter
# uses a fixed 'r' colour.
colors = ['#66c2a5', '#fc8d62', '#8da0cb']

# Scatter of factor 1 against factor 2 with gray reference lines through zero.
fig, ax = plt.subplots()
ax.margins(0.1)
ax.axhline(0, color='gray')
ax.axvline(0, color='gray')
ax.set_xlabel('Factor 1')
ax.set_ylabel('Factor 2')
ax.scatter(*points, s=20, marker='o', c='r', alpha=.5, linewidths=0)

# Only points whose factor-2 value exceeds 3 get a text label, offset slightly
# up and to the right of the marker.
for label, x, y in zip(labels, *points):
    if y > 3:
        ax.annotate(label, xy=(x, y), xytext=(x + .03, y + .03))

plt.show()
In [26]:
# Column-side summary: one column per column of X, and for each of the three
# label groups (fs, cos, cont - defined in an earlier cell) one row per factor
# (1 and 2, matching N=2 in the calls below).
table4_rows = pd.MultiIndex.from_product([[fs, cos, cont], range(1, 3)])
table4 = pd.DataFrame(index=table4_rows, columns=X.columns)

# Each result is transposed before being written into its two-row group.
# NOTE(review): this presumes every call returns one row per column of X and
# N columns - confirm against the mca package documentation.
table4.loc[fs, :] = mca_ben.fs_c(N=2).T
table4.loc[cos, :] = mca_ben.cos_c(N=2).T
table4.loc[cont, :] = mca_ben.cont_c(N=2).T * 1000
In [27]:
# Show the whole column-side summary table as this cell's result.
table4
Out[27]:
In [ ]: