In [1]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
In [2]:
data = pd.read_csv('data.csv')
print("data has {} measurements for {} variables".format(*data.shape))
print("\n{}\n...".format(data.head(8)))
In [3]:
countries = ['Canada', 'USA', 'England', 'Italy', 'Switzerland']
languages = ['English', 'French', 'Spanish', 'German', 'Italian']
F = pd.crosstab(data.country, data.language, margins=True)
F.index = [*countries, 'col totals']
F.columns = [*languages, 'row totals']
print("{}".format(F))
In [4]:
F = pd.crosstab(data.country, data.language, margins=False)
F.index = countries
F.columns = languages
chisq_stat, p_value, dof, E = scipy.stats.chi2_contingency(F)
print('Results of Chi-squared test of independence\n')
print(' Chi-squared test statistic: {:02.2F}'.format(chisq_stat))
print(' degrees of freedom: {}'.format(dof))
print(' p-value: {:02.6F}'.format(p_value))
In [5]:
print('matrix of observations F:\n\n{}'.format(F))
In [6]:
P = F / F.sum().sum()
print('correspondence matrix P:\n\n{}'.format(P))
In [7]:
row_centroid = P.sum(axis=1)
print('row centroid (marginal frequency distribution over countries):\n\n{}'.format(row_centroid))
In [8]:
col_centroid = P.sum(axis=0)
print('column centroid (marginal frequency distribution over languages):\n\n{}'.format(col_centroid))
In [9]:
row_totals = F.sum(axis=1)
print("row totals (marginal frequency distribution over the countries):\n\n{}".format(row_totals))
In [10]:
col_totals = F.sum(axis=0)
print("column totals (marginal frequency distribution over the languages):\n\n{}".format(col_totals))
In [11]:
data = []
for _,row in P.iterrows():
acc = []
cntry_i = row.name
p_iplus = row_centroid.ix[cntry_i]
for cntry_k in P.index:
p_kplus = row_centroid.ix[cntry_k]
chisqd = np.sqrt(np.sum(np.square(row/p_iplus - P.ix[cntry_k]/p_kplus) / col_centroid))
acc.append(chisqd)
data.append(acc)
row2row_chisqd = pd.DataFrame(data, index=P.index, columns=P.index)
print("row-to-row Chi-squared distance table:\n\n{}".format(row2row_chisqd))
In [12]:
PT = P.T
data = []
for _,row in PT.iterrows():
acc = []
lang_j = row.name
p_plusj = col_centroid.ix[lang_j]
for lang_k in PT.index:
p_plusk = col_centroid.ix[lang_k]
chisqd = np.sqrt(np.sum(np.square(row/p_plusj - PT.ix[lang_k]/p_plusk) / row_centroid))
acc.append(chisqd)
data.append(acc)
col2col_chisqd = pd.DataFrame(data, index=PT.index, columns=PT.index)
print("column-to-column Chi-squared distance table:\n\n{}".format(col2col_chisqd))
In [13]:
Mu_ij = row_centroid.values.reshape((P.index.size,1)) * col_centroid.values.reshape((1,P.columns.size))
Lambda = (P - Mu_ij) / np.sqrt(Mu_ij)
print('inertia Lambda:\n\n{}'.format(Lambda))
In [14]:
U,S,V = np.linalg.svd(Lambda)
num_sv = np.arange(1,len(S)+1)
cum_var_explained = [np.sum(np.square(S[0:n])) / np.sum(np.square(S)) for n in num_sv]
print('Using first singular value, {:0.3F}% variance explained'.format(cum_var_explained[0]))
print('Using first 2 singular values, {:0.3F}% variance explained'.format(cum_var_explained[1]))
print('Using first 3 singular values, {:0.3F}% variance explained'.format(cum_var_explained[2]))
In [16]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(num_sv, cum_var_explained, color='#2171b5', label='variance explained')
plt.scatter(num_sv, S, marker='s', color='#fc4e2a', label='singular values')
plt.legend(loc='lower left', scatterpoints=1)
ax.set_xticks(num_sv)
ax.set_xlim([0.8, 5.1])
ax.set_ylim([0.0, 1.1])
ax.set_xlabel('Number of singular values used')
ax.set_title('Singular values & cumulative variance explained',
fontsize=16,
y=1.03)
plt.grid()
In [17]:
cntry_x = U[:,0]
cntry_y = U[:,1]
cntry_z = U[:,2]
lang_x = V.T[:,0]
lang_y = V.T[:,1]
lang_z = V.T[:,2]
In [18]:
import pylab
from mpl_toolkits.mplot3d import Axes3D, proj3d
fig = pylab.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(cntry_x, cntry_y, cntry_z, marker='s', s=50, c='#2171b5')
cntry_labels = []
for i,(x,y,z) in enumerate(zip(cntry_x,cntry_y,cntry_z)):
x2, y2, _ = proj3d.proj_transform(x,y,z, ax.get_proj())
label = pylab.annotate(Lambda.index[i],
xy=(x2,y2),
xytext=(-2,2),
textcoords='offset points',
ha='right',
va='bottom',
color='#2171b5')
cntry_labels.append(label)
ax.scatter(lang_x, lang_y, lang_z, marker='o', s=50, c='#fc4e2a')
lang_labels = []
for i,(x,y,z) in enumerate(zip(lang_x,lang_y,lang_z)):
x2, y2, _ = proj3d.proj_transform(x,y,z, ax.get_proj())
label = pylab.annotate(Lambda.columns[i],
xy=(x2,y2),
xytext=(-2,2),
textcoords='offset points',
ha='right',
va='bottom',
color='#fc4e2a')
lang_labels.append(label)
def update_position(e):
for i,(x,y,z) in enumerate(zip(cntry_x,cntry_y,cntry_z)):
x2, y2, _ = proj3d.proj_transform(x,y,z, ax.get_proj())
cntry_labels[i].xy = x2, y2
for i,(x,y,z) in enumerate(zip(lang_x,lang_y,lang_z)):
x2, y2, _ = proj3d.proj_transform(x,y,z, ax.get_proj())
lang_labels[i].xy = x2, y2
fig.canvas.draw()
fig.canvas.mpl_connect('button_release_event', update_position)
ax.set_xlabel(r'$1^{st}$ singular value')
ax.set_xticks([-0.5, 0.0, 0.5])
ax.set_ylabel(r'$2^{nd}$ singular value')
ax.set_yticks([-0.5, 0.0, 0.4])
ax.set_zlabel(r'$3^{rd}$ singular value')
ax.set_zticks([-0.5, 0.0, 0.5])
ax.set_title('Correspondence Analysis with 3 Singular Values (3D)',
fontsize=16,
y=1.1)
pylab.show()
Next we derive the correspondence matrix $P$ from $F$ with
\begin{align} P &= \left[ p_{ij} \right] \\ &= \left[ \frac{f_{ij}}{n} \right] & \text{where } n = \sum_{i=1}^{I} \sum_{j=1}^{J} f_{ij} \end{align}The $\chi^2$ distances between rows gives us a clue as to how the countries relate to one another in terms of the primary spoken languages.
The $\chi^2$ distance between rows $i$ and $k$ is given by
\begin{align} d_{ik} &= \sqrt{\sum_{j=1}^{J} \frac{(p_{ij}/p_{i+} - p_{kj}/p_{k+})^2}{p_{+j}} } \end{align}We can see in this row-to-row $\chi^2$ distance table that for the Anglophonic countries, Canada, USA and England should be clustered near one another, while Italy and Switzerland are both separated from the other countries.
Conversely, the $\chi^2$ distances between columns gives us a clue as to how the languages relate to one another in terms of the countries.
The $\chi^2$ distance between columns $j$ and $k$ is given by
\begin{align} d_{jk} &= \sqrt{\sum_{i=1}^{I} \frac{(p_{ij}/p_{+j} - p_{kj}/p_{+k})^2}{p_{i+}} } \end{align}For the languages, we can see from the column-to-column $\chi^2$ distances that English and Spanish should be closely related, with French somewhere between English and German. Italian, however, should be sitting alone all by itself away from the others.
We start with a matrix of standardized residuals:
\begin{align} \Omega &= \left[ \frac{p_{ij} - \mu_{ij}}{\sqrt{\mu_{ij}}} \right] \end{align}
In [ ]: