Setup



In [1]:

    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline









    



/Users/pmlandwehr/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))



In [2]:

    
def get_mbt_corrs(mbts):
    """
    Correlate the presence of each individual MBT letter across a set of types.

    Every entry of ``mbts`` is turned into eight boolean flags, one per letter,
    each True when that letter occurs anywhere in the entry. The flag columns
    are then correlated against one another.

    :param list mbts: an iterable of MBT type strings (e.g. ``'INTJ'``)
    :returns pandas.DataFrame: Pearson correlation of individual letter
        co-occurrences, with rows and columns ordered I, E, S, N, F, T, J, P.
        A letter that is present in all entries or in none has no variance,
        so its row and column are NaN.
    """
    indiv_mbts = ['I', 'E', 'S', 'N', 'F', 'T', 'J', 'P']
    # Plain substring membership; equivalent to the old ``x.find(val) > -1``.
    mbt_dict = {val: [val in x for x in mbts] for val in indiv_mbts}
    # ``.ix`` was removed in pandas 1.0. The labels are strings, so ``.loc``
    # selects exactly the same rows and columns.
    return pd.DataFrame(mbt_dict).corr().loc[indiv_mbts, indiv_mbts]

Hard-coded US type frequencies (percent of population), taken from the Myers-Briggs site.



In [3]:

    
# Reference percentages for the US, keyed by granularity.
us_types = dict(
    # (full type, % US) pairs, most common first; '?' carries a zero share.
    mbt=[
        ('ISFJ', 13.8),
        ('ESFJ', 12.3),
        ('ISTJ', 11.6),
        ('ISFP', 8.8),
        ('ESTJ', 8.7),
        ('ESFP', 8.5),
        ('ENFP', 8.1),
        ('ISTP', 5.4),
        ('INFP', 4.4),
        ('ESTP', 4.3),
        ('INTP', 3.3),
        ('ENTP', 3.2),
        ('ENFJ', 2.5),
        ('INTJ', 2.1),
        ('ENTJ', 1.8),
        ('INFJ', 1.5),
        ('?', 0.0),
    ],
    # (single letter, % US) pairs, one opposing pair of letters per line.
    indiv=[
        ('I', 50.7), ('E', 49.3),
        ('S', 73.3), ('N', 26.7),
        ('F', 59.8), ('T', 40.2),
        ('J', 54.1), ('P', 45.9),
    ],
)

Import and split data.



In [4]:

    
df = pd.read_csv('data/MeFites & Myers-Briggs Types.csv')
df.columns = ['timestamp', 'MBT']

# Denominator for the "% MeFi" figures: the number of submitters (rows),
# NOT the total number of submitted types.
submission_count = len(df)


def _normalize_submission(raw):
    """Canonicalise one raw survey answer into ';'-separated type codes."""
    # The free-text "always something different" answer collapses to '?'.
    cleaned = raw.replace('“It\'s always something different"', '?')
    cleaned = cleaned.replace(' ', '')
    codes = []
    for code in cleaned.split(';'):
        # Codes longer than four characters lose their second character.
        # NOTE(review): presumably this strips a separator present in some
        # raw answers -- confirm against the CSV.
        if len(code) > 4:
            code = code[0] + code[2:]
        codes.append(code)
    return ';'.join(codes)


df['MBT'] = df['MBT'].apply(_normalize_submission)

# all_vals: _all_ of the individual types
# (multi-entry submissions split)
all_vals = pd.Series(
    np.concatenate([entry.split(';') for entry in df['MBT']]))

Making Pictures

Bar Plots for Specific MBTs, MeFi vs. US



In [5]:

    
# Tally how often each full type was submitted.
a = all_vals.value_counts().to_frame().reset_index()
a.columns = ['MBT', 'Count']
# Share of submitters naming each type (the denominator is submitters,
# not a['Count'].sum()).
a['% MeFi'] = a['Count'] * 100. / submission_count

# Attach the US share of each type; the default inner join keeps only the
# types present on both sides.
us_mbt = pd.DataFrame.from_records(us_types['mbt'], columns=['MBT', '% US'])
a = a.merge(us_mbt, on=['MBT'])

# Long format for plotting: one row per (type, domain) pair.
b = pd.melt(a, id_vars=['MBT'], value_vars=['% US', '% MeFi'],
            var_name='Domain', value_name='Percentage')
# Drop the leading '% ' so the domains read 'US' / 'MeFi'.
b['Domain'] = b['Domain'].str[2:]



In [6]:

    
sns.set_context('talk', font_scale=0.75)
# Bars are laid out in row order. Sorting descending on Domain puts the 'US'
# rows first, so the types run from most to least common in the US.
plot_data = b.sort_values(['Domain', 'Percentage', 'MBT'], ascending=False)
# seaborn 0.9 renamed `factorplot` to `catplot` and its `size` argument to
# `height`; the old name has since been removed from seaborn.
sns.catplot(x='MBT', y='Percentage', hue='Domain', data=plot_data,
            kind='bar', height=8)









    



/Users/pmlandwehr/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))






    Out[6]:





<seaborn.axisgrid.FacetGrid at 0x1104ddb38>



In [7]:

    
# Display the per-type table with the columns in reading order.
# ``.ix`` was removed in pandas 1.0; ``.loc`` performs the same label selection.
a.loc[:, ['MBT', '% US', '% MeFi', 'Count']]

Bar Plots for Individual MBT Components, MeFi vs. US



In [8]:

    
# all_vals_two: a list of _all_ individual letter categories.
# Flattened in plain Python rather than with np.concatenate: a '?'-only
# submission contributes no letters, and an empty list mixed with string
# lists forces a string/float dtype promotion that recent NumPy rejects.
all_vals_two = pd.Series(
    [letter
     for entry in df['MBT']
     for letter in entry.replace(';', '').replace('?', '')])

c = pd.DataFrame(all_vals_two.value_counts()).reset_index()
c.columns = ['MBT', 'Count']
# (letters / 4) is the number of concrete types submitted, assuming every
# cleaned type code is exactly four letters long.
c['% MeFi'] = 100. * c['Count'] / (all_vals_two.shape[0] / 4)

# Attach the US share of each letter (inner join on the letter).
c = c.merge(pd.DataFrame.from_records(
    us_types['indiv'], columns=['MBT', '% US']), on=['MBT'])

# Long format for plotting: one row per (letter, domain) pair.
d = pd.melt(c, id_vars=['MBT'], value_vars=['% US', '% MeFi'])
d.columns = ['MBT', 'Domain', 'Percentage']
# Drop the leading '% ' so the domains read 'US' / 'MeFi'.
d['Domain'] = d['Domain'].apply(lambda x: x[2:])



In [9]:

    
sns.set_context('talk', font_scale=1.)
f, axes = plt.subplots(2, 2, figsize=(8, 8), sharey=True)
# One panel per opposing pair of letters, filled row by row.
letter_pairs = [('I', 'E'), ('S', 'N'), ('F', 'T'), ('J', 'P')]
for i, (ax, order) in enumerate(zip(axes.flat, letter_pairs)):
    # Use the axes-level barplot. factorplot is a figure-level function that
    # builds its own FacetGrid figure and is not meant to draw into an
    # existing ``ax``; it has also been removed from current seaborn.
    sns.barplot(x='MBT', y='Percentage', hue='Domain',
                data=d, order=list(order), ax=ax)
    # Despine the panel itself (previously this acted on the FacetGrid that
    # factorplot returned, not on these axes).
    sns.despine(ax=ax, left=True)
    ax.set(ylim=(0, 90.))
    ax.set_xlabel('')
    # Only the left-hand column (panels 0 and 2) carries a y label.
    ax.set_ylabel('Percentage' if i % 2 == 0 else '')
    # Keep a single legend, on the top-right panel (index 1).
    legend = ax.get_legend()
    if i != 1 and legend is not None:
        legend.remove()









    



/Users/pmlandwehr/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))



In [10]:

    
# Show the letters in a fixed I, E, S, N, P, J, T, F order instead of by
# frequency. The order is looked up by letter rather than by hard-coded row
# labels, which only matched while the frequency ranking stayed the same.
# ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
letter_rank = c['MBT'].map(
    {letter: pos for pos, letter in enumerate('IESNPJTF')})
c.loc[letter_rank.sort_values().index, ['MBT', '% US', '% MeFi', 'Count']]

Correlation plot of individual MBT letters across MeFi submissions



In [11]:

    
# all_vals_three: all specific values that aren't "?"
# Boolean-mask selection via ``.loc``; ``.ix`` was removed in pandas 1.0.
all_vals_three = all_vals.loc[all_vals != '?']

# Letter-by-letter co-occurrence correlations for the MeFi submissions.
corrs = get_mbt_corrs(all_vals_three)



In [12]:

    
# Annotated heatmap of the MeFi letter correlations; the colour scale is
# clamped to the range [-0.4, 0.4].
sns.heatmap(
    corrs,
    annot=True,
    vmin=-0.4,
    vmax=0.4,
)









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x1104cfda0>

Correlation plot of individual MBT letters across the US type frequencies



In [13]:

    
# Build a synthetic US "sample": each type is repeated once per 0.1
# percentage point of its share, so the same correlation routine can be
# applied to the US figures.
# np.repeat needs integer counts. `pct * 10` is a float (and not always an
# exact one), so it is rounded to the nearest integer first.
us_labels = [mbt for mbt, pct in us_types['mbt']]
us_counts = [int(round(pct * 10)) for mbt, pct in us_types['mbt']]
all_vals_four = np.repeat(us_labels, us_counts)
corrs_two = get_mbt_corrs(all_vals_four)



In [14]:

    
# Annotated heatmap of the US letter correlations; the colour scale is
# clamped to the range [-0.4, 0.4].
sns.heatmap(
    corrs_two,
    annot=True,
    vmin=-0.4,
    vmax=0.4,
)









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x1116791d0>

	MBT	% US	% MeFi	Count
1	I	50.7	79.254457	489
6	E	49.3	20.745543	128
7	S	73.3	13.290113	82
0	N	26.7	86.709887	535
4	P	45.9	46.191248	285
3	J	54.1	53.808752	332
2	T	40.2	56.077796	346
5	F	59.8	43.922204	271

	MBT	% US	% MeFi	Count
0	INTJ	2.1	26.595745	150
1	INTP	3.3	18.971631	107
2	INFP	4.4	16.489362	93
3	INFJ	1.5	13.297872	75
4	ENFP	8.1	6.560284	37
5	?	0.0	6.028369	34
6	ENTP	3.2	4.609929	26
7	ISFJ	13.8	4.609929	26
8	ENFJ	2.5	4.255319	24
9	ENTJ	1.8	4.078014	23
10	ISTJ	11.6	3.723404	21
11	ISTP	5.4	1.950355	11
12	ESFJ	12.3	1.241135	7
13	ESTJ	8.7	1.063830	6
14	ISFP	8.8	1.063830	6
15	ESFP	8.5	0.531915	3
16	ESTP	4.3	0.354610	2