In [1]:
import os, csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
%matplotlib inline
In [2]:
relativepath = os.path.join('..', 'data', 'bio_logratios.csv')
bio = pd.read_csv(relativepath)
relativepath = os.path.join('..', 'data', 'fiction_logratios.csv')
fic = pd.read_csv(relativepath)
In [3]:
bio.head()
Out[3]:
In [4]:
fic.head()
Out[4]:
In [5]:
bio_list = list(bio.columns.values)
fic_list = list(fic.columns.values)
shared_words = set(fic_list).intersection(set(bio_list))
shared_list = list(shared_words)
shared_list.remove('thedate')
fic_subset = fic[shared_list]
bio_subset = bio[shared_list]
print(fic_subset.shape)
print(bio_subset.shape)
In [6]:
word = 'hair'
r, p = pearsonr(fic_subset[word], bio_subset[word])
print(r, p)
In [28]:
def fisher_average(listofrvals):
    # Fisher z-transformation: convert each r to z with arctanh,
    # average the z values, then transform back with tanh
    zscores = np.arctanh(listofrvals)
    avg_z = sum(zscores) / len(zscores)
    transformed_r = np.tanh(avg_z)
    # also return the naive arithmetic mean of the r values, for comparison
    naive_r = sum(listofrvals) / len(listofrvals)
    return transformed_r, naive_r
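As a quick illustration of why the transform matters, here's fisher_average on two made-up r values (hypothetical numbers, not from the data): because arctanh stretches the scale near +/-1, the z-averaged result sits above the naive mean.
fisher_average([0.1, 0.9])
# roughly (0.66, 0.5): the Fisher-averaged value is pulled up by the strong correlation,
# while the naive mean treats both values equally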
In [30]:
# Let's calculate correlations for each word across time
r_list = []
significant = 0
for w in shared_list:
r, p = pearsonr(fic_subset[w], bio_subset[w])
r_list.append(r)
if (p / 2) < 0.05 and r > 0:
significant += 1
# The scipy function reports a two-tailed p-value
# But we are clearly expecting positive correlations
# and wouldn't consider negative ones as significant.
# So I'm dividing p by 2 to convert to a one-tailed test.
print(significant)
print(fisher_average(r_list))
Let's try sliding the time series by as much as 12 years either way. Of course, when we shift the time series relative to each other, the area of overlap in the middle will be smaller than the original timeline. To make an apples-to-apples comparison between time series of the same length, we'll need to cut something off the ends of the time series even when we're shifting them by only zero or one years relative to each other. In other words, if we want to compare the central overlap between two time series shifted 12 years relative to each other, we need to compare it to an unshifted pairing that has twelve years taken off either end.
Trimming the ends is potentially a complicating factor, since the ends of the timeline might be inherently less (or more) likely to correlate. We can partly address this by running the shift comparison several times, with a different "ceiling" on the shift each time: the first time we'll shift by a maximum of two years, the next time by a maximum of three, and so on up to fourteen. That way we can see which differences are due to considering a smaller timeline, and which are due to the shift itself.
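To make the arithmetic concrete: if the timeline runs annually from 1780 to 1923 (the range used on the axis labels below), that's 144 values, so a shift ceiling of 12 leaves a 120-year overlap for every shift from -12 to +12. A minimal sketch of that bookkeeping:
maxlen = len(fic_subset)            # number of annual observations in the full timeline
shift_ceiling = 12
overlap = maxlen - 2 * shift_ceiling
print(overlap)                      # every compared slice has this length, whatever the shift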
In [13]:
def shifted_correlation(vectora, vectorb, shift_ceiling, shift):
    # trim shift_ceiling years off both ends of vectora, then take a
    # window of the same length from vectorb, offset by `shift`
    assert len(vectora) == len(vectorb)
    maxlen = len(vectora)
    floor = shift_ceiling
    ceiling = maxlen - shift_ceiling
    floorb = floor + shift
    ceilingb = ceiling + shift
    slicea = vectora[floor : ceiling]
    sliceb = vectorb[floorb : ceilingb]
    return pearsonr(slicea, sliceb)
the_correlations = dict()
# we're going to create a different dictionary entry
# for each "shift-ceiling"
for shift_ceiling in range(2, 15):
print(shift_ceiling)
shifts = []
correlations = []
for shift in range(-shift_ceiling, shift_ceiling + 1):
temp_r = []
significant = 0
for w in shared_list:
r, p = shifted_correlation(fic_subset[w], bio_subset[w], shift_ceiling, shift)
temp_r.append(r)
if (p / 2) < 0.05 and r > 0:
significant += 1
true_avg, naive_avg = fisher_average(temp_r)
shifts.append(shift)
correlations.append(true_avg)
the_correlations[shift_ceiling] = list(zip(shifts, correlations))
# we save a list of tuples where x[0] = shift and x[1] = avg corr
for sc in range(2,15):
x, y = zip(*the_correlations[sc])
plt.plot(x, y)
plt.show()
I'm not exactly sure what to make of those results. I think the peak correlation is (usually) around 0, which suggests that lag is not helping.
On the other hand, there's a pretty clear asymmetry in the curve. Correlations are generally lower when biography is shifted back relative to fiction than when it's shifted forward. I'm not sure I should make anything of that, but it's intriguing enough to make me want to keep trying this with different slices of the timeline.
That will help rule out the possibility that this is due to something odd about the ends of the timeline.
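Before moving on to different slices, here's a quick way to check the "peak is around 0" impression rather than eyeballing the curves: for each shift ceiling, pull out the shift with the highest average correlation. (Just a diagnostic sketch reusing the_correlations from the cell above.)
for sc in range(2, 15):
    best_shift, best_corr = max(the_correlations[sc], key=lambda pair: pair[1])
    print(sc, best_shift, round(best_corr, 3))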
Let's try four different 100-year periods: 1800-1900, 1820-1920, and so on. In each case, we'll allow a lag of up to 15 years in either direction.
In [23]:
import math
def correlate_arbitrary_slice(vectora, vectorb, shift, absolute_floor):
assert len(vectora) == len(vectorb)
maxlen = len(vectora)
floor = absolute_floor
ceiling = absolute_floor + 100
floorb = floor + shift
ceilingb = ceiling + shift
slicea = vectora[floor : ceiling]
sliceb = vectorb[floorb : ceilingb]
return pearsonr(slicea, sliceb)
the_correlations = dict()
# we're going to create a different dictionary entry
# for each "absolute floor" (the start of the 100-year window)
for absolute_floor in range(20, 100, 20):
print(absolute_floor)
shift_ceiling = 15
shifts = []
correlations = []
for shift in range(-shift_ceiling, shift_ceiling + 1):
temp_r = []
significant = 0
for w in shared_list:
r, p = correlate_arbitrary_slice(fic_subset[w], bio_subset[w], shift, absolute_floor)
if not math.isnan(r):
temp_r.append(r)
if (p / 2) < 0.05 and r > 0:
significant += 1
true_avg, naive_avg = fisher_average(temp_r)
shifts.append(shift)
correlations.append(true_avg)
the_correlations[absolute_floor] = list(zip(shifts, correlations))
# we save a list of tuples where x[0] = shift and x[1] = avg corr
for af in range(20, 100, 20):
x, y = zip(*the_correlations[af])
plt.plot(x, y)
plt.show()
If the code I wrote is running correctly, there are big differences in the average correlation across different segments of the timeline. We need to figure out why that would be true, first of all! But secondly, if it is true, it's really not going to work to compare shift-backward and shift-forward, because of course you'll get different correlations.
In [27]:
for af in range(20, 100, 20):
x, y = zip(*the_correlations[af])
meancorr = sum(y) / len(y)
print("From " + str(1780 + af) + " to " + str(1780 + af + 100) + ": " + str(meancorr))
Okay, this could be caused by an error of some kind. But if it's not an error, it means biography and fiction are tracking each other less well as we move forward in time.
Worth keeping in mind that it could be caused by the distribution of examples in the datasets being uneven across time, by gender differences in authorship, or by the vocabulary being biased in some way across time.
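One cheap diagnostic for the first of those possibilities (a sketch, not a conclusion): reuse the same 100-year windows and check whether the word series get noisier or flatter later on, since uneven numbers of examples would tend to show up as extra noise in the yearly log ratios.
for absolute_floor in range(20, 100, 20):
    # median within-window standard deviation across the shared words
    fic_sd = fic_subset.iloc[absolute_floor : absolute_floor + 100].std().median()
    bio_sd = bio_subset.iloc[absolute_floor : absolute_floor + 100].std().median()
    print(1780 + absolute_floor, round(fic_sd, 3), round(bio_sd, 3))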
In [38]:
bio_mean = bio_subset.mean(axis=0).reset_index()
bio_mean.columns = ['word', 'bio_mean']
fic_mean = fic_subset.mean(axis=0).reset_index()
fic_mean.columns = ['word', 'fic_mean']
means_df = bio_mean.merge(fic_mean, on='word', how='inner')
means_df.head()
Out[38]:
In [40]:
means_df.sort_values(by='fic_mean', ascending=False).head(10)
Out[40]:
In [51]:
all_words = list(means_df.word.unique())
len(all_words)
def plot_diff_means(words, adjust=False, title=False, label_outliers=False, normalize_axes=False, save=False):
df = means_df.loc[(means_df.word.isin(words)),:]
ax0 = df.plot('bio_mean', 'fic_mean', kind='scatter', s=50, figsize=(15,15))
ax0.plot(0, 0, 'r+', ms=15, mew=2, label='(0,0) Gender Neutral')
texts = []
for x, y, s in zip(df['bio_mean'], df['fic_mean'], df['word']):
if label_outliers:
            if not (-1.5 < x < 2) or not (-2 < y < 1.5):
texts.append((x, y, s))
plt.legend(loc='best')
    plt.xlabel('m <-- Avg Non-fiction Diff 1780-1923 --> f', fontsize=16)
plt.ylabel('m <-- Avg Fiction Diff 1780-1923 --> f', fontsize=16)
if title:
plt.title(title)
if normalize_axes:
ylim = ax0.get_ylim()
xlim = ax0.get_xlim()
new_low = min(xlim[0], ylim[0])
new_hi = max(xlim[1], ylim[1])
plt.ylim(new_low, new_hi)
plt.xlim(new_low, new_hi)
if label_outliers:
for x, y, label in texts:
plt.annotate(label, xy=(x, y), size = 11)
if save:
if title:
plt.savefig('./{}.png'.format(title))
else:
plt.savefig('./genderDiffMeans.png')
plt.show()
plot_diff_means(all_words, adjust=True, title="Average Gender Difference for All Shared Words (Equal Axes)", label_outliers=True, normalize_axes=True, save=True)
plot_diff_means(all_words, adjust=True, title="Average Gender Difference for All Shared Words", label_outliers=True, normalize_axes=False, save = True)
In [47]:
pearsonr(means_df['bio_mean'], means_df['fic_mean'])
Out[47]: