In [5]:
from collections import Counter
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
In [2]:
# Load the tab-separated per-character table for the "bio" corpus
# (presumably biographies -- confirm against the data source).
bio = pd.read_csv(
    'biocharsizes.tsv',
    sep='\t',
)
bio.shape
Out[2]:
In [3]:
# Keep only rows whose totalsize is strictly positive. Rows with a missing
# totalsize drop out too, because NaN > 0 evaluates to False.
has_size = bio.totalsize > 0
bio = bio[has_size]
bio.shape
Out[3]:
In [49]:
# Preview the first rows of the filtered bio table.
bio.head()
Out[49]:
In [85]:
# Gender labels of the characters whose totalsize exceeds 10.
signibio = bio.loc[bio.totalsize > 10, 'gender']

# Count each label once, with the vectorised Series.sum() instead of the
# Python builtin sum(), which walks the boolean Series element by element.
bio_f = (signibio == 'f').sum()
bio_m = (signibio == 'm').sum()

# Share of 'f' among characters labelled 'f' or 'm'; any other label
# (or a missing one) is ignored.
bio_f / (bio_m + bio_f)
Out[85]:
In [4]:
# For every story in `bio`, record how many character rows it has and the
# ratio between its largest and its median totalsize.
numchars = []
ratios = []
stories = bio.groupby('story')
for story, df in stories:
    numchars.append(len(df))
    maxsize = np.max(df.totalsize)
    mediansize = np.median(df.totalsize)
    # Check the median before dividing: the original divided first, so a
    # zero median would raise a RuntimeWarning and build an inf/nan ratio
    # that was then thrown away.
    if mediansize > 0:
        maxratio = maxsize / mediansize
        ratios.append(maxratio)
In [70]:
# Size-weighted distribution of totalsize in `bio`, restricted to sizes
# 1 .. n-1 (rows with totalsize >= n are left out before counting).
n = 20000
counts = Counter(bio.totalsize[bio.totalsize < n])

# x[k] is the size k + 1; y[k] starts as the number of rows with that size
# (sizes that never occur count as 0).
x = np.arange(1, n)
y = np.array([counts[size] for size in x.tolist()], dtype=float)

# Weight each count by its size, then normalise so the weights sum to 1.
y = y * x
y = y / np.sum(y)
In [71]:
# Cumulative size-weighted share over the first 5000 sizes.
plt.plot(x[:5000], y[:5000].cumsum())
Out[71]:
In [31]:
# Median of the per-story max/median size ratios collected for `bio`.
np.median(ratios)
Out[31]:
In [27]:
# Turn the per-story row counts into an ndarray so boolean masking works below.
numchars = np.array(numchars)
In [29]:
# Histogram (no KDE overlay) of per-story row counts below 1000.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot -- confirm the installed version before re-running.
sns.distplot(numchars[numchars < 1000], kde = False)
Out[29]:
In [30]:
# Convert the collected ratios to an ndarray and plot their distribution.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot -- confirm the installed version before re-running.
ratios = np.array(ratios)
sns.distplot(ratios)
Out[30]:
In [32]:
# Median number of character rows per story in `bio`.
np.median(numchars)
Out[32]:
In [16]:
# Load the tab-separated per-character table for the "fic" corpus
# (presumably fiction -- confirm against the data source).
fic = pd.read_csv(
    'ficcharsizes.tsv',
    sep='\t',
)
fic.shape
Out[16]:
In [17]:
# Keep only rows whose totalsize is strictly positive. Rows with a missing
# totalsize drop out too, because NaN > 0 evaluates to False.
has_size = fic.totalsize > 0
fic = fic[has_size]
fic.shape
Out[17]:
In [86]:
# Gender labels of the `fic` characters whose totalsize exceeds 10.
signific = fic.loc[fic.totalsize > 10, 'gender']
In [87]:
# Share of 'f' among `signific` entries labelled 'f' or 'm'. The vectorised
# Series.sum() replaces the Python builtin sum(), which walks the boolean
# Series element by element; value_counts-style results are unchanged.
(signific == 'f').sum() / ((signific == 'm').sum() + (signific == 'f').sum())
Out[87]:
In [29]:
# Size-weighted distribution of totalsize in `fic`, restricted to sizes
# 1 .. n-1 (rows with totalsize >= n are left out before counting).
n = 2000
counts = Counter(fic.totalsize[fic.totalsize < n])

# x[k] is the size k + 1; y[k] starts as the number of rows with that size
# (sizes that never occur count as 0).
x = np.arange(1, n)
y = np.array([counts[size] for size in x.tolist()], dtype=float)

# Weight each count by its size, then normalise so the weights sum to 1.
y = y * x
y = y / np.sum(y)
In [30]:
# Plot the normalised size-weighted distribution (y) against size (x).
plt.plot(x, y)
Out[30]:
In [31]:
# Walk the distribution in increasing size order and report the first size
# at which the cumulative share exceeds one half.
total = 0
for size, share in zip(x, y):
    total += share
    if total > 0.5:
        # y[k] holds the share for size x[k] == k + 1, so print the size
        # itself; the original printed the zero-based index, one too low.
        print(size)
        break
In [36]:
# For every story in `fic`, record how many character rows it has and the
# ratio between its largest and its median totalsize.
ficnumchars = []
ficratios = []
stories = fic.groupby('story')
for story, df in stories:
    ficnumchars.append(len(df))
    maxsize = np.max(df.totalsize)
    mediansize = np.median(df.totalsize)
    # Check the median before dividing: the original divided first, so a
    # zero median would raise a RuntimeWarning and build an inf/nan ratio
    # that was then thrown away.
    if mediansize > 0:
        maxratio = maxsize / mediansize
        ficratios.append(maxratio)
In [55]:
# Distribution of per-story row counts in `fic`, weighted by the count itself,
# over the values 1 .. n-1.
n = 5000
counts = Counter(ficnumchars)

# x[k] is the row count k + 1; y[k] starts as the number of stories with
# exactly that many rows (values that never occur count as 0).
x = np.arange(1, n)
y = np.array([counts[size] for size in x.tolist()], dtype=float)

# Weight each tally by its row count, then normalise so the weights sum to 1.
y = y * x
y = y / np.sum(y)
In [56]:
# Plot the normalised, count-weighted distribution (y) against row count (x).
plt.plot(x, y)
Out[56]:
In [58]:
# Cumulative weighted share over the first 400 values of x.
plt.plot(x[:400], y[:400].cumsum())
Out[58]:
In [47]:
# Walk the distribution in increasing x order and report the first x value
# at which the cumulative share exceeds one half.
total = 0
for size, share in zip(x, y):
    total += share
    if total > 0.5:
        # y[k] holds the share for x[k] == k + 1, so print that value;
        # the original printed the zero-based index, one too low.
        print(size)
        break
In [39]:
# Mean number of character rows per story in `fic`.
np.mean(ficnumchars)
Out[39]:
In [72]:
# Mean number of character rows per story in `bio`.
np.mean(numchars)
Out[72]:
In [40]:
# Mean of the per-story max/median size ratios collected for `fic`.
np.mean(ficratios)
Out[40]:
In [43]:
# Average totalsize across all remaining `fic` rows.
fic.totalsize.mean()
Out[43]:
In [44]:
# Average totalsize across all remaining `bio` rows.
bio.totalsize.mean()
Out[44]:
In [73]:
# Summed dialsize as a fraction of summed totalsize over all `fic` rows.
fic.dialsize.sum() / fic.totalsize.sum()
Out[73]:
In [74]:
# Summed dialsize as a fraction of summed totalsize over all `bio` rows.
bio.dialsize.sum() / bio.totalsize.sum()
Out[74]:
In [81]:
# dialsize of the `bio` characters whose totalsize exceeds 10.
biodial = bio.loc[bio.totalsize > 10, 'dialsize']

# Fraction of those characters with a dialsize of exactly 0. The mean of the
# boolean mask is the vectorised form of sum(mask) / len(mask); on an empty
# selection it yields NaN instead of raising ZeroDivisionError.
(biodial == 0).mean()
Out[81]:
In [82]:
# dialsize of the `fic` characters whose totalsize exceeds 10.
ficdial = fic.loc[fic.totalsize > 10, 'dialsize']

# Fraction of those characters with a dialsize of exactly 0. The mean of the
# boolean mask is the vectorised form of sum(mask) / len(mask); on an empty
# selection it yields NaN instead of raising ZeroDivisionError.
(ficdial == 0).mean()
Out[82]:
In [83]:
# Fraction of the summed `fic` totalsize that belongs to characters whose
# totalsize exceeds 10. Series.sum() replaces the Python builtin sum(), which
# iterates the Series one element at a time.
fic.loc[fic.totalsize > 10, 'totalsize'].sum() / fic.totalsize.sum()
Out[83]:
In [84]:
# Fraction of the summed `bio` totalsize that belongs to characters whose
# totalsize exceeds 10. Series.sum() replaces the Python builtin sum(), which
# iterates the Series one element at a time.
bio.loc[bio.totalsize > 10, 'totalsize'].sum() / bio.totalsize.sum()
Out[84]:
In [ ]: