In [1]:
import csv
from scipy.stats import spearmanr
from scipy.stats import pearsonr

In [39]:
def _read_first_column(path):
    """Return the first field of every non-empty row of the CSV at ``path``, in file order."""
    # newline='' is what the csv module recommends for files handed to csv.reader.
    with open(path, newline='') as f:
        # csv.reader yields [] for a blank line; skip those rather than
        # crashing with IndexError on row[0].
        return [row[0] for row in csv.reader(f) if row]


def _first_positions(items):
    """Map each item to the index of its first occurrence in ``items``."""
    positions = {}
    for index, item in enumerate(items):
        positions.setdefault(item, index)
    return positions


def comparetwo(pathA, pathB):
    """Compare the ordering of the items shared by two CSV files.

    Only the first column of each file is read, and an item's position in the
    file is used as its rank (presumably the files are already sorted by
    model coefficient -- confirm against whatever writes them). There is no
    header handling: every non-empty row counts as an item.

    Items that do not occur in both files are dropped, the survivors are
    re-ranked 0..k-1 within each file, and the two rankings are compared with
    Spearman's rank correlation. The overlap size and ``r, p`` are printed.

    Parameters
    ----------
    pathA, pathB : str
        Paths of the two CSV files.

    Returns
    -------
    r : float
        Spearman correlation between the two rankings of the shared items.
    diff : list of (int, str)
        ``(rank in pathA - rank in pathB, item)`` for every shared item,
        sorted ascending; items placed much later in pathA than in pathB are
        at the end of the list.
    """
    women = _read_first_column(pathA)
    men = _read_first_column(pathB)

    both = set(women).intersection(men)
    print('Overlap: ', len(both))

    # Rank = position inside the list once unshared items are removed.
    # A dict lookup replaces the original list.index() call per word, which
    # made this step quadratic in the vocabulary size. If an item is repeated
    # in a file, its first position is used, exactly as list.index() did.
    womenrank = _first_positions([item for item in women if item in both])
    menrank = _first_positions([item for item in men if item in both])

    # Sorted so that the pairing order does not depend on set iteration order.
    bothlist = sorted(both)
    w = [womenrank[word] for word in bothlist]
    m = [menrank[word] for word in bothlist]

    r, p = spearmanr(w, m)
    print(r, p)

    diff = sorted((wpos - mpos, word) for wpos, mpos, word in zip(w, m, bothlist))
    return r, diff

In [32]:
# Show only the correlation. comparetwo returns (r, diff); displaying the whole
# tuple would dump the complete per-word rank-difference list into the output.
comparetwo('../models/onlywomenwritersB.coefs.csv', '../models/onlymalewritersB.coefs.csv')[0]


Overlap:  2157
0.339304095252 2.9316866432e-59
Out[32]:
0.33930409525218724

In [31]:
# Show only the correlation. comparetwo returns (r, diff); displaying the whole
# tuple would dump the complete per-word rank-difference list into the output.
comparetwo('../models/onlymalewriters.coefs.csv', '../models/onlymalewritersB.coefs.csv')[0]


Overlap:  2292
0.471246223687 4.71022074549e-127
Out[31]:
0.4712462236871332

In [45]:
# Keep both results: the Spearman correlation and the sorted list of
# (rank difference, word) pairs for the women-writers vs. male-writers models.
r, diff = comparetwo(
    '../models/onlywomenwriters.coefs.csv',
    '../models/onlymalewriters.coefs.csv',
)


Overlap:  2388
0.304292675273 2.4285636894e-52

In [50]:
# Last 200 entries of diff, i.e. the largest rank differences when diff is
# sorted ascending.
diff[-200:]


Out[50]:
[(1164, 'was-obliged'),
 (1165, 'action'),
 (1170, 'knew'),
 (1178, 'saw'),
 (1179, 'hanging'),
 (1184, 'fire'),
 (1184, 'work'),
 (1185, 'lessons'),
 (1185, 'was-carry'),
 (1189, 'born'),
 (1192, 'cares'),
 (1192, 'was-drawn'),
 (1196, 'landed'),
 (1197, 'was-inquired'),
 (1204, 'known'),
 (1205, 'curiosity'),
 (1207, 'envied'),
 (1208, 'stare'),
 (1209, 'takes'),
 (1217, 'arrival'),
 (1223, 'stands'),
 (1224, 'imagine'),
 (1228, 'cold'),
 (1228, 'yelled'),
 (1229, 'faith'),
 (1230, 'training'),
 (1231, 'passing'),
 (1233, 'story'),
 (1236, 'horror'),
 (1236, 'tied'),
 (1240, 'conscious'),
 (1244, 'withdrew'),
 (1247, 'wondered'),
 (1250, 'received'),
 (1250, 'was-startled'),
 (1255, 'party'),
 (1259, 'slipped'),
 (1262, 'history'),
 (1265, 'make'),
 (1266, 'surprised'),
 (1267, 'fate'),
 (1269, 'join'),
 (1270, 'locked'),
 (1270, 'was-admitted'),
 (1278, 'motives'),
 (1279, 'deserves'),
 (1282, 'ill'),
 (1293, 'was-want'),
 (1294, 'interrupted'),
 (1297, 'meal'),
 (1297, 'was-wished'),
 (1298, 'go'),
 (1299, 'was-hurried'),
 (1302, 'done'),
 (1305, 'attendants'),
 (1306, 'beckoned'),
 (1306, 'knowledge'),
 (1306, 'performed'),
 (1311, 'brain'),
 (1314, 'throat'),
 (1318, 'generosity'),
 (1321, 'mansion'),
 (1322, 'was-notice'),
 (1324, 'avoided'),
 (1329, 'pay'),
 (1344, 'example'),
 (1344, 'pursue'),
 (1344, 'was-missed'),
 (1351, 'neck'),
 (1352, 'considered'),
 (1352, 'pretended'),
 (1352, 'replaced'),
 (1355, 'faults'),
 (1358, 'pupil'),
 (1361, 'represented'),
 (1361, 'was-relieved'),
 (1368, 'explain'),
 (1369, 'car'),
 (1370, 'stamped'),
 (1371, 'advice'),
 (1379, 'was-requested'),
 (1381, 'keep'),
 (1385, 'door'),
 (1387, 'carried'),
 (1387, 'disappointed'),
 (1387, 'incapable'),
 (1388, 'was-directed'),
 (1391, 'clapped'),
 (1395, 'was-threw'),
 (1396, 'acquired'),
 (1402, 'writing'),
 (1404, 'promised'),
 (1408, 'disposed'),
 (1409, 'presence'),
 (1419, 'thoughts'),
 (1426, 'was-shown'),
 (1429, 'pictured'),
 (1431, 'sang'),
 (1432, 'was-caught'),
 (1433, 'glasses'),
 (1433, 'old'),
 (1439, 'established'),
 (1440, 'stammered'),
 (1447, 'becoming'),
 (1457, 'right'),
 (1460, 'broke'),
 (1461, 'bought'),
 (1461, 'was-drawing'),
 (1465, 'lodgings'),
 (1466, 'confident'),
 (1467, 'nose'),
 (1468, 'view'),
 (1469, 'triumph'),
 (1470, 'conceived'),
 (1471, 'bearing'),
 (1472, 'closed'),
 (1474, 'formed'),
 (1482, 'was-echoed'),
 (1490, 'set'),
 (1491, 'was-finding'),
 (1502, 'forced'),
 (1515, 'panted'),
 (1518, 'recover'),
 (1522, 'venture'),
 (1528, 'talks'),
 (1529, 'sensible'),
 (1532, 'hatred'),
 (1537, 'suffered'),
 (1543, 'knowing'),
 (1563, 'order'),
 (1565, 'palms'),
 (1567, 'slave'),
 (1588, 'supposed'),
 (1599, 'wisdom'),
 (1614, 'was-announced'),
 (1619, 'sent'),
 (1621, 'grumbled'),
 (1624, 'folly'),
 (1625, 'was-surprise'),
 (1630, 'assumed'),
 (1633, 'pleased'),
 (1638, 'examined'),
 (1643, 'informed'),
 (1643, 'searched'),
 (1646, 'admit'),
 (1647, 'pursued'),
 (1647, 'was-addressed'),
 (1652, 'lungs'),
 (1657, 'overcome'),
 (1667, 'advantage'),
 (1669, 'dismissed'),
 (1672, 'stayed'),
 (1675, 'offer'),
 (1676, 'was-treat'),
 (1680, 'accepted'),
 (1681, 'drove'),
 (1691, 'sleeves'),
 (1697, 'ate'),
 (1700, 'jaw'),
 (1702, 'night'),
 (1704, 'was-strike'),
 (1709, 'wont'),
 (1715, 'unhappy'),
 (1724, 'was-assure'),
 (1734, 'help'),
 (1745, 'believe'),
 (1749, 'imagination'),
 (1766, 'afraid'),
 (1791, 'satisfaction'),
 (1806, 'handsome'),
 (1809, 'reverie'),
 (1817, 'anticipated'),
 (1817, 'win'),
 (1833, 'spend'),
 (1838, 'killed'),
 (1845, 'was-saved'),
 (1851, 'throw'),
 (1867, 'lost'),
 (1867, 'was-mentioned'),
 (1872, 'trunk'),
 (1873, 'liking'),
 (1891, 'giving'),
 (1898, 'notes'),
 (1898, 'show'),
 (1901, 'was-determined'),
 (1914, 'attack'),
 (1928, 'wrote'),
 (1947, 'scrambled'),
 (1950, 'luck'),
 (1960, 'fly'),
 (1961, 'owned'),
 (1968, 'proved'),
 (1977, 'effort'),
 (1992, 'conscience'),
 (1996, 'spine'),
 (2046, 'maintained'),
 (2055, 'thought'),
 (2131, 'discover'),
 (2162, 'was-tired'),
 (2188, 'dined')]

In [44]:
# Keep a second handle on the current rank-difference list under another name.
# NOTE(review): this binds the same list object, not a copy. Judging by the
# execution counts, this cell ran before `diff` was last reassigned -- confirm
# which comparetwo() result `adiff` is meant to hold before re-running.
adiff = diff

In [49]:
from collections import Counter

# Add up each word's rank difference over both lists of (difference, word)
# pairs, then show the 100 words with the largest combined difference.
# NOTE(review): if `adiff` is the very same list as `diff`, every total is
# simply twice the single-run value.
totaldiff = Counter()
for ranking in (diff, adiff):
    for delta, word in ranking:
        totaldiff[word] += delta
totaldiff.most_common(100)


Out[49]:
[('spend', 3516),
 ('discover', 3282),
 ('wrote', 3221),
 ('attendants', 3104),
 ('conscience', 2971),
 ('assumed', 2930),
 ('set', 2900),
 ('chair', 2849),
 ('was-announced', 2842),
 ('was-treat', 2840),
 ('disappointed', 2838),
 ('palms', 2799),
 ('disposed', 2799),
 ('thoughts', 2796),
 ('accepted', 2771),
 ('supposed', 2725),
 ('imagination', 2716),
 ('stayed', 2697),
 ('lungs', 2680),
 ('maintained', 2676),
 ('was-caught', 2667),
 ('ill', 2661),
 ('sent', 2619),
 ('was-drawn', 2602),
 ('join', 2593),
 ('was-mentioned', 2589),
 ('reception', 2576),
 ('promised', 2562),
 ('jaw', 2554),
 ('order', 2540),
 ('handsome', 2532),
 ('giving', 2527),
 ('chin', 2525),
 ('was-want', 2520),
 ('was-tired', 2517),
 ('throw', 2513),
 ('effort', 2512),
 ('aim', 2505),
 ('object', 2485),
 ('endeavoured', 2458),
 ('account', 2454),
 ('history', 2440),
 ('satisfaction', 2434),
 ('experience', 2396),
 ('passing', 2386),
 ('lost', 2383),
 ('takes', 2383),
 ('was-strike', 2352),
 ('wont', 2339),
 ('was-addressed', 2310),
 ('talks', 2307),
 ('forced', 2285),
 ('folly', 2264),
 ('reach', 2244),
 ('finished', 2244),
 ('help', 2244),
 ('witnessed', 2239),
 ('drove', 2237),
 ('was-saved', 2225),
 ('was-relieved', 2217),
 ('tried', 2215),
 ('was-requested', 2214),
 ('notes', 2208),
 ('clutched', 2208),
 ('resented', 2205),
 ('anticipated', 2203),
 ('thought', 2192),
 ('committed', 2169),
 ('venture', 2165),
 ('expected', 2162),
 ('fly', 2153),
 ('killed', 2141),
 ('slipped', 2127),
 ('attack', 2124),
 ('busy', 2124),
 ('replaced', 2114),
 ('write', 2106),
 ('was-attended', 2105),
 ('dined', 2100),
 ('grabbed', 2095),
 ('putting', 2095),
 ('hungry', 2094),
 ('throat', 2086),
 ('knowledge', 2072),
 ('head', 2054),
 ('learned', 2049),
 ('horror', 2045),
 ('meeting', 2040),
 ('acquired', 2036),
 ('pleased', 2028),
 ('curiosity', 2022),
 ('gift', 2022),
 ('company', 2017),
 ('resolved', 2017),
 ('lodgings', 2015),
 ('offer', 2001),
 ('drank', 1999),
 ('spine', 1996),
 ('was-determined', 1995),
 ('bearing', 1994)]

In [21]:
# Rebuild the sorted (rank difference, word) list from the rank lists w and m.
# NOTE(review): w, m and bothlist are read as notebook-level names here, but in
# the visible cells they only exist as locals inside comparetwo() -- confirm
# they are defined before running this cell.
diff = []
# Iterate over however many shared words there are instead of a hard-coded
# count, matching the loop inside comparetwo().
for i in range(len(bothlist)):
    d = w[i] - m[i]
    diff.append((d, bothlist[i]))

diff.sort()

In [23]:
# Display the last 100 entries of diff (the largest rank differences, since
# diff was sorted ascending).
diff[-100:]


Out[23]:
[(1433, 'old'),
 (1439, 'established'),
 (1440, 'stammered'),
 (1447, 'becoming'),
 (1457, 'right'),
 (1460, 'broke'),
 (1461, 'bought'),
 (1461, 'was-drawing'),
 (1465, 'lodgings'),
 (1466, 'confident'),
 (1467, 'nose'),
 (1468, 'view'),
 (1469, 'triumph'),
 (1470, 'conceived'),
 (1471, 'bearing'),
 (1472, 'closed'),
 (1474, 'formed'),
 (1482, 'was-echoed'),
 (1490, 'set'),
 (1491, 'was-finding'),
 (1502, 'forced'),
 (1515, 'panted'),
 (1518, 'recover'),
 (1522, 'venture'),
 (1528, 'talks'),
 (1529, 'sensible'),
 (1532, 'hatred'),
 (1537, 'suffered'),
 (1543, 'knowing'),
 (1563, 'order'),
 (1565, 'palms'),
 (1567, 'slave'),
 (1588, 'supposed'),
 (1599, 'wisdom'),
 (1614, 'was-announced'),
 (1619, 'sent'),
 (1621, 'grumbled'),
 (1624, 'folly'),
 (1625, 'was-surprise'),
 (1630, 'assumed'),
 (1633, 'pleased'),
 (1638, 'examined'),
 (1643, 'informed'),
 (1643, 'searched'),
 (1646, 'admit'),
 (1647, 'pursued'),
 (1647, 'was-addressed'),
 (1652, 'lungs'),
 (1657, 'overcome'),
 (1667, 'advantage'),
 (1669, 'dismissed'),
 (1672, 'stayed'),
 (1675, 'offer'),
 (1676, 'was-treat'),
 (1680, 'accepted'),
 (1681, 'drove'),
 (1691, 'sleeves'),
 (1697, 'ate'),
 (1700, 'jaw'),
 (1702, 'night'),
 (1704, 'was-strike'),
 (1709, 'wont'),
 (1715, 'unhappy'),
 (1724, 'was-assure'),
 (1734, 'help'),
 (1745, 'believe'),
 (1749, 'imagination'),
 (1766, 'afraid'),
 (1791, 'satisfaction'),
 (1806, 'handsome'),
 (1809, 'reverie'),
 (1817, 'anticipated'),
 (1817, 'win'),
 (1833, 'spend'),
 (1838, 'killed'),
 (1845, 'was-saved'),
 (1851, 'throw'),
 (1867, 'lost'),
 (1867, 'was-mentioned'),
 (1872, 'trunk'),
 (1873, 'liking'),
 (1891, 'giving'),
 (1898, 'notes'),
 (1898, 'show'),
 (1901, 'was-determined'),
 (1914, 'attack'),
 (1928, 'wrote'),
 (1947, 'scrambled'),
 (1950, 'luck'),
 (1960, 'fly'),
 (1961, 'owned'),
 (1968, 'proved'),
 (1977, 'effort'),
 (1992, 'conscience'),
 (1996, 'spine'),
 (2046, 'maintained'),
 (2055, 'thought'),
 (2131, 'discover'),
 (2162, 'was-tired'),
 (2188, 'dined')]

In [35]:
import pandas as pd

# Load the two model outputs, indexed by volume id.
one = pd.read_csv('../models/wholetimeline.csv', index_col='volid')
two = pd.read_csv('../models/wholenineteenth.csv', index_col='volid')

# Put the 'logistic' column of each model side by side (aligned on volid) and
# keep only the rows that have a value in both columns.
justpredictions = pd.concat(
    [one['logistic'], two['logistic']],
    axis=1,
    keys=['one', 'two'],
).dropna()
justpredictions.head()


Out[35]:
one two
10276|LordSurrey 0.557854 0.495263
10308|Kate 0.654317 0.683286
11140|GeneralHospital 0.260432 0.363858
1139|Mr.Austin 0.420320 0.326094
12412|Philip 0.078735 0.282212

In [ ]: