In [2]:
import pandas as pd
import os

In [3]:
dfs = []
for d in os.scandir('results/top-words'):
    df_read = pd.read_csv(d.path, names=['word', 'score'])
    df_read['date'] = d.name
    dfs.append(df_read)

In [4]:
df = pd.concat(dfs)

In [5]:
df.shape


Out[5]:
(17548650, 3)

In [6]:
df[df.date.str.startswith('2017-07-19')].head()


Out[6]:
word score date
0 moran 3963.89 2017-07-19.csv
1 shilts 3328.00 2017-07-19.csv
2 sens 2252.46 2017-07-19.csv
3 neillsville 1603.27 2017-07-19.csv
4 wickford 1560.00 2017-07-19.csv

In [12]:
date_sums = df.groupby('date').score.apply(lambda x: x/sum(x))

In [21]:
df['place'] = df.groupby('date').score.cumcount()

In [15]:
df['prop'] = date_sums

In [39]:
scores = df.groupby('word').place.mean()

In [41]:
scores.sort_values().head(20)


Out[41]:
word
friday       1551.314286
comey        1739.685714
april        2006.009524
e            2265.019048
thursday     2682.990476
july         2750.009524
tuesday      2840.409524
wednesday    3023.285714
easter       3234.504762
fbi          3249.942857
june         3379.866667
gorsuch      3695.276190
missile      3980.723810
senate       4226.542857
clearance    4276.095238
trump        4356.371429
o            4452.638095
sessions     4466.219048
t            4475.447619
chemical     4547.857143
Name: place, dtype: float64

In [12]:
df.groupby('date').head(3)


Out[12]:
word score date
0 comey 11241.15 2017-05-12.csv
1 fbi 8570.06 2017-05-12.csv
2 clinton 4769.02 2017-05-12.csv
0 nauseous 2830.91 2017-05-05.csv
1 upton 2146.20 2017-05-05.csv
2 amundson 2057.39 2017-05-05.csv
0 tuesday 2234.21 2017-04-20.csv
1 kenosha 2179.63 2017-04-20.csv
2 stephens 1908.09 2017-04-20.csv
0 watlington 2563.95 2017-06-27.csv
1 laclare 1872.00 2017-06-27.csv
2 overs 1784.19 2017-06-27.csv
0 zytiga 3328.00 2017-06-08.csv
1 normandy 1959.26 2017-06-08.csv
2 fiamma 1768.00 2017-06-08.csv
0 pyongyang 2955.73 2017-04-16.csv
1 ryol 2912.00 2017-04-16.csv
2 korea 2626.74 2017-04-16.csv
0 egypt 19291.30 2017-04-11.csv
1 tanta 10400.08 2017-04-11.csv
2 coptic 7538.72 2017-04-11.csv
0 grucci 2496.00 2017-06-30.csv
1 sheard 1979.05 2017-06-30.csv
2 enis 1771.58 2017-06-30.csv
0 yates 24454.02 2017-05-10.csv
1 naly 11440.00 2017-05-10.csv
2 flynn 8985.09 2017-05-10.csv
0 bomb 15103.60 2017-04-15.csv
1 ordnance 8678.79 2017-04-15.csv
2 afghanistan 7101.11 2017-04-15.csv
... ... ... ...
0 mccain 5443.88 2017-07-18.csv
1 federer 4081.27 2017-07-18.csv
2 clot 2896.46 2017-07-18.csv
0 biegel 2907.71 2017-05-03.csv
1 clout 2085.93 2017-05-03.csv
2 stalans 1976.00 2017-05-03.csv
0 abedi 5028.25 2017-05-26.csv
1 libyan 2378.09 2017-05-26.csv
2 unaffordable 2244.07 2017-05-26.csv
0 clearance 35519.44 2017-04-01.csv
1 sanction 2872.79 2017-04-01.csv
2 berkner 1979.05 2017-04-01.csv
0 lts 1725.12 2017-07-01.csv
1 deportee 1517.59 2017-07-01.csv
2 ripken 1455.76 2017-07-01.csv
0 abe 3477.43 2017-07-10.csv
1 menace 3116.64 2017-07-10.csv
2 luxuriously 2186.61 2017-07-10.csv
0 ailes 8062.86 2017-05-20.csv
1 witch 2520.12 2017-05-20.csv
2 lams 2080.00 2017-05-20.csv
0 prayer 6631.90 2017-05-07.csv
1 revile 6250.19 2017-05-07.csv
2 triumphant 3070.34 2017-05-07.csv
0 nra 8031.52 2017-04-30.csv
1 offshore 1333.53 2017-04-30.csv
2 stopgap 1087.51 2017-04-30.csv
0 akhmetshin 7821.18 2017-07-16.csv
1 bastille 5385.59 2017-07-16.csv
2 rinat 3958.10 2017-07-16.csv

315 rows × 3 columns