In [1]:
import sys
import pandas as pd
import numpy as np
import difflib
import gzip
import matplotlib.pyplot as plt
from scipy import stats 

# filename1 = sys.argv[1]

filename1 = "reddit-counts.json.gz"

In [2]:
def filter_data(df):
# data_file = gzip.open(filename1, 'rt', encoding='utf-8')
# data = pd.read_json(data_file, lines=True)
    data=df.copy()
    data['year']=data.apply(lambda x: x['date'].year,axis = 1)
    data['iso_year']=data.apply(lambda x: x['date'].isocalendar()[0],axis = 1)
    data['week_sq']=data.apply(lambda x: x['date'].isocalendar()[1],axis = 1)
    data['weekday']=data.apply(lambda x: x['date'].isocalendar()[2],axis = 1)
    data = data[((data['year']==2012)|(data['year']==2013))&(data['subreddit']=="canada")]
    data['isWeekend'] = data.apply(lambda x: False if x['weekday'] < 6 else True, axis = 1)
    return data

In [3]:
# weekends = data[data["isWeekend"]==True]
# weekdays = data[data["isWeekend"]==False]

Normality and Equal variance Test


In [4]:
# print(stats.normaltest(weekends["comment_count"]).pvalue)
# print(stats.normaltest(weekdays["comment_count"]).pvalue)
# print(stats.levene(weekdays["comment_count"],weekends["comment_count"]).pvalue)

In [5]:
def fix_1(func):
    print(func)
    print(stats.normaltest(func(weekends["comment_count"])).pvalue)
    print(stats.normaltest(func(weekdays["comment_count"])).pvalue)
    print(stats.levene(func(weekdays["comment_count"]),func(weekends["comment_count"])).pvalue)
# fix_1(np.log)
# fix_1(np.log)
# # not working for exp
# # fix_1(np.exp)
# fix_1(np.sqrt)
# fix_1(lambda x:x*x)

Fix 2


In [6]:
# weekends_by_week = weekends.groupby(["iso_year","week_sq"]).mean().reset_index()
# weekdays_by_week = weekdays.groupby(["iso_year","week_sq"]).mean().reset_index()

# print(stats.normaltest(weekends_by_week["comment_count"]).pvalue)
# print(stats.normaltest(weekdays_by_week["comment_count"]).pvalue)
# print(stats.levene(weekdays_by_week["comment_count"],weekends_by_week["comment_count"]).pvalue)
# print(stats.ttest_ind(weekdays_by_week["comment_count"],weekends_by_week["comment_count"]))

Fix 3


In [7]:
# print(stats.mannwhitneyu(weekdays["comment_count"],weekends["comment_count"]))

In [8]:
def main():
    OUTPUT_TEMPLATE = (
    "Initial (invalid) T-test p-value: {initial_ttest_p:.3g}\n"
    "Original data normality p-values: {initial_weekday_normality_p:.3g} {initial_weekend_normality_p:.3g}\n"
    "Original data equal-variance p-value: {initial_levene_p:.3g}\n"
    "Transformed data normality p-values: {transformed_weekday_normality_p:.3g} {transformed_weekend_normality_p:.3g}\n"
    "Transformed data equal-variance p-value: {transformed_levene_p:.3g}\n"
    "Weekly data normality p-values: {weekly_weekday_normality_p:.3g} {weekly_weekend_normality_p:.3g}\n"
    "Weekly data equal-variance p-value: {weekly_levene_p:.3g}\n"
    "Weekly T-test p-value: {weekly_ttest_p:.3g}\n"
    "Mann–Whitney U-test p-value: {utest_p:.3g}"
)
    
    reddit_counts = sys.argv[1]

    # ...
    data_file = gzip.open(filename1, 'rt', encoding='utf-8')
    df = pd.read_json(data_file, lines=True)
    data = filter_data(df)
    weekends = data[data["isWeekend"]==True]
    weekdays = data[data["isWeekend"]==False]
    
    weekends_by_week = weekends.groupby(["iso_year","week_sq"]).mean().reset_index()
    weekdays_by_week = weekdays.groupby(["iso_year","week_sq"]).mean().reset_index()
    
    print(OUTPUT_TEMPLATE.format(
        initial_ttest_p=stats.ttest_ind(weekdays["comment_count"],weekends["comment_count"]).pvalue,
        initial_weekday_normality_p=stats.normaltest(weekdays["comment_count"]).pvalue,
        initial_weekend_normality_p=stats.normaltest(weekends["comment_count"]).pvalue,
        initial_levene_p=stats.levene(weekdays["comment_count"],weekends["comment_count"]).pvalue,
        transformed_weekday_normality_p=stats.normaltest(np.sqrt(weekdays["comment_count"])).pvalue,
        transformed_weekend_normality_p=stats.normaltest(np.sqrt(weekends["comment_count"])).pvalue,
        transformed_levene_p=stats.levene(np.sqrt(weekdays["comment_count"]),np.sqrt(weekends["comment_count"])).pvalue,
        weekly_weekday_normality_p=stats.normaltest(weekdays_by_week["comment_count"]).pvalue,
        weekly_weekend_normality_p=stats.normaltest(weekends_by_week["comment_count"]).pvalue,
        weekly_levene_p=stats.levene(weekdays_by_week["comment_count"],weekends_by_week["comment_count"]).pvalue,
        weekly_ttest_p=stats.ttest_ind(weekdays_by_week["comment_count"],weekends_by_week["comment_count"]).pvalue,
        utest_p=stats.mannwhitneyu(weekdays["comment_count"],weekends["comment_count"]).pvalue,
    ))

if __name__ == '__main__':
    main()


Initial (invalid) T-test p-value: 1.3e-58
Original data normality p-values: 1.01e-07 0.00152
Original data equal-variance p-value: 0.0438
Transformed data normality p-values: 0.0369 0.108
Transformed data equal-variance p-value: 0.556
Weekly data normality p-values: 0.308 0.153
Weekly data equal-variance p-value: 0.204
Weekly T-test p-value: 1.34e-34
Mann–Whitney U-test p-value: 4.31e-53

In [10]:
data_file = gzip.open(filename1, 'rt', encoding='utf-8')
df = pd.read_json(data_file, lines=True)
data = filter_data(df)
weekends = data[data["isWeekend"]==True]
weekdays = data[data["isWeekend"]==False]

weekends_by_week = weekends.groupby(["iso_year","week_sq"]).mean().reset_index()
weekdays_by_week = weekdays.groupby(["iso_year","week_sq"]).mean().reset_index()


Out[10]:
comment_count date subreddit year iso_year week_sq weekday isWeekend
66 1657 2013-03-14 canada 2013 2013 11 4 False
69 1369 2013-07-08 canada 2013 2013 28 1 False
97 1343 2012-07-04 canada 2012 2012 27 3 False
115 1619 2013-03-11 canada 2013 2013 11 1 False
165 1909 2013-09-11 canada 2013 2013 37 3 False
181 2112 2012-08-08 canada 2012 2012 32 3 False
182 1929 2012-08-09 canada 2012 2012 32 4 False
193 2086 2012-01-17 canada 2012 2012 3 2 False
263 2113 2013-08-20 canada 2013 2013 34 2 False
265 1500 2012-01-23 canada 2012 2012 4 1 False
294 1853 2012-02-27 canada 2012 2012 9 1 False
302 1826 2012-08-07 canada 2012 2012 32 2 False
314 1622 2013-03-25 canada 2013 2013 13 1 False
344 1822 2012-07-18 canada 2012 2012 29 3 False
360 1681 2013-05-03 canada 2013 2013 18 5 False
433 2598 2013-07-30 canada 2013 2013 31 2 False
455 1705 2012-08-24 canada 2012 2012 34 5 False
473 1848 2012-02-01 canada 2012 2012 5 3 False
571 1664 2012-10-31 canada 2012 2012 44 3 False
591 1572 2013-07-22 canada 2013 2013 30 1 False
607 2012 2012-06-22 canada 2012 2012 25 5 False
652 1427 2013-06-06 canada 2013 2013 23 4 False
654 1672 2013-01-02 canada 2013 2013 1 3 False
665 2262 2013-04-11 canada 2013 2013 15 4 False
692 1602 2012-11-01 canada 2012 2012 44 4 False
759 1656 2012-11-12 canada 2012 2012 46 1 False
766 2279 2013-04-25 canada 2013 2013 17 4 False
771 1775 2013-05-23 canada 2013 2013 21 4 False
803 1253 2013-08-06 canada 2013 2013 32 2 False
857 1702 2013-06-24 canada 2013 2013 26 1 False
... ... ... ... ... ... ... ... ...
14431 1773 2013-06-19 canada 2013 2013 25 3 False
14455 1714 2012-12-10 canada 2012 2012 50 1 False
14486 2353 2013-11-05 canada 2013 2013 45 2 False
14514 1456 2013-09-27 canada 2013 2013 39 5 False
14540 1633 2013-10-08 canada 2013 2013 41 2 False
14608 1783 2013-05-22 canada 2013 2013 21 3 False
14622 1222 2013-09-02 canada 2013 2013 36 1 False
14644 1357 2012-10-08 canada 2012 2012 41 1 False
14646 1822 2013-01-23 canada 2013 2013 4 3 False
14675 2702 2012-05-25 canada 2012 2012 21 5 False
14705 1791 2012-12-13 canada 2012 2012 50 4 False
14742 1542 2013-11-04 canada 2013 2013 45 1 False
14815 1046 2012-12-25 canada 2012 2012 52 2 False
14834 2154 2012-10-11 canada 2012 2012 41 4 False
14852 1876 2013-05-01 canada 2013 2013 18 3 False
14877 1804 2013-10-30 canada 2013 2013 44 3 False
14949 1904 2012-11-29 canada 2012 2012 48 4 False
15008 1296 2013-06-26 canada 2013 2013 26 3 False
15047 1932 2013-01-16 canada 2013 2013 3 3 False
15131 1454 2013-09-23 canada 2013 2013 39 1 False
15151 1517 2012-06-18 canada 2012 2012 25 1 False
15252 1818 2013-04-12 canada 2013 2013 15 5 False
15258 2588 2013-11-28 canada 2013 2013 48 4 False
15264 1533 2012-11-21 canada 2012 2012 47 3 False
15330 1916 2013-05-08 canada 2013 2013 19 3 False
15357 2021 2013-04-10 canada 2013 2013 15 3 False
15363 1630 2013-03-19 canada 2013 2013 12 2 False
15389 2113 2013-01-01 canada 2013 2013 1 2 False
15430 1486 2012-10-19 canada 2012 2012 42 5 False
15469 1127 2012-01-02 canada 2012 2012 1 1 False

522 rows × 8 columns