In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict

import matplotlib.pyplot as plt

import re

from collections import Counter
import statsmodels.api as sm

import seaborn as sns
from urlparse import urlsplit, parse_qs

from sklearn.feature_extraction import DictVectorizer

from IPython.display import display


/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [2]:
sns.set_context("poster")
sns.set_style("ticks")

In [3]:
# Raw topic identifiers (the part of ``topic_name`` before the first "/")
# mapped to the human-readable labels used in tables and figures.
TOPIC_MAPPING = {
    "GunControl": "Gun Control",
    "Privacy": "Privacy",
    "Vaccine": "Vaccine",
    "ChildEducation": "Child Education",
    "SkinDamage": "Skin Damage",
    "SeatBelt": "Seat Belt",
}

# Display order of the topics in multi-panel figures and side-by-side tables.
topic_order = [
    "Gun Control",
    "Privacy",
    "Vaccine",
    "Child Education",
    "Skin Damage",
    "Seat Belt",
]


def _readable_topic(raw_name):
    """Translate a raw ``topic_name`` value into its display label.

    Only the text before the first "/" is looked up in ``TOPIC_MAPPING``;
    an unknown identifier raises ``KeyError``.
    """
    return TOPIC_MAPPING[raw_name.split('/')[0]]


# The previous ``.rename(columns={...})`` step only held a commented-out
# entry, i.e. it renamed nothing, so the chain goes straight to ``assign``.
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").assign(
    topic_name=lambda frame: frame.topic_name.apply(_readable_topic),
)
df.shape


Out[3]:
(246869, 54)

In [4]:
# Subsets of the data that carry a given attribute; each filter is evaluated
# once and reused for both the printed summary and the per-topic table.
n_total = df.shape[0]
with_urls = df[df.t_n_urls > 0]
with_state = df[df.u_state.notnull()]
with_gender = df[df.Gender.notnull()]

print("Total instances in data is %s,\n"
      " with URLs is %s (%.2f%%),\n"
      " with user locations is %s (%.2f%%),\n"
      " and with gender is %s (%.2f%%)" % (
          n_total,
          with_urls.shape[0], with_urls.shape[0] * 100. / n_total,
          with_state.shape[0], with_state.shape[0] * 100. / n_total,
          with_gender.shape[0], with_gender.shape[0] * 100. / n_total,
      ))

# Per-topic tweet counts for the full data and for each subset.
pd.DataFrame({
    "Overall": df.topic_name.value_counts(),
    "With URLs": with_urls.topic_name.value_counts(),
    "With user locations": with_state.topic_name.value_counts(),
    "With user gender": with_gender.topic_name.value_counts(),
})


Total instances in data is 246869,
 with URLs is 101640 (41.17%),
 with user locations is 169038 (68.47%),
 and with gender is 132125 (53.52%)
Out[4]:
Overall With URLs With user gender With user locations
Child Education 10808 5601 5054 8142
Gun Control 34357 17139 18598 24009
Privacy 73593 33006 40583 53368
Seat Belt 73270 14842 38653 43219
Skin Damage 14128 7777 6915 11162
Vaccine 40713 23275 22322 29138

In [5]:
# Per-topic totals for the URL count column: sum, mean and number of tweets.
url_stats_by_topic = df.groupby(["topic_name"])["t_n_urls"].agg([np.sum, np.mean, len])
# Per-topic number of tweets that contain at least one URL.
tweets_with_urls_by_topic = df[df.t_n_urls > 0].topic_name.value_counts().to_frame()

column_labels = {
    "topic_name": "No. of tweets with URLs",
    "sum": "No. of URLs",
    "mean": "Mean URLs",
    "len": "No. of tweets",
}
pd.concat(
    [url_stats_by_topic, tweets_with_urls_by_topic],
    axis=1,
).rename(columns=column_labels)


Out[5]:
No. of URLs Mean URLs No. of tweets No. of tweets with URLs
Child Education 5718.0 0.553426 10808.0 5601
Gun Control 17616.0 0.551172 34357.0 17139
Privacy 33693.0 0.483338 73593.0 33006
Seat Belt 15175.0 0.225714 73270.0 14842
Skin Damage 7934.0 0.586748 14128.0 7777
Vaccine 24143.0 0.621377 40713.0 23275

In [6]:
df[~df.Gender.isnull()].Gender.value_counts()


Out[6]:
M    76665
F    55460
Name: Gender, dtype: int64

In [7]:
df.u_location.value_counts().head()


Out[7]:
United States      6906
USA                5284
Washington, DC     3431
New York, NY       2225
Los Angeles, CA    1979
Name: u_location, dtype: int64

In [8]:
df.u_location.value_counts().shape


Out[8]:
(45905,)

In [9]:
df.columns


Out[9]:
Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state'],
      dtype='object')

In [10]:
df[~df.CATS.isnull()].CATS.head()


Out[10]:
13      [news]
14      [news]
15       [UNK]
16    [videos]
19      [blog]
Name: CATS, dtype: object

In [11]:
# Fine-grained URL categories that are folded into a coarser analysis label.
CAT_MAPPINGS = {
    "satire": "fakenews",
    "clickbait": "fakenews",
    "usgov": "news",
}


def get_category_counts(x):
    """Count the URL categories attached to a single tweet.

    Parameters
    ----------
    x : iterable of str, or the sentinel ``0``
        Category names for the tweet. ``0`` (what ``fillna(0)`` puts in place
        of a missing value) means the tweet has no category information.

    Returns
    -------
    collections.Counter
        ``{"NONE": 1}`` for the sentinel. Otherwise the per-category counts
        after applying ``CAT_MAPPINGS``; if any "twitter" entry is present,
        only the "twitter" count is kept and all other categories are dropped.
    """
    if x == 0:
        return Counter(["NONE"])

    counts = Counter()
    for category in x:
        counts[CAT_MAPPINGS.get(category, category)] += 1

    if "twitter" not in counts:
        return counts
    # A twitter link overrides every other category on the same tweet.
    return Counter({"twitter": counts["twitter"]})

In [12]:
df.CATS.fillna(0).apply(get_category_counts).head()


Out[12]:
0    {u'NONE': 1}
1    {u'NONE': 1}
2    {u'NONE': 1}
3    {u'NONE': 1}
4    {u'NONE': 1}
Name: CATS, dtype: object

In [13]:
df.CATS.fillna(0).apply(get_category_counts).apply(lambda x: len(x)).describe()


Out[13]:
count    246869.000000
mean          1.037931
std           0.196466
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           4.000000
Name: CATS, dtype: float64

In [14]:
df[~df.CATS.isnull()].CATS.head()


Out[14]:
13      [news]
14      [news]
15       [UNK]
16    [videos]
19      [blog]
Name: CATS, dtype: object

In [15]:
# Per-tweet Counter of URL categories (missing CATS becomes {"NONE": 1}).
df["CATS_Counter"] = df.CATS.fillna(0).apply(get_category_counts)
# Peek at tweets whose URLs span exactly two categories.
# `.loc` replaces the deprecated `.ix`; with a boolean mask and a column label
# the selection is the same.
df.loc[df.CATS_Counter.apply(len) == 2, "CATS_Counter"].head()


/home/content/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  from ipykernel import kernelapp as app
Out[15]:
23     {u'socialmedia': 1, u'videos': 1}
38     {u'socialmedia': 1, u'videos': 1}
53     {u'socialmedia': 1, u'videos': 1}
87        {u'news': 1, u'scientific': 1}
138    {u'socialmedia': 1, u'videos': 1}
Name: CATS_Counter, dtype: object

In [16]:
# Number of tweets whose URLs span exactly two categories
# (`.loc` replaces the deprecated `.ix`).
df.loc[df.CATS_Counter.apply(len) == 2, "CATS_Counter"].shape


Out[16]:
(8862,)

In [17]:
# Total occurrences of each URL category over all tweets.
# Accumulating into a single Counter avoids building a brand-new Counter for
# every row (which `sum(...)` with `+` does) and also works on an empty frame.
category_totals = Counter()
for tweet_counts in df.CATS_Counter.values:
    category_totals.update(tweet_counts)
pd.Series(category_totals).to_frame().reset_index()


Out[17]:
index 0
0 NONE 145229
1 UNK 30663
2 blog 6349
3 commercial 3654
4 fakenews 8375
5 news 19491
6 scientific 674
7 socialmedia 13745
8 twitter 23118
9 videos 7230

In [18]:
%%time
df_X = df.CATS_Counter.apply(lambda x: pd.Series(x)).fillna(0.)
df_y = df.is_controversial * 1.


CPU times: user 1min 39s, sys: 1.52 s, total: 1min 41s
Wall time: 1min 40s

In [19]:
df_X.head()


Out[19]:
NONE UNK blog commercial fakenews news scientific socialmedia twitter videos
0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [20]:
model = sm.Logit(df_y, df_X)
res = model.fit()
res.summary2()


Optimization terminated successfully.
         Current function value: 0.628661
         Iterations 8
Out[20]:
Model: Logit Pseudo R-squared: 0.065
Dependent Variable: is_controversial AIC: 310413.9147
Date: 2018-01-19 16:08 BIC: 310518.0809
No. Observations: 246869 Log-Likelihood: -1.5520e+05
Df Model: 9 LL-Null: -1.6592e+05
Df Residuals: 246859 LLR p-value: 0.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 8.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
NONE 0.0724 0.0053 13.7917 0.0000 0.0621 0.0827
UNK 0.7346 0.0122 60.2838 0.0000 0.7108 0.7585
blog 2.0246 0.0392 51.5891 0.0000 1.9476 2.1015
commercial 0.3305 0.0339 9.7370 0.0000 0.2640 0.3970
fakenews 3.0580 0.0660 46.2990 0.0000 2.9286 3.1875
news 2.0590 0.0233 88.4545 0.0000 2.0134 2.1046
scientific 1.0698 0.1041 10.2784 0.0000 0.8658 1.2738
socialmedia -0.2513 0.0249 -10.1077 0.0000 -0.3000 -0.2026
twitter 0.6729 0.0139 48.3966 0.0000 0.6457 0.7002
videos 0.3852 0.0340 11.3459 0.0000 0.3187 0.4518

In [21]:
res.get_margeff(at="zero").summary()


Out[21]:
Logit Marginal Effects
Dep. Variable: is_controversial
Method: dydx
At: zero
dy/dx std err z P>|z| [0.025 0.975]
NONE 0.0181 0.001 13.792 0.000 0.016 0.021
UNK 0.1837 0.003 60.284 0.000 0.178 0.190
blog 0.5061 0.010 51.589 0.000 0.487 0.525
commercial 0.0826 0.008 9.737 0.000 0.066 0.099
fakenews 0.7645 0.017 46.299 0.000 0.732 0.797
news 0.5148 0.006 88.454 0.000 0.503 0.526
scientific 0.2675 0.026 10.278 0.000 0.216 0.318
socialmedia -0.0628 0.006 -10.108 0.000 -0.075 -0.051
twitter 0.1682 0.003 48.397 0.000 0.161 0.175
videos 0.0963 0.008 11.346 0.000 0.080 0.113

In [22]:
# Refit the logistic regression on tweets that contain at least one URL,
# leaving out the "NONE" (no category information) indicator column.
has_url = df.t_n_urls > 0
model = sm.Logit(df_y[has_url], df_X[has_url].drop("NONE", axis=1))
res = model.fit()
res.summary2()


Optimization terminated successfully.
         Current function value: 0.537456
         Iterations 8
Out[22]:
Model: Logit Pseudo R-squared: 0.090
Dependent Variable: is_controversial AIC: 109272.1055
Date: 2018-01-19 16:08 BIC: 109357.8682
No. Observations: 101640 Log-Likelihood: -54627.
Df Model: 8 LL-Null: -60040.
Df Residuals: 101631 LLR p-value: 0.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 8.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
UNK 0.7346 0.0122 60.2838 0.0000 0.7108 0.7585
blog 2.0246 0.0392 51.5891 0.0000 1.9476 2.1015
commercial 0.3305 0.0339 9.7370 0.0000 0.2640 0.3970
fakenews 3.0580 0.0660 46.2990 0.0000 2.9286 3.1875
news 2.0590 0.0233 88.4545 0.0000 2.0134 2.1046
scientific 1.0698 0.1041 10.2784 0.0000 0.8658 1.2738
socialmedia -0.2513 0.0249 -10.1077 0.0000 -0.3000 -0.2026
twitter 0.6729 0.0139 48.3966 0.0000 0.6457 0.7002
videos 0.3852 0.0340 11.3459 0.0000 0.3187 0.4518

In [23]:
df.topic_name.value_counts()


Out[23]:
Privacy            73593
Seat Belt          73270
Vaccine            40713
Gun Control        34357
Skin Damage        14128
Child Education    10808
Name: topic_name, dtype: int64

In [24]:
%%time
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.factorplot(y="URL_type", x="URL_counts", #hue="is_controversial",
                   col="topic_name", col_wrap=3, 
                       col_order=topic_order,
                       kind="bar",
                       color="0.5",
                       errwidth=2,
                  data=pd.melt(pd.concat([df_X > 0,
                                          df[["topic_name"]],
                                          (df_y == 1).to_frame()],
                      axis=1).drop(["NONE", "UNK"], axis=1),
                               id_vars=["is_controversial", "topic_name"],
            var_name="URL_type", value_name="URL_counts"
           ))
    #g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    #sns.despine(offset=10)
    plt.savefig("URL_proportions.all.pdf", bbox_inches="tight")


CPU times: user 54.1 s, sys: 196 ms, total: 54.3 s
Wall time: 53.7 s

In [25]:
%%time
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.factorplot(y="URL_type", x="URL_counts", #hue="is_controversial",
                   col="topic_name", col_wrap=3, 
                       col_order=topic_order,
                       kind="bar",
                       color="0.5",
                       errwidth=2,
                  data=pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
                                          df[df.t_n_urls > 0][["topic_name"]],
                                          (df_y[df.t_n_urls > 0] == 1).to_frame()],
                      axis=1).drop(["NONE", "UNK"], axis=1),
                               id_vars=["is_controversial", "topic_name"],
            var_name="URL_type", value_name="URL_counts"
           ))
    #g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    #sns.despine(offset=10)
    plt.savefig("URL_proportions.atleast1url.pdf", bbox_inches="tight")


CPU times: user 22.7 s, sys: 92 ms, total: 22.8 s
Wall time: 22.7 s

In [26]:
# Summary statistics of the URL-type presence flags among tweets with at least
# one URL, computed separately for non-controversial (0) and controversial (1)
# tweets and stacked with a "label" column for the hue.
summary_frames = []
for target_value, label_text in [(0, "Non-Controversial"), (1, "Controversial")]:
    selected_rows = (df_y == target_value) & (df.t_n_urls > 0)
    presence = (df_X[selected_rows].drop(["NONE", "UNK"], axis=1) > 0) * 1.
    summary_frames.append(
        presence.describe().T.reset_index().assign(label=label_text)
    )
url_type_summary = pd.concat(summary_frames, axis=0)

axis_text_sizes = {
    "axes.titlesize": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
}
with sns.plotting_context(rc=axis_text_sizes), sns.axes_style(rc={"font.family": "monospace"}):
    # "mean" of a 0/1 flag is the share of tweets containing that URL type.
    ax = sns.barplot(
        x="mean", y="index", hue="label",
        data=url_type_summary,
        palette=["k", "0.5"],
    )
    ax.set_xlabel("Proportion of tweets with given URL type")
    ax.set_ylabel("URL types")
    sns.despine(offset=10)



In [27]:
# For tweets with at least one URL: share of tweets containing each URL type
# among controversial tweets divided by the same share among non-controversial
# tweets. (The mean of a boolean flag is that share.)
has_url = df.t_n_urls > 0
share_controversial = (df_X[(df_y == 1) & has_url].drop(["NONE"], axis=1) > 0).mean()
share_non_controversial = (df_X[(df_y == 0) & has_url].drop(["NONE"], axis=1) > 0).mean()
df_t = (share_controversial / share_non_controversial).to_frame("mean")
# Express every ratio relative to the "twitter" ratio (which becomes 1.0).
# `.loc` replaces the deprecated `.ix` for this label-based lookup.
# NOTE(review): the displayed column is labelled "Odds", but the value is a
# ratio of proportions normalised to twitter — confirm the intended wording.
df_t["mean"] = df_t["mean"] / df_t.loc["twitter", "mean"]
df_t.reset_index().rename(columns={"index": "URL type", "mean": "Odds in controversial topic"})


/home/content/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:5: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
Out[27]:
URL type Odds in controversial topic
0 UNK 1.106572
1 blog 3.895895
2 commercial 0.780835
3 fakenews 17.177927
4 news 4.282770
5 scientific 2.329731
6 socialmedia 0.512857
7 twitter 1.000000
8 videos 0.597117

Aggregate counts


In [28]:
df.shape


Out[28]:
(246869, 55)

In [29]:
df[df.t_n_urls > 0].shape


Out[29]:
(101640, 55)

In [30]:
df.t_n_urls.sum()


Out[30]:
104279.0

In [31]:
df.topic_name.value_counts().to_frame()


Out[31]:
topic_name
Privacy 73593
Seat Belt 73270
Vaccine 40713
Gun Control 34357
Skin Damage 14128
Child Education 10808

URL Analysis


In [32]:
def sum_url_present(x):
    """Return how many entries of ``x`` are strictly greater than zero."""
    is_positive = x > 0
    return is_positive.sum()
def mean_url_present(x):
    """Return the fraction of entries of ``x`` that are strictly greater than zero."""
    is_positive = x > 0
    return is_positive.mean()


def get_url_domain(x):
    """Return a normalised domain name for the URL ``x``.

    The URL is lower-cased before parsing. Special cases:

    * ``linkis.com/<target>/...`` wrapper links resolve to ``<target>``
      (the first path segment); a bare linkis.com URL stays ``linkis.com``.
    * ``google.com/...?url=<target>`` redirect links resolve to the domain of
      the ``url`` query parameter (recursively).
    * A leading ``www.`` is removed.
    * Any host under ``wordpress.com``, ``tumblr.com`` or ``blogspot.com`` is
      collapsed to that platform domain, however many sub-domain labels it
      has (previously only the first label was stripped, so
      ``a.b.wordpress.com`` became ``b.wordpress.com``).

    Parameters
    ----------
    x : str
        The URL to analyse.

    Returns
    -------
    str
        The normalised domain (the parsed ``netloc`` after the rules above).
    """
    parts = urlsplit(x.lower())
    domain = parts.netloc
    if domain in {"linkis.com", "www.linkis.com"}:
        if parts.path[1:] != "":
            # The path already starts with "/", so "http:/" + path yields
            # "http://<target>/..." whose netloc is the wrapped domain.
            domain = urlsplit("http:/%s" % parts.path).netloc
    elif domain in {"google.com", "www.google.com"}:
        query = parse_qs(parts.query)
        if "url" in query:
            return get_url_domain(query["url"][0])
    if domain.startswith("www."):
        domain = domain[4:]
    for platform in ("wordpress.com", "tumblr.com", "blogspot.com"):
        if domain.endswith("." + platform):
            return platform
    return domain

print get_url_domain("https://www.google.com/url?rct=j&sa=t&url=http://www.perthnow.com.au/news/western-australia/social-services-minister-christian-porter-slaps-down-antivaccination-campaigners/news-story/0aa49052ec0598704b05333075581296&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNFAB3aZtdfdVpXOHWzyfqsu0ZSFAg")


perthnow.com.au

In [33]:
df.groupby("topic_name")["t_n_urls"].agg([len, np.sum, np.mean,
                                          sum_url_present, mean_url_present]).sort_values("mean", ascending=False)


Out[33]:
len sum mean sum_url_present mean_url_present
topic_name
Vaccine 40713.0 24143.0 0.621377 23275.0 0.571685
Skin Damage 14128.0 7934.0 0.586748 7777.0 0.550467
Child Education 10808.0 5718.0 0.553426 5601.0 0.518227
Gun Control 34357.0 17616.0 0.551172 17139.0 0.498850
Privacy 73593.0 33693.0 0.483338 33006.0 0.448494
Seat Belt 73270.0 15175.0 0.225714 14842.0 0.202566

In [34]:
df_urls = pd.read_csv("TID_URL_CATS.txt", sep="\t")
df_urls["ORIG_DOMAIN"] = df_urls.URL.apply(get_url_domain)
df_urls.head()


Out[34]:
TID URL CATS ORIG_DOMAIN
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com

In [35]:
df_url_exp = pd.read_csv("URL_CAT_MAPPINGS.txt", sep="\t")
df_url_exp.head()


Out[35]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com socialmedia
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com commercial
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK

In [36]:
df_urls.shape, df_url_exp.shape


Out[36]:
((166670, 4), (97512, 5))

In [37]:
df_urls = df_urls.merge(df_url_exp, how="left", on="URL")
df_urls.head()


Out[37]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com NaN NaN NaN NaN
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews

In [38]:
## Update data for twitter urls
# Rows with no match in the URL expansion table (EXPANDED is NaN after the
# left merge) fall back to the values derived from the original URL.
# The mask is computed once, so the result no longer depends on EXPANDED being
# filled last, and `.loc` replaces the deprecated `.ix`.
not_expanded = df_urls.EXPANDED.isnull()
df_urls.loc[not_expanded, "EXPANDED_STATUS"] = 0
df_urls.loc[not_expanded, "URL_DOMAIN"] = df_urls.loc[not_expanded, "ORIG_DOMAIN"]
df_urls.loc[not_expanded, "URL_CATS"] = df_urls.loc[not_expanded, "CATS"]
df_urls.loc[not_expanded, "EXPANDED"] = df_urls.loc[not_expanded, "URL"]
df_urls.head()


/home/content/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  from ipykernel import kernelapp as app
Out[38]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com https://twitter.com/photogchad_WTSP/status/682... 0.0 twitter.com socialmedia|twitter
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews

In [39]:
df_urls.URL_DOMAIN.value_counts().head(10).to_frame()


Out[39]:
URL_DOMAIN
twitter.com 25251
youtube.com 6502
facebook.com 5620
mabelsaveforschool.com 3863
instagram.com 2869
newsweek.com 2503
naturalnews.com 1863
nytimes.com 1833
washingtonpost.com 1412
ebay.com 1345

In [40]:
df_urls.ORIG_DOMAIN.value_counts().head(10).to_frame()


Out[40]:
ORIG_DOMAIN
twitter.com 24980
bit.ly 15150
fb.me 15069
ow.ly 6866
dlvr.it 5398
ift.tt 4693
goo.gl 4039
ln.is 3795
youtu.be 3784
gvwy.io 3120

In [41]:
df_urls.merge(df[["t_id", "topic_name"]], how="left", left_on="TID", right_on="t_id").head()


Out[41]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS t_id topic_name
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com https://twitter.com/photogchad_WTSP/status/682... 0.0 twitter.com socialmedia|twitter 6.829049e+17 Skin Damage
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK 6.829159e+17 Child Education
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 6.829858e+17 Vaccine
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 6.829528e+17 Vaccine
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews 6.828305e+17 Vaccine

In [42]:
get_url_domain("http://linkis.com/thinkingmomsrevolution.com/lXwma")


Out[42]:
'thinkingmomsrevolution.com'

In [43]:
# Keep only URLs whose tweet id appears in the analysis data, and attach the
# tweet's topic to each URL row.
tweet_topics = df[["t_id", "topic_name"]]
df_url_tid = pd.merge(df_urls, tweet_topics, how="inner", left_on="TID", right_on="t_id")


def _counts_from_pipe_separated(cats):
    """Split a "|"-separated category string and count its categories."""
    return get_category_counts(cats.split("|"))


df_url_tid["CATS_Counter"] = df_url_tid.CATS.apply(_counts_from_pipe_separated)
df_url_tid.head()


Out[43]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS t_id topic_name CATS_Counter
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com https://twitter.com/photogchad_WTSP/status/682... 0.0 twitter.com socialmedia|twitter 682904901916225536 Skin Damage {u'twitter': 1}
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK 682915876316692480 Child Education {u'UNK': 1}
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682985833821941760 Vaccine {u'commercial': 1}
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682952771746664448 Vaccine {u'commercial': 1}
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews 682830450969059328 Vaccine {u'fakenews': 1}

In [44]:
df_url_tid.shape


Out[44]:
(104279, 11)

In [45]:
df_url_tid.ORIG_DOMAIN.unique().shape


Out[45]:
(7495,)

In [46]:
df_url_tid.URL.unique().shape


Out[46]:
(87106,)

In [47]:
df_url_tid.URL_DOMAIN.unique().shape, df_url_tid.EXPANDED.unique().shape


Out[47]:
((12086,), (77880,))

In [48]:
# Number of URL rows per (expanded domain, topic).
df_t = df_url_tid.pivot_table(
    index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)

# For every topic: its 20 most frequent expanded domains with their counts,
# as a two-column frame (<topic name> = domain, "Counts" = frequency).
top_domains_by_topic = {}
for topic in df_t.columns:
    ranked = df_t[topic].sort_values(ascending=False).head(20)
    top_domains_by_topic[topic] = ranked.reset_index().rename(
        columns={topic: "Counts", "URL_DOMAIN": topic}
    )
# Lay the per-topic tables side by side in the fixed topic order.
pd.concat(top_domains_by_topic, axis=1, keys=topic_order)


Out[48]:
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 twitter.com 2274.0 twitter.com 9827.0 twitter.com 3208.0 twitter.com 1227.0 facebook.com 1588.0 twitter.com 5929.0
1 youtube.com 909.0 youtube.com 877.0 youtube.com 1666.0 facebook.com 474.0 instagram.com 1133.0 instagram.com 1111.0
2 breitbart.com 656.0 newsweek.com 814.0 facebook.com 1003.0 youtube.com 137.0 twitter.com 653.0 facebook.com 832.0
3 facebook.com 394.0 facebook.com 522.0 naturalnews.com 556.0 instagram.com 130.0 youtube.com 161.0 youtube.com 624.0
4 nytimes.com 243.0 washingtonpost.com 409.0 periscope.tv 395.0 reverbnation.com 49.0 pinterest.com 73.0 ebay.com 337.0
5 cnn.com 213.0 politico.com 375.0 truthinmedia.com 391.0 gofundme.com 43.0 amazon.com 66.0 amazon.com 173.0
6 washingtonpost.com 210.0 warontherocks.com 373.0 nytimes.com 391.0 huffingtonpost.com 39.0 rocskincare.com 54.0 reddit.com 118.0
7 huffingtonpost.com 180.0 wired.com 372.0 wordpress.com 283.0 owler.com 35.0 cdc.gov 54.0 nytimes.com 109.0
8 thegatewaypundit.com 177.0 cnn.com 334.0 instagram.com 238.0 pinterest.com 33.0 owler.com 49.0 melanienini.net 91.0
9 foxnews.com 169.0 nytimes.com 319.0 paraven.net 220.0 blogspot.com 33.0 skincancer.org 45.0 vine.co 87.0
10 thehill.com 134.0 thehill.com 260.0 truthkings.com 212.0 wholechildeducation.org 32.0 newbeauty.com 39.0 tumblr.com 86.0
11 conservativereport.org 133.0 apple.com 245.0 washingtonpost.com 178.0 change.org 32.0 foxnews.com 36.0 patch.com 76.0
12 latimes.com 125.0 lawfareblog.com 240.0 forbes.com 167.0 epi.org 30.0 medicaldaily.com 34.0 cnet.com 70.0
13 infowars.com 124.0 breitbart.com 239.0 foxnews.com 159.0 ebay.com 28.0 sherif.ws 33.0 toonradio.net 66.0
14 theguardian.com 121.0 yahoo.com 232.0 infowars.com 150.0 s.einnews.com 28.0 yahoo.com 27.0 autoblog.com 64.0
15 rss.cnn.com 120.0 foxnews.com 218.0 yournewswire.com 144.0 npr.org 26.0 smallbiztrends.com 27.0 etsy.com 62.0
16 npr.org 118.0 reuters.com 206.0 mashable.com 140.0 wordpress.com 24.0 wordpress.com 26.0 stores.ebay.com 51.0
17 townhall.com 117.0 thefederalist.com 192.0 npr.org 137.0 nytimes.com 24.0 reddit.com 26.0 soundcloud.com 43.0
18 dailycaller.com 112.0 obamawhitehouse.archives.gov 183.0 today.com 131.0 washingtonpost.com 23.0 medicalnewstoday.com 25.0 nydailynews.com 43.0
19 theblaze.com 107.0 washingtonexaminer.com 182.0 huffingtonpost.com 130.0 theatlantic.com 23.0 sun-sentinel.com 23.0 foxnews.com 39.0

In [49]:
# Number of URL rows per (original, unexpanded domain, topic).
df_t = df_url_tid.pivot_table(
    index="ORIG_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)

# For every topic: its 20 most frequent original domains with their counts,
# as a two-column frame (<topic name> = domain, "Counts" = frequency).
top_orig_domains_by_topic = {}
for topic_label in df_t.columns:
    most_frequent = df_t[topic_label].sort_values(ascending=False).head(20)
    top_orig_domains_by_topic[topic_label] = most_frequent.reset_index().rename(
        columns={topic_label: "Counts", "ORIG_DOMAIN": topic_label}
    )
# Lay the per-topic tables side by side in the fixed topic order.
pd.concat(top_orig_domains_by_topic, axis=1, keys=topic_order)


Out[49]:
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 twitter.com 2244.0 twitter.com 9742.0 twitter.com 3147.0 twitter.com 1221.0 fb.me 2012.0 twitter.com 5903.0
1 bit.ly 1873.0 bit.ly 2910.0 fb.me 2296.0 fb.me 743.0 instagram.com 1093.0 fb.me 1200.0
2 fb.me 1249.0 ow.ly 1393.0 bit.ly 1888.0 bit.ly 594.0 bit.ly 867.0 instagram.com 1083.0
3 youtu.be 607.0 fb.me 1365.0 youtu.be 975.0 ow.ly 378.0 twitter.com 641.0 bit.ly 923.0
4 dlvr.it 550.0 ift.tt 868.0 ow.ly 951.0 buff.ly 143.0 ow.ly 554.0 dlvr.it 402.0
5 ow.ly 520.0 ln.is 675.0 ift.tt 457.0 instagram.com 123.0 goo.gl 163.0 ow.ly 395.0
6 ln.is 494.0 goo.gl 673.0 tinyurl.com 414.0 goo.gl 121.0 buff.ly 158.0 youtu.be 340.0
7 ift.tt 479.0 newsweek.com 670.0 periscope.tv 393.0 ift.tt 101.0 ift.tt 154.0 ift.tt 321.0
8 goo.gl 428.0 dlvr.it 668.0 goo.gl 384.0 dlvr.it 99.0 youtu.be 75.0 goo.gl 252.0
9 breitbart.com 242.0 youtu.be 492.0 youtube.com 372.0 youtu.be 76.0 dlvr.it 68.0 youtube.com 120.0
10 youtube.com 182.0 buff.ly 391.0 naturalnews.com 348.0 tinyurl.com 57.0 pinterest.com 65.0 ln.is 108.0
11 cnn.it 169.0 warontherocks.com 294.0 dlvr.it 339.0 s.einnews.com 35.0 ln.is 49.0 amzn.to 107.0
12 wp.me 168.0 tinyurl.com 267.0 buff.ly 325.0 owler.us 34.0 owler.us 49.0 melanienini.net 87.0
13 shar.es 160.0 youtube.com 228.0 nyti.ms 296.0 ln.is 34.0 rocskincare.com 42.0 vine.co 87.0
14 tinyurl.com 152.0 apple.com 212.0 ln.is 267.0 pinterest.com 29.0 amzn.to 37.0 ebay.com 83.0
15 m.tbnn.it 134.0 politi.co 149.0 shar.es 263.0 wholechildeducation.org 29.0 tinyurl.com 30.0 buff.ly 77.0
16 nyti.ms 132.0 politico.com 146.0 truthinmedia.com 258.0 facebook.com 27.0 1.usa.gov 26.0 tmblr.co 76.0
17 buff.ly 125.0 lawfareblog.com 139.0 instagram.com 231.0 youtube.com 22.0 atjo.es 26.0 toonradio.net 66.0
18 huffingtonpost.com 100.0 m.tbnn.it 136.0 facebook.com 230.0 gofundme.com 21.0 youtube.com 24.0 nyti.ms 58.0
19 infowars.com 78.0 thehill.com 136.0 paraven.net 210.0 lnkd.in 21.0 lnkd.in 23.0 tinyurl.com 57.0

TF-IDF URLs


In [50]:
# Term frequency: number of URL rows per (expanded domain, topic).
domain_topic_counts = df_url_tid.pivot_table(
    index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)
# Inverse document frequency with topics as "documents":
# log10(number of topics / number of topics in which the domain occurs).
n_topics = len(df.topic_name.unique())
topics_containing_domain = ((domain_topic_counts > 0) * 1.).sum(axis=1)
IDF = np.log10(n_topics / topics_containing_domain)
# TF-IDF: scale every domain's row of counts by that domain's IDF.
df_t = domain_topic_counts.multiply(IDF, axis=0)
print("Using expanded URLs")

# For every topic: the 20 expanded domains with the highest TF-IDF score.
top_tfidf_by_topic = {}
for topic in df_t.columns:
    best = df_t[topic].sort_values(ascending=False).head(20)
    top_tfidf_by_topic[topic] = best.reset_index().rename(
        columns={topic: "TF-IDF", "URL_DOMAIN": topic}
    )
pd.concat(top_tfidf_by_topic, axis=1, keys=topic_order)


Using expanded URLs
Out[50]:
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 conservativereport.org 103.494116 warontherocks.com 290.250416 truthinmedia.com 186.554411 wholechildeducation.org 24.900840 rocskincare.com 42.020168 melanienini.net 70.811764
1 thegatewaypundit.com 84.450462 lawfareblog.com 186.756300 paraven.net 171.193275 epi.org 14.313638 skincancer.org 35.016806 toonradio.net 51.357983
2 bearingarms.com 42.463792 apple.com 73.752349 naturalnews.com 167.372678 edutopia.org 11.672269 newbeauty.com 30.347899 autoblog.com 30.535760
3 blog.tenthamendmentcenter.com 40.463865 newsweek.com 64.453534 truthkings.com 164.968065 reverbnation.com 8.628472 smallbiztrends.com 21.010084 stores.ebay.com 24.333184
4 guns.com 38.129411 freebeacon.com 61.071521 ageofautism.com 96.490755 thelegacyprep.org 8.559664 cdc.gov 16.255620 cnet.com 21.072100
5 freebeacon.com 37.692579 conferencecalltranscripts.org 60.695798 vaxxedthemovie.com 92.599999 youreducationguides.com 7.781513 medicaldaily.com 16.222123 thecarconnection.com 20.231933
6 thetruthaboutguns.com 34.238655 thefederalist.com 57.797759 healthimpactnews.com 85.596638 militarychild.org 7.781513 nerium.com 14.006723 vine.co 15.319940
7 thefederalistpapers.org 33.875609 threatpost.com 51.357983 vaccineimpact.com 83.262184 playertronics.com 7.781513 skinbarnyc.com 12.450420 blog.caranddriver.com 13.228571
8 guncrazy.org 32.682353 justsecurity.org 49.023529 skepticalraptor.com 59.917646 edweek.org 7.633940 aad.org 12.450420 nfl.com 13.228571
9 weaselzippers.us 31.490003 eff.org 45.132773 visionlaunch.com 57.583193 gofundme.com 7.571924 influenster.com 11.672269 auctioncars.online 13.228571
10 rightwingnews.com 31.012882 defenseone.com 44.354621 vaxxed.com 54.470588 channelstv.com 7.156819 theeditorialistla.com 11.672269 seekingalpha.com 12.882274
11 nraila.org 28.013445 zdnet.com 40.078185 greenmedinfo.com 49.801680 learningliftoff.com 7.003361 ashapdx.myrandf.com 10.894118 edmunds.com 12.450420
12 bizpacreview.com 23.856063 appleinsider.com 38.907563 thevaccinereaction.org 47.467226 star-brite.com 7.003361 realself.com 10.496668 amazon.co.uk 11.928031
13 newsbusters.org 23.480340 techdirt.com 37.351260 indiewire.com 46.689075 plansponsor.com 7.003361 fashiontimes.com 10.115966 deals.ebay.com 11.672269
14 twitchy.com 22.901820 motherboard.vice.com 34.829852 thinkingmomsrevolution.com 46.689075 allafrica.com 6.622660 sherif.ws 9.933990 etsy.com 10.917658
15 thetrace.org 22.566386 csis.org 32.682353 naturalhealth365.com 45.910924 cards.twitter.com 6.225210 seniorcarecorner.com 9.542425 stnonline.com 10.894118
16 infowars.com 21.835316 obamawhitehouse.archives.gov 32.224700 scienceblogs.com 45.910924 dianeravitch.net 6.225210 dermrf.com 9.337815 safercar.gov 10.894118
17 rss.cnn.com 21.130951 techcrunch.com 31.344244 articles.mercola.com 41.032428 donorschoose.org 6.225210 drlisaairan.com 8.559664 csoonline.com 10.496668
18 townhall.com 20.602677 politico.com 29.692967 video214.com 40.078185 causeartist.com 6.225210 grownandflown.com 7.781513 poshmark.com 10.115966
19 teaparty.org 20.516214 theregister.co.uk 29.569748 vactruth.com 37.351260 amomstake.com 6.225210 skintherapybuckhead.com 7.781513 industryweek.com 10.115966

In [51]:
# TF-IDF ranking of the ORIGINAL (as-posted, not expanded) link domains per topic.
#   TF  : number of df_url_tid rows for a (domain, topic) pair (pivot with aggfunc=len).
#   IDF : log10(number of topics in df / number of topics in which the domain occurs).
df_t = df_url_tid.pivot_table(
    index="ORIG_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)
# (df_t > 0) is False for the NaN cells of missing (domain, topic) pairs, so the
# row sum is the number of topics each domain appears in.
IDF = np.log10(df.topic_name.unique().shape[0]  / ((df_t > 0) * 1.).sum(axis=1))
df_t = df_t.multiply(IDF, axis=0)
# This cell pivots on ORIG_DOMAIN, so the label must not claim expanded URLs.
print("Using original URLs")
# One (domain, TF-IDF) column pair per topic, top 20 domains each, side by side.
pd.concat({c: df_t[c].sort_values(ascending=False).head(20).reset_index().rename(columns={c: "TF-IDF",
                                                                                          "ORIG_DOMAIN": c})
           for c in df_t.columns},
          axis=1, keys=topic_order)


Using expanded URLs
Out[51]:
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 conservativereport.org 45.132773 warontherocks.com 228.776468 paraven.net 163.411763 wholechildeducation.org 22.566386 rocskincare.com 32.682353 melanienini.net 67.699159
1 breitbart.com 42.614085 lawfareblog.com 108.163024 truthinmedia.com 123.097284 epi.org 9.542425 atjo.es 20.231933 toonradio.net 51.357983
2 guncrazy.org 32.682353 apple.com 63.818359 naturalnews.com 104.758438 thelegacyprep.org 8.559664 ctap.it 17.119328 stores.ebay.com 38.907563
3 townhall.com 31.490003 conferencecalltranscripts.org 60.695798 truthkings.com 85.596638 youreducationguides.com 7.781513 skincancer.org 14.784874 autoblog.com 22.566386
4 bearingarms.com 27.673033 newsweek.com 53.051435 vaxxedthemovie.com 77.815125 playertronics.com 7.781513 influenster.com 10.894118 vine.co 15.319940
5 thegatewaypundit.com 21.947578 observer.com 45.910924 ageofautism.com 72.368066 star-brite.com 7.003361 newbeauty.com 10.115966 auctioncars.online 13.228571
6 weaselzippers.us 20.516214 politi.co 44.853469 vaxxed.com 53.692436 gofundme.com 6.321630 realself.com 9.065304 cnet.co 11.928031
7 thefederalistpapers.org 19.084850 tcrn.ch 39.685714 visionlaunch.com 53.692436 edut.to 6.225210 thephillygodfather.com 7.781513 etsy.com 10.235020
8 freebeacon.com 18.130608 thefederalist.com 38.531839 jezebel.com 41.242016 americannews.com 6.202576 plrb.org 7.781513 bnc.lt 10.115966
9 ontheissues.org 17.897479 dfi.io 27.235294 thevaccinereaction.org 39.685714 gvwy.io 5.447059 ipsy.com 7.781513 edmunds.com 8.559664
10 thetrace.org 14.784874 eff.org 27.235294 thinkingmomsrevolution.com 35.794958 thegood-life.net 5.447059 simpleweatheralert.com 7.003361 pdora.co 8.111061
11 louderwithcrowder.com 14.006723 whitehouse.gov 25.287427 sharylattkisson.com 34.238655 scl.bz 5.447059 walgreens.com 6.225210 operationsavelives.org 7.781513
12 teaparty.org 13.836516 freebeacon.com 25.287427 naturalhealth365.com 32.682353 donorschoose.org 5.447059 aad.org 6.225210 toys-deals-shop.xyz 7.781513
13 cnsnews.com 13.836516 patriotpost.us 24.810305 scribd.com 31.012882 mumblingmommy.com 5.447059 sanctuarycosmeticcenter.com 5.447059 bandageofhonor.org 7.781513
14 twitchy.com 13.836516 motherboard.vice.com 24.810305 fda.gov 30.058639 videotube.livehost.fr 4.668908 skinboutiquexperts.com 5.447059 dogloversupport.com 7.781513
15 infowars.com 13.735118 politicususa.com 24.333184 scienceblogs.com 28.013445 discoveryk12.com 4.668908 westskinlaser.com 5.447059 rosemonttextiles.com 7.781513
16 cnn.it 13.381631 defenseone.com 24.122689 vactruth.com 28.013445 dlayouts.com 4.668908 1.azdhs.gov 4.771213 st8.fm 7.003361
17 hotair.com 13.359395 wired.com 21.307042 howdovaccinescauseautism.com 25.678991 free24apps.com 3.890756 houstonfor.me 4.668908 safethammer.com 7.003361
18 blog.tenthamendmentcenter.com 13.228571 zdnet.com 20.231933 yournewswire.com 24.684460 glblctzn.me 3.890756 cdc.gov 4.515450 safetravelusa.com 6.225210
19 thetruthaboutguns.com 12.450420 washingtonexaminer.com 18.607729 healthimpactnews.com 24.122689 petition.parliament.uk 3.890756 im-fabulous.com 3.890756 rover.ebay.com 6.163194

Top URLs of each type in each dataset


In [52]:
# List every column of the tweet-level frame `df` for reference.
df.columns


Out[52]:
Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state',
           u'CATS_Counter'],
      dtype='object')

In [53]:
# Raw link counts: for every URL category, show the N most-linked expanded
# domains (URL_DOMAIN) of each topic, one table per category.
N = 5
for url_type in ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]:
    print(url_type)
    # Keep only the rows whose category counter contains this URL type.
    rows_of_type = df_url_tid[df_url_tid.CATS_Counter.apply(lambda cats: url_type in cats)]
    df_t = rows_of_type.pivot_table(
        index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
    )
    # Per topic: a two-column frame (domain under the topic's name, its count).
    top_by_topic = {
        topic: (df_t[topic]
                .sort_values(ascending=False)
                .head(N)
                .reset_index()
                .rename(columns={topic: "Counts", "URL_DOMAIN": topic}))
        for topic in df_t.columns
    }
    display(pd.concat(top_by_topic, axis=1, keys=topic_order))


fakenews
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 breitbart.com 656.0 breitbart.com 239.0 naturalnews.com 556.0 breitbart.com 18.0 articles.mercola.com 10.0 theonion.com 22.0
1 thegatewaypundit.com 177.0 washingtonexaminer.com 182.0 truthkings.com 212.0 americannews.com 13.0 lifezette.com 5.0 dailykos.com 18.0
2 infowars.com 124.0 thedailybeast.com 156.0 infowars.com 150.0 rickwells.us 4.0 beforeitsnews.com 2.0 motherjones.com 12.0
3 dailycaller.com 112.0 dailycaller.com 145.0 yournewswire.com 144.0 commondreams.org 3.0 worldtruth.tv 1.0 zerohedge.com 10.0
4 theblaze.com 107.0 freebeacon.com 128.0 ageofautism.com 124.0 usuncut.com 2.0 reductress.com 1.0 breitbart.com 7.0
news
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 nytimes.com 243.0 newsweek.com 814.0 nytimes.com 391.0 huffingtonpost.com 39.0 cdc.gov 54.0 nytimes.com 109.0
1 cnn.com 213.0 washingtonpost.com 409.0 washingtonpost.com 178.0 npr.org 26.0 skincancer.org 45.0 cnet.com 70.0
2 washingtonpost.com 210.0 politico.com 375.0 forbes.com 167.0 nytimes.com 24.0 foxnews.com 36.0 nydailynews.com 43.0
3 huffingtonpost.com 180.0 wired.com 372.0 foxnews.com 159.0 washingtonpost.com 23.0 yahoo.com 27.0 foxnews.com 39.0
4 foxnews.com 169.0 cnn.com 334.0 mashable.com 140.0 theatlantic.com 23.0 medicalnewstoday.com 25.0 cnn.com 37.0
blog
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 bearingarms.com 89.0 warontherocks.com 373.0 truthinmedia.com 391.0 blogspot.com 33.0 medicaldaily.com 34.0 tumblr.com 86.0
1 blogspot.com 86.0 lawfareblog.com 240.0 wordpress.com 283.0 s.einnews.com 28.0 smallbiztrends.com 27.0 patch.com 76.0
2 vox.com 85.0 blogspot.com 116.0 paraven.net 220.0 wordpress.com 24.0 wordpress.com 26.0 autoblog.com 64.0
3 wordpress.com 82.0 wordpress.com 89.0 today.com 131.0 medium.com 13.0 tumblr.com 21.0 wordpress.com 28.0
4 tumblr.com 69.0 world.einnews.com 86.0 vaccineimpact.com 107.0 militarytimes.com 9.0 webmd.com 19.0 blogspot.com 28.0
socialmedia
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 youtube.com 909.0 youtube.com 877.0 youtube.com 1666.0 facebook.com 474.0 facebook.com 1588.0 instagram.com 1111.0
1 facebook.com 394.0 facebook.com 522.0 facebook.com 1003.0 youtube.com 137.0 instagram.com 1133.0 facebook.com 832.0
2 tumblr.com 69.0 reddit.com 96.0 periscope.tv 395.0 instagram.com 130.0 youtube.com 161.0 youtube.com 624.0
3 instagram.com 51.0 linkedin.com 89.0 instagram.com 238.0 pinterest.com 33.0 pinterest.com 73.0 reddit.com 118.0
4 reddit.com 38.0 tumblr.com 70.0 reddit.com 122.0 linkedin.com 12.0 reddit.com 26.0 vine.co 87.0
scientific
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 complex.com 10.0 brookings.edu 9.0 ncbi.nlm.nih.gov 117.0 brookings.edu 7.0 cdc.gov 54.0 cdc.gov 20.0
1 cato.org 8.0 cato.org 9.0 cdc.gov 93.0 link.springer.com 3.0 ncbi.nlm.nih.gov 8.0 ncbi.nlm.nih.gov 3.0
2 brookings.edu 5.0 newscientist.com 5.0 healio.com 31.0 sites.google.com 2.0 onlinelibrary.wiley.com 4.0 sciencedirect.com 1.0
3 independent.org 2.0 cjr.org 3.0 sciencemag.org 26.0 thegospelcoalition.org 1.0 nature.com 2.0 pediatrics.aappublications.org 1.0
4 ncbi.nlm.nih.gov 1.0 tandfonline.com 3.0 who.int 23.0 tc.columbia.edu 1.0 healio.com 2.0 newscientist.com 1.0
commercial
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 fw.to 79.0 apple.com 245.0 vaxxedthemovie.com 119.0 gofundme.com 43.0 amazon.com 66.0 ebay.com 337.0
1 t.co 38.0 hillaryclinton.com 109.0 t.co 107.0 owler.com 35.0 rocskincare.com 54.0 amazon.com 173.0
2 ooyuz.com 37.0 t.co 107.0 amazon.com 89.0 change.org 32.0 owler.com 49.0 melanienini.net 91.0
3 readfulapp.com 33.0 conferencecalltranscripts.org 78.0 video214.com 84.0 ebay.com 28.0 etsy.com 21.0 etsy.com 62.0
4 axs.com 26.0 amazon.com 65.0 fw.to 64.0 fw.to 12.0 ow.ly 17.0 stores.ebay.com 51.0
videos
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 youtube.com 909.0 youtube.com 877.0 youtube.com 1666.0 youtube.com 137.0 instagram.com 1133.0 instagram.com 1111.0
1 instagram.com 51.0 instagram.com 34.0 instagram.com 238.0 instagram.com 130.0 youtube.com 161.0 youtube.com 624.0
2 pbs.org 19.0 pbs.org 18.0 vimeo.com 52.0 pbs.org 7.0 pbs.org 2.0 vimeo.com 5.0
3 liveleak.com 12.0 vimeo.com 10.0 pbs.org 15.0 vimeo.com 6.0 vimeo.com 1.0 flickr.com 3.0
4 vimeo.com 4.0 livestream.com 8.0 liveleak.com 13.0 livestream.com 1.0 dailymotion.com NaN pbs.org 1.0
UNK
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control Counts Privacy Counts Vaccine Counts Child Education Counts Skin Damage Counts Seat Belt Counts
0 conservativereport.org 133.0 danijobs.com 45.0 greenmedinfo.com 64.0 reverbnation.com 49.0 newbeauty.com 39.0 toonradio.net 66.0
1 hotair.com 58.0 csis.org 42.0 indiewire.com 60.0 wholechildeducation.org 32.0 sherif.ws 33.0 ppv.alipromo.com 39.0
2 thinkprogress.org 54.0 theregister.co.uk 38.0 jamanetwork.com 40.0 epi.org 30.0 sun-sentinel.com 23.0 seekingalpha.com 27.0
3 blog.tenthamendmentcenter.com 52.0 amp.twimg.com 37.0 outbreaknewstoday.com 39.0 allafrica.com 22.0 realself.com 22.0 thecarconnection.com 26.0
4 guns.com 49.0 schneier.com 36.0 snopes.com 39.0 leadership.ng 19.0 seniorcarecorner.com 20.0 amazon.co.uk 25.0

In [54]:
# TF-IDF variant of the per-category ranking: for every URL category, show the
# N expanded domains (URL_DOMAIN) with the highest TF-IDF score in each topic.
N = 5
for url_type in ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]:
    print(url_type)
    # Keep only the rows whose category counter contains this URL type.
    rows_of_type = df_url_tid[df_url_tid.CATS_Counter.apply(lambda cats: url_type in cats)]
    df_t = rows_of_type.pivot_table(
        index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
    )
    # IDF = log10(number of topics in df / number of topics the domain occurs in),
    # computed within the current URL category only.
    n_topics = df.topic_name.unique().shape[0]
    topics_per_domain = ((df_t > 0) * 1.).sum(axis=1)
    IDF = np.log10(n_topics / topics_per_domain)
    df_t = df_t.multiply(IDF, axis=0)
    top_by_topic = {
        topic: (df_t[topic]
                .sort_values(ascending=False)
                .head(N)
                .reset_index()
                .rename(columns={topic: "TF-IDF", "URL_DOMAIN": topic}))
        for topic in df_t.columns
    }
    display(pd.concat(top_by_topic, axis=1, keys=topic_order))


fakenews
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 thegatewaypundit.com 84.450462 freebeacon.com 61.071521 naturalnews.com 167.372678 americannews.com 3.913390 articles.mercola.com 4.771213 theonion.com 3.874008
1 freebeacon.com 37.692579 centerforsecuritypolicy.org 22.566386 truthkings.com 164.968065 usuncut.com 1.556303 lifezette.com 0.880456 zerohedge.com 3.010300
2 thefederalistpapers.org 33.875609 politicususa.com 22.276220 ageofautism.com 96.490755 rickwells.us 1.204120 reductress.com 0.778151 kingworldnews.com 2.334454
3 rightwingnews.com 31.012882 thegatewaypundit.com 19.084850 healthimpactnews.com 85.596638 projectveritas.com 0.778151 worldtruth.tv 0.477121 babylonbee.com 1.556303
4 bizpacreview.com 23.856063 counterpunch.org 18.607729 articles.mercola.com 41.032428 goneleft.com 0.778151 disclose.tv 0.301030 mirror.co.uk 1.505150
news
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 rss.cnn.com 21.130951 newsweek.com 64.453534 hollywoodreporter.com 31.006090 usnews.com 2.641369 skincancer.org 35.016806 cnet.com 21.072100
1 townhall.com 20.602677 thefederalist.com 57.797759 cdc.gov 27.995790 battlecreekenquirer.com 2.334454 cdc.gov 16.255620 cdc.gov 6.020600
2 foxnews.com 13.381631 threatpost.com 51.357983 fda.gov 27.393730 cefi.ca 2.334454 medicalnewstoday.com 7.525750 rss.cnn.com 4.754464
3 rollingstone.com 12.944290 eff.org 45.132773 sciencemag.org 20.231933 ed.gov 2.334454 cancer.org 7.003361 nhtsa.gov 3.890756
4 redstate.com 12.944290 motherboard.vice.com 34.829852 rss.cnn.com 18.841765 well.blogs.nytimes.com 1.806180 healthfinder.gov 5.248334 foxnews.com 3.088069
blog
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 bearingarms.com 42.463792 warontherocks.com 290.250416 truthinmedia.com 186.554411 militarytimes.com 2.709270 smallbiztrends.com 21.010084 autoblog.com 30.535760
1 weaselzippers.us 31.490003 lawfareblog.com 186.756300 paraven.net 171.193275 vitweet.com 1.908485 medicaldaily.com 16.222123 boingboing.net 1.056548
2 vox.com 14.967757 justsecurity.org 49.023529 vaccineimpact.com 83.262184 medium.com 1.029356 webmd.com 5.719570 forctr.com 0.704365
3 cnsnews.com 12.041200 defenseone.com 44.354621 skepticalraptor.com 59.917646 upworthy.com 0.704365 ezinearticles.com 2.334454 bigstory.ap.org 0.554269
4 americanthinker.com 10.019546 zdnet.com 40.078185 visionlaunch.com 57.583193 iflscience.com 0.602060 sciencealert.com 1.505150 en.wikipedia.org 0.528274
socialmedia
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 vine.co 1.056548 academia.edu 2.385606 web.archive.org 4.668908 yelp.com 0.602060 yelp.com 1.204120 vine.co 15.319940
1 storify.com 0.301030 storify.com 0.903090 researchgate.net 1.556303 ask.fm 0.352183 ask.fm 0.176091 ask.fm 1.760913
2 ask.fm 0.176091 vine.co 0.880456 academia.edu 0.477121 9gag.com 0.352183 9gag.com 0.176091 flickr.com 1.431364
3 9gag.com 0.176091 flickr.com 0.477121 vine.co 0.352183 youtube.com 0.000000 youtube.com 0.000000 last.fm 0.778151
4 youtube.com 0.000000 youtube.com 0.000000 storify.com 0.301030 vimeo.com 0.000000 vimeo.com 0.000000 foursquare.com 0.778151
scientific
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 complex.com 4.771213 brookings.edu 2.709270 cdc.gov 27.995790 brookings.edu 2.107210 cdc.gov 16.255620 cdc.gov 6.020600
1 cato.org 2.408240 cato.org 2.709270 sciencemag.org 20.231933 sites.google.com 0.954243 onlinelibrary.wiley.com 0.704365 pediatrics.aappublications.org 0.477121
2 independent.org 1.556303 eos.org 2.334454 who.int 17.897479 link.springer.com 0.903090 ncbi.nlm.nih.gov 0.633450 journals.lww.com 0.477121
3 brookings.edu 1.505150 cjr.org 2.334454 autismspeaks.org 17.897479 thegospelcoalition.org 0.778151 nature.com 0.602060 sciencedirect.com 0.301030
4 annualreviews.org 0.778151 fpri.org 1.556303 journals.plos.org 12.450420 tc.columbia.edu 0.778151 healio.com 0.602060 healio.com 0.301030
commercial
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 act.credoaction.com 3.311330 apple.com 73.752349 vaxxedthemovie.com 92.599999 gofundme.com 7.571924 rocskincare.com 42.020168 melanienini.net 70.811764
1 readfulapp.com 2.612981 conferencecalltranscripts.org 60.695798 video214.com 40.078185 mabelsaveforschool.com 6.225210 etsy.com 3.697916 stores.ebay.com 24.333184
2 tweetedtimes.com 1.187719 techdirt.com 37.351260 twi.gl 6.679698 change.org 2.533800 laskyaesthetics.com 2.334454 deals.ebay.com 11.672269
3 goo.gl 1.056548 hillaryclinton.com 8.630756 shareasale.com 4.671694 education.com 2.334454 youaresimplyradiant.com 1.556303 etsy.com 10.917658
4 vid.staged.com 1.056548 promotedstories.com 6.321630 seattleorganicrestaurants.com 4.294091 promotedstories.com 2.107210 stores.ebay.com 0.954243 redgage.com 3.112605
videos
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 liveleak.com 2.113095 livestream.com 2.408240 liveleak.com 2.289186 livestream.com 0.301030 youtube.com 0.0 flickr.com 1.431364
1 livestream.com 0.903090 liveleak.com 0.704365 dailymotion.com 0.528274 dailymotion.com 0.176091 vimeo.com 0.0 liveleak.com 0.176091
2 dailymotion.com 0.352183 flickr.com 0.477121 youtube.com 0.000000 youtube.com 0.000000 pbs.org 0.0 youtube.com 0.000000
3 youtube.com 0.000000 dailymotion.com 0.176091 vimeo.com 0.000000 vimeo.com 0.000000 instagram.com 0.0 vimeo.com 0.000000
4 vimeo.com 0.000000 youtube.com 0.000000 pbs.org 0.000000 pbs.org 0.000000 dailymotion.com NaN pbs.org 0.000000
UNK
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
Gun Control TF-IDF Privacy TF-IDF Vaccine TF-IDF Child Education TF-IDF Skin Damage TF-IDF Seat Belt TF-IDF
0 conservativereport.org 103.494116 csis.org 32.682353 greenmedinfo.com 49.801680 wholechildeducation.org 24.900840 newbeauty.com 30.347899 toonradio.net 51.357983
1 blog.tenthamendmentcenter.com 40.463865 theregister.co.uk 29.569748 indiewire.com 46.689075 epi.org 14.313638 nerium.com 14.006723 thecarconnection.com 20.231933
2 guns.com 38.129411 schneier.com 28.013445 outbreaknewstoday.com 30.347899 edutopia.org 11.672269 aad.org 12.450420 blog.caranddriver.com 13.228571
3 thetruthaboutguns.com 34.238655 tripwire.com 23.344538 vaxxed.us 23.344538 reverbnation.com 8.628472 skinbarnyc.com 12.450420 nfl.com 13.228571
4 guncrazy.org 32.682353 infoworld.com 20.231933 organiclifestylemagazine.com 20.231933 thelegacyprep.org 8.559664 influenster.com 11.672269 auctioncars.online 13.228571

In [55]:
# Scratch check: parse_qs on an empty query string (recorded output is an empty dict).
parse_qs("")


Out[55]:
{}

In [56]:
# Preview the first rows of the per-URL frame to recall its columns.
df_url_tid.head()


Out[56]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS t_id topic_name CATS_Counter
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com https://twitter.com/photogchad_WTSP/status/682... 0.0 twitter.com socialmedia|twitter 682904901916225536 Skin Damage {u'twitter': 1}
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK 682915876316692480 Child Education {u'UNK': 1}
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682985833821941760 Vaccine {u'commercial': 1}
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682952771746664448 Vaccine {u'commercial': 1}
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews 682830450969059328 Vaccine {u'fakenews': 1}

In [57]:
# Overall (all topics pooled) top-N expanded domains for each URL category.
N = 3
url_types = ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]
# value_counts() yields a Series indexed by domain. Naming the index after the
# category and the values "Counts" gives the intended headers; the previous
# rename keyed on column 0, which did not exist, so the count column was shown
# under the misleading header "URL_DOMAIN".
display(pd.concat({url_type: df_url_tid[
                df_url_tid.CATS_Counter.apply(
                    lambda x: url_type in x
                )].URL_DOMAIN.value_counts().head(N).rename_axis(url_type).reset_index(name="Counts")
           for url_type in url_types},
          axis=1, keys=url_types))


fakenews news blog socialmedia scientific commercial videos UNK
fakenews URL_DOMAIN news URL_DOMAIN blog URL_DOMAIN socialmedia URL_DOMAIN scientific URL_DOMAIN commercial URL_DOMAIN videos URL_DOMAIN UNK URL_DOMAIN
0 breitbart.com 925 nytimes.com 1096 wordpress.com 532 facebook.com 4813 cdc.gov 167 amazon.com 424 youtube.com 4374 conservativereport.org 133
1 naturalnews.com 561 newsweek.com 879 truthinmedia.com 394 youtube.com 4374 ncbi.nlm.nih.gov 130 ebay.com 385 instagram.com 2697 thinkprogress.org 79
2 infowars.com 318 washingtonpost.com 849 warontherocks.com 373 instagram.com 2697 healio.com 34 t.co 289 vimeo.com 78 mediaite.com 69

In [58]:
# Inspect a few expanded URLs that are categorised as "blog" yet resolve to
# the google.com domain.
url_type = "blog"
(df_url_tid
 .loc[lambda d: d.CATS_Counter.apply(lambda cats: url_type in cats)
      & (d.URL_DOMAIN == "google.com")]
 .head()
 .EXPANDED
 .values)


Out[58]:
array(['http://google.com/newsstand/s/CBIw38WH0Cc',
       'https://www.google.com/url?q=http://www.nbcnews.com/health/health-news/amp/de-niro-says-find-truth-vaccines-experts-already-did-n555416',
       'https://www.google.com/url?q=http://www.rte.ie/news/2016/0727/805275-clinton-trump-russia/',
       'https://www.google.com/search?ei=Z_rGV_qqCMnYeM6Lj9AC&q=success+prep+education+center+tutoring&oq=success+prep+education+center+tutoring&gs_l=mobile-gws-serp.3..33i160k1.4306.6098.0.6761.10.10.0.0.0.0.251.1328.0j8j1.9.0....0...1c.1.64.mobile-gws-serp..1.8.1224...33i21k1.nIaNbEtjppQ#fpstate=lie&lrd=0x89c25e4d46a74a13:0x2146c8c477d32c31,3,5',
       'https://www.google.com/search?q=Why+Donald+Trump+Is+Surging+in+the+Polls&safe=off&biw=360&bih=512&prmd=nvi&source=lnms&sa=X&ved=0ahUKEwiGssL7m4TPAhVTgiYKHQKKAJAQ_AUIBCgA&dpr=4'], dtype=object)

In [59]:
# Spot check of get_url_domain on a google.com/url redirect whose target is
# carried in the `url=` query parameter; the recorded output is the target's
# domain rather than google.com.
get_url_domain("https://www.google.com/url?rct=j&sa=t&url=http://www.perthnow.com.au/news/western-australia/social-services-minister-christian-porter-slaps-down-antivaccination-campaigners/news-story/0aa49052ec0598704b05333075581296&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNFAB3aZtdfdVpXOHWzyfqsu0ZSFAg")


Out[59]:
'perthnow.com.au'

Gender analysis


In [60]:
# Preview the per-URL frame again before joining gender onto it.
df_url_tid.head()


Out[60]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS t_id topic_name CATS_Counter
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com https://twitter.com/photogchad_WTSP/status/682... 0.0 twitter.com socialmedia|twitter 682904901916225536 Skin Damage {u'twitter': 1}
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK 682915876316692480 Child Education {u'UNK': 1}
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682985833821941760 Vaccine {u'commercial': 1}
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682952771746664448 Vaccine {u'commercial': 1}
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews 682830450969059328 Vaccine {u'fakenews': 1}

In [61]:
# Attach topic and gender of the posting tweet to every URL row (inner join on
# tweet id), then derive the per-row category counter from the "|"-separated
# CATS string.
df_url_tid_gender = df_urls.merge(
    df[["t_id", "topic_name", u'Gender']],
    how="inner",
    left_on="TID",
    right_on="t_id",
).assign(
    CATS_Counter=lambda d: d.CATS.apply(lambda cats: get_category_counts(cats.split("|")))
)
df_url_tid_gender.head()


Out[61]:
TID URL CATS ORIG_DOMAIN EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS t_id topic_name Gender CATS_Counter
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter twitter.com https://twitter.com/photogchad_WTSP/status/682... 0.0 twitter.com socialmedia|twitter 682904901916225536 Skin Damage F {u'twitter': 1}
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK investirdanslenfance.ca http://www.investirdanslenfance.ca/ 0.0 investirdanslenfance.ca UNK 682915876316692480 Child Education F {u'UNK': 1}
2 682985833821941760 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682985833821941760 Vaccine M {u'commercial': 1}
3 682952771746664448 http://TinyURL.com/NewYearCure commercial tinyurl.com https://video214.com/play/8oLgDB1QXRWpXPysTSg1... 0.0 video214.com commercial 682952771746664448 Vaccine M {u'commercial': 1}
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews yournewswire.com http://yournewswire.com/donald-trump-vaccines-... 0.0 yournewswire.com fakenews 682830450969059328 Vaccine F {u'fakenews': 1}

In [62]:
# Gender label distribution over URL rows; value_counts() leaves out rows
# whose Gender is missing.
df_url_tid_gender.Gender.value_counts()


Out[62]:
M    29285
F    20682
Name: Gender, dtype: int64

In [63]:
# Total number of URL rows (and columns), to compare against the gendered counts.
df_url_tid_gender.shape


Out[63]:
(104279, 12)

In [64]:
# NOTE(review): repeats the earlier Gender.value_counts() cell verbatim; one of
# the two can be dropped.
df_url_tid_gender.Gender.value_counts()


Out[64]:
M    29285
F    20682
Name: Gender, dtype: int64

In [65]:
# Number of URL rows per topic (rows) and gender (columns).
df_url_tid_gender.pivot_table(index="topic_name", columns="Gender", values="TID", aggfunc=len)


Out[65]:
Gender F M
topic_name
Child Education 1199 1180
Gun Control 2653 5739
Privacy 5204 11343
Seat Belt 2922 3744
Skin Damage 2713 1016
Vaccine 5991 6263

In [66]:
# Number of rows of the tweet-level frame `df` per topic (rows) and gender (columns).
df.pivot_table(index="topic_name", columns="Gender", values="t_id", aggfunc=len)


Out[66]:
Gender F M
topic_name
Child Education 2486 2568
Gun Control 5978 12620
Privacy 12935 27648
Seat Belt 18555 20098
Skin Damage 4741 2174
Vaccine 10765 11557

In [67]:
# Number of distinct users (u_id) per topic and gender, with gender unstacked into columns.
df[["topic_name", "u_id", "Gender"]].groupby(["topic_name", "Gender"])["u_id"].nunique().to_frame().unstack()


Out[67]:
u_id
Gender F M
topic_name
Child Education 2028 2101
Gun Control 4532 8874
Privacy 8162 17633
Seat Belt 15191 16662
Skin Damage 3081 1705
Vaccine 4765 6593

In [68]:
# For every URL category, count URL rows per topic and gender and place the
# per-category tables side by side.
N = 3  # not used by this cell; kept so the notebook namespace is unchanged
url_types = ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]
display(pd.concat(
    {
        url_type: df_url_tid_gender.loc[
            # Bind the category as a default so each selector is self-contained.
            lambda d, wanted=url_type: d.CATS_Counter.apply(lambda cats: wanted in cats)
        ].pivot_table(index="topic_name", columns="Gender", values="TID", aggfunc=len)
        for url_type in url_types
    },
    axis=1,
    keys=url_types,
))


fakenews news blog socialmedia scientific commercial videos UNK
Gender F M F M F M F M F M F M F M F M
topic_name
Child Education 6 19 106 126 31 33 174 178 6 4 62 32 60 61 479 494
Gun Control 382 900 665 1417 124 295 200 646 6 6 57 125 117 440 790 1539
Privacy 317 626 1287 3078 270 767 294 657 3 22 128 314 178 341 880 2483
Seat Belt 27 51 147 313 47 79 639 793 5 5 210 94 433 496 513 847
Skin Damage 7 4 94 114 35 28 1525 296 17 11 98 42 808 109 745 415
Vaccine 584 644 985 1196 611 628 1055 1044 144 108 108 311 494 740 1618 1514

In [69]:
# Two-colour palette handed to seaborn via `palette=` in the gender-split plots.
# Earlier alternative that was tried: ["orange", "darkmagenta"]
gender_palette = [
    "#D5F7FF",  # very light blue
    "#494B67",  # dark slate blue
]

In [70]:
%%time
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.factorplot(y="URL_type", x="URL_counts", hue=u'Gender',
                   col="topic_name", col_wrap=3, 
                       col_order=topic_order,
                       kind="bar",
                       errwidth=2,
                       #capsize=0.5,
                       #color="0.2",
                       palette=gender_palette,
                  data=pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
                                          df[df.t_n_urls > 0][["topic_name", u'Gender']],
                                          (df_y[df.t_n_urls > 0] == 1).to_frame()],
                      axis=1).drop(["NONE", "UNK"], axis=1),
                               id_vars=["is_controversial", u'Gender', "topic_name"],
            var_name="URL_type", value_name="URL_counts"
           ))
    #g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    #sns.despine(offset=10)


CPU times: user 14.1 s, sys: 120 ms, total: 14.2 s
Wall time: 14.2 s

In [71]:
# Overall (all topics pooled) proportion of URL-bearing tweets that contain each
# URL type, drawn as one bar per gender.
rows_with_url = df.t_n_urls > 0

per_gender_stats = []
for gender_code, gender_label in [("F", "Female"), ("M", "Male")]:
    # 1.0 where the row has at least one URL of the type, else 0.0; the NONE and
    # UNK columns are left out of the chart.
    presence = (
        df_X[(df.Gender == gender_code) & rows_with_url].drop(["NONE", "UNK"], axis=1) > 0
    ) * 1.
    # describe().T yields one row per URL type; its "mean" column is the
    # proportion and "index" (after reset_index) holds the URL-type name.
    per_gender_stats.append(
        presence.describe().T.reset_index().assign(Gender=gender_label)
    )

label_sizes = {
    "axes.titlesize": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
}
with sns.plotting_context(rc=label_sizes), sns.axes_style(rc={"font.family": "monospace"}):
    ax = sns.barplot(
        x="mean",
        y="index",
        hue="Gender",
        data=pd.concat(per_gender_stats, axis=0),
        palette=gender_palette,
    )
    ax.set_xlabel("Proportion of tweets with given URL type")
    ax.set_ylabel("URL types")
    sns.despine(offset=10)



In [72]:
# Female-to-male ratio of URL-type prevalence, normalised so that the "twitter"
# URL type equals 1.
#
# For each gender, restricted to rows with at least one URL (t_n_urls > 0), the
# "mean" of the 0/1 presence indicator is the share of rows containing that URL
# type. The NONE column is dropped; UNK is kept.
female_share = (
    (df_X[(df.Gender == "F") & (df.t_n_urls > 0)].drop(["NONE"], axis=1) > 0) * 1.
).describe().T["mean"]
male_share = (
    (df_X[(df.Gender == "M") & (df.t_n_urls > 0)].drop(["NONE"], axis=1) > 0) * 1.
).describe().T["mean"]

# Both Series are named "mean", so the resulting single column is "mean".
df_t = (female_share / male_share).to_frame()

# Rescale relative to the twitter row. `.loc` replaces the deprecated `.ix`
# indexer (label-based lookup, identical result, no DeprecationWarning).
df_t["mean"] = df_t["mean"] / df_t.loc["twitter", "mean"]

# NOTE(review): the displayed value is a ratio of proportions relative to the
# twitter baseline rather than an odds ratio in the strict sense; the column
# label is kept unchanged - confirm the intended wording.
df_t.reset_index().rename(columns={"index": "URL type", "mean": "Odds in Females versus Males"})


/home/content/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:5: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
Out[72]:
URL type Odds in Females versus Males
0 UNK 0.939866
1 blog 0.824964
2 commercial 1.002631
3 fakenews 0.793072
4 news 0.713371
5 scientific 1.541865
6 socialmedia 1.477333
7 twitter 1.000000
8 videos 1.314732

In [73]:
# Logistic regression of "author is female" on the URL-type columns.
# Rows used: at least one URL and a Gender of exactly "M" or "F".
logit_rows = (df.t_n_urls > 0) & (df.Gender.isin({"M", "F"}))

# Design data: the Gender column next to every df_X column except NONE.
logit_data = pd.concat(
    [df.Gender[logit_rows], df_X[logit_rows].drop("NONE", axis=1)],
    axis=1,
)

# Response is 1.0 for "F" and 0.0 otherwise; the trailing "-1" removes the
# intercept term from the formula.
logit_formula = ("I((Gender == 'F') * 1.)"
                 "~ UNK + blog + commercial +"
                 " fakenews + news+ scientific"
                 " + socialmedia + twitter + videos -1")

model = sm.Logit.from_formula(logit_formula, data=logit_data)
res = model.fit()
res.summary2()


Optimization terminated successfully.
         Current function value: 0.672335
         Iterations 4
Out[73]:
Model: Logit Pseudo R-squared: 0.010
Dependent Variable: I((Gender == 'F') * 1.) AIC: 65780.4237
Date: 2018-01-19 16:10 BIC: 65859.6026
No. Observations: 48906 Log-Likelihood: -32881.
Df Model: 8 LL-Null: -33208.
Df Residuals: 48897 LLR p-value: 9.6414e-136
Converged: 1.0000 Scale: 1.0000
No. Iterations: 4.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
UNK -0.3578 0.0182 -19.6582 0.0000 -0.3935 -0.3222
blog -0.4988 0.0381 -13.0986 0.0000 -0.5734 -0.4242
commercial -0.2757 0.0503 -5.4781 0.0000 -0.3744 -0.1771
fakenews -0.3559 0.0279 -12.7557 0.0000 -0.4106 -0.3012
news -0.6215 0.0216 -28.7840 0.0000 -0.6638 -0.5792
scientific 0.3428 0.1124 3.0508 0.0023 0.1226 0.5630
socialmedia 0.2503 0.0349 7.1775 0.0000 0.1820 0.3187
twitter -0.3043 0.0179 -16.9567 0.0000 -0.3395 -0.2691
videos -0.2772 0.0459 -6.0335 0.0000 -0.3672 -0.1872

In [74]:
# Marginal effects of the fitted Logit model, using get_margeff()'s default
# settings (the rendered summary reports method "dydx" evaluated at "overall").
marginal_effects = res.get_margeff()
marginal_effects.summary()


Out[74]:
Logit Marginal Effects
Dep. Variable: I((Gender == 'F') * 1.)
Method: dydx
At: overall
dy/dx std err z P>|z| [0.025 0.975]
UNK -0.0858 0.004 -19.972 0.000 -0.094 -0.077
blog -0.1196 0.009 -13.190 0.000 -0.137 -0.102
commercial -0.0661 0.012 -5.485 0.000 -0.090 -0.042
fakenews -0.0853 0.007 -12.838 0.000 -0.098 -0.072
news -0.1490 0.005 -29.774 0.000 -0.159 -0.139
scientific 0.0822 0.027 3.052 0.002 0.029 0.135
socialmedia 0.0600 0.008 7.193 0.000 0.044 0.076
twitter -0.0730 0.004 -17.158 0.000 -0.081 -0.065
videos -0.0665 0.011 -6.042 0.000 -0.088 -0.045

In [75]:
%%time
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.factorplot(y="URL_type", x="URL_counts", #hue="is_controversial",
                   col="topic_name", col_wrap=3, 
                       #col_order=topic_order,
                       kind="bar",
                       color="0.5",
                       errwidth=2,
                  data=pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
                                          df[(df.t_n_urls > 0) & (df.topic_name == "Vaccine")][["topic_name"]],
                                          (df_y[df.t_n_urls > 0] == 1).to_frame()],
                      axis=1).drop(["NONE", "UNK"], axis=1),
                               id_vars=["is_controversial", "topic_name"],
            var_name="URL_type", value_name="URL_counts"
           ))
    #g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    #sns.despine(offset=10)
    plt.savefig("URL_proportions.atleast1url.vaccine.pdf", bbox_inches="tight")


CPU times: user 6.3 s, sys: 68 ms, total: 6.36 s
Wall time: 5.17 s

In [76]:
"""
These numbers will not add up to 1, as the same URL can be counted in multiple categories.
Hence, the proportions are more reflective of how the different categories of URLs are represented in the dataset.
"""

# Mean of the boolean "has at least one URL of this type" flag per URL type,
# over rows with at least one URL. NONE and UNK are kept here.
rows_with_url = df.t_n_urls > 0

# NOTE(review): the Vaccine condition only decides which rows receive a
# topic_name value in the concatenated frame; it does not remove rows. The
# groupby below is on URL_type alone, so the means are taken over every
# URL-bearing row regardless of topic - confirm whether a Vaccine-only figure
# was intended.
vaccine_topic = df[rows_with_url & (df.topic_name == "Vaccine")][["topic_name"]]

url_presence_wide = pd.concat(
    [
        df_X[rows_with_url] > 0,
        vaccine_topic,
        (df_y[rows_with_url] == 1).to_frame(),
    ],
    axis=1,
)
url_presence_long = pd.melt(
    url_presence_wide,
    id_vars=["is_controversial", "topic_name"],
    var_name="URL_type",
    value_name="URL_counts",
)
url_presence_long.groupby("URL_type")["URL_counts"].mean()


Out[76]:
URL_type
NONE           0.000000
UNK            0.296547
blog           0.062200
commercial     0.035429
fakenews       0.068566
news           0.190506
scientific     0.006572
socialmedia    0.134376
twitter        0.227401
videos         0.070533
Name: URL_counts, dtype: float64

In [ ]: