In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict
import matplotlib.pyplot as plt
import re
from collections import Counter
import statsmodels.api as sm
import seaborn as sns
from urllib.parse import urlsplit, parse_qs
from sklearn.feature_extraction import DictVectorizer
from IPython.display import display
In [2]:
sns.set_context("poster")
sns.set_style("ticks")
In [3]:
TOPIC_MAPPING = {
    "GunControl": "Gun Control",
    "Privacy": "Privacy",
    "Vaccine": "Vaccine",
    "ChildEducation": "Child Education",
    "SkinDamage": "Skin Damage",
    "SeatBelt": "Seat Belt"
}
topic_order = ["Gun Control", "Privacy", "Vaccine",
               "Child Education", "Skin Damage", "Seat Belt"]
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").rename(columns={
    # u'is_controvertial': u'is_controversial'
}).assign(
    topic_name=lambda x: x.topic_name.apply(lambda k: TOPIC_MAPPING[k.split('/')[0]]),
)
df.shape
Out[3]:
In [4]:
print ("Total instances in data is %s,\n"
" with URLs is %s (%.2f%%),\n"
" with user locations is %s (%.2f%%),\n"
" and with gender is %s (%.2f%%)") % (
df.shape[0],
df[df.t_n_urls > 0].shape[0], df[df.t_n_urls > 0].shape[0]*100./ df.shape[0],
df[~df.u_state.isnull()].shape[0], df[~df.u_state.isnull()].shape[0]*100./ df.shape[0],
df[~df.Gender.isnull()].shape[0], df[~df.Gender.isnull()].shape[0]*100./ df.shape[0],
)
pd.DataFrame({
"Overall": df.topic_name.value_counts(),
"With URLs": df[df.t_n_urls > 0].topic_name.value_counts(),
"With user locations": df[~df.u_state.isnull()].topic_name.value_counts(),
"With user gender": df[~df.Gender.isnull()].topic_name.value_counts(),
})#.sort_values("Overall")
Out[4]:
In [5]:
pd.concat([df.groupby(["topic_name"])["t_n_urls"].agg([np.sum, np.mean, len]),
df[df.t_n_urls > 0].topic_name.value_counts().to_frame()
], axis=1).rename(columns={"topic_name": "No. of tweets with URLs",
"sum": "No. of URLs",
"mean": "Mean URLs",
"len": "No. of tweets"
})
Out[5]:
In [6]:
df[~df.Gender.isnull()].Gender.value_counts()
Out[6]:
In [7]:
df.u_location.value_counts().head()
Out[7]:
In [8]:
df.u_location.value_counts().shape
Out[8]:
In [9]:
df.columns
Out[9]:
In [10]:
df[~df.CATS.isnull()].CATS.head()
Out[10]:
In [11]:
# Collapse fine-grained categories: satire and clickbait count as fakenews,
# and usgov counts as news.
CAT_MAPPINGS = {
    "satire": "fakenews",
    "clickbait": "fakenews",
    "usgov": "news"
}

def get_category_counts(x):
    # NaNs were filled with 0 upstream: the tweet has no recorded URL categories.
    if x == 0:
        return Counter(["NONE"])
    c = Counter([CAT_MAPPINGS.get(k, k) for k in x])
    # A tweet linking back to twitter is labelled only as "twitter",
    # regardless of any other categories present.
    if "twitter" in c:
        c = Counter({"twitter": c["twitter"]})
    return c
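# Illustrative behaviour of the rules above (hypothetical inputs):
#   get_category_counts(["satire", "news"])  -> Counter({"fakenews": 1, "news": 1})
#   get_category_counts(["twitter", "news"]) -> Counter({"twitter": 1})   # twitter dominates
#   get_category_counts(0)                   -> Counter({"NONE": 1})      # no categories recorded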
In [12]:
df.CATS.fillna(0).apply(get_category_counts).head()
Out[12]:
In [13]:
df.CATS.fillna(0).apply(get_category_counts).apply(len).describe()
Out[13]:
In [14]:
df[~df.CATS.isnull()].CATS.head()
Out[14]:
In [15]:
df["CATS_Counter"] = df.CATS.fillna(0).apply(get_category_counts)
df.loc[df.CATS_Counter.apply(len) == 2, "CATS_Counter"].head()
Out[15]:
In [16]:
df.loc[df.CATS_Counter.apply(len) == 2, "CATS_Counter"].shape
Out[16]:
In [17]:
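# Overall category counts across the dataset, obtained by summing the per-tweet Counters.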
pd.Series(sum(df.CATS_Counter.values[1:], df.CATS_Counter.values[0])).to_frame().reset_index()
Out[17]:
In [18]:
%%time
df_X = df.CATS_Counter.apply(lambda x: pd.Series(x)).fillna(0.)
df_y = df.is_controversial * 1.
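# df_X: one column per URL category, holding per-tweet category counts
# (tweets without categorised URLs fall into the "NONE" column).
# df_y: the is_controversial flag recast as 0./1.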
In [19]:
df_X.head()
Out[19]:
In [20]:
model = sm.Logit(df_y, df_X)
res = model.fit()
res.summary2()
Out[20]:
In [21]:
res.get_margeff(at="zero").summary()
Out[21]:
In [22]:
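# Refit on tweets with at least one URL; the catch-all "NONE" column is dropped,
# since it should be (near-)constant zero in this subset.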
model = sm.Logit(df_y[(df.t_n_urls > 0)], df_X[(df.t_n_urls > 0)].drop("NONE", axis=1))
res = model.fit()
res.summary2()
Out[22]:
In [23]:
df.topic_name.value_counts()
Out[23]:
In [24]:
%%time
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    g = sns.catplot(y="URL_type", x="URL_counts",  # hue="is_controversial",
                    col="topic_name", col_wrap=3,
                    col_order=topic_order,
                    kind="bar",
                    color="0.5",
                    errwidth=2,
                    data=pd.melt(pd.concat([df_X > 0,
                                            df[["topic_name"]],
                                            (df_y == 1).to_frame()],
                                           axis=1).drop(["NONE", "UNK"], axis=1),
                                 id_vars=["is_controversial", "topic_name"],
                                 var_name="URL_type", value_name="URL_counts"
                                 ))
    # g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    # sns.despine(offset=10)
    plt.savefig("URL_proportions.all.pdf", bbox_inches="tight")
In [25]:
%%time
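# Same plot as above, restricted to tweets containing at least one URL.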
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    g = sns.catplot(y="URL_type", x="URL_counts",  # hue="is_controversial",
                    col="topic_name", col_wrap=3,
                    col_order=topic_order,
                    kind="bar",
                    color="0.5",
                    errwidth=2,
                    data=pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
                                            df[df.t_n_urls > 0][["topic_name"]],
                                            (df_y[df.t_n_urls > 0] == 1).to_frame()],
                                           axis=1).drop(["NONE", "UNK"], axis=1),
                                 id_vars=["is_controversial", "topic_name"],
                                 var_name="URL_type", value_name="URL_counts"
                                 ))
    # g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    # sns.despine(offset=10)
    plt.savefig("URL_proportions.atleast1url.pdf", bbox_inches="tight")
In [26]:
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    ax = sns.barplot(x="mean", y="index", hue="label",
                     data=pd.concat([
                         ((df_X[
                             (df_y == 0)
                             & (df.t_n_urls > 0)
                         ].drop(["NONE", "UNK"],
                                axis=1) > 0) * 1.).describe().T.reset_index().assign(
                             label="Non-Controversial"),
                         ((df_X[
                             (df_y == 1)
                             & (df.t_n_urls > 0)
                         ].drop(["NONE", "UNK"],
                                axis=1) > 0) * 1.).describe().T.reset_index().assign(
                             label="Controversial"),
                     ], axis=0),
                     palette=["k", "0.5"]
                     )
    ax.set_xlabel("Proportion of tweets with given URL type")
    ax.set_ylabel("URL types")
    sns.despine(offset=10)
In [27]:
df_t = (
((df_X[(df_y == 1) & (df.t_n_urls > 0)].drop(["NONE"], axis=1) > 0)* 1.).describe().T["mean"]/
((df_X[(df_y == 0) & (df.t_n_urls > 0)].drop(["NONE"], axis=1) > 0)* 1.).describe().T["mean"]
).to_frame()
df_t["mean"] = (df_t["mean"]/df_t.ix["twitter", "mean"])
df_t.reset_index().rename(columns={"index": "URL type", "mean": "Odds in controversial topic"})
Out[27]:
In [28]:
df.shape
Out[28]:
In [29]:
df[df.t_n_urls > 0].shape
Out[29]:
In [30]:
df.t_n_urls.sum()
Out[30]:
In [31]:
df.topic_name.value_counts().to_frame()
Out[31]:
In [32]:
def sum_url_present(x):
    # Number of rows with at least one URL.
    return (x > 0).sum()

def mean_url_present(x):
    # Proportion of rows with at least one URL.
    return (x > 0).mean()

def get_url_domain(x):
    """Normalise a URL to its domain, unwrapping linkis.com and Google redirect URLs."""
    x = urlsplit(x.lower())
    if x.netloc in {"linkis.com", "www.linkis.com"}:
        # linkis.com embeds the target URL in its path,
        # e.g. http://linkis.com/example.com/abc -> example.com
        if x.path[1:] != "":
            x = urlsplit("http:/%s" % x.path).netloc
        else:
            x = x.netloc
    elif x.netloc in {"google.com", "www.google.com"}:
        # Google redirect URLs carry the target in the "url" query parameter.
        query = parse_qs(x.query)
        if "url" in query:
            return get_url_domain(query["url"][0])
        x = x.netloc
    else:
        x = x.netloc
    if x.startswith("www."):
        x = x[4:]
    # Collapse hosted-blog subdomains to the hosting platform.
    if x.endswith(".wordpress.com") or x.endswith(".tumblr.com") or x.endswith(".blogspot.com"):
        x = x.split(".", 1)[-1]
    return x
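# A few illustrative cases, following the rules above (hypothetical URLs):
#   get_url_domain("http://www.nytimes.com/a")        -> "nytimes.com"   (strip "www.")
#   get_url_domain("http://foo.blogspot.com/post")    -> "blogspot.com"  (collapse hosted blogs)
#   get_url_domain("http://linkis.com/example.com/x") -> "example.com"   (unwrap linkis.com)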
print(get_url_domain("https://www.google.com/url?rct=j&sa=t&url=http://www.perthnow.com.au/news/western-australia/social-services-minister-christian-porter-slaps-down-antivaccination-campaigners/news-story/0aa49052ec0598704b05333075581296&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNFAB3aZtdfdVpXOHWzyfqsu0ZSFAg"))
In [33]:
df.groupby("topic_name")["t_n_urls"].agg([len, np.sum, np.mean,
sum_url_present, mean_url_present]).sort_values("mean", ascending=False)
Out[33]:
In [34]:
df_urls = pd.read_csv("TID_URL_CATS.txt", sep="\t")
df_urls["ORIG_DOMAIN"] = df_urls.URL.apply(get_url_domain)
df_urls.head()
Out[34]:
In [35]:
df_url_exp = pd.read_csv("URL_CAT_MAPPINGS.txt", sep="\t")
df_url_exp.head()
Out[35]:
In [36]:
df_urls.shape, df_url_exp.shape
Out[36]:
In [37]:
df_urls = df_urls.merge(df_url_exp, how="left", on="URL")
df_urls.head()
Out[37]:
In [38]:
## For URLs that were never expanded (e.g. twitter-internal links),
## fall back to the original URL, domain, and categories.
df_urls.loc[df_urls.EXPANDED.isnull(), "EXPANDED_STATUS"] = 0
df_urls.loc[df_urls.EXPANDED.isnull(), "URL_DOMAIN"] = df_urls.loc[df_urls.EXPANDED.isnull(), "ORIG_DOMAIN"]
df_urls.loc[df_urls.EXPANDED.isnull(), "URL_CATS"] = df_urls.loc[df_urls.EXPANDED.isnull(), "CATS"]
df_urls.loc[df_urls.EXPANDED.isnull(), "EXPANDED"] = df_urls.loc[df_urls.EXPANDED.isnull(), "URL"]
df_urls.head()
Out[38]:
In [39]:
df_urls.URL_DOMAIN.value_counts().head(10).to_frame()
Out[39]:
In [40]:
df_urls.ORIG_DOMAIN.value_counts().head(10).to_frame()
Out[40]:
In [41]:
df_urls.merge(df[["t_id", "topic_name"]], how="left", left_on="TID", right_on="t_id").head()
Out[41]:
In [42]:
get_url_domain("http://linkis.com/thinkingmomsrevolution.com/lXwma")
Out[42]:
In [43]:
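# In df_urls, categories are stored as a "|"-separated string, hence the split
# before counting.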
df_url_tid = df_urls.merge(df[["t_id", "topic_name"]], how="inner", left_on="TID", right_on="t_id")
df_url_tid["CATS_Counter"] = df_url_tid.CATS.apply(lambda x: get_category_counts(x.split("|")))
df_url_tid.head()
Out[43]:
In [44]:
df_url_tid.shape
Out[44]:
In [45]:
df_url_tid.ORIG_DOMAIN.unique().shape
Out[45]:
In [46]:
df_url_tid.URL.unique().shape
Out[46]:
In [47]:
df_url_tid.URL_DOMAIN.unique().shape, df_url_tid.EXPANDED.unique().shape
Out[47]:
In [48]:
df_t = df_url_tid.pivot_table(
index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)
pd.concat({c: df_t[c].sort_values(ascending=False).head(20).reset_index().rename(columns={c: "Counts",
"URL_DOMAIN": c})
for c in df_t.columns},
axis=1, keys=topic_order)
Out[48]:
In [49]:
df_t = df_url_tid.pivot_table(
index="ORIG_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)
pd.concat({c: df_t[c].sort_values(ascending=False).head(20).reset_index().rename(columns={c: "Counts",
"ORIG_DOMAIN": c})
for c in df_t.columns},
axis=1, keys=topic_order)
Out[49]:
In [50]:
df_t = df_url_tid.pivot_table(
index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)
# IDF over topics: log10(#topics / #topics in which the domain appears).
IDF = np.log10(df.topic_name.unique().shape[0] / ((df_t > 0) * 1.).sum(axis=1))
df_t = df_t.multiply(IDF, axis=0)
print("Using expanded URL domains")
pd.concat({c: df_t[c].sort_values(ascending=False).head(20).reset_index().rename(columns={c: "TF-IDF",
"URL_DOMAIN": c})
for c in df_t.columns},
axis=1, keys=topic_order)
Out[50]:
In [51]:
df_t = df_url_tid.pivot_table(
index="ORIG_DOMAIN", columns="topic_name", values="TID", aggfunc=len
)
IDF = np.log10(df.topic_name.unique().shape[0] / ((df_t > 0) * 1.).sum(axis=1))
df_t = df_t.multiply(IDF, axis=0)
print("Using original (unexpanded) URL domains")
pd.concat({c: df_t[c].sort_values(ascending=False).head(20).reset_index().rename(columns={c: "TF-IDF",
"ORIG_DOMAIN": c})
for c in df_t.columns},
axis=1, keys=topic_order)
Out[51]:
In [52]:
df.columns
Out[52]:
In [53]:
N = 5
for url_type in ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]:
    print(url_type)
    df_t = df_url_tid[df_url_tid.CATS_Counter.apply(lambda x: url_type in x)].pivot_table(
        index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
    )
    display(pd.concat({c: df_t[c].sort_values(ascending=False).head(N)
                          .reset_index().rename(columns={c: "Counts", "URL_DOMAIN": c})
                       for c in df_t.columns},
                      axis=1, keys=topic_order))
In [54]:
N = 5
for url_type in ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]:
    print(url_type)
    df_t = df_url_tid[df_url_tid.CATS_Counter.apply(lambda x: url_type in x)].pivot_table(
        index="URL_DOMAIN", columns="topic_name", values="TID", aggfunc=len
    )
    # TF-IDF weighting, as in the domain tables above.
    IDF = np.log10(df.topic_name.unique().shape[0] / ((df_t > 0) * 1.).sum(axis=1))
    df_t = df_t.multiply(IDF, axis=0)
    display(pd.concat({c: df_t[c].sort_values(ascending=False).head(N)
                          .reset_index().rename(columns={c: "TF-IDF", "URL_DOMAIN": c})
                       for c in df_t.columns},
                      axis=1, keys=topic_order))
In [55]:
parse_qs("")
Out[55]:
In [56]:
df_url_tid.head()
Out[56]:
In [57]:
N = 3
url_types = ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]
display(pd.concat({url_type: df_url_tid[
df_url_tid.CATS_Counter.apply(
lambda x: url_type in x
)].URL_DOMAIN.value_counts().head(N).reset_index().rename(columns={0: "Counts", "index": url_type})
for url_type in url_types},
axis=1, keys=url_types))
In [58]:
url_type = "blog"
df_url_tid[(df_url_tid.CATS_Counter.apply(
lambda x: url_type in x
)) & (df_url_tid.URL_DOMAIN == "google.com")].head().EXPANDED.values
Out[58]:
In [59]:
get_url_domain("https://www.google.com/url?rct=j&sa=t&url=http://www.perthnow.com.au/news/western-australia/social-services-minister-christian-porter-slaps-down-antivaccination-campaigners/news-story/0aa49052ec0598704b05333075581296&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNFAB3aZtdfdVpXOHWzyfqsu0ZSFAg")
Out[59]:
In [60]:
df_url_tid.head()
Out[60]:
In [61]:
df_url_tid_gender = df_urls.merge(df[["t_id", "topic_name", "Gender"]], how="inner", left_on="TID", right_on="t_id")
df_url_tid_gender["CATS_Counter"] = df_url_tid_gender.CATS.apply(lambda x: get_category_counts(x.split("|")))
df_url_tid_gender.head()
Out[61]:
In [62]:
df_url_tid_gender.Gender.value_counts()
Out[62]:
In [63]:
df_url_tid_gender.shape
Out[63]:
In [65]:
df_url_tid_gender.pivot_table(index="topic_name", columns="Gender", values="TID", aggfunc=len)
Out[65]:
In [66]:
df.pivot_table(index="topic_name", columns="Gender", values="t_id", aggfunc=len)
Out[66]:
In [67]:
df[["topic_name", "u_id", "Gender"]].groupby(["topic_name", "Gender"])["u_id"].nunique().to_frame().unstack()
Out[67]:
In [68]:
N = 3
url_types = ["fakenews", "news", "blog", "socialmedia", "scientific", "commercial", "videos", "UNK"]
display(pd.concat({url_type: df_url_tid_gender[
df_url_tid_gender.CATS_Counter.apply(
lambda x: url_type in x
)].pivot_table(index="topic_name", columns="Gender", values="TID", aggfunc=len)#.reset_index()#.rename(columns={0: "Counts", "index": url_type})
for url_type in url_types},
axis=1, keys=url_types))
In [69]:
#gender_palette = ["orange", "darkmagenta"]
gender_palette = ["#D5F7FF", "#494B67"]
In [70]:
%%time
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    g = sns.catplot(y="URL_type", x="URL_counts", hue="Gender",
                    col="topic_name", col_wrap=3,
                    col_order=topic_order,
                    kind="bar",
                    errwidth=2,
                    # capsize=0.5,
                    # color="0.2",
                    palette=gender_palette,
                    data=pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
                                            df[df.t_n_urls > 0][["topic_name", "Gender"]],
                                            (df_y[df.t_n_urls > 0] == 1).to_frame()],
                                           axis=1).drop(["NONE", "UNK"], axis=1),
                                 id_vars=["is_controversial", "Gender", "topic_name"],
                                 var_name="URL_type", value_name="URL_counts"
                                 ))
    # g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    # sns.despine(offset=10)
In [71]:
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    ax = sns.barplot(x="mean", y="index", hue="Gender",
                     data=pd.concat([
                         ((df_X[(df.Gender == "F") & (df.t_n_urls > 0)].drop(["NONE", "UNK"], axis=1) > 0) * 1.).describe().T.reset_index().assign(Gender="Female"),
                         ((df_X[(df.Gender == "M") & (df.t_n_urls > 0)].drop(["NONE", "UNK"], axis=1) > 0) * 1.).describe().T.reset_index().assign(Gender="Male"),
                     ], axis=0),
                     palette=gender_palette
                     )
    ax.set_xlabel("Proportion of tweets with given URL type")
    ax.set_ylabel("URL types")
    sns.despine(offset=10)
In [72]:
df_t = (
((df_X[(df.Gender == "F") & (df.t_n_urls > 0)].drop(["NONE"], axis=1) > 0)* 1.).describe().T["mean"]/
((df_X[(df.Gender == "M") & (df.t_n_urls > 0)].drop(["NONE"], axis=1) > 0)* 1.).describe().T["mean"]
).to_frame()
df_t["mean"] = (df_t["mean"]/df_t.ix["twitter", "mean"])
df_t.reset_index().rename(columns={"index": "URL type", "mean": "Odds in Females versus Males"})
Out[72]:
In [73]:
model = sm.Logit.from_formula(("I((Gender == 'F') * 1.)"
                               " ~ UNK + blog + commercial +"
                               " fakenews + news + scientific"
                               " + socialmedia + twitter + videos - 1"),
                              data=pd.concat([df.Gender[(df.t_n_urls > 0) & (df.Gender.isin({"M", "F"}))],
                                              df_X[(df.t_n_urls > 0) & (df.Gender.isin({"M", "F"}))].drop("NONE", axis=1)],
                                             axis=1))
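# The trailing "- 1" removes the intercept, so each category's coefficient is
# estimated against a zero baseline rather than an omitted reference category.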
res = model.fit()
res.summary2()
Out[73]:
In [74]:
res.get_margeff().summary()
Out[74]:
In [75]:
%%time
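# Vaccine-only variant: topic_name is kept only for "Vaccine" rows, so the
# concat leaves NaNs elsewhere and the plot effectively shows just that topic.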
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 14,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    g = sns.catplot(y="URL_type", x="URL_counts",  # hue="is_controversial",
                    col="topic_name", col_wrap=3,
                    # col_order=topic_order,
                    kind="bar",
                    color="0.5",
                    errwidth=2,
                    data=pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
                                            df[(df.t_n_urls > 0) & (df.topic_name == "Vaccine")][["topic_name"]],
                                            (df_y[df.t_n_urls > 0] == 1).to_frame()],
                                           axis=1).drop(["NONE", "UNK"], axis=1),
                                 id_vars=["is_controversial", "topic_name"],
                                 var_name="URL_type", value_name="URL_counts"
                                 ))
    # g.set_xticklabels(rotation=90)
    g.set_titles("{col_name}").set_axis_labels("URL type proportion", "")
    # sns.despine(offset=10)
    plt.savefig("URL_proportions.atleast1url.vaccine.pdf", bbox_inches="tight")
In [76]:
"""
These numbers will not add to 1 as the same URL can be counted in multiple categories.
Hence, the proportions are more reflective of how different category of URLs are present in the dataset.
"""
pd.melt(pd.concat([df_X[df.t_n_urls > 0] > 0,
df[(df.t_n_urls > 0) & (df.topic_name == "Vaccine")][["topic_name"]],
(df_y[df.t_n_urls > 0] == 1).to_frame()],
axis=1),#.drop(["NONE", "UNK"], axis=1),
id_vars=["is_controversial", "topic_name"],
var_name="URL_type", value_name="URL_counts"
).groupby("URL_type")["URL_counts"].mean()
Out[76]:
In [ ]: