In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict
import re
In [2]:
DATA_DIRS = [
    "ChildEducation/Education",
    "Privacy/Privacy",
    "Vaccine/vaccine",
    "SeatBelt/Seatbelt",
    "GunControl/gunControl",
    "SkinDamage/SkinDamage",
]
CONTROVERSIAL_TOPICS = [
    "Privacy/Privacy",
    "Vaccine/vaccine",
    "GunControl/gunControl",
]
USER_HANDLE_REGEX = re.compile(r'twitter\.com/(.+)/status/(.+)')
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/12233311')
Out[2]:
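The two capture groups are the user handle and the tweet ID, so the call above should return [('malkanen', '12233311')].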
In [3]:
#%%time
datasets = {}
for dirname in DATA_DIRS:
    print(dirname)
    # The processed labels and the original (de-duplicated) tweets are row-aligned files.
    df = pd.read_csv("%s_processed.csv" % dirname)
    df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
    print(df_orig.shape, df.shape)
    assert df_orig.shape[0] == df.shape[0], "Shape mismatch between df_orig and df"
    df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
    print(df_merged.shape)
    assert df_merged.shape[0] == df.shape[0], "Shape mismatch between df_merged and df"
    # Sanity check: the Author column must match the handle embedded in the tweet URL.
    assert ((df_merged.Author != df_merged.URL.apply(
        lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0][0])
    ).sum() == 0), "Authors are not the same in merged and other."
    # The second capture group of USER_HANDLE_REGEX is the numeric tweet ID.
    df_merged["t_id"] = df_merged["URL"].apply(
        lambda x: USER_HANDLE_REGEX.findall(x)[0][1]).astype(int)
    df_merged = df_merged.drop_duplicates(subset=["t_id"])
    print(df_merged.shape)
    for label_col in ["sentiment", "sentiment_subj", "subjectvity_type", "negation"]:
        print(df_merged[label_col].value_counts())
    datasets[dirname] = df_merged
In [4]:
def get_user_from_tweet_url(x):
return "@%s" % USER_HANDLE_REGEX.findall(x)[0][0]
In [5]:
df_meta = pd.read_csv("TID_META.txt", sep="\t", encoding='utf-8')
df_meta.shape
Out[5]:
In [6]:
df_meta = df_meta.drop_duplicates(subset=["t_id"])
df_meta.shape
Out[6]:
In [7]:
df_meta.t_id.value_counts().head()
Out[7]:
In [8]:
df_meta.dtypes
Out[8]:
In [9]:
df_meta.columns
Out[9]:
In [10]:
# assign() broadcasts the scalar topic label across every row of each frame.
df_all = pd.concat([v.assign(topic_name=k) for k, v in datasets.items()], axis=0)
df_all.shape
Out[10]:
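Note that pd.concat along axis=0 keeps each frame's original index, so index labels repeat across topics. If label-based indexing is needed later, a reset (a minimal precaution, not part of the original flow) avoids surprises:

df_all = df_all.reset_index(drop=True)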
In [11]:
df_all.dtypes
Out[11]:
In [12]:
df_all.t_id.value_counts().head()
Out[12]:
In [13]:
df_all.topic_name.value_counts()
Out[13]:
In [14]:
df_merged_meta = df_all.merge(df_meta, how="left", on="t_id")
df_merged_meta.shape
Out[14]:
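Because df_meta was de-duplicated on t_id above, this left merge should preserve the row count of df_all; a cheap guard (a sketch, not in the original) would be:

assert df_merged_meta.shape[0] == df_all.shape[0], "meta merge inflated the row count"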
In [15]:
df_merged_meta.t_id.value_counts().head()
Out[15]:
In [16]:
df_merged_meta[df_merged_meta.t_id == 700042121877835776][["topic_name"]]
Out[16]:
In [17]:
df_merged_meta.t_id.value_counts()[df_merged_meta.t_id.value_counts() > 1]
Out[17]:
In [18]:
df_merged_meta[df_merged_meta.t_id == 792354716521009152].T
Out[18]:
In [19]:
df_merged_meta["is_controversial"] = df_merged_meta.topic_name.isin(CONTROVERSIAL_TOPICS)
df_merged_meta.is_controversial.value_counts()
Out[19]:
In [20]:
df_merged_meta.columns
Out[20]:
In [21]:
# CATS is a |-delimited list of page categories for each shared URL.
df_mapped_cats = pd.read_csv("TID_URL_CATS.txt", sep="\t").assign(
    CATS=lambda x: x.CATS.apply(lambda k: k.split("|"))
)
df_mapped_cats.head()
Out[21]:
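Each CATS cell is a |-delimited category string; e.g. a hypothetical value "Health|News" becomes ["Health", "News"] after the split.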
In [22]:
URL_DICT = dict(zip(df_mapped_cats.URL.values, df_mapped_cats.CATS.values))
URL_DICT["http://TinyURL.com/NewYearCure"]
Out[22]:
In [23]:
len(URL_DICT)
Out[23]:
In [24]:
df_mapped_cats.TID.value_counts().head()
Out[24]:
In [25]:
df_mapped_cats[df_mapped_cats.TID == 700152617033289728]
Out[25]:
In [26]:
df_tweet_cat_counts = df_mapped_cats.groupby("TID")["CATS"].apply(lambda x: sum(x, []))
df_tweet_cat_counts.head()
Out[26]:
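sum(x, []) concatenates a tweet's per-URL category lists into one flat list, but it is quadratic in the number of lists. An equivalent, faster sketch using itertools:

from itertools import chain

df_tweet_cat_counts = df_mapped_cats.groupby("TID")["CATS"].apply(
    lambda lists: list(chain.from_iterable(lists)))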
In [27]:
df_tweet_cat_counts.reset_index().dtypes
Out[27]:
In [28]:
df_merged_meta.shape
Out[28]:
In [29]:
df_merged_meta.t_id.value_counts().head()
Out[29]:
In [30]:
df_merged_meta_cats = df_merged_meta.merge(
df_tweet_cat_counts.reset_index(), how="left", left_on="t_id", right_on="TID")
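Tweets with no mapped URL come out of this left merge with NaN in CATS, and the join key survives as a redundant TID column. A possible cleanup (a sketch; the original keeps both as-is):

df_merged_meta_cats = df_merged_meta_cats.drop("TID", axis=1)
df_merged_meta_cats["CATS"] = df_merged_meta_cats["CATS"].apply(
    lambda v: v if isinstance(v, list) else [])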
In [31]:
df_merged_meta_cats.columns
Out[31]:
In [ ]:
df_merged_meta_cats.u_location.value_counts().to_csv("USER_LOCATIONS.txt", sep="\t", encoding='utf-8')
!head USER_LOCATIONS.txt
!python process_user_locations.py  # run this with python3 from the command line
In [32]:
df_places = pd.read_csv("PARSED_STATES.final.txt", sep="\t")
df_places = df_places.rename(columns={
"location": "u_location", "parse_manual": "u_state"
})[["u_location", "u_state"]]
df_places.head()
Out[32]:
In [33]:
df_merged_meta_cats = df_merged_meta_cats.merge(df_places, how="left", on="u_location")
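To gauge how much of the data the location parser resolved, the share of rows with a non-null u_state can be checked (a quick diagnostic, not in the original):

print(df_merged_meta_cats.u_state.notnull().mean())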
In [34]:
df_merged_meta_cats.u_state.head()
Out[34]:
In [35]:
df_merged_meta_cats.t_id.value_counts().head()
Out[35]:
In [36]:
df_merged_meta_cats.to_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")
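to_hdf requires the PyTables package; because CATS holds Python lists, pandas stores that column as pickled objects (with a PerformanceWarning) under the default fixed format. The saved frame can be reloaded in a later session with:

df_final = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")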