In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict

import re

In [2]:
DATA_DIRS = ["ChildEducation/Education", "Privacy/Privacy",
             "Vaccine/vaccine",
             "SeatBelt/Seatbelt",
             "GunControl/gunControl",
             "SkinDamage/SkinDamage"
            ]
CONTROVERSIAL_TOPICS=["Privacy/Privacy",
                      "Vaccine/vaccine",
                      "GunControl/gunControl",
                     ]
USER_HANDLE_REGEX = re.compile(r'twitter\.com/(.+)/status/(.+)')
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/12233311')


Out[2]:
[('malkanen', '12233311')]

In [3]:
#%%time
datasets = {}
for dirname in DATA_DIRS:
    print dirname
    df = pd.read_csv("%s_processed.csv" % dirname)
    df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
    print df_orig.shape, df.shape
    assert df_orig.shape[0] == df.shape[0], "Shape mismatch between df_orig, and df"
    df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
    print df_merged.shape
    assert df_merged.shape[0] == df.shape[0], "Shape mismatch between df_merged, and df"
    assert ((df_merged.Author != df_merged.URL.apply(
            lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0][0])
           ).sum() == 0), "Authors are not the same in merged and other."
    df_merged["t_id"] = df_merged["URL"].apply(lambda x: USER_HANDLE_REGEX.findall(x)[0][1]).astype(int)
    df_merged = df_merged.drop_duplicates(subset=["t_id"])
    print df_merged.shape
    for label_col in ["sentiment", "sentiment_subj", "subjectvity_type", "negation"]:
        print df_merged[label_col].value_counts()
    datasets[dirname] = df_merged


ChildEducation/Education
(10808, 17) (10808, 25)
(10808, 27)
(10808, 28)
Basic Neutral     6652
Basic Positive    2295
Basic Negative    1861
Name: sentiment, dtype: int64
neutral     9861
positive     742
negative     205
Name: sentiment_subj, dtype: int64
neutralsubj    10181
strongsubj       328
weaksubj         299
Name: subjectvity_type, dtype: int64
0    9814
1     994
Name: negation, dtype: int64
Privacy/Privacy
(73593, 17) (73593, 25)
(73593, 27)
(73593, 28)
Basic Neutral     37350
Basic Negative    29103
Basic Positive     7140
Name: sentiment, dtype: int64
neutral     67583
positive     3848
negative     2162
Name: sentiment_subj, dtype: int64
neutralsubj    70078
weaksubj        2105
strongsubj      1410
Name: subjectvity_type, dtype: int64
0    66227
1     7366
Name: negation, dtype: int64
Vaccine/vaccine
(40713, 16) (40713, 25)
(40713, 27)
(40713, 28)
Basic Neutral     26223
Basic Negative    10478
Basic Positive     4012
Name: sentiment, dtype: int64
neutral     37736
positive     1725
negative     1252
Name: sentiment_subj, dtype: int64
neutralsubj    39020
weaksubj        1000
strongsubj       693
Name: subjectvity_type, dtype: int64
0.0    36549
1.0     4164
Name: negation, dtype: int64
SeatBelt/Seatbelt
(73270, 17) (73270, 25)
(73270, 27)
(73270, 28)
Basic Neutral     35219
Basic Negative    22012
Basic Positive    16039
Name: sentiment, dtype: int64
neutral     67199
positive     3763
negative     2308
Name: sentiment_subj, dtype: int64
neutralsubj    69793
weaksubj        1910
strongsubj      1567
Name: subjectvity_type, dtype: int64
0    65329
1     7941
Name: negation, dtype: int64
GunControl/gunControl
(34357, 16) (34357, 25)
(34357, 27)
(34357, 28)
Basic Neutral     19988
Basic Negative    10399
Basic Positive     3970
Name: sentiment, dtype: int64
neutral     31887
positive     1466
negative     1004
Name: sentiment_subj, dtype: int64
neutralsubj    32928
weaksubj         771
strongsubj       658
Name: subjectvity_type, dtype: int64
0    31299
1     3058
Name: negation, dtype: int64
SkinDamage/SkinDamage
(14128, 16) (14128, 25)
(14128, 27)
(14128, 28)
Basic Neutral     8896
Basic Negative    3837
Basic Positive    1395
Name: sentiment, dtype: int64
neutral     12624
positive     1054
negative      450
Name: sentiment_subj, dtype: int64
neutralsubj    13133
weaksubj         685
strongsubj       310
Name: subjectvity_type, dtype: int64
0    13315
1      813
Name: negation, dtype: int64

In [4]:
def get_user_from_tweet_url(x):
    return "@%s" % USER_HANDLE_REGEX.findall(x)[0][0]

In [5]:
df_meta = pd.read_csv("TID_META.txt", sep="\t", encoding='utf-8')
df_meta.shape


Out[5]:
(328318, 21)

In [6]:
df_meta = df_meta.drop_duplicates(subset=["t_id"])
df_meta.shape


Out[6]:
(328310, 21)

In [7]:
df_meta.t_id.value_counts().head()


Out[7]:
771593256668848128    1
715317361373351937    1
742006158374772736    1
704957024149368832    1
720350207548923905    1
Name: t_id, dtype: int64

In [8]:
df_meta.dtypes


Out[8]:
t_id              int64
t_created        object
t_retweets        int64
t_favorites       int64
t_is_reply         bool
t_is_quote         bool
t_n_hashtags      int64
t_n_urls          int64
t_n_mentions      int64
t_n_media         int64
u_id              int64
u_created        object
u_n_listed        int64
u_n_favorites     int64
u_n_followers     int64
u_n_friends       int64
u_n_statuses      int64
u_is_verified      bool
u_location       object
u_name           object
u_url            object
dtype: object

In [9]:
df_meta.columns


Out[9]:
Index([u't_id', u't_created', u't_retweets', u't_favorites', u't_is_reply',
       u't_is_quote', u't_n_hashtags', u't_n_urls', u't_n_mentions',
       u't_n_media', u'u_id', u'u_created', u'u_n_listed', u'u_n_favorites',
       u'u_n_followers', u'u_n_friends', u'u_n_statuses', u'u_is_verified',
       u'u_location', u'u_name', u'u_url'],
      dtype='object')

In [10]:
df_all = pd.concat([v.assign(topic_name=lambda x: k) for k,v in datasets.iteritems()], axis=0)
df_all.shape


Out[10]:
(246869, 30)

In [11]:
df_all.dtypes


Out[11]:
Author               object
City/Urban           object
City/Urban Area      object
Contents             object
Date                 object
Followers           float64
Following           float64
GUID                float64
Gender               object
Name                 object
Posts               float64
RT?                  object
State/Region         object
URL                  object
URL?                 object
adjective           float64
adverb              float64
count_tweet           int64
hashtag?             object
mention?             object
negation            float64
noun                float64
preposition         float64
processedPost        object
sentiment            object
sentiment_subj       object
subjectvity_type     object
t_id                  int64
topic_name           object
verb                float64
dtype: object

In [12]:
df_all.t_id.value_counts().head()


Out[12]:
723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64

In [13]:
df_all.topic_name.value_counts()


Out[13]:
Privacy/Privacy             73593
SeatBelt/Seatbelt           73270
Vaccine/vaccine             40713
GunControl/gunControl       34357
SkinDamage/SkinDamage       14128
ChildEducation/Education    10808
Name: topic_name, dtype: int64

In [14]:
df_merged_meta = df_all.merge(df_meta, how="left", on="t_id")
df_merged_meta.shape


Out[14]:
(246869, 50)

Same tweet belongs to multiple datasets


In [15]:
df_merged_meta.t_id.value_counts().head()


Out[15]:
723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64

In [16]:
df_merged_meta[df_merged_meta.t_id == 700042121877835776][["topic_name"]]


Out[16]:
topic_name
147719 GunControl/gunControl
239776 Privacy/Privacy

In [17]:
df_merged_meta.t_id.value_counts()[df_merged_meta.t_id.value_counts() > 1]


Out[17]:
723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
724662525984387072    2
699778010921226240    2
700186170706300928    2
748305235907477504    2
697800944894025728    2
Name: t_id, dtype: int64

In [18]:
df_merged_meta[df_merged_meta.t_id == 792354716521009152].T


Out[18]:
49813 200954
Author @sylviadana @sylviadana
City/Urban NaN NaN
City/Urban Area Grand Rapids Grand Rapids
Contents Are u Pro-children? Education? Healthcare? Sma... Are u Pro-children? Education? Healthcare? Sma...
Date 10/29/2016 8:17 10/29/2016 8:17
Followers 144 144
Following 262 262
GUID 7.92355e+17 7.92355e+17
Gender F F
Name Sylvia Dana Sylvia Dana
Posts 935 935
RT? NO NO
State/Region Michigan Michigan
URL http://twitter.com/sylviadana/status/792354716... http://twitter.com/sylviadana/status/792354716...
URL? NO NO
adjective 6 6
adverb 0 0
count_tweet 1 1
hashtag? YES YES
mention? NO NO
negation 0 0
noun 6 6
preposition 1 1
processedPost are u pro children education healthcare small... are u pro children education healthcare small...
sentiment Basic Negative Basic Negative
sentiment_subj neutral neutral
subjectvity_type neutralsubj neutralsubj
t_id 792354716521009152 792354716521009152
topic_name ChildEducation/Education Privacy/Privacy
verb 1 1
t_created Sat Oct 29 13:17:40 +0000 2016 Sat Oct 29 13:17:40 +0000 2016
t_retweets 0 0
t_favorites 1 1
t_is_reply False False
t_is_quote False False
t_n_hashtags 1 1
t_n_urls 0 0
t_n_mentions 0 0
t_n_media 0 0
u_id 1.79638e+08 1.79638e+08
u_created Tue Aug 17 19:44:36 +0000 2010 Tue Aug 17 19:44:36 +0000 2010
u_n_listed 5 5
u_n_favorites 458 458
u_n_followers 142 142
u_n_friends 269 269
u_n_statuses 1009 1009
u_is_verified False False
u_location Grand Rapids, MI Grand Rapids, MI
u_name Sylvia Dana Sylvia Dana
u_url http://shesellscars.com http://shesellscars.com

In [19]:
df_merged_meta["is_controversial"] = df_merged_meta.topic_name.isin(CONTROVERSIAL_TOPICS)
df_merged_meta.is_controversial.value_counts()


Out[19]:
True     148663
False     98206
Name: is_controversial, dtype: int64

Merge URL types


In [20]:
df_merged_meta.columns


Out[20]:
Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial'],
      dtype='object')

In [21]:
df_mapped_cats = pd.read_csv("TID_URL_CATS.txt", sep="\t").assign(
    CATS=lambda x: x.CATS.apply(lambda k: k.split("|"))
)
df_mapped_cats.head()


Out[21]:
TID URL CATS
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... [socialmedia, twitter]
1 682915876316692480 http://www.investirdanslenfance.ca/ [UNK]
2 682985833821941760 http://TinyURL.com/NewYearCure [commercial]
3 682952771746664448 http://TinyURL.com/NewYearCure [commercial]
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... [fakenews]

In [22]:
URL_DICT = dict(zip(df_mapped_cats.URL.values, df_mapped_cats.CATS.values))
URL_DICT["http://TinyURL.com/NewYearCure"]


Out[22]:
['commercial']

In [23]:
len(URL_DICT)


Out[23]:
119558

In [24]:
df_mapped_cats.TID.value_counts().head()


Out[24]:
770862566662955008    3
700152617033289728    3
735430291821596673    3
738960203949965312    3
716003978807476224    3
Name: TID, dtype: int64

In [25]:
df_mapped_cats[df_mapped_cats.TID == 700152617033289728]


Out[25]:
TID URL CATS
17849 700152617033289728 http://www.hcplive.com/contributor/ryan-gray-m... [UNK]
17850 700152617033289728 http://www.foxla.com/news/local-news/89941411-... [UNK]
17851 700152617033289728 http://www.apple.com/customer-letter/ [commercial]

In [26]:
df_tweet_cat_counts = df_mapped_cats.groupby("TID")["CATS"].apply(lambda x: sum(x, []))
df_tweet_cat_counts.head()


Out[26]:
TID
682807892517478400                     [UNK]
682808408857272320    [socialmedia, twitter]
682809868449796099                     [UNK]
682815454159761408    [socialmedia, twitter]
682816642242490369          [news, fakenews]
Name: CATS, dtype: object

In [27]:
df_tweet_cat_counts.reset_index().dtypes


Out[27]:
TID      int64
CATS    object
dtype: object

In [28]:
df_merged_meta.shape


Out[28]:
(246869, 51)

In [29]:
df_merged_meta.t_id.value_counts().head()


Out[29]:
723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64

In [30]:
df_merged_meta_cats = df_merged_meta.merge(
    df_tweet_cat_counts.reset_index(), how="left", left_on="t_id", right_on="TID")

In [31]:
df_merged_meta_cats.columns


Out[31]:
Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS'],
      dtype='object')

Add location states

df_merged_meta_cats.u_location.value_counts().to_csv("USER_LOCATIONS.txt", sep="\t", encoding='utf-8')
! head USER_LOCATIONS.txt
! python process_user_locations.py ## RUN using python3 from command line

In [32]:
df_places = pd.read_csv("PARSED_STATES.final.txt", sep="\t")
df_places = df_places.rename(columns={
        "location": "u_location", "parse_manual": "u_state"
                                     })[["u_location", "u_state"]]
df_places.head()


Out[32]:
u_location u_state
0 United States USA
1 USA USA
2 Washington, DC DC
3 New York, NY NY
4 Los Angeles, CA CA

In [33]:
df_merged_meta_cats = df_merged_meta_cats.merge(df_places, how="left", on="u_location")


/home/entity/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.py:1198: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  rlab = rizer.factorize(rk)

In [34]:
df_merged_meta_cats.u_state.head()


Out[34]:
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: u_state, dtype: object

In [35]:
df_merged_meta_cats.t_id.value_counts().head()


Out[35]:
723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64

In [36]:
df_merged_meta_cats.to_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")


/home/entity/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:270: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis0] [items->None]

  f(store)
/home/entity/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:270: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_items] [items->None]

  f(store)
/home/entity/anaconda2/lib/python2.7/site-packages/pandas/core/generic.py:1138: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['Author', 'City/Urban', 'City/Urban Area', 'Contents', 'Date', 'Gender', 'Name', 'RT?', 'State/Region', 'URL', 'URL?', 'hashtag?', 'mention?', 'processedPost', 'sentiment', 'sentiment_subj', 'subjectvity_type', 'topic_name', u't_created', u't_is_reply', u't_is_quote', u'u_created', u'u_is_verified', u'u_location', u'u_name', u'u_url', 'CATS', 'u_state']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
/home/entity/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:270: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_items] [items->None]

  f(store)