notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict

import re



In [2]:

    
DATA_DIRS = ["ChildEducation/Education", "Privacy/Privacy",
             "Vaccine/vaccine",
             "SeatBelt/Seatbelt",
             "GunControl/gunControl",
             "SkinDamage/SkinDamage"
            ]
CONTROVERSIAL_TOPICS=["Privacy/Privacy",
                      "Vaccine/vaccine",
                      "GunControl/gunControl",
                     ]
USER_HANDLE_REGEX = re.compile(r'twitter\.com/(.+)/status/(.+)')
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/12233311')









    Out[2]:





[('malkanen', '12233311')]



In [3]:

    
#%%time
datasets = {}
for dirname in DATA_DIRS:
    print dirname
    df = pd.read_csv("%s_processed.csv" % dirname)
    df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
    print df_orig.shape, df.shape
    assert df_orig.shape[0] == df.shape[0], "Shape mismatch between df_orig, and df"
    df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
    print df_merged.shape
    assert df_merged.shape[0] == df.shape[0], "Shape mismatch between df_merged, and df"
    assert ((df_merged.Author != df_merged.URL.apply(
            lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0][0])
           ).sum() == 0), "Authors are not the same in merged and other."
    df_merged["t_id"] = df_merged["URL"].apply(lambda x: USER_HANDLE_REGEX.findall(x)[0][1]).astype(int)
    df_merged = df_merged.drop_duplicates(subset=["t_id"])
    print df_merged.shape
    for label_col in ["sentiment", "sentiment_subj", "subjectvity_type", "negation"]:
        print df_merged[label_col].value_counts()
    datasets[dirname] = df_merged









    



ChildEducation/Education
(10808, 17) (10808, 25)
(10808, 27)
(10808, 28)
Basic Neutral     6652
Basic Positive    2295
Basic Negative    1861
Name: sentiment, dtype: int64
neutral     9861
positive     742
negative     205
Name: sentiment_subj, dtype: int64
neutralsubj    10181
strongsubj       328
weaksubj         299
Name: subjectvity_type, dtype: int64
0    9814
1     994
Name: negation, dtype: int64
Privacy/Privacy
(73593, 17) (73593, 25)
(73593, 27)
(73593, 28)
Basic Neutral     37350
Basic Negative    29103
Basic Positive     7140
Name: sentiment, dtype: int64
neutral     67583
positive     3848
negative     2162
Name: sentiment_subj, dtype: int64
neutralsubj    70078
weaksubj        2105
strongsubj      1410
Name: subjectvity_type, dtype: int64
0    66227
1     7366
Name: negation, dtype: int64
Vaccine/vaccine
(40713, 16) (40713, 25)
(40713, 27)
(40713, 28)
Basic Neutral     26223
Basic Negative    10478
Basic Positive     4012
Name: sentiment, dtype: int64
neutral     37736
positive     1725
negative     1252
Name: sentiment_subj, dtype: int64
neutralsubj    39020
weaksubj        1000
strongsubj       693
Name: subjectvity_type, dtype: int64
0.0    36549
1.0     4164
Name: negation, dtype: int64
SeatBelt/Seatbelt
(73270, 17) (73270, 25)
(73270, 27)
(73270, 28)
Basic Neutral     35219
Basic Negative    22012
Basic Positive    16039
Name: sentiment, dtype: int64
neutral     67199
positive     3763
negative     2308
Name: sentiment_subj, dtype: int64
neutralsubj    69793
weaksubj        1910
strongsubj      1567
Name: subjectvity_type, dtype: int64
0    65329
1     7941
Name: negation, dtype: int64
GunControl/gunControl
(34357, 16) (34357, 25)
(34357, 27)
(34357, 28)
Basic Neutral     19988
Basic Negative    10399
Basic Positive     3970
Name: sentiment, dtype: int64
neutral     31887
positive     1466
negative     1004
Name: sentiment_subj, dtype: int64
neutralsubj    32928
weaksubj         771
strongsubj       658
Name: subjectvity_type, dtype: int64
0    31299
1     3058
Name: negation, dtype: int64
SkinDamage/SkinDamage
(14128, 16) (14128, 25)
(14128, 27)
(14128, 28)
Basic Neutral     8896
Basic Negative    3837
Basic Positive    1395
Name: sentiment, dtype: int64
neutral     12624
positive     1054
negative      450
Name: sentiment_subj, dtype: int64
neutralsubj    13133
weaksubj         685
strongsubj       310
Name: subjectvity_type, dtype: int64
0    13315
1      813
Name: negation, dtype: int64



In [4]:

    
def get_user_from_tweet_url(x):
    return "@%s" % USER_HANDLE_REGEX.findall(x)[0][0]



In [5]:

    
df_meta = pd.read_csv("TID_META.txt", sep="\t", encoding='utf-8')
df_meta.shape









    Out[5]:





(328318, 21)



In [6]:

    
df_meta = df_meta.drop_duplicates(subset=["t_id"])
df_meta.shape









    Out[6]:





(328310, 21)



In [7]:

    
df_meta.t_id.value_counts().head()









    Out[7]:





771593256668848128    1
715317361373351937    1
742006158374772736    1
704957024149368832    1
720350207548923905    1
Name: t_id, dtype: int64



In [8]:

    
df_meta.dtypes









    Out[8]:





t_id              int64
t_created        object
t_retweets        int64
t_favorites       int64
t_is_reply         bool
t_is_quote         bool
t_n_hashtags      int64
t_n_urls          int64
t_n_mentions      int64
t_n_media         int64
u_id              int64
u_created        object
u_n_listed        int64
u_n_favorites     int64
u_n_followers     int64
u_n_friends       int64
u_n_statuses      int64
u_is_verified      bool
u_location       object
u_name           object
u_url            object
dtype: object



In [9]:

    
df_meta.columns









    Out[9]:





Index([u't_id', u't_created', u't_retweets', u't_favorites', u't_is_reply',
       u't_is_quote', u't_n_hashtags', u't_n_urls', u't_n_mentions',
       u't_n_media', u'u_id', u'u_created', u'u_n_listed', u'u_n_favorites',
       u'u_n_followers', u'u_n_friends', u'u_n_statuses', u'u_is_verified',
       u'u_location', u'u_name', u'u_url'],
      dtype='object')



In [10]:

    
df_all = pd.concat([v.assign(topic_name=lambda x: k) for k,v in datasets.iteritems()], axis=0)
df_all.shape









    Out[10]:





(246869, 30)



In [11]:

    
df_all.dtypes









    Out[11]:





Author               object
City/Urban           object
City/Urban Area      object
Contents             object
Date                 object
Followers           float64
Following           float64
GUID                float64
Gender               object
Name                 object
Posts               float64
RT?                  object
State/Region         object
URL                  object
URL?                 object
adjective           float64
adverb              float64
count_tweet           int64
hashtag?             object
mention?             object
negation            float64
noun                float64
preposition         float64
processedPost        object
sentiment            object
sentiment_subj       object
subjectvity_type     object
t_id                  int64
topic_name           object
verb                float64
dtype: object



In [12]:

    
df_all.t_id.value_counts().head()









    Out[12]:





723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64



In [13]:

    
df_all.topic_name.value_counts()









    Out[13]:





Privacy/Privacy             73593
SeatBelt/Seatbelt           73270
Vaccine/vaccine             40713
GunControl/gunControl       34357
SkinDamage/SkinDamage       14128
ChildEducation/Education    10808
Name: topic_name, dtype: int64



In [14]:

    
df_merged_meta = df_all.merge(df_meta, how="left", on="t_id")
df_merged_meta.shape









    Out[14]:





(246869, 50)

Same tweet belongs to multiple datasets



In [15]:

    
df_merged_meta.t_id.value_counts().head()









    Out[15]:





723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64



In [16]:

    
df_merged_meta[df_merged_meta.t_id == 700042121877835776][["topic_name"]]









    Out[16]:






  
    
      
      topic_name
    
  
  
    
      147719
      GunControl/gunControl
    
    
      239776
      Privacy/Privacy



In [17]:

    
df_merged_meta.t_id.value_counts()[df_merged_meta.t_id.value_counts() > 1]









    Out[17]:





723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
724662525984387072    2
699778010921226240    2
700186170706300928    2
748305235907477504    2
697800944894025728    2
Name: t_id, dtype: int64



In [18]:

    
df_merged_meta[df_merged_meta.t_id == 792354716521009152].T









    Out[18]:






  
    
      
      49813
      200954
    
  
  
    
      Author
      @sylviadana
      @sylviadana
    
    
      City/Urban
      NaN
      NaN
    
    
      City/Urban Area
      Grand Rapids
      Grand Rapids
    
    
      Contents
      Are u Pro-children? Education? Healthcare? Sma...
      Are u Pro-children? Education? Healthcare? Sma...
    
    
      Date
      10/29/2016 8:17
      10/29/2016 8:17
    
    
      Followers
      144
      144
    
    
      Following
      262
      262
    
    
      GUID
      7.92355e+17
      7.92355e+17
    
    
      Gender
      F
      F
    
    
      Name
      Sylvia Dana
      Sylvia Dana
    
    
      Posts
      935
      935
    
    
      RT?
      NO
      NO
    
    
      State/Region
      Michigan
      Michigan
    
    
      URL
      http://twitter.com/sylviadana/status/792354716...
      http://twitter.com/sylviadana/status/792354716...
    
    
      URL?
      NO
      NO
    
    
      adjective
      6
      6
    
    
      adverb
      0
      0
    
    
      count_tweet
      1
      1
    
    
      hashtag?
      YES
      YES
    
    
      mention?
      NO
      NO
    
    
      negation
      0
      0
    
    
      noun
      6
      6
    
    
      preposition
      1
      1
    
    
      processedPost
      are u pro children education healthcare small...
      are u pro children education healthcare small...
    
    
      sentiment
      Basic Negative
      Basic Negative
    
    
      sentiment_subj
      neutral
      neutral
    
    
      subjectvity_type
      neutralsubj
      neutralsubj
    
    
      t_id
      792354716521009152
      792354716521009152
    
    
      topic_name
      ChildEducation/Education
      Privacy/Privacy
    
    
      verb
      1
      1
    
    
      t_created
      Sat Oct 29 13:17:40 +0000 2016
      Sat Oct 29 13:17:40 +0000 2016
    
    
      t_retweets
      0
      0
    
    
      t_favorites
      1
      1
    
    
      t_is_reply
      False
      False
    
    
      t_is_quote
      False
      False
    
    
      t_n_hashtags
      1
      1
    
    
      t_n_urls
      0
      0
    
    
      t_n_mentions
      0
      0
    
    
      t_n_media
      0
      0
    
    
      u_id
      1.79638e+08
      1.79638e+08
    
    
      u_created
      Tue Aug 17 19:44:36 +0000 2010
      Tue Aug 17 19:44:36 +0000 2010
    
    
      u_n_listed
      5
      5
    
    
      u_n_favorites
      458
      458
    
    
      u_n_followers
      142
      142
    
    
      u_n_friends
      269
      269
    
    
      u_n_statuses
      1009
      1009
    
    
      u_is_verified
      False
      False
    
    
      u_location
      Grand Rapids, MI
      Grand Rapids, MI
    
    
      u_name
      Sylvia Dana
      Sylvia Dana
    
    
      u_url
      http://shesellscars.com
      http://shesellscars.com



In [19]:

    
df_merged_meta["is_controversial"] = df_merged_meta.topic_name.isin(CONTROVERSIAL_TOPICS)
df_merged_meta.is_controversial.value_counts()









    Out[19]:





True     148663
False     98206
Name: is_controversial, dtype: int64

Merge URL types



In [20]:

    
df_merged_meta.columns









    Out[20]:





Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial'],
      dtype='object')



In [21]:

    
df_mapped_cats = pd.read_csv("TID_URL_CATS.txt", sep="\t").assign(
    CATS=lambda x: x.CATS.apply(lambda k: k.split("|"))
)
df_mapped_cats.head()









    Out[21]:






  
    
      
      TID
      URL
      CATS
    
  
  
    
      0
      682904901916225536
      https://twitter.com/photogchad_WTSP/status/682...
      [socialmedia, twitter]
    
    
      1
      682915876316692480
      http://www.investirdanslenfance.ca/
      [UNK]
    
    
      2
      682985833821941760
      http://TinyURL.com/NewYearCure
      [commercial]
    
    
      3
      682952771746664448
      http://TinyURL.com/NewYearCure
      [commercial]
    
    
      4
      682830450969059328
      http://yournewswire.com/donald-trump-vaccines-...
      [fakenews]



In [22]:

    
URL_DICT = dict(zip(df_mapped_cats.URL.values, df_mapped_cats.CATS.values))
URL_DICT["http://TinyURL.com/NewYearCure"]









    Out[22]:





['commercial']



In [23]:

    
len(URL_DICT)









    Out[23]:





119558



In [24]:

    
df_mapped_cats.TID.value_counts().head()









    Out[24]:





770862566662955008    3
700152617033289728    3
735430291821596673    3
738960203949965312    3
716003978807476224    3
Name: TID, dtype: int64



In [25]:

    
df_mapped_cats[df_mapped_cats.TID == 700152617033289728]









    Out[25]:






  
    
      
      TID
      URL
      CATS
    
  
  
    
      17849
      700152617033289728
      http://www.hcplive.com/contributor/ryan-gray-m...
      [UNK]
    
    
      17850
      700152617033289728
      http://www.foxla.com/news/local-news/89941411-...
      [UNK]
    
    
      17851
      700152617033289728
      http://www.apple.com/customer-letter/
      [commercial]



In [26]:

    
df_tweet_cat_counts = df_mapped_cats.groupby("TID")["CATS"].apply(lambda x: sum(x, []))
df_tweet_cat_counts.head()









    Out[26]:





TID
682807892517478400                     [UNK]
682808408857272320    [socialmedia, twitter]
682809868449796099                     [UNK]
682815454159761408    [socialmedia, twitter]
682816642242490369          [news, fakenews]
Name: CATS, dtype: object



In [27]:

    
df_tweet_cat_counts.reset_index().dtypes









    Out[27]:





TID      int64
CATS    object
dtype: object



In [28]:

    
df_merged_meta.shape









    Out[28]:





(246869, 51)



In [29]:

    
df_merged_meta.t_id.value_counts().head()









    Out[29]:





723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64



In [30]:

    
df_merged_meta_cats = df_merged_meta.merge(
    df_tweet_cat_counts.reset_index(), how="left", left_on="t_id", right_on="TID")



In [31]:

    
df_merged_meta_cats.columns









    Out[31]:





Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS'],
      dtype='object')

Add location states

df_merged_meta_cats.u_location.value_counts().to_csv("USER_LOCATIONS.txt", sep="\t", encoding='utf-8')
! head USER_LOCATIONS.txt
! python process_user_locations.py ## RUN using python3 from command line



In [32]:

    
df_places = pd.read_csv("PARSED_STATES.final.txt", sep="\t")
df_places = df_places.rename(columns={
        "location": "u_location", "parse_manual": "u_state"
                                     })[["u_location", "u_state"]]
df_places.head()









    Out[32]:






  
    
      
      u_location
      u_state
    
  
  
    
      0
      United States
      USA
    
    
      1
      USA
      USA
    
    
      2
      Washington, DC
      DC
    
    
      3
      New York, NY
      NY
    
    
      4
      Los Angeles, CA
      CA



In [33]:

    
df_merged_meta_cats = df_merged_meta_cats.merge(df_places, how="left", on="u_location")









    



/home/entity/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.py:1198: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  rlab = rizer.factorize(rk)



In [34]:

    
df_merged_meta_cats.u_state.head()









    Out[34]:





0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: u_state, dtype: object



In [35]:

    
df_merged_meta_cats.t_id.value_counts().head()









    Out[35]:





723689208196509696    2
699607372394819584    2
700042121877835776    2
792354716521009152    2
699996833892823040    2
Name: t_id, dtype: int64



In [36]:

    
df_merged_meta_cats.to_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")









    



/home/entity/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:270: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis0] [items->None]

  f(store)
/home/entity/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:270: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_items] [items->None]

  f(store)
/home/entity/anaconda2/lib/python2.7/site-packages/pandas/core/generic.py:1138: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['Author', 'City/Urban', 'City/Urban Area', 'Contents', 'Date', 'Gender', 'Name', 'RT?', 'State/Region', 'URL', 'URL?', 'hashtag?', 'mention?', 'processedPost', 'sentiment', 'sentiment_subj', 'subjectvity_type', 'topic_name', u't_created', u't_is_reply', u't_is_quote', u'u_created', u'u_is_verified', u'u_location', u'u_name', u'u_url', 'CATS', 'u_state']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
/home/entity/anaconda2/lib/python2.7/site-packages/pandas/io/pytables.py:270: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_items] [items->None]

  f(store)

	49813	200954
Author	@sylviadana	@sylviadana
City/Urban	NaN	NaN
City/Urban Area	Grand Rapids	Grand Rapids
Contents	Are u Pro-children? Education? Healthcare? Sma...	Are u Pro-children? Education? Healthcare? Sma...
Date	10/29/2016 8:17	10/29/2016 8:17
Followers	144	144
Following	262	262
GUID	7.92355e+17	7.92355e+17
Gender	F	F
Name	Sylvia Dana	Sylvia Dana
Posts	935	935
RT?	NO	NO
State/Region	Michigan	Michigan
URL	http://twitter.com/sylviadana/status/792354716...	http://twitter.com/sylviadana/status/792354716...
URL?	NO	NO
adjective	6	6
adverb	0	0
count_tweet	1	1
hashtag?	YES	YES
mention?	NO	NO
negation	0	0
noun	6	6
preposition	1	1
processedPost	are u pro children education healthcare small...	are u pro children education healthcare small...
sentiment	Basic Negative	Basic Negative
sentiment_subj	neutral	neutral
subjectvity_type	neutralsubj	neutralsubj
t_id	792354716521009152	792354716521009152
topic_name	ChildEducation/Education	Privacy/Privacy
verb	1	1
t_created	Sat Oct 29 13:17:40 +0000 2016	Sat Oct 29 13:17:40 +0000 2016
t_retweets	0	0
t_favorites	1	1
t_is_reply	False	False
t_is_quote	False	False
t_n_hashtags	1	1
t_n_urls	0	0
t_n_mentions	0	0
t_n_media	0	0
u_id	1.79638e+08	1.79638e+08
u_created	Tue Aug 17 19:44:36 +0000 2010	Tue Aug 17 19:44:36 +0000 2010
u_n_listed	5	5
u_n_favorites	458	458
u_n_followers	142	142
u_n_friends	269	269
u_n_statuses	1009	1009
u_is_verified	False	False
u_location	Grand Rapids, MI	Grand Rapids, MI
u_name	Sylvia Dana	Sylvia Dana
u_url	http://shesellscars.com	http://shesellscars.com

	TID	URL	CATS
0	682904901916225536	https://twitter.com/photogchad_WTSP/status/682...	[socialmedia, twitter]
1	682915876316692480	http://www.investirdanslenfance.ca/	[UNK]
2	682985833821941760	http://TinyURL.com/NewYearCure	[commercial]
3	682952771746664448	http://TinyURL.com/NewYearCure	[commercial]
4	682830450969059328	http://yournewswire.com/donald-trump-vaccines-...	[fakenews]

	TID	URL	CATS
17849	700152617033289728	http://www.hcplive.com/contributor/ryan-gray-m...	[UNK]
17850	700152617033289728	http://www.foxla.com/news/local-news/89941411-...	[UNK]
17851	700152617033289728	http://www.apple.com/customer-letter/	[commercial]

	u_location	u_state
0	United States	USA
1	USA	USA
2	Washington, DC	DC
3	New York, NY	NY
4	Los Angeles, CA	CA