In [1]:
from __future__ import division
from __future__ import print_function
import csv
import datetime as dt
import os
import re

import pandas
from sklearn.feature_extraction.text import CountVectorizer

Open csv


In [2]:
def csv_to_df(csv_file):
    """Open csv, return Pandas DataFrame."""
    dataframe = pandas.read_csv(csv_file,
                                delimiter='|',
                                error_bad_lines=False,
                                warn_bad_lines=False,
                                )
    return dataframe

Data cleanup


In [3]:
def make_lowercase(input_str):
    """Lowercase the input string and return it."""
    return input_str.lower()

In [4]:
def clean_whitespaces(input_str):
    """Use the re library to replace all runs of
    whitespace (newlines, tabs, etc.) with a single ' ' space.
    """
    return re.sub(r'\s+', ' ', input_str)

In [5]:
def remove_punctuation(input_str):
    """Remove certain punctuation (an example set; adjust as needed)."""
    return re.sub(r'[,!".:;?]', '', input_str)

In [6]:
stopwords = ['himself', 'very', 'those', 'most', 'this', 'it', 'did', 'be', 'each', 'you', 'was', 'should', 'down', 'if', 'that', 'no', 'itself', 'does', 'under', 'a', 'over', 'about', 'both', 'their', 'who', 'her', 'now', 'which', 'as', 'other', 'too', 'yourselves', 'and', 'why', 'how', 'your', 'into', 'i', 'before', 'by', 'again', 'having', 'during', 'of', 'after', 'against', 'is', 'here', 't', 'above', 'so', 'doing', 'me', 'between', 'are', 'whom', 'ours', 'ourselves', 'he', 'him', 'where', 'because', 'up', 'yours', 'out', 'more', 's', 'nor', 'just', 'then', 'don', 'myself', 'my', 'while', 'these', 'some', 'yourself', 'such', 'on', 'few', 'them', 'until', 'from', 'when', 'our', 'have', 'or', 'theirs', 'off', 'through', 'the', 'same', 'any', 'its', 'not', 'below', 'has', 'had', 'am', 'been', 'will', 'at', 'being', 'there', 'than', 'to', 'she', 'but', 'what', 'for', 'can', 'own', 'an', 'they', 'his', 'with', 'we', 'only', 'in', 'were', 'hers', 'once', 'all', 'further', 'do', 'themselves', 'herself']

def remove_stopwords(input_tokens):
    """Remove common words."""
    return [token for token in input_tokens if token not in stopwords]

Feature extraction

Word tokenization

Show a plain function first; an NLTK-based alternative is sketched below


In [7]:
# A basic tokenizer

def tokenize_words(input_string):
    """Take a string, return a list of 
    strings broken on whitespace, but do 
    not break @mentions and URLs.
    
    Alternative: try something like `[word for word in re.sub('\W', ' ', s).split()]`,
    then strip punctuation that isn't @ or #.
    """
    punctuation = [',', '!', '"', '. ', ': ']  # trailing spaces keep 'http://' and URL dots intact
    for char in punctuation:
        input_string = input_string.replace(char, ' ')
    
    return [w for w in input_string.split(' ') if w]  # rm empty strings

In [8]:
# See @users and http: not split
a_tweet = """@CuteEmergency: "I'm okay!" https://t.co/TWMwjG03Fd"""
tokenize_words(a_tweet)


Out[8]:
['@CuteEmergency', "I'm", 'okay', 'https://t.co/TWMwjG03Fd']
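
As noted above, NLTK's TweetTokenizer is a ready-made alternative; a quick sketch (assumes `pip install nltk`; not used elsewhere in this notebook):

# Sketch only: NLTK's tweet-aware tokenizer also keeps @mentions and URLs intact
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer(preserve_case=True)
tweet_tokenizer.tokenize(a_tweet)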

Counting text

  • count chars
  • count words
  • links
  • count links
  • #hashtags
  • count #hashtags
  • @mentions
  • count @mentions

In [9]:
def get_urls(input_tokens):
    """Check incoming list of strings, check if token
    starts with `http(s)://`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('http')]`
    """
    urls = []
    for word in input_tokens:
        if word.startswith('http'):
            urls.append(word)
    return urls

In [10]:
def get_hashtags(input_tokens):
    """Check incoming list of strings, check if token
    starts with `#`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('#')]`
    """
    hashtags = []
    for word in input_tokens:
        if word.startswith('#'):
            hashtags.append(word)
    return hashtags

In [11]:
def get_mentions(input_tokens):
    """Check incoming list of strings, check if token
    starts with `@`.
    
    Could be done with list comprehension, too:
    `[w for w in input_tokens if word.startswith('@')]`
    """
    mentions = []
    for word in input_tokens:
        if word.startswith('@'):
            mentions.append(word)
    return mentions

In [12]:
def add_features_to_df(dataframe):
    """Take DataFrame of tweets, extract some specific 
    features and add to returned DataFrame.
    """
    #tokens = []  # list of strings
    char_count = []
    word_count = []
    urls = []
    url_counts = []
    hashtags = []
    hashtag_counts = []
    mentions = []
    mentions_counts = []

    for i, row in dataframe.iterrows():
        
        # Text and tokens
        tokens = tokenize_words(row['_text'])
        char_count.append(len(row['_text']))
        word_count.append(len(tokens))

        # URLs
        url_list = get_urls(tokens)
        urls.append(url_list)
        url_count = len(url_list)
        url_counts.append(url_count)

        # Hashtags
        hashtag_list = get_hashtags(tokens)
        hashtags.append(hashtag_list)
        hashtag_count = len(hashtag_list)
        hashtag_counts.append(hashtag_count)

        # Mentions
        mentions_list = get_mentions(tokens)
        mentions.append(mentions_list)
        mentions_count = len(mentions_list)
        mentions_counts.append(mentions_count)


    dataframe['_char_count'] = char_count
    dataframe['_word_count'] = word_count
    dataframe['_urls'] = urls
    dataframe['_url_count'] = url_counts
    dataframe['_hashtags'] = hashtags
    dataframe['_hashtag_count'] = hashtag_counts
    dataframe['_mentions'] = mentions
    dataframe['_mentions_count'] = mentions_counts
    
    return dataframe

TODO: Named entity recognition (NER)

NLTK can do this, but it is too slow to run over the full dataset; a sketch is shown below for reference
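
A rough sketch of what NLTK-based NER could look like (assumes nltk is installed and the 'punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', and 'words' data packages have been downloaded; not run here):

# Sketch only, not run: too slow for ~23k tweets
import nltk

def get_named_entities(input_string):
    """Return the named-entity strings found by NLTK's chunker."""
    tokens = nltk.word_tokenize(input_string)
    tagged = nltk.pos_tag(tokens)
    tree = nltk.ne_chunk(tagged)
    # Named-entity chunks are subtrees; plain (word, tag) tuples are not
    return [' '.join(word for word, tag in subtree.leaves())
            for subtree in tree if hasattr(subtree, 'label')]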


In [14]:
def make_merge_bow_write(dataframe, save_path):
    """Take a dataframe, extract '_text' and make a Bag of Words.
    Write BoW features to their own file, then merge with input
    and return new dataframe.
    
    TODO: Revisit options for CountVectorizer() (lowercase, tokenizer, min freq)
    """
    # Get list of strings, for input into vectorizer
    text_list = dataframe['_text'].tolist()

    # Setup Vectorizer
    # Note that min_df is confusing; see http://stackoverflow.com/a/27697863
    # min_df as an integer: if a word appears in fewer than n documents, ignore it
    vectorizer = CountVectorizer(min_df=2)
    term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document

    # Put BoW vectors into a new df
    dataframe_bow = pandas.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names())
    
    # Write BoW to disk
    # Just the Bag of Words, in case we want to use it by itself later
    # TODO! Add '_popular' column to this, or ditch this csv altogether
    dataframe_bow.to_csv(save_path, sep='|', encoding='utf-8')
    
    # Merge BoW df with the original feature table df
    # Important: Make sure the concat() function uses the original id index of the first, text DataFrame
    dataframe = pandas.concat([dataframe, dataframe_bow], axis=1, join_axes=[dataframe.index])
    
    return dataframe

In [15]:
def make_merge_bow(dataframe):
    """Take a dataframe, extract '_text' and make a Bag of Words.
    Write BoW features to their own file, then merge with input
    and return new dataframe.
    
    TODO: Revisit options for CountVectorizer() (lowercase, tokenizer, min freq)
    """
    # Get list of strings, for input into vectorizer
    text_list = dataframe['_text'].tolist()

    # Setup Vectorizer
    # Note that min_df is confusing; see http://stackoverflow.com/a/27697863
    # min_df as an integer: if a word appears in fewer than n documents, ignore it
    vectorizer = CountVectorizer(min_df=2)  
    term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document

    # Put BoW vectors into a new df
    dataframe_bow = pandas.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names())
    
    return dataframe_bow

Topic modeling

Think about how to put topic weights into the feature table; a rough sketch is below
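
A rough sketch of topic modeling with scikit-learn's LatentDirichletAllocation, reusing the same kind of term-document matrix the vectorizer produces (not run here; the number of topics and the '_topic_N' column names are arbitrary placeholders):

# Sketch only, not run: per-document topic weights from the BoW matrix
from sklearn.decomposition import LatentDirichletAllocation

def make_topic_features(term_document_matrix, n_topics=10):
    """Return a DataFrame with one topic-weight column per topic."""
    # n_components is called n_topics in older scikit-learn releases
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    doc_topic_weights = lda.fit_transform(term_document_matrix)
    columns = ['_topic_{}'.format(i) for i in range(n_topics)]
    return pandas.DataFrame(doc_topic_weights, columns=columns)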

Write entire DataFrame to csv

The next notebook will pick up from here

Do everything again for the unpopular tweets


In [18]:
def make_all_features_for_tweets():
    """Do all the steps to create one feature 
    table of popular and unpopular tweets.
    """
    
    print('Starting feature extraction ...')
    t0 = dt.datetime.utcnow()
    # Make sure 'feature_tables' present
    features_dir = 'feature_tables'
    if not os.path.isdir(features_dir):
        os.mkdir(features_dir)

    # load csvs to dfs
    dataframe_popular = csv_to_df('tweets/tweets_popular.csv')
    dataframe_not_popular = csv_to_df('tweets/tweets_not_popular.csv')
    
    # Remove dupes
    dataframe_popular = dataframe_popular.drop_duplicates()
    dataframe_not_popular = dataframe_not_popular.drop_duplicates()
    
    # Add column '_popular' or '_unpopular' for each df
    dataframe_popular['_popular'] = True
    dataframe_not_popular['_popular'] = False
    
    # Append unpopular to popular df
    dataframe = pandas.concat([dataframe_popular, dataframe_not_popular])
    
    
    # Extract features from df, add back to df
    dataframe = add_features_to_df(dataframe)
    
    # Write df, now with basic extracted features, to .csv
    dataframe.to_csv('feature_tables/basics.csv', sep='|', encoding='utf-8')

    # Make BoW df, then write it to .csv
    #dataframe_bow = make_merge_bow(dataframe)
    
    # Just the Bag of Words, in case we want to use it by itself later
    #dataframe_bow.to_csv('feature_tables/bow.csv', sep='|', encoding='utf-8')
    
    # Merge BoW df with the original feature table df
    # Important: Make sure the concat() function uses the original id index of the first, text df
    #dataframe = pandas.concat([dataframe, dataframe_bow], axis=1, join_axes=[dataframe.index])
    #dataframe.to_csv('feature_tables/all.csv', sep='|', encoding='utf-8')
    
    print('... completed in {}.'.format(dt.datetime.utcnow() - t0))
    print('Total (rows, columns):', dataframe.shape)  # (rows, columns)
    
    return dataframe

In [19]:
df = make_all_features_for_tweets()


Starting feature extraction ...
... completed in 0:00:11.750256.
Total (rows, columns): (22706, 12)

In [26]:
list(df.columns.values)


Out[26]:
['_text',
 '_rt_count',
 '_tweet_datetime',
 '_popular',
 '_char_count',
 '_word_count',
 '_urls',
 '_url_count',
 '_hashtags',
 '_hashtag_count',
 '_mentions',
 '_mentions_count']

In [30]:
df


Out[30]:
_text _rt_count _tweet_datetime _popular _char_count _word_count _urls _url_count _hashtags _hashtag_count _mentions _mentions_count
0 @CringeLMAO: Easy there m8 https://t.co/dnF3Wq... 2084 Mon Feb 15 20:44:33 +0000 2016 True 50 5 [https://t.co/dnF3Wqdt1C] 1 [] 0 [@CringeLMAO] 1
1 @AustinMahone: Just posted a photo https://t.c... 1059 Mon Feb 15 20:44:33 +0000 2016 True 58 6 [https://t.co/hXFg6TyuzE] 1 [] 0 [@AustinMahone] 1
2 @Ashton5SOS: Some days I drink way to much cof... 24121 Mon Feb 15 20:44:33 +0000 2016 True 136 24 [] 0 [] 0 [@Ashton5SOS] 1
3 @lailamuhammad: When you nail that #Beyonc m... 801 Mon Feb 15 20:44:33 +0000 2016 True 140 21 [] 0 [#Beyonc, #slay] 2 [@lailamuhammad, @MarqCotton, @BuckeyeBond83, ... 4
4 @BDBANDS: MOOD https://t.co/NMlFBJZtic 1856 Mon Feb 15 20:44:33 +0000 2016 True 46 3 [https://t.co/NMlFBJZtic] 1 [] 0 [@BDBANDS] 1
5 @TheGRAMMYs: Congrats Best Pop Vocal Album @ta... 3747 Mon Feb 15 20:44:33 +0000 2016 True 99 11 [https://t.co/6gqbPR2JmW] 1 [#GRAMMYs] 1 [@TheGRAMMYs, @taylorswift13] 2
6 @taylorcaniff: Never mind I'm snowed in again ... 1961 Mon Feb 15 20:44:33 +0000 2016 True 67 11 [] 0 [] 0 [@taylorcaniff] 1
7 @Ashton5SOS: But this is the obvious reason I ... 21948 Mon Feb 15 20:44:33 +0000 2016 True 102 19 [] 0 [] 0 [@Ashton5SOS] 1
9 @FemaleTexts: February 15th?? You mean annoy s... 2426 Mon Feb 15 20:44:33 +0000 2016 True 83 9 [https://t.co/k06HAUNeoM] 1 [] 0 [@FemaleTexts] 1
10 @AustinMahone: Maybe I'll make a country song ... 1826 Mon Feb 15 20:44:33 +0000 2016 True 51 8 [] 0 [] 0 [@AustinMahone] 1
11 @DrakeBible_: this is the warriors reporter om... 2659 Mon Feb 15 20:44:33 +0000 2016 True 100 13 [https://t.co/sS09KSOXOB] 1 [] 0 [@DrakeBible_] 1
12 @CuteEmergency: "I'm okay!" https://t.co/TWMwj... 617 Mon Feb 15 20:44:33 +0000 2016 True 51 4 [https://t.co/TWMwjG03Fd] 1 [] 0 [@CuteEmergency] 1
13 @BabyAnimalPics: LOOK HOW HE JUST SHAKES OFF H... 12067 Mon Feb 15 20:44:33 +0000 2016 True 80 10 [https://t.co/i6dvIw2x7O] 1 [] 0 [@BabyAnimalPics] 1
14 @carterreynolds: There's nothing better than a... 1278 Mon Feb 15 20:44:33 +0000 2016 True 79 13 [] 0 [] 0 [@carterreynolds] 1
16 @StockpiIe: I can send you a text at 12:04 &am... 752 Mon Feb 15 20:44:33 +0000 2016 True 67 14 [] 0 [] 0 [@StockpiIe] 1
17 @LaurenJauregui: You are simply alive. https:/... 8916 Mon Feb 15 20:44:33 +0000 2016 True 62 6 [https://t.co/FSzuksBUpQ] 1 [] 0 [@LaurenJauregui] 1
18 @TheKitchensHeat: Eddie Griffin- on racism in ... 4361 Mon Feb 15 20:44:33 +0000 2016 True 79 8 [https://t.co/tFh5wMGJVb] 1 [] 0 [@TheKitchensHeat] 1
19 @TheFitGoals: Need https://t.co/NqqXrfPfQw 1462 Mon Feb 15 20:44:33 +0000 2016 True 42 3 [https://t.co/NqqXrfPfQw] 1 [] 0 [@TheFitGoals] 1
20 @Divergent: Crazy FOUR you, Initiates! Happy V... 1231 Mon Feb 15 20:44:33 +0000 2016 True 91 10 [https://t.co/T4NEOAVOqe] 1 [] 0 [@Divergent] 1
21 @kanyewest: This is a God dream 16044 Mon Feb 15 20:44:33 +0000 2016 True 31 6 [] 0 [] 0 [@kanyewest] 1
22 @irlsimi: this article is fucking disgusting i... 10799 Mon Feb 15 20:48:46 +0000 2016 True 130 21 [https://t.co/S9g9yDwAYb] 1 [] 0 [@irlsimi] 1
23 @kanyewest: I m this generation's Disney ... 22244 Mon Feb 15 20:48:46 +0000 2016 True 87 15 [] 0 [] 0 [@kanyewest] 1
24 @kanyewest: No matter what level you're at in ... 46505 Mon Feb 15 20:48:46 +0000 2016 True 76 14 [] 0 [] 0 [@kanyewest] 1
25 @daggertattooH: when we heard louis' laugh com... 549 Mon Feb 15 20:48:46 +0000 2016 True 114 16 [https://t.co/AEYqtaXyXp] 1 [] 0 [@daggertattooH] 1
26 @WORLDSTARVlNE: THIS STILL KILLS ME https... 631 Mon Feb 15 20:48:46 +0000 2016 True 64 6 [https://t.co/HfYQtEMY9C] 1 [] 0 [@WORLDSTARVlNE] 1
27 @broken: I like my music at a volume where I c... 8627 Mon Feb 15 20:48:46 +0000 2016 True 60 13 [] 0 [] 0 [@broken] 1
28 @TheGRAMMYs: Congrats Best Pop Vocal Album @ta... 7028 Mon Feb 15 20:48:46 +0000 2016 True 99 11 [https://t.co/6gqbPR2JmW] 1 [#GRAMMYs] 1 [@TheGRAMMYs, @taylorswift13] 2
29 @JusticeBlaine: Antonin #Scalia requested crem... 19949 Mon Feb 15 20:48:46 +0000 2016 True 139 21 [] 0 [#Scalia] 1 [@JusticeBlaine] 1
30 @ArianaGrande: 1 yr since 1 of my fav vids cam... 4076 Mon Feb 15 20:48:46 +0000 2016 True 143 25 [http] 1 [] 0 [@ArianaGrande] 1
31 @MallowandMarsh: It's #MallowMondays, and this... 1381 Mon Feb 15 20:48:46 +0000 2016 True 139 23 [] 0 [#MallowMondays, #win] 2 [@MallowandMarsh] 1
... ... ... ... ... ... ... ... ... ... ... ... ...
18588 Hello Amy Nadine Dix! Thank you for following ... 0 Wed Feb 17 02:26:37 +0000 2016 False 120 19 [] 0 [] 0 [] 0
18589 @Michaelaa155 you can have Braxton https://t.c... 0 Wed Feb 17 02:26:37 +0000 2016 False 58 6 [https://t.co/oaPvRkyjqe] 1 [] 0 [@Michaelaa155] 1
18590 Einstein says she is expecting actual results/... 0 Wed Feb 17 02:26:37 +0000 2016 False 133 21 [] 0 [#Babylon] 1 [] 0
18591 The highest investment fees are the ones you p... 0 Wed Feb 17 02:26:37 +0000 2016 False 101 14 [https://t.co/RLIVw5p2fp] 1 [#money, #lifehacks] 2 [] 0
18592 @marciaxthree: @EbonyStarr5 Yes! The Black Pan... 3 Wed Feb 17 02:26:37 +0000 2016 False 139 21 [] 0 [] 0 [@marciaxthree, @EbonyStarr5, @michaeljackson] 3
18593 @shelleyhennig follow me for a minute so I can... 0 Wed Feb 17 02:26:37 +0000 2016 False 98 18 [] 0 [#askmalia] 1 [@shelleyhennig] 1
18594 @GIRLSKISSGlRLS: ok I'm gonna do a question th... 4 Wed Feb 17 02:26:37 +0000 2016 False 102 16 [] 0 [] 0 [@GIRLSKISSGlRLS] 1
18595 @yaboijoseph: Got to love you 1 Wed Feb 17 02:26:37 +0000 2016 False 29 5 [] 0 [] 0 [@yaboijoseph] 1
18596 John Newton~ How sweet the name of Jesus sound... 0 Wed Feb 17 02:26:37 +0000 2016 False 132 25 [] 0 [] 0 [] 0
18597 @blakejamieson: Do what you can. https://t.co/... 4 Wed Feb 17 02:26:37 +0000 2016 False 56 6 [https://t.co/WZmJcwhoIU] 1 [] 0 [@blakejamieson] 1
18598 RT @NelsonsWay: Yaw are so stupid... 0 Wed Feb 17 02:26:37 +0000 2016 False 79 7 [https://t.co/5yBOW82KDe] 1 [] 0 [@NelsonsWay] 1
18599 @based_joseph do it now! ! Or I will 0 Wed Feb 17 02:26:37 +0000 2016 False 36 7 [] 0 [] 0 [@based_joseph] 1
18600 @thebestrips: Amsterdam in the Fall (2015) htt... 1 Wed Feb 17 02:26:37 +0000 2016 False 90 8 [https://t.co/EDfCZVRq4Y, https://t.co/z91yib7... 2 [] 0 [@thebestrips] 1
18601 @DaIndieSpot: New #Music #YouSay By @AllHa... 1 Wed Feb 17 02:26:37 +0000 2016 False 141 13 [https://t.co/DwyxR7n1au, https://t.co/IYsnb] 2 [#Music, #YouSay, #DISPromo, #PromoBlast] 4 [@DaIndieSpot, @AllHailKao] 2
18602 @thatnigga_twann stop playing with me nigga , ... 0 Wed Feb 17 02:26:37 +0000 2016 False 74 11 [] 0 [] 0 [@thatnigga_twann] 1
18603 @GregWQAD: Carter Milem drops in a short put b... 1 Wed Feb 17 02:26:37 +0000 2016 False 117 20 [] 0 [] 0 [@GregWQAD, @TheScoreWQAD] 2
18604 @Kolby_NflBound ehhh, i guess i can be a littl... 0 Wed Feb 17 02:26:37 +0000 2016 False 74 13 [] 0 [] 0 [@Kolby_NflBound] 1
18605 Haha Nadine Deese...my exact thoughts! https:/... 0 Wed Feb 17 02:26:37 +0000 2016 False 62 6 [https://t.co/WOh3vwNt6B] 1 [] 0 [] 0
18606 eventhough i jump from fandoms to fandoms but ... 0 Wed Feb 17 02:26:37 +0000 2016 False 130 25 [] 0 [] 0 [] 0
18607 @NepolitanSunday: #StyleSquadBET. Imma go with... 2 Wed Feb 17 02:26:37 +0000 2016 False 139 18 [] 0 [#StyleSquadBET] 1 [@NepolitanSunday, @johnlegend, @chrissyteigen] 3
18608 what a beautiful little bean louis is !! alway... 0 Wed Feb 17 02:26:37 +0000 2016 False 131 22 [https://t.co/2qI5Iuu7OQ] 1 [] 0 [] 0
18609 Not sure why, but I find this ridiculously fun... 0 Wed Feb 17 02:26:37 +0000 2016 False 85 10 [https://t.co/bcjrVNC2Ig] 1 [] 0 [] 0
18610 @_queenkennedy lol no I know your a humble person 0 Wed Feb 17 02:26:37 +0000 2016 False 49 9 [] 0 [] 0 [@_queenkennedy] 1
18611 @dwolverton13: Look at how long our hair was. ... 1 Wed Feb 17 02:26:37 +0000 2016 False 142 23 [https://t.co/] 1 [] 0 [@dwolverton13] 1
18612 @AmazonUK how early do orders have to be made ... 0 Wed Feb 17 02:26:37 +0000 2016 False 93 19 [] 0 [] 0 [@AmazonUK] 1
18613 @KaliMarcum: Multipurpose Portable Pet Soft Cr... 2 Wed Feb 17 02:26:37 +0000 2016 False 139 15 [https://t.co/yoqk1t6kGK] 1 [#Dogs, #Outdoor] 2 [@KaliMarcum] 1
18614 A Gn text would be nice or a Gm te... 0 Wed Feb 17 02:26:37 +0000 2016 False 61 14 [] 0 [] 0 [] 0
18615 @snbilaj you hate alternative. Remember? 0 Wed Feb 17 02:26:37 +0000 2016 False 40 5 [] 0 [] 0 [@snbilaj] 1
18616 @BellaTwins it was painful for me to sit there... 0 Wed Feb 17 02:26:37 +0000 2016 False 140 22 [] 0 [] 0 [@BellaTwins] 1
18617 The weight of the Administration s gargantua... 0 Wed Feb 17 02:26:37 +0000 2016 False 138 22 [] 0 [] 0 [] 0

22706 rows × 12 columns


In [ ]: