In [1]:
from __future__ import division
from __future__ import print_function
import csv
import datetime as dt
import os
import re
import pandas
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
def csv_to_df(csv_file):
    """Open csv, return Pandas DataFrame."""
    dataframe = pandas.read_csv(csv_file,
                                delimiter='|',
                                error_bad_lines=False,
                                warn_bad_lines=False,
                                )
    return dataframe
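A quick usage sketch, assuming the pipe-delimited tweet files loaded later in this notebook exist:
dataframe_sample = csv_to_df('tweets/tweets_popular.csv')
dataframe_sample.head()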
In [3]:
def make_lowercase(input_str):
    """Lowercase input string, return."""
    return input_str.lower()
In [4]:
def clean_whitespaces(input_str):
    """Use re library to replace all
    whitespace (newlines, etc.) with a simple ' ' space.
    """
    return re.sub(r'\s+', ' ', input_str)
In [5]:
def remove_punctuation(input_str):
    """Remove certain punctuation."""
    # Mirrors the punctuation list used by tokenize_words() below;
    # '. ' and ': ' include a trailing space so URLs stay intact.
    for char in [',', '!', '"', '. ', ': ']:
        input_str = input_str.replace(char, ' ')
    return input_str
In [6]:
stopwords = ['himself', 'very', 'those', 'most', 'this', 'it', 'did', 'be', 'each', 'you', 'was', 'should', 'down', 'if', 'that', 'no', 'itself', 'does', 'under', 'a', 'over', 'about', 'both', 'their', 'who', 'her', 'now', 'which', 'as', 'other', 'too', 'yourselves', 'and', 'why', 'how', 'your', 'into', 'i', 'before', 'by', 'again', 'having', 'during', 'of', 'after', 'against', 'is', 'here', 't', 'above', 'so', 'doing', 'me', 'between', 'are', 'whom', 'ours', 'ourselves', 'he', 'him', 'where', 'because', 'up', 'yours', 'out', 'more', 's', 'nor', 'just', 'then', 'don', 'myself', 'my', 'while', 'these', 'some', 'yourself', 'such', 'on', 'few', 'them', 'until', 'from', 'when', 'our', 'have', 'or', 'theirs', 'off', 'through', 'the', 'same', 'any', 'its', 'not', 'below', 'has', 'had', 'am', 'been', 'will', 'at', 'being', 'there', 'than', 'to', 'she', 'but', 'what', 'for', 'can', 'own', 'an', 'they', 'his', 'with', 'we', 'only', 'in', 'were', 'hers', 'once', 'all', 'further', 'do', 'themselves', 'herself']
def remove_stopwords(input_tokens):
    """Remove common words."""
    return [token for token in input_tokens if token not in stopwords]
In [7]:
# A basic tokenizer
def tokenize_words(input_string):
    """Take a string, return a list of
    strings broken on whitespace, but do
    not break @mentions and URLs.
    Alternative: try something like `[word for word in re.sub('\W', ' ', s).split()]`,
    then strip punctuation that isn't `@` or `#`.
    """
    punctuation = [',', '!', '"', '. ', ': ']
    for char in punctuation:
        input_string = input_string.replace(char, ' ')
    return [w for w in input_string.split(' ') if w]  # rm empty strings
In [8]:
# Check that @mentions and http(s) URLs are not split apart
a_tweet = """@CuteEmergency: "I'm okay!" https://t.co/TWMwjG03Fd"""
tokenize_words(a_tweet)
Out[8]:
['@CuteEmergency', "I'm", 'okay', 'https://t.co/TWMwjG03Fd']
In [9]:
def get_urls(input_tokens):
    """Take a list of token strings, return the tokens
    that start with `http(s)://`.
    Could be done with a list comprehension, too:
    `[w for w in input_tokens if w.startswith('http')]`
    """
    urls = []
    for word in input_tokens:
        if word.startswith('http'):
            urls.append(word)
    return urls
In [10]:
def get_hashtags(input_tokens):
    """Take a list of token strings, return the tokens
    that start with `#`.
    Could be done with a list comprehension, too:
    `[w for w in input_tokens if w.startswith('#')]`
    """
    hashtags = []
    for word in input_tokens:
        if word.startswith('#'):
            hashtags.append(word)
    return hashtags
In [11]:
def get_mentions(input_tokens):
    """Take a list of token strings, return the tokens
    that start with `@`.
    Could be done with a list comprehension, too:
    `[w for w in input_tokens if w.startswith('@')]`
    """
    mentions = []
    for word in input_tokens:
        if word.startswith('@'):
            mentions.append(word)
    return mentions
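A quick check of the three extractors together, on a made-up tweet (expected output shown as comments):
tokens = tokenize_words('RT @user: check #ML https://t.co/abc123')
get_mentions(tokens)  # ['@user']
get_hashtags(tokens)  # ['#ML']
get_urls(tokens)      # ['https://t.co/abc123']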
In [12]:
def add_features_to_df(dataframe):
    """Take DataFrame of tweets, extract some specific
    features and add to returned DataFrame.
    """
    char_count = []
    word_count = []
    urls = []
    url_counts = []
    hashtags = []
    hashtag_counts = []
    mentions = []
    mentions_counts = []
    for i, row in dataframe.iterrows():
        # Text and tokens
        tokens = tokenize_words(row['_text'])
        char_count.append(len(row['_text']))
        word_count.append(len(tokens))
        # URLs
        url_list = get_urls(tokens)
        urls.append(url_list)
        url_counts.append(len(url_list))
        # Hashtags
        hashtag_list = get_hashtags(tokens)
        hashtags.append(hashtag_list)
        hashtag_counts.append(len(hashtag_list))
        # Mentions
        mentions_list = get_mentions(tokens)
        mentions.append(mentions_list)
        mentions_counts.append(len(mentions_list))
    dataframe['_char_count'] = char_count
    dataframe['_word_count'] = word_count
    dataframe['_urls'] = urls
    dataframe['_url_count'] = url_counts
    dataframe['_hashtags'] = hashtags
    dataframe['_hashtag_count'] = hashtag_counts
    dataframe['_mentions'] = mentions
    dataframe['_mentions_count'] = mentions_counts
    return dataframe
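Note that iterrows() is slow on large frames; a sketch of the same extraction with Series.apply (same column names as above) might look like:
tokens = dataframe['_text'].apply(tokenize_words)
dataframe['_char_count'] = dataframe['_text'].str.len()
dataframe['_word_count'] = tokens.apply(len)
dataframe['_urls'] = tokens.apply(get_urls)
dataframe['_url_count'] = dataframe['_urls'].apply(len)
# ... and likewise for hashtags and mentions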
Helpful links:
Brief example: https://github.com/kylepjohnson/lecture_nyc_ascent/blob/master/code_snippets/Example%20-%20Bag%20of%20words%20and%20Pandas%20df%20concat().ipynb.
In [14]:
def make_merge_bow_write(dataframe, save_path):
    """Take a dataframe, extract '_text' and make a Bag of Words.
    Write the BoW features to their own file, then merge with the input
    and return the new dataframe.
    TODO: Revisit options for CountVectorizer() (lowercase, tokenizer, min freq)
    """
    # Get a list of strings, for input into the vectorizer
    text_list = dataframe['_text'].tolist()
    # Set up the vectorizer
    # Note that min_df is confusing; see http://stackoverflow.com/a/27697863
    # min_df with an integer n: if a word is found in fewer than n docs, ignore it
    vectorizer = CountVectorizer(min_df=2)
    term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document
    # Put the BoW vectors into a new df
    dataframe_bow = pandas.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names())
    # Write the BoW to disk
    # Just the Bag of Words, in case we want to use it by itself later
    # TODO! Add '_popular' column to this, or ditch this csv altogether
    dataframe_bow.to_csv(save_path, sep='|', encoding='utf-8')
    # Merge the BoW df with the original feature-table df
    # Important: make sure concat() uses the original id index of the first, text dataframe
    dataframe = pandas.concat([dataframe, dataframe_bow], axis=1, join_axes=[dataframe.index])
    return dataframe
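Note: the join_axes argument was removed in pandas 1.0. On newer pandas, the same alignment can be sketched as (assuming a unique index, e.g. after dataframe.reset_index(drop=True)):
dataframe = pandas.concat([dataframe, dataframe_bow], axis=1).reindex(dataframe.index)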
In [15]:
def make_merge_bow(dataframe):
    """Take a dataframe, extract '_text' and make a Bag of Words.
    Return the BoW features as their own dataframe; writing and
    merging are left to the caller (cf. make_merge_bow_write above).
    TODO: Revisit options for CountVectorizer() (lowercase, tokenizer, min freq)
    """
    # Get a list of strings, for input into the vectorizer
    text_list = dataframe['_text'].tolist()
    # Set up the vectorizer
    # Note that min_df is confusing; see http://stackoverflow.com/a/27697863
    # min_df with an integer n: if a word is found in fewer than n docs, ignore it
    vectorizer = CountVectorizer(min_df=2)
    term_document_matrix = vectorizer.fit_transform(text_list)  # input is a list of strings, 1 per document
    # Put the BoW vectors into a new df
    dataframe_bow = pandas.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names())
    return dataframe_bow
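A toy illustration of the min_df=2 behavior noted in the comments above, on made-up documents: terms appearing in fewer than two documents are dropped from the vocabulary (expected output shown as a comment):
docs = ['good tweet', 'good dog', 'rare word']
CountVectorizer(min_df=2).fit(docs).get_feature_names()  # ['good']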
In [18]:
def make_all_features_for_tweets():
    """Do all the steps to create one feature
    table of popular and unpopular tweets.
    """
    print('Starting feature extraction ...')
    t0 = dt.datetime.utcnow()
    # Make sure 'feature_tables' is present
    features_dir = 'feature_tables'
    if not os.path.isdir(features_dir):
        os.mkdir(features_dir)
    # Load csvs into dfs
    dataframe_popular = csv_to_df('tweets/tweets_popular.csv')
    dataframe_not_popular = csv_to_df('tweets/tweets_not_popular.csv')
    # Remove dupes
    dataframe_popular = dataframe_popular.drop_duplicates()
    dataframe_not_popular = dataframe_not_popular.drop_duplicates()
    # Add a boolean '_popular' column to each df
    dataframe_popular['_popular'] = True
    dataframe_not_popular['_popular'] = False
    # Append the unpopular df to the popular df
    dataframe = pandas.concat([dataframe_popular, dataframe_not_popular])
    # Extract features from the df, add them back to the df
    dataframe = add_features_to_df(dataframe)
    # Write the df, now with basic extracted features, to .csv
    dataframe.to_csv('feature_tables/basics.csv', sep='|', encoding='utf-8')
    # Make the BoW df, then write it to .csv
    #dataframe_bow = make_merge_bow(dataframe)
    # Just the Bag of Words, in case we want to use it by itself later
    #dataframe_bow.to_csv('feature_tables/bow.csv', sep='|', encoding='utf-8')
    # Merge the BoW df with the original feature-table df
    # Important: make sure concat() uses the original id index of the first, text df
    #dataframe = pandas.concat([dataframe, dataframe_bow], axis=1, join_axes=[dataframe.index])
    #dataframe.to_csv('feature_tables/all.csv', sep='|', encoding='utf-8')
    print('... completed in {}.'.format(dt.datetime.utcnow() - t0))
    print('Total (rows, columns):', dataframe.shape)
    return dataframe
In [19]:
df = make_all_features_for_tweets()
In [26]:
list(df.columns.values)
Out[26]:
In [30]:
df
Out[30]:
In [ ]: