June 21 - June 30, 2016
This dataset is quite small, at only ~10,000 tweets, so it is much harder to find semantically interesting results than with the charisma dataset. It may still be useful for people interested specifically in Brexit.
In [1]:
import sys
sys.path.append('..')
from twords.twords import Twords
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
# A max_colwidth of -1 turns off column-width truncation, so entire tweets are
# visible when a dataframe is displayed.
# NOTE(review): pandas >= 1.0 deprecates -1 here in favour of None — confirm
# against the installed pandas version before upgrading.
pd.set_option('display.max_colwidth', -1)
In [2]:
# Build the Twords analysis object and point it at the inputs for this notebook.
twit = Twords()
# Location of the collected Brexit tweets, relative to this notebook.
twit.data_path = "../data/java_collector/brexit"
# Background word-frequency table (a general Twitter corpus, going by the file name);
# presumably the reference for the "background occurrences" column used in the
# plots further down — verify in twords.twords.
twit.background_path = '../jar_files_and_background/freq_table_72319443_total_words_twitter_corpus.csv'
# NOTE(review): presumably reads background_path, so it has to run after the
# assignment above — confirm before reordering.
twit.create_Background_dict()
# Search term(s) for this dataset; the same term is passed to
# keep_tweets_with_terms("brexit") later in the notebook.
twit.set_Search_terms(["brexit"])
# NOTE(review): presumably builds the stop-word list from the search terms set
# on the previous line — confirm in twords.twords.
twit.create_Stop_words()
In [3]:
# Load the collected tweets; twit.tweets_df is inspected in the following cell.
# Presumably reads the CSV files under twit.data_path — verify in twords.twords.
twit.get_java_tweets_from_csv_list()
In [4]:
# Report how many tweets are in the original dataset, before the cleaning and
# filtering cells that follow.
# A single pre-formatted string inside a parenthesised call prints the same
# text under Python 2 (print statement) and Python 3 (print function); the
# original bare `print a, b` form is a SyntaxError on Python 3.
print("Total number of tweets: %d" % len(twit.tweets_df))
In [5]:
# Presumably preserves an unmodified copy of each tweet's text before the
# cleaning cells below alter it (inferred from the method name — confirm in
# twords.twords).
twit.keep_column_of_original_tweets()
In [6]:
# Presumably lowercases the tweet text so later matching is case-insensitive
# (inferred from the method name — confirm in twords.twords).
twit.lower_tweets()
In [7]:
# NOTE(review): presumably drops rows whose tweet text is not a unicode string
# (e.g. missing values) — inferred from the method name; confirm in twords.twords.
twit.keep_only_unicode_tweet_text()
In [8]:
# Presumably strips URLs out of the tweet text (inferred from the method name —
# confirm in twords.twords).
twit.remove_urls_from_tweets()
In [9]:
# Presumably strips punctuation from the tweet text (inferred from the method
# name — confirm in twords.twords). Runs after URL removal in this notebook.
twit.remove_punctuation_from_tweets()
In [10]:
# Presumably removes non-ASCII characters from the tweet text (inferred from the
# method name — confirm in twords.twords).
twit.drop_non_ascii_characters_from_tweets()
In [11]:
# Presumably drops duplicated tweets (inferred from the method name); which
# column defines a duplicate is not visible here — confirm in twords.twords.
twit.drop_duplicate_tweets()
In [12]:
# NOTE(review): presumably drops tweets whose author/user name contains a search
# term, so that matches come from the tweet text itself — inferred from the
# method name; confirm in twords.twords.
twit.drop_by_search_in_name()
In [13]:
# Presumably normalises the tweet date values to one standard format ahead of
# the date sort in the next cell (inferred from the method name — confirm in
# twords.twords).
twit.convert_tweet_dates_to_standard()
In [14]:
# Presumably orders the tweets chronologically (inferred from the method name;
# sort direction not visible here — confirm in twords.twords).
twit.sort_tweets_by_date()
In [15]:
# Number of tweets left after the cleaning cells above; compare with the count
# taken after the "brexit" term filter below.
len(twit.tweets_df)
Out[15]:
In [16]:
# Restrict the dataset to tweets containing the term "brexit" — presumably
# matched against the cleaned tweet text; confirm in twords.twords.
twit.keep_tweets_with_terms("brexit")
In [17]:
# Number of tweets left after the "brexit" term filter in the previous cell.
len(twit.tweets_df)
Out[17]:
In [18]:
# Build the word statistics that end up in twit.word_freq_df (used by every
# cell below). The three calls are kept in this order: each presumably consumes
# the result of the one before it — confirm in twords.twords.
twit.create_word_bag()
twit.make_nltk_object_from_word_bag()
# NOTE(review): 1000 is presumably the number of top words to include in the
# frequency table — confirm the parameter's meaning in twords.twords.
twit.create_word_freq_df(1000)
In [19]:
# Order the frequency table by its "log relative frequency" column, highest
# first, and keep that ordering on the Twords object for later cells.
# The attribute is rebound to the sorted copy instead of using
# sort_values(inplace=True): pandas discourages inplace operations, and an
# explicit assignment makes the state change visible to the reader.
twit.word_freq_df = twit.word_freq_df.sort_values(
    "log relative frequency", ascending=False
)
# Preview the 20 highest-ranked words.
twit.word_freq_df.head(20)
Out[19]:
In [20]:
# Horizontal bar chart of the words with the highest "log relative frequency",
# limited to words that occur more than `background_cutoff` times in the
# background table (filters out words too rare in the background to trust).
num_words_to_plot = 32
background_cutoff = 100

freq_df = twit.word_freq_df
top_words = (
    freq_df[freq_df["background occurrences"] > background_cutoff]
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"]
    # Ascending sort + last N rows: the largest value ends up as the top bar.
    # .iloc makes the positional slice explicit on the word-labelled index.
    .iloc[-num_words_to_plot:]
)

# Use the Axes returned by pandas. The original fetched it afterwards with a
# bare plt.axes(), which on current Matplotlib adds a NEW full-window Axes, so
# the grid was drawn on an empty Axes laid over the chart.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot / 2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4)
In [21]:
# Horizontal bar chart of the words with the highest "log relative frequency",
# limited to words that occur more than `background_cutoff` times in the
# background table (here a stricter cutoff of 500).
num_words_to_plot = 32
background_cutoff = 500

freq_df = twit.word_freq_df
top_words = (
    freq_df[freq_df["background occurrences"] > background_cutoff]
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"]
    # Ascending sort + last N rows: the largest value ends up as the top bar.
    # .iloc makes the positional slice explicit on the word-labelled index.
    .iloc[-num_words_to_plot:]
)

# Use the Axes returned by pandas. The original fetched it afterwards with a
# bare plt.axes(), which on current Matplotlib adds a NEW full-window Axes, so
# the grid was drawn on an empty Axes laid over the chart.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot / 2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4)
In [22]:
# Horizontal bar chart of the words with the highest "log relative frequency",
# limited to words that occur more than `background_cutoff` times in the
# background table (here a cutoff of 2000).
num_words_to_plot = 32
background_cutoff = 2000

freq_df = twit.word_freq_df
top_words = (
    freq_df[freq_df["background occurrences"] > background_cutoff]
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"]
    # Ascending sort + last N rows: the largest value ends up as the top bar.
    # .iloc makes the positional slice explicit on the word-labelled index.
    .iloc[-num_words_to_plot:]
)

# Use the Axes returned by pandas. The original fetched it afterwards with a
# bare plt.axes(), which on current Matplotlib adds a NEW full-window Axes, so
# the grid was drawn on an empty Axes laid over the chart.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot / 2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4)
In [23]:
# Horizontal bar chart of the words with the highest "log relative frequency",
# limited to words that occur more than `background_cutoff` times in the
# background table.
# NOTE(review): these parameters (32 words, cutoff 2000) are the same as in the
# preceding plot cell, so this cell reproduces that chart — presumably a
# different cutoff was intended; confirm or remove the cell.
num_words_to_plot = 32
background_cutoff = 2000

freq_df = twit.word_freq_df
top_words = (
    freq_df[freq_df["background occurrences"] > background_cutoff]
    .sort_values("log relative frequency", ascending=True)
    .set_index("word")["log relative frequency"]
    # Ascending sort + last N rows: the largest value ends up as the top bar.
    # .iloc makes the positional slice explicit on the word-labelled index.
    .iloc[-num_words_to_plot:]
)

# Use the Axes returned by pandas. The original fetched it afterwards with a
# bare plt.axes(), which on current Matplotlib adds a NEW full-window Axes, so
# the grid was drawn on an empty Axes laid over the chart.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot / 2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4)
In [ ]: