See the README for an explanation of how this code runs and functions.
Contact michaeldezube at gmail dot com with questions.
In [ ]:
from __future__ import print_function
from __future__ import division
import copy
import json
import re
import string
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn # To improve the chart styling.
import wordtree
from IPython.display import display
from IPython.display import HTML
from IPython.display import Javascript
from wordcloud import STOPWORDS
from wordcloud import WordCloud
import ipywidgets as widgets
import iphone_connector
In [ ]:
%matplotlib inline
matplotlib.style.use('ggplot')
pd.set_option('display.max_colwidth', 1000)
iphone_connector.initialize()
fully_merged_messages_df, address_book_df = iphone_connector.get_cleaned_fully_merged_messages()
full_names = set(address_book_df.full_name) # Handy set to check for misspellings later on.
fully_merged_messages_df.full_name.replace('nan nan nan', 'Unknown', inplace=True)
WORDS_PER_PAGE = 450 # Based upon http://wordstopages.com/
print('\nTotal pages if all texts were printed: {0:,d} (Arial size 12, single spaced)\n'.format(
sum(fully_merged_messages_df.text.apply(lambda x: len(x.split())))//WORDS_PER_PAGE))
In [ ]:
fully_merged_messages_df = fully_merged_messages_df.reset_index(drop=True)
fully_merged_messages_df
In [ ]:
address_book_df
Use fully_merged_messages_df and address_book_df for analysis; they contain all messages (with a column identifying the sender) and all of your contacts, respectively.
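For example, here is a quick ad-hoc slice of the messages dataframe (a minimal sketch; it assumes the cells above have run, and 'Mom' is purely a placeholder contact name):
In [ ]:
# Example ad-hoc query: the five most recent messages exchanged with one contact.
# Replace 'Mom' with any full_name that appears in address_book_df.
example_contact = 'Mom'
recent_with_contact = fully_merged_messages_df[fully_merged_messages_df.full_name == example_contact]
recent_with_contact.sort_values('date', ascending=False).head(5)[['date', 'is_from_me', 'text']]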
In [ ]:
def plot_year_month_heatmap(df, trim_incomplete=True, search_term=None, figsize=(18, 10)):
"""Plots a heatmap of the dataframe grouped by year and month.
Args:
df: The dataframe, must contain a column named `date`.
trim_incomplete: If true, don't plot rows that lack 12 full months of data. Default True.
search_term: A case insensitive term to require in all rows of the dataframe's `text`
column. Default None.
figsize: The size of the plot as a tuple. Default (18, 10);
"""
if search_term:
df = df[df['text'].str.contains(search_term, case=False)]
month_year_messages = pd.DataFrame(df['date'])
month_year_messages['year'] = month_year_messages.apply(lambda row: row.date.year, axis=1)
month_year_messages['month'] = month_year_messages.apply(lambda row: row.date.month, axis=1)
    month_year_messages_pivot = month_year_messages.pivot_table(index='year',
                                                                 columns='month',
                                                                 values='date',
                                                                 aggfunc='count')
if trim_incomplete:
month_year_messages_pivot = month_year_messages_pivot[month_year_messages_pivot.count(axis=1) == 12]
if month_year_messages_pivot.shape[0] == 0:
print('After trimming rows that didn\'t have 12 months, no rows remained, bailing out.')
return
f, ax = plt.subplots(figsize=figsize)
seaborn.heatmap(month_year_messages_pivot, annot=True, fmt=".0f", square=True, cmap="YlGnBu", ax=ax)
# Plot all text messages exchanged over the years.
plot_year_month_heatmap(fully_merged_messages_df)
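The heatmap can also be restricted to messages containing a given term; for example (the search term below is arbitrary, swap in any word you're curious about):
In [ ]:
# Same heatmap, but only counting messages that mention a specific (case-insensitive) term.
# trim_incomplete=False keeps partial years, since a rarer term may not appear in every month.
plot_year_month_heatmap(fully_merged_messages_df, trim_incomplete=False, search_term='dinner')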
In [ ]:
# Helper method to better support py2 and py3.
def convert_unicode_to_str_if_needed(unicode_or_str):
if type(unicode_or_str).__name__ == 'unicode':
return unicode_or_str.encode('utf-8')
return unicode_or_str
In [ ]:
# Note "Unknown" means the number was not found in your address book.
def get_message_counts(dataframe):
return pd.Series({'Texts sent': dataframe[dataframe.is_from_me == 1].shape[0],
'Texts received': dataframe[dataframe.is_from_me == 0].shape[0],
'Texts exchanged': dataframe.shape[0]})
messages_grouped = fully_merged_messages_df.groupby('full_name').apply(get_message_counts)
messages_grouped = messages_grouped.sort_values(by='Texts exchanged', ascending=False)
widgets.interact(messages_grouped.head,
n=widgets.IntSlider(min=5, max=50, step=1, value=5, continuous_update=False,
description='Number of people to show:'))
In [ ]:
# Helper method so we can wrap it with interact().
def _plot_most_common_text(top_n=10):
messages_grouped.head(top_n).plot(figsize=(20,10), kind='bar')
widgets.interact(_plot_most_common_text,
top_n=widgets.IntSlider(min=5, max=100, step=1, value=5, continuous_update=False,
description='Number of people to show:'))
In [ ]:
# Restrict to the top N people you text the most so the streamgraph is legible.
TOP_N = 10 # Freely change this value.
sliced_df = fully_merged_messages_df[fully_merged_messages_df.full_name.isin(messages_grouped.head(TOP_N).index)]
grouped_by_month = sliced_df.groupby([
sliced_df.apply(lambda x: x.date.strftime('%Y/%m'), axis=1),
'full_name']
)['text'].count().to_frame()
grouped_by_month = grouped_by_month.sort_index()
# We create a dense dataframe with a row for every year/month combination so that even if a person
# didn't text in a given year/month we have a 0, and the streamgraph can properly graph the value.
grouped_by_month_dense = grouped_by_month.unstack().fillna(0).stack()
# Dump the dataframe to a global JS variable so we can access it in our JS code.
# TODO(mdezube): Dump out as JSON instead.
formatted_for_steamgraph = grouped_by_month_dense.reset_index(level=1)
formatted_for_steamgraph.index.name = 'date'
formatted_for_steamgraph.columns = ['key', 'value']
Javascript("window.csvAsString='{}'".format(formatted_for_steamgraph.to_csv(index_label='date').replace('\n', '\\n')))
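If you want to sanity-check the data being handed off to the JavaScript below, you can preview the first few lines of the CSV string (purely an optional inspection step):
In [ ]:
# Peek at the date/key/value rows the streamgraph JS will consume.
print(formatted_for_steamgraph.to_csv(index_label='date')[:300])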
In [ ]:
%%javascript
// Draw the streamgraph using d3.
element.append('<div class="chart" style="height:600px; width:100%"></div>')
element.append('<style>.axis path, .axis line' +
'{fill: none; stroke: #000;stroke-width: 2px; shape-rendering: crispEdges;}' +
'</style>')
element.append("<script src='d3.min.js'></script>")
element.append("<script src='colorbrewer.min.js'></script>")
element.append("<script src='steamgraph.js'></script>")
// Choose your favorite from https://bl.ocks.org/mbostock/5577023
var colorBrewerPalette = "Spectral";
// Set a timeout to let the JS scripts actually load into memory, this is a bit of a hack but works reliably.
setTimeout(function(){createSteamgraph(csvAsString, colorBrewerPalette)}, 200);
In [ ]:
def generate_cloud(texts, max_words=30):
# Add more words here if you want to ignore them:
my_stopwords = STOPWORDS.copy()
my_stopwords.update(['go', 'ya', 'come', 'back', 'good', 'sound'])
words = ' '.join(texts).lower()
wordcloud = WordCloud(font_path='CabinSketch-Bold.ttf',
stopwords=my_stopwords,
background_color='black',
width=800,
height=600,
relative_scaling=1,
max_words=max_words
).generate_from_text(words)
print('Based on {0:,} texts'.format(len(texts)))
fig, ax = plt.subplots(figsize=(15,10))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()
In [ ]:
# Word cloud of the top words I use, based on my most recent 30,000 sent messages.
texts_from_me = fully_merged_messages_df[fully_merged_messages_df.is_from_me == 1].text[-30000:]
widgets.interact(
generate_cloud,
texts=widgets.fixed(texts_from_me),
max_words=widgets.IntSlider(min=5,max=50,step=1,value=10, continuous_update=False,
description='Max words to show:'))
In [ ]:
def _word_cloud_specific_contact(max_words, from_me, contact):
contact = convert_unicode_to_str_if_needed(contact)
if contact not in full_names:
print('{} not found'.format(contact))
return
sliced_df = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact) &
(fully_merged_messages_df.is_from_me == from_me)].text
generate_cloud(sliced_df, max_words)
widgets.interact(
_word_cloud_specific_contact,
max_words=widgets.IntSlider(min=5, max=50, step=1, value=10,
continuous_update=False, description='Max words to show:'),
from_me=widgets.RadioButtons(
options={'Show messages FROM me': True, 'Show messages TO me': False}, description=' '),
contact=widgets.Text(value='Mom', description='Contact name:')
)
In [ ]:
# Note this requires an internet connection to load Google's JS library.
def get_json_for_word_tree(contact):
df = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact)]
print('Exchanged {0:,} texts with {1}'.format(df.shape[0], contact))
array_for_json = [[text[1]] for text in df.text.iteritems()]
array_for_json.insert(0, [['Phrases']])
return json.dumps(array_for_json)
CONTACT_NAME = 'Mom'
ROOT_WORD = 'feel'
HTML(wordtree.get_word_tree_html(get_json_for_word_tree(CONTACT_NAME),
ROOT_WORD.lower(),
lowercase=True,
tree_type='double'))
In [ ]:
punctuation = copy.copy(string.punctuation)
punctuation += u'“”‘’\ufffc\uff0c' # Include some UTF-8 punctuation that occurred.
punct_regex = re.compile(u'[{0}]'.format(punctuation))
spaces_regex = re.compile(r'\s{2,}')
numbers_regex = re.compile(r'\d+')
def clean_text(input_str):
processed = input_str.lower()
processed = punct_regex.sub('', processed)
# Also try: processed = numbers_regex.sub('_NUMBER_', processed)
processed = numbers_regex.sub('', processed)
processed = spaces_regex.sub(' ', processed)
return processed
# The stock STOPWORDS list contains entries like "i'll"; run them through clean_text so they
# match the cleaned message tokens (e.g. "ill").
processed_stopwords = [clean_text(word) for word in STOPWORDS]
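As a quick sanity check, here is clean_text applied to a made-up message (the example string is just an illustration):
In [ ]:
# clean_text lowercases, strips punctuation and digits, and collapses repeated whitespace.
print(clean_text(u"I'll call you at 10:30... OK?!"))  # Prints: ill call you at ok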
In [ ]:
# Group the texts by person and collapse them into a single string per person.
grouped_by_name = fully_merged_messages_df[fully_merged_messages_df.is_from_me == 0].groupby(
'full_name')['text'].apply(lambda x: ' '.join(x)).to_frame()
grouped_by_name.info(memory_usage='deep')
grouped_by_name.head(1)
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import numpy as np
vectorizer = TfidfVectorizer(preprocessor=clean_text,
tokenizer=tokenize.WordPunctTokenizer().tokenize,
stop_words=processed_stopwords,
ngram_range=(1, 2), max_df=.9, max_features=50000)
tfidf_transformed_dataset = vectorizer.fit_transform(grouped_by_name.text)
word_list = pd.Series(vectorizer.get_feature_names())
print('TFIDF sparse matrix is {0:.2f}MB'.format(tfidf_transformed_dataset.data.nbytes / 1024 / 1024))
print('TFIDF matrix has shape: {0}'.format(tfidf_transformed_dataset.shape))
In [ ]:
def get_word_summary_for_contact(contact, top_n=25):
contact = convert_unicode_to_str_if_needed(contact)
tfidf_record = _get_tfidf_record_for_contact(contact)
if tfidf_record is None:
print('"{0}" was not found.'.format(contact))
return
sorted_indices = tfidf_record.argsort()[::-1]
return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)
def get_word_summary_for_diffs(contact, other_contact, top_n=25):
contact = convert_unicode_to_str_if_needed(contact)
other_contact = convert_unicode_to_str_if_needed(other_contact)
tfidf_record_contact = _get_tfidf_record_for_contact(contact)
tfidf_record_other_contact = _get_tfidf_record_for_contact(other_contact)
if tfidf_record_contact is None or tfidf_record_other_contact is None:
# Print out the first contact not found.
contact_not_found = contact if tfidf_record_contact is None else other_contact
print('"{0}" was not found.'.format(contact_not_found))
return
sorted_indices = (tfidf_record_contact - tfidf_record_other_contact).argsort()[::-1]
return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)
# Returns the row in the TFIDF matrix for a given contact by name.
def _get_tfidf_record_for_contact(contact):
if contact not in grouped_by_name.index:
return None
row = np.argmax(grouped_by_name.index == contact)
return tfidf_transformed_dataset.getrow(row).toarray().squeeze()
In [ ]:
widgets.interact(
get_word_summary_for_contact,
contact=widgets.Text(value='Mom', description='Contact name:', placeholder='Enter name'),
    top_n=widgets.IntSlider(min=10, max=100, step=1, value=25, description='Max words to show:')
)
In [ ]:
widgets.interact(
get_word_summary_for_diffs,
contact=widgets.Text(description='1st Contact:', placeholder='Enter 1st name'),
other_contact=widgets.Text(description='2nd Contact:', placeholder='Enter 2nd name'),
    top_n=widgets.IntSlider(description='Max words to show:', min=10, max=100, step=1, value=25)
)
In [ ]:
def top_words_by_year_from_tfidf(tfidf_by_year, years_as_list, top_n=15):
"""
Returns a dataframe of the top words for each year by their TFIDF score.
To determine the "top", we look at one year's TFIDF - avg(other years' TFIDFs)
Args:
tfidf_by_year: TFIDF matrix with as many rows as entries in years_as_list
years_as_list: Years that are represented in the TFIDF matrix
top_n: Number of top words per year to include in the result
"""
# Densify the tfidf matrix so we can operate on it.
tfidf_by_year_dense = tfidf_by_year.toarray()
df_by_year = []
for i in range(tfidf_by_year_dense.shape[0]):
this_year = years_as_list[i]
tfidf_this_year = tfidf_by_year_dense[i]
tfidf_other_years = np.delete(tfidf_by_year_dense, i, axis=0).mean(axis=0)
sorted_indices = (tfidf_this_year - tfidf_other_years).argsort()[::-1]
df = pd.DataFrame({this_year: word_list.iloc[sorted_indices[:top_n]]})
df = df.reset_index(drop=True)
df_by_year.append(df)
return pd.concat(df_by_year, axis=1)
def top_words_by_year_from_df(slice_of_texts_df, top_n=15, min_texts_required=100):
"""Returns a dataframe of the top words for each year by their TFIDF score.
Top is determined by the `top_words_by_year_from_tfidf` method.
Args:
slice_of_texts_df: A dataframe with the text messages to process
top_n: Number of top words per year to include in the result
min_texts_required: Number of texts to require in each year to not drop the record
"""
    grouped_by_year_tfidf, years = _tfidf_by_year(slice_of_texts_df, min_texts_required)
    if grouped_by_year_tfidf is None:
        return
    return top_words_by_year_from_tfidf(grouped_by_year_tfidf, years, top_n)
def _tfidf_by_year(slice_of_texts_df, min_texts_required=100):
"""Returns a TFIDF matrix of the texts grouped by year.
    Years with fewer than `min_texts_required` texts will be dropped.
"""
grouper = slice_of_texts_df.date.apply(lambda x: x.year)
grouped_by_year = slice_of_texts_df.groupby(grouper).apply(
lambda row: pd.Series({'count': len(row.date), 'text': ' '.join(row.text)})
)
    # Drop years with fewer than min_texts_required texts since they won't be very meaningful.
    years_to_drop = grouped_by_year[grouped_by_year['count'] < min_texts_required].index
    if len(years_to_drop) > 0:
        print('Dropping year(s): {0}, each had fewer than {1} texts.'.format(
            ', '.join(str(year) for year in years_to_drop), min_texts_required))
    grouped_by_year = grouped_by_year[grouped_by_year['count'] >= min_texts_required]
grouped_by_year.index.name = 'year'
    if grouped_by_year.shape[0] == 0:
        print('Bailing out, no years found with at least {0} texts.'.format(min_texts_required))
        return None, None
grouped_by_year_tfidf = vectorizer.transform(grouped_by_year['text'])
    print('Found {0} years with at least {1} texts each.'.format(grouped_by_year_tfidf.shape[0],
                                                                  min_texts_required))
return grouped_by_year_tfidf, grouped_by_year.index
In [ ]:
top_words_by_year_from_df(fully_merged_messages_df[fully_merged_messages_df.is_from_me == 1],
top_n=15)
In [ ]:
# Wrapper method so we can use interact().
def _top_words_by_year_for_contact(contact, from_me, top_n):
contact = convert_unicode_to_str_if_needed(contact)
if contact not in full_names:
print('"{0}" not found'.format(contact))
return
# Slice to texts from/to the contact.
df = fully_merged_messages_df[(fully_merged_messages_df.is_from_me == from_me) &
(fully_merged_messages_df.full_name == contact)]
return top_words_by_year_from_df(df, top_n)
widgets.interact(
_top_words_by_year_for_contact,
contact=widgets.Text(value='Mom', description='Contact name:', placeholder='Enter name'),
from_me=widgets.RadioButtons(
options={'Show messages FROM me': True, 'Show messages TO me': False}, description=' '),
    top_n=widgets.IntSlider(min=15, max=100, step=1, value=15, description='Max words to show:')
)
In [ ]:
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
def _top_words_by_cluster_from_tfidf(
cluster_id,
tfidf_per_sender,
cluster_for_tfidf_index,
top_n=15,
):
"""
Returns a dataframe of the top words for each cluster by their TFIDF score.
To determine the "top", we look at one cluster's TFIDF - avg(other clusters' TFIDFs)
Args:
cluster_id: The cluster we want to find the top words for (referred to as "given cluster")
tfidf_per_sender: TFIDF matrix with as many rows as entries in cluster_for_tfidf_index
cluster_for_tfidf_index: Cluster assignment for each entry in tfidf_per_sender
top_n: Number of top words per cluster to include in the result
"""
# First, we separate the given cluster we want to consider from all other entries.
this_cluster_records = tfidf_per_sender[cluster_for_tfidf_index == cluster_id]
other_cluster_records = tfidf_per_sender[cluster_for_tfidf_index != cluster_id]
# Next, we calculate the mean for each: the given cluster and the rest of the corpus
mean_this_cluster = np.asarray(this_cluster_records.mean(axis=0)).squeeze()
mean_other_cluster = np.asarray(other_cluster_records.mean(axis=0)).squeeze()
# Finally, we identify the words for which the given cluster shows the biggest difference.
difference = mean_this_cluster - mean_other_cluster
    most_different_indices = difference.argsort()
    # Only return the top_n most different words.
    return most_different_indices[::-1][:top_n]
def _tfidf_by_sender(messages_df, min_texts_required=100):
"""Returns a TFIDF matrix of the texts grouped by sender.
    Message exchanges with fewer than `min_texts_required` texts will be dropped.
"""
# First we group messages by name, then we merge each conversation into one string.
grouped_by_name = messages_df.groupby("full_name").apply(
lambda row: pd.Series({'count': len(row.full_name), 'text': ' '.join(row.text)})
)
# Drop all conversations that don't meet the requirements for minimum number of messages.
grouped_by_name = grouped_by_name[grouped_by_name['count'] >= min_texts_required]
grouped_by_name.index.name = 'full_name'
    # Bail if we have no data.
    if grouped_by_name.shape[0] == 0:
        print('Bailing out, no conversations found with at least {0} texts.'.format(min_texts_required))
        return None, None
grouped_by_name_tfidf = vectorizer.transform(grouped_by_name['text'])
    print('Found {0} conversations with at least {1} texts each.'.format(grouped_by_name_tfidf.shape[0],
                                                                          min_texts_required))
return grouped_by_name_tfidf, grouped_by_name.index
# Get the TFIDF vector for each data point and the list of receivers.
tfidf_per_sender, names_sender = _tfidf_by_sender(fully_merged_messages_df[fully_merged_messages_df.is_from_me == 0])
# First, we reduce the dimensionality of the dataset. Clustering in this reduced space keeps the
# KMeans clusters more consistent with the 2D projection of the data we graph below.
tfidf_sender_reduced_dim = TruncatedSVD(n_components=7).fit_transform(tfidf_per_sender)
# Let's run KMeans clustering on the data.
NUMBER_OF_CLUSTERS = 7
kmeans_tfidf_sender = KMeans(n_clusters=NUMBER_OF_CLUSTERS)
tfidf_per_sender_cluster_assignment = kmeans_tfidf_sender.fit_predict(tfidf_sender_reduced_dim)
# We further reduce the dimensionality of the data, so that we can graph it.
tfidf_per_sender_2d = TruncatedSVD(n_components=2).fit_transform(tfidf_sender_reduced_dim)
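NUMBER_OF_CLUSTERS = 7 above is a judgment call. If you'd like a data-driven starting point, one option is to compare KMeans inertia over a range of cluster counts and look for an elbow; a minimal sketch reusing the reduced matrix from above:
In [ ]:
# Optional: inertia (within-cluster sum of squares) for several candidate cluster counts.
# Inertia always drops as clusters are added; look for the "elbow" where improvement levels off.
candidate_ks = range(2, 12)
inertias = [KMeans(n_clusters=k).fit(tfidf_sender_reduced_dim).inertia_ for k in candidate_ks]
pd.Series(inertias, index=list(candidate_ks)).plot(figsize=(10, 5), title='KMeans inertia by cluster count')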
In [ ]:
clustered_tfidf_by_sender_df = pd.DataFrame({
"x": tfidf_per_sender_2d[:,0],
"y": tfidf_per_sender_2d[:,1],
"name": names_sender,
"group": ["Cluster: " + str(e) for e in tfidf_per_sender_cluster_assignment],
})
clustered_tfidf_by_sender_df.head()
In [ ]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
clusters = clustered_tfidf_by_sender_df.group.unique()
def plot_data(cluster_selection):
traces = []
top_words = None
if cluster_selection == "All":
clusters_to_plot = clusters
else:
clusters_to_plot = [cluster_selection]
top_words_indexes = _top_words_by_cluster_from_tfidf(
int(cluster_selection[-1]),
tfidf_per_sender,
tfidf_per_sender_cluster_assignment
)[0:10]
top_words = word_list.iloc[top_words_indexes].to_frame()
top_words.columns = ['Top Words In Cluster']
top_words = top_words.reset_index(drop=True)
for cluster in clusters_to_plot:
cluster_data = clustered_tfidf_by_sender_df[clustered_tfidf_by_sender_df.group == cluster]
scatter = go.Scatter(
x=cluster_data["x"],
y=cluster_data["y"],
text=cluster_data["name"],
mode = 'markers',
name=cluster
)
traces.append(scatter)
py.iplot(traces)
return top_words
cluster_selection = widgets.Dropdown(
options=["All"] + list(clusters),
value="All",
description="Cluster: "
)
print('We\'ve clustered your contacts by their word usage. Hover over the dots to see which '
      'cluster each person is in, and adjust the dropdown to restrict the plot to a single cluster.\n'
      'Dots that are closer together indicate people who use similar language.')
widgets.interact(
plot_data,
cluster_selection=cluster_selection,
)
display(cluster_selection)