See the README for an explanation of how this code runs and functions.

Contact michaeldezube at gmail dot com with questions.

Imports


In [ ]:
from __future__ import print_function
from __future__ import division

import copy
import json
import re
import string

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn  # To improve the chart styling.
import wordtree

from IPython.display import display
from IPython.display import HTML
from IPython.display import Javascript
from wordcloud import STOPWORDS
from wordcloud import WordCloud
import ipywidgets as widgets

import iphone_connector

Load the data from disk and set up the dataframes


In [ ]:
%matplotlib inline
matplotlib.style.use('ggplot')
pd.set_option('display.max_colwidth', 1000)
iphone_connector.initialize()

fully_merged_messages_df, address_book_df = iphone_connector.get_cleaned_fully_merged_messages()
full_names = set(address_book_df.full_name)  # Handy set to check for misspellings later on.
fully_merged_messages_df.full_name.replace('nan nan nan', 'Unknown', inplace=True)

WORDS_PER_PAGE = 450  # Based upon http://wordstopages.com/
print('\nTotal pages if all texts were printed: {0:,d} (Arial size 12, single spaced)\n'.format(
    sum(fully_merged_messages_df.text.apply(lambda x: len(x.split())))//WORDS_PER_PAGE))

In [ ]:
fully_merged_messages_df = fully_merged_messages_df.reset_index(drop=True)
fully_merged_messages_df

In [ ]:
address_book_df

Use fully_merged_messages_df and address_book_df for analysis; they contain all messages (with a column identifying the sender) and all of your contacts, respectively
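
If you want to poke around manually first, you can slice the merged dataframe directly. A minimal example, assuming a contact named 'Mom' exists in your address book (check against the full_names set from above):


In [ ]:
# Preview the last few messages exchanged with one contact.
# 'Mom' is a placeholder; substitute any name from full_names.
fully_merged_messages_df[fully_merged_messages_df.full_name == 'Mom'].tail()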

Show a heatmap of how many texts you've exchanged


In [ ]:
def plot_year_month_heatmap(df, trim_incomplete=True, search_term=None, figsize=(18, 10)):
    """Plots a heatmap of the dataframe grouped by year and month.
    
    Args:
        df: The dataframe, must contain a column named `date`.
        trim_incomplete: If true, don't plot rows that lack 12 full months of data.  Default True.
        search_term: A case insensitive term to require in all rows of the dataframe's `text`
            column.  Default None.
        figsize: The size of the plot as a tuple.  Default (18, 10).
    
    """
    if search_term:
        df = df[df['text'].str.contains(search_term, case=False)]
    month_year_messages = pd.DataFrame(df['date'])
    month_year_messages['year'] = month_year_messages.date.apply(lambda d: d.year)
    month_year_messages['month'] = month_year_messages.date.apply(lambda d: d.month)
    month_year_messages = month_year_messages.drop('date', axis=1)

    month_year_messages_pivot = month_year_messages.pivot_table(index='year',
                                                                columns='month',
                                                                aggfunc=len, dropna=True)
    if trim_incomplete:
        month_year_messages_pivot = month_year_messages_pivot[month_year_messages_pivot.count(axis=1) == 12]
    if month_year_messages_pivot.shape[0] == 0:
        print('After trimming rows that didn\'t have 12 months, no rows remained, bailing out.')
        return

    f, ax = plt.subplots(figsize=figsize)
    seaborn.heatmap(month_year_messages_pivot, annot=True, fmt=".0f", square=True, cmap="YlGnBu", ax=ax)

# Plot all text messages exchanged over the years.
plot_year_month_heatmap(fully_merged_messages_df)
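
The search_term parameter restricts the heatmap to texts containing a phrase, which shows when a topic trended. For example ('birthday' here is just a placeholder term):


In [ ]:
# Heatmap of only the texts that mention "birthday" (case insensitive).
plot_year_month_heatmap(fully_merged_messages_df, search_term='birthday')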

Table and graph of who you text the most


In [ ]:
# Helper method to better support py2 and py3.

def convert_unicode_to_str_if_needed(unicode_or_str):
    if type(unicode_or_str).__name__ == 'unicode':
        return unicode_or_str.encode('utf-8')
    return unicode_or_str

In [ ]:
# Note "Unknown" means the number was not found in your address book.

def get_message_counts(dataframe):
    return pd.Series({'Texts sent': dataframe[dataframe.is_from_me == 1].shape[0],
                      'Texts received': dataframe[dataframe.is_from_me == 0].shape[0],
                      'Texts exchanged': dataframe.shape[0]})
messages_grouped = fully_merged_messages_df.groupby('full_name').apply(get_message_counts)
messages_grouped = messages_grouped.sort_values(by='Texts exchanged', ascending=False)

widgets.interact(messages_grouped.head,
                 n=widgets.IntSlider(min=5, max=50, step=1, value=5, continuous_update=False,
                                     description='Number of people to show:'))

In [ ]:
# Helper method so we can wrap it with interact().
def _plot_most_common_text(top_n=10):
    messages_grouped.head(top_n).plot(figsize=(20,10), kind='bar')
   
widgets.interact(_plot_most_common_text,
                 top_n=widgets.IntSlider(min=5, max=100, step=1, value=5, continuous_update=False,
                                         description='Number of people to show:'))

Streamgraph

Dump the necessary data to JS


In [ ]:
# Restrict to the top N people you text the most so the streamgraph is legible.
TOP_N = 10  # Freely change this value.

sliced_df = fully_merged_messages_df[fully_merged_messages_df.full_name.isin(messages_grouped.head(TOP_N).index)]
grouped_by_month = sliced_df.groupby([
    sliced_df.date.apply(lambda d: d.strftime('%Y/%m')),
    'full_name']
)['text'].count().to_frame()

grouped_by_month = grouped_by_month.sort_index()
# We create a dense dataframe with every year/month combination so that even if a person
# didn't text in a given year/month, we have a 0 and the streamgraph can properly graph the value.
grouped_by_month_dense = grouped_by_month.unstack().fillna(0).stack()

# Dump the dataframe to a global JS variable so we can access it in our JS code.
# TODO(mdezube): Dump out as JSON instead.
formatted_for_steamgraph = grouped_by_month_dense.reset_index(level=1)
formatted_for_steamgraph.index.name = 'date'
formatted_for_steamgraph.columns = ['key', 'value']
Javascript("window.csvAsString='{}'".format(formatted_for_steamgraph.to_csv(index_label='date').replace('\n', '\\n')))

Draw the graph!


In [ ]:
%%javascript
// Draw the streamgraph using d3.

element.append('<div class="chart" style="height:600px; width:100%"></div>')
element.append('<style>.axis path, .axis line' + 
               '{fill: none; stroke: #000;stroke-width: 2px; shape-rendering: crispEdges;}' + 
               '</style>')

element.append("<script src='d3.min.js'></script>")
element.append("<script src='colorbrewer.min.js'></script>")
element.append("<script src='steamgraph.js'></script>")

// Choose your favorite from https://bl.ocks.org/mbostock/5577023
var colorBrewerPalette = "Spectral";

// Set a timeout to let the JS scripts actually load into memory, this is a bit of a hack but works reliably.
setTimeout(function(){createSteamgraph(csvAsString, colorBrewerPalette)}, 200);

Wordcloud

Define the helper method


In [ ]:
def generate_cloud(texts, max_words=30):
    # Add more words here if you want to ignore them:
    my_stopwords = STOPWORDS.copy()
    my_stopwords.update(['go', 'ya', 'come', 'back', 'good', 'sound'])
    words = ' '.join(texts).lower()
    wordcloud = WordCloud(font_path='CabinSketch-Bold.ttf',
                          stopwords=my_stopwords,
                          background_color='black',
                          width=800,
                          height=600,
                          relative_scaling=1,
                          max_words=max_words
                         ).generate_from_text(words)
    print('Based on {0:,} texts'.format(len(texts)))
    
    fig, ax = plt.subplots(figsize=(15,10))
    ax.imshow(wordcloud)
    ax.axis('off')
    plt.show()
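
If you'd rather skip the widgets, generate_cloud can also be called directly. A minimal sketch over every text in the dataframe (slice it down if this is slow on your data):


In [ ]:
# Word cloud over all messages, sent and received.
generate_cloud(fully_merged_messages_df.text, max_words=30)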

Texts you've sent


In [ ]:
# Word cloud of the top words you use, based on your most recent 30,000 sent messages.

texts_from_me = fully_merged_messages_df[fully_merged_messages_df.is_from_me == 1].text[-30000:]
widgets.interact(
    generate_cloud,
    texts=widgets.fixed(texts_from_me),
    max_words=widgets.IntSlider(min=5, max=50, step=1, value=10, continuous_update=False,
                                description='Max words to show:'))

Texts to/from a specific contact


In [ ]:
def _word_cloud_specific_contact(max_words, from_me, contact):
    contact = convert_unicode_to_str_if_needed(contact)
    if contact not in full_names:
        print('{} not found'.format(contact))
        return
    sliced_df = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact) &
                                         (fully_merged_messages_df.is_from_me == from_me)].text
    generate_cloud(sliced_df, max_words)

widgets.interact(
    _word_cloud_specific_contact,
    max_words=widgets.IntSlider(min=5, max=50, step=1, value=10,
                                continuous_update=False, description='Max words to show:'),
    from_me=widgets.RadioButtons(
        options={'Show messages FROM me': True, 'Show messages TO me': False}, description=' '),
    contact=widgets.Text(value='Mom', description='Contact name:')
)

Diving deeper into the actual text

Visualize a word tree of texts exchanged with a specific contact


In [ ]:
# Note this requires an internet connection to load Google's JS library.
def get_json_for_word_tree(contact):
    df = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact)]
    print('Exchanged {0:,} texts with {1}'.format(df.shape[0], contact))
    
    array_for_json = [[text] for text in df.text]
    array_for_json.insert(0, ['Phrases'])  # Header row expected by the word tree.
    return json.dumps(array_for_json)
    
CONTACT_NAME = 'Mom'
ROOT_WORD = 'feel'
HTML(wordtree.get_word_tree_html(get_json_for_word_tree(CONTACT_NAME),
                                 ROOT_WORD.lower(),
                                 lowercase=True,
                                 tree_type='double'))

Preprocessing and data munging for TFIDF


In [ ]:
punctuation = copy.copy(string.punctuation)
punctuation += u'“”‘’\ufffc\uff0c'  # Include some UTF-8 punctuation that occurred.
punct_regex = re.compile(u'[{0}]'.format(punctuation))
spaces_regex = re.compile(r'\s{2,}')
numbers_regex = re.compile(r'\d+')

def clean_text(input_str):
    processed = input_str.lower()
    processed = punct_regex.sub('', processed)
    # Also try: processed = numbers_regex.sub('_NUMBER_', processed)
    processed = numbers_regex.sub('', processed)
    processed = spaces_regex.sub(' ', processed)
    
    return processed

# The stock stopwords list contains raw words like "i'll", so we run them through clean_text to match our processed text.
processed_stopwords = [clean_text(word) for word in STOPWORDS]
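
As a quick sanity check, here is what clean_text does to a sample message: punctuation and numbers disappear, and runs of spaces collapse to one.


In [ ]:
# Prints 'hello ill be there at see you ' (note the collapsed spaces).
print(repr(clean_text(u"Hello!! I'll be there at 10:30... see you :)")))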

In [ ]:
# Group the texts by person and collapse them into a single string per person.

grouped_by_name = fully_merged_messages_df[fully_merged_messages_df.is_from_me == 0].groupby(
    'full_name')['text'].apply(lambda x: ' '.join(x)).to_frame()
grouped_by_name.info(memory_usage='deep')
grouped_by_name.head(1)

Create TFIDF matrix for all contacts

Note the methods below focus on texts received from these contacts, not texts you've sent to them.
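
As a toy illustration of what the TFIDF scores below capture: a word scores high for a contact when they use it often and few other contacts do. This sketch uses illustrative data only and the same sklearn calls as the next cell:


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

toy_texts = ['ski trip this weekend',      # Contact A
             'ski boots and ski poles',    # Contact B
             'dinner plans this weekend']  # Contact C
toy_vectorizer = TfidfVectorizer()
toy_scores = toy_vectorizer.fit_transform(toy_texts).toarray()
# 'ski' gets the highest weight in row B: frequent there, rare elsewhere.
pd.DataFrame(toy_scores, columns=toy_vectorizer.get_feature_names(), index=list('ABC'))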


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import numpy as np

vectorizer = TfidfVectorizer(preprocessor=clean_text,
                             tokenizer=tokenize.WordPunctTokenizer().tokenize,
                             stop_words=processed_stopwords,
                             ngram_range=(1, 2), max_df=.9, max_features=50000)
tfidf_transformed_dataset = vectorizer.fit_transform(grouped_by_name.text)
word_list = pd.Series(vectorizer.get_feature_names())

print('TFIDF sparse matrix is {0:.2f}MB'.format(tfidf_transformed_dataset.data.nbytes / 1024 / 1024))
print('TFIDF matrix has shape: {0}'.format(tfidf_transformed_dataset.shape))

Helper methods to leverage the TFIDF matrix


In [ ]:
def get_word_summary_for_contact(contact, top_n=25):
    contact = convert_unicode_to_str_if_needed(contact)
    tfidf_record = _get_tfidf_record_for_contact(contact)
    if tfidf_record is None:
        print('"{0}" was not found.'.format(contact))
        return
    sorted_indices = tfidf_record.argsort()[::-1]
    return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)

def get_word_summary_for_diffs(contact, other_contact, top_n=25):
    contact = convert_unicode_to_str_if_needed(contact)
    other_contact = convert_unicode_to_str_if_needed(other_contact)
    
    tfidf_record_contact = _get_tfidf_record_for_contact(contact)
    tfidf_record_other_contact = _get_tfidf_record_for_contact(other_contact)
    
    if tfidf_record_contact is None or tfidf_record_other_contact is None:
        # Print out the first contact not found.
        contact_not_found = contact if tfidf_record_contact is None else other_contact
        print('"{0}" was not found.'.format(contact_not_found))
        return
    sorted_indices = (tfidf_record_contact - tfidf_record_other_contact).argsort()[::-1]
    return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)

# Returns the row in the TFIDF matrix for a given contact by name.
def _get_tfidf_record_for_contact(contact):
    if contact not in grouped_by_name.index:
        return None
    row = np.argmax(grouped_by_name.index == contact)
    return tfidf_transformed_dataset.getrow(row).toarray().squeeze()
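
These helpers also work outside of interact(). For example, assuming 'Mom' appears in your contacts:


In [ ]:
# The 10 words that most identify texts received from this contact.
get_word_summary_for_contact('Mom', top_n=10)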

Words that identify a specific contact


In [ ]:
widgets.interact(
    get_word_summary_for_contact,
    contact=widgets.Text(value='Mom', description='Contact name:', placeholder='Enter name'),
    top_n=widgets.IntSlider(min=10, max=100, step=1, value=25, description='Max words to show:')
)

Words that identify the difference between two contacts


In [ ]:
widgets.interact(
    get_word_summary_for_diffs,
    contact=widgets.Text(description='1st Contact:', placeholder='Enter 1st name'),
    other_contact=widgets.Text(description='2nd Contact:', placeholder='Enter 2nd name'),
    top_n=widgets.IntSlider(description='Max words to show:', min=10, max=100, step=1, value=25)
)

Looking at language progression over the years

Helper methods for looking at TFIDF by year


In [ ]:
def top_words_by_year_from_tfidf(tfidf_by_year, years_as_list, top_n=15):
    """
    Returns a dataframe of the top words for each year by their TFIDF score.
    
    To determine the "top", we look at one year's TFIDF - avg(other years' TFIDFs)
    
    Args:
        tfidf_by_year: TFIDF matrix with as many rows as entries in years_as_list
        years_as_list: Years that are represented in the TFIDF matrix
        top_n: Number of top words per year to include in the result
    """
    # Densify the tfidf matrix so we can operate on it.
    tfidf_by_year_dense = tfidf_by_year.toarray()
    df_by_year = []
    for i in range(tfidf_by_year_dense.shape[0]):
        this_year = years_as_list[i]
        tfidf_this_year = tfidf_by_year_dense[i]
        tfidf_other_years = np.delete(tfidf_by_year_dense, i, axis=0).mean(axis=0)
        
        sorted_indices = (tfidf_this_year - tfidf_other_years).argsort()[::-1]
        df = pd.DataFrame({this_year: word_list.iloc[sorted_indices[:top_n]]})
        df = df.reset_index(drop=True)
        df_by_year.append(df)
    return pd.concat(df_by_year, axis=1)

def top_words_by_year_from_df(slice_of_texts_df, top_n=15, min_texts_required=100):
    """Returns a dataframe of the top words for each year by their TFIDF score.

    Top is determined by the `top_words_by_year_from_tfidf` method.

    Args:
        slice_of_texts_df: A dataframe with the text messages to process
        top_n: Number of top words per year to include in the result
        min_texts_required: Minimum number of texts a year must have to be kept in the result
    """
    grouped_by_year_tfidf, years = _tfidf_by_year(slice_of_texts_df, min_texts_required)
    if grouped_by_year_tfidf is None:
        return
    return top_words_by_year_from_tfidf(grouped_by_year_tfidf, years, top_n)

def _tfidf_by_year(slice_of_texts_df, min_texts_required=100):
    """Returns a TFIDF matrix of the texts grouped by year.
    
    Years with fewer than `min_texts_required` texts will be dropped.
    """
    grouper = slice_of_texts_df.date.apply(lambda x: x.year)
    grouped_by_year = slice_of_texts_df.groupby(grouper).apply(
        lambda group: pd.Series({'count': len(group.date), 'text': ' '.join(group.text)})
    )

    # Drop years with fewer than min_texts_required texts since they won't be very meaningful.
    years_to_drop = grouped_by_year[grouped_by_year['count'] < min_texts_required].index
    if len(years_to_drop):
        print('Dropping year(s): {0}, each had fewer than {1} texts.'.format(
            ', '.join(str(year) for year in years_to_drop), min_texts_required))
    grouped_by_year = grouped_by_year[grouped_by_year['count'] >= min_texts_required]
    grouped_by_year.index.name = 'year'

    if grouped_by_year.shape[0] == 0:
        print('Bailing out, no years found with at least {0} texts.'.format(min_texts_required))
        return None, None

    grouped_by_year_tfidf = vectorizer.transform(grouped_by_year['text'])
    print('Found {0} years with at least {1} texts each.'.format(grouped_by_year_tfidf.shape[0],
                                                                 min_texts_required))
    return grouped_by_year_tfidf, grouped_by_year.index

My top words over the years

This offers an interesting insight into the main topics over the years.


In [ ]:
top_words_by_year_from_df(fully_merged_messages_df[fully_merged_messages_df.is_from_me == 1],
                          top_n=15)

Top words over the years from/to a specific contact

This offers an interesting insight into the main topics over the years.


In [ ]:
# Wrapper method so we can use interact().
def _top_words_by_year_for_contact(contact, from_me, top_n):
    contact = convert_unicode_to_str_if_needed(contact)
    if contact not in full_names:
        print('"{0}" not found'.format(contact))
        return
    # Slice to texts from/to the contact.
    df = fully_merged_messages_df[(fully_merged_messages_df.is_from_me == from_me) &
                                  (fully_merged_messages_df.full_name == contact)]
    return top_words_by_year_from_df(df, top_n)

widgets.interact(
    _top_words_by_year_for_contact,
    contact=widgets.Text(value='Mom', description='Contact name:', placeholder='Enter name'),
    from_me=widgets.RadioButtons(
        options={'Show messages FROM me': True, 'Show messages TO me': False}, description=' '),
    top_n=widgets.IntSlider(min=15, max=100, step=1, value=15, description='Max words to show:')
)

In [ ]:
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

def _top_words_by_cluster_from_tfidf(
    cluster_id,
    tfidf_per_sender,
    cluster_for_tfidf_index,
    top_n=15,
):
    """
    Returns the indices of the top words for the given cluster by their TFIDF score.
    
    To determine the "top", we look at one cluster's TFIDF - avg(other clusters' TFIDFs)
    
    Args:
        cluster_id: The cluster we want to find the top words for (referred to as "given cluster")
        tfidf_per_sender: TFIDF matrix with as many rows as entries in cluster_for_tfidf_index
        cluster_for_tfidf_index: Cluster assignment for each entry in tfidf_per_sender
        top_n: Number of top words per cluster to include in the result
    """
    # First, we separate the given cluster we want to consider from all other entries.
    this_cluster_records = tfidf_per_sender[cluster_for_tfidf_index == cluster_id]
    other_cluster_records = tfidf_per_sender[cluster_for_tfidf_index != cluster_id]
    
    # Next, we calculate the mean for each: the given cluster and the rest of the corpus
    mean_this_cluster = np.asarray(this_cluster_records.mean(axis=0)).squeeze()
    mean_other_cluster = np.asarray(other_cluster_records.mean(axis=0)).squeeze()
    
    # Finally, we identify the words for which the given cluster shows the biggest difference.
    difference = mean_this_cluster - mean_other_cluster
    most_different_indices = difference.argsort()
    # Only display top_n.
    return most_different_indices[::-1][:top_n]

def _tfidf_by_sender(messages_df, min_texts_required=100):
    """Returns a TFIDF matrix of the texts grouped by sender.
    
    Message exchanges with fewer than `min_texts_required` texts will be dropped.
    """
    # First we group messages by name, then we merge each conversation into one string.
    grouped_by_name = messages_df.groupby("full_name").apply(
        lambda group: pd.Series({'count': len(group.full_name), 'text': ' '.join(group.text)})
    )

    # Drop all conversations that don't meet the requirements for minimum number of messages.
    grouped_by_name = grouped_by_name[grouped_by_name['count'] >= min_texts_required]
    grouped_by_name.index.name = 'full_name'

    # Bail out if we have no data.
    if grouped_by_name.shape[0] == 0:
        print('Bailing out, no conversations found with at least {0} texts.'.format(min_texts_required))
        return None, None

    grouped_by_name_tfidf = vectorizer.transform(grouped_by_name['text'])
    print('Found {0} conversations with at least {1} texts each.'.format(grouped_by_name_tfidf.shape[0],
                                                                         min_texts_required))
    return grouped_by_name_tfidf, grouped_by_name.index

# Get the TFIDF vector for each sender (texts received from them) and the list of their names.
tfidf_per_sender, names_sender = _tfidf_by_sender(fully_merged_messages_df[fully_merged_messages_df.is_from_me == 0])

# First, we reduce the dimensionality of the dataset. Clustering in this reduced space keeps
# the clusters KMeans finds closer to what the 2D plot of the clusters below shows.
tfidf_sender_reduced_dim = TruncatedSVD(n_components=7).fit_transform(tfidf_per_sender)

# Let's run KMeans clustering on the data.
NUMBER_OF_CLUSTERS = 7
kmeans_tfidf_sender = KMeans(n_clusters=NUMBER_OF_CLUSTERS)
tfidf_per_sender_cluster_assignment = kmeans_tfidf_sender.fit_predict(tfidf_sender_reduced_dim)

# We further reduce the dimensionality of the data, so that we can graph it.
tfidf_per_sender_2d = TruncatedSVD(n_components=2).fit_transform(tfidf_sender_reduced_dim)
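
NUMBER_OF_CLUSTERS = 7 above is a judgment call. A common way to pick k is the elbow method: plot KMeans inertia for a range of k and look for the bend. A minimal sketch over the same reduced matrix:


In [ ]:
# Elbow method: inertia (within-cluster sum of squares) for k = 2..12.
inertias = []
k_values = list(range(2, 13))
for k in k_values:
    inertias.append(KMeans(n_clusters=k).fit(tfidf_sender_reduced_dim).inertia_)
plt.figure(figsize=(8, 4))
plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()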

In [ ]:
clustered_tfidf_by_sender_df = pd.DataFrame({
    "x": tfidf_per_sender_2d[:,0],
    "y": tfidf_per_sender_2d[:,1],
    "name": names_sender,
    "group": ["Cluster: " + str(e) for e in tfidf_per_sender_cluster_assignment],
})
clustered_tfidf_by_sender_df.head()

In [ ]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

py.init_notebook_mode(connected=True)

clusters = clustered_tfidf_by_sender_df.group.unique()

def plot_data(cluster_selection):
    traces = []
    top_words = None
    if cluster_selection == "All":
        clusters_to_plot = clusters
    else:
        clusters_to_plot = [cluster_selection]
        top_words_indexes = _top_words_by_cluster_from_tfidf(
            int(cluster_selection.split(': ')[-1]),  # Parse the id out of "Cluster: N".
            tfidf_per_sender,
            tfidf_per_sender_cluster_assignment
        )[0:10]
        top_words = word_list.iloc[top_words_indexes].to_frame()
        top_words.columns = ['Top Words In Cluster']
        top_words = top_words.reset_index(drop=True)
    for cluster in clusters_to_plot:
        cluster_data = clustered_tfidf_by_sender_df[clustered_tfidf_by_sender_df.group == cluster]
        scatter = go.Scatter(
            x=cluster_data["x"],
            y=cluster_data["y"],
            text=cluster_data["name"],
            mode='markers',
            name=cluster
        )
        traces.append(scatter)
    py.iplot(traces)
    return top_words

cluster_selection = widgets.Dropdown(
    options=["All"] + list(clusters),
    value="All",
    description="Cluster: "
)
print('We\'ve clustered your contacts by their word usage. Hover over the dots to see which '
      'cluster each person is in, and adjust the dropdown to restrict to a single cluster.\n'
      'Dots that are closer together indicate people who text similarly.')
widgets.interact(
    plot_data,
    cluster_selection=cluster_selection,
)
display(cluster_selection)