See the README for an explanation of how this code runs and functions.
Contact michaeldezube at gmail dot com with questions.
In [ ]:
from __future__ import print_function
from __future__ import division
import copy
import json
import re
import string
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn # To improve the chart styling.
import wordtree
from IPython.display import display
from IPython.display import HTML
from IPython.display import Javascript
from wordcloud import STOPWORDS
from wordcloud import WordCloud
import ipywidgets as widgets
import iphone_connector
In [ ]:
%matplotlib inline
matplotlib.style.use('ggplot')
pd.set_option('display.max_colwidth', 1000)
iphone_connector.initialize()
fully_merged_messages_df, address_book_df = iphone_connector.get_cleaned_fully_merged_messages()
full_names = set(address_book_df.full_name) # Handy set to check for misspellings later on.
fully_merged_messages_df.full_name.replace('nan nan nan', 'Unknown', inplace=True)
WORDS_PER_PAGE = 450 # Based upon http://wordstopages.com/
print('\nTotal pages if all texts were printed: {0:,d} (Arial size 12, single spaced)\n'.format(
sum(fully_merged_messages_df.text.apply(lambda x: len(x.split())))//WORDS_PER_PAGE))
In [ ]:
fully_merged_messages_df = fully_merged_messages_df.reset_index(drop=True)
fully_merged_messages_df
In [ ]:
address_book_df
Use fully_merged_messages_df and address_book_df for analysis; they contain all messages (with a column identifying the sender) and all of your contacts, respectively.
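For example, here is a quick ad-hoc slice of the messages dataframe (a minimal sketch; it assumes the cells above have run, and 'Mom' is purely a placeholder contact name):
In [ ]:
# Example ad-hoc query: the five most recent messages exchanged with one contact.
# Replace 'Mom' with any full_name that appears in address_book_df.
example_contact = 'Mom'
recent_with_contact = fully_merged_messages_df[fully_merged_messages_df.full_name == example_contact]
recent_with_contact.sort_values('date', ascending=False).head(5)[['date', 'is_from_me', 'text']]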
In [ ]:
def plot_year_month_heatmap(df, trim_incomplete=True, search_term=None, figsize=(18, 10)):
"""Plots a heatmap of the dataframe grouped by year and month.
Args:
df: The dataframe, must contain a column named `date`.
trim_incomplete: If true, don't plot rows that lack 12 full months of data. Default True.
search_term: A case insensitive term to require in all rows of the dataframe's `text`
column. Default None.
figsize: The size of the plot as a tuple. Default (18, 10);
"""
if search_term:
df = df[df['text'].str.contains(search_term, case=False)]
month_year_messages = pd.DataFrame(df['date'])
month_year_messages['year'] = month_year_messages.apply(lambda row: row.date.year, axis=1)
month_year_messages['month'] = month_year_messages.apply(lambda row: row.date.month, axis=1)
    month_year_messages_pivot = month_year_messages.pivot_table(index='year',
                                                                 columns='month',
                                                                 values='date',
                                                                 aggfunc='count')
if trim_incomplete:
month_year_messages_pivot = month_year_messages_pivot[month_year_messages_pivot.count(axis=1) == 12]
if month_year_messages_pivot.shape[0] == 0:
print('After trimming rows that didn\'t have 12 months, no rows remained, bailing out.')
return
f, ax = plt.subplots(figsize=figsize)
seaborn.heatmap(month_year_messages_pivot, annot=True, fmt=".0f", square=True, cmap="YlGnBu", ax=ax)
# Plot all text messages exchanged over the years.
plot_year_month_heatmap(fully_merged_messages_df)
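The heatmap can also be restricted to messages containing a given term; for example (the search term below is arbitrary, swap in any word you're curious about):
In [ ]:
# Same heatmap, but only counting messages that mention a specific (case-insensitive) term.
# trim_incomplete=False keeps partial years, since a rarer term may not appear in every month.
plot_year_month_heatmap(fully_merged_messages_df, trim_incomplete=False, search_term='dinner')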
In [ ]:
# Helper method to better support py2 and py3.
def convert_unicode_to_str_if_needed(unicode_or_str):
if type(unicode_or_str).__name__ == 'unicode':
return unicode_or_str.encode('utf-8')
return unicode_or_str
In [ ]:
# Note "Unknown" means the number was not found in your address book.
def get_message_counts(dataframe):
return pd.Series({'Texts sent': dataframe[dataframe.is_from_me == 1].shape[0],
'Texts received': dataframe[dataframe.is_from_me == 0].shape[0],
'Texts exchanged': dataframe.shape[0]})
messages_grouped = fully_merged_messages_df.groupby('full_name').apply(get_message_counts)
messages_grouped = messages_grouped.sort_values(by='Texts exchanged', ascending=False)
widgets.interact(messages_grouped.head,
n=widgets.IntSlider(min=5, max=50, step=1, value=5, continuous_update=False,
description='Number of people to show:'))
In [ ]:
# Helper method so we can wrap it with interact().
def _plot_most_common_text(top_n=10):
messages_grouped.head(top_n).plot(figsize=(20,10), kind='bar')
widgets.interact(_plot_most_common_text,
top_n=widgets.IntSlider(min=5, max=100, step=1, value=5, continuous_update=False,
description='Number of people to show:'))
In [ ]:
# Restrict to the top N people you text the most so the streamgraph is legible.
TOP_N = 10 # Freely change this value.
sliced_df = fully_merged_messages_df[fully_merged_messages_df.full_name.isin(messages_grouped.head(TOP_N).index)]
grouped_by_month = sliced_df.groupby([
sliced_df.apply(lambda x: x.date.strftime('%Y/%m'), axis=1),
'full_name']
)['text'].count().to_frame()
grouped_by_month = grouped_by_month.sort_index()
# We create a dense dataframe with a row for every year/month combination so that even if a person
# didn't text in a given year/month we have a 0, and the streamgraph can properly graph the value.
grouped_by_month_dense = grouped_by_month.unstack().fillna(0).stack()
# Dump the dataframe to a global JS variable so we can access it in our JS code.
# TODO(mdezube): Dump out as JSON instead.
formatted_for_steamgraph = grouped_by_month_dense.reset_index(level=1)
formatted_for_steamgraph.index.name = 'date'
formatted_for_steamgraph.columns = ['key', 'value']
Javascript("window.csvAsString='{}'".format(formatted_for_steamgraph.to_csv(index_label='date').replace('\n', '\\n')))
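If you want to sanity-check the data being handed off to the JavaScript below, you can preview the first few lines of the CSV string (purely an optional inspection step):
In [ ]:
# Peek at the date/key/value rows the streamgraph JS will consume.
print(formatted_for_steamgraph.to_csv(index_label='date')[:300])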
In [ ]:
%%javascript
// Draw the streamgraph using d3.
element.append('<div class="chart" style="height:600px; width:100%"></div>')
element.append('<style>.axis path, .axis line' +
'{fill: none; stroke: #000;stroke-width: 2px; shape-rendering: crispEdges;}' +
'</style>')
element.append("<script src='d3.min.js'></script>")
element.append("<script src='colorbrewer.min.js'></script>")
element.append("<script src='steamgraph.js'></script>")
// Choose your favorite from https://bl.ocks.org/mbostock/5577023
var colorBrewerPalette = "Spectral";
// Set a timeout to let the JS scripts actually load into memory, this is a bit of a hack but works reliably.
setTimeout(function(){createSteamgraph(csvAsString, colorBrewerPalette)}, 200);
In [ ]:
def generate_cloud(texts, max_words=30):
# Add more words here if you want to ignore them:
my_stopwords = STOPWORDS.copy()
my_stopwords.update(['go', 'ya', 'come', 'back', 'good', 'sound'])
words = ' '.join(texts).lower()
wordcloud = WordCloud(font_path='CabinSketch-Bold.ttf',
stopwords=my_stopwords,
background_color='black',
width=800,
height=600,
relative_scaling=1,
max_words=max_words
).generate_from_text(words)
print('Based on {0:,} texts'.format(len(texts)))
fig, ax = plt.subplots(figsize=(15,10))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()
In [ ]:
# Word cloud of the top words I use, based on my most recent 30,000 sent messages.
texts_from_me = fully_merged_messages_df[fully_merged_messages_df.is_from_me == 1].text[-30000:]
widgets.interact(
generate_cloud,
texts=widgets.fixed(texts_from_me),
max_words=widgets.IntSlider(min=5,max=50,step=1,value=10, continuous_update=False,
description='Max words to show:'))
In [ ]:
def _word_cloud_specific_contact(max_words, from_me, contact):
contact = convert_unicode_to_str_if_needed(contact)
if contact not in full_names:
print('{} not found'.format(contact))
return
sliced_df = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact) &
(fully_merged_messages_df.is_from_me == from_me)].text
generate_cloud(sliced_df, max_words)
widgets.interact(
_word_cloud_specific_contact,
max_words=widgets.IntSlider(min=5, max=50, step=1, value=10,
continuous_update=False, description='Max words to show:'),
from_me=widgets.RadioButtons(
options={'Show messages FROM me': True, 'Show messages TO me': False}, description=' '),
contact=widgets.Text(value='Mom', description='Contact name:')
)
In [ ]:
# Note this requires an internet connection to load Google's JS library.
def get_json_for_word_tree(contact):
df = fully_merged_messages_df[(fully_merged_messages_df.full_name == contact)]
print('Exchanged {0:,} texts with {1}'.format(df.shape[0], contact))
array_for_json = [[text[1]] for text in df.text.iteritems()]
array_for_json.insert(0, [['Phrases']])
return json.dumps(array_for_json)
CONTACT_NAME = 'Mom'
ROOT_WORD = 'feel'
HTML(wordtree.get_word_tree_html(get_json_for_word_tree(CONTACT_NAME),
ROOT_WORD.lower(),
lowercase=True,
tree_type='double'))
In [ ]:
punctuation = copy.copy(string.punctuation)
punctuation += u'“”‘’\ufffc\uff0c' # Include some UTF-8 punctuation that occurred.
punct_regex = re.compile(u'[{0}]'.format(punctuation))
spaces_regex = re.compile(r'\s{2,}')
numbers_regex = re.compile(r'\d+')
def clean_text(input_str):
processed = input_str.lower()
processed = punct_regex.sub('', processed)
# Also try: processed = numbers_regex.sub('_NUMBER_', processed)
processed = numbers_regex.sub('', processed)
processed = spaces_regex.sub(' ', processed)
return processed
# The stock STOPWORDS list contains entries like "i'll"; run them through clean_text so they
# match the cleaned message tokens (e.g. "ill").
processed_stopwords = [clean_text(word) for word in STOPWORDS]
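As a quick sanity check, here is clean_text applied to a made-up message (the example string is just an illustration):
In [ ]:
# clean_text lowercases, strips punctuation and digits, and collapses repeated whitespace.
print(clean_text(u"I'll call you at 10:30... OK?!"))  # Prints: ill call you at ok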
In [ ]:
# Group the texts by person and collapse them into a single string per person.
grouped_by_name = fully_merged_messages_df[fully_merged_messages_df.is_from_me == 0].groupby(
'full_name')['text'].apply(lambda x: ' '.join(x)).to_frame()
grouped_by_name.info(memory_usage='deep')
grouped_by_name.head(1)
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import numpy as np
vectorizer = TfidfVectorizer(preprocessor=clean_text,
tokenizer=tokenize.WordPunctTokenizer().tokenize,
stop_words=processed_stopwords,
ngram_range=(1, 2), max_df=.9, max_features=50000)
tfidf_transformed_dataset = vectorizer.fit_transform(grouped_by_name.text)
word_list = pd.Series(vectorizer.get_feature_names())
print('TFIDF sparse matrix is {0:.2f}MB'.format(tfidf_transformed_dataset.data.nbytes / 1024 / 1024))
print('TFIDF matrix has shape: {0}'.format(tfidf_transformed_dataset.shape))
In [ ]:
def get_word_summary_for_contact(contact, top_n=25):
contact = convert_unicode_to_str_if_needed(contact)
tfidf_record = _get_tfidf_record_for_contact(contact)
if tfidf_record is None:
print('"{0}" was not found.'.format(contact))
return
sorted_indices = tfidf_record.argsort()[::-1]
return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)
def get_word_summary_for_diffs(contact, other_contact, top_n=25):
contact = convert_unicode_to_str_if_needed(contact)
other_contact = convert_unicode_to_str_if_needed(other_contact)
tfidf_record_contact = _get_tfidf_record_for_contact(contact)
tfidf_record_other_contact = _get_tfidf_record_for_contact(other_contact)
if tfidf_record_contact is None or tfidf_record_other_contact is None:
# Print out the first contact not found.
contact_not_found = contact if tfidf_record_contact is None else other_contact
print('"{0}" was not found.'.format(contact_not_found))
return
sorted_indices = (tfidf_record_contact - tfidf_record_other_contact).argsort()[::-1]
return pd.DataFrame({'Word': word_list.iloc[sorted_indices[:top_n]]}).reset_index(drop=True)
# Returns the row in the TFIDF matrix for a given contact by name.
def _get_tfidf_record_for_contact(contact):
if contact not in grouped_by_name.index:
return None
row = np.argmax(grouped_by_name.index == contact)
return tfidf_transformed_dataset.getrow(row).toarray().squeeze()
In [ ]:
widgets.interact(
get_word_summary_for_contact,
contact=widgets.Text(value='Mom', description='Contact name:', placeholder='Enter name'),
    top_n=widgets.IntSlider(min=10, max=100, step=1, value=25, description='Max words to show:')
)
In [ ]:
widgets.interact(
get_word_summary_for_diffs,
contact=widgets.Text(description='1st Contact:', placeholder='Enter 1st name'),
other_contact=widgets.Text(description='2nd Contact:', placeholder='Enter 2nd name'),
    top_n=widgets.IntSlider(description='Max words to show:', min=10, max=100, step=1, value=25)
)
In [ ]:
def top_words_by_year_from_tfidf(tfidf_by_year, years_as_list, top_n=15):
"""
Returns a dataframe of the top words for each year by their TFIDF score.
To determine the "top", we look at one year's TFIDF - avg(other years' TFIDFs)
Args:
tfidf_by_year: TFIDF matrix with as many rows as entries in years_as_list
years_as_list: Years that are represented in the TFIDF matrix
top_n: Number of top words per year to include in the result
"""
# Densify the tfidf matrix so we can operate on it.
tfidf_by_year_dense = tfidf_by_year.toarray()
df_by_year = []
for i in range(tfidf_by_year_dense.shape[0]):
this_year = years_as_list[i]
tfidf_this_year = tfidf_by_year_dense[i]
tfidf_other_years = np.delete(tfidf_by_year_dense, i, axis=0).mean(axis=0)
sorted_indices = (tfidf_this_year - tfidf_other_years).argsort()[::-1]
df = pd.DataFrame({this_year: word_list.iloc[sorted_indices[:top_n]]})
df = df.reset_index(drop=True)
df_by_year.append(df)
return pd.concat(df_by_year, axis=1)
def top_words_by_year_from_df(slice_of_texts_df, top_n=15, min_texts_required=100):
"""Returns a dataframe of the top words for each year by their TFIDF score.
Top is determined by the `top_words_by_year_from_tfidf` method.
Args:
slice_of_texts_df: A dataframe with the text messages to process
top_n: Number of top words per year to include in the result
min_texts_required: Number of texts to require in each year to not drop the record
"""
    grouped_by_year_tfidf, years = _tfidf_by_year(slice_of_texts_df, min_texts_required)
    if grouped_by_year_tfidf is None:
        return
    return top_words_by_year_from_tfidf(grouped_by_year_tfidf, years, top_n)
def _tfidf_by_year(slice_of_texts_df, min_texts_required=100):
"""Returns a TFIDF matrix of the texts grouped by year.
    Years with fewer than `min_texts_required` texts will be dropped.
"""
grouper = slice_of_texts_df.date.apply(lambda x: x.year)
grouped_by_year = slice_of_texts_df.groupby(grouper).apply(
lambda row: pd.Series({'count': len(row.date), 'text': ' '.join(row.text)})
)
    # Drop years with fewer than min_texts_required texts since they won't be very meaningful.
    years_to_drop = grouped_by_year[grouped_by_year['count'] < min_texts_required].index
    if len(years_to_drop) > 0:
        print('Dropping year(s): {0}, each had fewer than {1} texts.'.format(
            ', '.join(str(year) for year in years_to_drop), min_texts_required))
    grouped_by_year = grouped_by_year[grouped_by_year['count'] >= min_texts_required]
grouped_by_year.index.name = 'year'
    if grouped_by_year.shape[0] == 0:
        print('Bailing out, no years found with at least {0} texts.'.format(min_texts_required))
        return None, None
grouped_by_year_tfidf = vectorizer.transform(grouped_by_year['text'])
    print('Found {0} years with at least {1} texts each.'.format(grouped_by_year_tfidf.shape[0],
                                                                  min_texts_required))
return grouped_by_year_tfidf, grouped_by_year.index
In [ ]:
top_words_by_year_from_df(fully_merged_messages_df[fully_merged_messages_df.is_from_me == 1],
top_n=15)
In [ ]:
# Wrapper method so we can use interact().
def _top_words_by_year_for_contact(contact, from_me, top_n):
contact = convert_unicode_to_str_if_needed(contact)
if contact not in full_names:
print('"{0}" not found'.format(contact))
return
# Slice to texts from/to the contact.
df = fully_merged_messages_df[(fully_merged_messages_df.is_from_me == from_me) &
(fully_merged_messages_df.full_name == contact)]
return top_words_by_year_from_df(df, top_n)
widgets.interact(
_top_words_by_year_for_contact,
contact=widgets.Text(value='Mom', description='Contact name:', placeholder='Enter name'),
from_me=widgets.RadioButtons(
options={'Show messages FROM me': True, 'Show messages TO me': False}, description=' '),
    top_n=widgets.IntSlider(min=15, max=100, step=1, value=15, description='Max words to show:')
)
In [ ]:
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
def _top_words_by_cluster_from_tfidf(
cluster_id,
tfidf_per_sender,
cluster_for_tfidf_index,
top_n=15,
):
"""
Returns a dataframe of the top words for each cluster by their TFIDF score.
To determine the "top", we look at one cluster's TFIDF - avg(other clusters' TFIDFs)
Args:
cluster_id: The cluster we want to find the top words for (referred to as "given cluster")
tfidf_per_sender: TFIDF matrix with as many rows as entries in cluster_for_tfidf_index
cluster_for_tfidf_index: Cluster assignment for each entry in tfidf_per_sender
top_n: Number of top words per cluster to include in the result
"""
# First, we separate the given cluster we want to consider from all other entries.
this_cluster_records = tfidf_per_sender[cluster_for_tfidf_index == cluster_id]
other_cluster_records = tfidf_per_sender[cluster_for_tfidf_index != cluster_id]
# Next, we calculate the mean for each: the given cluster and the rest of the corpus
mean_this_cluster = np.asarray(this_cluster_records.mean(axis=0)).squeeze()
mean_other_cluster = np.asarray(other_cluster_records.mean(axis=0)).squeeze()
# Finally, we identify the words for which the given cluster shows the biggest difference.
difference = mean_this_cluster - mean_other_cluster
    most_different_indices = difference.argsort()
    # Only return the top_n most different words.
    return most_different_indices[::-1][:top_n]
def _tfidf_by_sender(messages_df, min_texts_required=100):
"""Returns a TFIDF matrix of the texts grouped by sender.
    Message exchanges with fewer than `min_texts_required` texts will be dropped.
"""
# First we group messages by name, then we merge each conversation into one string.
grouped_by_name = messages_df.groupby("full_name").apply(
lambda row: pd.Series({'count': len(row.full_name), 'text': ' '.join(row.text)})
)
# Drop all conversations that don't meet the requirements for minimum number of messages.
grouped_by_name = grouped_by_name[grouped_by_name['count'] >= min_texts_required]
grouped_by_name.index.name = 'full_name'
    # Bail if we have no data.
    if grouped_by_name.shape[0] == 0:
        print('Bailing out, no conversations found with at least {0} texts.'.format(min_texts_required))
        return None, None
grouped_by_name_tfidf = vectorizer.transform(grouped_by_name['text'])
    print('Found {0} conversations with at least {1} texts each.'.format(grouped_by_name_tfidf.shape[0],
                                                                          min_texts_required))
return grouped_by_name_tfidf, grouped_by_name.index
# Get the TFIDF vector for each data point and the list of receivers.
tfidf_per_sender, names_sender = _tfidf_by_sender(fully_merged_messages_df[fully_merged_messages_df.is_from_me == 0])
# First, we reduce the dimensionality of the dataset. Clustering in this reduced space keeps the
# KMeans clusters more consistent with the 2D projection of the data we graph below.
tfidf_sender_reduced_dim = TruncatedSVD(n_components=7).fit_transform(tfidf_per_sender)
# Let's run KMeans clustering on the data.
NUMBER_OF_CLUSTERS = 7
kmeans_tfidf_sender = KMeans(n_clusters=NUMBER_OF_CLUSTERS)
tfidf_per_sender_cluster_assignment = kmeans_tfidf_sender.fit_predict(tfidf_sender_reduced_dim)
# We further reduce the dimensionality of the data, so that we can graph it.
tfidf_per_sender_2d = TruncatedSVD(n_components=2).fit_transform(tfidf_sender_reduced_dim)
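NUMBER_OF_CLUSTERS = 7 above is a judgment call. If you'd like a data-driven starting point, one option is to compare KMeans inertia over a range of cluster counts and look for an elbow; a minimal sketch reusing the reduced matrix from above:
In [ ]:
# Optional: inertia (within-cluster sum of squares) for several candidate cluster counts.
# Inertia always drops as clusters are added; look for the "elbow" where improvement levels off.
candidate_ks = range(2, 12)
inertias = [KMeans(n_clusters=k).fit(tfidf_sender_reduced_dim).inertia_ for k in candidate_ks]
pd.Series(inertias, index=list(candidate_ks)).plot(figsize=(10, 5), title='KMeans inertia by cluster count')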
In [ ]:
clustered_tfidf_by_sender_df = pd.DataFrame({
"x": tfidf_per_sender_2d[:,0],
"y": tfidf_per_sender_2d[:,1],
"name": names_sender,
"group": ["Cluster: " + str(e) for e in tfidf_per_sender_cluster_assignment],
})
clustered_tfidf_by_sender_df.head()
In [ ]:
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
clusters = clustered_tfidf_by_sender_df.group.unique()
def plot_data(cluster_selection):
traces = []
top_words = None
if cluster_selection == "All":
clusters_to_plot = clusters
else:
clusters_to_plot = [cluster_selection]
top_words_indexes = _top_words_by_cluster_from_tfidf(
int(cluster_selection[-1]),
tfidf_per_sender,
tfidf_per_sender_cluster_assignment
)[0:10]
top_words = word_list.iloc[top_words_indexes].to_frame()
top_words.columns = ['Top Words In Cluster']
top_words = top_words.reset_index(drop=True)
for cluster in clusters_to_plot:
cluster_data = clustered_tfidf_by_sender_df[clustered_tfidf_by_sender_df.group == cluster]
scatter = go.Scatter(
x=cluster_data["x"],
y=cluster_data["y"],
text=cluster_data["name"],
mode = 'markers',
name=cluster
)
traces.append(scatter)
py.iplot(traces)
return top_words
cluster_selection = widgets.Dropdown(
options=["All"] + list(clusters),
value="All",
description="Cluster: "
)
print('We\'ve clustered your contacts by their word usage. Hover over the dots to see which '
      'cluster each person is in, and adjust the dropdown to restrict the plot to a single cluster.\n'
      'Dots that are closer together indicate people who use similar language.')
widgets.interact(
plot_data,
cluster_selection=cluster_selection,
)
display(cluster_selection)