In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import os
import pickle
import re
import sys
import matplotlib as pl
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that steps drawing from it
# give the same numbers on every run.
# NOTE(review): `numpy` is not imported explicitly in this cell; the name
# appears to be injected by the `%pylab inline` magic in the first cell
# ("Populating the interactive namespace from numpy and matplotlib") --
# confirm before removing that magic.
numpy.random.seed(42)

In [3]:
# Root of the local course checkout. The trailing slash is significant:
# other cells build file paths from it by plain string concatenation.
dataPath = '/Users/omojumiller/mycode/MachineLearningNanoDegree/IntroToMachineLearning/'

# Make the course helper directories importable (order preserved).
sys.path.extend([dataPath + 'tools/', dataPath + 'final_project/'])

In [4]:
# Load the project dataset: a pickled dict that later cells index by person
# name (e.g. data_dict['METTS MARK']).
# Pickle streams are binary data, so the file is opened with "rb". Text mode
# ("r") can corrupt the stream on platforms that translate line endings and is
# rejected by pickle.load on Python 3.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that come from a trusted source.
with open(dataPath+'final_project/final_project_dataset.pkl', "rb") as data_file:
    data_dict = pickle.load(data_file)
  • get email of author
  • compare to list of known persons of interest
  • return boolean if author is person of interest
  • aggregate count over all emails to person

In [5]:
from __future__ import division

# Sanity check on one record: the share of this person's received messages
# that were sent by a POI. True division comes from the __future__ import
# at the top of this cell.
data_point = data_dict['METTS MARK']
frac = (data_point["from_poi_to_this_person"]
        / data_point["to_messages"])
print(frac)


0.0470879801735

In [6]:
def computeFraction( poi_messages, all_messages ):
    """ Return the fraction of a person's messages that involve a POI.

        poi_messages -- number of messages to/from a POI (numerator)
        all_messages -- number of all messages to/from the person (denominator)

        The same code serves both directions: the share of received messages
        that came from POIs, or the share of sent messages that went to POIs.

        Returns 0.0 when either count is the "NaN" placeholder (used when no
        email address, and so no email features, are known for the person) or
        when all_messages is zero. Otherwise returns poi_messages divided by
        all_messages as a float.
    """
    # float("NaN") is a valid conversion and would silently produce a real
    # NaN, so the placeholder has to be rejected for *both* arguments.
    if poi_messages == 'NaN' or all_messages == 'NaN':
        return 0.0

    total = float(all_messages)
    if total == 0:
        # No messages at all: the ratio is undefined. Report 0 instead of
        # raising ZeroDivisionError.
        return 0.0

    # Converting to float guards against Python 2 integer (floor) division.
    return float(poi_messages) / total

In [7]:
# For every person in the dataset, derive the two POI-interaction ratios,
# echo them, store them back on the person's record, and collect them in
# submit_dict.
submit_dict = {}
for name in data_dict:
    data_point = data_dict[name]

    # Share of the messages this person received that came from a POI.
    from_poi_to_this_person, to_messages = (data_point["from_poi_to_this_person"],
                                            data_point["to_messages"])
    fraction_from_poi = computeFraction(from_poi_to_this_person, to_messages)
    print('{:5}{:35}{:.2f}'.format('FROM ', name, fraction_from_poi))
    data_point["fraction_from_poi"] = fraction_from_poi

    # Share of the messages this person sent that went to a POI.
    from_this_person_to_poi, from_messages = (data_point["from_this_person_to_poi"],
                                              data_point["from_messages"])
    fraction_to_poi = computeFraction(from_this_person_to_poi, from_messages)
    print('{:5}{:35}{:.2f}'.format('TO: ', name, fraction_to_poi))

    # Note: the keys reuse the raw count feature names but hold the ratios.
    submit_dict[name] = {
        "from_poi_to_this_person": fraction_from_poi,
        "from_this_person_to_poi": fraction_to_poi,
    }
    data_point["fraction_to_poi"] = fraction_to_poi
    
    
#####################

def submitDict():
    """Return the module-level submit_dict built by the loop in this cell.

    The dict maps each person's name to a dict with the keys
    "from_poi_to_this_person" and "from_this_person_to_poi", whose values are
    the fractions produced by computeFraction. The same object is returned,
    not a copy.
    """
    return submit_dict


FROM METTS MARK                         0.05
TO:  METTS MARK                         0.03
FROM BAXTER JOHN C                      0.00
TO:  BAXTER JOHN C                      0.00
FROM ELLIOTT STEVEN                     0.00
TO:  ELLIOTT STEVEN                     0.00
FROM CORDES WILLIAM R                   0.01
TO:  CORDES WILLIAM R                   0.00
FROM HANNON KEVIN P                     0.03
TO:  HANNON KEVIN P                     0.66
FROM MORDAUNT KRISTINA M                0.00
TO:  MORDAUNT KRISTINA M                0.00
FROM MEYER ROCKFORD G                   0.00
TO:  MEYER ROCKFORD G                   0.00
FROM MCMAHON JEFFREY                    0.02
TO:  MCMAHON JEFFREY                    0.54
FROM HORTON STANLEY C                   0.02
TO:  HORTON STANLEY C                   0.01
FROM PIPER GREGORY F                    0.05
TO:  PIPER GREGORY F                    0.22
FROM HUMPHREY GENE E                    0.08
TO:  HUMPHREY GENE E                    1.00
FROM UMANOFF ADAM S                     0.11
TO:  UMANOFF ADAM S                     0.00
FROM BLACHMAN JEREMY M                  0.01
TO:  BLACHMAN JEREMY M                  0.14
FROM SUNDE MARTIN                       0.01
TO:  SUNDE MARTIN                       0.34
FROM GIBBS DANA R                       0.00
TO:  GIBBS DANA R                       0.00
FROM LOWRY CHARLES P                    0.00
TO:  LOWRY CHARLES P                    0.00
FROM COLWELL WESLEY                     0.14
TO:  COLWELL WESLEY                     0.28
FROM MULLER MARK S                      0.09
TO:  MULLER MARK S                      0.00
FROM JACKSON CHARLENE R                 0.10
TO:  JACKSON CHARLENE R                 0.34
FROM WESTFAHL RICHARD K                 0.00
TO:  WESTFAHL RICHARD K                 0.00
FROM WALTERS GARETH W                   0.00
TO:  WALTERS GARETH W                   0.00
FROM WALLS JR ROBERT H                  0.03
TO:  WALLS JR ROBERT H                  0.00
FROM KITCHEN LOUISE                     0.03
TO:  KITCHEN LOUISE                     0.11
FROM CHAN RONNIE                        0.00
TO:  CHAN RONNIE                        0.00
FROM BELFER ROBERT                      0.00
TO:  BELFER ROBERT                      0.00
FROM SHANKMAN JEFFREY A                 0.03
TO:  SHANKMAN JEFFREY A                 0.03
FROM WODRASKA JOHN                      0.00
TO:  WODRASKA JOHN                      0.00
FROM BERGSIEKER RICHARD P               0.01
TO:  BERGSIEKER RICHARD P               0.00
FROM URQUHART JOHN A                    0.00
TO:  URQUHART JOHN A                    0.00
FROM BIBI PHILIPPE A                    0.01
TO:  BIBI PHILIPPE A                    0.20
FROM RIEKER PAULA H                     0.03
TO:  RIEKER PAULA H                     0.59
FROM WHALEY DAVID A                     0.00
TO:  WHALEY DAVID A                     0.00
FROM BECK SALLY W                       0.02
TO:  BECK SALLY W                       0.09
FROM HAUG DAVID L                       0.01
TO:  HAUG DAVID L                       0.37
FROM ECHOLS JOHN B                      0.00
TO:  ECHOLS JOHN B                      0.00
FROM MENDELSOHN JOHN                    0.00
TO:  MENDELSOHN JOHN                    0.00
FROM HICKERSON GARY J                   0.03
TO:  HICKERSON GARY J                   0.04
FROM CLINE KENNETH W                    0.00
TO:  CLINE KENNETH W                    0.00
FROM LEWIS RICHARD                      0.01
TO:  LEWIS RICHARD                      0.00
FROM HAYES ROBERT E                     0.03
TO:  HAYES ROBERT E                     0.00
FROM MCCARTY DANNY J                    0.02
TO:  MCCARTY DANNY J                    0.01
FROM KOPPER MICHAEL J                   0.00
TO:  KOPPER MICHAEL J                   0.00
FROM LEFF DANIEL P                      0.02
TO:  LEFF DANIEL P                      0.22
FROM LAVORATO JOHN J                    0.07
TO:  LAVORATO JOHN J                    0.16
FROM BERBERIAN DAVID                    0.00
TO:  BERBERIAN DAVID                    0.00
FROM DETMERING TIMOTHY J                0.00
TO:  DETMERING TIMOTHY J                0.00
FROM WAKEHAM JOHN                       0.00
TO:  WAKEHAM JOHN                       0.00
FROM POWERS WILLIAM                     0.00
TO:  POWERS WILLIAM                     0.00
FROM GOLD JOSEPH                        0.00
TO:  GOLD JOSEPH                        0.00
FROM BANNANTINE JAMES M                 0.07
TO:  BANNANTINE JAMES M                 0.00
FROM DUNCAN JOHN H                      0.00
TO:  DUNCAN JOHN H                      0.00
FROM SHAPIRO RICHARD S                  0.00
TO:  SHAPIRO RICHARD S                  0.05
FROM SHERRIFF JOHN R                    0.01
TO:  SHERRIFF JOHN R                    0.25
FROM SHELBY REX                         0.06
TO:  SHELBY REX                         0.36
FROM LEMAISTRE CHARLES                  0.00
TO:  LEMAISTRE CHARLES                  0.00
FROM DEFFNER JOSEPH M                   0.16
TO:  DEFFNER JOSEPH M                   0.05
FROM KISHKILL JOSEPH G                  0.00
TO:  KISHKILL JOSEPH G                  0.00
FROM WHALLEY LAWRENCE G                 0.03
TO:  WHALLEY LAWRENCE G                 0.04
FROM MCCONNELL MICHAEL S                0.03
TO:  MCCONNELL MICHAEL S                0.07
FROM PIRO JIM                           0.00
TO:  PIRO JIM                           0.06
FROM DELAINEY DAVID W                   0.02
TO:  DELAINEY DAVID W                   0.20
FROM SULLIVAN-SHAKLOVITZ COLLEEN        0.00
TO:  SULLIVAN-SHAKLOVITZ COLLEEN        0.00
FROM WROBEL BRUCE                       0.00
TO:  WROBEL BRUCE                       0.00
FROM LINDHOLM TOD A                     0.00
TO:  LINDHOLM TOD A                     0.00
FROM MEYER JEROME J                     0.00
TO:  MEYER JEROME J                     0.00
FROM LAY KENNETH L                      0.03
TO:  LAY KENNETH L                      0.44
FROM BUTTS ROBERT H                     0.00
TO:  BUTTS ROBERT H                     0.00
FROM OLSON CINDY K                      0.02
TO:  OLSON CINDY K                      0.29
FROM MCDONALD REBECCA                   0.06
TO:  MCDONALD REBECCA                   0.08
FROM CUMBERLAND MICHAEL S               0.00
TO:  CUMBERLAND MICHAEL S               0.00
FROM GAHN ROBERT S                      0.00
TO:  GAHN ROBERT S                      0.00
FROM MCCLELLAN GEORGE                   0.03
TO:  MCCLELLAN GEORGE                   0.00
FROM HERMANN ROBERT J                   0.00
TO:  HERMANN ROBERT J                   0.00
FROM SCRIMSHAW MATTHEW                  0.00
TO:  SCRIMSHAW MATTHEW                  0.00
FROM GATHMANN WILLIAM D                 0.00
TO:  GATHMANN WILLIAM D                 0.00
FROM HAEDICKE MARK E                    0.04
TO:  HAEDICKE MARK E                    0.03
FROM BOWEN JR RAYMOND M                 0.08
TO:  BOWEN JR RAYMOND M                 0.56
FROM GILLIS JOHN                        0.00
TO:  GILLIS JOHN                        0.00
FROM FITZGERALD JAY L                   0.00
TO:  FITZGERALD JAY L                   0.50
FROM MORAN MICHAEL P                    0.00
TO:  MORAN MICHAEL P                    0.00
FROM REDMOND BRIAN L                    0.12
TO:  REDMOND BRIAN L                    0.22
FROM BAZELIDES PHILIP J                 0.00
TO:  BAZELIDES PHILIP J                 0.00
FROM BELDEN TIMOTHY N                   0.03
TO:  BELDEN TIMOTHY N                   0.22
FROM DURAN WILLIAM D                    0.12
TO:  DURAN WILLIAM D                    0.25
FROM THORN TERENCE H                    0.00
TO:  THORN TERENCE H                    0.00
FROM FASTOW ANDREW S                    0.00
TO:  FASTOW ANDREW S                    0.00
FROM FOY JOE                            0.00
TO:  FOY JOE                            0.00
FROM CALGER CHRISTOPHER F               0.08
TO:  CALGER CHRISTOPHER F               0.17
FROM RICE KENNETH D                     0.05
TO:  RICE KENNETH D                     0.22
FROM KAMINSKI WINCENTY J                0.01
TO:  KAMINSKI WINCENTY J                0.01
FROM LOCKHART EUGENE E                  0.00
TO:  LOCKHART EUGENE E                  0.00
FROM COX DAVID                          0.00
TO:  COX DAVID                          0.12
FROM OVERDYKE JR JERE C                 0.00
TO:  OVERDYKE JR JERE C                 0.00
FROM PEREIRA PAULO V. FERRAZ            0.00
TO:  PEREIRA PAULO V. FERRAZ            0.00
FROM STABLER FRANK                      0.00
TO:  STABLER FRANK                      0.00
FROM SKILLING JEFFREY K                 0.02
TO:  SKILLING JEFFREY K                 0.28
FROM BLAKE JR. NORMAN P                 0.00
TO:  BLAKE JR. NORMAN P                 0.00
FROM SHERRICK JEFFREY B                 0.06
TO:  SHERRICK JEFFREY B                 0.72
FROM PRENTICE JAMES                     0.00
TO:  PRENTICE JAMES                     0.00
FROM GRAY RODNEY                        0.00
TO:  GRAY RODNEY                        0.00
FROM PICKERING MARK R                   0.01
TO:  PICKERING MARK R                   0.00
FROM THE TRAVEL AGENCY IN THE PARK      0.00
TO:  THE TRAVEL AGENCY IN THE PARK      0.00
FROM NOLES JAMES L                      0.00
TO:  NOLES JAMES L                      0.00
FROM KEAN STEVEN J                      0.01
TO:  KEAN STEVEN J                      0.06
FROM TOTAL                              0.00
TO:  TOTAL                              0.00
FROM FOWLER PEGGY                       0.00
TO:  FOWLER PEGGY                       0.00
FROM WASAFF GEORGE                      0.06
TO:  WASAFF GEORGE                      0.23
FROM WHITE JR THOMAS E                  0.00
TO:  WHITE JR THOMAS E                  0.00
FROM CHRISTODOULOU DIOMEDES             0.00
TO:  CHRISTODOULOU DIOMEDES             0.00
FROM ALLEN PHILLIP K                    0.02
TO:  ALLEN PHILLIP K                    0.03
FROM SHARP VICTORIA T                   0.01
TO:  SHARP VICTORIA T                   0.04
FROM JAEDICKE ROBERT                    0.00
TO:  JAEDICKE ROBERT                    0.00
FROM WINOKUR JR. HERBERT S              0.00
TO:  WINOKUR JR. HERBERT S              0.00
FROM BROWN MICHAEL                      0.01
TO:  BROWN MICHAEL                      0.02
FROM BADUM JAMES P                      0.00
TO:  BADUM JAMES P                      0.00
FROM HUGHES JAMES A                     0.05
TO:  HUGHES JAMES A                     0.15
FROM REYNOLDS LAWRENCE                  0.00
TO:  REYNOLDS LAWRENCE                  0.00
FROM DIMICHELE RICHARD G                0.00
TO:  DIMICHELE RICHARD G                0.00
FROM BHATNAGAR SANJAY                   0.00
TO:  BHATNAGAR SANJAY                   0.03
FROM CARTER REBECCA C                   0.09
TO:  CARTER REBECCA C                   0.47
FROM BUCHANAN HAROLD G                  0.00
TO:  BUCHANAN HAROLD G                  0.00
FROM YEAP SOON                          0.00
TO:  YEAP SOON                          0.00
FROM MURRAY JULIA H                     0.01
TO:  MURRAY JULIA H                     0.04
FROM GARLAND C KEVIN                    0.05
TO:  GARLAND C KEVIN                    0.61
FROM DODSON KEITH                       0.06
TO:  DODSON KEITH                       0.21
FROM YEAGER F SCOTT                     0.00
TO:  YEAGER F SCOTT                     0.00
FROM HIRKO JOSEPH                       0.00
TO:  HIRKO JOSEPH                       0.00
FROM DIETRICH JANET R                   0.12
TO:  DIETRICH JANET R                   0.22
FROM DERRICK JR. JAMES V                0.03
TO:  DERRICK JR. JAMES V                0.02
FROM FREVERT MARK A                     0.07
TO:  FREVERT MARK A                     0.29
FROM PAI LOU L                          0.00
TO:  PAI LOU L                          0.00
FROM BAY FRANKLIN R                     0.00
TO:  BAY FRANKLIN R                     0.00
FROM HAYSLETT RODERICK J                0.01
TO:  HAYSLETT RODERICK J                0.04
FROM FUGH JOHN L                        0.00
TO:  FUGH JOHN L                        0.00
FROM FALLON JAMES B                     0.02
TO:  FALLON JAMES B                     0.49
FROM KOENIG MARK E                      0.02
TO:  KOENIG MARK E                      0.25
FROM SAVAGE FRANK                       0.00
TO:  SAVAGE FRANK                       0.00
FROM IZZO LAWRENCE L                    0.06
TO:  IZZO LAWRENCE L                    0.26
FROM TILNEY ELIZABETH A                 0.02
TO:  TILNEY ELIZABETH A                 0.58
FROM MARTIN AMANDA K                    0.01
TO:  MARTIN AMANDA K                    0.00
FROM BUY RICHARD B                      0.04
TO:  BUY RICHARD B                      0.07
FROM GRAMM WENDY L                      0.00
TO:  GRAMM WENDY L                      0.00
FROM CAUSEY RICHARD A                   0.03
TO:  CAUSEY RICHARD A                   0.24
FROM TAYLOR MITCHELL S                  0.00
TO:  TAYLOR MITCHELL S                  0.00
FROM DONAHUE JR JEFFREY M               0.22
TO:  DONAHUE JR JEFFREY M               0.50
FROM GLISAN JR BEN F                    0.06
TO:  GLISAN JR BEN F                    0.38

Beware of BUGS!!!

When Katie was working on the Enron POI identifier, she engineered a feature that identified when a given person was on the same email as a POI. So for example, if Ken Lay and Katie Malone are both recipients of the same email message, then Katie Malone should have her "shared receipt" feature incremented. If she shares lots of emails with POIs, maybe she's a POI herself.

Here's the problem: there was a subtle bug, that Ken Lay's "shared receipt" counter would also be incremented when this happens. And of course, then Ken Lay always shares receipt with a POI, because he is a POI. So the "shared receipt" feature became extremely powerful in finding POIs, because it effectively was encoding the label for each person as a feature.

We found this first by being suspicious of a classifier that was always returning 100% accuracy. Then we removed features one at a time, and found that this feature was driving all the performance. Then, digging back through the feature code, we found the bug outlined above. We changed the code so that a person's "shared receipt" feature was only incremented if there was a different POI who received the email, reran the code, and tried again. The accuracy dropped to a more reasonable level.

We take a couple of lessons from this:

  • Anyone can make mistakes--be skeptical of your results!
  • 100% accuracy should generally make you suspicious. Extraordinary claims require extraordinary proof.
  • If there's a feature that tracks your labels a little too closely, it's very likely a bug!
  • If you're sure it's not a bug, you probably don't need machine learning--you can just use that feature alone to assign labels.

Feature Selection Mini Project


In [16]:
sys.path.append(dataPath+'text_learning/')

# Both files are looked up relative to the current working directory.
words_file = "your_word_data.pkl"
authors_file = "your_email_authors.pkl"

# Pickles are binary data: open with "rb" rather than text mode, and use
# `with` so each file handle is closed as soon as it has been read (the bare
# open(...) calls used previously were never closed).
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that come from a trusted source.
with open(words_file, "rb") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "rb") as authors_handle:
    authors = pickle.load(authors_handle)

In [17]:
### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Prefer the new
# module and fall back to the old one on installs that predate it. Either way
# the name `cross_validation` stays bound, so any other code that refers to it
# keeps working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Hold out 10% of the samples for testing; random_state pins the shuffle so
# the split is the same on every run.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser over the raw text: sublinear term-frequency scaling,
# terms occurring in more than half of the documents are dropped (max_df=0.5),
# and English stop words are removed.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the test split.
# NOTE: both names are rebound from raw text to feature matrices here, so this
# cell cannot be re-run on its own -- re-run the train/test split cell first.
features_train = vectorizer.fit_transform(features_train)
# Only the test matrix is densified here; the training matrix is converted
# with .toarray() in the next cell, after it has been truncated.
features_test  = vectorizer.transform(features_test).toarray()

In [19]:
### Deliberately provoke overfitting: few data points combined with many
### features is the classic recipe, so keep only the first 150 training
### events (and their labels) and densify the feature matrix.
### NOTE: this rebinds both names, so the cell is not re-runnable on its own.
features_train, labels_train = features_train[:150].toarray(), labels_train[:150]

This is an iterative process:

  • start off with a pared-down version of the dataset
  • run a decision tree on it
  • get the accuracy, which should be rather high
  • get the important features, defined as those with an importance above 0.2
  • remove those features
  • run again until very few features have an importance above 0.2

In [20]:
from sklearn import tree

# Fit a decision tree with default settings on the truncated training set
# (fit() returns the estimator itself, so it can be chained).
clf = tree.DecisionTreeClassifier().fit(features_train, labels_train)

# Report accuracy on the held-out test set.
print("{}{:.2f}".format("Classifier accurancy: ",
                        clf.score(features_test, labels_test)))


Classifier accurancy: 0.81

In [25]:
import operator

# Per-feature importances of the fitted tree, indexed by feature column.
featuresImportance = clf.feature_importances_

# Keep [column index, importance] for every feature whose importance is
# strictly above 0.2 (still in column order at this point).
featuresSortedByScore = [
    [feature, score]
    for feature, score in enumerate(featuresImportance)
    if score > 0.2
]

# Most important feature first. `df` is a plain list of pairs, not a
# DataFrame; the next cell reads it by this name.
df = sorted(featuresSortedByScore, key=operator.itemgetter(1), reverse=True)

for column, importance in df:
    print("{:5d}: {:f}".format(column, importance))


21323: 0.363636

In [26]:
# Translate each high-importance column index in `df` back to its word.
# The vocabulary is fetched once, not once per loop iteration, since
# building the full name list is the expensive part.
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases, so use the new accessor when the vectorizer has it.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = _get_names()

for entry in df:
    # entry is a [column index, importance] pair.
    print(feature_names[entry[0]])


houectect

In [ ]: