Train Models

Train a logistic regression model with the engineered features

Using the engineered features (LDA-based topic similarity, sentence position, sentence length, and readability metrics), I trained a logistic regression model that can be applied to new sentences to predict whether they should be highlighted. I chose logistic regression because this is a binary classification problem and because the feature weights can later be inspected to gauge the relative importance of each feature.

I also tested two resampling methods from the imblearn library (synthetic minority oversampling [SMOTE] and random undersampling) to account for the imbalanced dataset (~2% highlighted vs. ~98% non-highlighted sentences). These gave similar highlight sensitivity and slightly more balanced precision and F1 scores compared with logistic regression using sklearn's automatic class-weight rebalancing.

Finally, I tested a random forest classifier. With this approach the non-highlighted samples dominated the predictions (despite balancing class weights in sklearn), so highlight recall was very low.

Thus, the final model that is applied in the backend of the skimr web app is a logistic regression model.
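
As a reference for the comparison described above, here is a minimal sketch (not the exact training cell) of fitting the class-weighted logistic regression and the SMOTE variant. It assumes the per-sentence features computed below have already been assembled into a DataFrame `features` with a 0/1 label column `hlorno`; the feature column names are illustrative.


In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Assumed layout: one row per sentence, engineered features + 0/1 highlight label.
feat_cols = ['lda_sim', 'frac_words', 'frac_sents', 'length',
             'ARI', 'FRE', 'FKG', 'GFI', 'SMG', 'CLI', 'LIX', 'RIX']
X_train, X_test, y_train, y_test = train_test_split(
    features[feat_cols], features['hlorno'],
    test_size=0.25, stratify=features['hlorno'], random_state=42)

# Option 1: reweight the rare highlighted class inside the model
logreg_bal = LogisticRegression(class_weight='balanced')
logreg_bal.fit(X_train, y_train)
print(classification_report(y_test, logreg_bal.predict(X_test)))

# Option 2: oversample the highlighted class with SMOTE, then fit an unweighted model
# (fit_resample is called fit_sample in older imblearn versions)
X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)
logreg_smote = LogisticRegression().fit(X_res, y_res)
print(classification_report(y_test, logreg_smote.predict(X_test)))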


In [2]:
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob, Word
import pandas as pd
import sklearn
import pickle
import numpy as np
import scipy
from scipy import spatial
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import learning_curve, GridSearchCV, StratifiedKFold, cross_val_score, train_test_split 
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import RegexpTokenizer
word_tokenizer = RegexpTokenizer(r'\s+', gaps=True)
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics.pairwise import cosine_similarity

import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

from stop_words import get_stop_words
stop_en = get_stop_words('en')

from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

en_words = set(nltk.corpus.words.words())

from gensim import corpora, models
import gensim

import timeit
import re
import string
from string import whitespace, punctuation

from nltk.corpus import stopwords
stopw_en = stopwords.words('english')
all_stopw = set(stopw_en) | set(stop_en)
print(len(all_stopw))


207

In [2]:
import math

from utils import get_char_count
from utils import get_words
from utils import get_sentences
from utils import count_syllables
from utils import count_complex_words


class Readability:
    analyzedVars = {}

    def __init__(self, text):
        self.analyze_text(text)

    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count/sentence_count
        
        self.analyzedVars = {
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }

    def ARI(self):
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            score = 4.71 * (self.analyzedVars['char_cnt'] / self.analyzedVars['word_cnt']) + 0.5 * (self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt']) - 21.43
        return score
        
    def FleschReadingEase(self):
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            score = 206.835 - (1.015 * (self.analyzedVars['avg_words_p_sentence'])) - (84.6 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']))
        return round(score, 4)
        
    def FleschKincaidGradeLevel(self):
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            score = 0.39 * (self.analyzedVars['avg_words_p_sentence']) + 11.8 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']) - 15.59
        return round(score, 4)
        
    def GunningFogIndex(self):
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            score = 0.4 * ((self.analyzedVars['avg_words_p_sentence']) + (100 * (self.analyzedVars['complex_word_cnt']/self.analyzedVars['word_cnt'])))
        return round(score, 4)

    def SMOGIndex(self):
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            score = (math.sqrt(self.analyzedVars['complex_word_cnt']*(30/self.analyzedVars['sentence_cnt'])) + 3)
        return score

    def ColemanLiauIndex(self):
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            score = (5.89*(self.analyzedVars['char_cnt']/self.analyzedVars['word_cnt']))-(30*(self.analyzedVars['sentence_cnt']/self.analyzedVars['word_cnt']))-15.8
        return round(score, 4)

    def LIX(self):
        longwords = 0.0
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            for word in self.analyzedVars['words']:
                if len(word) >= 7:
                    longwords += 1.0
            score = self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt'] + float(100 * longwords) / self.analyzedVars['word_cnt']
        return score

    def RIX(self):
        longwords = 0.0
        score = 0.0 
        if self.analyzedVars['word_cnt'] > 0.0:
            for word in self.analyzedVars['words']:
                if len(word) >= 7:
                    longwords += 1.0
            score = longwords / self.analyzedVars['sentence_cnt']
        return score

if __name__ == "__main__":
    text = """We are close to wrapping up our 10 week Rails Course. This week we will cover a handful of topics commonly encountered in Rails projects. We then wrap up with part 2 of our Reddit on Rails exercise!  By now you should be hard at work on your personal projects. The students in the course just presented in front of the class with some live demos and a brief intro to to the problems their app were solving. Maybe set aside some time this week to show someone your progress, block off 5 minutes and describe what goal you are working towards, the current state of the project (is it almost done, just getting started, needs UI, etc.), and then show them a quick demo of the app. Explain what type of feedback you are looking for (conceptual, design, usability, etc.) and see what they have to say.  As we are wrapping up the course you need to be focused on learning as much as you can, but also making sure you have the tools to succeed after the class is over."""

    rd = Readability(text)

In [3]:
# testing readability
rd = Readability('We are close to wrapping up our 10 week Rails Course. This week we will cover a handful of topics commonly encountered in Rails projects. We then wrap up with part 2 of our Reddit on Rails exercise!  By now you should be hard at work on your personal projects. The students in the course just presented in front of the class with some live demos and a brief intro to to the problems their app were solving. Maybe set aside some time this week to show someone your progress, block off 5 minutes and describe what goal you are working towards, the current state of the project (is it almost done, just getting started, needs UI, etc.), and then show them a quick demo of the app. Explain what type of feedback you are looking for (conceptual, design, usability, etc.) and see what they have to say.  As we are wrapping up the course you need to be focused on learning as much as you can, but also making sure you have the tools to succeed after the class is over.')
print(rd.ARI())


7.216451612903228

Load dictionary and data


In [11]:
# dict_all = pickle.load(open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dict_all_new','rb'))
# data = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/data_pd_new')
# data.head()

CONVERT list of paragraphs in 'text' column into a single string containing all text


In [ ]:
set_tr = data

sent_join = []
for i in set_tr['text']:
    # join each article's list of paragraphs into a single string
    sent_join.append(' '.join(i))

DELETE HIGHLIGHTS FROM FULLTEXT SENTENCES


In [ ]:
fulls_noh = []
for n, i in enumerate(set_tr['highlights']):
    full = sent_join[n]
    # replace the highlighted passage with '.' so only non-highlighted text remains
    full_noh = full.replace(i, '.')
    fulls_noh.append(full_noh)
set_tmp = set_tr
set_tr = pd.DataFrame({'ids':set_tmp['ids'], 'highlights':set_tmp['highlights'], 'text':set_tmp['text'], 'textwohighlight':fulls_noh})

print(len(set_tr['textwohighlight']))

Save set_tr


In [ ]:
# fset_tr = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/set_tr','wb')
# pickle.dump(set_tr, fset_tr)

LOAD set_tr


In [7]:
set_tr = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/set_tr')
print(set_tr)


                                             highlights   ids  \
0     Until you appreciate what you currently have, ...     2   
1     And let’s not feel terrified, but full of rene...     4   
2     Hard work is doing the work other people don’t...     5   
3     He’d made up his mind to skip the dog thing al...    10   
4     Realizing that our actions, feelings and behav...    12   
5     So, the advice I have is to find what your nor...    13   
6     Cryptoeconomic approaches combine cryptography...    14   
7     You can feel however you want about Durant’s d...    15   
8      A good principle should be a tradeoff, a choice.    16   
9     But do I think there are going to be 12 people...    17   
10    Angry, unhappy people demand bad leaders, who ...    18   
11    For clarity we remove technical terms and we p...    19   
12    The fact that something takes longer and requi...    22   
13    Prioritize placing buttons at the bottom of th...    23   
14    Token networks remove this friction by alignin...    25   
15    (We also didn’t see that he double-dribbled li...    26   
16    One is so fundamental as to be trite. Executio...    27   
17    I just look at Apple these days and see a comp...    28   
18    Women are conditioned to be very aware of the ...    29   
19               Because I enjoy the sound of laughter.    30   
20                  Tie your story into a larger point.    31   
21    “Once you make a decision, the universe conspi...    32   
22    Creativity allows us to take the data we have,...    33   
23    “When I had nothing to lose, I had everything....    34   
24    We just need to pause a moment and recognize t...    35   
25    We are less bored than our ancestors were, but...    36   
26    Sometimes the questions are complicated and th...    38   
27    So how do you actually win? By deploying const...    39   
28    We are all apprentices in a craft where no one...    40   
29    Interfaces are stories, and every designer is ...    41   
...                                                 ...   ...   
3171  A vast majority of startups have 5 to 10 gener...  4496   
3172  It’s also a post, so it can be recommended, hi...  4498   
3173  Personally and professionally this is an excit...  4499   
3174                     The best ships are friendships  4502   
3175                                             really  4503   
3176  The platform’s growth has spurned a culture of...  4507   
3177  Societies that hold girls to rigid expectation...  4509   
3178  Twitter enables connections between accounts. ...  4512   
3179  If the minimum wage had kept pace with product...  4514   
3180  Podcasts are terrific. But for many people (my...  4515   
3181                           Trendy names never work.  4517   
3182           And oddly absent in all of this: Dr. Dre  4520   
3183                      Why are you crying at 7:30am?  4522   
3184  Snapchat captures such screenshot usage, but m...  4526   
3185  If there’s anything drug addicts are good at, ...  4530   
3186  important to remember that product design is a...  4531   
3187                                          Chameleon  4532   
3188  Don’t confuse shame and guilt. Guilt is health...  4533   
3189  How can you experience the worst of humanity b...  4536   
3190                                        credibility  4537   
3191                                            cashier  4538   
3192  Em 2015 a mulher tem que brigar para ser o que...  4539   
3193  In the Top 500 most-saved articles from the fi...  4540   
3194  While social norms and even state laws say a m...  4541   
3195                                             circus  4543   
3196  A desk is a dangerous place from which to view...  4546   
3197      A crowd is now better at journalism than CNN.  4548   
3198                 the opposite of science isn’t girl  4554   
3199  Medium will be the first and best network for ...  4555   
3200                Her struggles deserve your respect.  4558   

                                                   text  \
0     [https://unsplash.com/?photo=pFqrYbhIAXs, 35 T...   
1     [Not giving in to terrorism, , The purpose of ...   
2     [Actual hard work, If you’re reading this, you...   
3     [Border Collie Who Almost Became Trump’s White...   
4     [, How to use Cognitive Behavioural Therapy to...   
5     [I Don’t Care What You Want or Need, , I’m des...   
6     [Cryptoeconomics 101, Much has been discussed ...   
7     [So Kevin Durant Can Play Center Now?, Golden ...   
8     [In the future, design principles won’t be abo...   
9     [The Rise of Audio & Voice, , This is a hu...   
10    [Why Trump Was Inevitable — But Recovery Isn’t...   
11    [UX Writing: How to do it like Google with thi...   
12    [, Why I illustrate all our blog posts, as a C...   
13    [All Thumbs, Why Reach Navigation Should Repla...   
14    [Crypto Tokens: A Breakthrough in Open Network...   
15    [NBA Shootaround, It Takes Two, Steph Curry an...   
16    [What Really Happened with Vista, I generally ...   
17    [, The Falling Apple, Revisited, Some thoughts...   
18    [Whose Pain Counts?, Many years ago, I had sex...   
19    [Upgrading from Node 6 to Node 8: a real-world...   
20    [, Storytelling Should Be the Number One Skill...   
21    [https://unsplash.com/?photo=3Z70SDuYs5g, Will...   
22    [If You Want to Be Creative, Don’t Be Data Dri...   
23    [https://unsplash.com/?photo=wxxAx26SXys, Live...   
24    [Melania Trump isn’t a black and white issue, ...   
25    [Illustration by Ana Vasquez, A Day at Google,...   
26    [, 10 incredible Quotes to guide your life., Y...   
27    [Advice For Millennials Entering Their First J...   
28    [How to Go From Hobbyist to Professional Devel...   
29    [Storyframes before wireframes: starting desig...   
...                                                 ...   
3171  [, Startup Advising Is Broken — Here’s What We...   
3172  [Introducing Letters, At Medium, we are buildi...   
3173  [, HODINKEE + Watchville join forces, North Te...   
3174  [The problem: it’s really tedious and stressfu...   
3175  [, VSCO Suite for Mac OS X, Concept to bring s...   
3176  [Re-imagining Twitter, How Chris Sacca’s 8,500...   
3177  [What’s killing our girls, My eldest daughter ...   
3178  [The Internet of Tweets, Will.I.Am visiting Tw...   
3179  [Youth Unemployment and Dr. King’s Dream, (U.S...   
3180  [Introducing Spoken.co, (You can also listen t...   
3181  [The 9 Habits of Highly Effective Restaurants,...   
3182  [, Apple Eating Beats Revisited, Remember when...   
3183  [, How to make peanut butter toast, There are ...   
3184  [, The Power of the Screenshot, If you analyze...   
3185  [No this is not my mug shot, this is not me. P...   
3186  [The Time When Lady Gaga Told YouTube to Keep ...   
3187  [27 iOS open source libraries to skyrocket you...   
3188  [via Educate Empower Kids, The Naked People In...   
3189  [, My Mom Has a Question, My parents and I jus...   
3190  [Walk then talk, My guiding philosophy on deve...   
3191  [Traveling in Japan, A quick collection of not...   
3192  [“O Nerd Padrão é Imbecil e Preconceituoso”, E...   
3193  [, Surprise! Our Attention Spans Aren’t Dead!,...   
3194  [A Room With a View — and Baby Too, Making Spa...   
3195  [How I May Have Saved the Life of the Person i...   
3196  [Culture Is More Than a Ping Pong Table, Organ...   
3197  [Bernie Sanders interviewed by Brianna Keilar....   
3198  [Papas, please let your babies grow up to be p...   
3199  [This is not a new strategy., In April 2012, I...   
3200  [What If Don Draper Was a Woman?, Jessica Knol...   

                                        textwohighlight  
0     https://unsplash.com/?photo=pFqrYbhIAXs 35 Thi...  
1     Not giving in to terrorism  The purpose of ter...  
2     Actual hard work If you’re reading this, you p...  
3     Border Collie Who Almost Became Trump’s White ...  
4      How to use Cognitive Behavioural Therapy to l...  
5     I Don’t Care What You Want or Need  I’m desper...  
6     Cryptoeconomics 101 Much has been discussed ab...  
7     So Kevin Durant Can Play Center Now? Golden St...  
8     In the future, design principles won’t be abou...  
9     The Rise of Audio & Voice  This is a huge ...  
10    Why Trump Was Inevitable — But Recovery Isn’t ...  
11    UX Writing: How to do it like Google with this...  
12     Why I illustrate all our blog posts, as a CEO...  
13    All Thumbs, Why Reach Navigation Should Replac...  
14    Crypto Tokens: A Breakthrough in Open Network ...  
15    NBA Shootaround It Takes Two Steph Curry and K...  
16    What Really Happened with Vista I generally ha...  
17     The Falling Apple, Revisited Some thoughts on...  
18    Whose Pain Counts? Many years ago, I had sex w...  
19    Upgrading from Node 6 to Node 8: a real-world ...  
20     Storytelling Should Be the Number One Skill Y...  
21    https://unsplash.com/?photo=3Z70SDuYs5g Willpo...  
22    If You Want to Be Creative, Don’t Be Data Driv...  
23    https://unsplash.com/?photo=wxxAx26SXys Live L...  
24    Melania Trump isn’t a black and white issue  W...  
25    Illustration by Ana Vasquez A Day at Google, L...  
26     10 incredible Quotes to guide your life. You ...  
27    Advice For Millennials Entering Their First Jo...  
28    How to Go From Hobbyist to Professional Develo...  
29    Storyframes before wireframes: starting design...  
...                                                 ...  
3171   Startup Advising Is Broken — Here’s What We’r...  
3172  Introducing Letters At Medium, we are building...  
3173   HODINKEE + Watchville join forces North Techn...  
3174  The problem: it’s really tedious and stressful...  
3175   VSCO Suite for Mac OS X Concept to bring simp...  
3176  Re-imagining Twitter How Chris Sacca’s 8,500 w...  
3177  What’s killing our girls My eldest daughter is...  
3178  The Internet of Tweets Will.I.Am visiting Twit...  
3179  Youth Unemployment and Dr. King’s Dream (U.S. ...  
3180  Introducing Spoken.co (You can also listen to ...  
3181  The 9 Habits of Highly Effective Restaurants W...  
3182   Apple Eating Beats Revisited Remember when ev...  
3183   How to make peanut butter toast There are ste...  
3184   The Power of the Screenshot If you analyzed m...  
3185  No this is not my mug shot, this is not me. Ph...  
3186  The Time When Lady Gaga Told YouTube to Keep i...  
3187  27 iOS open source libraries to skyrocket your...  
3188  via Educate Empower Kids The Naked People In Y...  
3189   My Mom Has a Question My parents and I just g...  
3190  Walk then talk My guiding philosophy on develo...  
3191  Traveling in Japan A quick collection of notes...  
3192  “O Nerd Padrão é Imbecil e Preconceituoso” Ess...  
3193   Surprise! Our Attention Spans Aren’t Dead! Ho...  
3194  A Room With a View — and Baby Too Making Space...  
3195  How I May Have Saved the Life of the Person in...  
3196  Culture Is More Than a Ping Pong Table Organiz...  
3197  Bernie Sanders interviewed by Brianna Keilar. ...  
3198  Papas, please let your babies grow up to be pr...  
3199  This is not a new strategy. In April 2012, I w...  
3200  What If Don Draper Was a Woman? Jessica Knoll,...  

[3201 rows x 4 columns]

Create 'dataset' for further analysis (and pickle)


In [38]:
dataset = pd.DataFrame({'highlights':set_tr['highlights'], 'ids':set_tr['ids'], 'text':set_tr['text'], \
                        'textwohighlight':fulls_noh, 'textjoined':sent_join})
print(dataset)

dataset.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dataset_hl_ids_txt_wohl_joined')


                                             highlights   ids  \
0     Until you appreciate what you currently have, ...     2   
1     And let’s not feel terrified, but full of rene...     4   
2     Hard work is doing the work other people don’t...     5   
3     He’d made up his mind to skip the dog thing al...    10   
4     Realizing that our actions, feelings and behav...    12   
5     So, the advice I have is to find what your nor...    13   
6     Cryptoeconomic approaches combine cryptography...    14   
7     You can feel however you want about Durant’s d...    15   
8      A good principle should be a tradeoff, a choice.    16   
9     But do I think there are going to be 12 people...    17   
10    Angry, unhappy people demand bad leaders, who ...    18   
11    For clarity we remove technical terms and we p...    19   
12    The fact that something takes longer and requi...    22   
13    Prioritize placing buttons at the bottom of th...    23   
14    Token networks remove this friction by alignin...    25   
15    (We also didn’t see that he double-dribbled li...    26   
16    One is so fundamental as to be trite. Executio...    27   
17    I just look at Apple these days and see a comp...    28   
18    Women are conditioned to be very aware of the ...    29   
19               Because I enjoy the sound of laughter.    30   
20                  Tie your story into a larger point.    31   
21    “Once you make a decision, the universe conspi...    32   
22    Creativity allows us to take the data we have,...    33   
23    “When I had nothing to lose, I had everything....    34   
24    We just need to pause a moment and recognize t...    35   
25    We are less bored than our ancestors were, but...    36   
26    Sometimes the questions are complicated and th...    38   
27    So how do you actually win? By deploying const...    39   
28    We are all apprentices in a craft where no one...    40   
29    Interfaces are stories, and every designer is ...    41   
...                                                 ...   ...   
3171  A vast majority of startups have 5 to 10 gener...  4496   
3172  It’s also a post, so it can be recommended, hi...  4498   
3173  Personally and professionally this is an excit...  4499   
3174                     The best ships are friendships  4502   
3175                                             really  4503   
3176  The platform’s growth has spurned a culture of...  4507   
3177  Societies that hold girls to rigid expectation...  4509   
3178  Twitter enables connections between accounts. ...  4512   
3179  If the minimum wage had kept pace with product...  4514   
3180  Podcasts are terrific. But for many people (my...  4515   
3181                           Trendy names never work.  4517   
3182           And oddly absent in all of this: Dr. Dre  4520   
3183                      Why are you crying at 7:30am?  4522   
3184  Snapchat captures such screenshot usage, but m...  4526   
3185  If there’s anything drug addicts are good at, ...  4530   
3186  important to remember that product design is a...  4531   
3187                                          Chameleon  4532   
3188  Don’t confuse shame and guilt. Guilt is health...  4533   
3189  How can you experience the worst of humanity b...  4536   
3190                                        credibility  4537   
3191                                            cashier  4538   
3192  Em 2015 a mulher tem que brigar para ser o que...  4539   
3193  In the Top 500 most-saved articles from the fi...  4540   
3194  While social norms and even state laws say a m...  4541   
3195                                             circus  4543   
3196  A desk is a dangerous place from which to view...  4546   
3197      A crowd is now better at journalism than CNN.  4548   
3198                 the opposite of science isn’t girl  4554   
3199  Medium will be the first and best network for ...  4555   
3200                Her struggles deserve your respect.  4558   

                                                   text  \
0     [https://unsplash.com/?photo=pFqrYbhIAXs, 35 T...   
1     [Not giving in to terrorism, , The purpose of ...   
2     [Actual hard work, If you’re reading this, you...   
3     [Border Collie Who Almost Became Trump’s White...   
4     [, How to use Cognitive Behavioural Therapy to...   
5     [I Don’t Care What You Want or Need, , I’m des...   
6     [Cryptoeconomics 101, Much has been discussed ...   
7     [So Kevin Durant Can Play Center Now?, Golden ...   
8     [In the future, design principles won’t be abo...   
9     [The Rise of Audio & Voice, , This is a hu...   
10    [Why Trump Was Inevitable — But Recovery Isn’t...   
11    [UX Writing: How to do it like Google with thi...   
12    [, Why I illustrate all our blog posts, as a C...   
13    [All Thumbs, Why Reach Navigation Should Repla...   
14    [Crypto Tokens: A Breakthrough in Open Network...   
15    [NBA Shootaround, It Takes Two, Steph Curry an...   
16    [What Really Happened with Vista, I generally ...   
17    [, The Falling Apple, Revisited, Some thoughts...   
18    [Whose Pain Counts?, Many years ago, I had sex...   
19    [Upgrading from Node 6 to Node 8: a real-world...   
20    [, Storytelling Should Be the Number One Skill...   
21    [https://unsplash.com/?photo=3Z70SDuYs5g, Will...   
22    [If You Want to Be Creative, Don’t Be Data Dri...   
23    [https://unsplash.com/?photo=wxxAx26SXys, Live...   
24    [Melania Trump isn’t a black and white issue, ...   
25    [Illustration by Ana Vasquez, A Day at Google,...   
26    [, 10 incredible Quotes to guide your life., Y...   
27    [Advice For Millennials Entering Their First J...   
28    [How to Go From Hobbyist to Professional Devel...   
29    [Storyframes before wireframes: starting desig...   
...                                                 ...   
3171  [, Startup Advising Is Broken — Here’s What We...   
3172  [Introducing Letters, At Medium, we are buildi...   
3173  [, HODINKEE + Watchville join forces, North Te...   
3174  [The problem: it’s really tedious and stressfu...   
3175  [, VSCO Suite for Mac OS X, Concept to bring s...   
3176  [Re-imagining Twitter, How Chris Sacca’s 8,500...   
3177  [What’s killing our girls, My eldest daughter ...   
3178  [The Internet of Tweets, Will.I.Am visiting Tw...   
3179  [Youth Unemployment and Dr. King’s Dream, (U.S...   
3180  [Introducing Spoken.co, (You can also listen t...   
3181  [The 9 Habits of Highly Effective Restaurants,...   
3182  [, Apple Eating Beats Revisited, Remember when...   
3183  [, How to make peanut butter toast, There are ...   
3184  [, The Power of the Screenshot, If you analyze...   
3185  [No this is not my mug shot, this is not me. P...   
3186  [The Time When Lady Gaga Told YouTube to Keep ...   
3187  [27 iOS open source libraries to skyrocket you...   
3188  [via Educate Empower Kids, The Naked People In...   
3189  [, My Mom Has a Question, My parents and I jus...   
3190  [Walk then talk, My guiding philosophy on deve...   
3191  [Traveling in Japan, A quick collection of not...   
3192  [“O Nerd Padrão é Imbecil e Preconceituoso”, E...   
3193  [, Surprise! Our Attention Spans Aren’t Dead!,...   
3194  [A Room With a View — and Baby Too, Making Spa...   
3195  [How I May Have Saved the Life of the Person i...   
3196  [Culture Is More Than a Ping Pong Table, Organ...   
3197  [Bernie Sanders interviewed by Brianna Keilar....   
3198  [Papas, please let your babies grow up to be p...   
3199  [This is not a new strategy., In April 2012, I...   
3200  [What If Don Draper Was a Woman?, Jessica Knol...   

                                             textjoined  \
0     https://unsplash.com/?photo=pFqrYbhIAXs 35 Thi...   
1     Not giving in to terrorism  The purpose of ter...   
2     Actual hard work If you’re reading this, you p...   
3     Border Collie Who Almost Became Trump’s White ...   
4      How to use Cognitive Behavioural Therapy to l...   
5     I Don’t Care What You Want or Need  I’m desper...   
6     Cryptoeconomics 101 Much has been discussed ab...   
7     So Kevin Durant Can Play Center Now? Golden St...   
8     In the future, design principles won’t be abou...   
9     The Rise of Audio & Voice  This is a huge ...   
10    Why Trump Was Inevitable — But Recovery Isn’t ...   
11    UX Writing: How to do it like Google with this...   
12     Why I illustrate all our blog posts, as a CEO...   
13    All Thumbs, Why Reach Navigation Should Replac...   
14    Crypto Tokens: A Breakthrough in Open Network ...   
15    NBA Shootaround It Takes Two Steph Curry and K...   
16    What Really Happened with Vista I generally ha...   
17     The Falling Apple, Revisited Some thoughts on...   
18    Whose Pain Counts? Many years ago, I had sex w...   
19    Upgrading from Node 6 to Node 8: a real-world ...   
20     Storytelling Should Be the Number One Skill Y...   
21    https://unsplash.com/?photo=3Z70SDuYs5g Willpo...   
22    If You Want to Be Creative, Don’t Be Data Driv...   
23    https://unsplash.com/?photo=wxxAx26SXys Live L...   
24    Melania Trump isn’t a black and white issue  W...   
25    Illustration by Ana Vasquez A Day at Google, L...   
26     10 incredible Quotes to guide your life. You ...   
27    Advice For Millennials Entering Their First Jo...   
28    How to Go From Hobbyist to Professional Develo...   
29    Storyframes before wireframes: starting design...   
...                                                 ...   
3171   Startup Advising Is Broken — Here’s What We’r...   
3172  Introducing Letters At Medium, we are building...   
3173   HODINKEE + Watchville join forces North Techn...   
3174  The problem: it’s really tedious and stressful...   
3175   VSCO Suite for Mac OS X Concept to bring simp...   
3176  Re-imagining Twitter How Chris Sacca’s 8,500 w...   
3177  What’s killing our girls My eldest daughter is...   
3178  The Internet of Tweets Will.I.Am visiting Twit...   
3179  Youth Unemployment and Dr. King’s Dream (U.S. ...   
3180  Introducing Spoken.co (You can also listen to ...   
3181  The 9 Habits of Highly Effective Restaurants W...   
3182   Apple Eating Beats Revisited Remember when ev...   
3183   How to make peanut butter toast There are ste...   
3184   The Power of the Screenshot If you analyzed m...   
3185  No this is not my mug shot, this is not me. Ph...   
3186  The Time When Lady Gaga Told YouTube to Keep i...   
3187  27 iOS open source libraries to skyrocket your...   
3188  via Educate Empower Kids The Naked People In Y...   
3189   My Mom Has a Question My parents and I just g...   
3190  Walk then talk My guiding philosophy on develo...   
3191  Traveling in Japan A quick collection of notes...   
3192  “O Nerd Padrão é Imbecil e Preconceituoso” Ess...   
3193   Surprise! Our Attention Spans Aren’t Dead! Ho...   
3194  A Room With a View — and Baby Too Making Space...   
3195  How I May Have Saved the Life of the Person in...   
3196  Culture Is More Than a Ping Pong Table Organiz...   
3197  Bernie Sanders interviewed by Brianna Keilar. ...   
3198  Papas, please let your babies grow up to be pr...   
3199  This is not a new strategy. In April 2012, I w...   
3200  What If Don Draper Was a Woman? Jessica Knoll,...   

                                        textwohighlight  
0     https://unsplash.com/?photo=pFqrYbhIAXs 35 Thi...  
1     Not giving in to terrorism  The purpose of ter...  
2     Actual hard work If you’re reading this, you p...  
3     Border Collie Who Almost Became Trump’s White ...  
4      How to use Cognitive Behavioural Therapy to l...  
5     I Don’t Care What You Want or Need  I’m desper...  
6     Cryptoeconomics 101 Much has been discussed ab...  
7     So Kevin Durant Can Play Center Now? Golden St...  
8     In the future, design principles won’t be abou...  
9     The Rise of Audio & Voice  This is a huge ...  
10    Why Trump Was Inevitable — But Recovery Isn’t ...  
11    UX Writing: How to do it like Google with this...  
12     Why I illustrate all our blog posts, as a CEO...  
13    All Thumbs, Why Reach Navigation Should Replac...  
14    Crypto Tokens: A Breakthrough in Open Network ...  
15    NBA Shootaround It Takes Two Steph Curry and K...  
16    What Really Happened with Vista I generally ha...  
17     The Falling Apple, Revisited Some thoughts on...  
18    Whose Pain Counts? Many years ago, I had sex w...  
19    Upgrading from Node 6 to Node 8: a real-world ...  
20     Storytelling Should Be the Number One Skill Y...  
21    https://unsplash.com/?photo=3Z70SDuYs5g Willpo...  
22    If You Want to Be Creative, Don’t Be Data Driv...  
23    https://unsplash.com/?photo=wxxAx26SXys Live L...  
24    Melania Trump isn’t a black and white issue  W...  
25    Illustration by Ana Vasquez A Day at Google, L...  
26     10 incredible Quotes to guide your life. You ...  
27    Advice For Millennials Entering Their First Jo...  
28    How to Go From Hobbyist to Professional Develo...  
29    Storyframes before wireframes: starting design...  
...                                                 ...  
3171   Startup Advising Is Broken — Here’s What We’r...  
3172  Introducing Letters At Medium, we are building...  
3173   HODINKEE + Watchville join forces North Techn...  
3174  The problem: it’s really tedious and stressful...  
3175   VSCO Suite for Mac OS X Concept to bring simp...  
3176  Re-imagining Twitter How Chris Sacca’s 8,500 w...  
3177  What’s killing our girls My eldest daughter is...  
3178  The Internet of Tweets Will.I.Am visiting Twit...  
3179  Youth Unemployment and Dr. King’s Dream (U.S. ...  
3180  Introducing Spoken.co (You can also listen to ...  
3181  The 9 Habits of Highly Effective Restaurants W...  
3182   Apple Eating Beats Revisited Remember when ev...  
3183   How to make peanut butter toast There are ste...  
3184   The Power of the Screenshot If you analyzed m...  
3185  No this is not my mug shot, this is not me. Ph...  
3186  The Time When Lady Gaga Told YouTube to Keep i...  
3187  27 iOS open source libraries to skyrocket your...  
3188  via Educate Empower Kids The Naked People In Y...  
3189   My Mom Has a Question My parents and I just g...  
3190  Walk then talk My guiding philosophy on develo...  
3191  Traveling in Japan A quick collection of notes...  
3192  “O Nerd Padrão é Imbecil e Preconceituoso” Ess...  
3193   Surprise! Our Attention Spans Aren’t Dead! Ho...  
3194  A Room With a View — and Baby Too Making Space...  
3195  How I May Have Saved the Life of the Person in...  
3196  Culture Is More Than a Ping Pong Table Organiz...  
3197  Bernie Sanders interviewed by Brianna Keilar. ...  
3198  Papas, please let your babies grow up to be pr...  
3199  This is not a new strategy. In April 2012, I w...  
3200  What If Don Draper Was a Woman? Jessica Knoll,...  

[3201 rows x 5 columns]

LOAD LDA vectors for articles


In [10]:
ldamodel = pickle.load(open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/lda_10topic20pass','rb'))
print(ldamodel.print_topics( num_topics=10, num_words=5))

all_lda_vecs = pickle.load(open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/all_lda_vecs','rb'))
print(all_lda_vecs[0])

commonwords_2 = pickle.load(open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/commonwords2','rb'))
# wordlist = pickle.load(open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/wordlist','rb')

dictionary = pickle.load(open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/lda_dictionary_new2','rb'))
print(dictionary)


[(0, '0.011*"compani" + 0.010*"product" + 0.008*"busi" + 0.007*"build" + 0.007*"team"'), (1, '0.030*"design" + 0.010*"user" + 0.007*"code" + 0.006*"web" + 0.006*"color"'), (2, '0.013*"write" + 0.013*"read" + 0.008*"learn" + 0.008*"love" + 0.007*"life"'), (3, '0.034*"via" + 0.028*"music" + 0.025*"game" + 0.020*"univ" + 0.019*"data"'), (4, '0.007*"us" + 0.006*"learn" + 0.006*"world" + 0.005*"system" + 0.005*"human"'), (5, '0.153*"de" + 0.103*"e" + 0.044*"um" + 0.040*"da" + 0.039*"para"'), (6, '0.055*"white" + 0.033*"black" + 0.016*"photo" + 0.016*"hou" + 0.016*"presid"'), (7, '0.016*"life" + 0.008*"success" + 0.007*"feel" + 0.007*"becom" + 0.006*"learn"'), (8, '0.009*"food" + 0.007*"home" + 0.006*"eat" + 0.006*"hou" + 0.006*"live"'), (9, '0.007*"trump" + 0.007*"us" + 0.007*"said" + 0.005*"men" + 0.005*"never"')]
[0.10107509328751951, 0, 0.10852728610649451, 0, 0.075517989451679202, 0, 0, 0.67870291423836493, 0, 0.022212525354884893]
Dictionary(17433 unique tokens: ['give', 'success', 'wast', 'five', 'spend']...)

Convert LDA topic tuple output to a vector


In [8]:
def lda_to_vec(lda_input):
    # lda_input is gensim's sparse output: a list of (topic_id, probability) tuples
    num_topics = 10
    vec = [0]*num_topics
    for col, val in lda_input:
        vec[col] = val
    return vec

Clean text for LDA


In [9]:
def clean_text(sent):
    # remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    txt2 = re.sub(u'\u2014','',sent) # remove em dashes
    txt3 = re.sub(r'\d+', '', txt2) # remove digits
    txt4 = txt3.translate(translator) # remove punctuation
    # split text into words
    tokens = word_tokenizer.tokenize(txt4.lower())
    # strip single and double quotes from ends of words
    tokens_strip = [i.strip('”“’‘') for i in tokens]
    # keep only english words
    tokens_en = [i for i in tokens_strip if i in en_words]
    # remove nltk/stop_word stop words
    nostop_tokens = [i for i in tokens_en if not i in all_stopw]
    # strip single and double quotes from ends of words
    nostop_strip = [i.strip('”“’‘') for i in nostop_tokens]
    # stem words
    stemmed = [p_stemmer.stem(i) for i in nostop_strip]
    # strip single and double quotes from ends of words
    stemmed_strip = [i.strip('”“’‘') for i in stemmed]
    # stem words
    stemmed2 = [p_stemmer.stem(i) for i in stemmed_strip]
    # strip single and double quotes from ends of words
    stemmed2_strip = [i.strip('”“’‘') for i in stemmed2]
    # remove common words post-stemming
    stemmed_nocommon = [i for i in stemmed2_strip if not i in commonwords_2]
    return stemmed_nocommon
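
As a quick sanity check of the pieces defined so far, the sketch below chains them for one sentence, mirroring what the feature-calculation loop further down does; the example sentence and article index are arbitrary.


In [ ]:
# Illustrative only: compute the LDA-similarity feature for a single sentence.
example_sent = dataset['highlights'][0]          # any sentence works here
sent_clean = clean_text(example_sent)            # tokenize, remove stop words, stem
sent_bow = dictionary.doc2bow(sent_clean)        # gensim bag-of-words
sent_lda = ldamodel[sent_bow]                    # sparse (topic_id, probability) tuples
sent_vec = lda_to_vec(sent_lda)                  # dense 10-topic vector
# cosine similarity to the whole article's topic vector
# (NaN if the cleaned sentence has no in-vocabulary words)
sim = 1 - spatial.distance.cosine(sent_vec, all_lda_vecs[0])
print(sim)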

test if all highlights are in main texts


In [ ]:
# for h in dataset['highlights']:
    

#     # sentence is tokenized from highlight or full text
    
#     # break text into sentences and get total sents in full text
#     full_sents = sent_tokenizer.tokenize(text)
#     num_sents = len(full_sents)

#     # break text into words and get total words in full text
#     full_words = word_tokenizer.tokenize(text)
#     num_words = len(full_words)

#     try:
#         pos = text.index(sentence)

#         # total words in full text before highlight position
#         b4_words = word_tokenizer.tokenize(text[:pos])
#         b4_wlen = len(b4_words)

#         # sentences in full text before highlight position
#         b4_sents = sent_tokenizer.tokenize(text[:pos])
#         b4_slen = len(b4_sents)

#         frc_w = b4_wlen / num_words
#         frc_s = b4_slen / num_sents
#     except ValueError:
#         print('\nsentence not in text!\n')
    
#     return frc_w, frc_s

Function to calculate the position of a sentence within an article (fraction of words and of sentences into the text)


In [29]:
def sent_pos(sentence, text, idval):

    # sentence is tokenized from highlight or full text

    # remove 1-word sentences?

    # break text into sentences and get total sents in full text
    full_sents = sent_tokenizer.tokenize(text)
    num_sents = len(full_sents)

    # break text into words and get total words in full text
    full_words = word_tokenizer.tokenize(text)
    num_words = len(full_words)

    pos = text.find(sentence)

    if pos >= 0:

        # total words in full text before highlight position
        b4_words = word_tokenizer.tokenize(text[:pos])
        b4_wlen = len(b4_words)

        # sentences in full text before highlight position
        b4_sents = sent_tokenizer.tokenize(text[:pos])
        b4_slen = len(b4_sents)

        frc_w = b4_wlen / num_words
        frc_s = b4_slen / num_sents

    else:
        # sentence not found in text: report it and flag with -1
        print(str(idval) + ' ' + str(sentence))
        frc_w = -1
        frc_s = -1

    return frc_w, frc_s
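
A quick illustrative call of sent_pos on the first article, checking how far into the text its highlight starts (as a fraction of words and of sentences); this is only a sanity check, not part of the feature pipeline.


In [ ]:
# Returns (-1, -1) and prints the sentence if it cannot be found in the text.
frc_w, frc_s = sent_pos(dataset['highlights'][0], dataset['textjoined'][0], dataset['ids'][0])
print(frc_w, frc_s)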

Calculate values for logistic regression features

For each id (corresponding to one highlight and one full text): tokenize the highlight into sentences, and tokenize the full text without highlights into non-highlighted sentences.

For each sentence in the highlight and in the full text, calculate: sentence length, readability (various metrics), and the LDA vector of the sentence -> cosine similarity to the LDA vector of the article. Put in arrays: id, sentence length, readability metrics, and LDA similarity.

NOTE: I calculated a few features at a time by commenting out the appropriate lines below, which is why there are separate pickle files for different sets of feature calculations. This is not critical, because all features are combined into one dataframe downstream anyway.


In [72]:
tic = timeit.default_timer()

# print(set_tr.head(10))
n = 0

articleids = []
hlorno = []

all_ARI = []
all_FRE = []
all_FKG = []
all_GFI = []
all_SMG = []
all_CLI = []
all_LIX = []
all_RIX = []
alllens = []
all_ldadists = []
all_wposes = []
all_sposes = []

h_ARI = []
h_FRE = []
h_FKG = []
h_GFI = []
h_SMG = []
h_CLI = []
h_LIX = []
h_RIX = []
hllens = []
h_ldadists = []
h_wposes = []
h_sposes = []

f_ARI = []
f_FRE = []
f_FKG = []
f_GFI = []
f_SMG = []
f_CLI = []
f_LIX = []
f_RIX = []
ftlens = []
f_ldadists = []
f_wposes = []
f_sposes = []

for i, row in dataset.iterrows():

    idval = row.ids
    if i%10 == 0:
        print('analyzing row: '+str(i)+' ')
    # Get topic vector for the whole article
    lda_art = all_lda_vecs[i]
#     lda_art = np.asarray(all_lda_vecs[i])
#     print(lda_art)
    
    hlsents = sent_tokenizer.tokenize(row.highlights)
    for h in hlsents:
        # 
        # get LDA metric
        h_clean = clean_text(h)
        h_corpus = dictionary.doc2bow(h_clean)
        sent_lda = ldamodel[h_corpus]
        vec_lda = lda_to_vec(sent_lda)
        h_lda = 1-spatial.distance.cosine(vec_lda, lda_art)
        np_vec_lda = np.asarray(vec_lda)
        h_ldadists.append(h_lda)
        all_ldadists.append(h_lda)
        
        # get fraction position
        h_wpos, h_spos = sent_pos(h, row.textjoined, idval)
#         if n <= 5:
#             print('wordpos: '+str(h_wpos))
#             print('sentpos: '+str(h_spos))
        h_wposes.append( float(h_wpos) )
        h_sposes.append( float(h_spos) )
        all_wposes.append( float(h_wpos) )
        all_sposes.append( float(h_spos) )
        
        # get length
        hlwords = word_tokenizer.tokenize(h)
        hllen = len(hlwords)
        hllens.append(int(hllen))
        alllens.append(int(hllen))

        # get readability
        h_rd = Readability(h)
        h_ARI.append( float(h_rd.ARI()) )
        h_FRE.append( float(h_rd.FleschReadingEase()) )
        h_FKG.append( float(h_rd.FleschKincaidGradeLevel()) )
        h_GFI.append( float(h_rd.GunningFogIndex()) )
        h_SMG.append( float(h_rd.SMOGIndex()) )
        h_CLI.append( float(h_rd.ColemanLiauIndex()) )
        h_LIX.append( float(h_rd.LIX()) )
        h_RIX.append( float(h_rd.RIX()) )
        all_ARI.append( float(h_rd.ARI()) )
        all_FRE.append( float(h_rd.FleschReadingEase()) )
        all_FKG.append( float(h_rd.FleschKincaidGradeLevel()) )
        all_GFI.append( float(h_rd.GunningFogIndex()) )
        all_SMG.append( float(h_rd.SMOGIndex()) )
        all_CLI.append( float(h_rd.ColemanLiauIndex()) )
        all_LIX.append( float(h_rd.LIX()) )
        all_RIX.append( float(h_rd.RIX()) )

        # get label and id
        articleids.append(int(i))
        hlorno.append(1)

    # count lengths of non-highlighted sentences
    ftsents = sent_tokenizer.tokenize(row.textwohighlight)
    for f in ftsents:
        # get LDA metric
        f_clean = clean_text(f)
        f_corpus = dictionary.doc2bow(f_clean)
        sent_lda = ldamodel[f_corpus]
        vec_lda = lda_to_vec(sent_lda)
        f_lda = 1-spatial.distance.cosine(vec_lda, lda_art)
        np_vec_lda = np.asarray(vec_lda)
        f_ldadists.append(f_lda)
        all_ldadists.append(f_lda)

        # get fraction position
        f_wpos, f_spos = sent_pos(f[:-2], row.textjoined, idval)
#         if n <= 5:
#             print('wordpos: '+str(f_wpos))
#             print('sentpos: '+str(f_spos))
        f_wposes.append( float(f_wpos) )
        f_sposes.append( float(f_spos) )
        all_wposes.append( float(f_wpos) )
        all_sposes.append( float(f_spos) )
        
        # get length
        ftwords = word_tokenizer.tokenize(f)
        ftlen = len(ftwords)
        ftlens.append(int(ftlen))
        alllens.append(int(ftlen))

        # get readability
        f_rd = Readability(f)
        f_ARI.append( float(f_rd.ARI()) )
        f_FRE.append( float(f_rd.FleschReadingEase()) )
        f_FKG.append( float(f_rd.FleschKincaidGradeLevel()) )
        f_GFI.append( float(f_rd.GunningFogIndex()) )
        f_SMG.append( float(f_rd.SMOGIndex()) )
        f_CLI.append( float(f_rd.ColemanLiauIndex()) )
        f_LIX.append( float(f_rd.LIX()) )
        f_RIX.append( float(f_rd.RIX()) )
        all_ARI.append( float(f_rd.ARI()) )
        all_FRE.append( float(f_rd.FleschReadingEase()) )
        all_FKG.append( float(f_rd.FleschKincaidGradeLevel()) )
        all_GFI.append( float(f_rd.GunningFogIndex()) )
        all_SMG.append( float(f_rd.SMOGIndex()) )
        all_CLI.append( float(f_rd.ColemanLiauIndex()) )
        all_LIX.append( float(f_rd.LIX()) )
        all_RIX.append( float(f_rd.RIX()) )

        # get label and id
        articleids.append(int(i))
        hlorno.append(0)
    
    n += 1
#     if n == 5:
#         break

# print(len(articleids))
# print(len(hlorno))
# print(len(alllens))
# print(len(all_rds))
# print(len(all_ldadists))
# print(len(hllens))
# print(len(ftlens))
# print(len(h_rds))
# print(len(f_rds))
# print(len(h_ldadists))
# print(len(f_ldadists))

toc = timeit.default_timer()
print(str(toc - tic) + ' seconds elapsed')


analyzing row: 0 
analyzing row: 10 
analyzing row: 20 
36 Or, as Bertrand Russell — who lived through the invention of electric lights, radio, and television — put it: “.” The Google office, crammed as it was with games and toys and dessert bars for grown-ups, was an unlikely alternativ
analyzing row: 30 
analyzing row: 40 
analyzing row: 50 
analyzing row: 60 
analyzing row: 70 
107 I spent several long weekends that winter at my parents’ house on Long Island, dragging garbage cans to a dumpster parked in the yard — ., and my mother required help distinguishing the one from the othe
analyzing row: 80 
analyzing row: 90 
124 This is because .. You can read more on this in Harry Robert’s article, Single-direction margin declaration
analyzing row: 100 
144 Once you have those things ., designing the product becomes a lot easie
analyzing row: 110 
164 .. Bottom line: you want to keep goin
analyzing row: 120 
analyzing row: 130 
analyzing row: 140 
212 But enough preamble, here’s how this historic resignation is going to unfold: JANUARY 20th–MARCH 31st: .. His first 60 days in office have been a nightmar
analyzing row: 150 
analyzing row: 160 
analyzing row: 170 
analyzing row: 180 
analyzing row: 190 
analyzing row: 200 
analyzing row: 210 
analyzing row: 220 
315 We found that .. To gain access to Waymo’s design server, Mr. Levandowski searched for and installed specialized software onto his company-issued lapto
analyzing row: 230 
analyzing row: 240 
analyzing row: 250 
analyzing row: 260 
analyzing row: 270 
analyzing row: 280 
analyzing row: 290 
407  How I Gained 103,886 Email Subscribers With Just One Article “.” — H. Hackson Brown, J
analyzing row: 300 
analyzing row: 310 
analyzing row: 320 
analyzing row: 330 
analyzing row: 340 
analyzing row: 350 
analyzing row: 360 
analyzing row: 370 
analyzing row: 380 
analyzing row: 390 
535 After a quick introduction, he asked me, .How would you reverse a linked-list?” I paused in an attempt to contain my nervousness, before giving him my vague and clearly fabricated answe
535 .Amateurs sit and wait for inspiration, the rest of us just get up and go to work.” ― Stephen King On these days I would usually force myself to code for at least 30 minute
535 An entertaining .zombie shooter” game I built with a few friends A week later they sent me a coding challeng
535 I cold-emailed a few of my .dream” companies and let them know that I had just received an offer, but that I would rather work for the
analyzing row: 400 
analyzing row: 410 
analyzing row: 420 
analyzing row: 430 
analyzing row: 440 
601 .. We’re approaching a time where software will drive the vehicle that transports your family to soccer practic
analyzing row: 450 
analyzing row: 460 
analyzing row: 470 
analyzing row: 480 
644 Here are the two rules I use for every 1:1, many times a week:  ., among your team desks or in the caf
analyzing row: 490 
analyzing row: 500 
analyzing row: 510 
680 .. A network of small, electric aircraft that take off and land vertically (called VTOL aircraft for Vertical Take-off and Landing, and pronounced vee-tol), will enable rapid, reliable transportation between suburbs and cities and, ultimately, within citie
analyzing row: 520 
analyzing row: 530 
analyzing row: 540 
analyzing row: 550 
analyzing row: 560 
analyzing row: 570 
analyzing row: 580 
analyzing row: 590 
analyzing row: 600 
808 The 100% correct way .. (That’s a bold full-stop for effect
analyzing row: 610 
analyzing row: 620 
analyzing row: 630 
analyzing row: 640 
analyzing row: 650 
866 While it’s harder to get your startup off the ground without much of a budget, we’re big believers in constraints; .. Below, you’ll find 28 tools to help you bring your idea to life—everything from products that will help you build and design an app on the cheap, to plug-ins and services that will make it easier to market and promote your ide
analyzing row: 660 
analyzing row: 670 
analyzing row: 680 
909 Take ownership of your financial future — .. Don’t make the same mistake as countless others before yo
analyzing row: 690 
917 “.,” she say
analyzing row: 700 
931 ., but also because by that point in the movie he’d already killed dozens and dozens of peopl
analyzing row: 710 
948 .. Click one button and you’ll be given the option to take a screenshot of an entire webpage or a specific portion of i
analyzing row: 720 
analyzing row: 730 
971 .. It’s the difference between happiness and total pani
analyzing row: 740 
984 One of our favorite features of this tool is the branding ., allowing you to stick to specific styles for design cohesion no matter what you need to creat
analyzing row: 750 
analyzing row: 760 
analyzing row: 770 
analyzing row: 780 
analyzing row: 790 
analyzing row: 800 
analyzing row: 810 
1065 But here is the thing — .. Much wors
analyzing row: 820 
analyzing row: 830 
analyzing row: 840 
analyzing row: 850 
analyzing row: 860 
analyzing row: 870 
analyzing row: 880 
1154 .. That close bond with our most loyal readers is the heart of our journalistic and business strateg
analyzing row: 890 
1174 “You may have more experience or muscle in working together and supporting folks through a challenging time (sporting hobbies for example) and it can be about the team win vs. individual success.” “.” — Dawn Sharifan, Slack’s Director of People Operations But there can be drawbacks to putting too much stock into asking candidates about outside interests during the interview proces
analyzing row: 900 
analyzing row: 910 
analyzing row: 920 
analyzing row: 930 
analyzing row: 940 
1228 This is an early thesis: .. On other popular social platforms, like Snapchat and Facebook, conversations are usually one-on-one and very linea
analyzing row: 950 
analyzing row: 960 
1260 Now, .. A fresh breeze, some complimentary bottled water, an apple, pear, some grapes, a banana and a kiwi, and a little bag of extras on the way ou
analyzing row: 970 
analyzing row: 980 
analyzing row: 990 
1294 Remember the headlights at night analogy.. (Turns out E.L. Doctorow has some great quotes on writing
analyzing row: 1000 
1300 .. Because it doesn’t matter how good your site or app is if users can’t find their way around i
[... verbose progress output truncated: "analyzing row: N" printed every 10 rows, interleaved with snippets of the sentences being analyzed ...]
2746.6176740369992 seconds elapsed

In [73]:
# # time required for position + readability analysis:
# # 2985.0007762390014 seconds elapsed
# # 2746.6176740369992 seconds elapsed for just position

# # count number of sentences excluded from position analysis
# print(h_wposes.count(-1))
# print(h_sposes.count(-1))
# print(f_wposes.count(-1))
# print(f_sposes.count(-1))
# print(all_wposes.count(-1))
# print(all_sposes.count(-1))


0
0
143
143
143
143

In [75]:
# time required for length + LDA analysis:
# 398.4295387339953 seconds elapsed


281331

In [54]:
plt.pie([276120,5211], explode=(0,0.2), labels=['non-highlighted (276120)','highlighted (5211)'], shadow=False, startangle=90)
plt.axis('equal')
plt.show()
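
The pie chart shows the class imbalance in the assembled sentence dataset. As a quick check (not part of the original pipeline), the highlighted fraction can be computed directly from the counts used above:


In [ ]:
# Highlighted sentences make up roughly 1.9% of the 281,331 sentences in the dataset.
print(5211 / (276120 + 5211))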



In [76]:
analyzed_data_h = pd.DataFrame({ \
'h_wposes':h_wposes, \
'h_sposes':h_sposes, \
'h_ARI':h_ARI, \
'h_FRE':h_FRE, \
'h_FKG':h_FKG, \
'h_GFI':h_GFI, \
'h_SMG':h_SMG, \
'h_CLI':h_CLI, \
'h_LIX':h_LIX, \
'h_RIX':h_RIX, \
})

analyzed_data_all = pd.DataFrame({ \
'all_wposes':all_wposes, \
'all_sposes':all_sposes, \
'all_ARI':all_ARI, \
'all_FRE':all_FRE, \
'all_FKG':all_FKG, \
'all_GFI':all_GFI, \
'all_SMG':all_SMG, \
'all_CLI':all_CLI, \
'all_LIX':all_LIX, \
'all_RIX':all_RIX, \
})

analyzed_data_f = pd.DataFrame({ \
'f_wposes':f_wposes, \
'f_sposes':f_sposes, \
'f_ARI':f_ARI, \
'f_FRE':f_FRE, \
'f_FKG':f_FKG, \
'f_GFI':f_GFI, \
'f_SMG':f_SMG, \
'f_CLI':f_CLI, \
'f_LIX':f_LIX, \
'f_RIX':f_RIX, \
})

analyzed_data_h.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/analyzed_data_h')
analyzed_data_f.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/analyzed_data_f')
analyzed_data_all.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/analyzed_data_all')

Save article ids and highlight-or-not label after analyzing for FRE and position (without length and LDA)


In [52]:
# articleids_w_FRE_pos_list = articleids
# hlorno_w_FRE_pos_list = hlorno

# articleids_w_FRE_pos = pd.DataFrame({'articleids':articleids_w_FRE_pos_list})
# hlorno_w_FRE_pos = pd.DataFrame({'hlorno':hlorno_w_FRE_pos_list})
# articleids_w_FRE_pos.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/articleids_w_FRE_pos')
# hlorno_w_FRE_pos.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/hlorno_w_FRE_pos')

Save article ids and highlight-or-not label after analyzing for length and LDA


In [ ]:
# articleids_w_len_lda_list = articleids
# hlorno_w_len_lda_list = hlorno

# articleids_w_len_lda = pd.DataFrame({'articleids':articleids_w_len_lda_list})
# hlorno_w_len_lda = pd.DataFrame({'hlorno':hlorno_w_len_lda_list})
# articleids_w_len_lda.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/articleids_w_len_lda')
# hlorno_w_len_lda.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/hlorno_w_len_lda')

In [58]:
# print(articleids_w_len_lda_list == articleids_w_FRE_pos_list)
# print(hlorno_w_FRE_pos_list == hlorno_w_len_lda_list)


True
True

In [80]:
from statistics import mode

# print(max(hllens))
# print(scipy.stats.mode(hllens))
# print(max(ftlens))
# print(scipy.stats.mode(ftlens))

# print(max(h_rds))
# print(scipy.stats.mode(h_rds))
# print(max(f_rds))
# print(scipy.stats.mode(f_rds))


plt.hist(hllens, bins=50, range=(-10,140))
plt.title("All unique highlighted sentences")
plt.xlabel("Number of words")
plt.ylabel("Frequency")
plt.show()

plt.hist(ftlens, bins=50, range=(-10,140))
plt.title("All unique non-highlighted sentences")
plt.xlabel("Number of words")
plt.ylabel("Frequency")
plt.show()

# print(len(hllens))
# print(len(ftlens))
# print(len(alllens))



# plt.hist(h_ldadists, bins=50)#, normed=1)
# plt.title("All unique highlighted sentences")
# plt.xlabel("LDA distance")
# plt.ylabel("Frequency")
# plt.show()

# plt.hist(f_ldadists, bins=50)#, normed=1)
# plt.title("All unique non-highlighted sentences")
# plt.xlabel("LDA distance")
# plt.ylabel("Frequency")
# plt.show()

# print(len(h_ldadists))
# print(len(f_ldadists))
# print(len(all_ldadists))


# plt.hist(h_wposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique highlighted sentences")
# plt.xlabel("Fraction words into full text")
# plt.ylabel("Frequency")
# plt.show()

# plt.hist(h_sposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique highlighted sentences")
# plt.xlabel("Fraction sentences into full text")
# plt.ylabel("Frequency")
# plt.show()

# plt.hist(f_wposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique non-highlighted sentences")
# plt.xlabel("Fraction words into full text")
# plt.ylabel("Frequency")
# plt.show()

# plt.hist(f_sposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique non-highlighted sentences")
# plt.xlabel("Fraction sentences into full text")
# plt.ylabel("Frequency")
# plt.show()

# print(len(h_wposes))
# print(len(h_sposes))
# print(len(f_wposes))
# print(len(f_sposes))
# print(len(all_wposes))

# plt.hist(h_sposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique highlighted sentences")
# plt.xlabel("Fraction sentences into full text")
# plt.ylabel("Frequency")
# plt.show()
# plt.hist(f_sposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique non-highlighted sentences")
# plt.xlabel("Fraction sentences into full text")
# plt.ylabel("Frequency")
# plt.show()
# plt.hist(all_sposes, bins=25, range=(0,1), normed=1)
# plt.title("All unique sentences")
# plt.xlabel("Fraction sentences into full text")
# plt.ylabel("Frequency")
# plt.show()


Put features into a pandas dataframe to submit to the logistic regression


In [81]:
dataset_submit = pd.DataFrame({ \
'highlightornot':hlorno, \
'length':alllens, \
'LDAdist':all_ldadists, \
'wordPos':all_wposes, \
'sentPos':all_sposes, \
'ARI':all_ARI, \
'FRE':all_FRE, \
'FKG':all_FKG, \
'GFI':all_GFI, \
'SMG':all_SMG, \
'CLI':all_CLI, \
'LIX':all_LIX, \
'RIX':all_RIX, \
                             })
dataset_submit.to_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dataset_submit_len_lda_wpos_wpos_readmets')

dataset_submit = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dataset_submit_len_lda_wpos_wpos_readmets')

print(dataset_submit)


              ARI      CLI      FKG       FRE      GFI   LDAdist        LIX  \
0        8.620000  11.3423   7.6338   63.4862  11.3538  0.978843  28.384615   
1       14.007500  18.5125   7.7733   60.7050   8.1333  0.981704  28.666667   
2        7.295294   7.7276   6.3106   80.0976   9.1529  0.185355  40.529412   
3       -0.087273   1.2845   1.5727  103.3791   4.4000  0.980338  11.000000   
4        4.410000   5.6171   4.1986   89.8964   8.4571  0.978843  21.142857   
5        0.340000   2.4633  -4.2133  141.3000   8.0444  0.088626  20.111111   
6        3.338182   5.5682   2.6455   95.6882   4.4000  0.978844  20.090909   
7       16.609767   4.1858  14.3521   68.7528  19.0605  0.155225  52.302326   
8       -2.660000  -4.2033  -6.5533  147.3900   1.2000  0.445979   3.000000   
9        4.678947   3.3911   7.3463   76.2342  11.8105  0.172752  24.263158   
10       8.397368   8.0411   9.8305   58.4237  11.8105  0.976764  34.789474   
11      -0.090000   1.7600   2.8800   83.3200  10.0000  0.212839   5.000000   
12       1.410000   4.0100   3.7550   82.3900   8.2000  0.981702  33.000000   
13     -16.220000 -39.9100 -15.2000  205.8200   0.4000  0.445979   1.000000   
14       9.234706  10.1529   9.0871   60.1918  13.8588  0.202789  46.411765   
15      11.859231   8.4185  11.3423   60.0527  16.5538  0.164257  52.923077   
16       0.270000   1.3333  -1.0767  124.1550   4.8000  0.194811  12.000000   
17       5.625000   5.1117   5.8522   85.1650   9.4222  0.174978  29.111111   
18      10.475200   7.2668   7.8480   83.3240  14.8000  0.179529  37.000000   
19       7.849412   8.4206   7.6988   70.1447  11.5059  0.169103  28.764706   
20       5.998750   6.6212   6.1375   79.5575  11.4000  0.895860  28.500000   
21      10.082500  13.6042   8.7567   53.6550  11.4667  0.169100  45.333333   
22       6.414211   5.5611   5.4832   89.5921   7.6000  0.977669  29.526316   
23      18.068125  21.7144  12.7750   31.9700  18.9000  0.139409  59.750000   
24      11.591875  13.6156  13.5125   26.6825  16.4000  0.174981  34.750000   
25      12.859474  13.6211  11.6937   45.0658  16.0211  0.785448  45.315789   
26     -16.220000 -39.9100 -15.2000  205.8200   0.4000  0.445979   1.000000   
27      20.512500  13.3075  18.1167   29.2950  23.2889  0.970737  72.111111   
28      -0.087273   1.2845   1.5727  103.3791   4.4000  0.978846  20.090909   
29      -3.858000  -2.9520   0.5200  100.2400   2.0000  0.445979   5.000000   
...           ...      ...      ...       ...      ...       ...        ...   
281301   6.649000  10.0610   6.0100   69.7850   8.0000  0.124723  30.000000   
281302  29.463158  11.6770  24.4435   21.3379  25.6070  0.982657  81.561404   
281303  25.331887   8.9721  22.0008   31.7268  24.2189  0.876314  75.641509   
281304  21.059412  15.1929  20.2288   10.5897  24.1882  0.885722  69.294118   
281305  20.819565   7.6200  17.4848   51.6363  21.0087  0.962248  59.043478   
281306  16.880800  15.2772  14.9280   32.5640  19.6000  0.977757  61.000000   
281307  14.833448  10.3814  12.4028   57.7931  15.7379  0.665232  49.689655   
281308  20.291538  11.2195  17.7738   37.0962  20.7282  0.169233  64.641026   
281309   9.900000  11.5033   9.1800   34.5900  14.5333  0.732380  36.333333   
281310  18.576400  17.3976  15.4000   29.1800  14.8000  0.976069  57.000000   
281311  33.621364  11.3214  27.3136   16.7905  30.0364  0.689548  85.696970   
281312   3.910000   5.4523   7.6338   63.4862  11.3538  0.195252  28.384615   
281313   7.652500  10.5675   6.6200   54.7250  11.6000  0.111985  54.000000   
281314   3.400714   4.3550   2.5129  101.9821   5.6000  0.208495  14.000000   
281315  26.580192  11.1475  23.0708   22.2742  25.4154  0.967257  75.076923   
281316   6.763636   9.8518   9.0818   49.5427  11.6727  0.302241  38.272727   
281317  14.195455   7.2079  14.4436   50.2855  19.2606  0.044852  45.121212   
281318   9.504231   5.4735   9.0731   76.3219  13.4769  0.188102  33.692308   
281319   1.765000   3.2050   6.6200   54.7250  11.6000  0.973335  29.000000   
281320  10.335000  11.0017  10.4411   52.2650  13.8667  0.550717  34.666667   
281321  18.721538   9.2562  16.8662   43.6038  20.7282  0.093440  56.948718   
281322  13.875556  10.3574  10.2363   69.7633  13.7630  0.034595  41.814815   
281323   8.397368   8.0411   6.7253   80.6868  11.8105  0.964383  34.789474   
281324  10.195000   7.4917  11.9617   52.0500  16.2667  0.947481  36.500000   
281325  10.417000  14.7730  11.9100   27.4850  16.0000  0.986691  40.000000   
281326  -2.122500  -0.4075  -0.6700  114.1150   3.2000  0.272988   8.000000   
281327  20.322439  10.0451  15.9415   53.7956  20.3024  0.885810  60.512195   
281328  21.912128   8.3749  19.5613   38.5300  23.9064  0.915502  64.021277   
281329   0.000000   0.0000   0.0000    0.0000   0.0000  0.449594   0.000000   
281330   0.340909   1.8200   1.5727  103.3791   4.4000  0.284824  11.000000   

         RIX        SMG  highlightornot  length   sentPos   wordPos  
0        2.0  10.745967               1      13  0.069811  0.073383  
1        2.0   8.477226               0      12  0.000000  0.000000  
2        4.0   8.477226               0      16  0.001887  0.001610  
3        0.0   3.000000               0      11  0.003774  0.003756  
4        1.0   8.477226               0      14  0.005660  0.005232  
5        1.0   8.477226               0       8  0.007547  0.007110  
6        1.0   3.000000               0      11  0.009434  0.008184  
7        4.0  10.745967               0      43  0.011321  0.009659  
8        0.0   3.000000               0       3  0.013208  0.015428  
9        1.0  10.745967               0      19  0.015094  0.015830  
10       3.0  10.745967               0      19  0.016981  0.018379  
11       0.0   8.477226               0       5  0.018868  0.020928  
12       2.0   8.477226               0       8  0.020755  0.021599  
13       0.0   3.000000               0       1  0.000000  0.000000  
14       5.0  12.486833               0      16  0.024528  0.022807  
15       7.0  13.954451               0      25  0.026415  0.024953  
16       0.0   3.000000               0      11  0.028302  0.028307  
17       2.0   8.477226               0      18  0.030189  0.029783  
18       3.0  12.486833               0      24  0.032075  0.032197  
19       2.0  10.745967               0      17  0.033962  0.035417  
20       2.0  10.745967               0      16  0.035849  0.037698  
21       4.0  10.745967               0      12  0.037736  0.039844  
22       2.0   3.000000               0      19  0.039623  0.041454  
23       7.0  15.247449               0      15  0.041509  0.044003  
24       3.0  13.954451               0      16  0.043396  0.046016  
25       5.0  13.954451               0      19  0.045283  0.048162  
26       0.0   3.000000               0       1  0.000000  0.000000  
27      13.0  18.491933               0      34  0.049057  0.050845  
28       1.0   3.000000               0      10  0.050943  0.055406  
29       0.0   3.000000               0       5  0.052830  0.056748  
...      ...        ...             ...     ...       ...       ...  
281301   2.0   8.477226               0      10  0.523810  0.505956  
281302  14.0  13.954451               0      57  0.539683  0.512226  
281303  12.0  13.954451               0      52  0.555556  0.547962  
281304  12.0  19.431677               0      34  0.571429  0.580564  
281305   6.0  12.486833               0      45  0.587302  0.601881  
281306   9.0  16.416408               0      25  0.603175  0.630094  
281307   6.0  12.486833               0      29  0.619048  0.645768  
281308  10.0  15.247449               0      39  0.634921  0.663950  
281309   1.0   8.477226               0       3  0.650794  0.688401  
281310   8.0  12.486833               0      25  0.666667  0.690282  
281311  13.0  16.416408               0      64  0.682540  0.705956  
281312   2.0  10.745967               0      13  0.698413  0.746082  
281313   2.0   8.477226               0       4  0.714286  0.754232  
281314   0.0   3.000000               0      13  0.730159  0.756740  
281315  12.0  16.416408               0      52  0.746032  0.764890  
281316   3.0  10.745967               0      11  0.761905  0.797492  
281317   4.0  15.247449               0      33  0.777778  0.804389  
281318   2.0  10.745967               0      26  0.793651  0.825078  
281319   1.0   8.477226               0       4  0.809524  0.841379  
281320   3.0  12.486833               0      18  0.825397  0.843887  
281321   7.0  15.247449               0      39  0.841270  0.855172  
281322   4.0  10.745967               0      27  0.857143  0.879624  
281323   3.0  10.745967               0      19  0.873016  0.896552  
281324   3.0  13.954451               0      24  0.888889  0.908464  
281325   3.0  12.486833               0      10  0.904762  0.923511  
281326   0.0   3.000000               0       8  0.920635  0.929781  
281327   8.0  13.954451               0      41  0.936508  0.934796  
281328   8.0  16.416408               0      47  0.952381  0.960502  
281329   0.0   0.000000               0       1  0.000000  0.000000  
281330   0.0   3.000000               0      11  0.984127  0.993103  

[281331 rows x 13 columns]

Describe statistical model with patsy


In [3]:
dataset_submit = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/dataset_submit_len_lda_wpos_wpos_readmets')

# create dataframe
y, X = dmatrices('highlightornot ~ length + LDAdist + wordPos + sentPos + ARI + FRE + FKG + GFI + SMG + CLI + LIX + RIX', \
                 dataset_submit, return_type="dataframe")
# ytest, Xtest = dmatrices('length ~ length + FRE', set_tr_submit, return_type="dataframe")
# print( Xtest.columns)
# print(ytest)

# flatten y into a 1-D array
y = np.ravel(y)

Split into train and test sets


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [180]:
# pickle the train/test splits, closing each file cleanly
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/Xtrain_len_lda_wpos_wpos_readmets', 'wb') as file:
    pickle.dump(X_train, file)
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/Xtest_len_lda_wpos_wpos_readmets', 'wb') as file:
    pickle.dump(X_test, file)
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/ytrain_len_lda_wpos_wpos_readmets', 'wb') as file:
    pickle.dump(y_train, file)
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/ytest_len_lda_wpos_wpos_readmets', 'wb') as file:
    pickle.dump(y_test, file)

Include data preprocessing to scale all features! (i.e. calculate z-scores)


In [153]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(class_weight='balanced', penalty='l2'))

# ## fit
# pipe.fit(X, y)
# ## predict
# pipe.predict_proba(X)
# ## to get back mean/std
# scaler = pipe.steps[0][1]
# scaler.mean_
# # Out[12]: array([ 0.0313, -0.0334,  0.0145, ..., -0.0247,  0.0191,  0.0439])
# scaler.std_
# # Out[13]: array([ 1.    ,  1.0553,  0.9805, ...,  1.0033,  1.0097,  0.9884])
    
    
# pipe = LogisticRegression(class_weight='balanced', penalty='l2')
pipe.fit(X_train, y_train)
# print(X_train)
print(len(y_train))
print(len(y_test))
print(len(X_train))
print(len(X_test))


225064
56267
225064
56267
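
For reference, the StandardScaler step in the pipeline converts each feature column to z-scores before the logistic regression sees it, so the fitted coefficients are comparable across features. A minimal sketch of the transformation on hypothetical values:


In [ ]:
import numpy as np

# z-score one feature column (hypothetical sentence lengths):
# subtract the column mean, divide by the column standard deviation
x = np.array([13.0, 12.0, 16.0, 11.0, 14.0])
z = (x - x.mean()) / x.std()
print(z)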

Save/load model


In [3]:
# fpipe = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/model_len_lda_wpos_wpos_readmets','wb')
# pickle.dump(pipe, fpipe)

# load model
pipe = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/model_len_lda_wpos_wpos_readmets')
X_train = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/Xtrain_len_lda_wpos_wpos_readmets')
X_test = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/Xtest_len_lda_wpos_wpos_readmets')
y_train = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/ytrain_len_lda_wpos_wpos_readmets')
y_test = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/ytest_len_lda_wpos_wpos_readmets')

Evaluate logistic regression model


In [4]:
# check the accuracy on the training set
pipe.score(X_train, y_train)


Out[4]:
0.45945597696655172

In [5]:
# check the accuracy on the test set
pipe.score(X_test, y_test)


Out[5]:
0.45785273783923081

In [6]:
# predict class labels for the test set
predicted = pipe.predict(X_test)
print(predicted)
test = np.nonzero(predicted)
print(len(test[0]))
print(len(predicted))


[ 1.  0.  1. ...,  1.  0.  1.]
30734
56267

Examine feature weights (feature coefficients)


In [13]:
print(np.transpose(pipe.steps[1][1].coef_))
print(X_test.columns)
feature_names = X_test.columns[1:]  # feature names, skipping the patsy 'Intercept' column
print(feature_names)
# [[ 0.        ]
#  [-0.75450588]
#  [ 0.00378554]
#  [-0.14955534]
#  [ 0.17832368]
#  [-0.06008622]
#  [ 0.69844673]
#  [ 1.1656712 ]
#  [ 0.01445301]
#  [ 0.10138341]
#  [ 0.12576411]
#  [ 0.16656827]
#  [-0.75734814]]
# Index(['Intercept', 'length', 'LDAdist', 'wordPos', 'sentPos', 'ARI', 'FRE',
#        'FKG', 'GFI', 'SMG', 'CLI', 'LIX', 'RIX'],
#       dtype='object')


# WHEN ONLY USE LOGISTIC REGRESSION (NO Z-SCORE PREPROCESSING)
# print(np.transpose(pipe.coef_))
# [[-0.83983031]
#  [-0.0366619 ]
#  [ 0.01134218]
#  [-0.49050534]
#  [ 0.59117833]
#  [-0.00489886]
#  [ 0.01644136]
#  [ 0.11783003]
#  [ 0.00142764]
#  [ 0.0216104 ]
#  [ 0.0138162 ]
#  [ 0.00623342]
#  [-0.09777468]]
# Index(['Intercept', 'length', 'LDAdist', 'wordPos', 'sentPos', 'ARI', 'FRE',
#        'FKG', 'GFI', 'SMG', 'CLI', 'LIX', 'RIX'],
#       dtype='object')


[[ 0.        ]
 [-0.75450588]
 [ 0.00378554]
 [-0.14955534]
 [ 0.17832368]
 [-0.06008622]
 [ 0.69844673]
 [ 1.1656712 ]
 [ 0.01445301]
 [ 0.10138341]
 [ 0.12576411]
 [ 0.16656827]
 [-0.75734814]]
Index(['Intercept', 'length', 'LDAdist', 'wordPos', 'sentPos', 'ARI', 'FRE',
       'FKG', 'GFI', 'SMG', 'CLI', 'LIX', 'RIX'],
      dtype='object')

ROC curve and evaluation metrics


In [7]:
# generate class probabilities
probs = pipe.predict_proba(X_test)
print (probs)


[[ 0.48381601  0.51618399]
 [ 0.65055944  0.34944056]
 [ 0.48048208  0.51951792]
 ..., 
 [ 0.4651526   0.5348474 ]
 [ 0.6783202   0.3216798 ]
 [ 0.48697564  0.51302436]]

In [8]:
# generate evaluation metrics
print( metrics.accuracy_score(y_test, predicted))
print( metrics.roc_auc_score(y_test, probs[:, 1]))


0.457852737839
0.55262283331

In [17]:
# plot ROC curve for test set

from sklearn.metrics import roc_curve, auc

auc_score = metrics.roc_auc_score(y_test, probs[:, 1])
fpr, tpr, thres = metrics.roc_curve(y_test, probs[:,1])

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='navy',
         lw=lw, label='AUC = %0.2f' % auc_score)
plt.plot([0, 1], [0, 1], color='darkorange', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve, test set')
plt.legend(loc="lower right")
plt.show()



In [16]:
# Plot ROC curve for training set
probs_tr = pipe.predict_proba(X_train)
print (probs_tr)

auc_score_tr = metrics.roc_auc_score(y_train, probs_tr[:, 1])
fpr_tr, tpr_tr, thres_tr = metrics.roc_curve(y_train, probs_tr[:,1])

#Plot of a ROC curve for a specific class
plt.figure()
lw = 2
plt.plot(fpr_tr, tpr_tr, color='navy',
         lw=lw, label='AUC = %0.2f' % auc_score_tr)
plt.plot([0, 1], [0, 1], color='darkorange', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve, training set')
plt.legend(loc="lower right")
plt.show()


[[ 0.51904363  0.48095637]
 [ 0.44721335  0.55278665]
 [ 0.5820432   0.4179568 ]
 ..., 
 [ 0.46717142  0.53282858]
 [ 0.50611333  0.49388667]
 [ 0.50718894  0.49281106]]

In [7]:
print( metrics.confusion_matrix(y_test, predicted))
print( metrics.classification_report(y_test, predicted))
predicted_train = pipe.predict(X_train)
print( metrics.classification_report(y_train, predicted_train))


[[25117 30089]
 [  416   645]]
             precision    recall  f1-score   support

        0.0       0.98      0.45      0.62     55206
        1.0       0.02      0.61      0.04      1061

avg / total       0.97      0.46      0.61     56267

             precision    recall  f1-score   support

        0.0       0.98      0.46      0.62    220914
        1.0       0.02      0.63      0.04      4150

avg / total       0.97      0.46      0.61    225064
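
To connect the confusion matrix to the classification report, highlight recall and precision on the test set can be recomputed by hand (a quick check, not part of the original analysis):


In [ ]:
# Test-set confusion matrix from above: rows are true labels, columns are predictions.
tn, fp = 25117, 30089
fn, tp = 416, 645
print(tp / (tp + fn))   # highlight recall, ~0.61
print(tp / (tp + fp))   # highlight precision, ~0.02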

Evaluate the model using 10-fold cross-validation


In [161]:
# scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=10)
recall = cross_val_score(pipe, X, y, cv=10, scoring='recall')

print (recall)
print (recall.mean())
print (recall.std())

# [ 0.59003831  0.61612284  0.61804223  0.68714012  0.62955854  0.60076775
#   0.61420345  0.57005758  0.59884837  0.61036468]
# 0.61351438804
# 0.0292113885642

# WITH ONLY LOGISTIC REGRESSION (NO Z-SCORE PREPROCESSING)
# [ 0.59195402  0.61612284  0.61804223  0.68714012  0.62955854  0.60076775
#   0.61420345  0.57005758  0.59884837  0.61036468]
# 0.613705958921
# 0.0290627055203


[ 0.59003831  0.61612284  0.61804223  0.68714012  0.62955854  0.60076775
  0.61420345  0.57005758  0.59884837  0.61036468]
0.61351438804
0.0292113885642

10-fold cross-validation gives consistent results (0.61 ± 0.03 highlight recall)


In [162]:
plt.matshow(confusion_matrix(y_test, predicted), cmap=plt.cm.binary, interpolation='nearest')
plt.title('confusion matrix')
plt.colorbar()
plt.ylabel('expected label')
plt.xlabel('predicted label')
plt.show()



In [169]:
# normalized confusion matrix values
cm = metrics.confusion_matrix(y_test, predicted)
print(cm)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

print(cm)


[[25117 30089]
 [  416   645]]
[[ 0.45496866  0.54503134]
 [ 0.39208294  0.60791706]]

In [103]:
print(pipe)


Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [104]:
# predict class labels for the test set
predicted = pipe.predict(X_test)
print(predicted)

# plt.matshow(confusion_matrix(X_test, predicted), cmap=plt.cm.binary, interpolation='nearest')
# plt.title('confusion matrix')
# plt.colorbar()
# plt.ylabel('expected label')
# plt.xlabel('predicted label')
# plt.show()


[ 1.  0.  1. ...,  1.  0.  1.]

Inspect distribution of class probabilities


In [6]:
decfxn_tr = pipe.decision_function(X_train)
decfxn_ts = pipe.decision_function(X_test)
# print(decfxn_tr)
# print(decfxn_ts)
# print(min(decfxn_tr))
# print(min(decfxn_ts))
# print(max(decfxn_tr))
# print(max(decfxn_ts))

print(decfxn_tr.argmax())
print(decfxn_ts.argmax())


plt.hist(decfxn_tr, bins=50, range=(-1,1))#, normed=1)
plt.title("Training set")
plt.xlabel("Decision Fxn (confidence value)")
plt.ylabel("Frequency")
plt.show()

plt.hist(decfxn_ts, bins=50, range=(-1,1))#, normed=1)
plt.title("Test set")
plt.xlabel("Decision Fxn (confidence value)")
plt.ylabel("Frequency")
plt.show()


152825
37248
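
The decision function values plotted here are the model's logits; the class probabilities from predict_proba are simply their logistic (sigmoid) transform. A quick consistency check, assuming the fitted pipe, X_test, and decfxn_ts from the cells above:


In [ ]:
import numpy as np

# For binary logistic regression, P(highlight) = sigmoid(decision_function)
probs_from_logits = 1.0 / (1.0 + np.exp(-decfxn_ts))
print(np.allclose(probs_from_logits, pipe.predict_proba(X_test)[:, 1]))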

Construct validation plots -- ROC-type curve for decision function values (confidence scores)

Before computing the ROC curves shown above, I also made a cumulative distribution function (CDF) plot: I ranked every sentence in the dataset by its confidence score and plotted the cumulative highlight recovery going down the ranks. This gave a similar result to the ROC curves, since moving along the x-axis of the CDF plot is equivalent to lowering the confidence-score threshold.


In [221]:
# sort common dataset by cosine similarity score
# plot cumulative density of highlights relative to cos similarity score
# if curve above 1:1 diagonal, means highlight prediction better than chance

# print(dataset_submit.head(10))
# print(dataset_submit.sort_values('LDAdist').head(10))
# print(dataset_submit.sort_values('LDAdist').tail(10))
# print(dataset_submit.sort_values('LDAdist').head(2000))



from scipy.interpolate import interp1d
# 
decfxn_tr
print(len(np.nonzero(y_test)[0]))
print(np.nonzero(y_test)[0])

print(len(np.nonzero(y_train)[0]))
print(np.nonzero(y_train)[0])

cumul = pd.DataFrame({'confidencescore':decfxn_tr, 'highlightornot':y_train})
cumul_sort = cumul.sort_values('confidencescore', ascending=False)
# print(cumul_sort)
total = 0
array = []
for i in cumul_sort['highlightornot']:
    total = total + i
    array.append(total)
# print(min(decfxn_tr))
print(len(array))
print(total)
print(array[-1])

# rangex = np.arange(1,4150)
line = np.linspace(1, 4150, 225064)
print(line)

# plot CDF curve and chance line
plt.plot(range(1,225065), array, range(1,225065), line)
plt.show()

# plot CDF curve and chance line, normalized
plt.plot(range(1,225065), array/max(array), range(1,225065), line/max(line))
plt.show()


1061
[   71   102   110 ..., 56118 56203 56247]
4150
[    64    100    137 ..., 224889 224943 224996]
225064
4150.0
4150.0
[  1.00000000e+00   1.01843484e+00   1.03686968e+00 ...,   4.14996313e+03
   4.14998157e+03   4.15000000e+03]

In [ ]:

Define a function to draw CDF plots


In [14]:
def plot_cdf(decfxn, ytrain):
    cumul = pd.DataFrame({'confidencescore':decfxn, 'highlightornot':ytrain})
    cumul_sort = cumul.sort_values('confidencescore', ascending=False)
    # print(cumul_sort)
    total = 0
    array = []
    # cumulative count of highlights recovered going down the confidence ranking
    # (accumulate first, then append, matching the earlier cell)
    for i in cumul_sort['highlightornot']:
        total = total + i
        array.append(total)
    # print(min(decfxn_tr))
    print(len(array))
    print(total)
    print(array[-1])

    line = np.linspace(1, max(array), len(array))
#     print(line)

    # plot CDF curve and chance line
    plt.plot(range(1,len(array)+1), array, range(1,len(array)+1), line)
    plt.show()

    # plot CDF curve and chance line, normalized
    plt.plot(range(1,len(array)+1), array/max(array), range(1,len(array)+1), line/max(line))
    plt.show()
    
    normarray = array/max(array)
    
    print(sum(normarray)/(1*len(array)))

In [9]:
plot_cdf(decfxn_ts, y_test)


56267
1061.0
1061.0
0.551621663421

In [ ]:

Test undersampling from imblearn for balancing class weights


In [113]:
# print original shape of y from above:
print('Original dataset shape {}'.format(Counter(y)))
undersample = RandomUnderSampler()
X_undersmp, y_undersmp = undersample.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_undersmp)))


Original dataset shape Counter({0.0: 276120, 1.0: 5211})
Resampled dataset shape Counter({1.0: 5211, 0.0: 5211})

In [117]:
# Split into train and test sets
X_undersmp_train, X_undersmp_test, y_undersmp_train, y_undersmp_test = \
train_test_split(X_undersmp, y_undersmp, test_size=0.2, random_state=0)

print('Resampled dataset shape {}'.format(Counter(y_undersmp_train)))
print('Resampled dataset shape {}'.format(Counter(y_undersmp_test)))


Resampled dataset shape Counter({1.0: 4203, 0.0: 4134})
Resampled dataset shape Counter({0.0: 1077, 1.0: 1008})

In [119]:
# Include data preprocessing to scale all features! (i.e. calculate z-scores)
pipe_undersmp = make_pipeline(StandardScaler(), LogisticRegression(class_weight='balanced', penalty='l2'))

# ## fit
# pipe.fit(X, y)
# ## predict
# pipe.predict_proba(X)
# ## to get back mean/std
# scaler = pipe.steps[0][1]
# scaler.mean_
# # Out[12]: array([ 0.0313, -0.0334,  0.0145, ..., -0.0247,  0.0191,  0.0439])
# scaler.std_
# # Out[13]: array([ 1.    ,  1.0553,  0.9805, ...,  1.0033,  1.0097,  0.9884])
    
    
# model = LogisticRegression(class_weight='balanced', penalty='l2')
pipe_undersmp.fit(X_undersmp_train, y_undersmp_train)
# print(X_train)
print(len(y_undersmp_train))
print(len(y_undersmp_test))
print(len(X_undersmp_train))
print(len(X_undersmp_test))


8337
2085
8337
2085

In [121]:
# # save model
# fpipe_undersmp = open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/model_undersmp_len_lda_wpos_wpos_readmets','wb')
# pickle.dump(pipe_undersmp, fpipe_undersmp)

# load model
pipe_undersmp = pd.read_pickle('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/model_undersmp_len_lda_wpos_wpos_readmets')

# check the accuracy on the training set
print(pipe_undersmp.score(X_undersmp_train, y_undersmp_train))

# check the accuracy on the test set
print(pipe_undersmp.score(X_undersmp_test, y_undersmp_test))


0.548158810124
0.520383693046

In [124]:
# predict class labels for the test set
predicted_undersmp = pipe_undersmp.predict(X_undersmp_test)
print(predicted_undersmp)
test = np.nonzero(predicted_undersmp)
print(len(test[0]))
print(len(predicted_undersmp))

# examine the coefficients
print(np.transpose(pipe_undersmp.steps[1][1].coef_))
print(X_test.columns)

# generate class probabilities
probs_undersmp = pipe_undersmp.predict_proba(X_undersmp_test)
print (probs_undersmp)

# generate evaluation metrics
print( metrics.accuracy_score(y_undersmp_test, predicted_undersmp))
print( metrics.roc_auc_score(y_undersmp_test, probs_undersmp[:, 1]))

print( metrics.confusion_matrix(y_undersmp_test, predicted_undersmp))
print( metrics.classification_report(y_undersmp_test, predicted_undersmp))

# evaluate the model using 10-fold cross-validation
# scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=10)
recall_undersmp = cross_val_score(pipe_undersmp, X, y, cv=10, scoring='recall')

print (recall_undersmp)
print (recall_undersmp.mean())
print (recall_undersmp.std())


[ 1.  1.  1. ...,  0.  1.  0.]
1230
2085
[[ 0.        ]
 [-0.47851308]
 [-0.03087635]
 [ 0.0814741 ]
 [-0.03316393]
 [-0.01878615]
 [ 0.77610961]
 [ 1.01257391]
 [ 0.00578266]
 [ 0.12901553]
 [ 0.18632183]
 [ 0.07639077]
 [-0.37497635]]
Index(['Intercept', 'length', 'LDAdist', 'wordPos', 'sentPos', 'ARI', 'FRE',
       'FKG', 'GFI', 'SMG', 'CLI', 'LIX', 'RIX'],
      dtype='object')
[[ 0.4614495   0.5385505 ]
 [ 0.47152145  0.52847855]
 [ 0.4895365   0.5104635 ]
 ..., 
 [ 0.50380637  0.49619363]
 [ 0.47050047  0.52949953]
 [ 0.51401674  0.48598326]]
0.520383693046
0.540210350621
[[466 611]
 [389 619]]
             precision    recall  f1-score   support

        0.0       0.55      0.43      0.48      1077
        1.0       0.50      0.61      0.55      1008

avg / total       0.52      0.52      0.52      2085

[ 0.59003831  0.61612284  0.61804223  0.68714012  0.62955854  0.60076775
  0.61420345  0.57005758  0.59884837  0.61036468]
0.61351438804
0.0292113885642

In [127]:
# convert coefficients to probabilities
coeffs = np.transpose(pipe_undersmp.steps[1][1].coef_)
coeff_odds = []
coeff_probs = []
for i in coeffs:
    coeff_odds.append(math.exp(i))
    coeff_probs.append(math.exp(i)/(1+math.exp(i)))
    
print(coeff_odds)
print(coeff_probs)
print(X_test.columns)


[1.0, 0.6197041566997715, 0.9695954588907783, 1.0848851166868112, 0.9673799663533975, 0.981389206243711, 2.1730019841225814, 2.7526770590391414, 1.005799416639851, 1.137707793240735, 1.204809937231406, 1.0793842843468693, 0.687305531057032]
[0.5, 0.3826033008166441, 0.49228152639873957, 0.5203572647738275, 0.4917097779268677, 0.49530359969216464, 0.6848410416999703, 0.7335235661722391, 0.5014456621613655, 0.5322092181345262, 0.5464461661236404, 0.5190884111572007, 0.4073391086595096]
Index(['Intercept', 'length', 'LDAdist', 'wordPos', 'sentPos', 'ARI', 'FRE',
       'FKG', 'GFI', 'SMG', 'CLI', 'LIX', 'RIX'],
      dtype='object')

In [228]:
# normalized confusion matrix values
cm_undersmp = metrics.confusion_matrix(y_undersmp_test, predicted_undersmp)
print(cm_undersmp)
cm_undersmp = cm_undersmp.astype('float') / cm_undersmp.sum(axis=1)[:, np.newaxis]

print(cm_undersmp)


[[466 611]
 [389 619]]
[[ 0.43268338  0.56731662]
 [ 0.3859127   0.6140873 ]]

Generate CDF plots (see "Construct validation plots" section above)


In [226]:
decfxn_tr_undersmp = pipe_undersmp.decision_function(X_undersmp_train)
decfxn_ts_undersmp = pipe_undersmp.decision_function(X_undersmp_test)

plot_cdf(decfxn_tr_undersmp, y_undersmp_train)
plot_cdf(decfxn_ts_undersmp, y_undersmp_test)


8337
4203.0
4203.0
2085
1008.0
1008.0

In [ ]:

Test SMOTE (synthetic oversampling) from imblearn for balancing class weights


In [4]:
# print original shape of y from above:
print('Original dataset shape {}'.format(Counter(y)))
smote = SMOTE()
X_smote, y_smote = smote.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_smote)))


Original dataset shape Counter({0.0: 276120, 1.0: 5211})
Resampled dataset shape Counter({1.0: 276120, 0.0: 276120})

In [5]:
# Split into train and test sets
X_smote_train, X_smote_test, y_smote_train, y_smote_test = \
train_test_split(X_smote, y_smote, test_size=0.2, random_state=0)

print('Resampled dataset shape {}'.format(Counter(y_smote_train)))
print('Resampled dataset shape {}'.format(Counter(y_smote_test)))


Resampled dataset shape Counter({0.0: 220917, 1.0: 220875})
Resampled dataset shape Counter({1.0: 55245, 0.0: 55203})

In [6]:
# Include data preprocessing to scale all features! (i.e. calculate z-scores)
pipe_smote = make_pipeline(StandardScaler(), LogisticRegression(class_weight='balanced', penalty='l2'))
    
pipe_smote.fit(X_smote_train, y_smote_train)
# print(X_train)
print(len(y_smote_train))
print(len(y_smote_test))
print(len(X_smote_train))
print(len(X_smote_test))


441792
110448
441792
110448

In [8]:
# save model
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/model_smote_len_lda_wpos_wpos_readmets', 'wb') as fpipe_smote:
    pickle.dump(pipe_smote, fpipe_smote)

# check the accuracy on the training set
print(pipe_smote.score(X_smote_train, y_smote_train))

# check the accuracy on the test set
print(pipe_smote.score(X_smote_test, y_smote_test))


0.539926028538
0.538561132841

In [11]:
# predict class labels for the test set
predicted_smote = pipe_smote.predict(X_smote_test)
print(predicted_smote)
test = np.nonzero(predicted_smote)
print(len(test[0]))
print(len(predicted_smote))

# examine the coefficients
print(np.transpose(pipe_smote.steps[1][1].coef_))
# print(X_smote_test.columns)

# generate class probabilities
probs_smote = pipe_smote.predict_proba(X_smote_test)
print (probs_smote)

# generate evaluation metrics
print( metrics.accuracy_score(y_smote_test, predicted_smote))
print( metrics.roc_auc_score(y_smote_test, probs_smote[:, 1]))

print( metrics.confusion_matrix(y_smote_test, predicted_smote))
print( metrics.classification_report(y_smote_test, predicted_smote))

# evaluate the model using 10-fold cross-validation
# scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=10)
recall_smote = cross_val_score(pipe_smote, X, y, cv=10, scoring='recall')

print (recall_smote)
print (recall_smote.mean())
print (recall_smote.std())


[ 0.  1.  1. ...,  1.  1.  1.]
63802
110448
[[ 0.        ]
 [-0.54720871]
 [ 0.00265248]
 [-0.07867989]
 [ 0.1102044 ]
 [-0.0438532 ]
 [ 0.59799881]
 [ 0.92154695]
 [-0.06256826]
 [ 0.1332407 ]
 [ 0.09715447]
 [ 0.17088002]
 [-0.58975695]]
[[ 0.52961794  0.47038206]
 [ 0.4179719   0.5820281 ]
 [ 0.49659791  0.50340209]
 ..., 
 [ 0.47521229  0.52478771]
 [ 0.4465655   0.5534345 ]
 [ 0.44269922  0.55730078]]
0.538561132841
0.556674374615
[[25442 29761]
 [21204 34041]]
             precision    recall  f1-score   support

        0.0       0.55      0.46      0.50     55203
        1.0       0.53      0.62      0.57     55245

avg / total       0.54      0.54      0.54    110448

[ 0.59003831  0.61612284  0.61804223  0.68714012  0.62955854  0.60076775
  0.61420345  0.57005758  0.59884837  0.61036468]
0.61351438804
0.0292113885642

In [12]:
# normalized confusion matrix values
cm_smote = metrics.confusion_matrix(y_smote_test, predicted_smote)
print(cm_smote)
cm_smote = cm_smote.astype('float') / cm_smote.sum(axis=1)[:, np.newaxis]

print(cm_smote)


[[25442 29761]
 [21204 34041]]
[[ 0.46088075  0.53911925]
 [ 0.38381754  0.61618246]]

Generate CDF plots (see "Construct validation plots" section above)


In [15]:
decfxn_tr_smote = pipe_smote.decision_function(X_smote_train)
decfxn_ts_smote = pipe_smote.decision_function(X_smote_test)

plot_cdf(decfxn_tr_smote, y_smote_train)
plot_cdf(decfxn_ts_smote, y_smote_test)


441792
220875.0
220875.0
0.528512983291
110448
55245.0
55245.0
0.528321884524

Conclusion from imblearn tests

I tested undersampling and synthetic oversampling (both via the imblearn library) with the logistic regression and found that these methods achieved similar highlight recall (sensitivity, TP/(TP+FN)) while giving more balanced precision and f1-scores across the highlighted and non-highlighted classes.
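
The undersampling variant mentioned above can be sketched in the same way as the SMOTE cell earlier, again assuming X and y are the full feature matrix and highlight labels (fit_sample in older imblearn releases):


In [ ]:
# Sketch only: randomly downsample the majority (non-highlighted) class
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_under, y_under = rus.fit_resample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_under)))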



Try random forest with current features

I wanted to see whether a random forest model would classify highlights better than the logistic regression. This was not my preferred method, since random forest models are generally less interpretable than logistic regression.

Here, I found that the random forest attains a deceptively high overall accuracy because it predicts almost every sentence as non-highlighted, letting the much larger non-highlighted class dominate the predictions. Thus, I decided to stick with the logistic regression model.


In [170]:
from sklearn.ensemble import RandomForestClassifier

# # evaluate the model by splitting into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
modelRF = RandomForestClassifier(class_weight='balanced')
modelRF.fit(X_train, y_train)
# print(X_train)
print(len(y_train))
print(len(y_test))
print(len(X_train))
print(len(X_test))


225064
56267
225064
56267

In [171]:
print(modelRF)


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [172]:
# save model (a with-block closes the file so the pickle is fully written to disk)
with open('/Users/clarencecheng/Dropbox/~Insight/skimr/datasets/modelRF_len_lda_wpos_wpos_readmets', 'wb') as fmodelRF:
    pickle.dump(modelRF, fmodelRF)

In [173]:
# check the accuracy on the training set
print(modelRF.score(X_train, y_train))


0.994885899122

In [174]:
# check the accuracy on the test set
print(modelRF.score(X_test, y_test))


0.981143476638

In [175]:
# predict class labels for the test set
predictedRF = modelRF.predict(X_test)
print(predictedRF)

# count how many test sentences are predicted as highlights
test = np.nonzero(predictedRF)
print(len(test[0]))
print(len(predictedRF))


[ 0.  0.  0. ...,  0.  0.  0.]
16
56267

In [176]:
# generate class probabilities
probsRF = modelRF.predict_proba(X_test)
print (probsRF)


[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]

In [177]:
# generate evaluation metrics
print( metrics.accuracy_score(y_test, predictedRF))
print( metrics.roc_auc_score(y_test, probsRF[:, 1]))

print( metrics.confusion_matrix(y_test, predictedRF))
print( metrics.classification_report(y_test, predictedRF))


0.981143476638
0.519190098482
[[55198     8]
 [ 1053     8]]
             precision    recall  f1-score   support

        0.0       0.98      1.00      0.99     55206
        1.0       0.50      0.01      0.01      1061

avg / total       0.97      0.98      0.97     56267
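
The terrible highlight recall is visible directly in the confusion matrix above: only 8 of the 1061 highlighted test sentences are recovered.


In [ ]:
# Highlight (class 1) recall from the random forest confusion matrix: TP / (TP + FN)
print(8.0 / (8 + 1053))   # ~0.0075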


In [178]:
# evaluate the model using 10-fold cross-validation
recallRF = cross_val_score(modelRF, X, y, scoring='recall', cv=10)
print (recallRF)
print (recallRF.mean())
print (recallRF.std())

# Terrible highlight recall!
# [ 0.00383142  0.00191939  0.00767754  0.00191939  0.00575816  0.00959693
#   0.00575816  0.00575816  0.00383877  0.00575816]
# 0.00518160625381
# 0.00227957890199


[ 0.00383142  0.00191939  0.00767754  0.00191939  0.00575816  0.00959693
  0.00575816  0.00575816  0.00383877  0.00575816]
0.00518160625381
0.00227957890199

In [110]:
plt.matshow(confusion_matrix(y_test, predictedRF), cmap=plt.cm.binary, interpolation='nearest')
plt.title('confusion matrix')
plt.colorbar()
plt.ylabel('expected label')
plt.xlabel('predicted label')
plt.show()