In [1]:
import pandas as pd
import numpy as np
from spacy import English
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
nlp = English()

In [3]:
df = pd.read_csv('~/Downloads/Atlas-page-titles.csv',encoding='utf-8')

In [4]:
df.columns


Out[4]:
Index([u'Page Title', u'Pageviews', u'Unique Pageviews', u'Avg. Time on Page',
       u'Entrances', u'Bounce Rate', u'% Exit', u'Page Value'],
      dtype='object')

In [5]:
df['Page Title'] = df['Page Title'].apply(lambda x: x.replace(' | Atlas Obscura',''))

In [6]:
df.iloc[3,0]


Out[6]:
u'The Subtle Design Features That Make Cities Feel More Hostile'

In [7]:
title = df.iloc[3,0].lower()
title = unicode(title)

In [8]:
parsed = nlp(title)

In [9]:
for (i,word) in enumerate(parsed):
    print "Word: {}".format(word)
    print "\t Phrase type: {}".format(word.dep_)
    print "\t Is the word a known entity type? {}".format(word.ent_type_ if word.ent_type_ else "No")
    print "\t Lemma: {}".format(word.lemma_)
    print "\t Parent of this word: {}".format(word.head.lemma_)


Word: the 
	 Phrase type: det
	 Is the word a known entity type? No
	 Lemma: the
	 Parent of this word: feature
Word: subtle 
	 Phrase type: amod
	 Is the word a known entity type? No
	 Lemma: subtle
	 Parent of this word: feature
Word: design 
	 Phrase type: compound
	 Is the word a known entity type? No
	 Lemma: design
	 Parent of this word: feature
Word: features 
	 Phrase type: ROOT
	 Is the word a known entity type? No
	 Lemma: feature
	 Parent of this word: feature
Word: that 
	 Phrase type: nsubj
	 Is the word a known entity type? No
	 Lemma: that
	 Parent of this word: make
Word: make 
	 Phrase type: relcl
	 Is the word a known entity type? No
	 Lemma: make
	 Parent of this word: feature
Word: cities 
	 Phrase type: nsubj
	 Is the word a known entity type? No
	 Lemma: city
	 Parent of this word: feel
Word: feel 
	 Phrase type: ccomp
	 Is the word a known entity type? No
	 Lemma: feel
	 Parent of this word: make
Word: more 
	 Phrase type: advmod
	 Is the word a known entity type? No
	 Lemma: more
	 Parent of this word: hostile
Word: hostile
	 Phrase type: acomp
	 Is the word a known entity type? No
	 Lemma: hostile
	 Parent of this word: feel

In [10]:
for i, token in enumerate(parsed):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 10:
        break


('original:', 466, u'the')
('lowercased:', 466, u'the')
('lemma:', 466, u'the')
('shape:', 28983, u'xxx')
('prefix:', 3598, u't')
('suffix:', 466, u'the')
('log probability:', -3.528766632080078)
('Brown cluster id:', 11)
----------------------------------------
('original:', 4563, u'subtle')
('lowercased:', 4563, u'subtle')
('lemma:', 4563, u'subtle')
('shape:', 53740, u'xxxx')
('prefix:', 1012, u's')
('suffix:', 318622, u'tle')
('log probability:', -11.123329162597656)
('Brown cluster id:', 807)
----------------------------------------
('original:', 1641, u'design')
('lowercased:', 1641, u'design')
('lemma:', 1641, u'design')
('shape:', 53740, u'xxxx')
('prefix:', 5675, u'd')
('suffix:', 15903, u'ign')
('log probability:', -9.593635559082031)
('Brown cluster id:', 181)
----------------------------------------
('original:', 2456, u'features')
('lowercased:', 2456, u'features')
('lemma:', 2557, u'feature')
('shape:', 53740, u'xxxx')
('prefix:', 7040, u'f')
('suffix:', 7846, u'res')
('log probability:', -10.199167251586914)
('Brown cluster id:', 77)
----------------------------------------
('original:', 475, u'that')
('lowercased:', 475, u'that')
('lemma:', 475, u'that')
('shape:', 53740, u'xxxx')
('prefix:', 3598, u't')
('suffix:', 2768, u'hat')
('log probability:', -4.464504718780518)
('Brown cluster id:', 84)
----------------------------------------
('original:', 565, u'make')
('lowercased:', 565, u'make')
('lemma:', 565, u'make')
('shape:', 53740, u'xxxx')
('prefix:', 977, u'm')
('suffix:', 152089, u'ake')
('log probability:', -6.66980504989624)
('Brown cluster id:', 4618)
----------------------------------------
('original:', 2755, u'cities')
('lowercased:', 2755, u'cities')
('lemma:', 1210, u'city')
('shape:', 53740, u'xxxx')
('prefix:', 4206, u'c')
('suffix:', 135838, u'ies')
('log probability:', -10.388223648071289)
('Brown cluster id:', 845)
----------------------------------------
('original:', 638, u'feel')
('lowercased:', 638, u'feel')
('lemma:', 638, u'feel')
('shape:', 53740, u'xxxx')
('prefix:', 7040, u'f')
('suffix:', 38818, u'eel')
('log probability:', -7.342533588409424)
('Brown cluster id:', 1674)
----------------------------------------
('original:', 529, u'more')
('lowercased:', 529, u'more')
('lemma:', 529, u'more')
('shape:', 53740, u'xxxx')
('prefix:', 977, u'm')
('suffix:', 13678, u'ore')
('log probability:', -6.081598281860352)
('Brown cluster id:', 1514)
----------------------------------------
('original:', 6626, u'hostile')
('lowercased:', 6626, u'hostile')
('lemma:', 6626, u'hostile')
('shape:', 53740, u'xxxx')
('prefix:', 10828, u'h')
('suffix:', 99715, u'ile')
('log probability:', -11.662409782409668)
('Brown cluster id:', 697)
----------------------------------------

In [11]:
for i, word in enumerate(parsed):
    print "/t Part of speech: {}".format(word.pos_), word


/t Part of speech: DET the 
/t Part of speech: ADJ subtle 
/t Part of speech: NOUN design 
/t Part of speech: NOUN features 
/t Part of speech: ADJ that 
/t Part of speech: VERB make 
/t Part of speech: NOUN cities 
/t Part of speech: VERB feel 
/t Part of speech: ADJ more 
/t Part of speech: ADJ hostile

In [12]:
#How do we figure out what the best pattern for headlines is?

In [13]:
df.head(1)


Out[13]:
Page Title Pageviews Unique Pageviews Avg. Time on Page Entrances Bounce Rate % Exit Page Value
0 Atlas Obscura | Curious and Wondrous Travel De... 284,868 212,278 0:01:18 160,486 36.31% 30.74% $0.00

In [14]:
df.iloc[49,0]


Out[14]:
u"The Origins of 'Horn OK Please,' India's Most Ubiquitous Phrase"

In [15]:
df['Page Title'] = df['Page Title'].apply(lambda x: unicode(x))

In [16]:
vectorizer = CountVectorizer(lowercase=True, stop_words='english')

In [18]:
matrix = vectorizer.fit_transform(df['Page Title'])

In [22]:
print matrix.todense()


[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

In [ ]: