notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from spacy import English
from sklearn.feature_extraction.text import CountVectorizer



In [2]:

    
nlp = English()



In [3]:

    
df = pd.read_csv('~/Downloads/Atlas-page-titles.csv',encoding='utf-8')



In [4]:

    
df.columns









    Out[4]:





Index([u'Page Title', u'Pageviews', u'Unique Pageviews', u'Avg. Time on Page',
       u'Entrances', u'Bounce Rate', u'% Exit', u'Page Value'],
      dtype='object')



In [5]:

    
df['Page Title'] = df['Page Title'].apply(lambda x: x.replace(' | Atlas Obscura',''))



In [6]:

    
df.iloc[3,0]









    Out[6]:





u'The Subtle Design Features That Make Cities Feel More Hostile'



In [7]:

    
title = df.iloc[3,0].lower()
title = unicode(title)



In [8]:

    
parsed = nlp(title)



In [9]:

    
for (i,word) in enumerate(parsed):
    print "Word: {}".format(word)
    print "\t Phrase type: {}".format(word.dep_)
    print "\t Is the word a known entity type? {}".format(word.ent_type_ if word.ent_type_ else "No")
    print "\t Lemma: {}".format(word.lemma_)
    print "\t Parent of this word: {}".format(word.head.lemma_)









    



Word: the 
	 Phrase type: det
	 Is the word a known entity type? No
	 Lemma: the
	 Parent of this word: feature
Word: subtle 
	 Phrase type: amod
	 Is the word a known entity type? No
	 Lemma: subtle
	 Parent of this word: feature
Word: design 
	 Phrase type: compound
	 Is the word a known entity type? No
	 Lemma: design
	 Parent of this word: feature
Word: features 
	 Phrase type: ROOT
	 Is the word a known entity type? No
	 Lemma: feature
	 Parent of this word: feature
Word: that 
	 Phrase type: nsubj
	 Is the word a known entity type? No
	 Lemma: that
	 Parent of this word: make
Word: make 
	 Phrase type: relcl
	 Is the word a known entity type? No
	 Lemma: make
	 Parent of this word: feature
Word: cities 
	 Phrase type: nsubj
	 Is the word a known entity type? No
	 Lemma: city
	 Parent of this word: feel
Word: feel 
	 Phrase type: ccomp
	 Is the word a known entity type? No
	 Lemma: feel
	 Parent of this word: make
Word: more 
	 Phrase type: advmod
	 Is the word a known entity type? No
	 Lemma: more
	 Parent of this word: hostile
Word: hostile
	 Phrase type: acomp
	 Is the word a known entity type? No
	 Lemma: hostile
	 Parent of this word: feel



In [10]:

    
for i, token in enumerate(parsed):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 10:
        break









    



('original:', 466, u'the')
('lowercased:', 466, u'the')
('lemma:', 466, u'the')
('shape:', 28983, u'xxx')
('prefix:', 3598, u't')
('suffix:', 466, u'the')
('log probability:', -3.528766632080078)
('Brown cluster id:', 11)
----------------------------------------
('original:', 4563, u'subtle')
('lowercased:', 4563, u'subtle')
('lemma:', 4563, u'subtle')
('shape:', 53740, u'xxxx')
('prefix:', 1012, u's')
('suffix:', 318622, u'tle')
('log probability:', -11.123329162597656)
('Brown cluster id:', 807)
----------------------------------------
('original:', 1641, u'design')
('lowercased:', 1641, u'design')
('lemma:', 1641, u'design')
('shape:', 53740, u'xxxx')
('prefix:', 5675, u'd')
('suffix:', 15903, u'ign')
('log probability:', -9.593635559082031)
('Brown cluster id:', 181)
----------------------------------------
('original:', 2456, u'features')
('lowercased:', 2456, u'features')
('lemma:', 2557, u'feature')
('shape:', 53740, u'xxxx')
('prefix:', 7040, u'f')
('suffix:', 7846, u'res')
('log probability:', -10.199167251586914)
('Brown cluster id:', 77)
----------------------------------------
('original:', 475, u'that')
('lowercased:', 475, u'that')
('lemma:', 475, u'that')
('shape:', 53740, u'xxxx')
('prefix:', 3598, u't')
('suffix:', 2768, u'hat')
('log probability:', -4.464504718780518)
('Brown cluster id:', 84)
----------------------------------------
('original:', 565, u'make')
('lowercased:', 565, u'make')
('lemma:', 565, u'make')
('shape:', 53740, u'xxxx')
('prefix:', 977, u'm')
('suffix:', 152089, u'ake')
('log probability:', -6.66980504989624)
('Brown cluster id:', 4618)
----------------------------------------
('original:', 2755, u'cities')
('lowercased:', 2755, u'cities')
('lemma:', 1210, u'city')
('shape:', 53740, u'xxxx')
('prefix:', 4206, u'c')
('suffix:', 135838, u'ies')
('log probability:', -10.388223648071289)
('Brown cluster id:', 845)
----------------------------------------
('original:', 638, u'feel')
('lowercased:', 638, u'feel')
('lemma:', 638, u'feel')
('shape:', 53740, u'xxxx')
('prefix:', 7040, u'f')
('suffix:', 38818, u'eel')
('log probability:', -7.342533588409424)
('Brown cluster id:', 1674)
----------------------------------------
('original:', 529, u'more')
('lowercased:', 529, u'more')
('lemma:', 529, u'more')
('shape:', 53740, u'xxxx')
('prefix:', 977, u'm')
('suffix:', 13678, u'ore')
('log probability:', -6.081598281860352)
('Brown cluster id:', 1514)
----------------------------------------
('original:', 6626, u'hostile')
('lowercased:', 6626, u'hostile')
('lemma:', 6626, u'hostile')
('shape:', 53740, u'xxxx')
('prefix:', 10828, u'h')
('suffix:', 99715, u'ile')
('log probability:', -11.662409782409668)
('Brown cluster id:', 697)
----------------------------------------



In [11]:

    
for i, word in enumerate(parsed):
    print "/t Part of speech: {}".format(word.pos_), word









    



/t Part of speech: DET the 
/t Part of speech: ADJ subtle 
/t Part of speech: NOUN design 
/t Part of speech: NOUN features 
/t Part of speech: ADJ that 
/t Part of speech: VERB make 
/t Part of speech: NOUN cities 
/t Part of speech: VERB feel 
/t Part of speech: ADJ more 
/t Part of speech: ADJ hostile



In [12]:

    
#How do we figure out what the best pattern for headlines is?



In [13]:

    
df.head(1)









    Out[13]:






  
    
      
      Page Title
      Pageviews
      Unique Pageviews
      Avg. Time on Page
      Entrances
      Bounce Rate
      % Exit
      Page Value
    
  
  
    
      0
      Atlas Obscura | Curious and Wondrous Travel De...
      284,868
      212,278
      0:01:18
      160,486
      36.31%
      30.74%
      $0.00



In [14]:

    
df.iloc[49,0]









    Out[14]:





u"The Origins of 'Horn OK Please,' India's Most Ubiquitous Phrase"



In [15]:

    
df['Page Title'] = df['Page Title'].apply(lambda x: unicode(x))



In [16]:

    
vectorizer = CountVectorizer(lowercase=True, stop_words='english')



In [18]:

    
matrix = vectorizer.fit_transform(df['Page Title'])



In [22]:

    
print matrix.todense()









    



[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]



In [ ]: