In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from collections import Counter, OrderedDict

import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# NLTK needs its 'punkt', 'stopwords' and 'wordnet' data packages
# (fetch them once with nltk.download() if they are not installed)

# in Python 3, HTMLParser lives in the html.parser module
from html.parser import HTMLParser
from bs4 import BeautifulSoup

In [2]:
porter = PorterStemmer()  # set up for comparison; the tokenizer below uses only the lemmatizer
wnl = WordNetLemmatizer()
stop = stopwords.words('english')
# extend the standard list with tokens that clutter this corpus
stop += ["new", "like", "u", "it'", "'s", "n't", "mr."]
stop = set(stop)
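
A quick illustration of the difference (made-up inputs, not corpus text): the lemmatizer maps words to dictionary forms, while the Porter stemmer clips suffixes by rule, which is why only the lemmatizer is used in the tokenizer below. Exact outputs can vary slightly with the installed WordNet data.

print(wnl.lemmatize('feeders'))   # e.g. 'feeder' (a real dictionary word)
print(porter.stem('captured'))    # e.g. 'captur' (a bare rule-based stem)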

In [3]:
# taken from http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html

def tokenizer(text):
    # split into sentences, then words, and flatten into one token list
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

    # drop stopwords, punctuation, and tokenizer artifacts
    # (the last list covers NLTK quote tokens plus em dash, ellipsis, en dash)
    tokens = [t for t in tokens if t.lower() not in stop]
    tokens = [t for t in tokens if t not in punctuation]
    tokens = [t for t in tokens if t not in ["'s", "n't", "...", "''", "``", "\u2014", "\u2026", "\u2013"]]

    # lemmatize, keeping only tokens that contain at least one letter
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    return [token.lower() for token in filtered_tokens]


class MLStripper(HTMLParser):
    """Collects text content while discarding HTML tags."""
    def __init__(self):
        super().__init__()  # required in Python 3 before feeding the parser
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def get_keywords(tokens, num):
    return Counter(tokens).most_common(num)


def build_article_df(urls):
    articles = []
    for index, row in urls.iterrows():
        try:
            # drop apostrophes, strip markup (HTMLParser, then BeautifulSoup), force ASCII
            data = row['text'].strip().replace("'", "")
            data = strip_tags(data)
            soup = BeautifulSoup(data, "html.parser")  # name a parser explicitly to avoid the bs4 warning
            data = soup.get_text()
            data = data.encode('ascii', 'ignore').decode('ascii')
            document = tokenizer(data)
            top_3 = get_keywords(document, 3)

            kw = ",".join(str(keyword) for keyword, count in top_3)
            articles.append((kw, row['title'], row['pubdate']))
        except Exception as e:
            print(e)
    article_df = pd.DataFrame(articles, columns=['keywords', 'title', 'pubdate'])
    return article_df
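
As a quick sanity check of the pipeline (a sketch with a made-up sentence, not a corpus article), tokenizer() reduces raw text to lowercase lemmas and get_keywords() counts them:

sample = "Data cultures grow when teams trust their data."
tokens = tokenizer(sample)        # e.g. ['data', 'culture', 'grow', 'team', 'trust', 'data']
print(get_keywords(tokens, 2))    # e.g. [('data', 2), ('culture', 1)]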

In [4]:
df = pd.read_csv('../examples/tocsv.csv')

In [5]:
df.head()


Out[5]:
id Title Content Date Permalink Categories Tags
0 8938 Building a Data Culture <a href="http://ericbrown.com/wp-content/uploa... 20141118 http://ericbrown.com/building-data-culture.htm Big Data|People Big Data|culture
1 8943 Note to Self - Don't say "Data Driven" Anymore <a href="http://ericbrown.com/wp-content/uploa... 20141120 http://ericbrown.com/dont-say-data-driven-anym... Big Data|Leadership Big Data|data-driven|Knowledge Management|Orga...
2 8948 Foto Friday - Titmouse on the Feeder I captured this Titmouse on the backporch feed... 20141121 http://ericbrown.com/foto-friday-titmouse-feed... Foto Friday Photography
3 8952 The Cloud - Gateway to Enterprise Mobility <em>This post is brought to you by the</em> <a... 20141121 http://ericbrown.com/cloud-gateway-enterprise-... Information Technology|Strategy|Technology|The... cloud|mobility
4 8957 The Agile Data Center <a href="http://ericbrown.com/wp-content/uploa... 20141124 http://ericbrown.com/agile-data-center.htm Data Center|Information Technology|The New CIO Agility|Data center|flexibility

In [6]:
data = []
for index, row in df.iterrows():
    data.append((row['Title'], row['Permalink'], row['Date'], row['Content']))
data_df = pd.DataFrame(data, columns=['title', 'url', 'pubdate', 'text'])
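The same reshaping can be done without an explicit loop; a minimal sketch, assuming only the column names shown above:

data_df = df[['Title', 'Permalink', 'Date', 'Content']].rename(
    columns={'Title': 'title', 'Permalink': 'url', 'Date': 'pubdate', 'Content': 'text'})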

In [7]:
data_df.tail()


Out[7]:
title url pubdate text
143 Driving Digital by Isaac Sacolick - a book review http://ericbrown.com/driving-digital-isaac-sac... 20170906 <img class="alignleft size-medium wp-image-975...
144 Data and Culture go hand in hand http://ericbrown.com/?p=9757 -11130 Last week, I spent an afternoon talking to the...
145 Data Quality - The most important data dimension? http://ericbrown.com/data-quality-most-importa... 20170918 <img class="size-medium wp-image-9764 alignrig...
146 Be pragmatic, not dogmatic http://ericbrown.com/be-pragmatic-not-dogmatic... 20170928 <img class="alignright size-medium wp-image-97...
147 The Data Way http://ericbrown.com/the-data-way.htm 20171003 <img class="alignleft size-medium wp-image-977...

In [8]:
article_df = build_article_df(data_df)


In [9]:
article_df.head()


Out[9]:
keywords title pubdate
0 data,big,culture Building a Data Culture 20141118
1 data,data-driven,company Note to Self - Don't say "Data Driven" Anymore 20141120
2 canon,captured,titmouse Foto Friday - Titmouse on the Feeder 20141121
3 mobility,organization,device The Cloud - Gateway to Enterprise Mobility 20141121
4 center,data,agile The Agile Data Center 20141124

In [10]:
keywords_array=[]
for index, row in article_df.iterrows():
    keywords=row['keywords'].split(',')
    for kw in keywords:
        keywords_array.append((kw.strip(' '), row['keywords']))

kw_df = pd.DataFrame(keywords_array).rename(columns={0:'keyword', 1:'keywords'})
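
The split-and-flatten step can also be expressed without the nested loop; a sketch, assuming pandas 0.25+ for DataFrame.explode:

kw_df = (article_df.assign(keyword=article_df['keywords'].str.split(','))
                   .explode('keyword')[['keyword', 'keywords']]
                   .reset_index(drop=True))
kw_df['keyword'] = kw_df['keyword'].str.strip()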

In [11]:
kw_df.head()


Out[11]:
keyword keywords
0 data data,big,culture
1 big data,big,culture
2 culture data,big,culture
3 data data,data-driven,company
4 data-driven data,data-driven,company

In [12]:
document = kw_df.keywords.tolist()
names = kw_df.keyword.tolist()

In [13]:
document_array = []
for item in document:
    items = item.split(',')
    document_array.append((items))

occurrences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences:
for l in document_array:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            occurrences[l[i]][item] += 1
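
On a toy input (made-up keyword lists, not the real corpus), the triple loop fills the matrix symmetrically:

toy_docs = [['data', 'culture'], ['data', 'cloud']]
toy_names = ['data', 'culture', 'cloud']
toy = OrderedDict((name, OrderedDict((other, 0) for other in toy_names)) for name in toy_names)
for l in toy_docs:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            toy[l[i]][item] += 1
# toy['data']['culture'] == 1 and toy['culture']['data'] == 1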

In [14]:
co_occur = pd.DataFrame.from_dict(occurrences)

In [15]:
co_occur.to_csv('out/ericbrown_co-occurrence_matrix.csv')

In [16]:
co_occur.head()


Out[16]:
data big culture data-driven company canon captured titmouse mobility organization ... multi-bracket slow science knowledge management love song quality dogmatic pragmatic
300mm 0 0 0 0 0 3 3 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7d 0 0 0 0 0 18 6 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8-16mm 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
agile 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
agility 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 201 columns

