In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from collections import Counter, OrderedDict

import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# NLTK needs its 'punkt', 'stopwords' and 'wordnet' data packages
# (fetch them once with nltk.download() if they are not installed)

# in Python 3, HTMLParser lives in the html.parser module
from html.parser import HTMLParser
from bs4 import BeautifulSoup

In [2]:
porter = PorterStemmer()  # set up for comparison; the tokenizer below uses only the lemmatizer
wnl = WordNetLemmatizer()
stop = stopwords.words('english')
# extend the standard list with tokens that clutter this corpus
stop += ["new", "like", "u", "it'", "'s", "n't", "mr."]
stop = set(stop)
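
A quick illustration of the difference (made-up inputs, not corpus text): the lemmatizer maps words to dictionary forms, while the Porter stemmer clips suffixes by rule, which is why only the lemmatizer is used in the tokenizer below. Exact outputs can vary slightly with the installed WordNet data.

print(wnl.lemmatize('feeders'))   # e.g. 'feeder' (a real dictionary word)
print(porter.stem('captured'))    # e.g. 'captur' (a bare rule-based stem)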

In [3]:
# taken from http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html

def tokenizer(text):
    # split into sentences, then words, and flatten into one token list
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

    # drop stopwords, punctuation, and tokenizer artifacts
    # (the last list covers NLTK quote tokens plus em dash, ellipsis, en dash)
    tokens = [t for t in tokens if t.lower() not in stop]
    tokens = [t for t in tokens if t not in punctuation]
    tokens = [t for t in tokens if t not in ["'s", "n't", "...", "''", "``", "\u2014", "\u2026", "\u2013"]]

    # lemmatize, keeping only tokens that contain at least one letter
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    return [token.lower() for token in filtered_tokens]


class MLStripper(HTMLParser):
    """Collects text content while discarding HTML tags."""
    def __init__(self):
        super().__init__()  # required in Python 3 before feeding the parser
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def get_keywords(tokens, num):
    return Counter(tokens).most_common(num)


def build_article_df(urls):
    articles = []
    for index, row in urls.iterrows():
        try:
            # drop apostrophes, strip markup (HTMLParser, then BeautifulSoup), force ASCII
            data = row['text'].strip().replace("'", "")
            data = strip_tags(data)
            soup = BeautifulSoup(data, "html.parser")  # name a parser explicitly to avoid the bs4 warning
            data = soup.get_text()
            data = data.encode('ascii', 'ignore').decode('ascii')
            document = tokenizer(data)
            top_3 = get_keywords(document, 3)

            kw = ",".join(str(keyword) for keyword, count in top_3)
            articles.append((kw, row['title'], row['pubdate']))
        except Exception as e:
            print(e)
    article_df = pd.DataFrame(articles, columns=['keywords', 'title', 'pubdate'])
    return article_df
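
As a quick sanity check of the pipeline (a sketch with a made-up sentence, not a corpus article), tokenizer() reduces raw text to lowercase lemmas and get_keywords() counts them:

sample = "Data cultures grow when teams trust their data."
tokens = tokenizer(sample)        # e.g. ['data', 'culture', 'grow', 'team', 'trust', 'data']
print(get_keywords(tokens, 2))    # e.g. [('data', 2), ('culture', 1)]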

In [4]:
df = pd.read_csv('../examples/tocsv.csv')

In [5]:
df.head()


Out[5]:
id Title Content Date Permalink Categories Tags
0 8938 Building a Data Culture <a href="http://ericbrown.com/wp-content/uploa... 20141118 http://ericbrown.com/building-data-culture.htm Big Data|People Big Data|culture
1 8943 Note to Self - Don't say "Data Driven" Anymore <a href="http://ericbrown.com/wp-content/uploa... 20141120 http://ericbrown.com/dont-say-data-driven-anym... Big Data|Leadership Big Data|data-driven|Knowledge Management|Orga...
2 8948 Foto Friday - Titmouse on the Feeder I captured this Titmouse on the backporch feed... 20141121 http://ericbrown.com/foto-friday-titmouse-feed... Foto Friday Photography
3 8952 The Cloud - Gateway to Enterprise Mobility <em>This post is brought to you by the</em> <a... 20141121 http://ericbrown.com/cloud-gateway-enterprise-... Information Technology|Strategy|Technology|The... cloud|mobility
4 8957 The Agile Data Center <a href="http://ericbrown.com/wp-content/uploa... 20141124 http://ericbrown.com/agile-data-center.htm Data Center|Information Technology|The New CIO Agility|Data center|flexibility

In [6]:
data = []
for index, row in df.iterrows():
    data.append((row['Title'], row['Permalink'], row['Date'], row['Content']))
data_df = pd.DataFrame(data, columns=['title', 'url', 'pubdate', 'text'])
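The same reshaping can be done without an explicit loop; a minimal sketch, assuming only the column names shown above:

data_df = df[['Title', 'Permalink', 'Date', 'Content']].rename(
    columns={'Title': 'title', 'Permalink': 'url', 'Date': 'pubdate', 'Content': 'text'})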

In [7]:
data_df.tail()


Out[7]:
title url pubdate text
143 Driving Digital by Isaac Sacolick - a book review http://ericbrown.com/driving-digital-isaac-sac... 20170906 <img class="alignleft size-medium wp-image-975...
144 Data and Culture go hand in hand http://ericbrown.com/?p=9757 -11130 Last week, I spent an afternoon talking to the...
145 Data Quality - The most important data dimension? http://ericbrown.com/data-quality-most-importa... 20170918 <img class="size-medium wp-image-9764 alignrig...
146 Be pragmatic, not dogmatic http://ericbrown.com/be-pragmatic-not-dogmatic... 20170928 <img class="alignright size-medium wp-image-97...
147 The Data Way http://ericbrown.com/the-data-way.htm 20171003 <img class="alignleft size-medium wp-image-977...

In [8]:
article_df = build_article_df(data_df)


In [9]:
article_df.head()


Out[9]:
keywords title pubdate
0 data,big,culture Building a Data Culture 20141118
1 data,data-driven,company Note to Self - Don't say "Data Driven" Anymore 20141120
2 canon,captured,titmouse Foto Friday - Titmouse on the Feeder 20141121
3 mobility,organization,device The Cloud - Gateway to Enterprise Mobility 20141121
4 center,data,agile The Agile Data Center 20141124

In [10]:
keywords_array=[]
for index, row in article_df.iterrows():
    keywords=row['keywords'].split(',')
    for kw in keywords:
        keywords_array.append((kw.strip(' '), row['keywords']))

kw_df = pd.DataFrame(keywords_array).rename(columns={0:'keyword', 1:'keywords'})
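
The split-and-flatten step can also be expressed without the nested loop; a sketch, assuming pandas 0.25+ for DataFrame.explode:

kw_df = (article_df.assign(keyword=article_df['keywords'].str.split(','))
                   .explode('keyword')[['keyword', 'keywords']]
                   .reset_index(drop=True))
kw_df['keyword'] = kw_df['keyword'].str.strip()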

In [11]:
kw_df.head()


Out[11]:
keyword keywords
0 data data,big,culture
1 big data,big,culture
2 culture data,big,culture
3 data data,data-driven,company
4 data-driven data,data-driven,company

In [12]:
document = kw_df.keywords.tolist()
names = kw_df.keyword.tolist()

In [13]:
document_array = []
for item in document:
    items = item.split(',')
    document_array.append((items))

occurrences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences:
for l in document_array:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            occurrences[l[i]][item] += 1
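
On a toy input (made-up keyword lists, not the real corpus), the triple loop fills the matrix symmetrically:

toy_docs = [['data', 'culture'], ['data', 'cloud']]
toy_names = ['data', 'culture', 'cloud']
toy = OrderedDict((name, OrderedDict((other, 0) for other in toy_names)) for name in toy_names)
for l in toy_docs:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            toy[l[i]][item] += 1
# toy['data']['culture'] == 1 and toy['culture']['data'] == 1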

In [14]:
co_occur = pd.DataFrame.from_dict(occurrences)

In [15]:
co_occur.to_csv('out/ericbrown_co-occurrence_matrix.csv')

In [16]:
co_occur.head()


Out[16]:
data big culture data-driven company canon captured titmouse mobility organization ... multi-bracket slow science knowledge management love song quality dogmatic pragmatic
300mm 0 0 0 0 0 3 3 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7d 0 0 0 0 0 18 6 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8-16mm 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
agile 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
agility 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 201 columns

