In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from collections import Counter
from collections import OrderedDict
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from html.parser import HTMLParser
from bs4 import BeautifulSoup
In [2]:
porter = PorterStemmer()
wnl = WordNetLemmatizer()
# standard English stop words plus a few extra tokens that add noise in this corpus
stop = stopwords.words('english')
stop.extend(["new", "like", "u", "it'", "'s", "n't", "mr."])
stop = set(stop)
In [3]:
# taken from http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html
def tokenizer(text):
    # split into sentences, then words, and flatten into one token list
    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]
    tokens = []
    for token_by_sent in tokens_:
        tokens += token_by_sent
    # drop stop words, punctuation and a few stray tokenizer artifacts
    tokens = list(filter(lambda t: t.lower() not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
    # lemmatize and keep only tokens that contain at least one letter
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))
    return filtered_tokens
class MLStripper(HTMLParser):
    def __init__(self):
        # HTMLParser needs its own __init__ to run under Python 3
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
def get_keywords(tokens, num):
    return Counter(tokens).most_common(num)
def build_article_df(urls):
    articles = []
    for index, row in urls.iterrows():
        try:
            data = row['text'].strip().replace("'", "")
            data = strip_tags(data)
            soup = BeautifulSoup(data, 'html.parser')
            data = soup.get_text()
            data = data.encode('ascii', 'ignore').decode('ascii')
            document = tokenizer(data)
            # keep the three most frequent tokens as this article's keywords
            top_keywords = get_keywords(document, 3)
            kw = list(zip(*top_keywords))[0]
            kw = ",".join(str(x) for x in kw)
            articles.append((kw, row['title'], row['pubdate']))
        except Exception as e:
            print(e)
    article_df = pd.DataFrame(articles, columns=['keywords', 'title', 'pubdate'])
    return article_df
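
# Quick sanity check (not part of the original notebook): run the helpers above on a
# made-up HTML snippet to confirm tag stripping, tokenizing and keyword counting work.
# Assumes the NLTK 'punkt', 'stopwords' and 'wordnet' corpora have been downloaded.
sample = "<p>Python makes data analysis easy. Python and pandas handle data well.</p>"
sample_tokens = tokenizer(strip_tags(sample))
print(get_keywords(sample_tokens, 3))  # e.g. [('python', 2), ('data', 2), ('analysis', 1)]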
In [4]:
df = pd.read_csv('../examples/tocsv.csv')
In [5]:
df.head()
Out[5]:
In [6]:
df = pd.read_csv('../examples/tocsv.csv')
data = []
for index, row in df.iterrows():
    data.append((row['Title'], row['Permalink'], row['Date'], row['Content']))
data_df = pd.DataFrame(data, columns=['title', 'url', 'pubdate', 'text'])
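
# Equivalent sketch (not in the original notebook): the same reshaping can be done
# without an explicit loop by selecting the columns and renaming them in one step.
data_df_alt = df[['Title', 'Permalink', 'Date', 'Content']].rename(
    columns={'Title': 'title', 'Permalink': 'url', 'Date': 'pubdate', 'Content': 'text'})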
In [7]:
data_df.tail()
Out[7]:
In [8]:
article_df = build_article_df(data_df)
In [9]:
article_df.head()
Out[9]:
In [10]:
keywords_array = []
for index, row in article_df.iterrows():
    keywords = row['keywords'].split(',')
    for kw in keywords:
        keywords_array.append((kw.strip(' '), row['keywords']))
kw_df = pd.DataFrame(keywords_array).rename(columns={0: 'keyword', 1: 'keywords'})
In [11]:
kw_df.head()
Out[11]:
In [12]:
document = kw_df.keywords.tolist()
names = kw_df.keyword.tolist()
In [13]:
document_array = []
for item in document:
    items = item.split(',')
    document_array.append(items)
# initialise a square keyword-by-keyword matrix of zero counts
occurrences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)
# Find the co-occurrences: for each article's keyword list, count every other
# keyword that appears alongside the keyword at position i
for l in document_array:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            occurrences[l[i]][item] += 1
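
# Alternative sketch (not in the original notebook): the same counts can be built with
# itertools.permutations, which yields every ordered pair of positions in each list and
# is exactly what the nested loops above enumerate.
from itertools import permutations
occurrences_alt = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)
for l in document_array:
    for a, b in permutations(l, 2):
        occurrences_alt[a][b] += 1
assert occurrences_alt == occurrences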
In [14]:
co_occur = pd.DataFrame.from_dict(occurrences)
In [15]:
co_occur.to_csv('out/ericbrown_co-occurancy_matrix.csv')
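# Usage note (not in the original notebook): the exported matrix can be reloaded later
# with the keyword labels restored as the index, e.g.:
# co_occur = pd.read_csv('out/ericbrown_co-occurancy_matrix.csv', index_col=0)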
In [16]:
co_occur.head()
Out[16]: