In [1]:
import json, re
from collections import Counter
from operator import itemgetter
from sklearn.feature_extraction import DictVectorizer
In [5]:
TAGS = ['title', 'h1', 'h2', 'h3', 'meta-description', 'meta-keywords',
        'img', 'a', 'other']
In [3]:
def load_processed_data(path):
    # Read a JSON-lines file: one JSON object per line.
    # The with-block closes the file; no explicit close() needed.
    with open(path, 'r') as infile:
        data = list(map(json.loads, infile))
    return data

path = '../data/processed/extracted_text'
data = load_processed_data(path)
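Each line of the extracted_text file is assumed (the file itself is not shown in this notebook) to be a JSON object mapping tag names to lists of text snippets. A minimal sketch of one record, with made-up values:
In [ ]:
# Hypothetical record structure, inferred from how the code below
# indexes items by tag; the values here are illustrative only.
example_record = {
    'title': ['Example page title'],
    'h1': ['A heading'],
    'a': ['link text 1', 'link text 2'],
}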
In [26]:
def tokenize(string):
    # Collapse all digits to '0' so every number maps to one token shape.
    string = re.sub(r'[0-9]', '0', string)
    # Split on non-word characters; drop the empty strings that
    # re.split produces at the string boundaries.
    return [word for word in re.split(r'\W+', string) if word]

def count(texts):
    # Aggregate word counts over all text snippets for one tag.
    counter = Counter()
    for text in texts:
        counter.update(tokenize(text))
    return counter

def get(tag, items):
    # Yield each item's text list for `tag`, or [] if the tag is absent.
    for item in items:
        yield item.get(tag, [])

def sum_up(counts, n):
    # Document frequency: each word counts once per document,
    # no matter how many tags it appears under within that document.
    tags = list(counts)
    total = Counter()
    for i in range(n):
        words = set()
        for tag in tags:
            words.update(counts[tag][i])
        total.update(words)
    return total

def vectorize(total, tags, counts_per_tag):
    v = DictVectorizer()
    # DictVectorizer expects an iterable of dict-like objects,
    # so wrap the single vocabulary Counter in a list.
    v.fit([total])
    features = {}
    for tag in tags:
        features[tag] = v.transform(counts_per_tag[tag])
    print('Features ', features['title'])

def count_words(data):
    tags = TAGS + ['boilerplate']
    counts_per_tag = {}
    for tag in tags:
        # One Counter per document for this tag.
        counts_per_tag[tag] = list(map(count, get(tag, data)))
    total = sum_up(counts_per_tag, len(data))
    vectorize(total, tags, counts_per_tag)
    return total

x = count_words(data[0:1])
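A quick sanity check of the fit/transform split used in vectorize, on hand-made counters rather than corpus data: fitting on the combined vocabulary fixes the feature index, and transforming per-tag counts then yields sparse rows aligned to that shared index.
In [ ]:
# Toy example (made-up counters, not corpus data).
toy_total = Counter({'cat': 2, 'dog': 1})
toy_counts = [Counter({'cat': 3}), Counter({'dog': 1, 'cat': 1})]
toy_v = DictVectorizer()
toy_v.fit([toy_total])            # learn the feature index from the vocabulary
toy_X = toy_v.transform(toy_counts)
print(toy_v.feature_names_)       # ['cat', 'dog']
print(toy_X.toarray())            # [[3. 0.]
                                  #  [1. 1.]]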