In [1]:
import json, re

from collections import Counter
from operator import itemgetter
from sklearn.feature_extraction import DictVectorizer

In [5]:
TAGS = ['title', 'h1', 'h2', 'h3', 'meta-description', 'meta-keywords',
        'img', 'a', 'other']

In [3]:
def load_processed_data(path):
    # One JSON object per line; the with-block closes the file for us.
    with open(path, 'r') as infile:
        return list(map(json.loads, infile))

path = '../data/processed/extracted_text'
data = load_processed_data(path)
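
Each line of extracted_text is assumed to be a JSON object mapping a tag name to the list of text snippets extracted for that tag (a schema inferred from how the records are indexed below); a hypothetical record might look like this:

In [ ]:
# Hypothetical record shape for illustration -- not actual corpus data:
record = {'title': ['Example Page Title'],
          'h1': ['Main Heading'],
          'a': ['home', 'contact'],
          'boilerplate': ['copyright 0000']}
print(json.dumps(record))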

In [26]:
def tokenize(string):
    # Normalize every digit to '0' so all numbers collapse onto shared
    # tokens, then split on runs of non-word characters. Note that leading
    # or trailing punctuation yields an empty-string token.
    string = re.sub(r'[0-9]', '0', string)
    words = re.split(r'\W+', string)

    return words
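
# tokenize() example (digits are normalized before splitting):
#   tokenize('Version 2.0 beta') -> ['Version', '0', '0', 'beta']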
    
def count(texts):
    # Aggregate token counts over all text snippets stored under one tag.
    counter = Counter()

    for text in texts:
        counter.update(tokenize(text))

    return counter
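
# count() example: count(['a b', 'b c']) -> Counter({'b': 2, 'a': 1, 'c': 1})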

def get(tag, items):
    # Yield each document's list of texts for `tag`, or [] when absent.
    for item in items:
        yield item.get(tag, [])
        
def sum_up(counts, n):
    # For each of the n documents, collect the set of distinct words seen
    # under any tag, then accumulate document frequencies over the corpus.
    tags = list(counts)
    total = Counter()

    for i in range(n):
        words = set()

        for tag in tags:
            words.update(counts[tag][i])

        total.update(words)

    return total
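
# sum_up() example: with counts = {'title': [Counter({'a': 2})],
#                                  'h1': [Counter({'a': 1, 'b': 1})]}
# and n = 1, the result is Counter({'a': 1, 'b': 1}): each distinct word is
# counted once per document it appears in (a document-frequency count).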

def vectorize(total, tags, counts_per_tag):
    v = DictVectorizer()
    v.fit([total])  # DictVectorizer expects an iterable of dict-like objects

    features = {}

    for tag in tags:
        features[tag] = v.transform(counts_per_tag[tag])

    print('Features ', features['title'])

    return features
     
def count_words(data):
    tags = TAGS + ['boilerplate']
    counts_per_tag = {}

    # One Counter per document for every tag.
    for tag in tags:
        counts_per_tag[tag] = list(map(count, get(tag, data)))

    total = sum_up(counts_per_tag, len(data))
    vectorize(total, tags, counts_per_tag)

    return total
    
x = count_words(data[0:1])  # smoke-test on the first document only


Features    (0, 13)	1.0
  (0, 49)	1.0
  (0, 64)	1.0
  (0, 68)	1.0
  (0, 83)	1.0
  (0, 251)	1.0
  (0, 258)	1.0
  (0, 479)	1.0
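
Each (0, j) pair in the sparse printout is a column index into the fitted vocabulary. A minimal sketch for mapping those indices back to words, assuming scikit-learn >= 1.0 (which provides get_feature_names_out(); older releases expose get_feature_names() instead) and refitting a fresh DictVectorizer on the Counter returned above, since vectorize() does not keep its vectorizer:

In [ ]:
# Sketch: recover the word behind each sparse column index.
v = DictVectorizer()
X = v.fit_transform([x])           # a Counter is dict-like, so this works
names = v.get_feature_names_out()  # scikit-learn >= 1.0
coo = X.tocoo()

for col, val in zip(coo.col, coo.data):
    print(names[col], val)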
