In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict, Counter

from urlparse import urlsplit, parse_qs

import re

In [2]:
hand_annotated_lexicons_files = glob("DomainDataset/*+suffix.txt")
print hand_annotated_lexicons_files


['DomainDataset/fakenews_domain+suffix.txt', 'DomainDataset/commercial_domain+suffix.txt', 'DomainDataset/fakenewschecker_domain+suffix.txt', 'DomainDataset/satire_domain+suffix.txt', 'DomainDataset/Videos_domain+suffix.txt', 'DomainDataset/socialMedia_domain+suffix.txt', 'DomainDataset/scientific_domain+suffix.txt', 'DomainDataset/clickbait_domain+suffix.txt', 'DomainDataset/Blog_domain+suffix.txt', 'DomainDataset/USGov_domain+suffix.txt', 'DomainDataset/News_Domain+suffix.txt']

In [3]:
class URLCategory(object):
    def __init__(self):
        self.categories = set()
        self.sources = set()
    
    def add(self, category, source):
        self.categories.add(category)
        self.sources.add(source)
        
    def __repr__(self):
        return "URLCategory(caegories=%r, sources=%r)" % (
            self.categories, self.sources
        )
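
A quick sanity check of the container (a minimal sketch; the domain labels and sources below are purely illustrative):

uc = URLCategory()
uc.add("news", "handcoded")   # hypothetical category/source pair
uc.add("news", "wikidata")    # duplicate categories collapse via the set
print uc  # e.g. URLCategory(categories=set(['news']), sources=set(['handcoded', 'wikidata']))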

In [4]:
CAT_REGEX = re.compile(r'.*/([a-zA-Z]+)_.*')  # category name is the filename component before the first underscore
source = "handcoded"
url_categories = defaultdict(URLCategory)
for filename in hand_annotated_lexicons_files:
    catname = CAT_REGEX.match(filename).groups()[0].lower()
    if catname in set(["fakenewschecker", "usgov"]):
        source = catname
    else:
        source = "handcoded"
    if catname == "fakenewschecker":
        catname = "fakenews"
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        for line in fp:
            line = line.strip().lower()
            if line.startswith("www."):  # normalize: drop the leading "www." prefix
                line = line[4:]
            url_categories[line].add(catname, source)
            
url_categories["twitter.com"].add("twitter", source) # Manually add twitter in seperate category


DomainDataset/fakenews_domain+suffix.txt	fakenews
DomainDataset/commercial_domain+suffix.txt	commercial
DomainDataset/fakenewschecker_domain+suffix.txt	fakenews
DomainDataset/satire_domain+suffix.txt	satire
DomainDataset/Videos_domain+suffix.txt	videos
DomainDataset/socialMedia_domain+suffix.txt	socialmedia
DomainDataset/scientific_domain+suffix.txt	scientific
DomainDataset/clickbait_domain+suffix.txt	clickbait
DomainDataset/Blog_domain+suffix.txt	blog
DomainDataset/USGov_domain+suffix.txt	usgov
DomainDataset/News_Domain+suffix.txt	news
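
The same normalization (lowercase, drop a leading "www.") is applied again to the Wikidata files below; a small helper along these lines could factor it out (the function name is hypothetical, a sketch only):

def normalize_domain(domain):
    """Lowercase a domain and strip a leading 'www.' prefix."""
    domain = domain.strip().lower()
    if domain.startswith("www."):
        domain = domain[4:]
    return domain

print normalize_domain("WWW.Example.COM")  # example.com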

In [5]:
wikidata_files = glob("DomainDataset/Wikidata_*.tsv")
print wikidata_files
source = "wikidata"
WIKIDATA_CAT_REGEX = re.compile(r'.*/.*_([a-zA-Z\ ]+).*')  # category name follows the underscore in the filename

for filename in wikidata_files:
    catname = WIKIDATA_CAT_REGEX.match(filename).groups()[0].lower()
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        header = fp.readline()
        for line in fp:
            line = line[:-1].lower().split("\t")[-1]  # the URL is in the last tab-separated column
            if line.strip() == "":
                continue
            try:
                line = line.split("/", 3)[2]  # pull the host out of "scheme://host/..."
            except IndexError:
                print line
                raise
            if line.startswith("www."):
                line = line[4:]
            url_categories[line].add(catname, source)


['DomainDataset/Wikidata_scientific.tsv', 'DomainDataset/Wikidata_videos.tsv', 'DomainDataset/Wikidata_socialmedia.tsv', 'DomainDataset/Wikidata_blog.tsv', 'DomainDataset/Wikidata_news.tsv']
DomainDataset/Wikidata_scientific.tsv	scientific
DomainDataset/Wikidata_videos.tsv	videos
DomainDataset/Wikidata_socialmedia.tsv	socialmedia
DomainDataset/Wikidata_blog.tsv	blog
DomainDataset/Wikidata_news.tsv	news
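
The line.split("/", 3)[2] trick assumes every Wikidata value is a full URL of the form scheme://host/...; the already-imported urlsplit gives a slightly more robust alternative (a sketch, with an illustrative URL):

def extract_domain(url):
    """Pull the host out of a full URL via urlsplit instead of positional string splitting."""
    host = urlsplit(url).netloc.lower()
    if host.startswith("www."):
        host = host[4:]
    return host

print extract_domain("http://www.nature.com/articles/example")  # nature.com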

In [6]:
# Collapse related labels before counting: satire and clickbait fold into
# fakenews, and usgov folds into news.
CAT_MAPPINGS = {
    "satire": "fakenews",
    "clickbait": "fakenews",
    "usgov": "news"
}
df_t = pd.Series(
    Counter(
        sum(([CAT_MAPPINGS.get(x, x) for x in k.categories]
             for k in url_categories.itervalues()),
            []))).to_frame()
df_t.reset_index().rename(
    columns={0: "Counts",
             "index": "URL category"})


Out[6]:
  URL category  Counts
0         blog     194
1   commercial      55
2     fakenews     518
3         news    1988
4   scientific    2962
5  socialmedia      87
6      twitter       1
7       videos      13
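
For an individual domain the mapping simply collapses related labels; for example, using one of the entries visible in the head of the exported TSV further down:

k = url_categories["21stcenturywire.com"]
print sorted(set(CAT_MAPPINGS.get(x, x) for x in k.categories))  # ['fakenews'] (from clickbait|fakenews)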

In [7]:
df_t = pd.Series(
    Counter(
        sum((list(k.sources)
             for k in url_categories.itervalues()),
            []))).to_frame()
df_t.reset_index().rename(
    columns={0: "Counts",
             "index": "URL sources"})


Out[7]:
       URL sources  Counts
0  fakenewschecker     367
1        handcoded     709
2            usgov     888
3         wikidata    3943

In [8]:
len(url_categories), url_categories["facebook.com"]


Out[8]:
(5723,
 URLCategory(categories=set(['socialmedia']), sources=set(['wikidata', 'handcoded'])))

In [9]:
with open("Final Lexicons/URL_CATEGORIES.tsv", "wb+") as fp:
    print >> fp, "URL\tCategories\tSources"
    for url, details in sorted(url_categories.iteritems(), key=lambda x: x[0]):
        if url.strip():
            print >> fp, "%s\t%s\t%s" % (
                url, "|".join(details.categories),
                "|".join(details.sources))
! head Final\ Lexicons/URL_CATEGORIES.tsv


URL	Categories	Sources
100percentfedup.com	fakenews	fakenewschecker|handcoded
199.44.84.82	usgov	usgov
1stcenturywire.com	fakenews	handcoded
209.235.208.145	scientific	wikidata
20min.ch	news	wikidata
20minutes.fr	news	wikidata
21stcenturywire.com	clickbait|fakenews	fakenewschecker|handcoded
24heures.ch	news	wikidata
24hrs.ca	news	wikidata
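
Downstream notebooks can load the lexicon back with pandas and split the pipe-separated fields; a minimal sketch:

url_df = pd.read_csv("Final Lexicons/URL_CATEGORIES.tsv", sep="\t")
url_df["Categories"] = url_df["Categories"].str.split("|")
url_df["Sources"] = url_df["Sources"].str.split("|")
print url_df.head()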

In [ ]: