In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict, Counter
from urlparse import urlsplit, parse_qs
import re
In [2]:
hand_annotated_lexicons_files = glob("DomainDataset/*+suffix.txt")
print hand_annotated_lexicons_files
In [3]:
class URLCategory(object):
    """Accumulates every category label and source assigned to a single URL."""
    def __init__(self):
        self.categories = set()
        self.sources = set()

    def add(self, category, source):
        self.categories.add(category)
        self.sources.add(source)

    def __repr__(self):
        return "URLCategory(categories=%r, sources=%r)" % (
            self.categories, self.sources
        )
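A quick illustration (not part of the dataset itself) of how URLCategory accumulates labels: because both fields are sets, adding the same category or source twice collapses into a single entry.

# Illustration only: duplicate adds are absorbed by the sets.
example = URLCategory()
example.add("news", "handcoded")
example.add("news", "wikidata")
print example  # e.g. URLCategory(categories=set(['news']), sources=set(['handcoded', 'wikidata'])); set order may vary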
In [4]:
CAT_REGEX = re.compile(r'.*/([a-zA-Z]+)_.*')  # category name is the filename prefix before the first underscore
source = "handcoded"
url_categories = defaultdict(URLCategory)
for filename in hand_annotated_lexicons_files:
    catname = CAT_REGEX.match(filename).groups()[0].lower()
    # Files from fakenewschecker and usgov keep their own source label;
    # everything else is treated as hand-coded.
    if catname in set(["fakenewschecker", "usgov"]):
        source = catname
    else:
        source = "handcoded"
    if catname == "fakenewschecker":
        catname = "fakenews"
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        for line in fp:
            line = line.strip().lower()
            if line.startswith("www."):
                line = line[4:]  # drop the leading "www." so domains are comparable
            url_categories[line].add(catname, source)
url_categories["twitter.com"].add("twitter", "handcoded")  # manually add twitter as a separate category
In [5]:
wikidata_files = glob("DomainDataset/Wikidata_*.tsv")
print wikidata_files
source = "wikidata"
WIKIDATA_CAT_REGEX = re.compile(r'.*/.*_([a-zA-Z\ ]+).*')  # category name follows the underscore in the filename
for filename in wikidata_files:
    catname = WIKIDATA_CAT_REGEX.match(filename).groups()[0].lower()
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        header = fp.readline()  # skip the header row
        for line in fp:
            line = line[:-1].lower().split("\t")[-1]  # last column holds the site URL
            if line.strip() == "":
                continue
            try:
                line = line.split("/", 3)[2]  # host part of http(s)://host/...
            except:
                print line  # show the offending value before re-raising
                raise
            if line.startswith("www."):
                line = line[4:]
            url_categories[line].add(catname, source)
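The split("/", 3)[2] trick relies on the two slashes after the URL scheme; the urlsplit imported in the first cell gives the same host, for example:

# "http://example.com/some/page".split("/", 3) -> ['http:', '', 'example.com', 'some/page']
print "http://example.com/some/page".split("/", 3)[2]   # -> example.com
print urlsplit("http://example.com/some/page").netloc   # -> example.com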
In [6]:
# Collapse the finer-grained labels into broader buckets before counting.
CAT_MAPPINGS = {
    "satire": "fakenews",
    "clickbait": "fakenews",
    "usgov": "news"
}
df_t = pd.Series(
    Counter(
        sum((list(CAT_MAPPINGS.get(x, x) for x in k.categories)
             for k in url_categories.itervalues()),
            []))).to_frame()
df_t.reset_index().rename(
    columns={0: "Counts",
             "index": "URL category"})
Out[6]:
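The sum(..., []) idiom above simply concatenates the per-URL category lists into one flat list before counting; a sketch of an equivalent flattening with itertools.chain, which avoids building the intermediate lists:

from itertools import chain
# Same counts as above, flattening lazily instead of via sum(..., []).
flat = chain.from_iterable(
    (CAT_MAPPINGS.get(x, x) for x in k.categories)
    for k in url_categories.itervalues())
counts = Counter(flat)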
In [7]:
df_t = pd.Series(
    Counter(
        sum((list(x for x in k.sources)
             for k in url_categories.itervalues()),
            []))).to_frame()
df_t.reset_index().rename(
    columns={0: "Counts",
             "index": "URL sources"})
Out[7]:
In [8]:
len(url_categories), url_categories["facebook.com"]
Out[8]:
In [9]:
with open("Final Lexicons/URL_CATEGORIES.tsv", "wb+") as fp:
    print >> fp, "URL\tCategories\tSources"
    for url, details in sorted(url_categories.iteritems(), key=lambda x: x[0]):
        if url.strip():
            # Multiple categories/sources for the same URL are pipe-delimited.
            print >> fp, "%s\t%s\t%s" % (
                url, "|".join(details.categories),
                "|".join(details.sources))
! head Final\ Lexicons/URL_CATEGORIES.tsv
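To consume the exported lexicon elsewhere, the pipe-delimited columns can be split back into sets; a minimal sketch, with the column names taken from the header written above:

# Sketch: reload the lexicon and recover per-URL category/source sets.
lexicon = pd.read_csv("Final Lexicons/URL_CATEGORIES.tsv", sep="\t")
lexicon["Categories"] = lexicon["Categories"].map(lambda s: set(s.split("|")))
lexicon["Sources"] = lexicon["Sources"].map(lambda s: set(s.split("|")))
print lexicon.head()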
In [ ]: