In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict, Counter

from urlparse import urlsplit, parse_qs

import re

In [2]:
# Collect the manually curated domain-classification files, one category per file
# (filename pattern: "<category>_domain+suffix.txt").
classification_files = glob("DomainDataset/*+suffix.txt")
print classification_files


['DomainDataset/fakenews_domain+suffix.txt', 'DomainDataset/commercial_domain+suffix.txt', 'DomainDataset/fakenewschecker_domain+suffix.txt', 'DomainDataset/satire_domain+suffix.txt', 'DomainDataset/Videos_domain+suffix.txt', 'DomainDataset/socialMedia_domain+suffix.txt', 'DomainDataset/scientific_domain+suffix.txt', 'DomainDataset/clickbait_domain+suffix.txt', 'DomainDataset/Blog_domain+suffix.txt', 'DomainDataset/USGov_domain+suffix.txt', 'DomainDataset/News_Domain+suffix.txt']

In [3]:
# Category name is the filename component between the last "/" and the first "_",
# e.g. "DomainDataset/fakenews_domain+suffix.txt" -> "fakenews".
CAT_REGEX = re.compile(r'.*/([a-zA-Z]+)_.*')

# domain (lowercase, no leading "www.") -> set of category labels
url_categories = defaultdict(set)
for filename in classification_files:
    catname = CAT_REGEX.match(filename).groups()[0].lower()
    # Fold fact-checker domains into the "fakenews" bucket.
    if catname == "fakenewschecker":
        catname = "fakenews"
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        for line in fp:
            line = line.strip().lower()
            # Strip the leading "www." so lookups are consistent across sources.
            if line.startswith("www."):
                line = line[4:]
            url_categories[line].add(catname)

len(url_categories), url_categories["facebook.com"]  # NOTE(review): not the cell's last expression, so it is never displayed
url_categories["twitter.com"].add("twitter") # Manually add twitter in a separate category


DomainDataset/fakenews_domain+suffix.txt	fakenews
DomainDataset/commercial_domain+suffix.txt	commercial
DomainDataset/fakenewschecker_domain+suffix.txt	fakenews
DomainDataset/satire_domain+suffix.txt	satire
DomainDataset/Videos_domain+suffix.txt	videos
DomainDataset/socialMedia_domain+suffix.txt	socialmedia
DomainDataset/scientific_domain+suffix.txt	scientific
DomainDataset/clickbait_domain+suffix.txt	clickbait
DomainDataset/Blog_domain+suffix.txt	blog
DomainDataset/USGov_domain+suffix.txt	usgov
DomainDataset/News_Domain+suffix.txt	news

In [4]:
# Wikidata exports: one TSV per category; the website URL is in the last column.
wikidata_files = glob("DomainDataset/Wikidata_*.tsv")
print wikidata_files

# Category name is the filename part after "_" (letters and spaces).
WIKIDATA_CAT_REGEX = re.compile(r'.*/.*_([a-zA-Z\ ]+).*')

for filename in wikidata_files:
    catname = WIKIDATA_CAT_REGEX.match(filename).groups()[0].lower()
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        header = fp.readline()  # skip the TSV header row
        for line in fp:
            # Keep only the last tab-separated field (the website URL).
            line = line[:-1].lower().split("\t")[-1]
            if line.strip() == "":
                continue
            try:
                # "http://host/path" -> "host" (third "/"-delimited piece).
                line = line.split("/", 3)[2]
            except:
                # Surface the offending line for debugging, then re-raise.
                print line
                raise
            if line.startswith("www."):
                line = line[4:]
            url_categories[line].add(catname)


['DomainDataset/Wikidata_scientific.tsv', 'DomainDataset/Wikidata_videos.tsv', 'DomainDataset/Wikidata_socialmedia.tsv', 'DomainDataset/Wikidata_blog.tsv', 'DomainDataset/Wikidata_news.tsv']
DomainDataset/Wikidata_scientific.tsv	scientific
DomainDataset/Wikidata_videos.tsv	videos
DomainDataset/Wikidata_socialmedia.tsv	socialmedia
DomainDataset/Wikidata_blog.tsv	blog
DomainDataset/Wikidata_news.tsv	news

In [5]:
# Collapse closely related labels into broader buckets before counting.
CAT_MAPPINGS={
    "satire": "fakenews",
    "clickbait": "fakenews",
    "usgov": "news"
}
# Count how many domains carry each (mapped) category label and
# present the tally as a two-column frame.
pd.Series(
    Counter(
        sum((list(CAT_MAPPINGS.get(x, x) for x in k)
             for k in url_categories.itervalues()),
            []))).to_frame().reset_index().rename(
    columns={0: "Counts",
            "index": "URL category"})


Out[5]:
URL category Counts
0 blog 194
1 commercial 55
2 fakenews 519
3 news 1988
4 scientific 2962
5 socialmedia 87
6 twitter 1
7 videos 13

In [6]:
# List domains that still belong to more than one bucket after applying
# CAT_MAPPINGS — these are the genuinely ambiguous cases.
df_t = pd.Series(url_categories)
mapped_cat_counts = df_t.apply(lambda cats: len({CAT_MAPPINGS.get(c, c) for c in cats}))
df_t[mapped_cat_counts > 1]


Out[6]:
ameblo.jp                             {blog, socialmedia}
ap.org                                   {news, fakenews}
beme.com                            {socialmedia, videos}
blogger.com                           {blog, socialmedia}
bls.gov                               {usgov, scientific}
cdc.gov                               {usgov, scientific}
cia.gov                               {usgov, scientific}
dailycaller.com                          {news, fakenews}
dailykos.com                             {news, fakenews}
flickr.com                          {socialmedia, videos}
friendster.com                        {blog, socialmedia}
funk.net                            {socialmedia, videos}
gamepolitics.com                             {blog, news}
instagram.com                       {socialmedia, videos}
livejournal.com                       {blog, socialmedia}
mirror.co.uk                             {news, fakenews}
nature.com                             {news, scientific}
nbn-resolving.de                       {news, scientific}
newscientist.com                       {news, scientific}
nih.gov                               {usgov, scientific}
ning.com                              {blog, socialmedia}
pnas.org                               {news, scientific}
researchgate.net                {socialmedia, scientific}
sagepub.com                            {news, scientific}
sciencemag.org                         {news, scientific}
sites.google.com                      {usgov, scientific}
tumblr.com                            {blog, socialmedia}
twitter.com                        {twitter, socialmedia}
usda.gov                              {usgov, scientific}
vimeo.com                           {socialmedia, videos}
washingtonexaminer.com                   {news, fakenews}
web.archive.org           {news, socialmedia, scientific}
youtube.com                         {socialmedia, videos}
dtype: object

In [7]:
# Persist the domain -> categories mapping as TSV: "<domain>\t<cat1,cat2,...>".
with open("DomainDataset/URL_CATS.txt", "wb+") as fp:
    for url, cats in url_categories.iteritems():
        print >> fp, "%s\t%s" % (url, ",".join(cats))

# NOTE(review): the first output row has an empty domain — presumably a blank
# line slipped through one of the source files; verify upstream cleaning.
! head DomainDataset/URL_CATS.txt


	videos
tap.sagepub.com	scientific
qualitative-research.net	scientific
pe.com	news
iijournals.com	scientific
present.fr	news
rpd.unibo.it	scientific
libertyunyielding.com	fakenews
usma.edu	usgov
cbssports.com	news

Merge the URL counts with their expanded domains and domain categories


In [8]:
# Load per-URL share counts; the raw file has no header row, so name the
# columns directly at read time.
df_url_counts = pd.read_csv("all_urls.txt", sep="\t", header=None,
                            names=["URL", "DOMAIN", "Counts"])
df_url_counts.head()


Out[8]:
URL DOMAIN Counts
0 http://bit.ly/1VzAMWD bit.ly 15148
1 http://bit.ly/2f8U9pg bit.ly 15148
2 http://bit.ly/1Q89AHn bit.ly 15148
3 http://bit.ly/2g0SbXa bit.ly 15148
4 http://bit.ly/29Udgo1 bit.ly 15148

In [9]:
# Load the short-URL -> expanded-URL resolution table (this file has a header).
df = pd.read_csv("url_expanded.merged.txt", sep="\t")
df.head()


Out[9]:
URL EXPANDED EXPANDED_STATUS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0

In [10]:
"http://linkis.com/freebeacon.com/polit/3Fjdv".split("/", 1)


Out[10]:
['http:', '/linkis.com/freebeacon.com/polit/3Fjdv']

In [11]:
parse_qs(urlsplit("https://www.google.com/url?rct=j&sa=t&url=http://www.phoenixnewtimes.com/news/harkins-theaters-cancel-arizona-showing-of-anti-vaccine-film-8255215&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNHJWqaVm8jBMMQhMe39xm5Wtiy-3A").query)


Out[11]:
{'cd': ['CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT'],
 'ct': ['ga'],
 'rct': ['j'],
 'sa': ['t'],
 'url': ['http://www.phoenixnewtimes.com/news/harkins-theaters-cancel-arizona-showing-of-anti-vaccine-film-8255215'],
 'usg': ['AFQjCNHJWqaVm8jBMMQhMe39xm5Wtiy-3A']}

In [12]:
def get_url_domain(x):
    """Extract a normalized domain from a URL string.

    Special cases:
    - linkis.com wraps the target URL in its path ("http://linkis.com/real.com/...");
    - google.com redirect links carry the target in the "url" query parameter
      (handled recursively);
    - blog-platform subdomains (*.wordpress.com, *.tumblr.com, *.blogspot.com)
      are collapsed to the platform domain so category lookups match.
    A leading "www." is stripped in all cases.
    """
    x = urlsplit(x.lower())
    if x.netloc in {"linkis.com", "www.linkis.com"}:
        if x.path[1:] != "":
            # x.path starts with "/", so this rebuilds "http://<wrapped-url>"
            # and re-splits it to recover the wrapped host.
            x = urlsplit("http:/%s" % x.path).netloc
        else:
            x = x.netloc
    elif x.netloc in {"google.com", "www.google.com"}:
        query = parse_qs(x.query)
        if "url" in query:
            # Redirect link: resolve the real target recursively.
            return get_url_domain(query["url"][0])
        x = x.netloc
    else:
        x = x.netloc
    if x.startswith("www."):
        x = x[4:]
    # Collapse any depth of blog-platform subdomain to the platform domain.
    # (The previous one-split version stripped only a single label, so
    # "a.b.wordpress.com" came out as "b.wordpress.com" and missed the
    # category table entry for "wordpress.com".)
    for platform in ("wordpress.com", "tumblr.com", "blogspot.com"):
        if x.endswith("." + platform):
            x = platform
            break
    return x

In [13]:
get_url_domain("https://www.google.com/url?rct=j&sa=t&url=http://www.perthnow.com.au/news/western-australia/social-services-minister-christian-porter-slaps-down-antivaccination-campaigners/news-story/0aa49052ec0598704b05333075581296&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNFAB3aZtdfdVpXOHWzyfqsu0ZSFAg")


Out[13]:
'perthnow.com.au'

In [14]:
# Resolve every expanded URL to its normalized domain.
df["URL_DOMAIN"] = [get_url_domain(url) for url in df.EXPANDED]
df.head()


Out[14]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa

In [15]:
# Attach each domain's category set; unknown domains get the sentinel "UNK".
# (.get avoids inserting default entries into the defaultdict.)
df["URL_CATS"] = df.URL_DOMAIN.map(lambda domain: url_categories.get(domain, "UNK"))
df.head()


Out[15]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com {socialmedia}
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com {commercial}
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK

In [16]:
df[df.URL_CATS != "UNK"].head()


Out[16]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com {socialmedia}
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com {commercial}
5 http://fb.me/241s7UtEJ https://www.facebook.com/story.php?story_fbid=... 0 facebook.com {socialmedia}
6 http://owl.li/XkyUO https://www.youtube.com/watch?v=xtspq5T7B44&fe... 0 youtube.com {socialmedia, videos}
9 http://ln.is/www.rocskincare.com/AGBcS http://linkis.com/www.rocskincare.com/AGBcS 0 rocskincare.com {commercial}

In [17]:
df[df.URL_CATS != "UNK"].shape, df.shape


Out[17]:
((60586, 5), (97512, 5))

In [18]:
df[df.URL_CATS == "UNK"].head(10)


Out[18]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK
7 http://goo.gl/RTQ29 http://localbuzznetwork.com/clarksburg-wv-job-... 0 localbuzznetwork.com UNK
8 http://buff.ly/1SNoZU6 http://weightlosslaw.com/01cdea672dbfe8?utm_co... 0 weightlosslaw.com UNK
10 http://dlvr.it/DD1NHF http://www.datacenterknowledge.com/archives/20... 0 datacenterknowledge.com UNK
11 http://wbur.fm/2fP8Rm7 http://www.wbur.org/npr/501600013/for-clues-to... 0 wbur.org UNK
14 http://dailydose.topratedviral.com/article/wom... http://dailydose.topratedviral.com/article/wom... 1 dailydose.topratedviral.com UNK
18 http://ecowatch.com/2015/11/30/another-earthqu... http://www.ecowatch.com/another-earthquake-hit... 0 ecowatch.com UNK
20 http://www.illinoishomepage.net/weather/weathe... http://www.illinoishomepage.net/weather/weathe... 0 illinoishomepage.net UNK

In [19]:
df[df.URL_DOMAIN == "com"].head()


Out[19]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
1312 http://ln.is/com/hSCIv http://linkis.com/com/hSCIv 0 com UNK
2299 http://ln.is/com/T5iQK http://linkis.com/com/T5iQK 0 com UNK
3379 http://ln.is/com/qM8CB http://linkis.com/com/qM8CB 0 com UNK
4803 http://ln.is/com/Z1VtJ http://linkis.com/com/Z1VtJ 0 com UNK
8959 http://ln.is/com/kLNtH http://linkis.com/com/kLNtH 0 com UNK

In [20]:
df[df.URL_CATS == "UNK"].URL_DOMAIN.value_counts()


Out[20]:
greenmedinfo.com                   90
webogi.com                         80
com                                78
a.bla.es                           67
mediaite.com                       67
ww1.news-freak.com                 66
newslocker.com                     64
teaparty.org                       63
soco.space                         63
infantway.com                      63
thinkprogress.org                  61
choiceandtruth.com                 59
indiewire.com                      57
hotair.com                         56
csoonline.com                      55
disq.us                            55
personalhealthdiary.co             53
twitlonger.com                     53
therealnews.com                    52
sun-sentinel.com                   52
natl.re                            52
reason.com                         52
empleoya.es                        51
esecpro.com                        51
danijobs.com                       51
amp.twimg.com                      50
guns.com                           48
theregister.co.uk                  48
guncrazy.org                       47
usa24.s6-news.com                  46
                                   ..
countynewscenter.com                1
features.wearemel.com               1
littlebitsofeverything.com          1
faasafety.gov                       1
extra-cash-from-home.com            1
equityinlearning.act.org            1
jrhighdropout.com                   1
mcsally.house.gov                   1
safety-blog.compliancesigns.com     1
nowtolove.com.au                    1
derbyinformer.com                   1
presidency.ucsb.edu                 1
lobmx                               1
vaccinenewsdaily.com                1
musictimes.com                      1
micron.com                          1
lps.leadpages.co                    1
baysport.com                        1
fia.com                             1
flemingislandplasticsurgery.com     1
onlinepatiala.com                   1
tipsndiy2017.com                    1
okoa.org                            1
eraofwisdom.org                     1
edp24.co.uk                         1
blog.aent.com                       1
hykfg                               1
moppenheim.tv                       1
tntp.org                            1
rodanandfields.com                  1
Name: URL_DOMAIN, dtype: int64

In [21]:
# Join share counts with expansion/category info on the short URL.
# NOTE(review): this overwrites df_url_counts in place, so re-running the cell
# without re-running In[8] would merge twice (suffixed duplicate columns) —
# run top-to-bottom only.
df_url_counts = df_url_counts.merge(df, how="inner", on="URL")
df_url_counts.shape


Out[21]:
(97512, 7)

In [22]:
df_url_counts.head()


Out[22]:
URL DOMAIN Counts EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://bit.ly/1VzAMWD bit.ly 15148 http://www.autoblog.com/2016/03/22/hyundai-san... 0 autoblog.com {blog}
1 http://bit.ly/2f8U9pg bit.ly 15148 https://www.strongnation.org/articles/312-high... 0 strongnation.org UNK
2 http://bit.ly/1Q89AHn bit.ly 15148 http://www.today.com/video/robert-de-niro-on-a... 0 today.com {blog}
3 http://bit.ly/2g0SbXa bit.ly 15148 http://www.tucsonnewsnow.com/story/33740239/fl... 0 tucsonnewsnow.com UNK
4 http://bit.ly/29Udgo1 bit.ly 15148 http://www.medicaldaily.com/skin-cancer-freckl... 0 medicaldaily.com {blog}

In [23]:
df_url_counts[df_url_counts.URL_CATS == "UNK"].groupby("URL_DOMAIN")["Counts"].first().sort_values(ascending=False).head(10)


Out[23]:
URL_DOMAIN
kristv.com                     15148
okotoksonline.com              15148
ohsonline.com                  15148
oigel.com                      15148
okcfox.com                     15148
technmain.com                  15148
technewsworld.com              15148
oklahomacitynewschannel.com    15148
oklahomainjurylaw.com          15148
calledtomothering.com          15148
Name: Counts, dtype: int64

In [24]:
# Serialize category sets as pipe-joined strings; the "UNK" sentinel is kept as-is.
df.assign(
    URL_CATS = lambda x: x.URL_CATS.apply(lambda cats: "|".join(cats) if cats != "UNK" else cats)
).to_csv("URL_CAT_MAPPINGS.txt", sep="\t", index=False)
! head URL_CAT_MAPPINGS.txt


URL	EXPANDED	EXPANDED_STATUS	URL_DOMAIN	URL_CATS
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	0	investmentnews.com	UNK
http://ow.ly/3avNPe	https://www.reddit.com/r/cahideas/comments/42i3ew/w_farting_mid_rimjob/	0	reddit.com	socialmedia
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	3	stratcom.kma-assc.com	UNK
http://ln.is/mabelsaveforschool.com/gbEtv	http://linkis.com/mabelsaveforschool.com/gbEtv	0	mabelsaveforschool.com	commercial
http://kiw.im/16LfJirkfzE	https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927	0	kiwi.qa	UNK
http://fb.me/241s7UtEJ	https://www.facebook.com/story.php?story_fbid=1251035921618693&id=100001368900242	0	facebook.com	socialmedia
http://owl.li/XkyUO	https://www.youtube.com/watch?v=xtspq5T7B44&feature=em-uploademail	0	youtube.com	socialmedia|videos
http://goo.gl/RTQ29	http://localbuzznetwork.com/clarksburg-wv-job-search/	0	localbuzznetwork.com	UNK
http://buff.ly/1SNoZU6	http://weightlosslaw.com/01cdea672dbfe8?utm_content=bufferb9ed1&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer	0	weightlosslaw.com	UNK

In [25]:
reduce(lambda x, y: x.union(y), url_categories.values())


Out[25]:
{'blog',
 'clickbait',
 'commercial',
 'fakenews',
 'news',
 'satire',
 'scientific',
 'socialmedia',
 'twitter',
 'usgov',
 'videos'}

In [26]:
df.shape


Out[26]:
(97512, 5)

In [27]:
df[df.URL_DOMAIN == 'paper.li'].EXPANDED.head().values


Out[27]:
array([ 'http://paper.li/Dobroyeutro/1321885981?edition_id=eef235d0-9dd3-11e6-913d-0cc47a0d164b',
       'http://paper.li/Dobroyeutro/1321885981?edition_id=a46e1dd0-c043-11e5-a257-0cc47a0d164b',
       'http://paper.li/Dobroyeutro/1321885981?edition_id=4fe14f10-a9a6-11e6-a0e4-0cc47a0d164b',
       'http://paper.li/ag_companies/1312467449?edition_id=97532c20-5658-11e6-acd6-0cc47a0d1609',
       'http://paper.li/Dobroyeutro/1321885981?edition_id=fe29dc20-a813-11e6-a0e4-0cc47a0d164b'], dtype=object)

In [ ]: