In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict, Counter

from urlparse import urlsplit, parse_qs

import re

In [2]:
# Collect the manually curated domain-classification files, one category per file
# (filename pattern: "<category>_domain+suffix.txt").
classification_files = glob("DomainDataset/*+suffix.txt")
print classification_files


['DomainDataset/fakenews_domain+suffix.txt', 'DomainDataset/commercial_domain+suffix.txt', 'DomainDataset/fakenewschecker_domain+suffix.txt', 'DomainDataset/satire_domain+suffix.txt', 'DomainDataset/Videos_domain+suffix.txt', 'DomainDataset/socialMedia_domain+suffix.txt', 'DomainDataset/scientific_domain+suffix.txt', 'DomainDataset/clickbait_domain+suffix.txt', 'DomainDataset/Blog_domain+suffix.txt', 'DomainDataset/USGov_domain+suffix.txt', 'DomainDataset/News_Domain+suffix.txt']

In [3]:
# Category name is the filename component between the last "/" and the first "_",
# e.g. "DomainDataset/fakenews_domain+suffix.txt" -> "fakenews".
CAT_REGEX = re.compile(r'.*/([a-zA-Z]+)_.*')

# domain (lowercase, no leading "www.") -> set of category labels
url_categories = defaultdict(set)
for filename in classification_files:
    catname = CAT_REGEX.match(filename).groups()[0].lower()
    # Fold fact-checker domains into the "fakenews" bucket.
    if catname == "fakenewschecker":
        catname = "fakenews"
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        for line in fp:
            line = line.strip().lower()
            # Strip the leading "www." so lookups are consistent across sources.
            if line.startswith("www."):
                line = line[4:]
            url_categories[line].add(catname)

len(url_categories), url_categories["facebook.com"]  # NOTE(review): not the cell's last expression, so it is never displayed
url_categories["twitter.com"].add("twitter") # Manually add twitter in a separate category


DomainDataset/fakenews_domain+suffix.txt	fakenews
DomainDataset/commercial_domain+suffix.txt	commercial
DomainDataset/fakenewschecker_domain+suffix.txt	fakenews
DomainDataset/satire_domain+suffix.txt	satire
DomainDataset/Videos_domain+suffix.txt	videos
DomainDataset/socialMedia_domain+suffix.txt	socialmedia
DomainDataset/scientific_domain+suffix.txt	scientific
DomainDataset/clickbait_domain+suffix.txt	clickbait
DomainDataset/Blog_domain+suffix.txt	blog
DomainDataset/USGov_domain+suffix.txt	usgov
DomainDataset/News_Domain+suffix.txt	news

In [4]:
# Wikidata exports: one TSV per category; the website URL is in the last column.
wikidata_files = glob("DomainDataset/Wikidata_*.tsv")
print wikidata_files

# Category name is the filename part after "_" (letters and spaces).
WIKIDATA_CAT_REGEX = re.compile(r'.*/.*_([a-zA-Z\ ]+).*')

for filename in wikidata_files:
    catname = WIKIDATA_CAT_REGEX.match(filename).groups()[0].lower()
    print "%s\t%s" % (filename, catname)
    with open(filename) as fp:
        header = fp.readline()  # skip the TSV header row
        for line in fp:
            # Keep only the last tab-separated field (the website URL).
            line = line[:-1].lower().split("\t")[-1]
            if line.strip() == "":
                continue
            try:
                # "http://host/path" -> "host" (third "/"-delimited piece).
                line = line.split("/", 3)[2]
            except:
                # Surface the offending line for debugging, then re-raise.
                print line
                raise
            if line.startswith("www."):
                line = line[4:]
            url_categories[line].add(catname)


['DomainDataset/Wikidata_scientific.tsv', 'DomainDataset/Wikidata_videos.tsv', 'DomainDataset/Wikidata_socialmedia.tsv', 'DomainDataset/Wikidata_blog.tsv', 'DomainDataset/Wikidata_news.tsv']
DomainDataset/Wikidata_scientific.tsv	scientific
DomainDataset/Wikidata_videos.tsv	videos
DomainDataset/Wikidata_socialmedia.tsv	socialmedia
DomainDataset/Wikidata_blog.tsv	blog
DomainDataset/Wikidata_news.tsv	news

In [5]:
# Collapse closely related labels into broader buckets before counting.
CAT_MAPPINGS={
    "satire": "fakenews",
    "clickbait": "fakenews",
    "usgov": "news"
}
# Count how many domains carry each (mapped) category label and
# present the tally as a two-column frame.
pd.Series(
    Counter(
        sum((list(CAT_MAPPINGS.get(x, x) for x in k)
             for k in url_categories.itervalues()),
            []))).to_frame().reset_index().rename(
    columns={0: "Counts",
            "index": "URL category"})


Out[5]:
URL category Counts
0 blog 194
1 commercial 55
2 fakenews 519
3 news 1988
4 scientific 2962
5 socialmedia 87
6 twitter 1
7 videos 13

In [6]:
# List domains that still belong to more than one bucket after applying
# CAT_MAPPINGS — these are the genuinely ambiguous cases.
df_t = pd.Series(url_categories)
mapped_cat_counts = df_t.apply(lambda cats: len({CAT_MAPPINGS.get(c, c) for c in cats}))
df_t[mapped_cat_counts > 1]


Out[6]:
ameblo.jp                             {blog, socialmedia}
ap.org                                   {news, fakenews}
beme.com                            {socialmedia, videos}
blogger.com                           {blog, socialmedia}
bls.gov                               {usgov, scientific}
cdc.gov                               {usgov, scientific}
cia.gov                               {usgov, scientific}
dailycaller.com                          {news, fakenews}
dailykos.com                             {news, fakenews}
flickr.com                          {socialmedia, videos}
friendster.com                        {blog, socialmedia}
funk.net                            {socialmedia, videos}
gamepolitics.com                             {blog, news}
instagram.com                       {socialmedia, videos}
livejournal.com                       {blog, socialmedia}
mirror.co.uk                             {news, fakenews}
nature.com                             {news, scientific}
nbn-resolving.de                       {news, scientific}
newscientist.com                       {news, scientific}
nih.gov                               {usgov, scientific}
ning.com                              {blog, socialmedia}
pnas.org                               {news, scientific}
researchgate.net                {socialmedia, scientific}
sagepub.com                            {news, scientific}
sciencemag.org                         {news, scientific}
sites.google.com                      {usgov, scientific}
tumblr.com                            {blog, socialmedia}
twitter.com                        {twitter, socialmedia}
usda.gov                              {usgov, scientific}
vimeo.com                           {socialmedia, videos}
washingtonexaminer.com                   {news, fakenews}
web.archive.org           {news, socialmedia, scientific}
youtube.com                         {socialmedia, videos}
dtype: object

In [7]:
# Persist the domain -> categories mapping as TSV: "<domain>\t<cat1,cat2,...>".
with open("DomainDataset/URL_CATS.txt", "wb+") as fp:
    for url, cats in url_categories.iteritems():
        print >> fp, "%s\t%s" % (url, ",".join(cats))

# NOTE(review): the first output row has an empty domain — presumably a blank
# line slipped through one of the source files; verify upstream cleaning.
! head DomainDataset/URL_CATS.txt


	videos
tap.sagepub.com	scientific
qualitative-research.net	scientific
pe.com	news
iijournals.com	scientific
present.fr	news
rpd.unibo.it	scientific
libertyunyielding.com	fakenews
usma.edu	usgov
cbssports.com	news

Merge the URL counts with their expanded domains and domain categories


In [8]:
# Load per-URL share counts; the raw file has no header row, so name the
# columns directly at read time.
df_url_counts = pd.read_csv("all_urls.txt", sep="\t", header=None,
                            names=["URL", "DOMAIN", "Counts"])
df_url_counts.head()


Out[8]:
URL DOMAIN Counts
0 http://bit.ly/1VzAMWD bit.ly 15148
1 http://bit.ly/2f8U9pg bit.ly 15148
2 http://bit.ly/1Q89AHn bit.ly 15148
3 http://bit.ly/2g0SbXa bit.ly 15148
4 http://bit.ly/29Udgo1 bit.ly 15148

In [9]:
# Load the short-URL -> expanded-URL resolution table (this file has a header).
df = pd.read_csv("url_expanded.merged.txt", sep="\t")
df.head()


Out[9]:
URL EXPANDED EXPANDED_STATUS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0

In [10]:
"http://linkis.com/freebeacon.com/polit/3Fjdv".split("/", 1)


Out[10]:
['http:', '/linkis.com/freebeacon.com/polit/3Fjdv']

In [11]:
parse_qs(urlsplit("https://www.google.com/url?rct=j&sa=t&url=http://www.phoenixnewtimes.com/news/harkins-theaters-cancel-arizona-showing-of-anti-vaccine-film-8255215&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNHJWqaVm8jBMMQhMe39xm5Wtiy-3A").query)


Out[11]:
{'cd': ['CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT'],
 'ct': ['ga'],
 'rct': ['j'],
 'sa': ['t'],
 'url': ['http://www.phoenixnewtimes.com/news/harkins-theaters-cancel-arizona-showing-of-anti-vaccine-film-8255215'],
 'usg': ['AFQjCNHJWqaVm8jBMMQhMe39xm5Wtiy-3A']}

In [12]:
def get_url_domain(x):
    """Extract a normalized domain from a URL string.

    Special cases:
    - linkis.com wraps the target URL in its path ("http://linkis.com/real.com/...");
    - google.com redirect links carry the target in the "url" query parameter
      (handled recursively);
    - blog-platform subdomains (*.wordpress.com, *.tumblr.com, *.blogspot.com)
      are collapsed to the platform domain so category lookups match.
    A leading "www." is stripped in all cases.
    """
    x = urlsplit(x.lower())
    if x.netloc in {"linkis.com", "www.linkis.com"}:
        if x.path[1:] != "":
            # x.path starts with "/", so this rebuilds "http://<wrapped-url>"
            # and re-splits it to recover the wrapped host.
            x = urlsplit("http:/%s" % x.path).netloc
        else:
            x = x.netloc
    elif x.netloc in {"google.com", "www.google.com"}:
        query = parse_qs(x.query)
        if "url" in query:
            # Redirect link: resolve the real target recursively.
            return get_url_domain(query["url"][0])
        x = x.netloc
    else:
        x = x.netloc
    if x.startswith("www."):
        x = x[4:]
    # Collapse any depth of blog-platform subdomain to the platform domain.
    # (The previous one-split version stripped only a single label, so
    # "a.b.wordpress.com" came out as "b.wordpress.com" and missed the
    # category table entry for "wordpress.com".)
    for platform in ("wordpress.com", "tumblr.com", "blogspot.com"):
        if x.endswith("." + platform):
            x = platform
            break
    return x

In [13]:
get_url_domain("https://www.google.com/url?rct=j&sa=t&url=http://www.perthnow.com.au/news/western-australia/social-services-minister-christian-porter-slaps-down-antivaccination-campaigners/news-story/0aa49052ec0598704b05333075581296&ct=ga&cd=CAIyGjE2ZDBhYmZjOTAzMjkyMTk6Y29tOmVuOlVT&usg=AFQjCNFAB3aZtdfdVpXOHWzyfqsu0ZSFAg")


Out[13]:
'perthnow.com.au'

In [14]:
# Resolve every expanded URL to its normalized domain.
df["URL_DOMAIN"] = [get_url_domain(url) for url in df.EXPANDED]
df.head()


Out[14]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa

In [15]:
# Attach each domain's category set; unknown domains get the sentinel "UNK".
# (.get avoids inserting default entries into the defaultdict.)
df["URL_CATS"] = df.URL_DOMAIN.map(lambda domain: url_categories.get(domain, "UNK"))
df.head()


Out[15]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com {socialmedia}
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com {commercial}
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK

In [16]:
df[df.URL_CATS != "UNK"].head()


Out[16]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com {socialmedia}
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com {commercial}
5 http://fb.me/241s7UtEJ https://www.facebook.com/story.php?story_fbid=... 0 facebook.com {socialmedia}
6 http://owl.li/XkyUO https://www.youtube.com/watch?v=xtspq5T7B44&fe... 0 youtube.com {socialmedia, videos}
9 http://ln.is/www.rocskincare.com/AGBcS http://linkis.com/www.rocskincare.com/AGBcS 0 rocskincare.com {commercial}

In [17]:
df[df.URL_CATS != "UNK"].shape, df.shape


Out[17]:
((60586, 5), (97512, 5))

In [18]:
df[df.URL_CATS == "UNK"].head(10)


Out[18]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK
7 http://goo.gl/RTQ29 http://localbuzznetwork.com/clarksburg-wv-job-... 0 localbuzznetwork.com UNK
8 http://buff.ly/1SNoZU6 http://weightlosslaw.com/01cdea672dbfe8?utm_co... 0 weightlosslaw.com UNK
10 http://dlvr.it/DD1NHF http://www.datacenterknowledge.com/archives/20... 0 datacenterknowledge.com UNK
11 http://wbur.fm/2fP8Rm7 http://www.wbur.org/npr/501600013/for-clues-to... 0 wbur.org UNK
14 http://dailydose.topratedviral.com/article/wom... http://dailydose.topratedviral.com/article/wom... 1 dailydose.topratedviral.com UNK
18 http://ecowatch.com/2015/11/30/another-earthqu... http://www.ecowatch.com/another-earthquake-hit... 0 ecowatch.com UNK
20 http://www.illinoishomepage.net/weather/weathe... http://www.illinoishomepage.net/weather/weathe... 0 illinoishomepage.net UNK

In [19]:
df[df.URL_DOMAIN == "com"].head()


Out[19]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
1312 http://ln.is/com/hSCIv http://linkis.com/com/hSCIv 0 com UNK
2299 http://ln.is/com/T5iQK http://linkis.com/com/T5iQK 0 com UNK
3379 http://ln.is/com/qM8CB http://linkis.com/com/qM8CB 0 com UNK
4803 http://ln.is/com/Z1VtJ http://linkis.com/com/Z1VtJ 0 com UNK
8959 http://ln.is/com/kLNtH http://linkis.com/com/kLNtH 0 com UNK

In [20]:
df[df.URL_CATS == "UNK"].URL_DOMAIN.value_counts()


Out[20]:
greenmedinfo.com                   90
webogi.com                         80
com                                78
a.bla.es                           67
mediaite.com                       67
ww1.news-freak.com                 66
newslocker.com                     64
teaparty.org                       63
soco.space                         63
infantway.com                      63
thinkprogress.org                  61
choiceandtruth.com                 59
indiewire.com                      57
hotair.com                         56
csoonline.com                      55
disq.us                            55
personalhealthdiary.co             53
twitlonger.com                     53
therealnews.com                    52
sun-sentinel.com                   52
natl.re                            52
reason.com                         52
empleoya.es                        51
esecpro.com                        51
danijobs.com                       51
amp.twimg.com                      50
guns.com                           48
theregister.co.uk                  48
guncrazy.org                       47
usa24.s6-news.com                  46
                                   ..
countynewscenter.com                1
features.wearemel.com               1
littlebitsofeverything.com          1
faasafety.gov                       1
extra-cash-from-home.com            1
equityinlearning.act.org            1
jrhighdropout.com                   1
mcsally.house.gov                   1
safety-blog.compliancesigns.com     1
nowtolove.com.au                    1
derbyinformer.com                   1
presidency.ucsb.edu                 1
lobmx                               1
vaccinenewsdaily.com                1
musictimes.com                      1
micron.com                          1
lps.leadpages.co                    1
baysport.com                        1
fia.com                             1
flemingislandplasticsurgery.com     1
onlinepatiala.com                   1
tipsndiy2017.com                    1
okoa.org                            1
eraofwisdom.org                     1
edp24.co.uk                         1
blog.aent.com                       1
hykfg                               1
moppenheim.tv                       1
tntp.org                            1
rodanandfields.com                  1
Name: URL_DOMAIN, dtype: int64

In [21]:
# Join share counts with expansion/category info on the short URL.
# NOTE(review): this overwrites df_url_counts in place, so re-running the cell
# without re-running In[8] would merge twice (suffixed duplicate columns) —
# run top-to-bottom only.
df_url_counts = df_url_counts.merge(df, how="inner", on="URL")
df_url_counts.shape


Out[21]:
(97512, 7)

In [22]:
df_url_counts.head()


Out[22]:
URL DOMAIN Counts EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://bit.ly/1VzAMWD bit.ly 15148 http://www.autoblog.com/2016/03/22/hyundai-san... 0 autoblog.com {blog}
1 http://bit.ly/2f8U9pg bit.ly 15148 https://www.strongnation.org/articles/312-high... 0 strongnation.org UNK
2 http://bit.ly/1Q89AHn bit.ly 15148 http://www.today.com/video/robert-de-niro-on-a... 0 today.com {blog}
3 http://bit.ly/2g0SbXa bit.ly 15148 http://www.tucsonnewsnow.com/story/33740239/fl... 0 tucsonnewsnow.com UNK
4 http://bit.ly/29Udgo1 bit.ly 15148 http://www.medicaldaily.com/skin-cancer-freckl... 0 medicaldaily.com {blog}

In [23]:
df_url_counts[df_url_counts.URL_CATS == "UNK"].groupby("URL_DOMAIN")["Counts"].first().sort_values(ascending=False).head(10)


Out[23]:
URL_DOMAIN
kristv.com                     15148
okotoksonline.com              15148
ohsonline.com                  15148
oigel.com                      15148
okcfox.com                     15148
technmain.com                  15148
technewsworld.com              15148
oklahomacitynewschannel.com    15148
oklahomainjurylaw.com          15148
calledtomothering.com          15148
Name: Counts, dtype: int64

In [24]:
# Serialize category sets as pipe-joined strings; the "UNK" sentinel is kept as-is.
df.assign(
    URL_CATS = lambda x: x.URL_CATS.apply(lambda cats: "|".join(cats) if cats != "UNK" else cats)
).to_csv("URL_CAT_MAPPINGS.txt", sep="\t", index=False)
! head URL_CAT_MAPPINGS.txt


URL	EXPANDED	EXPANDED_STATUS	URL_DOMAIN	URL_CATS
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	0	investmentnews.com	UNK
http://ow.ly/3avNPe	https://www.reddit.com/r/cahideas/comments/42i3ew/w_farting_mid_rimjob/	0	reddit.com	socialmedia
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	3	stratcom.kma-assc.com	UNK
http://ln.is/mabelsaveforschool.com/gbEtv	http://linkis.com/mabelsaveforschool.com/gbEtv	0	mabelsaveforschool.com	commercial
http://kiw.im/16LfJirkfzE	https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927	0	kiwi.qa	UNK
http://fb.me/241s7UtEJ	https://www.facebook.com/story.php?story_fbid=1251035921618693&id=100001368900242	0	facebook.com	socialmedia
http://owl.li/XkyUO	https://www.youtube.com/watch?v=xtspq5T7B44&feature=em-uploademail	0	youtube.com	socialmedia|videos
http://goo.gl/RTQ29	http://localbuzznetwork.com/clarksburg-wv-job-search/	0	localbuzznetwork.com	UNK
http://buff.ly/1SNoZU6	http://weightlosslaw.com/01cdea672dbfe8?utm_content=bufferb9ed1&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer	0	weightlosslaw.com	UNK

In [25]:
reduce(lambda x, y: x.union(y), url_categories.values())


Out[25]:
{'blog',
 'clickbait',
 'commercial',
 'fakenews',
 'news',
 'satire',
 'scientific',
 'socialmedia',
 'twitter',
 'usgov',
 'videos'}

In [26]:
df.shape


Out[26]:
(97512, 5)

In [27]:
df[df.URL_DOMAIN == 'paper.li'].EXPANDED.head().values


Out[27]:
array([ 'http://paper.li/Dobroyeutro/1321885981?edition_id=eef235d0-9dd3-11e6-913d-0cc47a0d164b',
       'http://paper.li/Dobroyeutro/1321885981?edition_id=a46e1dd0-c043-11e5-a257-0cc47a0d164b',
       'http://paper.li/Dobroyeutro/1321885981?edition_id=4fe14f10-a9a6-11e6-a0e4-0cc47a0d164b',
       'http://paper.li/ag_companies/1312467449?edition_id=97532c20-5658-11e6-acd6-0cc47a0d1609',
       'http://paper.li/Dobroyeutro/1321885981?edition_id=fe29dc20-a813-11e6-a0e4-0cc47a0d164b'], dtype=object)

In [ ]: