In [1]:
import requests
from bs4 import BeautifulSoup

from joblib import Parallel, delayed
from collections import defaultdict

In [2]:
BASE_URL = "http://www.fakenewschecker.com/"

In [3]:
response = requests.get(BASE_URL)
data = BeautifulSoup(response.text, "lxml")
data.select("#block-system-main > div > div > div.view-content > div > ul > li.views-fluid-grid-inline.views-fluid-grid-item.views-row.views-row-1.views-row-odd.views-row-first")[0]


Out[3]:
<li class="views-fluid-grid-inline views-fluid-grid-item views-row views-row-1 views-row-odd views-row-first"><div class="views-field views-field-field-conspiracy"><div class="field-content"><div class="conspiracy-bias"></div></div></div> <span class="views-field views-field-field-right-bias"> <span class="field-content"><div class="right-bias"></div></span> </span> <span class="views-field views-field-title"> <span class="field-content"><a href="/fake-news-source/100-percent-fed">100 Percent Fed Up <i aria-hidden="true" class="fa fa-chevron-circle-right"></i></a></span> </span><div class="views-field views-field-edit-node"> <span class="field-content"></span></div></li>

In [4]:
URL_CATS = defaultdict(set)

In [6]:
all_data = data.select("#block-system-main > div > div > div.view-content > div > ul > li")
len(all_data)


Out[6]:
376

In [19]:
def get_url_cats(list_data):
    url_div = list_data.select("a")[0]
    label_divs = list_data.select(".views-field > div > div, .views-field > span > div")
    return url_div.text, url_div["href"], sum([div["class"] for div in label_divs], [])

In [20]:
get_url_cats(all_data[0])


Out[20]:
(u'100 Percent Fed Up ',
 '/fake-news-source/100-percent-fed',
 ['conspiracy-bias', 'right-bias'])

In [26]:
def get_details(item):
    print item
    item_name, item_url, item_classes = item
    response = requests.get("%s%s" % (BASE_URL, item_url))
    data = BeautifulSoup(response.text, "lxml")
    return data

In [27]:
item_data = get_details(get_url_cats(all_data[0]))


(u'100 Percent Fed Up ', '/fake-news-source/100-percent-fed', ['conspiracy-bias', 'right-bias'])

In [29]:
item_data.select("#block-system-main > div > div > div.field.field-name-field-bias.field-type-taxonomy-term-reference.field-label-inline.clearfix > div.field-items a")


Out[29]:
[<a datatype="" href="/news-bias/conspiratorial" property="rdfs:label skos:prefLabel" typeof="skos:Concept">conspiratorial</a>,
 <a datatype="" href="/news-bias/political" property="rdfs:label skos:prefLabel" typeof="skos:Concept">political</a>,
 <a datatype="" href="/news-bias/psuedoscience" property="rdfs:label skos:prefLabel" typeof="skos:Concept">psuedoscience</a>,
 <a datatype="" href="/news-bias/right-bias" property="rdfs:label skos:prefLabel" typeof="skos:Concept">right bias</a>,
 <a datatype="" href="/news-bias/war" property="rdfs:label skos:prefLabel" typeof="skos:Concept">war</a>]

In [32]:
item_data.select("#block-system-main > div > div > div.field.field-name-field-website.field-type-text.field-label-inline.clearfix > div.field-items > div")[0].text


Out[32]:
u'100percentfedup.com'

In [47]:
def get_details(item):
    item_name, item_url, item_classes = item
    response = requests.get("%s%s" % (BASE_URL, item_url))
    data = BeautifulSoup(response.text, "lxml")
    labels = data.select("#block-system-main > div > div > div.field.field-name-field-bias.field-type-taxonomy-term-reference.field-label-inline.clearfix > div.field-items a")
    domain = data.select("#block-system-main > div > div > div.field.field-name-field-website.field-type-text.field-label-inline.clearfix > div.field-items > div")
    if len(domain) < 1:
        print item_name, item_url
        domain = ""
    else:
        domain = domain[0].text
    return item_name, item_url, item_classes, domain, tuple(label.text for label in labels)

In [53]:
all_url_cats = [get_url_cats(item_data) for item_data in all_data]

In [55]:
all_url_cats[:10]


Out[55]:
[(u'100 Percent Fed Up ',
  '/fake-news-source/100-percent-fed',
  ['conspiracy-bias', 'right-bias']),
 (u'21st Century Wire ',
  '/fake-news-source/21st-century-wire',
  ['conspiracy-bias', 'right-bias']),
 (u'365 USA News ', '/fake-news-source/365-usa-news', ['right-bias']),
 (u'70News ', '/fake-news-source/70news', ['right-bias']),
 (u'ABCNews.com.co ', '/fake-news-source/abcnewscomco', ['right-bias']),
 (u'Abel Danger ',
  '/fake-news-source/abel-danger',
  ['conspiracy-bias', 'right-bias']),
 (u'Above Top Secret ',
  '/fake-news-source/above-top-secret',
  ['conspiracy-bias', 'right-bias']),
 (u'Activist Post ', '/fake-news-source/activist-post', ['right-bias']),
 (u'Addicting Info ', '/fake-news-source/addicting-info', ['left-bias']),
 (u'Advocate ', '/fake-news-source/advocate', ['left-bias'])]

In [54]:
item_data = get_details(all_url_cats[0])
item_data


Out[54]:
(u'100 Percent Fed Up ',
 '/fake-news-source/100-percent-fed',
 ['conspiracy-bias', 'right-bias'],
 u'100percentfedup.com',
 (u'conspiratorial', u'political', u'psuedoscience', u'right bias', u'war'))

In [60]:
all_fake_news = Parallel(n_jobs=5, verbose=10)(delayed(get_details)(item_data)
                                                for item_data in all_url_cats
                                               )


[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    2.1s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    3.9s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    5.8s
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    8.1s
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   10.9s
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   14.1s
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:   17.4s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   20.7s
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:   24.7s
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:   28.6s
DC Clothesline  /fake-news-source/dc-clothesline
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed:   32.7s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:   38.7s
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed:   44.5s
Herman Cain  /fake-news-source/herman-cain
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   50.6s
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed:   57.1s
Liberty Talk FM  /fake-news-source/liberty-talk-fm
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done 211 tasks      | elapsed:  1.2min
[Parallel(n_jobs=5)]: Done 232 tasks      | elapsed:  1.3min
[Parallel(n_jobs=5)]: Done 255 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 303 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done 328 tasks      | elapsed:  1.8min
Truth Broadcast Network  /fake-news-source/truth-broadcast-network
US Chronicle  /fake-news-source/us-chronicle
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  2.0min
[Parallel(n_jobs=5)]: Done 376 out of 376 | elapsed:  2.1min finished

In [61]:
all_fake_news[1]


Out[61]:
(u'21st Century Wire ',
 '/fake-news-source/21st-century-wire',
 ['conspiracy-bias', 'right-bias'],
 u'21stcenturywire.com',
 (u'conspiratorial', u'political', u'psuedoscience', u'right bias', u'war'))

In [62]:
import pandas as pd

In [69]:
df = pd.DataFrame(all_fake_news, columns=["Name", "Source", "Types", "website", "labels"])

In [70]:
df.head()


Out[70]:
Name Source Types website labels
0 100 Percent Fed Up /fake-news-source/100-percent-fed [conspiracy-bias, right-bias] 100percentfedup.com (conspiratorial, political, psuedoscience, rig...
1 21st Century Wire /fake-news-source/21st-century-wire [conspiracy-bias, right-bias] 21stcenturywire.com (conspiratorial, political, psuedoscience, rig...
2 365 USA News /fake-news-source/365-usa-news [right-bias] 365usanews .com (conspiratorial, political, right bias)
3 70News /fake-news-source/70news [right-bias] 70news.wordpress.com (conspiratorial, political, psuedoscience, rig...
4 ABCNews.com.co /fake-news-source/abcnewscomco [right-bias] www.abcnews.com.co (conspiratorial, political, psuedoscience, rig...

In [71]:
df["labels"].head()


Out[71]:
0    (conspiratorial, political, psuedoscience, rig...
1    (conspiratorial, political, psuedoscience, rig...
2              (conspiratorial, political, right bias)
3    (conspiratorial, political, psuedoscience, rig...
4    (conspiratorial, political, psuedoscience, rig...
Name: labels, dtype: object

In [72]:
df["Types"] = df["Types"].apply(lambda x: "|".join(x))
df["labels"] = df["labels"].apply(lambda x: "|".join(x))
df.head()


Out[72]:
Name Source Types website labels
0 100 Percent Fed Up /fake-news-source/100-percent-fed conspiracy-bias|right-bias 100percentfedup.com conspiratorial|political|psuedoscience|right b...
1 21st Century Wire /fake-news-source/21st-century-wire conspiracy-bias|right-bias 21stcenturywire.com conspiratorial|political|psuedoscience|right b...
2 365 USA News /fake-news-source/365-usa-news right-bias 365usanews .com conspiratorial|political|right bias
3 70News /fake-news-source/70news right-bias 70news.wordpress.com conspiratorial|political|psuedoscience|right b...
4 ABCNews.com.co /fake-news-source/abcnewscomco right-bias www.abcnews.com.co conspiratorial|political|psuedoscience|right b...

In [74]:
df.to_csv("DomainDataset/FakeNewsChecker.txt", sep="\t", index=False, encoding='utf-8')
! head DomainDataset/FakeNewsChecker.txt


Name	Source	Types	website	labels
100 Percent Fed Up 	/fake-news-source/100-percent-fed	conspiracy-bias|right-bias	100percentfedup.com	conspiratorial|political|psuedoscience|right bias|war
21st Century Wire 	/fake-news-source/21st-century-wire	conspiracy-bias|right-bias	21stcenturywire.com	conspiratorial|political|psuedoscience|right bias|war
365 USA News 	/fake-news-source/365-usa-news	right-bias	365usanews .com	conspiratorial|political|right bias
70News 	/fake-news-source/70news	right-bias	70news.wordpress.com	conspiratorial|political|psuedoscience|right bias|war
ABCNews.com.co 	/fake-news-source/abcnewscomco	right-bias	www.abcnews.com.co	conspiratorial|political|psuedoscience|right bias|war
Abel Danger 	/fake-news-source/abel-danger	conspiracy-bias|right-bias	http://www.abeldanger.net	conspiratorial|political|psuedoscience|right bias|war
Above Top Secret 	/fake-news-source/above-top-secret	conspiracy-bias|right-bias	www.abovetopsecret.com	conspiratorial|political
Activist Post 	/fake-news-source/activist-post	right-bias	www.activistpost.com	conspiratorial|editorial|political|psuedoscience|right bias|war
Addicting Info 	/fake-news-source/addicting-info	left-bias	addictinginfo.org	left bias|political

In [75]:
import urlparse

In [76]:
urlparse.urlsplit("http://www.abeldanger.net")


Out[76]:
SplitResult(scheme='http', netloc='www.abeldanger.net', path='', query='', fragment='')

In [ ]:


In [79]:
df.website.apply(lambda x: urlparse.urlsplit(x).netloc
                 if urlparse.urlsplit(x).netloc != "" else x).to_csv(
    "DomainDataset/fakenewschecker_domain+suffix.txt", sep="\t", index=False, header=False)

In [80]:
! head DomainDataset/fakenewschecker_domain+suffix.txt


100percentfedup.com
21stcenturywire.com
365usanews .com
70news.wordpress.com
www.abcnews.com.co
www.abeldanger.net
www.abovetopsecret.com
www.activistpost.com
addictinginfo.org
www.advocate.com

In [ ]: