In [1]:
import requests
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from collections import defaultdict
In [2]:
BASE_URL = "http://www.fakenewschecker.com/"
In [3]:
response = requests.get(BASE_URL)
data = BeautifulSoup(response.text, "lxml")
data.select("#block-system-main > div > div > div.view-content > div > ul > li.views-fluid-grid-inline.views-fluid-grid-item.views-row.views-row-1.views-row-odd.views-row-first")[0]
Out[3]:
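The browser-copied selector above pins down only the first grid item to confirm the page structure. A shorter probe would grab the same element; this is a sketch, assuming every entry is an <li> carrying the views-row class under the same Drupal views block:
In [ ]:
# Shorter equivalent probe: all grid items share the views-row class
data.select("#block-system-main li.views-row")[0]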
In [4]:
URL_CATS = defaultdict(set)
In [6]:
all_data = data.select("#block-system-main > div > div > div.view-content > div > ul > li")
len(all_data)
Out[6]:
In [19]:
def get_url_cats(list_data):
    # The first anchor in each grid item carries the site name and its relative URL
    url_div = list_data.select("a")[0]
    # Category flags are encoded as CSS classes on these nested divs
    label_divs = list_data.select(".views-field > div > div, .views-field > span > div")
    # Flatten the per-div class lists into a single list of category strings
    return url_div.text, url_div["href"], sum([div["class"] for div in label_divs], [])
In [20]:
get_url_cats(all_data[0])
Out[20]:
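The function hands back a (name, relative URL, CSS classes) triple. The values sketched below are hypothetical placeholders to show the shape, not actual site data:
In [ ]:
# Hypothetical shape of the return value (illustrative values, not from the site):
# (u'Some Site', u'some-site', ['views-field', 'category-fake'])
name, href, classes = get_url_cats(all_data[0])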
In [26]:
def get_details(item):
    print item  # progress marker (Python 2 print statement)
    item_name, item_url, item_classes = item
    response = requests.get("%s%s" % (BASE_URL, item_url))
    data = BeautifulSoup(response.text, "lxml")
    return data
In [27]:
item_data = get_details(get_url_cats(all_data[0]))
In [29]:
item_data.select("#block-system-main > div > div > div.field.field-name-field-bias.field-type-taxonomy-term-reference.field-label-inline.clearfix > div.field-items a")
Out[29]:
In [32]:
item_data.select("#block-system-main > div > div > div.field.field-name-field-website.field-type-text.field-label-inline.clearfix > div.field-items > div")[0].text
Out[32]:
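These two probes, the bias labels and the website text, are folded into a revised get_details below.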
In [47]:
def get_details(item):
    item_name, item_url, item_classes = item
    response = requests.get("%s%s" % (BASE_URL, item_url))
    data = BeautifulSoup(response.text, "lxml")
    # Bias tags are taxonomy-term links inside the field-name-field-bias block
    labels = data.select("#block-system-main > div > div > div.field.field-name-field-bias.field-type-taxonomy-term-reference.field-label-inline.clearfix > div.field-items a")
    domain = data.select("#block-system-main > div > div > div.field.field-name-field-website.field-type-text.field-label-inline.clearfix > div.field-items > div")
    if len(domain) < 1:
        # Some entries have no website field; log them and fall back to ""
        print item_name, item_url
        domain = ""
    else:
        domain = domain[0].text
    return item_name, item_url, item_classes, domain, tuple(label.text for label in labels)
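get_details is about to be fanned out over one HTTP request per site, so a defensive wrapper with retries can help against transient failures. This is a sketch; get_details_safe is illustrative and not part of the original notebook:
In [ ]:
import time

def get_details_safe(item, retries=3):
    # Illustrative wrapper: retry transient HTTP failures with exponential backoff
    for attempt in range(retries):
        try:
            return get_details(item)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)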
In [53]:
all_url_cats = [get_url_cats(item_data) for item_data in all_data]
In [55]:
all_url_cats[:10]
Out[55]:
In [54]:
item_data = get_details(all_url_cats[0])
item_data
Out[54]:
In [60]:
all_fake_news = Parallel(n_jobs=5, verbose=10)(
    delayed(get_details)(item_data) for item_data in all_url_cats
)
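A quick sanity check on the parallel results is worthwhile, since get_details substitutes an empty string when the website field is missing. A sketch (row[3] is the domain position in the returned tuple):
In [ ]:
# Count entries that came back without a website field
missing = [row for row in all_fake_news if row[3] == ""]
len(missing)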
In [61]:
all_fake_news[1]
Out[61]:
In [62]:
import pandas as pd
In [69]:
df = pd.DataFrame(all_fake_news, columns=["Name", "Source", "Types", "website", "labels"])
In [70]:
df.head()
Out[70]:
In [71]:
df["labels"].head()
Out[71]:
In [72]:
df["Types"] = df["Types"].apply(lambda x: "|".join(x))
df["labels"] = df["labels"].apply(lambda x: "|".join(x))
df.head()
Out[72]:
In [74]:
df.to_csv("DomainDataset/FakeNewsChecker.txt", sep="\t", index=False, encoding='utf-8')
! head DomainDataset/FakeNewsChecker.txt
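Because the Types and labels columns were pipe-joined before the export, reading the file back means splitting them again. A sketch, assuming no category name contains a literal |:
In [ ]:
# Round-trip check: recover the category lists from the pipe-joined strings
df_back = pd.read_csv("DomainDataset/FakeNewsChecker.txt", sep="\t", encoding="utf-8")
df_back["Types"] = df_back["Types"].fillna("").apply(lambda s: s.split("|"))
df_back["labels"] = df_back["labels"].fillna("").apply(lambda s: s.split("|"))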
In [75]:
import urlparse  # Python 2 stdlib; the Python 3 equivalent is urllib.parse
In [76]:
urlparse.urlsplit("http://www.abeldanger.net")
Out[76]:
SplitResult(scheme='http', netloc='www.abeldanger.net', path='', query='', fragment='')
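urlsplit only fills netloc when the string carries a scheme; a bare domain lands in path instead, which is exactly why the next cell falls back to the raw value:
In [ ]:
# Without a scheme the domain ends up in .path and .netloc stays empty
urlparse.urlsplit("abeldanger.net").netloc  # -> ''
urlparse.urlsplit("abeldanger.net").path    # -> 'abeldanger.net'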
In [79]:
# Keep just the domain part; bare domains (no scheme) have an empty netloc,
# so fall back to the raw value in that case
df.website.apply(lambda x: urlparse.urlsplit(x).netloc
                 if urlparse.urlsplit(x).netloc != "" else x).to_csv(
    "DomainDataset/fakenewschecker_domain+suffix.txt", sep="\t", index=False, header=False)
In [80]:
! head DomainDataset/fakenewschecker_domain+suffix.txt