In [1]:
# imports libraries
import requests # HTTP requests
import random # random sampling
import time # timing and delays
import re # regular expressions
import pickle # object serialization
import numpy as np # numerical computation
from bs4 import BeautifulSoup # HTML parsing
In [2]:
# generates a random sample of user IDs
n = 10000
max_id = 9000000
userids = random.sample(range(1, max_id), n)
baselink = 'https://www.fanfiction.net/u/'
urls = [baselink + str(userid) for userid in userids]
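If the sample needs to be reproducible across runs, the generator can be seeded before sampling; a minimal sketch, where the seed value 42 is an arbitrary illustrative choice:
In [ ]:
# optional: seed the generator so the same user sample can be redrawn later
random.seed(42) # arbitrary illustrative seed
userids = random.sample(range(1, max_id), n)
urls = [baselink + str(userid) for userid in userids]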
In [3]:
# initializes dataset
data_profile = []
In [4]:
t0 = time.time()
# collects data from user pages
for i in range(n):
    # retrieves user identifier
    userid = userids[i]
    # fetches the page, retrying after a short sleep if the connection fails
    page = ''
    while page == '':
        try:
            page = requests.get(urls[i])
        except requests.exceptions.RequestException:
            print("Connection refused, going to sleep...")
            time.sleep(5)
            continue
    # parses the response fetched above (avoids issuing a second request)
    soup = BeautifulSoup(page.text, 'html.parser')
    # sets defaults for missing profile information
    desc = 'NA'
    country = 'NA'
    join = 'NA'
    profile = 'NA'
    nlinks = 'NA'
    # collects profile information
    desctag = soup.find('meta', {'name': 'description'})
    countrytag = soup.find('img', {'height': '11', 'width': '16'})
    jointag = soup.find(lambda tag: tag.name == 'span' and 'data-xutime' in tag.attrs)
    profiletags = soup.find_all('p')
    if desctag is not None:
        desc = desctag['content']
    if countrytag is not None:
        country = countrytag['title']
    if jointag is not None:
        join = jointag.text
    if profiletags: # find_all returns a list, so test for emptiness rather than None
        profile = [tag.text for tag in profiletags]
    nlinks_keys = [key['href'] for key in soup.find_all('a', {'data-toggle': 'tab'})]
    nlinks_values = [value.text for value in soup.find_all('span', {'class': 'badge'})]
    nlinks = dict(zip(nlinks_keys, nlinks_values))
    data_profile.append([userid, desc, country, join, profile, nlinks])
print(time.time() - t0)
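Each entry of data_profile is a six-element list; a quick sanity check on the first collected record, using purely illustrative field labels that are not part of the scraped page:
In [ ]:
# inspects the first collected record; the labels are descriptive, not scraped
fields = ['userid', 'desc', 'country', 'join', 'profile', 'nlinks']
print(dict(zip(fields, data_profile[0])))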
In [6]:
# exports data
with open('data_profile', 'wb') as fp:
pickle.dump(data_profile, fp)
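To confirm the export round-trips, the file can be read back with pickle.load; a minimal check:
In [ ]:
# reloads the pickled dataset and verifies the record count
with open('data_profile', 'rb') as fp:
    data_check = pickle.load(fp)
print(len(data_check) == n)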