In [1]:
# imports libaries
import requests						# HTTP connection
import random						# random generator
import time							# timer
import re 							# regular expression
import pickle						# exports lists
import numpy as np					# numerical computation
from bs4 import BeautifulSoup		# web scraping

In [2]:
# generates random sample of users
n = 10000
max_id = 9000000
userids = random.sample(range(1, max_id), n)
baselink = 'https://www.fanfiction.net/u/'
urls = [baselink + str(userid) for userid in userids]

In [3]:
# initializes dataset
data_profile = []

In [4]:
t0 = time.time()

# collects data from user pages
for i in range(0, n, 1):
    
    # retrieves user identifier
    userid = userids[i]

    # collects data from webpage
    page = ''
    while page == '':
        try:
            page = requests.get(urls[i])
        except:
            print("Connection refused, going to sleep...")
            time.sleep(5)
            continue
    html = requests.get(urls[i]).text
    soup = BeautifulSoup(html, 'html.parser')

    # sets default for missing profile information
    desc = 'NA'
    country = 'NA'
    join = 'NA'
    profile = 'NA'
    nlinks = 'NA'
    
    # collects profile information
    desctag = soup.find('meta', {'name': 'description'})
    countrytag = soup.find('img', {'height': '11', 'width': '16'})
    jointag = soup.find(lambda tag: tag.name == 'span' and 'data-xutime' in tag.attrs)
    profiletag = soup.find_all('p')
    
    if desctag is not None:
        desc = desctag['content']
    if countrytag is not None:
        country = countrytag['title']
    if jointag is not None:
        join = jointag.text
    if profiletag is not None:
        profile = [tag.text for tag in soup.find_all('p')]
        
    nlinks_keys = [key['href'] for key in soup.find_all('a', {'data-toggle': 'tab'})]
    nlinks_values = [value.text for value in soup.find_all('span', {'class': 'badge'})]        
    nlinks = dict(zip(nlinks_keys, nlinks_values))
    
    profile = [userid, desc, country, join, profile, nlinks]
    data_profile.append(profile)

print(time.time() - t0)


11408.562286615372

In [6]:
# exports data
with open('data_profile', 'wb') as fp:
    pickle.dump(data_profile, fp)