In [ ]:
# imports libraries
import requests # HTTP connection
import random # random generator
import time # timer
import re # regular expression
import pickle # serializes the collected lists to disk
import numpy as np # numerical computation
from bs4 import BeautifulSoup # web scraping
In [ ]:
# generates a random sample of user IDs and builds the corresponding profile URLs
n = 10000
max_id = 9000000
userids = random.sample(range(1, max_id), n)
baselink = 'https://www.fanfiction.net/u/'
urls = [baselink + str(userid) for userid in userids]
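Before launching the full crawl, it can help to eyeball a few of the generated links; this is a minimal sanity-check cell that only assumes the sampling cell above has already run.
In [ ]:
# sanity check: inspect a few generated profile URLs and the sample size
print(urls[:3])
print(len(urls), 'URLs generated')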
In [ ]:
# initializes datasets: per-user profile records, outgoing user links, and outgoing story links
data_profile = []
data_userlinks = []
data_storylinks = []
In [ ]:
t0 = time.time()
# collects data from user pages
for i in range(n):
    # retrieves user identifier
    userid = userids[i]
    # fetches the profile page, retrying after a short pause if the connection fails
    page = ''
    while page == '':
        try:
            page = requests.get(urls[i])
        except requests.exceptions.RequestException:
            print("Connection refused, going to sleep...")
            time.sleep(5)
            continue
    soup = BeautifulSoup(page.text, 'html.parser')
    # sets defaults for missing profile information
    desc = 'NA'
    country = 'NA'
    join = 'NA'
    # collects profile information
    desctag = soup.find('meta', {'name': 'description'})
    countrytag = soup.find('img', {'height': '11', 'width': '16'})
    jointag = soup.find(lambda tag: tag.name == 'span' and 'data-xutime' in tag.attrs)
    if desctag is not None:
        desc = desctag['content']
    if countrytag is not None:
        country = countrytag['title']
    if jointag is not None:
        join = jointag.text
    # maps each profile tab link to the count shown on its badge
    nlinks_keys = [key['href'] for key in soup.find_all('a', {'data-toggle': 'tab'})]
    nlinks_values = [value.text for value in soup.find_all('span', {'class': 'badge'})]
    nlinks = dict(zip(nlinks_keys, nlinks_values))
    profile = [userid, desc, country, join, nlinks]
    data_profile.append(profile)
    # finds all individual links on the page
    links = [tag['href'] for tag in
             soup.find_all(lambda tag: tag.name == 'a' and 'href' in tag.attrs)]
    # stores linked user IDs
    user = re.compile('/u/(.+?)/')
    userlinks = [user.search(link).group(1)
                 for link in links if user.search(link) is not None]
    data_userlinks.append([userid, userlinks])
    # stores linked story IDs
    story = re.compile('/s/(.+?)/')
    storylinks = [story.search(link).group(1)
                  for link in links if story.search(link) is not None]
    data_storylinks.append([userid, storylinks])
print(time.time() - t0)
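A quick summary of the scraped records helps confirm the parsing worked before exporting; the sketch below is a minimal check assuming data_profile is populated as above, with the country stored at index 2 of each record.
In [ ]:
# quick check: number of records and share of profiles with a recoverable country
countries = [row[2] for row in data_profile]
print(len(data_profile), 'profiles collected')
if countries:
    print(np.mean([c != 'NA' for c in countries]), 'fraction with country information')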
In [ ]:
# exports data
with open('data_userlinks', 'wb') as fp:
    pickle.dump(data_userlinks, fp)
with open('data_storylinks', 'wb') as fp:
    pickle.dump(data_storylinks, fp)
with open('data_profile', 'wb') as fp:
    pickle.dump(data_profile, fp)
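The exported lists can be read back in a later session with pickle.load; a minimal sketch, assuming the three files written above are in the working directory.
In [ ]:
# reloads the exported datasets
import pickle

with open('data_profile', 'rb') as fp:
    data_profile = pickle.load(fp)
with open('data_userlinks', 'rb') as fp:
    data_userlinks = pickle.load(fp)
with open('data_storylinks', 'rb') as fp:
    data_storylinks = pickle.load(fp)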