In [ ]:
# imports libraries
import requests # HTTP connection
import random # random generator
import time # timer
import re # regular expression
import pickle # exports lists
import numpy as np # numerical computation
from bs4 import BeautifulSoup # web scraping
In [ ]:
# generates a random sample of story IDs
n = 10000
max_id = 12600000
storyids = random.sample(range(1, max_id), n)
baselink = 'https://www.fanfiction.net/s/'
urls = [baselink + str(storyid) for storyid in storyids]
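A quick sanity check on the sampling step: `random.sample` draws without replacement, so every story ID is unique and each URL will be requested at most once. The short cell below just previews a few of the generated URLs.
In [ ]:
# previews a few sampled URLs (sanity check)
print(len(set(storyids)) == n)  # True: sample() draws without replacement
for url in urls[:3]:
    print(url)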
In [ ]:
# initializes the story dataset
data_stories = []
In [ ]:
# collects data from story pages
t0 = time.time()
for i in range(n):
    # retrieves the story identifier
    storyid = storyids[i]
    # fetches the page, retrying whenever the connection is refused
    page = ''
    while page == '':
        try:
            page = requests.get(urls[i])
        except requests.exceptions.RequestException:
            print("Connection refused, going to sleep...")
            time.sleep(5)
            continue
    html = page.text
    soup = BeautifulSoup(html, 'html.parser')
    # sets defaults for missing stories
    userid = 'NA'
    cat = 'NA'
    title = 'NA'
    summary = 'NA'
    info = 'NA'
    error = 'NA'
    # collects story information if the story exists
    if soup.find('span', {'class': 'gui_warning'}) is None:
        useridtag = soup.find('a', {'title': 'Send Private Message'})
        cattag = soup.find('div', {'id': 'pre_story_links'})
        titletag = soup.find('b', {'class': 'xcontrast_txt'})
        summarytag = soup.find('div', {'class': 'xcontrast_txt',
                                       'style': 'margin-top:2px'})
        infotag = soup.find('span', {'class': 'xgray xcontrast_txt'})
        if useridtag is None:
            # page loaded but has no author link; records the warning text
            error = soup.find('span').text
        else:
            userid = useridtag['href']
            cat = [link.text for link in cattag.find_all('a')]
            title = titletag.text
            summary = summarytag.text
            info = infotag.text
    story = [storyid, userid, cat, title, summary, info, error]
    data_stories.append(story)
print(time.time() - t0)
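The `re` module imported above can later be used to split the raw `info` string into structured fields. A minimal sketch, assuming the info line follows the usual fanfiction.net layout of dash-separated fields such as `Rated: T - English - Chapters: 5 - Words: 12,345` (the exact format is an assumption, and `parse_word_count` is a hypothetical helper, not part of the scraper above):
In [ ]:
# extracts the word count from an info string (field layout assumed)
def parse_word_count(info):
    # looks for the assumed 'Words: 12,345' pattern; returns None if absent
    match = re.search(r'Words:\s*([\d,]+)', info)
    return int(match.group(1).replace(',', '')) if match else None
Applied to the collected rows, this would map each story's `info` field to an integer word count, returning `None` for missing stories (where `info` is 'NA').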
In [ ]:
# exports data
with open('data_stories', 'wb') as fp:
    pickle.dump(data_stories, fp)
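For later analysis sessions, the pickled list can be restored with `pickle.load`; the file was written in binary mode, so it must be reopened the same way.
In [ ]:
# reloads the exported data in a later session
with open('data_stories', 'rb') as fp:
    data_stories = pickle.load(fp)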