In [ ]:
# imports libraries
import requests # HTTP connection
import random # random generator
import time # timer
import re # regular expression
import pickle # serializes the collected lists to disk
import numpy as np # numerical computation
from bs4 import BeautifulSoup # web scraping
In [ ]:
# generates a random sample of user IDs and builds the corresponding profile URLs
n = 10000
max_id = 9000000
userids = random.sample(range(1, max_id), n)
baselink = 'https://www.fanfiction.net/u/'
urls = [baselink + str(userid) for userid in userids]
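Before launching the full crawl, it can help to eyeball a few of the generated links; this is a minimal sanity-check cell that only assumes the sampling cell above has already run.
In [ ]:
# sanity check: inspect a few generated profile URLs and the sample size
print(urls[:3])
print(len(urls), 'URLs generated')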
In [ ]:
# initializes datasets: per-user profile records, outgoing user links, and outgoing story links
data_profile = []
data_userlinks = []
data_storylinks = []
In [ ]:
t0 = time.time()
# collects data from user pages
for i in range(n):
    # retrieves user identifier
    userid = userids[i]
    # fetches the profile page, retrying after a short pause if the connection fails
    page = ''
    while page == '':
        try:
            page = requests.get(urls[i])
        except requests.exceptions.RequestException:
            print("Connection refused, going to sleep...")
            time.sleep(5)
            continue
    soup = BeautifulSoup(page.text, 'html.parser')
    # sets defaults for missing profile information
    desc = 'NA'
    country = 'NA'
    join = 'NA'
    # collects profile information
    desctag = soup.find('meta', {'name': 'description'})
    countrytag = soup.find('img', {'height': '11', 'width': '16'})
    jointag = soup.find(lambda tag: tag.name == 'span' and 'data-xutime' in tag.attrs)
    if desctag is not None:
        desc = desctag['content']
    if countrytag is not None:
        country = countrytag['title']
    if jointag is not None:
        join = jointag.text
    # maps each profile tab link to the count shown on its badge
    nlinks_keys = [key['href'] for key in soup.find_all('a', {'data-toggle': 'tab'})]
    nlinks_values = [value.text for value in soup.find_all('span', {'class': 'badge'})]
    nlinks = dict(zip(nlinks_keys, nlinks_values))
    profile = [userid, desc, country, join, nlinks]
    data_profile.append(profile)
    # finds all individual links on the page
    links = [tag['href'] for tag in
             soup.find_all(lambda tag: tag.name == 'a' and 'href' in tag.attrs)]
    # stores linked user IDs
    user = re.compile('/u/(.+?)/')
    userlinks = [user.search(link).group(1)
                 for link in links if user.search(link) is not None]
    data_userlinks.append([userid, userlinks])
    # stores linked story IDs
    story = re.compile('/s/(.+?)/')
    storylinks = [story.search(link).group(1)
                  for link in links if story.search(link) is not None]
    data_storylinks.append([userid, storylinks])
print(time.time() - t0)
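A quick summary of the scraped records helps confirm the parsing worked before exporting; the sketch below is a minimal check assuming data_profile is populated as above, with the country stored at index 2 of each record.
In [ ]:
# quick check: number of records and share of profiles with a recoverable country
countries = [row[2] for row in data_profile]
print(len(data_profile), 'profiles collected')
if countries:
    print(np.mean([c != 'NA' for c in countries]), 'fraction with country information')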
In [ ]:
# exports data
with open('data_userlinks', 'wb') as fp:
    pickle.dump(data_userlinks, fp)
with open('data_storylinks', 'wb') as fp:
    pickle.dump(data_storylinks, fp)
with open('data_profile', 'wb') as fp:
    pickle.dump(data_profile, fp)
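The exported lists can be read back in a later session with pickle.load; a minimal sketch, assuming the three files written above are in the working directory.
In [ ]:
# reloads the exported datasets
import pickle

with open('data_profile', 'rb') as fp:
    data_profile = pickle.load(fp)
with open('data_userlinks', 'rb') as fp:
    data_userlinks = pickle.load(fp)
with open('data_storylinks', 'rb') as fp:
    data_storylinks = pickle.load(fp)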