In [ ]:
from __future__ import print_function
from bs4 import BeautifulSoup as bs
import requests
import traceback
import json
import re
import pandas as pd
import numpy as np
In [ ]:
def json2df(fname, year):
    """Load a JSON dump produced by ``scrape`` into a pandas DataFrame.

    :param fname: path to the JSON file.
    :param year: conference year, attached as a column.
    """
    with open(fname, "r") as f_in:
        orgData = json.load(f_in)
    # Strip the leading "<number>." prefix from each proposal title.
    cleanTitles = {}
    pattern = r"[0-9]+\."
    columnSet = set()
    for k, v in orgData.items():
        title = re.sub(pattern, "", k.strip())
        cleanTitles[title.strip()] = v
        for key in v.keys():
            columnSet.add(key)
    df = pd.DataFrame(index=list(cleanTitles.keys()))
    for key in columnSet:
        df[key] = ""
    # Fill one row per proposal; df.at writes back reliably, unlike
    # assigning to the row copies yielded by iterrows().
    for ix in df.index:
        propData = cleanTitles[ix]
        for k, v in propData.items():
            if k != "comments":
                df.at[ix, k] = v
    df.columns = [c.lower().replace(" ", "_").replace(":", "") for c in df]
    # Replace the raw comment list with a comment count.
    df.pop("comments")
    df["n_comments"] = [len(cleanTitles[ix]["comments"]) for ix in df.index]
    df["n_comments"] = df["n_comments"].astype(int)
    df["n_votes"] = df["n_votes"].astype(int)
    df["year"] = year
    # Trim surrounding whitespace from the string columns.
    for col in df:
        if df[col].dtype == np.dtype("O"):
            df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x)
    return df
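A quick sanity check of the title cleaning used above: the regex removes the leading numbering from a listing title before it becomes the DataFrame index (the sample title below is made up purely for illustration).

In [ ]:
# Hypothetical title, only to illustrate the cleaning step in json2df.
sample = "  42. Building Dashboards with Bokeh "
print(re.sub(r"[0-9]+\.", "", sample).strip())
# -> Building Dashboards with Bokeh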
In [ ]:
def make_request(url):
    """Make an HTTP GET request and return the page parsed with BeautifulSoup.

    :param url: URL to be scraped.
    """
    headers = {}
    headers['User-Agent'] = ("Mozilla/5.0 (Macintosh; Intel Mac"
                             " OS X 10_11_5) AppleWebKit/537.36"
                             " (KHTML, like Gecko) Chrome/51.0."
                             "2704.103 Safari/537.36")
    r = requests.get(url, headers=headers)
    return bs(r.text, "html.parser")
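If the CFP pages are slow or intermittently unavailable, a slightly more defensive variant may help. This is an optional sketch (the `make_request_safe` name and the 10-second timeout are arbitrary choices, not part of the original scraper); returning None on failure pairs naturally with the `if not soup: continue` guard used in `scrape` below.

In [ ]:
def make_request_safe(url, timeout=10):
    """Like make_request, but with a timeout and an explicit status check."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        print("Request failed for", url, "-", exc)
        return None
    return bs(r.text, "html.parser")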
In [ ]:
def scrape(url, base_url="https://in.pycon.org"):
    """Scrape every proposal listed at ``url`` and return a dict keyed by title."""
    result = {}
    soup = make_request(url)
    soup_proposals = soup.findAll(
        'div', attrs={'class': 'row user-proposals'})
    for proposal in soup_proposals:
        p = proposal.find('h3', attrs={'class': 'proposal--title'})
        title, url = p.text, "".join([base_url,
                                      p.find('a').get('href', '')])
        soup = make_request(url)
        if not soup:
            continue
        # Each write-up section (description, objectives, ...) becomes a key.
        soup_proposal = soup.findAll(
            'div', attrs={'class': 'proposal-writeup--section'})
        temp = {}
        for data in soup_proposal:
            try:
                temp[data.find('h4').text] = "".join(
                    [ptag.text for ptag in data.findAll('p')])
            except Exception:
                print(traceback.format_exc())
        # Collect comments as (text, author, timestamp) records.
        temp['comments'] = []
        for comment in soup.findAll("div", attrs={'class': 'comment-description'}):
            text = comment.find("span")
            if text:
                temp['comments'].append(dict(
                    text=text.text.strip(),
                    by=comment.find('b').text.strip(),
                    time=comment.find('small').text.strip()))
        # Talk metadata (section, type, level, ...) lives in a two-column table;
        # skip header rows that have no <td> cells.
        talk_details = soup.findAll('tr')
        for section in talk_details:
            row = section.findAll('td')
            if len(row) >= 2:
                temp[row[0].text] = row[1].text
        vCount = soup.findAll('h1', attrs={'class': 'vote-count'})[0]
        temp["n_votes"] = int(vCount.text.strip())
        result[title] = temp
    return result
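For reference, each value in the dictionary returned by scrape collects the proposal's write-up sections, its comments, the rows of the talk-details table, and the vote count. The entry below is illustrative only; apart from 'comments' and 'n_votes', the keys come from headings on the proposal page and vary between years.

In [ ]:
# Illustrative structure only -- not real scraped data.
example_entry = {
    "Description": "What the talk covers ...",
    "Section": "Some track name",
    "comments": [{"text": "Looks interesting!", "by": "A Reviewer", "time": "2 months ago"}],
    "n_votes": 12,
}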
In [ ]:
cfp_2015_url = "https://in.pycon.org/cfp/pycon-india-2015/proposals/"
cfp_2016_url = "https://in.pycon.org/cfp/2016/proposals/"
results = scrape(cfp_2015_url)
with open("cfp2015.json", "w") as f_out:
    json.dump(results, f_out)
results = scrape(cfp_2016_url)
with open("cfp2016.json", "w") as f_out:
    json.dump(results, f_out)
df1 = json2df("cfp2015.json", 2015)
df2 = json2df("cfp2016.json", 2016)
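Before concatenating the two years it can help to eyeball the shape and columns of each frame; the exact column names depend on the section headings scraped above.

In [ ]:
for name, frame in (("2015", df1), ("2016", df2)):
    print(name, frame.shape)
    print(sorted(frame.columns))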
In [ ]:
# Sanity check: every 2016 proposal should carry a parseable last-updated date.
dates = pd.to_datetime(df2.last_updated)
assert np.all(pd.notnull(dates))
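If that assertion ever fails, passing errors="coerce" to pd.to_datetime turns unparseable values into NaT, which makes the offending rows easy to list (a debugging sketch, not part of the pipeline):

In [ ]:
bad = df2[pd.to_datetime(df2.last_updated, errors="coerce").isnull()]
print(bad.index.tolist())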
In [ ]:
# Combine both years and write a single tab-separated file, keyed by title.
df = pd.concat((df1, df2), axis=0)
df.to_csv("cfp.tsv", encoding="utf-8", sep="\t", index_label="title")
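A quick way to confirm the export round-trips cleanly is to read the file back with the same separator and index column and compare it against the in-memory frame:

In [ ]:
check = pd.read_csv("cfp.tsv", sep="\t", index_col="title", encoding="utf-8")
print(check.shape == df.shape)
print(check.year.value_counts())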