In [32]:
# Setup
import pattern.web as web
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from cs109style import customize_mpl, customize_css
# Apply the course helpers imported from cs109style above; per this cell's output
# they set a custom matplotlib visual style and custom CSS for the notebook.
customize_mpl()
customize_css()
# IPython magic: per this cell's output it populates the interactive namespace from
# numpy and matplotlib, so bare names from those libraries become available to later cells.
%pylab inline


Setting custom matplotlib visual style
Setting custom CSS for the IPython Notebook
Populating the interactive namespace from numpy and matplotlib

Example 2: extracting reddit titles, upvotes, downvotes, and submission time

We'll operate in two phases:

  • First, find the URLs of all comment pages linked from the first few front pages of reddit.
  • Second, extract the title, upvotes, downvotes, and submission time from each comment page.

In [22]:
def get_links_from_front_pages(n):
    """Find URLs of comments pages linked from the first n front pages of reddit.

    Starts at http://www.reddit.com/ and, after each page, follows the link
    whose href contains ``count=<25 * pages seen so far>`` to reach the next
    page.  Pagination stops early when no such link is present, so the same
    page is never downloaded twice.

    Parameters
    ----------
    n : int
        Maximum number of front pages to download (uncached).

    Returns
    -------
    list of str
        Unique hrefs of the ``a.comments`` anchors, in first-seen order.
    """
    url = web.URL('http://www.reddit.com/')
    comment_pages = []
    seen = set()  # hrefs already collected; keeps the result unique and ordered
    for page_idx in range(n):
        dom = web.DOM(url.download(cached=False))

        for entry in dom('a.comments'):
            href = entry.attributes.get('href', '')
            if href and href not in seen:
                seen.add(href)
                comment_pages.append(href)

        # find the next page link - reddit has 25 links per page
        marker = 'count=%d' % ((page_idx + 1) * 25)
        next_href = None
        for a in dom('a'):
            candidate = a.attributes.get('href', '')
            if marker in candidate:
                # keep scanning: like the original, the last matching anchor wins
                next_href = candidate
        if next_href is None:
            # No "next" link on this page: stop instead of re-fetching the same URL.
            break
        url = web.URL(next_href)
    return comment_pages

            
# Smoke test: print how many unique comments-page links 6 front pages yield
# (this downloads the pages, so the number varies between runs).
print len(get_links_from_front_pages(6))


170

In [27]:
def info_from_comments_pages(links):
    """Fetch title, upvotes, downvotes and submission time for each link.

    Each link is downloaded (uncached) and parsed.  Pages that cannot be
    fetched or that lack any of the expected elements are skipped.  A
    KeyboardInterrupt stops the loop early and returns what has been
    collected so far.

    Parameters
    ----------
    links : iterable of str
        URLs of comments pages.

    Returns
    -------
    list of tuple
        One ``(title, upvotes, downvotes, submitted)`` tuple per page that
        parsed successfully; the vote counts are ints and ``submitted`` is
        the result of ``pd.to_datetime`` on the page's ``datetime`` attribute.
    """
    results = []
    for urltext in links:
        url = web.URL(urltext)
        # A single pre-formatted argument prints the same line under the
        # Python 2 print statement and the Python 3 print function.
        print("fetching info for %s" % url)
        try:
            dom = web.DOM(url.download(cached=False))
            title = dom('title')[0].content
            # vote counts are rendered with thousands separators, e.g. "1,234"
            upvotes = int(dom.by_class('upvotes')[0].children[0].content.replace(',', ''))
            downvotes = int(dom.by_class('downvotes')[0].children[0].content.replace(',', ''))
            time = dom.by_class('tagline')[0]('time')[0].attributes.get('datetime')
            results.append((title, upvotes, downvotes, pd.to_datetime(time)))
        except KeyboardInterrupt:
            # allow us to interrupt the kernel but use what we've already fetched
            break
        except Exception:
            # Best effort: some things that look like comment pages don't have the
            # information above.  Unlike a bare ``except:``, this lets SystemExit
            # and GeneratorExit propagate.
            continue
    return results

In [30]:
# Phase 1: collect the unique comments-page links from the first 5 front pages.
comments_pages = get_links_from_front_pages(5)
print "Fetching info for", len(comments_pages), "pages"
# Phase 2: download each page; pages that cannot be parsed are skipped, so
# `pages` may hold fewer entries than `comments_pages`.
pages = info_from_comments_pages(comments_pages)
# NOTE(review): if no page parsed at all, `pages` is empty and the four-way
# unpacking below raises ValueError.
titles, upvotes, downvotes, dates = zip(*pages)  # zip(*seq) transposes the list of row tuples into four per-column tuples.
# The submission times serve both as the 'date' column and as the row index.
df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)
print df


Fetching info for 139 pages
fetching info for http://www.reddit.com/r/AskReddit/comments/1m8t6m/teachers_of_reddit_what_is_the_worst_case_of/
fetching info for http://www.reddit.com/r/books/comments/1m8l01/the_price_of_libraries_is_cheap_compared_to_that/
fetching info for http://www.reddit.com/r/funny/comments/1m8l7d/redditors_in_a_nutshell/
fetching info for http://www.reddit.com/r/Music/comments/1m81zk/the_knife_heartbeats/
fetching info for http://www.reddit.com/r/aww/comments/1m8x6z/the_most_fearsome_of_predators/
fetching info for http://www.reddit.com/r/television/comments/1m75ml/better_call_saul_is_a_go/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8ngn/eli5_how_is_a_countries_military_strength_is/
fetching info for http://www.reddit.com/r/WTF/comments/1m8t6r/my_work_takes_me_to_really_isolated_and_creepy/
fetching info for http://www.reddit.com/r/science/comments/1m8jjz/most_of_the_time_we_try_to_avoid_inflicting_pain/
fetching info for http://www.reddit.com/r/IAmA/comments/1m804g/i_wrote_what_do_you_buy_the_children_of_the/
fetching info for http://www.reddit.com/r/pics/comments/1m8r9c/a_human_skull_that_i_carved_from_lime_wood_xpost/
fetching info for http://www.reddit.com/r/WTF/comments/1m8dsa/today_i_got_a_200000_tip_explanation_inside/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m94iv/why_do_they_ask_for_your_sexuality_race_and/
fetching info for http://www.reddit.com/comments/waenc/subreddit_discovery_browse_articles_posted_from/
fetching info for http://www.reddit.com/r/funny/comments/1m8ltd/when_reverse_gifs_are_better_than_the_original/
fetching info for http://www.reddit.com/r/funny/comments/1m8u72/can_i_get_you_anything/
fetching info for http://www.reddit.com/r/gifs/comments/1m8gw8/bryan_cranston_throwing_a_pizza_on_jimmy_fallons/
fetching info for http://www.reddit.com/r/funny/comments/1m91o9/sad_bear_popped_a_hole_in_my_friends_pool/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m8nt4/til_only_two_words_in_english_contain_all_the/
fetching info for http://www.reddit.com/r/worldnews/comments/1m8lqi/tesla_model_s_is_now_norways_most_sold_car/
fetching info for http://www.reddit.com/r/AskReddit/comments/1m8jab/how_did_you_find_out_about_reddit/
fetching info for http://www.reddit.com/r/Music/comments/1m8pmq/rip_johnny_cash_died_ten_years_ago_today/
fetching info for http://www.reddit.com/r/gaming/comments/1m8430/rated_m_for_mature/
fetching info for http://www.reddit.com/r/news/comments/1m8rll/trayvon_martin_medical_examiner_fired/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8qca/as_a_student_this_really_pisses_me_off/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8wa7/did_somebody_say_confession_bear_was_out_of_the/
fetching info for http://www.reddit.com/r/WTF/comments/1m8zxr/so_someone_found_a_rocket_launcher_in_my_local/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8wcq/to_be_honest_i_didnt_think_i_could_do_it/
fetching info for http://www.reddit.com/r/funny/comments/1m8qm2/for_motion_picture_use_only/
fetching info for http://www.reddit.com/r/EarthPorn/comments/1m83vs/my_husband_asked_to_borrow_my_camera_that_he_had/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8u83/the_redditors_who_so_bravely_savaged_that/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8k62/started_a_new_healthy_diet_this_week/
fetching info for http://www.reddit.com/comments/xtwzm/subreddit_discovery_see_reddit_from_other/
fetching info for http://www.reddit.com/r/WTF/comments/1m8zoj/someone_just_pooped_down_the_hall_at_work/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8pv7/eli5_why_does_britain_compete_together_at_the/
fetching info for http://www.reddit.com/r/AskReddit/comments/1m8rfb/whats_one_album_youll_never_get_tired_of/
fetching info for http://www.reddit.com/r/funny/comments/1m8fjs/found_this_cow_reading_about_himself_at_the_fair/
fetching info for http://www.reddit.com/comments/wbnzy/subreddit_discovery_explore_gaming_related/
fetching info for http://www.reddit.com/r/funny/comments/1m91i7/super_heroes_and_their_dead_parents/
fetching info for http://www.reddit.com/comments/waea1/subreddit_discovery_find_favorite_subreddits_of/
fetching info for http://www.reddit.com/r/aww/comments/1m8ff2/no_mice_no_birds_reddit_meet_beast_whos_brought/
fetching info for http://www.reddit.com/comments/waede/subreddit_discovery_browse_usercreated_subreddit/
fetching info for http://www.reddit.com/r/news/comments/1m8ppo/federal_court_decides_that_in_god_we_trust_will/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m7j7d/til_of_rick_rescorla_morgan_stanleys_head_of/
fetching info for http://www.reddit.com/r/AskReddit/comments/1m8so4/what_would_be_the_worst_painless_torture/
fetching info for http://www.reddit.com/r/funny/comments/1m8tk2/well_played_nursery/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8g3h/drunk_me_on_a_camping_trip_the_tents_ended_up/
fetching info for http://www.reddit.com/r/AskReddit/comments/1m8k4c/what_was_your_schools_sexed_like/
fetching info for http://www.reddit.com/r/WTF/comments/1m900t/welcome_to_new_york/
fetching info for http://www.reddit.com/r/funny/comments/1m8n24/thats_our_joey/
fetching info for http://www.reddit.com/r/aww/comments/1m8gs7/1_toilet_paper_holder/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m8tcj/til_that_groups_of_human_beings_left_free_to_each/
fetching info for http://www.reddit.com/r/Music/comments/1m8wy0/sublime_dont_push/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m7n24/til_bill_murray_thinks_the_2004_hong_kong_comedy/
fetching info for http://www.reddit.com/r/movies/comments/1m8tev/wb_jk_rowling_team_up_for_harry_potter_spinoff/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m8kx5/til_up_until_the_1960s_poor_swiss_children_could/
fetching info for http://www.reddit.com/r/technology/comments/1m8kcz/lavabits_owner_appeals_secret_surveillance_order/
fetching info for http://www.reddit.com/r/funny/comments/1m8qkh/my_friend_got_this_birthday_card_from_her_mom/
fetching info for http://www.reddit.com/r/movies/comments/1m7x0s/miyazakis_the_wind_rises_to_release_in_north/
fetching info for http://www.reddit.com/r/IAmA/comments/1m926j/we_are_the_outlookcom_team_ask_us_anything/
fetching info for http://www.reddit.com/r/gaming/comments/1m921j/well_i_guess_its_a_decentsized_map/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m7dnc/eli5_how_do_movies_deal_with_casting_overweight/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8u4l/eli5_lymph_nodes/
fetching info for http://www.reddit.com/r/pics/comments/1m921c/small_town_pizza_place_knows_what_community_is/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8qtz/eli5_why_do_birds_bob_their_heads_forward_and/
fetching info for http://www.reddit.com/r/Music/comments/1m939y/sufjan_stevens_for_the_widows_in_paradise_for_the/
fetching info for http://www.reddit.com/r/funny/comments/1m8q8m/standard_issue_on_the_death_star/
fetching info for http://www.reddit.com/r/funny/comments/1m8nda/being_paid_minimum_wage/
fetching info for http://www.reddit.com/r/worldnews/comments/1m8dj6/syrian_rebels_slit_throat_of_christian_man_who/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m82bz/til_that_between_50100_americans_currently_use_a/
fetching info for http://www.reddit.com/r/gaming/comments/1m8v7s/the_only_game_that_i_want_to_blizzard_make_it/
fetching info for http://www.reddit.com/r/IAmA/comments/1m90wk/as_requested_i_ama_graduate_student_in_a_prion/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8ahs/this_meant_quite_a_lot_more_to_me_than_he_knows/
fetching info for http://www.reddit.com/comments/nwhsj/subreddit_discovery_rsubredditoftheday_finds_the/
fetching info for http://www.reddit.com/r/Music/comments/1m8o4m/miley_cyrus_wrecking_ball_nicolas_cage_edition/
fetching info for http://www.reddit.com/r/WTF/comments/1m8m6w/this_thing_was_supposedly_found_in/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8hnb/getting_real_fucking_tired_of_your_shit_college/
fetching info for http://www.reddit.com/r/technology/comments/1m8tfb/verizons_diabolical_plan_to_charge_websites_for/
fetching info for http://www.reddit.com/r/funny/comments/1m8mss/as_a_guy_who_cant_take_subtle_hints/
fetching info for http://www.reddit.com/comments/wadkt/subreddit_discovery_find_subreddits_from_all_over/
fetching info for http://www.reddit.com/r/gaming/comments/1m8je5/this_is_supposed_to_be_lava_but_all_i_see_is/
fetching info for http://www.reddit.com/r/aww/comments/1m8ucu/toby_has_a_fresh_haircut/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8tq0/eli5_what_is_the_relation_differences_and/
fetching info for http://www.reddit.com/r/WTF/comments/1m860q/while_visiting_vegas_i_see_this_guy_will_you/
fetching info for http://www.reddit.com/r/funny/comments/1m89bl/this_line_gets_me_every_time/
fetching info for http://www.reddit.com/r/aww/comments/1m8o3y/offered_my_cat_a_tiny_taco_she_said_no/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m8tec/til_in_moose_antlers_appear_to_act_as_large/
fetching info for http://www.reddit.com/r/gaming/comments/1m8vvp/im_a_monster_mgs5/
fetching info for http://www.reddit.com/r/aww/comments/1m8p8i/gahhh_so_freaking_adorable_i_love_them_so_much_i/
fetching info for http://www.reddit.com/comments/wbnvc/subreddit_discovery_animals/
fetching info for http://www.reddit.com/r/worldnews/comments/1m7r2e/vladamir_putins_oped_in_the_new_york_times_on/
fetching info for http://www.reddit.com/r/news/comments/1m7l7y/yahoo_ceo_mayer_we_faced_jail_if_we_revealed_nsa/
fetching info for http://www.reddit.com/r/worldnews/comments/1m8tp1/assad_confirms_chemical_weapons_handover/
fetching info for http://www.reddit.com/r/askscience/comments/1m8m4u/why_do_we_throw_up_much_more_frequently_when_were/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m83xt/is_that_confession_bear_noooope/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8qc5/eli5_when_nasa_spots_black_holes_what_do_they/
fetching info for http://www.reddit.com/r/gifs/comments/1m8mq5/helium_foam_printing/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8t05/has_everyone_forgotten_what_a_monster_this_guy_is/
fetching info for http://www.reddit.com/r/gifs/comments/1m8r4p/man_uses_100_pound_tire_as_a_hula_hoop/
fetching info for http://www.reddit.com/comments/wbnz2/subreddit_discovery_music_now_new_and_improved/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m88sq/i_cant_believe_this_just_happened_outside_my_own/
fetching info for http://www.reddit.com/r/funny/comments/1m8p7l/an_important_lesson_from_australia/
fetching info for http://www.reddit.com/r/worldnews/comments/1m8uen/911_was_an_inside_job_says_italian_member_of/
fetching info for http://www.reddit.com/r/IAmA/comments/1m90kv/ama_request_mr_vladimir_putin/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m9c5l/why_do_web_developers_make_free_full_website/
fetching info for http://www.reddit.com/r/bestof/comments/1m82zq/unextwiggin4_who_already_lost_125_pounds_helps/
fetching info for http://www.reddit.com/comments/wad70/subreddit_discovery_subscribe_to_rnewreddits_and/
fetching info for http://www.reddit.com/r/gaming/comments/1m8dqm/a_friend_sent_his_resume_to_nintendo_with_these/
fetching info for http://www.reddit.com/r/gaming/comments/1m8ysk/first_thing_im_doing_in_gta5/
fetching info for http://www.reddit.com/r/funny/comments/1m8rfv/horrible_crash_scene_in_the_parking_lot_of_my/
fetching info for http://www.reddit.com/r/videos/comments/1m8m31/caught_on_cctv_fixing_bike_rack_at_3am/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m7prc/til_over_80_of_the_worlds_eyewear_manufacturer/
fetching info for http://www.reddit.com/r/askscience/comments/1m8eat/is_there_a_smallest_unit_of_time/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m7i9b/til_alcatraz_was_once_the_only_federal/
fetching info for http://www.reddit.com/r/WTF/comments/1m909p/watched_this_lizard_get_captured_and_poisoned/
fetching info for http://www.reddit.com/r/worldnews/comments/1m8o9u/vodafone_germany_has_been_hacked_banking_data_of/
fetching info for http://www.reddit.com/comments/nwiep/subreddit_discovery_find_the_awesome_previous/
fetching info for http://www.reddit.com/r/AskReddit/comments/1m9c7v/isnt_arming_syrian_rebels_an_act_of_war/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8mku/cmon_guys_hes_not_that_bad/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m90lc/eli5_how_did_the_911_hijackers_fly_offcourse_for/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m7v6x/the_other_school_sucks/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8rq9/in_light_of_the_recent_discovery_in_kenya/
fetching info for http://www.reddit.com/r/gaming/comments/1m8nuo/arma_3_is_just_released_with_the_mod_support_this/
fetching info for http://www.reddit.com/r/WTF/comments/1m8jjc/rule_261_dont_get_sent_to_detention_on_opposite/
fetching info for http://www.reddit.com/r/aww/comments/1m8wso/if_i_stop_walking_for_more_than_10_seconds_he/
fetching info for http://www.reddit.com/r/news/comments/1m8ufl/j_k_rowling_to_write_new_harry_potterinspired/
fetching info for http://www.reddit.com/r/gaming/comments/1m7wun/once_you_go_black_you_immediately_switch_back_to/
fetching info for http://www.reddit.com/r/funny/comments/1m8xvh/apparently_i_eat_ramen_incorrectly/
fetching info for http://www.reddit.com/r/WTF/comments/1m8khr/i_live_in_a_developing_country_where_many_men/
fetching info for http://www.reddit.com/r/AdviceAnimals/comments/1m8oqo/friend_admitted_this_the_other_day/
fetching info for http://www.reddit.com/r/news/comments/1m8m4i/american_holed_up_in_canada_denies_child_porn/
fetching info for http://www.reddit.com/r/WTF/comments/1m8xrw/sat_behind_this_on_the_bus_today_you_could_smell/
fetching info for http://www.reddit.com/r/AskReddit/comments/1m8pa9/whats_the_stupidest_thing_someones_been_mad_at/
fetching info for http://www.reddit.com/r/funny/comments/1m9c7s/first_picture_i_receive_from_my_brother_at/
fetching info for http://www.reddit.com/r/todayilearned/comments/1m92rv/til_in_ancient_athens_a_beautiful_courtesan_named/
fetching info for http://www.reddit.com/r/explainlikeimfive/comments/1m8snq/eli5_why_we_have_two_types_of_screw_headsdrivers/
fetching info for http://www.reddit.com/r/technology/comments/1m8vme/apple_no_longer_innovates_says_the_man_who_helped/
fetching info for http://www.reddit.com/r/funny/comments/1m8p2z/my_friends_engagement_photo/
fetching info for http://www.reddit.com/r/videos/comments/1m7oem/talking_macaw_shushes_other_bird/
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2013-09-12 13:15:50 to 2013-09-12 00:15:58
Data columns (total 4 columns):
date         128  non-null values
downvotes    128  non-null values
title        128  non-null values
upvotes      128  non-null values
dtypes: datetime64[ns](1), int64(2), object(1)

In [31]:
# Order the rows by submission time; inplace=True mutates the `df` built in the
# previous cell rather than creating a new frame.
df.sort('date', inplace=True)
# Each series is plotted against the frame's index (the submission times):
# upvotes in green, downvotes in red.
df['upvotes'].plot(c='g')
df['downvotes'].plot(c='r')
# Net score (upvotes minus downvotes) in black; as the last expression, its
# return value is what the cell displays as Out[...].
(df['upvotes'] - df['downvotes']).plot(c='k')


Out[31]:
<matplotlib.axes.AxesSubplot at 0x10cd72350>

In [6]: