In [1]:
import IPython
import pandas as pd

from IPython.parallel import Client
from boto.s3.connection import S3Connection

IPython.__version__


Out[1]:
'0.13.1'

In [2]:
c = Client()
dview = c.direct_view()
len(c.ids)


Out[2]:
62
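
Client() reads the connection file of the active IPython profile and attaches to the cluster's controller; direct_view() (equivalently c[:]) returns a DirectView addressing all registered engines at once, and it is this view that the %%px, scatter, and gather calls below operate on. len(c.ids) confirms 62 engines are up.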

In [3]:
%%px
%%bash
ec2metadata --local-ipv4


[stdout:1] 10.225.8.72
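
%%px broadcasts the cell to every engine, and stacking %%bash under it runs the body through the shell, so each engine reports the internal IP of the EC2 instance it lives on. Only engine 1's stdout is reproduced here; the other engines' lines were presumably trimmed from the transcript.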

In [4]:
credentials = pd.read_csv('credentials.csv')

In [5]:
s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
datasets = s3conn.get_bucket('datasets.elasticmapreduce')
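
credentials.csv is an AWS key file with 'Access Key Id' and 'Secret Access Key' columns (the format the AWS console exports); the boto connection then opens the public Elastic MapReduce sample-data bucket.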

In [6]:
wikipediaxml_keys = datasets.get_all_keys(prefix='wikipediaxml')
len(wikipediaxml_keys)


Out[6]:
117
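
The dump under this prefix comes pre-split into 117 chunk files, which is what makes the parallelization below cheap: only the key names need to be distributed, and each engine fetches its own share of the data straight from S3.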

In [7]:
data = wikipediaxml_keys[63].get_contents_as_string()

In [8]:
data[:1000]


Out[8]:
'<page>    <title>South Carolina State Armory</title>    <id>17347140</id>    <revision>      <id>410985485</id>      <timestamp>2011-01-30T16:41:06Z</timestamp>      <contributor>        <username>AnomieBOT</username>        <id>7611264</id>      </contributor>      <minor />      <comment>Replacing 1 NRIS {{cite web}} template with {{NRISref}}. Errors? [[User:AnomieBOT/shutoff/ReplaceExternalLinks3]]</comment>      <text xml:space="preserve">{{Infobox_nrhp | name =South Carolina State Armory| nrhp_type = | image = Armory1.png| caption = | location= [[Columbia, South Carolina]]| lat_degrees = 34.0015| lat_minutes = | lat_seconds = | lat_direction = N| long_degrees = 81.0360 | long_minutes = | long_seconds = | long_direction = W| locmapin = South Carolina| area =| built =1905| architect= [[William Augustus Edwards]] &amp; Frank C. Walter| architecture= Early Commercial| added = February 5, 1999| governing_body = Private| refnum=99000099&lt;ref name=&quot;nris&quot;&gt;{{NRISref|2007a}}&'

In [9]:
titles = [entry.split('</title>')[0] for entry in data.split('<title>')[1:]]
len(titles)


Out[9]:
100000
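
Splitting on the literal <title> and </title> tags is deliberately quick and dirty: each chunk is a bare concatenation of <page> elements with no single root, so a strict XML parser would reject the file as a whole. A regex sketch of the same extraction (assuming titles never contain nested markup):

    import re
    titles = re.findall(r'<title>(.*?)</title>', data)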

In [10]:
titles[::10000]


Out[10]:
['South Carolina State Armory',
 'Mata Mansa Devi Mandir',
 'Shivered',
 'Lorenzo Sassoli de Bianchi',
 'Ganesha outside Indian Hinduism',
 'Philippine Forest Corporation',
 'Wigierski',
 'Thugz Cry',
 'Shape factor (image analysis and microscopy)',
 'The ancient Near East']

In [11]:
dview.scatter('wikipediaxml_keys', [key.name for key in wikipediaxml_keys], dist='r')


Out[11]:
<AsyncResult: scatter>
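
scatter pushes a slice of the list into each engine's namespace under the name wikipediaxml_keys; dist='r' selects round-robin partitioning, so engine i gets every 62nd element starting at i, i.e. one or two of the 117 key names. The view is non-blocking, hence the AsyncResult. A local sketch of the same partitioning (names and c as above):

    n_engines = len(c.ids)
    names = [key.name for key in wikipediaxml_keys]
    shards = [names[i::n_engines] for i in range(n_engines)]  # engine i gets shards[i]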

In [12]:
%%px
import pandas as pd
from boto.s3.connection import S3Connection
from collections import defaultdict

# Each engine builds its own S3 connection rather than receiving one from
# the client; credentials.csv must be readable from each engine's working
# directory (e.g. an NFS-shared home).
credentials = pd.read_csv('credentials.csv')

s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
datasets = s3conn.get_bucket('datasets.elasticmapreduce')

# Histogram: title length in tokens -> number of titles with that length.
title_lens = defaultdict(int)
errors = []

# wikipediaxml_keys is this engine's round-robin shard from the scatter above.
for key_name in wikipediaxml_keys:
    wikipediaxml_key = datasets.get_key(key_name)
    try:
        data = wikipediaxml_key.get_contents_as_string()
        titles = [entry.split('</title>')[0] for entry in data.split('<title>')[1:]]
        for title in titles:
            title_tokens = title.split()
            title_length = len(title_tokens)
            title_lens[title_length] += 1
    except Exception:
        # Record the key and keep going instead of letting one bad download
        # kill the loop (a bare except would also swallow KeyboardInterrupt).
        errors.append(key_name)
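
Each engine now walks its own shard of key names, downloads those chunks from S3, and accumulates a histogram mapping title length in tokens to the number of titles with that length; keys that fail are collected in errors instead of aborting the run. %%px waits for all engines by default, hence the intermission.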

Intermission


In [13]:
title_lens = dview.gather('title_lens').get()
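
gather pulls the variable title_lens back from every engine; the call returns an AsyncResult on this non-blocking view, so .get() waits for and unpacks the result. Dicts are not spliced back together the way scattered lists are, so what comes back here is a list of 62 per-engine histograms. Merging them into a single dict would look like this (a sketch; Counter.update adds counts):

    from collections import Counter
    total = Counter()
    for engine_counts in title_lens:
        total.update(engine_counts)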

In [14]:
pd.DataFrame(title_lens).sum(0).plot()


Out[14]:
<matplotlib.axes.AxesSubplot at 0x4141cd0>
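
Built from the list of per-engine dicts, the DataFrame has one row per engine and one column per observed title length, so .sum(0) adds the engines' counts together; the plot is the distribution of Wikipedia article-title lengths in tokens.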