In [1]:
import IPython
import pandas as pd
from IPython.parallel import Client
from boto.s3.connection import S3Connection
IPython.__version__
Out[1]:
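Note: IPython.parallel was split out of IPython into the standalone ipyparallel package in IPython 4.0, and boto has since been superseded by boto3. If you are reproducing this on newer versions, the rough equivalents of the imports above (an assumption about your setup, not what this notebook ran) are:

    import ipyparallel as ipp   # replaces IPython.parallel in IPython >= 4.0
    import boto3                # boto's successor; the S3 API differs from boto 2.x

    rc = ipp.Client()           # connect to a running ipcluster
    s3 = boto3.resource('s3')   # credentials come from the standard AWS config chain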
In [2]:
c = Client()
dview = c.direct_view()
len(c.ids)
Out[2]:
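The Client connects to a running IPython cluster, and the direct view addresses every engine at once; len(c.ids) reports how many engines answered. A quick sanity check that they are all alive (a minimal sketch, assuming the cluster was started with something like ipcluster start):

    import os
    # Run a trivial function on every engine; one PID per engine
    # confirms they are all responding
    pids = dview.apply_sync(os.getpid)
    print(pids)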
In [3]:
%%px
%%bash
ec2metadata --local-ipv4
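The %%px cell magic broadcasts the rest of the cell to every engine, and the nested %%bash runs ec2metadata on each EC2 instance so that each engine prints its own private IP. If ec2metadata is not installed (it is Ubuntu-specific), a pure-Python sketch of the same lookup would query the EC2 instance metadata service directly in its own %%px cell:

    %%px
    import urllib2
    # Each engine asks the instance metadata service for its own private IP
    print(urllib2.urlopen(
        'http://169.254.169.254/latest/meta-data/local-ipv4').read())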
In [4]:
credentials = pd.read_csv('credentials.csv')
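credentials.csv is assumed here to be the file AWS generates when you create an access key; the only columns this notebook relies on are 'Access Key Id' and 'Secret Access Key'. A quick way to confirm the file parsed as expected without printing the full secret:

    print(credentials.columns.tolist())
    # Peek at the key prefix only, so nothing sensitive lands in the notebook
    print(credentials['Access Key Id'][0][:4] + '...')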
In [5]:
s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
datasets = s3conn.get_bucket('datasets.elasticmapreduce')
In [6]:
wikipediaxml_keys = datasets.get_all_keys(prefix='wikipediaxml')
len(wikipediaxml_keys)
Out[6]:
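datasets.elasticmapreduce is the public S3 bucket Amazon uses for EMR sample data, and get_all_keys(prefix='wikipediaxml') returns the boto Key objects whose names share that prefix. To spot-check a few of them (a small sketch):

    # Show the first few key names and sizes to confirm the prefix matched
    for key in wikipediaxml_keys[:3]:
        print('%s (%d bytes)' % (key.name, key.size))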
In [7]:
data = wikipediaxml_keys[63].get_contents_as_string()
In [8]:
data[:1000]
Out[8]:
In [9]:
titles = [entry.split('</title>')[0] for entry in data.split('<title>')[1:]]
len(titles)
Out[9]:
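Splitting on the literal <title> and </title> tags is a quick-and-dirty alternative to a real XML parser; it works because the MediaWiki dump emits exactly one well-formed <title> element per page. A toy illustration of the technique:

    sample = '<page><title>Foo</title></page><page><title>Bar baz</title></page>'
    # Everything after each '<title>' up to the next '</title>' is a page title
    extracted = [entry.split('</title>')[0] for entry in sample.split('<title>')[1:]]
    print(extracted)   # ['Foo', 'Bar baz']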
In [10]:
titles[::10000]
Out[10]:
In [11]:
dview.scatter('wikipediaxml_keys', [key.name for key in wikipediaxml_keys], dist='r')
Out[11]:
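scatter partitions the list of key names across the engines and binds each engine's slice to the name wikipediaxml_keys in its namespace; dist='r' requests a round-robin split instead of the default contiguous blocks. A plain-Python sketch of what that partitioning looks like (an illustration, not IPython.parallel's internals):

    n_engines = len(c.ids)
    names = [key.name for key in wikipediaxml_keys]
    # Round-robin: engine i gets items i, i + n, i + 2n, ...
    partitions = [names[i::n_engines] for i in range(n_engines)]
    print([len(p) for p in partitions])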
In [12]:
%%px
import pandas as pd
from boto.s3.connection import S3Connection
from collections import defaultdict
credentials = pd.read_csv('credentials.csv')
s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
datasets = s3conn.get_bucket('datasets.elasticmapreduce')
# Per-engine tally: maps title length (in tokens) to a count of pages
title_lens = defaultdict(int)
errors = []
for key_name in wikipediaxml_keys:
    wikipediaxml_key = datasets.get_key(key_name)
    try:
        data = wikipediaxml_key.get_contents_as_string()
        # Grab the text between each <title>...</title> pair
        titles = [entry.split('</title>')[0] for entry in data.split('<title>')[1:]]
        for title in titles:
            title_tokens = title.split()
            title_length = len(title_tokens)
            title_lens[title_length] += 1
    except Exception:
        # Remember which keys failed so the download can be retried later
        errors.append(key_name)
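At this point each engine holds its own title_lens histogram plus an errors list of any keys it failed to download. Since errors is an ordinary list on every engine, gather can concatenate them for a quick check before aggregating (a sketch):

    # Flatten the per-engine error lists and see whether anything failed
    all_errors = dview.gather('errors').get()
    print('%d failed keys' % len(all_errors))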
In [13]:
# Pull each engine's local histogram back to the client. gather is meant for
# sequences it can concatenate; pull returns one dict per engine, which is
# what the DataFrame construction below expects.
title_lens = dview.pull('title_lens').get()
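pull returns a list with one per-engine histogram, and the DataFrame in the next cell merges them implicitly (one row per engine, one column per title length). An explicit merge without pandas would look like:

    from collections import Counter
    merged = Counter()
    for engine_hist in title_lens:
        merged.update(engine_hist)   # Counter.update adds counts together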
In [14]:
pd.DataFrame(title_lens).sum(0).plot()
Out[14]:
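The summed Series is indexed by title length in tokens, with page counts as values. A slightly more labeled version of the same plot (the axis labels are an addition, not in the original output):

    counts = pd.DataFrame(title_lens).sum(0).sort_index()
    ax = counts.plot()
    ax.set_xlabel('title length (tokens)')
    ax.set_ylabel('number of pages')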