Introductory examples

1.usa.gov data from bit.ly


In [1]:
%pwd


Out[1]:
u'/Users/pmui/OneDrive/scu/data.science/lecture01.intro'

In [2]:
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'

In [3]:
open(path).readline()


Out[3]:
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [4]:
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]

In [5]:
records[0]


Out[5]:
{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 u'al': u'en-US,en;q=0.8',
 u'c': u'US',
 u'cy': u'Danvers',
 u'g': u'A6qOVH',
 u'gr': u'MA',
 u'h': u'wfLQtf',
 u'hc': 1331822918,
 u'hh': u'1.usa.gov',
 u'l': u'orofrog',
 u'll': [42.576698, -70.954903],
 u'nk': 1,
 u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 u't': 1331923247,
 u'tz': u'America/New_York',
 u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

In [6]:
records[0]['tz']


Out[6]:
u'America/New_York'

In [7]:
print(records[0]['tz'])


America/New_York

Counting time zones in pure Python

Expect error


In [8]:
time_zones = [rec['tz'] for rec in records]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-db4fbd348da9> in <module>()
----> 1 time_zones = [rec['tz'] for rec in records]

KeyError: 'tz'

In [9]:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

In [10]:
time_zones[:10]


Out[10]:
[u'America/New_York',
 u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'']

In [11]:
def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

In [12]:
from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int) # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts

In [13]:
counts = get_counts(time_zones)

In [14]:
counts['America/New_York']


Out[14]:
1251

In [15]:
len(time_zones)


Out[15]:
3440

In [16]:
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

In [17]:
top_counts(counts)


Out[17]:
[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1251, u'America/New_York')]

In [18]:
from collections import Counter

In [19]:
counts = Counter(time_zones)

In [20]:
counts.most_common(10)


Out[20]:
[(u'America/New_York', 1251),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]

Counting time zones with pandas


In [21]:
%matplotlib inline

In [22]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)

In [23]:
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
lines = open(path).readlines()
records = [json.loads(line) for line in lines]

In [24]:
from pandas import DataFrame, Series
import pandas as pd

frame = DataFrame(records)
frame


Out[24]:
_heartbeat_ a al c cy g gr h hc hh kw l ll nk r t tz u
0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1.331823e+09 1.usa.gov NaN orofrog [42.576698, -70.954903] 1.0 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1.331923e+09 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991
1 NaN GoogleMaps/RochesterNY NaN US Provo mwszkS UT mwszkS 1.308262e+09 j.mp NaN bitly [40.218102, -111.613297] 0.0 http://www.AwareMap.com/ 1.331923e+09 America/Denver http://www.monroecounty.gov/etc/911/rss.php
2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-US US Washington xxr3Qb DC xxr3Qb 1.331920e+09 1.usa.gov NaN bitly [38.9007, -77.043098] 1.0 http://t.co/03elZC4Q 1.331923e+09 America/New_York http://boxer.senate.gov/en/press/releases/0316...
3 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... pt-br BR Braz zCaLwp 27 zUtuOu 1.331923e+09 1.usa.gov NaN alelex88 [-23.549999, -46.616699] 0.0 direct 1.331923e+09 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html
4 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 bit.ly NaN bitly [42.286499, -71.714699] 0.0 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341...
5 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Shrewsbury axNK8c MA axNK8c 1.273673e+09 bit.ly NaN bitly [42.286499, -71.714699] 0.0 http://www.shrewsbury-ma.gov/selco/ 1.331923e+09 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341...
6 NaN Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1... pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4 PL Luban wcndER 77 zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [51.116699, 15.2833] 0.0 http://plus.url.google.com/url?sa=z&n=13319232... 1.331923e+09 Europe/Warsaw http://www.nasa.gov/mission_pages/nustar/main/...
7 NaN Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2... bg,en-us;q=0.7,en;q=0.3 None NaN wcndER NaN zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs NaN 0.0 http://www.facebook.com/ 1.331923e+09 http://www.nasa.gov/mission_pages/nustar/main/...
8 NaN Opera/9.80 (X11; Linux zbov; U; en) Presto/2.1... en-US, en None NaN wcndER NaN zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs NaN 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.331923e+09 http://www.nasa.gov/mission_pages/nustar/main/...
9 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4 None NaN zCaLwp NaN zUtuOu 1.331923e+09 1.usa.gov NaN alelex88 NaN 0.0 http://t.co/o1Pd0WeV 1.331923e+09 http://apod.nasa.gov/apod/ap120312.html
10 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... en-us,en;q=0.5 US Seattle vNJS4H WA u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [47.5951, -122.332603] 1.0 direct 1.331923e+09 America/Los_Angeles https://www.nysdot.gov/rexdesign/design/commun...
11 NaN Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4... en-us,en;q=0.5 US Washington wG7OIH DC A0nRz4 1.331816e+09 1.usa.gov NaN darrellissa [38.937599, -77.092796] 0.0 http://t.co/ND7SoPyo 1.331923e+09 America/New_York http://oversight.house.gov/wp-content/uploads/...
12 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... en-us,en;q=0.5 US Alexandria vNJS4H VA u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [38.790901, -77.094704] 1.0 direct 1.331923e+09 America/New_York https://www.nysdot.gov/rexdesign/design/commun...
13 1.331923e+09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14 NaN Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US... en-us,en;q=0.5 US Marietta 2rOUYc GA 2rOUYc 1.255770e+09 1.usa.gov NaN bitly [33.953201, -84.5177] 1.0 direct 1.331923e+09 America/New_York http://toxtown.nlm.nih.gov/index.php
15 NaN Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4 HK Central District nQvgJp 00 rtrrth 1.317318e+09 j.mp NaN walkeryuen [22.2833, 114.150002] 1.0 http://forum2.hkgolden.com/view.aspx?type=BW&m... 1.331923e+09 Asia/Hong_Kong http://www.ssd.noaa.gov/PS/TROP/TCFP/data/curr...
16 NaN Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4 HK Central District XdUNr 00 qWkgbq 1.317318e+09 j.mp NaN walkeryuen [22.2833, 114.150002] 1.0 http://forum2.hkgolden.com/view.aspx?type=BW&m... 1.331923e+09 Asia/Hong_Kong http://www.usno.navy.mil/NOOC/nmfc-ph/RSS/jtwc...
17 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; r... en-us,en;q=0.5 US Buckfield zH1BFf ME x3jOIv 1.331840e+09 1.usa.gov NaN andyzieminski [44.299702, -70.369797] 0.0 http://t.co/6Cx4ROLs 1.331923e+09 America/New_York http://www.usda.gov/wps/portal/usda/usdahome?c...
18 NaN GoogleMaps/RochesterNY NaN US Provo mwszkS UT mwszkS 1.308262e+09 1.usa.gov NaN bitly [40.218102, -111.613297] 0.0 http://www.AwareMap.com/ 1.331923e+09 America/Denver http://www.monroecounty.gov/etc/911/rss.php
19 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4 IT Venice wcndER 20 zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [45.438599, 12.3267] 0.0 http://www.facebook.com/ 1.331923e+09 Europe/Rome http://www.nasa.gov/mission_pages/nustar/main/...
20 NaN Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... es-ES ES Alcal zQ95Hi 51 ytZYWR 1.331671e+09 bitly.com NaN jplnews [37.516701, -5.9833] 0.0 http://www.facebook.com/ 1.331923e+09 Africa/Ceuta http://voyager.jpl.nasa.gov/imagesvideo/uranus...
21 NaN Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6... en-us,en;q=0.5 US Davidsonville wcndER MD zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [38.939201, -76.635002] 0.0 http://www.facebook.com/ 1.331923e+09 America/New_York http://www.nasa.gov/mission_pages/nustar/main/...
22 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us US Hockessin y3ZImz DE y3ZImz 1.331064e+09 1.usa.gov NaN bitly [39.785, -75.682297] 0.0 direct 1.331923e+09 America/New_York http://portal.hud.gov/hudportal/documents/hudd...
23 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3)... en-us US Lititz wWiOiD PA wWiOiD 1.330218e+09 1.usa.gov NaN bitly [40.174999, -76.3078] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.331923e+09 America/New_York http://www.tricare.mil/mybenefit/ProfileFilter...
24 NaN Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES... es-es,es;q=0.8,en-us;q=0.5,en;q=0.3 ES Bilbao wcndER 59 zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [43.25, -2.9667] 0.0 http://www.facebook.com/ 1.331923e+09 Europe/Madrid http://www.nasa.gov/mission_pages/nustar/main/...
25 NaN Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... en-GB,en;q=0.8,en-US;q=0.6,en-AU;q=0.4 MY Kuala Lumpur wcndER 14 zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [3.1667, 101.699997] 0.0 http://www.facebook.com/ 1.331923e+09 Asia/Kuala_Lumpur http://www.nasa.gov/mission_pages/nustar/main/...
26 NaN Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... ro-RO,ro;q=0.8,en-US;q=0.6,en;q=0.4 CY Nicosia wcndER 04 zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [35.166698, 33.366699] 0.0 http://www.facebook.com/?ref=tn_tnmn 1.331923e+09 Asia/Nicosia http://www.nasa.gov/mission_pages/nustar/main/...
27 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... en-US,en;q=0.8 BR SPaulo zCaLwp 27 zUtuOu 1.331923e+09 1.usa.gov NaN alelex88 [-23.5333, -46.616699] 0.0 direct 1.331923e+09 America/Sao_Paulo http://apod.nasa.gov/apod/ap120312.html
28 NaN Mozilla/5.0 (iPad; CPU OS 5_0_1 like Mac OS X)... en-us None NaN vNJS4H NaN u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa NaN 0.0 direct 1.331923e+09 https://www.nysdot.gov/rexdesign/design/commun...
29 NaN Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X... en-us None NaN FPX0IM NaN FPX0IL 1.331923e+09 1.usa.gov NaN twittershare NaN 1.0 http://t.co/5xlp0B34 1.331923e+09 http://www.ed.gov/news/media-advisories/us-dep...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3530 NaN Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1... en-US,en;q=0.8 US San Francisco xVZg4P CA wqUkTo 1.331908e+09 go.nasa.gov NaN nasatwitter [37.7645, -122.429398] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2Fg... 1.331927e+09 America/Los_Angeles http://www.nasa.gov/multimedia/imagegallery/im...
3531 NaN Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6... en-US None NaN wcndER NaN zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs NaN 0.0 direct 1.331927e+09 http://www.nasa.gov/mission_pages/nustar/main/...
3532 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... en-us,en;q=0.5 US Washington Au3aUS DC A9ct6C 1.331926e+09 1.usa.gov NaN ncsha [38.904202, -77.031998] 1.0 http://www.ncsha.org/ 1.331927e+09 America/New_York http://portal.hud.gov/hudportal/HUD?src=/press...
3533 NaN Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) A... en-us US Jacksonville b2UtUJ FL ieCdgH 1.301393e+09 go.nasa.gov NaN nasatwitter [30.279301, -81.585098] 1.0 direct 1.331927e+09 America/New_York http://apod.nasa.gov/apod/
3534 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... en-us US Frisco vNJS4H TX u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [33.149899, -96.855499] 1.0 direct 1.331927e+09 America/Chicago https://www.nysdot.gov/rexdesign/design/commun...
3535 NaN Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/... en-us US Houston zIgLx8 TX yrPaLt 1.331903e+09 aash.to NaN aashto [29.775499, -95.415199] 1.0 direct 1.331927e+09 America/Chicago http://ntl.bts.gov/lib/44000/44300/44374/FHWA-...
3536 NaN Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; e... en-US,en;q=0.5 None NaN xIcyim NaN yG1TTf 1.331728e+09 go.nasa.gov NaN nasatwitter NaN 0.0 http://t.co/g1VKE8zS 1.331927e+09 http://www.nasa.gov/mission_pages/hurricanes/a...
3537 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... es-es,es;q=0.8,en-us;q=0.5,en;q=0.3 HN Tegucigalpa zCaLwp 08 w63FZW 1.331547e+09 1.usa.gov NaN bufferapp [14.1, -87.216698] 0.0 http://t.co/A8TJyibE 1.331927e+09 America/Tegucigalpa http://apod.nasa.gov/apod/ap120312.html
3538 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma... en-us US Los Angeles qMac9k CA qds1Ge 1.310474e+09 1.usa.gov NaN healthypeople [34.041599, -118.298798] 0.0 direct 1.331927e+09 America/Los_Angeles http://healthypeople.gov/2020/connect/webinars...
3539 NaN Mozilla/5.0 (compatible; Fedora Core 3) FC3 KDE NaN US Bellevue zu2M5o WA zDhdro 1.331586e+09 bit.ly NaN glimtwin [47.615398, -122.210297] 0.0 direct 1.331927e+09 America/Los_Angeles http://www.federalreserve.gov/newsevents/press...
3540 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Payson wcndER UT zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs [40.014198, -111.738899] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.331927e+09 America/Denver http://www.nasa.gov/mission_pages/nustar/main/...
3541 NaN Mozilla/5.0 (X11; U; OpenVMS AlphaServer_ES40;... NaN US Bellevue zu2M5o WA zDhdro 1.331586e+09 1.usa.gov NaN glimtwin [47.615398, -122.210297] 0.0 direct 1.331927e+09 America/Los_Angeles http://www.federalreserve.gov/newsevents/press...
3542 NaN Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... en-us US Pittsburg y3reI1 CA y3reI1 1.331926e+09 1.usa.gov NaN bitly [38.0051, -121.838699] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.331927e+09 America/Los_Angeles http://www.sba.gov/community/blogs/community-b...
3543 1.331927e+09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3544 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0.1) ... en-us,en;q=0.5 US Wentzville vNJS4H MO u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [38.790001, -90.854897] 1.0 direct 1.331927e+09 America/Chicago https://www.nysdot.gov/rexdesign/design/commun...
3545 NaN Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... en-us,en;q=0.5 US Saint Charles vNJS4H IL u0uD9q 1.319564e+09 1.usa.gov NaN o_4us71ccioa [41.9352, -88.290901] 1.0 direct 1.331927e+09 America/Chicago https://www.nysdot.gov/rexdesign/design/commun...
3546 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma... en-us US Los Angeles qMac9k CA qds1Ge 1.310474e+09 1.usa.gov NaN healthypeople [34.041599, -118.298798] 1.0 direct 1.331927e+09 America/Los_Angeles http://healthypeople.gov/2020/connect/webinars...
3547 NaN Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... en-us US Silver Spring y0jYkg MD y0jYkg 1.331852e+09 1.usa.gov NaN bitly [39.052101, -77.014999] 1.0 direct 1.331927e+09 America/New_York http://www.epa.gov/otaq/regs/fuels/additive/e1...
3548 NaN Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma... en-us US Mcgehee y5rMac AR xANY6O 1.331916e+09 1.usa.gov NaN twitterfeed [33.628399, -91.356903] 1.0 https://twitter.com/fdarecalls/status/18069759... 1.331927e+09 America/Chicago http://www.fda.gov/Safety/Recalls/ucm296326.htm
3549 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... sv-SE,sv;q=0.8,en-US;q=0.6,en;q=0.4 SE Sollefte eH8wu 24 7dtjei 1.260316e+09 1.usa.gov NaN tweetdeckapi [63.166698, 17.266701] 1.0 direct 1.331927e+09 Europe/Stockholm http://www.nasa.gov/mission_pages/WISE/main/in...
3550 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-us US Conshohocken A00b72 PA yGSwzn 1.331918e+09 1.usa.gov NaN addthis [40.0798, -75.2855] 0.0 http://www.linkedin.com/home?trk=hb_tab_home_top 1.331927e+09 America/New_York http://www.nlm.nih.gov/medlineplus/news/fullst...
3551 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 None NaN wcndER NaN zkpJBR 1.331923e+09 1.usa.gov NaN bnjacobs NaN 0.0 http://plus.url.google.com/url?sa=z&n=13319268... 1.331927e+09 http://www.nasa.gov/mission_pages/nustar/main/...
3552 NaN Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US... NaN US Decatur rqgJuE AL xcz8vt 1.331227e+09 1.usa.gov NaN bootsnall [34.572701, -86.940598] 0.0 direct 1.331927e+09 America/Chicago http://travel.state.gov/passport/passport_5535...
3553 NaN Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... en-us US Shrewsbury 9b6kNl MA 9b6kNl 1.273672e+09 bit.ly NaN bitly [42.286499, -71.714699] 0.0 http://www.shrewsbury-ma.gov/selco/ 1.331927e+09 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341...
3554 NaN Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... en-us US Shrewsbury axNK8c MA axNK8c 1.273673e+09 bit.ly NaN bitly [42.286499, -71.714699] 0.0 http://www.shrewsbury-ma.gov/selco/ 1.331927e+09 America/New_York http://www.shrewsbury-ma.gov/egov/gallery/1341...
3555 NaN Mozilla/4.0 (compatible; MSIE 9.0; Windows NT ... en US Paramus e5SvKE NJ fqPSr9 1.301298e+09 1.usa.gov NaN tweetdeckapi [40.9445, -74.07] 1.0 direct 1.331927e+09 America/New_York http://www.fda.gov/AdvisoryCommittees/Committe...
3556 NaN Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1... en-US,en;q=0.8 US Oklahoma City jQLtP4 OK jQLtP4 1.307530e+09 1.usa.gov NaN bitly [35.4715, -97.518997] 0.0 http://www.facebook.com/l.php?u=http%3A%2F%2F1... 1.331927e+09 America/Chicago http://www.okc.gov/PublicNotificationSystem/Fo...
3557 NaN GoogleMaps/RochesterNY NaN US Provo mwszkS UT mwszkS 1.308262e+09 j.mp NaN bitly [40.218102, -111.613297] 0.0 http://www.AwareMap.com/ 1.331927e+09 America/Denver http://www.monroecounty.gov/etc/911/rss.php
3558 NaN GoogleProducer NaN US Mountain View zjtI4X CA zjtI4X 1.327529e+09 1.usa.gov NaN bitly [37.419201, -122.057404] 0.0 direct 1.331927e+09 America/Los_Angeles http://www.ahrq.gov/qual/qitoolkit/
3559 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-US US Mc Lean qxKrTK VA qxKrTK 1.312898e+09 1.usa.gov NaN bitly [38.935799, -77.162102] 0.0 http://t.co/OEEEvwjU 1.331927e+09 America/New_York http://herndon-va.gov/Content/public_safety/Pu...

3560 rows × 18 columns


In [25]:
frame['tz'][:10]


Out[25]:
0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object

In [26]:
tz_counts = frame['tz'].value_counts()
tz_counts[:10]


Out[26]:
America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Name: tz, dtype: int64

In [27]:
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]


Out[27]:
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64

In [28]:
plt.figure(figsize=(10, 4))


Out[28]:
<matplotlib.figure.Figure at 0x103a42bd0>
<matplotlib.figure.Figure at 0x103a42bd0>

In [29]:
tz_counts[:10].plot(kind='barh', rot=0)


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ee1da90>

In [30]:
frame['a'][1]


Out[30]:
u'GoogleMaps/RochesterNY'

In [31]:
frame['a'][50]


Out[31]:
u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'

In [32]:
frame['a'][51]


Out[32]:
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'

In [33]:
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]


Out[33]:
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object

In [34]:
results.value_counts()[:8]


Out[34]:
Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64

In [35]:
cframe = frame[frame.a.notnull()]

In [36]:
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
operating_system[:5]


Out[36]:
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], 
      dtype='|S11')

In [37]:
by_tz_os = cframe.groupby(['tz', operating_system])

In [38]:
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]


Out[38]:
Not Windows Windows
tz
245.0 276.0
Africa/Cairo 0.0 3.0
Africa/Casablanca 0.0 1.0
Africa/Ceuta 0.0 2.0
Africa/Johannesburg 0.0 1.0
Africa/Lusaka 0.0 1.0
America/Anchorage 4.0 1.0
America/Argentina/Buenos_Aires 1.0 0.0
America/Argentina/Cordoba 0.0 1.0
America/Argentina/Mendoza 0.0 1.0

In [39]:
# Use to sort in ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]


Out[39]:
tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
dtype: int64

In [40]:
count_subset = agg_counts.take(indexer)[-10:]
count_subset


Out[40]:
Not Windows Windows
tz
America/Sao_Paulo 13.0 20.0
Europe/Madrid 16.0 19.0
Pacific/Honolulu 0.0 36.0
Asia/Tokyo 2.0 35.0
Europe/London 43.0 31.0
America/Denver 132.0 59.0
America/Los_Angeles 130.0 252.0
America/Chicago 115.0 285.0
245.0 276.0
America/New_York 339.0 912.0

In [41]:
plt.figure()


Out[41]:
<matplotlib.figure.Figure at 0x103f923d0>
<matplotlib.figure.Figure at 0x103f923d0>

In [42]:
count_subset.plot(kind='barh', stacked=True)


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x103fea950>

In [43]:
plt.figure()


Out[43]:
<matplotlib.figure.Figure at 0x104371fd0>
<matplotlib.figure.Figure at 0x104371fd0>

In [44]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x10413ee90>

MovieLens 1M data set


In [45]:
import pandas as pd
import os
encoding = 'latin1'

upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

users = pd.read_csv(upath, sep='::', header=None, names=unames, encoding=encoding)
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames, encoding=encoding)
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames, encoding=encoding)


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:13: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:14: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:15: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.

In [46]:
users[:5]


Out[46]:
user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455

In [ ]:
ratings[:5]

In [ ]:
movies[:5]

In [ ]:
ratings

In [ ]:
data = pd.merge(pd.merge(ratings, users), movies)
data

In [ ]:
data.ix[0]

In [ ]:
mean_ratings = data.pivot_table('rating', index='title',
                                columns='gender', aggfunc='mean')
mean_ratings[:5]

In [ ]:
ratings_by_title = data.groupby('title').size()

In [ ]:
ratings_by_title[:5]

In [ ]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]

In [ ]:
active_titles[:10]

In [ ]:
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings

In [ ]:
mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':
                           'Seven Samurai (Shichinin no samurai) (1954)'})

In [ ]:
top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)
top_female_ratings[:10]

Measuring rating disagreement


In [ ]:
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

In [ ]:
sorted_by_diff = mean_ratings.sort_index(by='diff')
sorted_by_diff[:15]

In [ ]:
# Reverse order of rows, take first 15 rows
sorted_by_diff[::-1][:15]

In [ ]:
# Standard deviation of rating grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()
# Filter down to active_titles
rating_std_by_title = rating_std_by_title.ix[active_titles]
# Order Series by value in descending order
rating_std_by_title.order(ascending=False)[:10]

US Baby Names 1880-2010


In [ ]:
from __future__ import division
from numpy.random import randn
import numpy as np
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(12, 5))
np.set_printoptions(precision=4)
%pwd

In [ ]:
!head -n 10 ch02/names/yob1880.txt

In [ ]:
import pandas as pd
names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])
names1880

In [ ]:
names1880.groupby('sex').births.sum()

In [ ]:
# 2010 is the last available year right now
years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = 'ch02/names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)

    frame['year'] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

In [ ]:
total_births = names.pivot_table('births', index='year',
                                 columns='sex', aggfunc=sum)

In [ ]:
total_births.tail()

In [ ]:
total_births.plot(title='Total births by sex and year')

In [ ]:
def add_prop(group):
    # Integer division floors
    births = group.births.astype(float)

    group['prop'] = births / births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)

In [ ]:
names

In [ ]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

In [ ]:
def get_top1000(group):
    return group.sort_index(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

In [ ]:
pieces = []
for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_index(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index=True)

In [ ]:
top1000.index = np.arange(len(top1000))

In [ ]:
top1000

In [ ]:
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

In [ ]:
total_births = top1000.pivot_table('births', index='year', columns='name',
                                   aggfunc=sum)
total_births

In [ ]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title="Number of births per year")

Measuring the increase in naming diversity


In [ ]:
plt.figure()

In [ ]:
table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

In [ ]:
df = boys[boys.year == 2010]
df

In [ ]:
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]

In [ ]:
prop_cumsum.values.searchsorted(0.5)

In [ ]:
df = boys[boys.year == 1900]
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1

In [ ]:
def get_quantile_count(group, q=0.5):
    group = group.sort_index(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')

In [ ]:
def get_quantile_count(group, q=0.5):
    group = group.sort_index(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()

In [ ]:
diversity.plot(title="Number of popular names in top 50%")

The "Last letter" Revolution


In [ ]:
# extract last letter from name column
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc=sum)

In [ ]:
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

In [ ]:
subtable.sum()

In [ ]:
letter_prop = subtable / subtable.sum().astype(float)

In [ ]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)

In [ ]:
plt.subplots_adjust(hspace=0.25)

In [ ]:
letter_prop = table / table.sum().astype(float)

dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()

In [ ]:
plt.close('all')

In [ ]:
dny_ts.plot()

Boy names that became girl names (and vice versa)


In [ ]:
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like

In [ ]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()

In [ ]:
table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()

In [ ]:
plt.close('all')

In [ ]:
table.plot(style={'M': 'k-', 'F': 'k--'})