In [39]:
import pandas
plot(arange(10))


Out[39]:
[<matplotlib.lines.Line2D at 0x4dd4f90>]
Out[39]:
[<matplotlib.lines.Line2D at 0x6c675b0>]

In [40]:
import pandas
plot(arange(100))


Out[40]:
[<matplotlib.lines.Line2D at 0x4fc0250>]
Out[40]:
[<matplotlib.lines.Line2D at 0x6df7c90>]

In [41]:
plot(arange(50))


Out[41]:
[<matplotlib.lines.Line2D at 0x5183e50>]
Out[41]:
[<matplotlib.lines.Line2D at 0x6e54610>]

In [42]:
a=rand(100)
plot(a)


Out[42]:
[<matplotlib.lines.Line2D at 0x52de870>]
Out[42]:
[<matplotlib.lines.Line2D at 0x7044050>]

In [43]:
#usagov_bitly_data_path='F:\syn\git\pydata-book\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
usagov_bitly_data_path='E:\git\pydata-book\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
f=open(usagov_bitly_data_path)
f.readline()


Out[43]:
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
Out[43]:
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [44]:
import json
records=[json.loads(line) for line in f]

In [45]:
records[0]


Out[45]:
{u'a': u'GoogleMaps/RochesterNY',
 u'c': u'US',
 u'cy': u'Provo',
 u'g': u'mwszkS',
 u'gr': u'UT',
 u'h': u'mwszkS',
 u'hc': 1308262393,
 u'hh': u'j.mp',
 u'l': u'bitly',
 u'll': [40.218102, -111.613297],
 u'nk': 0,
 u'r': u'http://www.AwareMap.com/',
 u't': 1331923249,
 u'tz': u'America/Denver',
 u'u': u'http://www.monroecounty.gov/etc/911/rss.php'}
Out[45]:
{u'a': u'GoogleMaps/RochesterNY',
 u'c': u'US',
 u'cy': u'Provo',
 u'g': u'mwszkS',
 u'gr': u'UT',
 u'h': u'mwszkS',
 u'hc': 1308262393,
 u'hh': u'j.mp',
 u'l': u'bitly',
 u'll': [40.218102, -111.613297],
 u'nk': 0,
 u'r': u'http://www.AwareMap.com/',
 u't': 1331923249,
 u'tz': u'America/Denver',
 u'u': u'http://www.monroecounty.gov/etc/911/rss.php'}

In [46]:
records[0]['tz']


Out[46]:
u'America/Denver'
Out[46]:
u'America/Denver'

In [47]:
print records[0]['tz']


America/DenverAmerica/Denver

In [48]:
print records[0]['nk']


00

In [49]:
time_zones=[rec['tz'] for rec in records if 'tz' in rec]

In [50]:
time_zones[:10]


Out[50]:
[u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'',
 u'America/Los_Angeles']
Out[50]:
[u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'',
 u'America/Los_Angeles']

In [51]:
len(time_zones)


Out[51]:
3439
Out[51]:
3439

In [52]:
records[:2]


Out[52]:
[{u'a': u'GoogleMaps/RochesterNY',
  u'c': u'US',
  u'cy': u'Provo',
  u'g': u'mwszkS',
  u'gr': u'UT',
  u'h': u'mwszkS',
  u'hc': 1308262393,
  u'hh': u'j.mp',
  u'l': u'bitly',
  u'll': [40.218102, -111.613297],
  u'nk': 0,
  u'r': u'http://www.AwareMap.com/',
  u't': 1331923249,
  u'tz': u'America/Denver',
  u'u': u'http://www.monroecounty.gov/etc/911/rss.php'},
 {u'a': u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)',
  u'al': u'en-US',
  u'c': u'US',
  u'cy': u'Washington',
  u'g': u'xxr3Qb',
  u'gr': u'DC',
  u'h': u'xxr3Qb',
  u'hc': 1331919941,
  u'hh': u'1.usa.gov',
  u'l': u'bitly',
  u'll': [38.9007, -77.043098],
  u'nk': 1,
  u'r': u'http://t.co/03elZC4Q',
  u't': 1331923250,
  u'tz': u'America/New_York',
  u'u': u'http://boxer.senate.gov/en/press/releases/031612.cfm'}]
Out[52]:
[{u'a': u'GoogleMaps/RochesterNY',
  u'c': u'US',
  u'cy': u'Provo',
  u'g': u'mwszkS',
  u'gr': u'UT',
  u'h': u'mwszkS',
  u'hc': 1308262393,
  u'hh': u'j.mp',
  u'l': u'bitly',
  u'll': [40.218102, -111.613297],
  u'nk': 0,
  u'r': u'http://www.AwareMap.com/',
  u't': 1331923249,
  u'tz': u'America/Denver',
  u'u': u'http://www.monroecounty.gov/etc/911/rss.php'},
 {u'a': u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)',
  u'al': u'en-US',
  u'c': u'US',
  u'cy': u'Washington',
  u'g': u'xxr3Qb',
  u'gr': u'DC',
  u'h': u'xxr3Qb',
  u'hc': 1331919941,
  u'hh': u'1.usa.gov',
  u'l': u'bitly',
  u'll': [38.9007, -77.043098],
  u'nk': 1,
  u'r': u'http://t.co/03elZC4Q',
  u't': 1331923250,
  u'tz': u'America/New_York',
  u'u': u'http://boxer.senate.gov/en/press/releases/031612.cfm'}]

In [53]:
def get_counts(sequence):
    counts={}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

In [54]:
get_counts(time_zones)['Africa/Cairo']


Out[54]:
3
Out[54]:
3

In [55]:
from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int) #所有的值均会初始化为0
    for x in sequence:
        counts[x] += 1
    return counts

In [56]:
counts = get_counts2(time_zones)

In [57]:
counts['America/New_York']


Out[57]:
1250
Out[57]:
1250

In [58]:
len(time_zones)


Out[58]:
3439
Out[58]:
3439

In [59]:
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz,count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

In [60]:
top_counts(counts)


Out[60]:
[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1250, u'America/New_York')]
Out[60]:
[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1250, u'America/New_York')]

In [61]:
from collections import Counter
counts2=Counter(time_zones)
counts2.most_common(10)


Out[61]:
[(u'America/New_York', 1250),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]
Out[61]:
[(u'America/New_York', 1250),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]

In [62]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
frame = DataFrame(records)
frame


Out[62]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3559 entries, 0 to 3558
Data columns:
_heartbeat_    120  non-null values
a              3439  non-null values
al             3093  non-null values
c              2918  non-null values
cy             2918  non-null values
g              3439  non-null values
gr             2918  non-null values
h              3439  non-null values
hc             3439  non-null values
hh             3439  non-null values
kw             93  non-null values
l              3439  non-null values
ll             2918  non-null values
nk             3439  non-null values
r              3439  non-null values
t              3439  non-null values
tz             3439  non-null values
u              3439  non-null values
dtypes: float64(4), object(14)
Out[62]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3559 entries, 0 to 3558
Data columns:
_heartbeat_    120  non-null values
a              3439  non-null values
al             3093  non-null values
c              2918  non-null values
cy             2918  non-null values
g              3439  non-null values
gr             2918  non-null values
h              3439  non-null values
hc             3439  non-null values
hh             3439  non-null values
kw             93  non-null values
l              3439  non-null values
ll             2918  non-null values
nk             3439  non-null values
r              3439  non-null values
t              3439  non-null values
tz             3439  non-null values
u              3439  non-null values
dtypes: float64(4), object(14)

In [63]:
frame['tz'][:10]


Out[63]:
0         America/Denver
1       America/New_York
2      America/Sao_Paulo
3       America/New_York
4       America/New_York
5          Europe/Warsaw
6                       
7                       
8                       
9    America/Los_Angeles
Name: tz
Out[63]:
0         America/Denver
1       America/New_York
2      America/Sao_Paulo
3       America/New_York
4       America/New_York
5          Europe/Warsaw
6                       
7                       
8                       
9    America/Los_Angeles
Name: tz

In [64]:
tz_counts = frame['tz'].value_counts()

In [65]:
tz_counts[:10]


Out[65]:
America/New_York       1250
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Out[65]:
America/New_York       1250
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33

In [66]:
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

tz_counts[:10]


Out[66]:
America/New_York       1250
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Out[66]:
America/New_York       1250
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35

In [67]:
tz_counts[:10].plot(kind='barh', rot=0)


Out[67]:
<matplotlib.axes.AxesSubplot at 0x6944cd0>
Out[67]:
<matplotlib.axes.AxesSubplot at 0x6f87930>

In [68]:
frame['a'][1]


Out[68]:
u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)'
Out[68]:
u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)'

In [69]:
frame['a'][50]


Out[69]:
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
Out[69]:
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'

In [70]:
frame['a'][51]


Out[70]:
u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11'
Out[70]:
u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11'

In [71]:
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]


Out[71]:
0    GoogleMaps/RochesterNY
1               Mozilla/4.0
2               Mozilla/5.0
3               Mozilla/5.0
4               Mozilla/5.0
Out[71]:
0    GoogleMaps/RochesterNY
1               Mozilla/4.0
2               Mozilla/5.0
3               Mozilla/5.0
4               Mozilla/5.0

In [72]:
results.value_counts()[:8]


Out[72]:
Mozilla/5.0                 2593
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
Out[72]:
Mozilla/5.0                 2593
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4

In [73]:
cframe = frame[frame.a.notnull()]
operting_system = np.where(cframe['a'].str.contains('Windows'), 'Windows',
 'Not Windows')

In [74]:
operting_system[:5]


Out[74]:
0    Not Windows
1        Windows
2    Not Windows
3        Windows
4        Windows
Name: a
Out[74]:
0    Not Windows
1        Windows
2    Not Windows
3        Windows
4        Windows
Name: a

In [76]:
by_tz_os = cframe.groupby(['tz', operting_system])

In [77]:
agg_counts = by_tz_os.size().unstack().fillna(0)

In [78]:
agg_counts[:10]


Out[78]:
a Not Windows Windows
tz
245 276
Africa/Cairo 0 3
Africa/Casablanca 0 1
Africa/Ceuta 0 2
Africa/Johannesburg 0 1
Africa/Lusaka 0 1
America/Anchorage 4 1
America/Argentina/Buenos_Aires 1 0
America/Argentina/Cordoba 0 1
America/Argentina/Mendoza 0 1

In [79]:
indexer = agg_counts.sum(1).argsort()
indexer[:10]


Out[79]:
tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55

In [82]:
count_subset = agg_counts.take(indexer)[-10:]
count_subset


Out[82]:
a Not Windows Windows
tz
America/Sao_Paulo 13 20
Europe/Madrid 16 19
Pacific/Honolulu 0 36
Asia/Tokyo 2 35
Europe/London 43 31
America/Denver 132 59
America/Los_Angeles 130 252
America/Chicago 115 285
245 276
America/New_York 339 911

In [84]:
count_subset.plot(kind='barh', stacked=True)


Out[84]:
<matplotlib.axes.AxesSubplot at 0x7835950>

In [85]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)


Out[85]:
<matplotlib.axes.AxesSubplot at 0x7bcc330>

In [ ]: