notebook.community

Edit and run



In [39]:

    
import pandas
plot(arange(10))









    Out[39]:





[<matplotlib.lines.Line2D at 0x4dd4f90>]






    












    Out[39]:





[<matplotlib.lines.Line2D at 0x6c675b0>]



In [40]:

    
import pandas
plot(arange(100))









    Out[40]:





[<matplotlib.lines.Line2D at 0x4fc0250>]






    












    Out[40]:





[<matplotlib.lines.Line2D at 0x6df7c90>]



In [41]:

    
plot(arange(50))









    Out[41]:





[<matplotlib.lines.Line2D at 0x5183e50>]






    












    Out[41]:





[<matplotlib.lines.Line2D at 0x6e54610>]



In [42]:

    
a=rand(100)
plot(a)









    Out[42]:





[<matplotlib.lines.Line2D at 0x52de870>]






    












    Out[42]:





[<matplotlib.lines.Line2D at 0x7044050>]



In [43]:

    
#usagov_bitly_data_path='F:\syn\git\pydata-book\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
usagov_bitly_data_path='E:\git\pydata-book\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
f=open(usagov_bitly_data_path)
f.readline()









    Out[43]:





'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'






    Out[43]:





'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'



In [44]:

    
import json
records=[json.loads(line) for line in f]



In [45]:

    
records[0]









    Out[45]:





{u'a': u'GoogleMaps/RochesterNY',
 u'c': u'US',
 u'cy': u'Provo',
 u'g': u'mwszkS',
 u'gr': u'UT',
 u'h': u'mwszkS',
 u'hc': 1308262393,
 u'hh': u'j.mp',
 u'l': u'bitly',
 u'll': [40.218102, -111.613297],
 u'nk': 0,
 u'r': u'http://www.AwareMap.com/',
 u't': 1331923249,
 u'tz': u'America/Denver',
 u'u': u'http://www.monroecounty.gov/etc/911/rss.php'}






    Out[45]:





{u'a': u'GoogleMaps/RochesterNY',
 u'c': u'US',
 u'cy': u'Provo',
 u'g': u'mwszkS',
 u'gr': u'UT',
 u'h': u'mwszkS',
 u'hc': 1308262393,
 u'hh': u'j.mp',
 u'l': u'bitly',
 u'll': [40.218102, -111.613297],
 u'nk': 0,
 u'r': u'http://www.AwareMap.com/',
 u't': 1331923249,
 u'tz': u'America/Denver',
 u'u': u'http://www.monroecounty.gov/etc/911/rss.php'}



In [46]:

    
records[0]['tz']









    Out[46]:





u'America/Denver'






    Out[46]:





u'America/Denver'



In [47]:

    
print records[0]['tz']









    



America/DenverAmerica/Denver



In [48]:

    
print records[0]['nk']



In [49]:

    
time_zones=[rec['tz'] for rec in records if 'tz' in rec]



In [50]:

    
time_zones[:10]









    Out[50]:





[u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'',
 u'America/Los_Angeles']






    Out[50]:





[u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'',
 u'America/Los_Angeles']



In [51]:

    
len(time_zones)









    Out[51]:





3439






    Out[51]:





3439



In [52]:

    
records[:2]









    Out[52]:





[{u'a': u'GoogleMaps/RochesterNY',
  u'c': u'US',
  u'cy': u'Provo',
  u'g': u'mwszkS',
  u'gr': u'UT',
  u'h': u'mwszkS',
  u'hc': 1308262393,
  u'hh': u'j.mp',
  u'l': u'bitly',
  u'll': [40.218102, -111.613297],
  u'nk': 0,
  u'r': u'http://www.AwareMap.com/',
  u't': 1331923249,
  u'tz': u'America/Denver',
  u'u': u'http://www.monroecounty.gov/etc/911/rss.php'},
 {u'a': u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)',
  u'al': u'en-US',
  u'c': u'US',
  u'cy': u'Washington',
  u'g': u'xxr3Qb',
  u'gr': u'DC',
  u'h': u'xxr3Qb',
  u'hc': 1331919941,
  u'hh': u'1.usa.gov',
  u'l': u'bitly',
  u'll': [38.9007, -77.043098],
  u'nk': 1,
  u'r': u'http://t.co/03elZC4Q',
  u't': 1331923250,
  u'tz': u'America/New_York',
  u'u': u'http://boxer.senate.gov/en/press/releases/031612.cfm'}]






    Out[52]:





[{u'a': u'GoogleMaps/RochesterNY',
  u'c': u'US',
  u'cy': u'Provo',
  u'g': u'mwszkS',
  u'gr': u'UT',
  u'h': u'mwszkS',
  u'hc': 1308262393,
  u'hh': u'j.mp',
  u'l': u'bitly',
  u'll': [40.218102, -111.613297],
  u'nk': 0,
  u'r': u'http://www.AwareMap.com/',
  u't': 1331923249,
  u'tz': u'America/Denver',
  u'u': u'http://www.monroecounty.gov/etc/911/rss.php'},
 {u'a': u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)',
  u'al': u'en-US',
  u'c': u'US',
  u'cy': u'Washington',
  u'g': u'xxr3Qb',
  u'gr': u'DC',
  u'h': u'xxr3Qb',
  u'hc': 1331919941,
  u'hh': u'1.usa.gov',
  u'l': u'bitly',
  u'll': [38.9007, -77.043098],
  u'nk': 1,
  u'r': u'http://t.co/03elZC4Q',
  u't': 1331923250,
  u'tz': u'America/New_York',
  u'u': u'http://boxer.senate.gov/en/press/releases/031612.cfm'}]



In [53]:

    
def get_counts(sequence):
    counts={}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts



In [54]:

    
get_counts(time_zones)['Africa/Cairo']









    Out[54]:





3






    Out[54]:





3



In [55]:

    
from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int) #所有的值均会初始化为0
    for x in sequence:
        counts[x] += 1
    return counts



In [56]:

    
counts = get_counts2(time_zones)



In [57]:

    
counts['America/New_York']









    Out[57]:





1250






    Out[57]:





1250



In [58]:

    
len(time_zones)









    Out[58]:





3439






    Out[58]:





3439



In [59]:

    
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz,count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]



In [60]:

    
top_counts(counts)









    Out[60]:





[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1250, u'America/New_York')]






    Out[60]:





[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1250, u'America/New_York')]



In [61]:

    
from collections import Counter
counts2=Counter(time_zones)
counts2.most_common(10)









    Out[61]:





[(u'America/New_York', 1250),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]






    Out[61]:





[(u'America/New_York', 1250),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]



In [62]:

    
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
frame = DataFrame(records)
frame









    Out[62]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 3559 entries, 0 to 3558
Data columns:
_heartbeat_    120  non-null values
a              3439  non-null values
al             3093  non-null values
c              2918  non-null values
cy             2918  non-null values
g              3439  non-null values
gr             2918  non-null values
h              3439  non-null values
hc             3439  non-null values
hh             3439  non-null values
kw             93  non-null values
l              3439  non-null values
ll             2918  non-null values
nk             3439  non-null values
r              3439  non-null values
t              3439  non-null values
tz             3439  non-null values
u              3439  non-null values
dtypes: float64(4), object(14)






    Out[62]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 3559 entries, 0 to 3558
Data columns:
_heartbeat_    120  non-null values
a              3439  non-null values
al             3093  non-null values
c              2918  non-null values
cy             2918  non-null values
g              3439  non-null values
gr             2918  non-null values
h              3439  non-null values
hc             3439  non-null values
hh             3439  non-null values
kw             93  non-null values
l              3439  non-null values
ll             2918  non-null values
nk             3439  non-null values
r              3439  non-null values
t              3439  non-null values
tz             3439  non-null values
u              3439  non-null values
dtypes: float64(4), object(14)



In [63]:

    
frame['tz'][:10]









    Out[63]:





0         America/Denver
1       America/New_York
2      America/Sao_Paulo
3       America/New_York
4       America/New_York
5          Europe/Warsaw
6                       
7                       
8                       
9    America/Los_Angeles
Name: tz






    Out[63]:





0         America/Denver
1       America/New_York
2      America/Sao_Paulo
3       America/New_York
4       America/New_York
5          Europe/Warsaw
6                       
7                       
8                       
9    America/Los_Angeles
Name: tz



In [64]:

    
tz_counts = frame['tz'].value_counts()



In [65]:

    
tz_counts[:10]









    Out[65]:





America/New_York       1250
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33






    Out[65]:





America/New_York       1250
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33



In [66]:

    
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

tz_counts[:10]









    Out[66]:





America/New_York       1250
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35






    Out[66]:





America/New_York       1250
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35



In [67]:

    
tz_counts[:10].plot(kind='barh', rot=0)









    Out[67]:





<matplotlib.axes.AxesSubplot at 0x6944cd0>






    












    Out[67]:





<matplotlib.axes.AxesSubplot at 0x6f87930>



In [68]:

    
frame['a'][1]









    Out[68]:





u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)'






    Out[68]:





u'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.3)'



In [69]:

    
frame['a'][50]









    Out[69]:





u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'






    Out[69]:





u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'



In [70]:

    
frame['a'][51]









    Out[70]:





u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11'






    Out[70]:





u'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11'



In [71]:

    
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]









    Out[71]:





0    GoogleMaps/RochesterNY
1               Mozilla/4.0
2               Mozilla/5.0
3               Mozilla/5.0
4               Mozilla/5.0






    Out[71]:





0    GoogleMaps/RochesterNY
1               Mozilla/4.0
2               Mozilla/5.0
3               Mozilla/5.0
4               Mozilla/5.0



In [72]:

    
results.value_counts()[:8]









    Out[72]:





Mozilla/5.0                 2593
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4






    Out[72]:





Mozilla/5.0                 2593
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4



In [73]:

    
cframe = frame[frame.a.notnull()]
operting_system = np.where(cframe['a'].str.contains('Windows'), 'Windows',
 'Not Windows')



In [74]:

    
operting_system[:5]









    Out[74]:





0    Not Windows
1        Windows
2    Not Windows
3        Windows
4        Windows
Name: a






    Out[74]:





0    Not Windows
1        Windows
2    Not Windows
3        Windows
4        Windows
Name: a



In [76]:

    
by_tz_os = cframe.groupby(['tz', operting_system])



In [77]:

    
agg_counts = by_tz_os.size().unstack().fillna(0)



In [78]:

    
agg_counts[:10]









    Out[78]:






  
    
      a
      Not Windows
      Windows
    
    
      tz
      
      
    
  
  
    
      
       245
       276
    
    
      Africa/Cairo
         0
         3
    
    
      Africa/Casablanca
         0
         1
    
    
      Africa/Ceuta
         0
         2
    
    
      Africa/Johannesburg
         0
         1
    
    
      Africa/Lusaka
         0
         1
    
    
      America/Anchorage
         4
         1
    
    
      America/Argentina/Buenos_Aires
         1
         0
    
    
      America/Argentina/Cordoba
         0
         1
    
    
      America/Argentina/Mendoza
         0
         1



In [79]:

    
indexer = agg_counts.sum(1).argsort()
indexer[:10]









    Out[79]:





tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55



In [82]:

    
count_subset = agg_counts.take(indexer)[-10:]
count_subset









    Out[82]:






  
    
      a
      Not Windows
      Windows
    
    
      tz
      
      
    
  
  
    
      America/Sao_Paulo
        13
        20
    
    
      Europe/Madrid
        16
        19
    
    
      Pacific/Honolulu
         0
        36
    
    
      Asia/Tokyo
         2
        35
    
    
      Europe/London
        43
        31
    
    
      America/Denver
       132
        59
    
    
      America/Los_Angeles
       130
       252
    
    
      America/Chicago
       115
       285
    
    
      
       245
       276
    
    
      America/New_York
       339
       911



In [84]:

    
count_subset.plot(kind='barh', stacked=True)









    Out[84]:





<matplotlib.axes.AxesSubplot at 0x7835950>



In [85]:

    
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)









    Out[85]:





<matplotlib.axes.AxesSubplot at 0x7bcc330>



In [ ]:

a	Not Windows	Windows
tz
	245	276
Africa/Cairo	0	3
Africa/Casablanca	0	1
Africa/Ceuta	0	2
Africa/Johannesburg	0	1
Africa/Lusaka	0	1
America/Anchorage	4	1
America/Argentina/Buenos_Aires	1	0
America/Argentina/Cordoba	0	1
America/Argentina/Mendoza	0	1

a	Not Windows	Windows
tz
America/Sao_Paulo	13	20
Europe/Madrid	16	19
Pacific/Honolulu	0	36
Asia/Tokyo	2	35
Europe/London	43	31
America/Denver	132	59
America/Los_Angeles	130	252
America/Chicago	115	285
	245	276
America/New_York	339	911