In [88]:
cd "/home/bakuda/pandas-book/"


/home/bakuda/pandas-book

In [89]:
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'

In [90]:
open(path).readline()


Out[90]:
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [91]:
import json
records = [json.loads(line) for line in open(path)]

In [92]:
records[:1]


Out[92]:
[{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
  u'al': u'en-US,en;q=0.8',
  u'c': u'US',
  u'cy': u'Danvers',
  u'g': u'A6qOVH',
  u'gr': u'MA',
  u'h': u'wfLQtf',
  u'hc': 1331822918,
  u'hh': u'1.usa.gov',
  u'l': u'orofrog',
  u'll': [42.576698, -70.954903],
  u'nk': 1,
  u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
  u't': 1331923247,
  u'tz': u'America/New_York',
  u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}]

In [14]:
records[0]['tz']


Out[14]:
u'America/New_York'

In [16]:
tzs = [rec['tz'] for rec in records if 'tz' in rec]

In [18]:
tzs[:5]


Out[18]:
[u'America/New_York',
 u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York']

In [19]:
type(tzs)


Out[19]:
list

In [20]:
len(tzs)


Out[20]:
3440

In [21]:
len(set(tzs))


Out[21]:
97

In [22]:
%rmagic


ERROR: Line magic function `%rmagic` not found.

In [23]:
tzs[:10]


Out[23]:
[u'America/New_York',
 u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'']

In [25]:
from collections import Counter

In [26]:
counts = Counter(tzs)

In [28]:
counts.most_common(10)


Out[28]:
[(u'America/New_York', 1251),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]

In [93]:
import pandas as pd

In [94]:
df = pd.DataFrame(records)

In [95]:
df.shape


Out[95]:
(3560, 18)

In [96]:
df['tz'][:10]


Out[96]:
0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object

In [35]:
tz_counts = df['tz'].value_counts()

In [36]:
tz_counts[:10]


Out[36]:
America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
dtype: int64

In [37]:
clean_tz = df['tz'].fillna('Missing')

In [38]:
clean_tz[clean_tz=='']='Unknown'

In [39]:
tz_counts = clean_tz.value_counts()

In [40]:
tz_counts[:10]


Out[40]:
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
dtype: int64

In [41]:
tz_counts[:10].plot(kind='bar', rot=0)


Out[41]:
<matplotlib.axes.AxesSubplot at 0xb273d6c>

In [42]:
tz_counts[:10].plot(kind='barh', rot=0)


Out[42]:
<matplotlib.axes.AxesSubplot at 0xb228cac>

In [43]:
df['a'][:5]


Out[43]:
0    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
1                               GoogleMaps/RochesterNY
2    Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
3    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...
4    Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...
Name: a, dtype: object

In [44]:
s = df['a'][0]

In [45]:
s


Out[45]:
u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11'

In [47]:
s.split()[0]


Out[47]:
u'Mozilla/5.0'

In [53]:
results = pd.Series([x.split()[0] for x in df['a'].dropna()])

In [52]:
#df.tz
# same as below
df['tz'][:5]


Out[52]:
0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
Name: tz, dtype: object

In [54]:
results[:5]


Out[54]:
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object

In [55]:
%load_ext rmagic

In [57]:
%%R
install.packages("nycflights13", repos='http://cran.us.r‐project.org')


/usr/local/lib/python2.7/dist-packages/ipython-1.1.0-py2.7.egg/IPython/extensions/rmagic.py:194: UserWarning: Installing package into ‘/home/bakuda/R/i686-pc-linux-gnu-library/3.2’
(as ‘lib’ is unspecified)

  value = ri.baseenv['eval'](ri.parse(line))

In [58]:
%%R
library(nycflights13)
write.csv(flights, "flights.csv")

In [59]:
flights = pd.read_csv("flights.csv", index_col=0)

In [60]:
flights.shape


Out[60]:
(336776, 16)

In [62]:
%%R
head(flights)

In [63]:
%%R
search()

In [100]:
#flights.head()

In [66]:
df.loc[:2,:]


Out[66]:
_heartbeat_ a al c cy g gr h hc hh kw l ll nk r t tz u
0 NaN Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... en-US,en;q=0.8 US Danvers A6qOVH MA wfLQtf 1331822918 1.usa.gov NaN orofrog [42.576698, -70.954903] 1 http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... 1331923247 America/New_York http://www.ncbi.nlm.nih.gov/pubmed/22415991
1 NaN GoogleMaps/RochesterNY NaN US Provo mwszkS UT mwszkS 1308262393 j.mp NaN bitly [40.218102, -111.613297] 0 http://www.AwareMap.com/ 1331923249 America/Denver http://www.monroecounty.gov/etc/911/rss.php
2 NaN Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... en-US US Washington xxr3Qb DC xxr3Qb 1331919941 1.usa.gov NaN bitly [38.9007, -77.043098] 1 http://t.co/03elZC4Q 1331923250 America/New_York http://boxer.senate.gov/en/press/releases/0316...

3 rows × 18 columns


In [67]:
type(records)


Out[67]:
list

In [68]:
type(records[0])


Out[68]:
dict

In [69]:
df.columns


Out[69]:
Index([u'_heartbeat_', u'a', u'al', u'c', u'cy', u'g', u'gr', u'h', u'hc', u'hh', u'kw', u'l', u'll', u'nk', u'r', u't', u'tz', u'u'], dtype='object')

In [74]:
df.shape


Out[74]:
(3560, 18)

In [81]:
df.to_csv('/home/bakuda/pandas-book/ch02/bitly-data-for-R.csv', encoding='utf8', header=True, sep=',')

In [82]:
df = pd.DataFrame(records)

In [99]:
#df.loc[10:18]

In [103]:
results.value_counts()[:10].plot(kind='barh')


Out[103]:
<matplotlib.axes.AxesSubplot at 0xe0531ec>

In [104]:
cdf = df[df['a'].notnull()]

In [105]:
cdf.shape


Out[105]:
(3440, 18)

In [106]:
cdf.groupby('a')


Out[106]:
<pandas.core.groupby.DataFrameGroupBy object at 0xd43caec>

In [111]:
results.value_counts()[:20].plot(kind='barh')


Out[111]:
<matplotlib.axes.AxesSubplot at 0x1dbed58c>

In [112]:
os_types = np.where(cdf['a'].str.contains('Windows'), 'Windows', 'Not Windows')

In [114]:
os_types[:5]


Out[114]:
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], 
      dtype='|S11')

In [116]:
by_tz_os = cdf.groupby(['tz',os_types])

In [118]:
by_tz_os.size()


Out[118]:
tz                                         
                                Not Windows    245
                                Windows        276
Africa/Cairo                    Windows          3
Africa/Casablanca               Windows          1
Africa/Ceuta                    Windows          2
Africa/Johannesburg             Windows          1
Africa/Lusaka                   Windows          1
America/Anchorage               Not Windows      4
                                Windows          1
America/Argentina/Buenos_Aires  Not Windows      1
America/Argentina/Cordoba       Windows          1
America/Argentina/Mendoza       Windows          1
America/Bogota                  Not Windows      1
                                Windows          2
America/Caracas                 Windows          1
...
Europe/Skopje     Windows         1
Europe/Sofia      Windows         1
Europe/Stockholm  Not Windows     2
                  Windows        12
Europe/Uzhgorod   Windows         1
Europe/Vienna     Not Windows     3
                  Windows         3
Europe/Vilnius    Windows         2
Europe/Volgograd  Windows         1
Europe/Warsaw     Not Windows     1
                  Windows        15
Europe/Zurich     Not Windows     4
Pacific/Auckland  Not Windows     3
                  Windows         8
Pacific/Honolulu  Windows        36
Length: 149, dtype: int64

In [119]:
agg_counts = by_tz_os.size().unstack().fillna(0)

In [120]:
agg_counts[:10]


Out[120]:
Not Windows Windows
tz
245 276
Africa/Cairo 0 3
Africa/Casablanca 0 1
Africa/Ceuta 0 2
Africa/Johannesburg 0 1
Africa/Lusaka 0 1
America/Anchorage 4 1
America/Argentina/Buenos_Aires 1 0
America/Argentina/Cordoba 0 1
America/Argentina/Mendoza 0 1

10 rows × 2 columns


In [121]:
# Unstack is like R::tidyr:spread()
by_tz_os.size().unstack()


Out[121]:
Not Windows Windows
tz
245 276
Africa/Cairo NaN 3
Africa/Casablanca NaN 1
Africa/Ceuta NaN 2
Africa/Johannesburg NaN 1
Africa/Lusaka NaN 1
America/Anchorage 4 1
America/Argentina/Buenos_Aires 1 NaN
America/Argentina/Cordoba NaN 1
America/Argentina/Mendoza NaN 1
America/Bogota 1 2
America/Caracas NaN 1
America/Chicago 115 285
America/Chihuahua 1 1
America/Costa_Rica NaN 1
America/Denver 132 59
America/Edmonton 2 4
America/Guayaquil 2 NaN
America/Halifax 1 3
America/Indianapolis 8 12
America/La_Paz NaN 1
America/Lima NaN 1
America/Los_Angeles 130 252
America/Managua NaN 3
America/Mazatlan 1 NaN
America/Mexico_City 7 8
America/Monterrey 1 NaN
America/Montevideo NaN 1
America/Montreal 3 6
America/New_York 339 912
America/Phoenix 3 17
America/Puerto_Rico 9 1
America/Rainy_River 10 15
America/Recife NaN 2
America/Santo_Domingo 1 NaN
America/Sao_Paulo 13 20
America/St_Kitts 1 NaN
America/Tegucigalpa NaN 1
America/Vancouver 6 6
America/Winnipeg 3 1
Asia/Amman NaN 2
Asia/Bangkok NaN 6
Asia/Beirut 1 3
Asia/Calcutta 2 7
Asia/Dubai 2 2
Asia/Harbin NaN 3
Asia/Hong_Kong 4 6
Asia/Istanbul NaN 9
Asia/Jakarta 2 1
Asia/Jerusalem 2 1
Asia/Karachi NaN 3
Asia/Kuala_Lumpur 1 2
Asia/Kuching NaN 1
Asia/Manila NaN 1
Asia/Nicosia NaN 1
Asia/Novosibirsk NaN 1
Asia/Pontianak 1 NaN
Asia/Riyadh NaN 1
Asia/Seoul 4 1
Asia/Tokyo 2 35
... ...

97 rows × 2 columns


In [123]:
cdf.groupby(['tz',os_types]).size().unstack().fillna(0)[:5]


Out[123]:
Not Windows Windows
tz
245 276
Africa/Cairo 0 3
Africa/Casablanca 0 1
Africa/Ceuta 0 2
Africa/Johannesburg 0 1

5 rows × 2 columns


In [124]:
indexer = agg_counts.sum(1).argsort()

In [125]:
indexer[:5]


Out[125]:
tz
                       24
Africa/Cairo           20
Africa/Casablanca      21
Africa/Ceuta           92
Africa/Johannesburg    87
dtype: int64

In [126]:
count_subset = agg_counts.take(indexer)[-10:]

In [127]:
count_subset


Out[127]:
Not Windows Windows
tz
America/Sao_Paulo 13 20
Europe/Madrid 16 19
Pacific/Honolulu 0 36
Asia/Tokyo 2 35
Europe/London 43 31
America/Denver 132 59
America/Los_Angeles 130 252
America/Chicago 115 285
245 276
America/New_York 339 912

10 rows × 2 columns


In [130]:
count_subset.plot(kind='barh', stacked=True)


Out[130]:
<matplotlib.axes.AxesSubplot at 0x1cd3e0cc>

In [146]:
cd "/home/bakuda/pandas-book/ch02/movielens/"


/home/bakuda/pandas-book/ch02/movielens

In [148]:
import pandas as pd
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('./users.dat', sep='::', header=None,
names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('./ratings.dat', sep='::', header=None,
names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./movies.dat', sep='::', header=None,
names=mnames)

In [149]:
users[:5]


Out[149]:
user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455

5 rows × 5 columns


In [153]:
ratings[:5]


Out[153]:
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291

5 rows × 4 columns


In [152]:
movies[:5]


Out[152]:
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy

5 rows × 3 columns


In [154]:
data = pd.merge(pd.merge(ratings, users), movies)

In [155]:
data.shape


Out[155]:
(1000209, 10)

In [156]:
data[:10]


Out[156]:
user_id movie_id rating timestamp gender age occupation zip title genres
0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 1193 5 978298413 M 56 16 70072 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 1193 4 978220179 M 25 12 32793 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 1193 4 978199279 M 25 7 22903 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 1193 5 978158471 M 50 1 95350 One Flew Over the Cuckoo's Nest (1975) Drama
5 18 1193 4 978156168 F 18 3 95825 One Flew Over the Cuckoo's Nest (1975) Drama
6 19 1193 5 982730936 M 1 10 48073 One Flew Over the Cuckoo's Nest (1975) Drama
7 24 1193 5 978136709 F 25 7 10023 One Flew Over the Cuckoo's Nest (1975) Drama
8 28 1193 3 978125194 F 25 1 14607 One Flew Over the Cuckoo's Nest (1975) Drama
9 33 1193 5 978557765 M 45 3 55421 One Flew Over the Cuckoo's Nest (1975) Drama

10 rows × 10 columns


In [158]:
data[:1]


Out[158]:
user_id movie_id rating timestamp gender age occupation zip title genres
0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama

1 rows × 10 columns


In [159]:
data.ix[0]


Out[159]:
user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

In [ ]: