Pandas

Requests

Requests is a simple HTTP library for doing wget/curl type operations.

Open Data and Socrata


In [1]:
import numpy as np
import pandas as pd
import requests
import os

In [2]:
# GET A CSV OF ALL STARBUCKS LOCATIONS

# If this link is ever broken, use the link above to get a new one

fname = 'All_Starbucks_Locations_in_the_World.csv'
if not(os.path.isfile(fname)):
    print 'Getting file from Socrata portal'
    r = requests.get('https://opendata.socrata.com/api/views/xy4y-c4mk/rows.csv?accessType=DOWNLOAD')
    f = open(fname, 'w')
    f.write(r.text.encode('utf-8'))
    f.close()
df = pd.read_csv(fname)


Getting file from Socrata portal

In [3]:
# LET'S GET SOME SUMMARY STATISTICS BY COUNTRY

by_country = pd.DataFrame(df.groupby(['Country'])['Store ID'].count())
by_country.sort('Store ID', ascending=False, inplace=True)
by_country.columns = ['count']
by_country['percentage'] = by_country['count'] / by_country['count'].sum()
by_country.head()


Out[3]:
count percentage
Country
US 12171 0.566304
CN 1715 0.079797
CA 1332 0.061977
JP 1081 0.050298
GB 806 0.037502

In [4]:
# DRILL DOWN BY STATES

filter = df['Country'] == 'US'
usa = pd.DataFrame(df[filter])
by_state = pd.DataFrame(usa.groupby(['Country Subdivision'])['Store ID'].count())
by_state.sort('Store ID', ascending=False, inplace=True)
by_state.columns = ['count']
by_state['percentage'] = by_state['count'] / by_state['count'].sum()
by_state.head()


Out[4]:
count percentage
Country Subdivision
CA 2631 0.216170
TX 916 0.075261
WA 714 0.058664
FL 624 0.051269
NY 573 0.047079

Copy is a gotcha


In [5]:
# FOCUS ON LOS ANGELES

cfilter = df['Country'] == 'US'
sfilter = df['Country Subdivision'] == 'CA'
lafilter = df['City'] == 'Los Angeles'
filter = cfilter & sfilter & lafilter
la = df[filter].copy()

In [6]:
# HOW MANY ROWS AND COLUMNS?

la.shape


Out[6]:
(110, 21)

In [7]:
# CAN YOU FIND YOUR FAVORITE?

la[['Street 1', 'Street 2']]


Out[7]:
Street 1 Street 2
4765 12313 Jefferson Blvd NaN
4914 3242 West Cahuenga Blvd. NaN
5146 5353 Wilshire Blvd. NaN
5485 4177 W. Washington Blvd. NaN
5517 4430 York Blvd. NaN
5697 3535 S La Cienega Blvd NaN
5720 3722 Crenshaw Blvd. NaN
6352 5020 Wilshire Blvd. NaN
6473 1437 E. Gage Avenue NaN
6478 8817 South Sepulveda Blvd. NaN
6483 6333 W. 3rd Street NaN
6517 3461 W 3rd St NaN
6520 735 S. Figueroa Street #308
6587 1258 South La Brea Ave NaN
6642 206 North Larchmont NaN
6771 1161 Westwood Blvd. NaN
6806 9824 National Blvd NaN
6891 6102 W. Sunset Blvd. Unit 6102
6924 3118 S Sepulveda Blvd NaN
7017 120 S. Los Angeles Street #110
7116 8700 Beverly Blvd. NaN
7276 6101 W Century Blvd NaN
7368 3853 E. 3rd Street E
7447 1111 South Grand Avenue NaN
7609 3111 Glendale Blvd NaN
8158 5151 State University Drive NaN
8373 2134 Sunset Blvd NaN
8413 6081 Center Drive 108
8484 1601 Wilshire Blvd NaN
8542 138 S. Central Avenue NaN
... ... ...
15669 2626 Colorado Blvd NaN
15795 217 N. Hill Street NaN
16182 1 World Way NaN
16335 600 West 9th Street 135
16476 10861 Weyburn Ave NaN
16522 333 South Hope Street NaN
16602 642 W 34th St NaN
16758 3584 S. Figueroa St. #1B
16765 3450 Wilshire Blvd. NaN
16769 603 S. Spring Street NaN
16913 735 S Figueroa St NaN
17044 3880 N Mission Rd NaN
17420 8480 Beverly Blvd NaN
18064 11727 W Olympic Blvd NaN
18544 1 World Way NaN
18705 7100 Santa Monca Blvd, Suite 201 NaN
18887 1 World Way NaN
19407 639 N Broadway NaN
19423 9616 W. Pico Blvd NaN
19442 5857 S. Central Ave. NaN
19714 3150 Wilshire Blvd NaN
19736 12057 Wilshire Blvd NaN
19819 2675 E Olympic Blvd NaN
20095 6240 York Ave. NaN
20202 5080 Rodeo Road NaN
20344 7257 W Sunset Blvd NaN
20346 5601 Wilshire Blvd NaN
20924 359 South La Brea NaN
21015 12404 W. Venice Blvd. Suite A
21120 8701 Lincoln Blvd NaN

110 rows × 2 columns

A few Pandas features used in this workshop


In [8]:
co_series = la['Ownership Type']=='CO'
co_series.head()


Out[8]:
4765    True
4914    True
5146    True
5485    True
5517    True
Name: Ownership Type, dtype: bool

In [9]:
~co_series.head()


Out[9]:
4765    False
4914    False
5146    False
5485    False
5517    False
Name: Ownership Type, dtype: bool

In [10]:
co_series.tolist()


Out[10]:
[True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False]

In [11]:
la.sort('Postal Code', inplace=True)
la.head()


Out[11]:
Store ID Name Brand Store Number Phone Number Ownership Type Street Combined Street 1 Street 2 Street 3 ... Country Subdivision Country Postal Code Coordinates Latitude Longitude Timezone Current Timezone Offset Olson Timezone First Seen
19442 1006808 Central & Slauson Starbucks 21800-197225 323-521-1535 CO 5857 S. Central Ave. 5857 S. Central Ave. NaN NaN ... CA US 90001 (33.9883270263672, -118.257118225098) 33.988327 -118.257118 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 05/03/2014 04:00:00 AM
6473 8822 Gage & Compton, Huntington Park Starbucks 8819-94823 323-585-1928 CO 1437 E. Gage Avenue 1437 E. Gage Avenue NaN NaN ... CA US 900011789 (33.9823722839355, -118.249458312988) 33.982372 -118.249458 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
6642 9035 Hancock Park Starbucks 507-450 323-469-1081 CO 206 North Larchmont 206 North Larchmont NaN NaN ... CA US 900043707 (34.0749092102051, -118.323463439941) 34.074909 -118.323463 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
16602 90142 Trojan Grounds @ USC Starbucks 15244-113064 213-740-6285 LS 642 W 34th St 642 W 34th St NaN NaN ... CA US 90007 (34.0212821960449, -118.282356262207) 34.021282 -118.282356 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
16758 91220 Figueroa & Exposition Starbucks 17413-168644 213-749-9302 CO 3584 S. Figueroa St., #1B 3584 S. Figueroa St. #1B NaN ... CA US 90007 (34.0185890197754, -118.281768798828) 34.018589 -118.281769 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM

5 rows × 21 columns

Indexes


In [12]:
la.index


Out[12]:
Int64Index([19442, 6473, 6642, 16602, 16758, 16765, 19714, 11813, 19407, 14979, 15795, 7017, 8542, 12770, 16769, 13464, 14156, 8851, 7447, 12917, 16335, 20202, 14168, 5697, 5720, 8484, 11028, 6520, 16913, 5485, 6587, 13554, 6517, 19819, 9994, 16476, 6771, 19736, 8373, 12240, 11405, 8889, 14980, 6891, 8675, 17044, 8158, 14598, 6806, 9960, 6924, 19423, 12398, 20346, 20924, 9980, 6483, 14813, 5146, 6352, 11119, 12276, 11353, 7609, 9197, 11363, 15669, 13453, 5517, 20095, 18544, 18887, 21120, 8413, 9993, 6478, 7276, 9505, 9869, 9870, 9878, 10783, 16182, 18705, 14097, 20344, 9048, 8617, 10795, 9968, 7116, 17420, 11182, 11518, 7368, 18064, 9976, 9120, 8784, 11193, ...], dtype='int64')

In [13]:
la.index = np.arange(la.shape[0])
la.index


Out[13]:
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...], dtype='int64')

Column renaming and dropping


In [14]:
la.head()


Out[14]:
Store ID Name Brand Store Number Phone Number Ownership Type Street Combined Street 1 Street 2 Street 3 ... Country Subdivision Country Postal Code Coordinates Latitude Longitude Timezone Current Timezone Offset Olson Timezone First Seen
0 1006808 Central & Slauson Starbucks 21800-197225 323-521-1535 CO 5857 S. Central Ave. 5857 S. Central Ave. NaN NaN ... CA US 90001 (33.9883270263672, -118.257118225098) 33.988327 -118.257118 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 05/03/2014 04:00:00 AM
1 8822 Gage & Compton, Huntington Park Starbucks 8819-94823 323-585-1928 CO 1437 E. Gage Avenue 1437 E. Gage Avenue NaN NaN ... CA US 900011789 (33.9823722839355, -118.249458312988) 33.982372 -118.249458 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
2 9035 Hancock Park Starbucks 507-450 323-469-1081 CO 206 North Larchmont 206 North Larchmont NaN NaN ... CA US 900043707 (34.0749092102051, -118.323463439941) 34.074909 -118.323463 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
3 90142 Trojan Grounds @ USC Starbucks 15244-113064 213-740-6285 LS 642 W 34th St 642 W 34th St NaN NaN ... CA US 90007 (34.0212821960449, -118.282356262207) 34.021282 -118.282356 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
4 91220 Figueroa & Exposition Starbucks 17413-168644 213-749-9302 CO 3584 S. Figueroa St., #1B 3584 S. Figueroa St. #1B NaN ... CA US 90007 (34.0185890197754, -118.281768798828) 34.018589 -118.281769 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM

5 rows × 21 columns


In [15]:
la.drop('Brand', axis=1, inplace=True)

cols = la.columns.tolist()
cols[0] = 'store_id'
la.columns = cols

la.head()


Out[15]:
store_id Name Store Number Phone Number Ownership Type Street Combined Street 1 Street 2 Street 3 City Country Subdivision Country Postal Code Coordinates Latitude Longitude Timezone Current Timezone Offset Olson Timezone First Seen
0 1006808 Central & Slauson 21800-197225 323-521-1535 CO 5857 S. Central Ave. 5857 S. Central Ave. NaN NaN Los Angeles CA US 90001 (33.9883270263672, -118.257118225098) 33.988327 -118.257118 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 05/03/2014 04:00:00 AM
1 8822 Gage & Compton, Huntington Park 8819-94823 323-585-1928 CO 1437 E. Gage Avenue 1437 E. Gage Avenue NaN NaN Los Angeles CA US 900011789 (33.9823722839355, -118.249458312988) 33.982372 -118.249458 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
2 9035 Hancock Park 507-450 323-469-1081 CO 206 North Larchmont 206 North Larchmont NaN NaN Los Angeles CA US 900043707 (34.0749092102051, -118.323463439941) 34.074909 -118.323463 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
3 90142 Trojan Grounds @ USC 15244-113064 213-740-6285 LS 642 W 34th St 642 W 34th St NaN NaN Los Angeles CA US 90007 (34.0212821960449, -118.282356262207) 34.021282 -118.282356 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM
4 91220 Figueroa & Exposition 17413-168644 213-749-9302 CO 3584 S. Figueroa St., #1B 3584 S. Figueroa St. #1B NaN Los Angeles CA US 90007 (34.0185890197754, -118.281768798828) 34.018589 -118.281769 Pacific Standard Time -420 GMT-08:00 America/Los_Angeles 12/08/2013 10:41:59 PM

Uses for Requests in Data Science

  • Easy retrieval of data from the web

Uses for Pandas in Data Science

  • Too many to count!