In [1]:
import pandas as pd
# Show the installed pandas version (the recorded run printed 1.0.3).
print(pd.__version__)


1.0.3

In [2]:
# IPython-only syntax: a trailing `?` displays the docstring of pd.read_json.
# NOTE(review): this is an interactive lookup, not valid plain Python; consider
# removing it (or using help(pd.read_json)) before sharing the notebook.
pd.read_json?

In [3]:
# Fetch the JSON resource from data.smcgov.org straight into a DataFrame.
df = pd.read_json(
    "https://data.smcgov.org/resource/mb6a-xn89.json"
)

# Peek at the first five rows (head() defaults to n=5).
df.head()


Out[3]:
geography geography_type year less_than_high_school_graduate high_school_graduate some_college_or_associate_s_degree bachelor_s_degree_or_higher location_1 :@computed_region_uph5_8hpn :@computed_region_i2t2_cryp
0 Atherton Town 2014-01-01T00:00:00.000 13.6 12.3 2.7 3.5 {'type': 'Point', 'coordinates': [-122.2, 37.4... 2.0 28596
1 Colma Town 2014-01-01T00:00:00.000 6.3 6.4 10.4 2.4 {'type': 'Point', 'coordinates': [-122.455556,... 4.0 28588
2 Foster City City 2014-01-01T00:00:00.000 11.9 9.7 2.0 2.9 {'type': 'Point', 'coordinates': [-122.266389,... 6.0 319
3 Portola Valley Town 2014-01-01T00:00:00.000 48.1 0.0 0.0 1.8 {'type': 'Point', 'coordinates': [-122.218611,... 14.0 28597
4 Redwood City City 2014-01-01T00:00:00.000 16.4 10.6 6.6 3.0 {'type': 'Point', 'coordinates': [-122.236111,... 21.0 28607

In [4]:
# List every name bound in the current interactive namespace. The recorded
# output mixes user names (df, pd) with IPython bookkeeping such as _i1 and _oh.
dir()


Out[4]:
['In',
 'Out',
 '_',
 '_3',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_i2',
 '_i3',
 '_i4',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'df',
 'exit',
 'get_ipython',
 'pd',
 'quit']

In [5]:
# Dimensions of the frame as (row count, column count).
df.shape


Out[5]:
(32, 10)

In [6]:
# Summary statistics; with default arguments only the numeric columns appear
# in the result (compare the column list in the recorded output).
df.describe()


Out[6]:
less_than_high_school_graduate high_school_graduate some_college_or_associate_s_degree bachelor_s_degree_or_higher :@computed_region_uph5_8hpn :@computed_region_i2t2_cryp
count 32.00000 32.000000 32.000000 32.000000 30.000000 32.000000
mean 17.80000 6.462500 5.946875 2.856250 17.733333 25062.093750
std 19.29944 4.693905 4.728430 1.873919 9.762466 9502.711577
min 0.00000 0.000000 0.000000 0.000000 1.000000 312.000000
25% 6.82500 1.925000 2.525000 2.100000 9.500000 28587.750000
50% 13.90000 7.750000 5.500000 3.000000 18.500000 28595.000000
75% 20.97500 9.450000 8.800000 3.600000 25.750000 28604.250000
max 100.00000 16.400000 18.500000 9.100000 34.000000 28613.000000

In [7]:
# Describe every remaining column, numeric or not. location_1 holds dict
# values (see the head() output), so it is removed first — presumably because
# unhashable dicts break the unique/top/freq computation; TODO confirm.
df.drop(columns="location_1").describe(include="all")


Out[7]:
geography geography_type year less_than_high_school_graduate high_school_graduate some_college_or_associate_s_degree bachelor_s_degree_or_higher :@computed_region_uph5_8hpn :@computed_region_i2t2_cryp
count 32 32 32 32.00000 32.000000 32.000000 32.000000 30.000000 32.000000
unique 32 3 1 NaN NaN NaN NaN NaN NaN
top San Carlos City 2014-01-01T00:00:00.000 NaN NaN NaN NaN NaN NaN
freq 1 15 32 NaN NaN NaN NaN NaN NaN
mean NaN NaN NaN 17.80000 6.462500 5.946875 2.856250 17.733333 25062.093750
std NaN NaN NaN 19.29944 4.693905 4.728430 1.873919 9.762466 9502.711577
min NaN NaN NaN 0.00000 0.000000 0.000000 0.000000 1.000000 312.000000
25% NaN NaN NaN 6.82500 1.925000 2.525000 2.100000 9.500000 28587.750000
50% NaN NaN NaN 13.90000 7.750000 5.500000 3.000000 18.500000 28595.000000
75% NaN NaN NaN 20.97500 9.450000 8.800000 3.600000 25.750000 28604.250000
max NaN NaN NaN 100.00000 16.400000 18.500000 9.100000 34.000000 28613.000000

In [8]:
# Inspect the dtype pandas inferred for each column.
# NOTE(review): the recorded output lists `year` as object, i.e. it was not
# parsed into a datetime on load.
df.dtypes


Out[8]:
geography                              object
geography_type                         object
year                                   object
less_than_high_school_graduate        float64
high_school_graduate                  float64
some_college_or_associate_s_degree    float64
bachelor_s_degree_or_higher           float64
location_1                             object
:@computed_region_uph5_8hpn           float64
:@computed_region_i2t2_cryp             int64
dtype: object

In [9]:
# Arithmetic mean of the bachelor_s_degree_or_higher column.
df["bachelor_s_degree_or_higher"].mean()


Out[9]:
2.8562500000000006

In [10]:
# Number of non-null entries in the geography column.
df["geography"].count()


Out[10]:
32

In [11]:
# Distinct labels present in geography_type, in order of first appearance.
df["geography_type"].unique()


Out[11]:
array(['Town', 'City', 'CDP'], dtype=object)

In [12]:
# Frequency of each distinct value in less_than_high_school_graduate,
# most common first.
df["less_than_high_school_graduate"].value_counts()


Out[12]:
0.0      4
14.2     1
8.5      1
7.0      1
100.0    1
9.5      1
11.9     1
4.8      1
31.1     1
26.7     1
6.2      1
15.7     1
22.1     1
16.4     1
6.3      1
44.4     1
20.9     1
7.7      1
9.2      1
37.8     1
3.3      1
15.1     1
48.1     1
18.3     1
21.2     1
16.1     1
13.6     1
13.4     1
20.1     1
Name: less_than_high_school_graduate, dtype: int64

In [13]:
def mapGeography(x):
    """Encode a geography-type label as a binary flag.

    Returns 1 when ``x`` equals the string "City" (exact, case-sensitive
    comparison) and 0 for every other value.
    """
    return 1 if x == "City" else 0

In [14]:
# Add a 0/1 indicator column by running mapGeography over each
# geography_type value.
df["geography_mapped_value"] = df["geography_type"].apply(mapGeography)

In [15]:
# Tally how many rows received each indicator value.
df["geography_mapped_value"].value_counts()


Out[15]:
0    17
1    15
Name: geography_mapped_value, dtype: int64

In [16]:
# Build the 0/1 "City" indicator with an inline lambda instead of a named
# function.
df["geography_mapped_value_lambda"] = df["geography_type"].apply(
    lambda kind: 1 if kind == "City" else 0
)

In [17]:
# Tally how many rows received each value of the lambda-built indicator.
df["geography_mapped_value_lambda"].value_counts()


Out[17]:
0    17
1    15
Name: geography_mapped_value_lambda, dtype: int64

In [ ]: