In [65]:
# A list is a mutable, ordered collection.
# Lists can be heterogeneous: integers, strings, other data structures, etc.
# can all be added to a single list.
# Creating lists:
# Initialize an empty list:
my_movies = []
print(my_movies, type(my_movies))
# Initialize a list with one or more objects in it
unwatched = ['avengers', 'frozen', 'cats']
print(unwatched)
# Use the list() type constructor
# Note that the constructed list may not be what we expect: list() iterates over
# its argument, so a string is split into one-character strings
# (['j', 'u', 'm', ...]) rather than stored as the single item 'jumanji'.
watched = list('jumanji')
print(watched)
# Use a list comprehension
# range(2, 20, 2) stops before 20, so this yields the even numbers 2 through 18.
even_numbers = [i for i in range(2, 20, 2)]
print(even_numbers)
In [66]:
# We can use index numbers to select and slice from lists.
# len() reports how many objects the list holds; valid index positions run from 0 to len - 1.
print(len(watched))
In [67]:
# The first object in a list has index/position 0
watched[0]
Out[67]:
In [68]:
# So even though the list is 7 objects long, if we try to select the object in position 7
# we will get an index out of range error.
# The error is caught and printed so the demonstration is still visible while
# the notebook can be run from top to bottom without stopping at this cell.
try:
    watched[7]
except IndexError as err:
    print('IndexError:', err)
In [70]:
# The last object is at position len(watched) - 1, which is 6 for this 7-object list
watched[6]
Out[70]:
In [71]:
# We can slice out subsections using start and finish index positions separated by a colon
# Note the "finish" position should be read as "up to but not including" the object at that position,
# so this returns the objects at positions 0, 1 and 2
watched[0:3]
Out[71]:
In [72]:
# Same output as above: when the start position is omitted, the slice begins at position 0
watched[:3]
Out[72]:
In [73]:
# Slice from position 4 up to (not including) position 7, i.e. positions 4, 5 and 6.
# Unlike watched[7], a slice boundary past the last index does not raise an error.
watched[4:7]
Out[73]:
In [74]:
# Same output as above: when the finish position is omitted, the slice runs to the end of the list
watched[4:]
Out[74]:
In [75]:
# Lists are mutable - we can edit, add, remove objects
print(watched)
# Assigning to an index replaces only the object at that position;
# the remaining single-character strings are left in place.
watched[0] = 'jumanji'
print(watched)
In [76]:
# remove() deletes the first object equal to its argument, modifying the list in place.
# NOTE: re-running this cell raises ValueError once 'u' is no longer in the list.
watched.remove('u')
print(watched)
In [77]:
# There are other ways we can fix the "watched" list, but
# in this case it is most efficient to reassign the "watched" variable as a new list
# that holds only the object currently at position 0.
watched = [watched[0]]
print(watched)
In [78]:
# We can make a list of lists using append()
# append() adds its argument as a single new item, so each list is nested
# inside my_movies rather than merged into it.
print(my_movies)
my_movies.append(unwatched)
print(my_movies)
my_movies.append(watched)
print(my_movies)
This is okay - we have a master list of all of our movies. The nested lists break our collection down further by watched and unwatched, but we don't know which list is which.
One way to improve this design is to use a dictionary.
In [79]:
# A dictionary is a mutable collection of key-value pairs.
# (Since Python 3.7, dictionaries preserve the order in which keys were inserted.)
# Like lists, the values in a dictionary can be composed of different data types or structures
# Re-assign my_movies to an empty dictionary
my_movies = {}
# Now add our lists of unwatched and watched movies
# Note we are using a string as the key and the list as the value
my_movies["unwatched"] = unwatched
print(my_movies)
In [80]:
# Add the watched list under its own key
my_movies["watched"] = watched
print(my_movies)
In [81]:
# We retrieve items from a dictionary using the keys.
# If we don't know the keys, we can ask for them first using the 'keys()' method:
print(my_movies.keys())
In [87]:
# Look up the value stored under the 'watched' key
print(my_movies['watched'])
Using the 2010 surname data from the US Census, we will develop a workflow to accomplish the following:
Decennial Census Surname Files (2010)
https://www.census.gov/data/developers/data-sets/surnames.html
https://api.census.gov/data/2010/surname.html
US Census Bureau (2016). Decennial Census Surname Files (2010). Retrieved from https://api.census.gov/data/2010/surname.json
The modules used in this exercise are popular and under active development. Follow the links for more information about methods, syntax, etc.
Requests: http://docs.python-requests.org/en/master/
JSON: https://docs.python.org/3/library/json.html
Pandas: http://pandas.pydata.org/
Matplotlib: https://matplotlib.org/
Look for information about or links to the API, developer's documentation, etc. Helpful examples are often included.
Note that we are providing an alias for Pandas and matplotlib. Whenever we need to call a method from those modules, we can use the alias.
In [88]:
# http://api.census.gov/data/2010/surname
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
In [89]:
# First, get the basic info about the dataset.
# References: Dataset API (https://api.census.gov/data/2010/surname.html)
# Requests API (http://docs.python-requests.org/en/master/)
# Python 3 JSON API (https://docs.python.org/3/library/json.html)
api_base_url = "http://api.census.gov/data/2010/surname"
api_info = requests.get(api_base_url)
# .json() decodes the JSON response body into Python objects
api_json = api_info.json()
# Uncomment the next line to see the raw response text.
# NOTE: JSON and TEXT don't look much different to us. They can look very different to a machine!
#print(api_info.text)
# json.dumps(..., indent=4) turns the decoded data back into an indented string for reading
print(json.dumps(api_json, indent=4))
# The output is a dictionary - data are stored as key:value pairs and can be nested.
In [90]:
# Request and store a local copy of the dataset variables.
# Note that the URL could be hard coded just from referencing the API, but
# we are navigating the JSON data:
# take the first entry under the 'dataset' key and read its 'c_variablesLink' value.
var_link = api_json['dataset'][0]['c_variablesLink']
print(var_link)
In [91]:
# Use the variable info link to make a new request
variables = requests.get(var_link)
jsonData = variables.json()
# Keep only the entry stored under the 'variables' key of the response
variable_data = jsonData['variables']
# Note that this is a dictionary of dictionaries.
# We are going to use it throughout this exercise.
print(json.dumps(variable_data, indent=4))
In [92]:
# List the top-level keys of variable_data; these are the names used to look up labels later
print(variable_data.keys())
In [94]:
# Now request info about a single surname
# Update 2020-02-26: Surnames should be all caps!
name = 'WHEELER'
# Query-string fragment that filters on this surname; it is appended to the request URL in a later cell
name_query = '&NAME=' + name
In [95]:
# Default vars: 'RANK,COUNT,PCTWHITE,PCTAPI,PCT2PRACE,PCTAIAN,PCTBLACK,PCTHISPANIC'
# Comma-separated variable names to request; edit this string to change what is requested
desired_vars = 'RANK,COUNT,PCTWHITE,PCTAPI,PCT2PRACE,PCTAIAN,PCTBLACK,PCTHISPANIC'
In [98]:
# References: Pandas (http://pandas.pydata.org/)
# Assemble the request URL from the base endpoint, the variable list and the
# NAME filter defined in the preceding cells, then fetch and decode the response.
base_url = 'http://api.census.gov/data/2010/surname?get='
query_url = ''.join([base_url, desired_vars, name_query])
name_stats = requests.get(query_url)
name_data = name_stats.json()

# Show the decoded response as-is first; it is not very human readable.
print('Raw response data:\n')
print(name_data)

# name_data[0] holds the variable names (example: PCTWHITE) and name_data[1] the values.
# Swap each variable name for the descriptive label provided by the variables API
# so that the dataframe columns are easier to read.
column_list = [variable_data[each]['label'] for each in name_data[0]]
df = pd.DataFrame([name_data[1]], columns=column_list)
print('\n\nPandas dataframe:')
df
Out[98]:
If we want to query data for a different surname, we can go back up a few cells, change the value of the "name" variable, then re-run all of the other cells. That works in a notebook context but it's not very portable. Instead, we will define a function that can be called later without having to re-execute a bunch of code or code blocks.
In Python, the syntax for defining a function is:
def function_name(arg1, arg2, arg3=Default_value):
    do stuff
    return something
In [99]:
def get_surname_data(desired_variables, surname):
    """Request data for one surname from the 2010 Census surname API.

    Parameters
    ----------
    desired_variables : str
        Comma-separated variable names to request, e.g. 'RANK,COUNT'.
    surname : str
        Surname to look up. It is converted to upper case before the request
        is sent, so any capitalization may be passed in.

    Returns
    -------
    The decoded JSON body of the response. Elsewhere in this notebook it is
    used as a list whose element 0 holds the variable names and whose
    element 1 holds the corresponding values.
    """
    name = surname.upper()  # API requires surnames in all uppercase
    base_url = 'http://api.census.gov/data/2010/surname?get='
    # Build the query from the desired_variables argument, not from the
    # notebook-level desired_vars variable, so the function honors what the
    # caller passes and also works when that global has not been defined.
    query_url = base_url + desired_variables + '&NAME=' + name
    name_request = requests.get(query_url)
    name_data = name_request.json()
    return name_data
In [101]:
# Now we can get data for any name just by changing the value of the surname variable.
# We also have the option to update the variables of interest.
desired_vars = 'RANK,COUNT,PCTWHITE,PCTAPI,PCT2PRACE,PCTAIAN,PCTBLACK,PCTHISPANIC'
# Mixed case is fine here: get_surname_data() upper-cases the surname before querying
surname = "Jones"
print(get_surname_data(desired_vars, surname))
In [103]:
# That is not nicely formatted, so we can write another function to create a more human-readable data frame
def format_surname_data(surname_data, variable_data):
    """Turn a raw surname response into a one-row, human-readable DataFrame.

    Parameters
    ----------
    surname_data : sequence
        Element 0 is the sequence of variable names; element 1 is the
        matching sequence of values.
    variable_data : dict
        Maps each variable name to a dict that has a 'label' entry.

    Returns
    -------
    pandas.DataFrame
        A single row built from surname_data[1], with the descriptive
        labels as column names.
    """
    # Translate every variable name in the header into its descriptive label.
    readable_labels = [variable_data[code]['label'] for code in surname_data[0]]
    return pd.DataFrame([surname_data[1]], columns=readable_labels)
In [106]:
# Now we can get a nice table for any name by calling both functions
desired_vars = 'RANK,COUNT,PCTWHITE,PCTAPI,PCT2PRACE,PCTAIAN,PCTBLACK,PCTHISPANIC'
surname = "Smith"
# Step 1: fetch the raw response for the surname
surname_data = get_surname_data(desired_vars, surname)
# Step 2: relabel the columns using variable_data, which was requested in an earlier cell
surname_dataframe = format_surname_data(surname_data, variable_data)
# A bare expression on the last line of a cell is displayed as the cell's output
surname_dataframe
Out[106]:
In [ ]: