In [1]:
import pandas as pd
# pd.set_option('max_colwidth', 50)
# set this if you need to
In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
The Health Department has developed an inspection report and scoring system. After conducting an inspection of the facility, the Health Inspector calculates a score based on the violations observed. Violations can fall into high-risk, moderate-risk, or low-risk categories.
In [3]:
# Load business records; dtype keeps phone numbers as strings so leading
# zeros / '+' prefixes are not mangled into floats.
# FIX: dropped parse_dates=True — with no index_col set it only attempts to
# parse the (default integer) index and is a no-op here.
businesses = pd.read_csv('./data/businesses_plus.csv', dtype={'phone_number': str})
businesses.head()
# dtype casts the column as a specific data type
Out[3]:
In [52]:
# Load inspection records, converting the 'date' column to datetime on read.
inspections = pd.read_csv('./data/inspections_plus.csv', parse_dates=['date'])
inspections.head()
Out[52]:
In [53]:
inspections.info()
In [5]:
# Load violation records.
# FIX: parse_dates=True only tries to parse the *index*, leaving the 'date'
# column as strings. The merge below joins inspections and violations on
# ['business_id', 'date'], so 'date' must be datetime64 on both sides —
# name the column explicitly.
violations = pd.read_csv('./data/violations_plus.csv', parse_dates=['date'])
violations.head()
Out[5]:
In [6]:
# 1 Combine the three dataframes into one data frame called restaurant_scores
# Hint: http://pandas.pydata.org/pandas-docs/stable/merging.html
# Inner-join inspections with violations on (business_id, date), then attach
# the business details on business_id.
insp_viol = inspections.merge(violations, on=['business_id', 'date'])
restaurant_scores = businesses.merge(insp_viol, on='business_id')
restaurant_scores.head()
Out[6]:
In [7]:
# 2 Which ten businesses have had the most inspections?
inspections['business_id'].value_counts().head(10)
Out[7]:
In [8]:
# 3 Group and count the inspections by type
inspections['type'].value_counts()
Out[8]:
In [55]:
# 4 Create a plot that shows number of inspections per month
# Bonus for creating a heatmap
# http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.heatmap.html?highlight=heatmap
# FIX: value_counts() orders months by frequency, so a line plot through them
# zig-zags meaninglessly. Reindex into calendar order and draw a labelled bar
# chart instead (months with no inspections show as 0).
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_counts = (inspections['date'].dt.strftime('%b')
                  .value_counts()
                  .reindex(month_order, fill_value=0))
monthly_counts.plot(kind='bar', title='Inspections per Month')
plt.xlabel('Month')
plt.ylabel('Inspections')
Out[55]:
In [10]:
# 5 Which zip code contains the most high risk violations?
high_risk = restaurant_scores[restaurant_scores['risk_category'] == 'High Risk']
high_risk['postal_code'].value_counts().head(1)
Out[10]:
In [11]:
# 6 If inspection is prompted by a change in restaurant ownership,
# is the inspection more likely to be categorized as higher or lower risk?
restaurant_scores.loc[restaurant_scores['type'] == 'New Ownership', 'risk_category'].value_counts()
Out[11]:
In [12]:
# 7 Examining the descriptions, what is the most common violation?
violations['description'].value_counts().head(1)
Out[12]:
In [56]:
# 8 Create a hist of the scores with 10 bins
inspections.Score.hist(bins=10)
Out[56]:
In [71]:
# 9 Can you predict risk category based on the other features in this dataset? Higher scores are lower risk
# Spot check: risk-category mix among inspections scoring above 75.
restaurant_scores.loc[restaurant_scores['Score'] > 75, 'risk_category'].value_counts()
Out[71]:
In [76]:
# 10 Extra Credit:
# Use Instagram location API to find pictures taken at the lat, long of the most High Risk restaurant
# https://www.instagram.com/developer/endpoints/locations/
restaurant_scores.query("risk_category == 'High Risk'").head()
Out[76]:
In [78]:
#https://api.instagram.com/v1/locations/search?lat=37.794483&lng=-122.396584&access_token=<REDACTED>
# NOTE(review): never commit real access tokens — the original token was
# hardcoded here; load it from an environment variable instead.
In [16]:
############################
### A Little More Morbid ###
############################
In [17]:
# Load the police-killings dataset (presumably FiveThirtyEight's 2015 data —
# TODO confirm provenance/version).
killings = pd.read_csv('./data/police-killings.csv')
killings.head()
Out[17]:
In [18]:
# 1. Make the following changes to column names:
# lawenforcementagency -> agency
# raceethnicity -> race
# FIX: the key was 'Lawenforcementagency' (capital L); DataFrame.rename
# silently ignores keys that match no column, so that rename never happened.
# Also assign instead of inplace=True for clearer data lineage.
killings = killings.rename(columns={'lawenforcementagency': 'agency',
                                    'raceethnicity': 'race'})
In [19]:
# 2. Show the count of missing values in each column
killings.isna().sum()
Out[19]:
In [20]:
# 3. replace each null value in the dataframe with the string "Unknown"
killings = killings.fillna('Unknown')
In [21]:
# 4. How many killings were there so far in 2015?
killings.loc[killings['year'] == 2015, 'name'].count()
Out[21]:
In [22]:
# 5. Of all killings, how many were male and how many female?
killings['gender'].value_counts()
Out[22]:
In [23]:
# 6. How many killings were of unarmed people?
killings.loc[killings['armed'] == 'No', 'name'].count()
Out[23]:
In [24]:
# 7. What percentage of all killings were unarmed?
n_unarmed = killings.loc[killings['armed'] == 'No', 'name'].count()
n_total = killings['name'].count()
float(n_unarmed) / float(n_total) * 100
Out[24]:
In [25]:
# 8. What are the 5 states with the most killings?
killings['state'].value_counts().head(5)
Out[25]:
In [26]:
# 9. Show a value counts of deaths for each race
killings['race'].value_counts()
Out[26]:
In [57]:
# 10. Display a histogram of ages of all killings
killings.age.hist(bins=10)
Out[57]:
In [58]:
# 11. Show 6 histograms of ages by race
killings['age'].hist(by=killings['race'])
Out[58]:
In [29]:
# 12. What is the average age of death by race?
killings.groupby('race')['age'].mean()
Out[29]:
In [59]:
# 13. Show a bar chart with counts of deaths every month
ax = killings['month'].value_counts().plot(kind='bar', title='Deaths per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Deaths')
Out[59]:
In [31]:
###################
### Less Morbid ###
###################
In [32]:
# Load the college-majors dataset; cells below use columns Major,
# Major_category, Total, Employed, Unemployed, Median.
majors = pd.read_csv('./data/college-majors.csv')
majors.head()
Out[32]:
In [33]:
# 1. Delete the columns (employed_full_time_year_round, major_code)
# FIX: drop() returns a new frame; the original result was displayed and then
# discarded, so the columns were never actually removed. Assign it back.
# (Indices 6 and 1 are employed_full_time_year_round and major_code — TODO
# confirm against the file header.)
majors = majors.drop(majors.columns[[6, 1]], axis=1)
majors.head()
Out[33]:
In [34]:
# 2. Show the count of missing values in each column
majors.isna().sum()
Out[34]:
In [35]:
# 3. What are the top 10 highest paying majors?
# FIX: head(10) on the unsorted groupby result returns the first 10 majors in
# group order, not the highest paid — sort descending by median salary first.
majors.groupby('Major')['Median'].max().sort_values(ascending=False).head(10)
Out[35]:
In [36]:
# 4. Plot the data from the last question in a bar chart, include proper title, and labels!
In [37]:
# 5. What is the average median salary for each major category?
# FIX: the question asks per *category*; grouping by 'Major' gave per-major
# values (one row per major), not category averages.
majors.groupby('Major_category')['Median'].mean()
Out[37]:
In [38]:
# 6. Show only the top 5 paying major categories
# FIX: grouped by 'Major' and took the first 5 unsorted rows; group by
# 'Major_category' and sort descending to get the actual top 5.
majors.groupby('Major_category')['Median'].mean().sort_values(ascending=False).head(5)
Out[38]:
In [60]:
# 7. Plot a histogram of the distribution of median salaries
majors.Median.hist(bins=10)
Out[60]:
In [61]:
# 8. Plot a histogram of the distribution of median salaries by major category
majors['Median'].hist(by=majors['Major_category'])
Out[61]:
In [41]:
# 9. What are the top 10 most UNemployed majors?
# FIX: sort_index(by=...) was deprecated and removed from pandas;
# sort_values is the supported API for sorting on a column.
majors[['Major', 'Unemployed']].sort_values(by='Unemployed', ascending=False).head(10)
Out[41]:
In [42]:
# What are the unemployment rates?
# Rate = unemployed / labor force (employed + unemployed), as a percentage.
majors['Unemployment_rate'] = (majors.Unemployed / (majors.Unemployed + majors.Employed) * 100)
# FIX: sort_index(by=...) was removed from pandas; use sort_values.
majors[['Major', 'Unemployment_rate']].sort_values(by='Unemployment_rate', ascending=False)
Out[42]:
In [43]:
# 10. What are the top 10 most UNemployed majors CATEGORIES? Use the mean for each category
# What are the unemployment rates?
# FIX: the unsorted, un-truncated means answer "rate per category", not
# "top 10" — sort descending and take the first 10.
# (Relies on the Unemployment_rate column created in the cell above.)
majors.groupby('Major_category')['Unemployment_rate'].mean().sort_values(ascending=False).head(10)
Out[43]:
In [44]:
# 11. the total and employed column refer to the people that were surveyed.
# Create a new column showing the employment rate of the people surveyed for each major
# call it "sample_employment_rate"
# Example the first row has total: 128148 and employed: 90245. it's
# sample_employment_rate should be 90245.0 / 128148.0 = .7042
majors = majors.assign(sample_employment_rate=majors['Employed'] / majors['Total'])
majors.head()
Out[44]:
In [45]:
# 12. Create a "sample_unemployment_rate" column
# this column should be 1 - "sample_employment_rate"
majors = majors.assign(sample_unemployment_rate=1 - majors['sample_employment_rate'])
majors.head()
Out[45]:
In [ ]: