In [1]:
import pandas as pd
# pd.set_option('max_colwidth', 50)
# set this if you need to

In [46]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

The Health Department has developed an inspection report and scoring system. After conducting an inspection of the facility, the Health Inspector calculates a score based on the violations observed. Violations can fall into:

  • high risk category: records specific violations that directly relate to the transmission of food borne illnesses,the adulteration of food products and the contamination of food-contact surfaces.
  • moderate risk category: records specific violations that are of a moderate risk to the public health and safety.
  • low risk category: records violations that are low risk or have no immediate risk to the public health and safety.

In [3]:
businesses = pd.read_csv('./data/businesses_plus.csv', parse_dates=True, dtype={'phone_number': str})
businesses.head()
# dtype casts the column as a specific data type


Out[3]:
business_id name address city postal_code latitude longitude phone_number TaxCode business_certificate application_date owner_name owner_address owner_city owner_state owner_zip
0 10 Tiramisu Kitchen 033 Belden Pl San Francisco 94104 37.791116 -122.403816 NaN H24 779059.0 NaN Tiramisu LLC 33 Belden St San Francisco CA 94104
1 19 Nrgize Lifestyle Cafe 1200 Van Ness Ave, 3rd Floor San Francisco 94109 37.786848 -122.421547 NaN H24 NaN NaN 24 Hour Fitness Inc 1200 Van Ness Ave, 3rd Floor San Francisco CA 94109
2 24 OMNI S.F. Hotel - 2nd Floor Pantry 500 California St, 2nd Floor San Francisco 94104 37.792888 -122.403135 NaN H24 352312.0 NaN OMNI San Francisco Hotel Corp 500 California St, 2nd Floor San Francisco CA 94104
3 31 Norman's Ice Cream and Freezes 2801 Leavenworth St San Francisco 94133 37.807155 -122.419004 NaN H24 346882.0 NaN Norman Antiforda 2801 Leavenworth St San Francisco CA 94133
4 45 CHARLIE'S DELI CAFE 3202 FOLSOM St S.F. 94110 37.747114 -122.413641 NaN H24 340024.0 10/10/2001 HARB, CHARLES AND KRISTIN 1150 SANCHEZ S.F. CA 94114

In [52]:
inspections = pd.read_csv('./data/inspections_plus.csv', parse_dates = ['date'])
inspections.head()


Out[52]:
business_id Score date type
0 10 NaN 2014-08-07 Reinspection/Followup
1 10 94.0 2014-07-29 Routine - Unscheduled
2 10 NaN 2014-01-24 Reinspection/Followup
3 10 92.0 2014-01-14 Routine - Unscheduled
4 19 94.0 2014-11-10 Routine - Unscheduled

In [53]:
inspections.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27285 entries, 0 to 27284
Data columns (total 4 columns):
business_id    27285 non-null int64
Score          15263 non-null float64
date           27285 non-null datetime64[ns]
type           27285 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 852.7+ KB

In [5]:
violations = pd.read_csv('./data/violations_plus.csv', parse_dates=True)
violations.head()


Out[5]:
business_id date ViolationTypeID risk_category description
0 10 20140114 103154 Low Risk Unclean or degraded floors walls or ceilings
1 10 20140114 103119 Moderate Risk Inadequate and inaccessible handwashing facili...
2 10 20140114 103145 Low Risk Improper storage of equipment utensils or linens
3 10 20140729 103129 Moderate Risk Insufficient hot water or running water
4 10 20140729 103144 Low Risk Unapproved or unmaintained equipment or utensils

In [6]:
# 1 Combine the three dataframes into one data frame called restaurant_scores
# Hint: http://pandas.pydata.org/pandas-docs/stable/merging.html

restaurant_scores = pd.merge(inspections, violations, on = ['business_id', 'date'])
restaurant_scores = pd.merge(businesses, restaurant_scores, on = 'business_id')
restaurant_scores.head()


Out[6]:
business_id name address city postal_code latitude longitude phone_number TaxCode business_certificate ... owner_address owner_city owner_state owner_zip Score date type ViolationTypeID risk_category description
0 10 Tiramisu Kitchen 033 Belden Pl San Francisco 94104 37.791116 -122.403816 NaN H24 779059.0 ... 33 Belden St San Francisco CA 94104 94.0 20140729 Routine - Unscheduled 103129 Moderate Risk Insufficient hot water or running water
1 10 Tiramisu Kitchen 033 Belden Pl San Francisco 94104 37.791116 -122.403816 NaN H24 779059.0 ... 33 Belden St San Francisco CA 94104 94.0 20140729 Routine - Unscheduled 103144 Low Risk Unapproved or unmaintained equipment or utensils
2 10 Tiramisu Kitchen 033 Belden Pl San Francisco 94104 37.791116 -122.403816 NaN H24 779059.0 ... 33 Belden St San Francisco CA 94104 92.0 20140114 Routine - Unscheduled 103154 Low Risk Unclean or degraded floors walls or ceilings
3 10 Tiramisu Kitchen 033 Belden Pl San Francisco 94104 37.791116 -122.403816 NaN H24 779059.0 ... 33 Belden St San Francisco CA 94104 92.0 20140114 Routine - Unscheduled 103119 Moderate Risk Inadequate and inaccessible handwashing facili...
4 10 Tiramisu Kitchen 033 Belden Pl San Francisco 94104 37.791116 -122.403816 NaN H24 779059.0 ... 33 Belden St San Francisco CA 94104 92.0 20140114 Routine - Unscheduled 103145 Low Risk Improper storage of equipment utensils or linens

5 rows × 22 columns


In [7]:
# 2 Which ten business have had the most inspections?

inspections.business_id.value_counts().head(10)


Out[7]:
1775     21
74374    20
2505     20
71618    19
74131    19
67154    19
71804    19
489      18
7747     18
65523    17
Name: business_id, dtype: int64

In [8]:
# 3 Group and count the inspections by type

inspections.type.value_counts()


Out[8]:
Routine - Unscheduled                15481
Reinspection/Followup                 5386
New Ownership                         1877
Complaint                             1634
New Construction                      1613
Non-inspection site visit              840
Complaint Reinspection/Followup        163
Structural Inspection                  120
Foodborne Illness Investigation        100
Routine - Scheduled                     63
Administrative or Document Review        4
Special Event                            2
Multi-agency Investigation               2
Name: type, dtype: int64

In [55]:
# 4 Create a plot that shows number of inspections per month
# Bonus for creating a heatmap
# http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.heatmap.html?highlight=heatmap

inspections['date'].dt.strftime('%b').value_counts().plot()


Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x12549198>

In [10]:
# 5 Which zip code contains the most high risk violations?

restaurant_scores[restaurant_scores['risk_category'] == 'High Risk'].postal_code.value_counts().head(1)


Out[10]:
94133    618
Name: postal_code, dtype: int64

In [11]:
# 6 If inspection is prompted by a change in restaurant ownership, 
# is the inspection more likely to be categorized as higher or lower risk?

restaurant_scores.risk_category[restaurant_scores['type'] == 'New Ownership'].value_counts()


Out[11]:
Low Risk         553
Moderate Risk    419
High Risk        148
Name: risk_category, dtype: int64

In [12]:
# 7 Examining the descriptions, what is the most common violation?

violations.description.value_counts().head(1)


Out[12]:
Unclean or degraded floors walls or ceilings    3395
Name: description, dtype: int64

In [56]:
# 8 Create a hist of the scores with 10 bins

inspections['Score'].hist(bins = 10)


Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x12514cc0>

In [71]:
# 9 Can you predict risk category based on the other features in this dataset? Higher scores are lower risk

restaurant_scores.risk_category[restaurant_scores['Score'] > 75].value_counts()


Out[71]:
Low Risk         17840
Moderate Risk    11180
High Risk         3629
Name: risk_category, dtype: int64

In [76]:
# 10 Extra Credit:
# Use Instagram location API to find pictures taken at the lat, long of the most High Risk restaurant
# https://www.instagram.com/developer/endpoints/locations/

restaurant_scores[restaurant_scores['risk_category'] == 'High Risk'].head()


Out[76]:
business_id name address city postal_code latitude longitude phone_number TaxCode business_certificate ... owner_address owner_city owner_state owner_zip Score date type ViolationTypeID risk_category description
48 58 Oasis Grill 91 Drumm St San Francisco 94111 37.794483 -122.396584 NaN H24 954377.0 ... 91 Drumm St San Francisco CA 94111 78.0 20140725 Routine - Unscheduled 103109 High Risk Unclean or unsanitary food contact surfaces
49 58 Oasis Grill 91 Drumm St San Francisco 94111 37.794483 -122.396584 NaN H24 954377.0 ... 91 Drumm St San Francisco CA 94111 78.0 20140725 Routine - Unscheduled 103114 High Risk High risk vermin infestation
67 66 STARBUCKS 1800 IRVING St S.F. 94122 37.763578 -122.477461 NaN H24 911744.0 ... 1800 IRVING St S.F. CA 94122 91.0 20140519 Routine - Unscheduled 103112 High Risk No hot water or running water
70 67 Revolution Cafe 3248 22nd St SF 94110 37.755419 -122.419542 NaN H24 NaN ... 3248 22nd St. SF CA 94110 87.0 20160401 Routine - Unscheduled 103109 High Risk Unclean or unsanitary food contact surfaces
80 73 DINO'S UNCLE VITO 2101 Fillmore St San Francisco 94115 37.788932 -122.433895 NaN H24 186790.0 ... \t2101 FILLMORE ST SAN FRANCISCO CA 94115 85.0 20150717 Routine - Unscheduled 103103 High Risk High risk food holding temperature

5 rows × 22 columns


In [78]:
#https://api.instagram.com/v1/locations/search?lat=37.794483&lng=-122.396584&access_token=145677507.717d650.794fe62f43014634985e6694cdabd845

In [16]:
############################
### A Little More Morbid ###
############################

In [17]:
killings = pd.read_csv('./data/police-killings.csv')
killings.head()


Out[17]:
Unnamed: 0 name age gender raceethnicity month day year streetaddress city state lawenforcementagency cause armed county_income
0 0 A'donte Washington 16 Male Black February 23 2015 Clearview Ln Millbrook AL Millbrook Police Department Gunshot No 54766
1 1 Aaron Rutledge 27 Male White April 2 2015 300 block Iris Park Dr Pineville LA Rapides Parish Sheriff's Office Gunshot No 40930
2 2 Aaron Siler 26 Male White March 14 2015 22nd Ave and 56th St Kenosha WI Kenosha Police Department Gunshot No 54930
3 3 Aaron Valdez 25 Male Hispanic/Latino March 11 2015 3000 Seminole Ave South Gate CA South Gate Police Department Gunshot Firearm 55909
4 4 Adam Jovicic 29 Male White March 19 2015 364 Hiwood Ave Munroe Falls OH Kent Police Department Gunshot No 49669

In [18]:
# 1. Make the following changed to column names:
# lawenforcementagency -> agency
# raceethnicity        -> race

killings.rename(columns = {'Lawenforcementagency': 'agency', 'raceethnicity': 'race'}, inplace = True)

In [19]:
# 2. Show the count of missing values in each column

killings.isnull().sum()


Out[19]:
Unnamed: 0              0
name                    0
age                     0
gender                  0
race                    0
month                   0
day                     0
year                    0
streetaddress           4
city                    0
state                   0
lawenforcementagency    0
cause                   0
armed                   0
county_income           0
dtype: int64

In [20]:
# 3. replace each null value in the dataframe with the string "Unknown"

killings.fillna('Unknown', inplace = True)

In [21]:
# 4. How many killings were there so far in 2015?

killings[killings['year'] == 2015].name.count()


Out[21]:
467

In [22]:
# 5. Of all killings, how many were male and how many female?

killings.gender.value_counts()


Out[22]:
Male      445
Female     22
Name: gender, dtype: int64

In [23]:
# 6. How many killings were of unarmed people?

killings[killings['armed'] == 'No'].name.count()


Out[23]:
102

In [24]:
# 7. What percentage of all killings were unarmed?

float(killings[killings['armed'] == 'No'].name.count()) / float(killings.name.count()) * 100


Out[24]:
21.841541755888652

In [25]:
# 8. What are the 5 states with the most killings?

killings.state.value_counts().head()


Out[25]:
CA    74
TX    46
FL    29
AZ    25
OK    22
Name: state, dtype: int64

In [26]:
# 9. Show a value counts of deaths for each race

killings.race.value_counts()


Out[26]:
White                     236
Black                     135
Hispanic/Latino            67
Unknown                    15
Asian/Pacific Islander     10
Native American             4
Name: race, dtype: int64

In [57]:
# 10. Display a histogram of ages of all killings

killings['age'].hist(bins = 10)


Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0xdc74390>

In [58]:
# 11. Show 6 histograms of ages by race

killings.age.hist(by = killings.race)


Out[58]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000000128874E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000012B61208>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000012D3BB70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000012E6E0F0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000012F23A58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000012FF86A0>]], dtype=object)

In [29]:
# 12. What is the average age of death by race?

killings.groupby('race').age.mean()


Out[29]:
race
Asian/Pacific Islander    40.800000
Black                     34.044444
Hispanic/Latino           31.716418
Native American           27.750000
Unknown                   43.533333
White                     40.466102
Name: age, dtype: float64

In [59]:
# 13. Show a bar chart with counts of deaths every month

killings.month.value_counts().plot(kind = 'bar', title = 'Deaths per Month')
plt.xlabel('Month')
plt.ylabel('Deaths')


Out[59]:
<matplotlib.text.Text at 0x131ff860>

In [31]:
###################
### Less Morbid ###
###################

In [32]:
majors = pd.read_csv('./data/college-majors.csv')
majors.head()


Out[32]:
Unnamed: 0 Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th
0 0 1100 GENERAL AGRICULTURE Agriculture & Natural Resources 128148 90245 74078 2423 0.026147 50000 34000 80000.0
1 1 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 95326 76865 64240 2266 0.028636 54000 36000 80000.0
2 2 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 33955 26321 22810 821 0.030248 63000 40000 98000.0
3 3 1103 ANIMAL SCIENCES Agriculture & Natural Resources 103549 81177 64937 3619 0.042679 46000 30000 72000.0
4 4 1104 FOOD SCIENCE Agriculture & Natural Resources 24280 17281 12722 894 0.049188 62000 38500 90000.0

In [33]:
# 1. Delete the columns (employed_full_time_year_round, major_code)

majors.drop(majors.columns[[6, 1]], axis = 1)


Out[33]:
Unnamed: 0 Major Major_category Total Employed Unemployed Unemployment_rate Median P25th P75th
0 0 GENERAL AGRICULTURE Agriculture & Natural Resources 128148 90245 2423 0.026147 50000 34000 80000.0
1 1 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 95326 76865 2266 0.028636 54000 36000 80000.0
2 2 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 33955 26321 821 0.030248 63000 40000 98000.0
3 3 ANIMAL SCIENCES Agriculture & Natural Resources 103549 81177 3619 0.042679 46000 30000 72000.0
4 4 FOOD SCIENCE Agriculture & Natural Resources 24280 17281 894 0.049188 62000 38500 90000.0
5 5 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources 79409 63043 2070 0.031791 50000 35000 75000.0
6 6 SOIL SCIENCE Agriculture & Natural Resources 6586 4926 264 0.050867 63000 39400 88000.0
7 7 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources 8549 6392 261 0.039230 52000 35000 75000.0
8 8 ENVIRONMENTAL SCIENCE Biology & Life Science 106106 87602 4736 0.051290 52000 38000 75000.0
9 9 FORESTRY Agriculture & Natural Resources 69447 48228 2144 0.042563 58000 40500 80000.0
10 10 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources 83188 65937 3789 0.054341 52000 37100 75000.0
11 11 ARCHITECTURE Engineering 294692 216770 20394 0.085991 63000 40400 93500.0
12 12 AREA ETHNIC AND CIVILIZATION STUDIES Humanities & Liberal Arts 103740 75798 5525 0.067939 46000 32000 71000.0
13 13 COMMUNICATIONS Communications & Journalism 987676 790696 54390 0.064360 50000 35000 80000.0
14 14 JOURNALISM Communications & Journalism 418104 314438 20754 0.061917 50000 35000 80000.0
15 15 MASS MEDIA Communications & Journalism 211213 170474 15431 0.083005 48000 32000 70000.0
16 16 ADVERTISING AND PUBLIC RELATIONS Communications & Journalism 186829 147433 10624 0.067216 50000 34000 75000.0
17 17 COMMUNICATION TECHNOLOGIES Computers & Mathematics 62141 49609 4609 0.085009 50000 34500 75000.0
18 18 COMPUTER AND INFORMATION SYSTEMS Computers & Mathematics 253782 218248 11945 0.051891 65000 45000 90000.0
19 19 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics 29317 22828 2265 0.090264 60000 40000 85000.0
20 20 COMPUTER SCIENCE Computers & Mathematics 783292 656372 34196 0.049519 78000 51000 105000.0
21 21 INFORMATION SCIENCES Computers & Mathematics 77805 66393 3704 0.052841 68000 46200 95000.0
22 22 COMPUTER ADMINISTRATION MANAGEMENT AND SECURITY Computers & Mathematics 39362 32366 2626 0.075046 55000 40000 80000.0
23 23 COMPUTER NETWORKING AND TELECOMMUNICATIONS Computers & Mathematics 51771 44071 2748 0.058694 55000 36000 80000.0
24 24 COSMETOLOGY SERVICES AND CULINARY ARTS Industrial Arts & Consumer Services 42325 33388 1941 0.054941 40000 26200 60000.0
25 25 GENERAL EDUCATION Education 1438867 843693 38742 0.043904 43000 32000 59000.0
26 26 EDUCATIONAL ADMINISTRATION AND SUPERVISION Education 4037 3113 0 0.000000 58000 44750 79000.0
27 27 SCHOOL STUDENT COUNSELING Education 2396 1492 169 0.101746 41000 33200 50000.0
28 28 ELEMENTARY EDUCATION Education 1446701 819393 32685 0.038359 40000 31000 50000.0
29 29 MATHEMATICS TEACHER EDUCATION Education 68808 47203 1610 0.032983 43000 34000 60000.0
... ... ... ... ... ... ... ... ... ... ...
143 143 FILM VIDEO AND PHOTOGRAPHIC ARTS Arts 133508 107651 10080 0.085619 47000 30000 70000.0
144 144 ART HISTORY AND CRITICISM Humanities & Liberal Arts 90852 61295 4185 0.063913 44500 30000 70000.0
145 145 STUDIO ARTS Arts 81008 58799 5372 0.083714 37600 24900 58000.0
146 146 MISCELLANEOUS FINE ARTS Arts 8511 6431 1190 0.156147 45000 30000 60000.0
147 147 GENERAL MEDICAL AND HEALTH SERVICES Health 104516 78198 4525 0.054701 50000 35000 73000.0
148 148 COMMUNICATION DISORDERS SCIENCES AND SERVICES Health 74977 49393 2407 0.046467 42000 30000 60000.0
149 149 HEALTH AND MEDICAL ADMINISTRATIVE SERVICES Health 108510 85360 5160 0.057004 50000 35000 75000.0
150 150 MEDICAL ASSISTING SERVICES Health 64316 51279 1660 0.031357 55000 37000 75000.0
151 151 MEDICAL TECHNOLOGIES TECHNICIANS Health 164990 121479 4564 0.036210 60000 45000 76000.0
152 152 HEALTH AND MEDICAL PREPARATORY PROGRAMS Health 32514 19009 1431 0.070010 50000 34000 85000.0
153 153 NURSING Health 1769892 1325711 36503 0.026797 62000 48000 80000.0
154 154 PHARMACY PHARMACEUTICAL SCIENCES AND ADMINISTR... Health 180084 124058 4414 0.034358 106000 78000 125000.0
155 155 TREATMENT THERAPY PROFESSIONS Health 252138 199174 5378 0.026292 61000 40000 80000.0
156 156 COMMUNITY AND PUBLIC HEALTH Health 56741 42543 3032 0.066528 47000 32800 70000.0
157 157 MISCELLANEOUS HEALTH MEDICAL PROFESSIONS Health 77647 52610 2978 0.053573 45000 32000 62000.0
158 158 GENERAL BUSINESS Business 2148712 1580978 85626 0.051378 60000 40000 95000.0
159 159 ACCOUNTING Business 1779219 1335825 75379 0.053415 65000 42500 100000.0
160 160 ACTUARIAL SCIENCE Business 9763 7846 466 0.056064 72000 53000 115000.0
161 161 BUSINESS MANAGEMENT AND ADMINISTRATION Business 3123510 2354398 147261 0.058865 58000 39500 86000.0
162 162 OPERATIONS LOGISTICS AND E-COMMERCE Business 57200 47341 2141 0.043268 65000 45000 90000.0
163 163 BUSINESS ECONOMICS Business 75547 57983 3816 0.061749 65000 45000 100000.0
164 164 MARKETING AND MARKETING RESEARCH Business 1114624 890125 51839 0.055033 56000 38500 90000.0
165 165 FINANCE Business 816548 670681 34166 0.048473 65000 45000 100000.0
166 166 HUMAN RESOURCES AND PERSONNEL MANAGEMENT Business 187274 142879 9241 0.060748 54000 38000 80000.0
167 167 INTERNATIONAL BUSINESS Business 86064 66453 5106 0.071354 54000 38600 80000.0
168 168 HOSPITALITY MANAGEMENT Business 200854 163393 8862 0.051447 49000 33000 70000.0
169 169 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business 156673 134478 6186 0.043977 72000 50000 100000.0
170 170 MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION Business 102753 77471 4308 0.052679 53000 36000 83000.0
171 171 HISTORY Humanities & Liberal Arts 712509 478416 33725 0.065851 50000 35000 80000.0
172 172 UNITED STATES HISTORY Humanities & Liberal Arts 17746 11887 943 0.073500 50000 39000 81000.0

173 rows × 10 columns


In [34]:
# 2. Show the cout of missing values in each column

majors.isnull().sum()


Out[34]:
Unnamed: 0                       0
Major_code                       0
Major                            0
Major_category                   0
Total                            0
Employed                         0
Employed_full_time_year_round    0
Unemployed                       0
Unemployment_rate                0
Median                           0
P25th                            0
P75th                            0
dtype: int64

In [35]:
# 3. What are the top 10 highest paying majors?

majors.groupby('Major').Median.max().head(10)


Out[35]:
Major
ACCOUNTING                               65000
ACTUARIAL SCIENCE                        72000
ADVERTISING AND PUBLIC RELATIONS         50000
AEROSPACE ENGINEERING                    80000
AGRICULTURAL ECONOMICS                   63000
AGRICULTURE PRODUCTION AND MANAGEMENT    54000
ANIMAL SCIENCES                          46000
ANTHROPOLOGY AND ARCHEOLOGY              43000
APPLIED MATHEMATICS                      70000
ARCHITECTURAL ENGINEERING                78000
Name: Median, dtype: int64

In [36]:
# 4. Plot the data from the last question in a bar chart, include proper title, and labels!

In [37]:
# 5. What is the average median salary for each major category?

majors.groupby('Major').Median.mean()


Out[37]:
Major
ACCOUNTING                                       65000
ACTUARIAL SCIENCE                                72000
ADVERTISING AND PUBLIC RELATIONS                 50000
AEROSPACE ENGINEERING                            80000
AGRICULTURAL ECONOMICS                           63000
AGRICULTURE PRODUCTION AND MANAGEMENT            54000
ANIMAL SCIENCES                                  46000
ANTHROPOLOGY AND ARCHEOLOGY                      43000
APPLIED MATHEMATICS                              70000
ARCHITECTURAL ENGINEERING                        78000
ARCHITECTURE                                     63000
AREA ETHNIC AND CIVILIZATION STUDIES             46000
ART AND MUSIC EDUCATION                          42600
ART HISTORY AND CRITICISM                        44500
ASTRONOMY AND ASTROPHYSICS                       80000
ATMOSPHERIC SCIENCES AND METEOROLOGY             60000
BIOCHEMICAL SCIENCES                             53000
BIOLOGICAL ENGINEERING                           62000
BIOLOGY                                          51000
BIOMEDICAL ENGINEERING                           65000
BOTANY                                           50000
BUSINESS ECONOMICS                               65000
BUSINESS MANAGEMENT AND ADMINISTRATION           58000
CHEMICAL ENGINEERING                             86000
CHEMISTRY                                        59000
CIVIL ENGINEERING                                78000
CLINICAL PSYCHOLOGY                              45000
COGNITIVE SCIENCE AND BIOPSYCHOLOGY              53000
COMMERCIAL ART AND GRAPHIC DESIGN                46600
COMMUNICATION DISORDERS SCIENCES AND SERVICES    42000
                                                 ...  
PHILOSOPHY AND RELIGIOUS STUDIES                 45000
PHYSICAL AND HEALTH EDUCATION TEACHING           48400
PHYSICAL FITNESS PARKS RECREATION AND LEISURE    44000
PHYSICAL SCIENCES                                60000
PHYSICS                                          70000
PHYSIOLOGY                                       50000
PLANT SCIENCE AND AGRONOMY                       50000
POLITICAL SCIENCE AND GOVERNMENT                 58000
PRE-LAW AND LEGAL STUDIES                        48000
PSYCHOLOGY                                       45000
PUBLIC ADMINISTRATION                            56000
PUBLIC POLICY                                    60000
SCHOOL STUDENT COUNSELING                        41000
SCIENCE AND COMPUTER TEACHER EDUCATION           46000
SECONDARY TEACHER EDUCATION                      45000
SOCIAL PSYCHOLOGY                                47000
SOCIAL SCIENCE OR HISTORY TEACHER EDUCATION      45000
SOCIAL WORK                                      40000
SOCIOLOGY                                        47000
SOIL SCIENCE                                     63000
SPECIAL NEEDS EDUCATION                          42000
STATISTICS AND DECISION SCIENCE                  70000
STUDIO ARTS                                      37600
TEACHER EDUCATION: MULTIPLE LEVELS               40000
THEOLOGY AND RELIGIOUS VOCATIONS                 40000
TRANSPORTATION SCIENCES AND TECHNOLOGIES         67000
TREATMENT THERAPY PROFESSIONS                    61000
UNITED STATES HISTORY                            50000
VISUAL AND PERFORMING ARTS                       40000
ZOOLOGY                                          55000
Name: Median, dtype: int64

In [38]:
# 6. Show only the top 5 paying major categories

majors.groupby('Major').Median.mean().head()


Out[38]:
Major
ACCOUNTING                          65000
ACTUARIAL SCIENCE                   72000
ADVERTISING AND PUBLIC RELATIONS    50000
AEROSPACE ENGINEERING               80000
AGRICULTURAL ECONOMICS              63000
Name: Median, dtype: int64

In [60]:
# 7. Plot a histogram of the distribution of median salaries

majors['Median'].hist(bins = 10)


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x13954518>

In [61]:
# 8. Plot a histogram of the distribution of median salaries by major category

majors.Median.hist(by = majors.Major_category)


Out[61]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001380D438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000013CA9C50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000013EF9518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000013F59C50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000014082D68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000012F63080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014216160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001430DAC8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001436ED68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000144747F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014519828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001461F2B0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000014650A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000147C4B00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000148DB4A8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001497D6A0>]], dtype=object)

In [41]:
# 9. What are the top 10 most UNemployed majors?

majors[['Major', 'Unemployed']].sort_index(by = 'Unemployed', ascending = False).head(10)


C:\Users\sglembocki\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  app.launch_new_instance()
Out[41]:
Major Unemployed
161 BUSINESS MANAGEMENT AND ADMINISTRATION 147261
158 GENERAL BUSINESS 85626
114 PSYCHOLOGY 79066
159 ACCOUNTING 75379
13 COMMUNICATIONS 54390
73 ENGLISH LANGUAGE AND LITERATURE 52248
164 MARKETING AND MARKETING RESEARCH 51839
132 POLITICAL SCIENCE AND GOVERNMENT 40376
25 GENERAL EDUCATION 38742
78 BIOLOGY 36757

In [42]:
# What are the unemployment rates?
majors['Unemployment_rate'] = (majors.Unemployed / (majors.Unemployed + majors.Employed) * 100)
majors[['Major', 'Unemployment_rate']].sort_index(by = 'Unemployment_rate', ascending = False)


C:\Users\sglembocki\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  app.launch_new_instance()
Out[42]:
Major Unemployment_rate
146 MISCELLANEOUS FINE ARTS 15.614749
116 CLINICAL PSYCHOLOGY 10.271216
93 MILITARY TECHNOLOGIES 10.179641
27 SCHOOL STUDENT COUNSELING 10.174594
77 LIBRARY SCIENCE 9.484299
141 VISUAL AND PERFORMING ARTS 9.465800
19 COMPUTER PROGRAMMING AND DATA PROCESSING 9.026422
119 SOCIAL PSYCHOLOGY 8.733624
104 ASTRONOMY AND ASTROPHYSICS 8.602151
11 ARCHITECTURE 8.599113
143 FILM VIDEO AND PHOTOGRAPHIC ARTS 8.561891
17 COMMUNICATION TECHNOLOGIES 8.500867
145 STUDIO ARTS 8.371383
118 INDUSTRIAL AND ORGANIZATIONAL PSYCHOLOGY 8.362907
15 MASS MEDIA 8.300476
120 MISCELLANEOUS PSYCHOLOGY 8.200936
67 LINGUISTICS AND COMPARATIVE LANGUAGE AND LITER... 8.134809
139 DRAMA AND THEATER ARTS 8.027373
123 PUBLIC POLICY 7.921692
44 BIOMEDICAL ENGINEERING 7.903583
128 ANTHROPOLOGY AND ARCHEOLOGY 7.817578
76 HUMANITIES 7.784376
94 MULTI/INTERDISCIPLINARY STUDIES 7.726897
115 EDUCATIONAL PSYCHOLOGY 7.563114
22 COMPUTER ADMINISTRATION MANAGEMENT AND SECURITY 7.504572
101 PHILOSOPHY AND RELIGIOUS STUDIES 7.502034
95 INTERCULTURAL AND INTERNATIONAL STUDIES 7.433013
142 COMMERCIAL ART AND GRAPHIC DESIGN 7.391972
74 COMPOSITION AND RHETORIC 7.361495
172 UNITED STATES HISTORY 7.349961
... ... ...
9 FORESTRY 4.256333
102 THEOLOGY AND RELIGIOUS VOCATIONS 4.250511
59 PETROLEUM ENGINEERING 4.220535
32 SCIENCE AND COMPUTER TEACHER EDUCATION 4.219989
41 AEROSPACE ENGINEERING 4.197131
83 GENETICS 4.159095
38 ART AND MUSIC EDUCATION 4.097337
57 NAVAL ARCHITECTURE AND MARINE ENGINEERING 4.030882
7 MISCELLANEOUS AGRICULTURE 3.923042
39 MISCELLANEOUS EDUCATION 3.921524
103 PHYSICAL SCIENCES 3.895254
28 ELEMENTARY EDUCATION 3.835916
105 ATMOSPHERIC SCIENCES AND METEOROLOGY 3.672631
151 MEDICAL TECHNOLOGIES TECHNICIANS 3.620986
154 PHARMACY PHARMACEUTICAL SCIENCES AND ADMINISTR... 3.435768
80 BOTANY 3.402351
36 TEACHER EDUCATION: MULTIPLE LEVELS 3.335686
29 MATHEMATICS TEACHER EDUCATION 3.298302
5 PLANT SCIENCE AND AGRONOMY 3.179089
150 MEDICAL ASSISTING SERVICES 3.135684
2 AGRICULTURAL ECONOMICS 3.024832
1 AGRICULTURE PRODUCTION AND MANAGEMENT 2.863606
153 NURSING 2.679682
155 TREATMENT THERAPY PROFESSIONS 2.629160
0 GENERAL AGRICULTURE 2.614711
97 MATHEMATICS AND COMPUTER SCIENCE 2.490040
111 MATERIALS SCIENCE 2.233333
85 PHARMACOLOGY 1.611080
26 EDUCATIONAL ADMINISTRATION AND SUPERVISION 0.000000
51 GEOLOGICAL AND GEOPHYSICAL ENGINEERING 0.000000

173 rows × 2 columns


In [43]:
# 10. What are the top 10 most UNemployed majors CATEGORIES? Use the mean for each category
# What are the unemployment rates?

majors.groupby('Major_category').Unemployment_rate.mean()


Out[43]:
Major_category
Agriculture & Natural Resources        3.956918
Arts                                   8.760052
Biology & Life Science                 4.993597
Business                               5.449602
Communications & Journalism            6.912452
Computers & Mathematics                5.943698
Education                              4.676196
Engineering                            5.063002
Health                                 4.720928
Humanities & Liberal Arts              6.942871
Industrial Arts & Consumer Services    5.854567
Interdisciplinary                      7.726897
Law & Public Policy                    6.785356
Physical Sciences                      5.454062
Psychology & Social Work               7.786702
Social Science                         6.568566
Name: Unemployment_rate, dtype: float64

In [44]:
# 11. the total and employed column refer to the people that were surveyed.
# Create a new column showing the emlpoyment rate of the people surveyed for each major
# call it "sample_employment_rate"
# Example the first row has total: 128148 and employed: 90245. it's 
# sample_employment_rate should be 90245.0 / 128148.0 = .7042

majors['sample_employment_rate'] = majors['Employed'] / majors['Total']
majors.head()


Out[44]:
Unnamed: 0 Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th sample_employment_rate
0 0 1100 GENERAL AGRICULTURE Agriculture & Natural Resources 128148 90245 74078 2423 2.614711 50000 34000 80000.0 0.704225
1 1 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 95326 76865 64240 2266 2.863606 54000 36000 80000.0 0.806338
2 2 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 33955 26321 22810 821 3.024832 63000 40000 98000.0 0.775173
3 3 1103 ANIMAL SCIENCES Agriculture & Natural Resources 103549 81177 64937 3619 4.267890 46000 30000 72000.0 0.783948
4 4 1104 FOOD SCIENCE Agriculture & Natural Resources 24280 17281 12722 894 4.918845 62000 38500 90000.0 0.711738

In [45]:
# 12. Create a "sample_unemployment_rate" column
# this column should be 1 - "sample_employment_rate"

majors['sample_unemployment_rate'] = 1 - majors['sample_employment_rate']
majors.head()


Out[45]:
Unnamed: 0 Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th sample_employment_rate sample_unemployment_rate
0 0 1100 GENERAL AGRICULTURE Agriculture & Natural Resources 128148 90245 74078 2423 2.614711 50000 34000 80000.0 0.704225 0.295775
1 1 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 95326 76865 64240 2266 2.863606 54000 36000 80000.0 0.806338 0.193662
2 2 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 33955 26321 22810 821 3.024832 63000 40000 98000.0 0.775173 0.224827
3 3 1103 ANIMAL SCIENCES Agriculture & Natural Resources 103549 81177 64937 3619 4.267890 46000 30000 72000.0 0.783948 0.216052
4 4 1104 FOOD SCIENCE Agriculture & Natural Resources 24280 17281 12722 894 4.918845 62000 38500 90000.0 0.711738 0.288262

In [ ]: