Start with the standard imports we have used for every notebook in this class.
In [45]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
Each of the datasheets downloaded from ELSI had download metadata on the top of them and total and key information on the bottom of them that were not data rows. This metadata, total, and key information was manually deleted before import. Some of the files had ="0" instead of 0 in the cells. This was found/replaced before import using the sed -i '' 's/="0"/0/g' *.csv command from the terminal.
In [46]:
#CITATION: This is the data from National Center for Education Statistics on Schools
#Schools for all 50 states and Washington, D.C.
#http://nces.ed.gov/ccd/elsi/
#ELSI Root Data Source: U.S. Department of Education National Center for Education Statistics Common Core of Data (CCD) "Public Elementary/Secondary School Universe Survey" 2012-13 v.1a 2013-14 v.1a.
#KEY:
#† indicates that the data are not applicable.
#– indicates that the data are missing.
#‡ indicates that the data do not meet NCES data quality standards.
schoolinformation = pd.read_csv("data/rawdata/schools/2009-2010 SCHOOL Information Tab.csv", dtype=np.str)
schoolcharacteristicsa = pd.read_csv("data/rawdata/schools/2009-2010 SCHOOL CharacteristicsA Tab.csv", dtype=np.str)
schoolcharacteristicsb = pd.read_csv("data/rawdata/schools/2009-2010 SCHOOL CharacteristicsB Tab.csv", dtype=np.str)
schoolenrollment = pd.read_csv("data/rawdata/schools/2009-2010 SCHOOL Enrollments Tab.csv", dtype=np.str)
schoolenrollmentdetails = pd.read_csv("data/rawdata/schools/2009-2010 SCHOOL Enrollment Details Tab.csv", dtype=np.str)
#schoolenrollmentK3 = pd.read_csv("tempdata/2013-2014 SCHOOL Enrollment by Grade, Race-Ethnicity, and Gender Tab PreK-3.csv", dtype=np.str)
#schoolenrollment48 = pd.read_csv("tempdata/2013-2014 SCHOOL Enrollment by Grade, Race-Ethnicity and Gender Tab 4-8.csv", dtype=np.str)
#schoolenrollment912 = pd.read_csv("tempdata/2013-2014 SCHOOL Enrollment by Grade, Race-Ethnicity and Gender Tab 9-Ungraded.csv", dtype=np.str)
schoolteacherstaff = pd.read_csv("data/rawdata/schools/2009-2010 SCHOOL Teacher and Staff Tab.csv", dtype=np.str)
Check the lengths of the datasets to see if we have a row for every school.
In [47]:
print len(schoolinformation)
print len(schoolcharacteristicsa)
print len(schoolcharacteristicsb)
print len(schoolenrollment)
print len(schoolenrollmentdetails)
#print len(schoolenrollmentK3)
#print len(schoolenrollment48)
#print len(schoolenrollment912)
print len(schoolteacherstaff)
Drop all of the duplicate columns.
In [49]:
schoolcharacteristicsa = schoolcharacteristicsa.drop(schoolcharacteristicsa.columns[[0, 1, 20]], 1)
schoolcharacteristicsb = schoolcharacteristicsb.drop(schoolcharacteristicsb.columns[[0, 1]], 1)
schoolenrollment = schoolenrollment.drop(schoolenrollment.columns[[0, 1]], 1)
schoolenrollmentdetails = schoolenrollmentdetails.drop(schoolenrollmentdetails.columns[[0, 1]], 1)
#schoolenrollmentK3 = schoolenrollmentK3.drop(schoolenrollmentK3.columns[[0, 1]], 1)
#schoolenrollment48 = schoolenrollment48.drop(schoolenrollment48.columns[[0, 1]], 1)
#schoolenrollment912 = schoolenrollment912.drop(schoolenrollment912.columns[[0, 1, 72]], 1)
In [50]:
schoolinformation.head()
Out[50]:
Join all of the school datasets.
In [51]:
joinedschool = schoolteacherstaff.join([schoolcharacteristicsa, schoolcharacteristicsb, schoolenrollment, schoolenrollmentdetails])
In [52]:
joinedschool = schoolinformation.merge(joinedschool, 'left', 'School ID - NCES Assigned [Public School] Latest available year', suffixes=('', '_DEL'))
In [55]:
#Need to get rid of Excel syntax ="" from some of the columns
for i, col in enumerate(joinedschool.columns):
joinedschool[col] = joinedschool[col].map(lambda x: str(x).lstrip('="').rstrip('"'))
In [ ]:
#If by chance any rows have NaN, replace with the ELSI standard for missing data '–'
joinedschool = joinedschool.fillna('–')
joinedschool = joinedschool.replace('nan', '–')
In [57]:
# Replacing Missing Data / NA / Bad Quality data with blank, later to be turned into NaN for float columns
# CITATIOIN : http://pandas.pydata.org/pandas-docs/version/0.15.2/missing_data.html
joinedschool = joinedschool.replace('\xe2\x80\x93', '') # Replace "-" (Missing Data) with blank
joinedschool = joinedschool.replace('\xe2\x80\xa0', '') # Replace "†" (Not Applicable) with blank
joinedschool = joinedschool.replace('\xe2\x80\xa1', '') # Replace "‡" (Bad Quality) with blank
In [62]:
joinedschool['i_agency_type_regional_education_services'] = np.where(joinedschool['Agency Type [District] 2009-10']=='4-Regional education services agency', 1, 0)
joinedschool['i_agency_type_local_school_district'] = np.where(joinedschool['Agency Type [District] 2009-10']=='1-Local school district', 1, 0)
joinedschool['i_lgo_PK'] = np.where(joinedschool['Lowest Grade Offered [Public School] 2009-10']=='Prekindergarten', 1, 0)
joinedschool['i_lgo_K'] = np.where(joinedschool['Lowest Grade Offered [Public School] 2009-10']=='Kindergarten', 1, 0)
joinedschool['Black Students [Public School] 2009-10'] = joinedschool['Black Students [Public School] 2009-10'].replace('', np.nan)
joinedschool['Grades 9-12 Students [Public School] 2009-10'] = joinedschool['Grades 9-12 Students [Public School] 2009-10'].replace('', np.nan)
joinedschool['Total Students [Public School] 2009-10'] = joinedschool['Total Students [Public School] 2009-10'].replace('', np.nan)
joinedschool['Black Students [Public School] 2009-10'] = joinedschool['Black Students [Public School] 2009-10'].astype(float)
joinedschool['Grades 9-12 Students [Public School] 2009-10'] = joinedschool['Grades 9-12 Students [Public School] 2009-10'].astype(float)
joinedschool['Total Students [Public School] 2009-10'] = joinedschool['Total Students [Public School] 2009-10'].astype(float)
joinedschool['r_stud_re_B'] = joinedschool['Black Students [Public School] 2009-10']/joinedschool['Total Students [Public School] 2009-10']
joinedschool['r_stud_912'] = joinedschool['Grades 9-12 Students [Public School] 2009-10']/joinedschool['Total Students [Public School] 2009-10']
joinedschool['r_st_TS'] = joinedschool['Pupil/Teacher Ratio [Public School] 2009-10']
In [65]:
joinedschool = joinedschool[['School Name [Public School] 2009-10', 'State Name [Public School] Latest available year', 'School ID - NCES Assigned [Public School] Latest available year', 'Agency ID - NCES Assigned [Public School] Latest available year', 'i_agency_type_regional_education_services', 'i_agency_type_local_school_district', 'i_lgo_PK', 'i_lgo_K', 'r_stud_re_B', 'r_stud_912', 'r_st_TS']]
In [66]:
joinedschool.head()
Out[66]:
In [67]:
joinedschool=joinedschool.replace([np.inf, -np.inf], np.nan)
In [68]:
joinedschool.fillna(value=0,inplace=True)
In [69]:
joinedschool.head()
Out[69]:
In [71]:
joinedschool.to_csv("data/finaldata/tableauschools.csv")
This is the full list of all column names in the schools dataframe.
In [72]:
for col in joinedschool.columns:
print col
In [ ]: