In [4]:
import pandas as pd
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for data_file in data_files:
#remove .csv in the file name for the variable name
df_name = data_file[:-4]
data[df_name] = pd.read_csv('../resources/' + data_file)
print(data.keys())
dict_keys(['ap_2010', 'class_size', 'demographics', 'graduation', 'hs_directory', 'sat_results'])
In [5]:
#make copy pointer to the sat dataframe
sat = data['sat_results']
print(sat.head())
for key,value in data.items():
print('Data set name {}'.format(key))
print(value.head())
DBN SCHOOL NAME \
0 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES
1 01M448 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL
2 01M450 EAST SIDE COMMUNITY SCHOOL
3 01M458 FORSYTH SATELLITE ACADEMY
4 01M509 MARTA VALLE HIGH SCHOOL
Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score \
0 29 355 404
1 91 383 423
2 70 377 402
3 7 414 401
4 44 390 433
SAT Writing Avg. Score
0 363
1 366
2 370
3 359
4 384
Data set name ap_2010
DBN SchoolName AP Test Takers \
0 01M448 UNIVERSITY NEIGHBORHOOD H.S. 39
1 01M450 EAST SIDE COMMUNITY HS 19
2 01M515 LOWER EASTSIDE PREP 24
3 01M539 NEW EXPLORATIONS SCI,TECH,MATH 255
4 02M296 High School of Hospitality Management s
Total Exams Taken Number of Exams with scores 3 4 or 5
0 49 10
1 21 s
2 26 24
3 377 191
4 s s
Data set name class_size
CSD BOROUGH SCHOOL CODE SCHOOL NAME GRADE PROGRAM TYPE \
0 1 M M015 P.S. 015 Roberto Clemente 0K GEN ED
1 1 M M015 P.S. 015 Roberto Clemente 0K CTT
2 1 M M015 P.S. 015 Roberto Clemente 01 GEN ED
3 1 M M015 P.S. 015 Roberto Clemente 01 CTT
4 1 M M015 P.S. 015 Roberto Clemente 02 GEN ED
CORE SUBJECT (MS CORE and 9-12 ONLY) CORE COURSE (MS CORE and 9-12 ONLY) \
0 - -
1 - -
2 - -
3 - -
4 - -
SERVICE CATEGORY(K-9* ONLY) NUMBER OF STUDENTS / SEATS FILLED \
0 - 19.0
1 - 21.0
2 - 17.0
3 - 17.0
4 - 15.0
NUMBER OF SECTIONS AVERAGE CLASS SIZE SIZE OF SMALLEST CLASS \
0 1.0 19.0 19.0
1 1.0 21.0 21.0
2 1.0 17.0 17.0
3 1.0 17.0 17.0
4 1.0 15.0 15.0
SIZE OF LARGEST CLASS DATA SOURCE SCHOOLWIDE PUPIL-TEACHER RATIO
0 19.0 ATS NaN
1 21.0 ATS NaN
2 17.0 ATS NaN
3 17.0 ATS NaN
4 15.0 ATS NaN
Data set name demographics
DBN Name schoolyear fl_percent frl_percent \
0 01M015 P.S. 015 ROBERTO CLEMENTE 20052006 89.4 NaN
1 01M015 P.S. 015 ROBERTO CLEMENTE 20062007 89.4 NaN
2 01M015 P.S. 015 ROBERTO CLEMENTE 20072008 89.4 NaN
3 01M015 P.S. 015 ROBERTO CLEMENTE 20082009 89.4 NaN
4 01M015 P.S. 015 ROBERTO CLEMENTE 20092010 96.5
total_enrollment prek k grade1 grade2 ... black_num black_per \
0 281 15 36 40 33 ... 74 26.3
1 243 15 29 39 38 ... 68 28.0
2 261 18 43 39 36 ... 77 29.5
3 252 17 37 44 32 ... 75 29.8
4 208 16 40 28 32 ... 67 32.2
hispanic_num hispanic_per white_num white_per male_num male_per female_num \
0 189 67.3 5 1.8 158.0 56.2 123.0
1 153 63.0 4 1.6 140.0 57.6 103.0
2 157 60.2 7 2.7 143.0 54.8 118.0
3 149 59.1 7 2.8 149.0 59.1 103.0
4 118 56.7 6 2.9 124.0 59.6 84.0
female_per
0 43.8
1 42.4
2 45.2
3 40.9
4 40.4
[5 rows x 38 columns]
Data set name graduation
Demographic DBN School Name Cohort \
0 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003
1 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2004
2 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2005
3 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2006
4 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2006 Aug
Total Cohort Total Grads - n Total Grads - % of cohort Total Regents - n \
0 5 s s s
1 55 37 67.3% 17
2 64 43 67.2% 27
3 78 43 55.1% 36
4 78 44 56.4% 37
Total Regents - % of cohort Total Regents - % of grads \
0 s s
1 30.9% 45.9%
2 42.2% 62.8%
3 46.2% 83.7%
4 47.4% 84.1%
... Regents w/o Advanced - n \
0 ... s
1 ... 17
2 ... 27
3 ... 36
4 ... 37
Regents w/o Advanced - % of cohort Regents w/o Advanced - % of grads \
0 s s
1 30.9% 45.9%
2 42.2% 62.8%
3 46.2% 83.7%
4 47.4% 84.1%
Local - n Local - % of cohort Local - % of grads Still Enrolled - n \
0 s s s s
1 20 36.4% 54.1% 15
2 16 25% 37.200000000000003% 9
3 7 9% 16.3% 16
4 7 9% 15.9% 15
Still Enrolled - % of cohort Dropped Out - n Dropped Out - % of cohort
0 s s s
1 27.3% 3 5.5%
2 14.1% 9 14.1%
3 20.5% 11 14.1%
4 19.2% 11 14.1%
[5 rows x 23 columns]
Data set name hs_directory
dbn school_name boro \
0 17K548 Brooklyn School for Music & Theatre Brooklyn
1 09X543 High School for Violin and Dance Bronx
2 09X327 Comprehensive Model School Project M.S. 327 Bronx
3 02M280 Manhattan Early College School for Advertising Manhattan
4 28Q680 Queens Gateway to Health Sciences Secondary Sc... Queens
building_code phone_number fax_number grade_span_min grade_span_max \
0 K440 718-230-6250 718-230-6262 9 12
1 X400 718-842-0687 718-589-9849 9 12
2 X240 718-294-8111 718-294-8109 6 12
3 M520 718-935-3477 NaN 9 10
4 Q695 718-969-3155 718-969-3552 6 12
expgrade_span_min expgrade_span_max \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 9 14.0
4 NaN NaN
... \
0 ...
1 ...
2 ...
3 ...
4 ...
priority02 \
0 Then to New York City residents
1 Then to New York City residents who attend an ...
2 Then to Bronx students or residents who attend...
3 Then to New York City residents who attend an ...
4 Then to Districts 28 and 29 students or residents
priority03 \
0 NaN
1 Then to Bronx students or residents
2 Then to New York City residents who attend an ...
3 Then to Manhattan students or residents
4 Then to Queens students or residents
priority04 priority05 \
0 NaN NaN
1 Then to New York City residents NaN
2 Then to Bronx students or residents Then to New York City residents
3 Then to New York City residents NaN
4 Then to New York City residents NaN
priority06 priority07 priority08 priority09 priority10 \
0 NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN
Location 1
0 883 Classon Avenue\nBrooklyn, NY 11225\n(40.67...
1 1110 Boston Road\nBronx, NY 10456\n(40.8276026...
2 1501 Jerome Avenue\nBronx, NY 10452\n(40.84241...
3 411 Pearl Street\nNew York, NY 10038\n(40.7106...
4 160-20 Goethals Avenue\nJamaica, NY 11432\n(40...
[5 rows x 58 columns]
Data set name sat_results
DBN SCHOOL NAME \
0 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES
1 01M448 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL
2 01M450 EAST SIDE COMMUNITY SCHOOL
3 01M458 FORSYTH SATELLITE ACADEMY
4 01M509 MARTA VALLE HIGH SCHOOL
Num of SAT Test Takers SAT Critical Reading Avg. Score SAT Math Avg. Score \
0 29 355 404
1 91 383 423
2 70 377 402
3 7 414 401
4 44 390 433
SAT Writing Avg. Score
0 363
1 366
2 370
3 359
4 384
In [6]:
all_survey = pd.read_csv('../resources//survey_all.txt', delimiter='\t',
encoding='windows-1252')
print(all_survey.iloc[0])
d75_survey = pd.read_csv('../resources/survey_d75.txt', delimiter='\t',
encoding='windows-1252')
print(d75_survey.iloc[0])
survey = pd.concat([all_survey,d75_survey],axis=0)
print(survey.iloc[0])
dbn 01M015
bn M015
schoolname P.S. 015 Roberto Clemente
d75 0
studentssurveyed No
highschool 0
schooltype Elementary School
rr_s NaN
rr_t 88
rr_p 60
N_s NaN
N_t 22
N_p 90
nr_s 0
nr_t 25
nr_p 150
saf_p_11 8.5
com_p_11 7.6
eng_p_11 7.5
aca_p_11 7.8
saf_t_11 7.5
com_t_11 7.8
eng_t_11 7.6
aca_t_11 7.9
saf_s_11 NaN
com_s_11 NaN
eng_s_11 NaN
aca_s_11 NaN
saf_tot_11 8
com_tot_11 7.7
...
s_N_q13g_3 NaN
s_N_q13g_4 NaN
s_N_q14a_1 NaN
s_N_q14a_2 NaN
s_N_q14a_3 NaN
s_N_q14a_4 NaN
s_N_q14b_1 NaN
s_N_q14b_2 NaN
s_N_q14b_3 NaN
s_N_q14b_4 NaN
s_N_q14c_1 NaN
s_N_q14c_2 NaN
s_N_q14c_3 NaN
s_N_q14c_4 NaN
s_N_q14d_1 NaN
s_N_q14d_2 NaN
s_N_q14d_3 NaN
s_N_q14d_4 NaN
s_N_q14e_1 NaN
s_N_q14e_2 NaN
s_N_q14e_3 NaN
s_N_q14e_4 NaN
s_N_q14f_1 NaN
s_N_q14f_2 NaN
s_N_q14f_3 NaN
s_N_q14f_4 NaN
s_N_q14g_1 NaN
s_N_q14g_2 NaN
s_N_q14g_3 NaN
s_N_q14g_4 NaN
Name: 0, Length: 1942, dtype: object
dbn 75K004
bn K004
schoolname P.S. K004
d75 1
studentssurveyed Yes
highschool 0
schooltype District 75 Special Education
rr_s 38
rr_t 90
rr_p 72
N_s 8
N_t 81
N_p 244
nr_s 21
nr_t 90
nr_p 337
saf_p_11 9.1
com_p_11 8.6
eng_p_11 8.4
aca_p_11 8.5
saf_t_11 7.4
com_t_11 6.6
eng_t_11 6.4
aca_t_11 6.4
saf_s_11 7.6
com_s_11 5.8
eng_s_11 7.6
aca_s_11 6.3
saf_tot_11 8
com_tot_11 7
...
s_q13c_2 86
s_q13c_3 0
s_q13c_4 0
s_q13d_1 0
s_q13d_2 86
s_q13d_3 14
s_q13d_4 0
s_q13e_1 0
s_q13e_2 100
s_q13e_3 0
s_q13e_4 0
s_q13f_1 0
s_q13f_2 100
s_q13f_3 0
s_q13f_4 0
s_q13g_1 0
s_q13g_2 100
s_q13g_3 0
s_q13g_4 0
s_q14_1 71
s_q14_2 29
s_q14_3 0
s_q14_4 0
s_q14_5 0
s_q14_6 0
s_q14_7 0
s_q14_8 0
s_q14_9 0
s_q14_10 0
s_q14_11 0
Name: 0, Length: 1773, dtype: object
N_p 90
N_s NaN
N_t 22
aca_p_11 7.8
aca_s_11 NaN
aca_t_11 7.9
aca_tot_11 7.9
bn M015
com_p_11 7.6
com_s_11 NaN
com_t_11 7.8
com_tot_11 7.7
d75 0
dbn 01M015
eng_p_11 7.5
eng_s_11 NaN
eng_t_11 7.6
eng_tot_11 7.5
highschool 0
nr_p 150
nr_s 0
nr_t 25
p_N_q10a 56
p_N_q10a_1 NaN
p_N_q10a_2 NaN
p_N_q10a_3 NaN
p_N_q10a_4 NaN
p_N_q10a_5 NaN
p_N_q10b 30
p_N_q10b_1 NaN
...
t_q7e_3 5
t_q7e_4 5
t_q7e_5 0
t_q7f NaN
t_q7f_1 NaN
t_q7f_2 NaN
t_q7f_3 NaN
t_q7f_4 NaN
t_q7f_5 NaN
t_q8a 7
t_q8a_1 30
t_q8a_2 50
t_q8a_3 20
t_q8a_4 0
t_q8b 7
t_q8b_1 20
t_q8b_2 70
t_q8b_3 10
t_q8b_4 0
t_q8c 7.5
t_q8c_1 29
t_q8c_2 67
t_q8c_3 5
t_q8c_4 0
t_q9 NaN
t_q9_1 5
t_q9_2 14
t_q9_3 52
t_q9_4 24
t_q9_5 5
Name: 0, Length: 2773, dtype: object
In [7]:
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
print(survey.head())
DBN rr_s rr_t rr_p N_s N_t N_p saf_p_11 com_p_11 eng_p_11 \
0 01M015 NaN 88 60 NaN 22.0 90.0 8.5 7.6 7.5
1 01M019 NaN 100 60 NaN 34.0 161.0 8.4 7.6 7.6
2 01M020 NaN 88 73 NaN 42.0 367.0 8.9 8.3 8.3
3 01M034 89.0 73 50 145.0 29.0 151.0 8.8 8.2 8.0
4 01M063 NaN 100 60 NaN 23.0 90.0 8.7 7.9 8.1
... eng_t_11 aca_t_11 saf_s_11 com_s_11 eng_s_11 aca_s_11 \
0 ... 7.6 7.9 NaN NaN NaN NaN
1 ... 8.9 9.1 NaN NaN NaN NaN
2 ... 6.8 7.5 NaN NaN NaN NaN
3 ... 6.8 7.8 6.2 5.9 6.5 7.4
4 ... 7.8 8.1 NaN NaN NaN NaN
saf_tot_11 com_tot_11 eng_tot_11 aca_tot_11
0 8.0 7.7 7.5 7.9
1 8.5 8.1 8.2 8.4
2 8.2 7.3 7.5 8.0
3 7.3 6.7 7.1 7.9
4 8.5 7.6 7.9 8.0
[5 rows x 23 columns]
In [9]:
def padded_csd(csd):
csd_str = str(csd)
if len(csd_str) == 1:
csd_str = csd_str.zfill(2)
return csd_str
def genDBN(row):
return row['padded_csd'] + row['SCHOOL CODE']
hs_dir = data['hs_directory']
hs_dir['DBN'] = hs_dir['dbn']
class_size = data['class_size']
class_size['padded_csd'] = class_size['CSD'].apply(padded_csd)
class_size['DBN'] = class_size.apply(genDBN, axis=1)
print(class_size['DBN'].iloc[0:5])
0 01M015
1 01M015
2 01M015
3 01M015
4 01M015
Name: DBN, dtype: object
In [10]:
sat_results = data['sat_results']
string_to_num_cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score','SAT Writing Avg. Score']
sat_results['sat_score'] = [0]*len(sat_results)
for col in string_to_num_cols:
sat_results[col] = pd.to_numeric(sat_results[col],errors='coerce')
sat_results['sat_score'] = sat_results['sat_score'] + sat_results[col]
print(sat_results[string_to_num_cols].iloc[0])
print(sat_results['sat_score'].iloc[0])
SAT Math Avg. Score 404.0
SAT Critical Reading Avg. Score 355.0
SAT Writing Avg. Score 363.0
Name: 0, dtype: float64
1122.0
In [11]:
import re
def extract_lat(data):
match = re.findall('\(.+\)', data)
#splits the lat and long and remove first char '(' and last char from lat which is comma(,) and returns the string
lat_long = match[0]
return lat_long.split(' ')[0][1:-1]
hs_directory = data['hs_directory']
hs_directory['lat'] = hs_directory['Location 1'].apply(extract_lat)
print(hs_directory['lat'].iloc[0:5])
0 40.67029890700047
1 40.8276026690005
2 40.842414068000494
3 40.71067947100045
4 40.718810094000446
Name: lat, dtype: object
In [12]:
def extract_longitude(data):
match = re.findall('\(.+\)', data)
lat_long = match[0]
#split lat and long and remove last char ')' from long then return it
return lat_long.split(' ')[1][:-1]
hs_directory = data['hs_directory']
hs_directory['lon'] = hs_directory['Location 1'].apply(extract_longitude)
lat_lon = ['lat', 'lon']
for l in lat_lon:
hs_directory[l] = pd.to_numeric(hs_directory[l], errors='coerce')
print(hs_directory.iloc[0:5])
dbn school_name boro \
0 17K548 Brooklyn School for Music & Theatre Brooklyn
1 09X543 High School for Violin and Dance Bronx
2 09X327 Comprehensive Model School Project M.S. 327 Bronx
3 02M280 Manhattan Early College School for Advertising Manhattan
4 28Q680 Queens Gateway to Health Sciences Secondary Sc... Queens
building_code phone_number fax_number grade_span_min grade_span_max \
0 K440 718-230-6250 718-230-6262 9 12
1 X400 718-842-0687 718-589-9849 9 12
2 X240 718-294-8111 718-294-8109 6 12
3 M520 718-935-3477 NaN 9 10
4 Q695 718-969-3155 718-969-3552 6 12
expgrade_span_min expgrade_span_max ... \
0 NaN NaN ...
1 NaN NaN ...
2 NaN NaN ...
3 9 14.0 ...
4 NaN NaN ...
priority05 priority06 priority07 priority08 \
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
2 Then to New York City residents NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
priority09 priority10 Location 1 \
0 NaN NaN 883 Classon Avenue\nBrooklyn, NY 11225\n(40.67...
1 NaN NaN 1110 Boston Road\nBronx, NY 10456\n(40.8276026...
2 NaN NaN 1501 Jerome Avenue\nBronx, NY 10452\n(40.84241...
3 NaN NaN 411 Pearl Street\nNew York, NY 10038\n(40.7106...
4 NaN NaN 160-20 Goethals Avenue\nJamaica, NY 11432\n(40...
DBN lat lon
0 17K548 40.670299 -73.961648
1 09X543 40.827603 -73.904475
2 09X327 40.842414 -73.916162
3 02M280 40.710679 -74.000807
4 28Q680 40.718810 -73.806500
[5 rows x 61 columns]
In [13]:
class_size = data['class_size']
#drop rows where grade is different than '09-12' and program type diff than 'GEN ED'
program_type = class_size['PROGRAM TYPE'] != 'GEN ED'
grade = class_size['GRADE '] != '09-12'
program_type_grade = (program_type | grade)
class_size.drop(class_size[program_type_grade].index, inplace=True)
print(class_size.iloc[0:5])
CSD BOROUGH SCHOOL CODE SCHOOL NAME \
225 1 M M292 Henry Street School for International Studies
226 1 M M292 Henry Street School for International Studies
227 1 M M292 Henry Street School for International Studies
228 1 M M292 Henry Street School for International Studies
229 1 M M292 Henry Street School for International Studies
GRADE PROGRAM TYPE CORE SUBJECT (MS CORE and 9-12 ONLY) \
225 09-12 GEN ED ENGLISH
226 09-12 GEN ED ENGLISH
227 09-12 GEN ED ENGLISH
228 09-12 GEN ED ENGLISH
229 09-12 GEN ED MATH
CORE COURSE (MS CORE and 9-12 ONLY) SERVICE CATEGORY(K-9* ONLY) \
225 English 9 -
226 English 10 -
227 English 11 -
228 English 12 -
229 Integrated Algebra -
NUMBER OF STUDENTS / SEATS FILLED NUMBER OF SECTIONS \
225 63.0 3.0
226 79.0 3.0
227 38.0 2.0
228 69.0 3.0
229 53.0 3.0
AVERAGE CLASS SIZE SIZE OF SMALLEST CLASS SIZE OF LARGEST CLASS \
225 21.0 19.0 25.0
226 26.3 24.0 31.0
227 19.0 16.0 22.0
228 23.0 13.0 30.0
229 17.7 16.0 21.0
DATA SOURCE SCHOOLWIDE PUPIL-TEACHER RATIO padded_csd DBN
225 STARS NaN 01 01M292
226 STARS NaN 01 01M292
227 STARS NaN 01 01M292
228 STARS NaN 01 01M292
229 STARS NaN 01 01M292
In [ ]:
Content source: matija94/show-me-the-code
Similar notebooks: