In [1]:
## (c) Veronica Helms
## Team Neighborhood
## Last updated: 7/30/2016
#######################################
#excel dependencies
import openpyxl
import pandas as pd
import os
import csv
import us #link state FIPS codes to state names using US package (pip install us)#
path=r'/Users/veronicahelms/Desktop/Educat'
xls_file = os.path.join(path,r'math.xls')
data_xls = pd.read_excel(xls_file, index_col=None)
data_xls.to_csv('math.csv')
In [2]:
df = pd.read_csv('math.csv')
In [ ]:
df.head(5)
In [3]:
math = df[['Jurisdiction', 'Average scale score']]
In [7]:
math.rename(columns={"Jurisdiction": "name"}, inplace=True) ## change title to MSA
In [8]:
math.head(5)
Out[8]:
name
Average scale score
0
National public
239.854464
1
Alabama
230.977423
2
Alaska
236.328417
3
Arizona
237.611755
4
Arkansas
235.215470
In [ ]:
In [9]:
## import dependencies
import urllib2
import os
import openpyxl
import pandas as pd
import csv
# Creates directory for data - CBSA - FIPS CROSSWALK
CBSA = os.path.join(os.getcwd(),"CBSA") #folder name - if this doesnt exist then make it
if not os.path.exists(CBSA):
os.makedirs(CBSA)
filename = CBSA + "/" + "FIPS_CBSA.csv"
url= "http://www.nber.org/cbsa-csa-fips-county-crosswalk/cbsa2fipsxw.csv"
f = urllib2.urlopen(url)
data = f.read()
with open(filename, "wb") as code:
code.write(data)
In [10]:
df = pd.read_csv('FIPS_CBSA.csv', converters={'fipsstatecode': lambda x: str(x) ,
'fipscountycode': lambda x: str(x) } )
In [11]:
df.rename(columns={"statename": "name"}, inplace=True) ## change title to MSA
In [12]:
df [['fipsstatecode', "cbsacode", 'fipscountycode', 'name']] #view obs
Out[12]:
fipsstatecode
cbsacode
fipscountycode
name
0
NaN
NaN
1
01
33860
001
Alabama
2
01
19300
003
Alabama
3
01
13820
007
Alabama
4
01
13820
009
Alabama
5
01
11500
015
Alabama
6
01
46740
017
Alabama
7
01
13820
021
Alabama
8
01
21460
031
Alabama
9
01
22520
033
Alabama
10
01
45180
037
Alabama
11
01
18980
043
Alabama
12
01
37120
045
Alabama
13
01
42820
047
Alabama
14
01
33860
051
Alabama
15
01
23460
055
Alabama
16
01
20020
061
Alabama
17
01
46220
065
Alabama
18
01
20020
067
Alabama
19
01
20020
069
Alabama
20
01
42460
071
Alabama
21
01
13820
073
Alabama
22
01
22520
077
Alabama
23
01
19460
079
Alabama
24
01
12220
081
Alabama
25
01
26620
083
Alabama
26
01
33860
085
Alabama
27
01
26620
089
Alabama
28
01
10700
095
Alabama
29
01
33660
097
Alabama
...
...
...
...
...
1853
72
41980
091
Puerto Rico
1854
72
41980
095
Puerto Rico
1855
72
32420
097
Puerto Rico
1856
72
10380
099
Puerto Rico
1857
72
41980
101
Puerto Rico
1858
72
41980
103
Puerto Rico
1859
72
41980
105
Puerto Rico
1860
72
41980
107
Puerto Rico
1861
72
25020
109
Puerto Rico
1862
72
38660
111
Puerto Rico
1863
72
38660
113
Puerto Rico
1864
72
11640
115
Puerto Rico
1865
72
10380
117
Puerto Rico
1866
72
41980
119
Puerto Rico
1867
72
41900
121
Puerto Rico
1868
72
17640
123
Puerto Rico
1869
72
41900
125
Puerto Rico
1870
72
41980
127
Puerto Rico
1871
72
41980
129
Puerto Rico
1872
72
10380
131
Puerto Rico
1873
72
42180
133
Puerto Rico
1874
72
41980
135
Puerto Rico
1875
72
41980
137
Puerto Rico
1876
72
41980
139
Puerto Rico
1877
72
10380
141
Puerto Rico
1878
72
41980
143
Puerto Rico
1879
72
41980
145
Puerto Rico
1880
72
38660
149
Puerto Rico
1881
72
41980
151
Puerto Rico
1882
72
38660
153
Puerto Rico
1883 rows × 4 columns
In [13]:
CBSA = df[['fipsstatecode', "cbsacode", 'fipscountycode', 'name']]
In [14]:
CBSA.head(5)
Out[14]:
fipsstatecode
cbsacode
fipscountycode
name
0
NaN
NaN
1
01
33860
001
Alabama
2
01
19300
003
Alabama
3
01
13820
007
Alabama
4
01
13820
009
Alabama
In [15]:
FINAL_ED = pd.merge(math, CBSA, on='name')
In [16]:
FINAL_ED.head(5)
Out[16]:
name
Average scale score
fipsstatecode
cbsacode
fipscountycode
0
Alabama
230.977423
01
33860
001
1
Alabama
230.977423
01
19300
003
2
Alabama
230.977423
01
13820
007
3
Alabama
230.977423
01
13820
009
4
Alabama
230.977423
01
11500
015
In [17]:
FINAL_ED.drop_duplicates(['cbsacode'],)
Out[17]:
name
Average scale score
fipsstatecode
cbsacode
fipscountycode
0
Alabama
230.977423
01
33860
001
1
Alabama
230.977423
01
19300
003
2
Alabama
230.977423
01
13820
007
4
Alabama
230.977423
01
11500
015
5
Alabama
230.977423
01
46740
017
7
Alabama
230.977423
01
21460
031
8
Alabama
230.977423
01
22520
033
9
Alabama
230.977423
01
45180
037
10
Alabama
230.977423
01
18980
043
11
Alabama
230.977423
01
37120
045
12
Alabama
230.977423
01
42820
047
14
Alabama
230.977423
01
23460
055
15
Alabama
230.977423
01
20020
061
16
Alabama
230.977423
01
46220
065
19
Alabama
230.977423
01
42460
071
22
Alabama
230.977423
01
19460
079
23
Alabama
230.977423
01
12220
081
24
Alabama
230.977423
01
26620
083
27
Alabama
230.977423
01
10700
095
28
Alabama
230.977423
01
33660
097
32
Alabama
230.977423
01
45980
109
33
Alabama
230.977423
01
17980
113
39
Alaska
236.328417
02
11260
020
40
Alaska
236.328417
02
21820
090
41
Alaska
236.328417
02
27940
110
42
Alaska
236.328417
02
28540
130
44
Arizona
237.611755
04
43420
003
45
Arizona
237.611755
04
22380
005
46
Arizona
237.611755
04
37740
007
47
Arizona
237.611755
04
40940
009
...
...
...
...
...
...
1760
Wisconsin
243.310211
55
11540
015
1761
Wisconsin
243.310211
55
20740
017
1762
Wisconsin
243.310211
55
31540
021
1764
Wisconsin
243.310211
55
13180
027
1766
Wisconsin
243.310211
55
32860
033
1769
Wisconsin
243.310211
55
22540
039
1770
Wisconsin
243.310211
55
38420
043
1773
Wisconsin
243.310211
55
48020
055
1777
Wisconsin
243.310211
55
32980
069
1778
Wisconsin
243.310211
55
31820
071
1779
Wisconsin
243.310211
55
48140
073
1781
Wisconsin
243.310211
55
43020
078
1782
Wisconsin
243.310211
55
33340
079
1787
Wisconsin
243.310211
55
44620
097
1788
Wisconsin
243.310211
55
39540
101
1789
Wisconsin
243.310211
55
27500
105
1791
Wisconsin
243.310211
55
12660
111
1793
Wisconsin
243.310211
55
43100
117
1794
Wisconsin
243.310211
55
48580
127
1797
Wisconsin
243.310211
55
36780
139
1798
Wisconsin
243.310211
55
49220
141
1799
Wyoming
246.759764
56
29660
001
1800
Wyoming
246.759764
56
23940
005
1801
Wyoming
246.759764
56
40180
013
1802
Wyoming
246.759764
56
16940
021
1803
Wyoming
246.759764
56
16220
025
1804
Wyoming
246.759764
56
43260
033
1805
Wyoming
246.759764
56
40540
037
1807
Wyoming
246.759764
56
21740
041
1808
NaN
NaN
NaN
918 rows × 5 columns
EDUCATION = FINAL_ED[['Average scale score','cbsacode']]
In [18]:
EDUCAT = FINAL_ED[['Average scale score','cbsacode']]
In [19]:
EDUCAT.head(5)
Out[19]:
Average scale score
cbsacode
0
230.977423
33860
1
230.977423
19300
2
230.977423
13820
3
230.977423
13820
4
230.977423
11500
In [20]:
#REMOVE DUPLICATES FROM CBSA FILE
EDUCAT2 = EDUCAT.drop_duplicates(['cbsacode'],)
In [21]:
#REMOVE MISSING VALUES
EDUCAT3 = EDUCAT2[pd.notnull(EDUCAT2['Average scale score'])]
In [22]:
EDUCAT3
Out[22]:
Average scale score
cbsacode
0
230.977423
33860
1
230.977423
19300
2
230.977423
13820
4
230.977423
11500
5
230.977423
46740
7
230.977423
21460
8
230.977423
22520
9
230.977423
45180
10
230.977423
18980
11
230.977423
37120
12
230.977423
42820
14
230.977423
23460
15
230.977423
20020
16
230.977423
46220
19
230.977423
42460
22
230.977423
19460
23
230.977423
12220
24
230.977423
26620
27
230.977423
10700
28
230.977423
33660
32
230.977423
45980
33
230.977423
17980
39
236.328417
11260
40
236.328417
21820
41
236.328417
27940
42
236.328417
28540
44
237.611755
43420
45
237.611755
22380
46
237.611755
37740
47
237.611755
40940
...
...
...
1759
243.310211
24580
1760
243.310211
11540
1761
243.310211
20740
1762
243.310211
31540
1764
243.310211
13180
1766
243.310211
32860
1769
243.310211
22540
1770
243.310211
38420
1773
243.310211
48020
1777
243.310211
32980
1778
243.310211
31820
1779
243.310211
48140
1781
243.310211
43020
1782
243.310211
33340
1787
243.310211
44620
1788
243.310211
39540
1789
243.310211
27500
1791
243.310211
12660
1793
243.310211
43100
1794
243.310211
48580
1797
243.310211
36780
1798
243.310211
49220
1799
246.759764
29660
1800
246.759764
23940
1801
246.759764
40180
1802
246.759764
16940
1803
246.759764
16220
1804
246.759764
43260
1805
246.759764
40540
1807
246.759764
21740
917 rows × 2 columns
In [23]:
education = EDUCAT3
In [24]:
education
Out[24]:
Average scale score
cbsacode
0
230.977423
33860
1
230.977423
19300
2
230.977423
13820
4
230.977423
11500
5
230.977423
46740
7
230.977423
21460
8
230.977423
22520
9
230.977423
45180
10
230.977423
18980
11
230.977423
37120
12
230.977423
42820
14
230.977423
23460
15
230.977423
20020
16
230.977423
46220
19
230.977423
42460
22
230.977423
19460
23
230.977423
12220
24
230.977423
26620
27
230.977423
10700
28
230.977423
33660
32
230.977423
45980
33
230.977423
17980
39
236.328417
11260
40
236.328417
21820
41
236.328417
27940
42
236.328417
28540
44
237.611755
43420
45
237.611755
22380
46
237.611755
37740
47
237.611755
40940
...
...
...
1759
243.310211
24580
1760
243.310211
11540
1761
243.310211
20740
1762
243.310211
31540
1764
243.310211
13180
1766
243.310211
32860
1769
243.310211
22540
1770
243.310211
38420
1773
243.310211
48020
1777
243.310211
32980
1778
243.310211
31820
1779
243.310211
48140
1781
243.310211
43020
1782
243.310211
33340
1787
243.310211
44620
1788
243.310211
39540
1789
243.310211
27500
1791
243.310211
12660
1793
243.310211
43100
1794
243.310211
48580
1797
243.310211
36780
1798
243.310211
49220
1799
246.759764
29660
1800
246.759764
23940
1801
246.759764
40180
1802
246.759764
16940
1803
246.759764
16220
1804
246.759764
43260
1805
246.759764
40540
1807
246.759764
21740
917 rows × 2 columns
In [25]:
#THE NAME OF THE FINAL DATASET IS education- 917 obs#
In [26]:
#MERGE TO ONLY INCLUDE TOP 100 CBSA'S BY POPULATION (Another Program)#
In [27]:
MSA = pd.read_csv('MSA_300.csv')
MSA.rename(columns={"CBSA": "cbsacode"}, inplace=True) ## change title of CBSA code var ##
MSA.head(5)
Out[27]:
Unnamed: 0
Population
cbsacode
MSA Name
0
0
20092883
35620
New York-Newark-Jersey City NY-NJ-PA Metro Area
1
1
13262220
31080
Los Angeles-Long Beach-Anaheim CA Metro Area
2
2
9553810
16980
Chicago-Naperville-Elgin IL-IN-WI Metro Area
3
3
6954003
19100
Dallas-Fort Worth-Arlington TX Metro Area
4
4
6490180
26420
Houston-The Woodlands-Sugar Land TX Metro Area
In [30]:
EDUCATION_MSA = pd.merge(education, MSA, on='cbsacode')
In [31]:
EDUCATION_MSA
Out[31]:
Average scale score
cbsacode
Unnamed: 0
Population
MSA Name
0
230.977423
33860
141
374430
Montgomery AL Metro Area
1
230.977423
19300
221
200111
Daphne-Fairhope-Foley AL Metro Area
2
230.977423
13820
49
1143772
Birmingham-Hoover AL Metro Area
3
230.977423
22520
288
147639
Florence-Muscle Shoals AL Metro Area
4
230.977423
20020
286
148095
Dothan AL Metro Area
5
230.977423
46220
193
235954
Tuscaloosa AL Metro Area
6
230.977423
19460
272
153084
Decatur AL Metro Area
7
230.977423
12220
269
154255
Auburn-Opelika AL Metro Area
8
230.977423
26620
118
441086
Huntsville AL Metro Area
9
230.977423
33660
127
415123
Mobile AL Metro Area
10
230.977423
17980
162
312731
Columbus GA-AL Metro Area
11
236.328417
11260
134
398892
Anchorage AK Metro Area
12
237.611755
38060
11
4489109
Phoenix-Mesa-Scottsdale AZ Metro Area
13
237.611755
29420
216
203361
Lake Havasu City-Kingman AZ Metro Area
14
237.611755
46060
53
1004516
Tucson AZ Metro Area
15
237.611755
39140
201
218844
Prescott AZ Metro Area
16
237.611755
49740
217
203247
Yuma AZ Metro Area
17
235.215470
22220
105
503046
Fayetteville-Springdale-Rogers AR-MO Metro Area
18
235.215470
22900
168
279592
Fort Smith AR-OK Metro Area
19
235.215470
32820
41
1344121
Memphis TN-MS-AR Metro Area
20
235.215470
30780
75
727777
Little Rock-North Little Rock-Conway AR Metro ...
21
235.215470
45500
287
147740
Texarkana TX-AR Metro Area
22
231.549345
41860
10
4594060
San Francisco-Oakland-Hayward CA Metro Area
23
231.549345
17020
199
224241
Chico CA Metro Area
24
231.549345
40900
27
2244397
Sacramento--Roseville--Arden-Arcade CA Metro Area
25
231.549345
23420
56
965974
Fresno CA Metro Area
26
231.549345
20940
237
179091
El Centro CA Metro Area
27
231.549345
12540
62
874589
Bakersfield CA Metro Area
28
231.549345
25260
279
150269
Hanford-Corcoran CA Metro Area
29
231.549345
31080
1
13262220
Los Angeles-Long Beach-Anaheim CA Metro Area
...
...
...
...
...
...
266
244.002472
32580
68
831073
McAllen-Edinburg-Mission TX Metro Area
267
244.002472
33260
258
163470
Midland TX Metro Area
268
244.002472
46340
202
218842
Tyler TX Metro Area
269
244.002472
29700
178
266673
Laredo TX Metro Area
270
242.562787
36260
87
631146
Ogden-Clearfield UT Metro Area
271
242.562787
39340
94
572798
Provo-Orem UT Metro Area
272
242.562787
41620
48
1153340
Salt Lake City UT Metro Area
273
242.562787
41100
275
151948
St. George UT Metro Area
274
243.250399
15540
205
215824
Burlington-South Burlington VT Metro Area
275
246.612622
16820
198
227738
Charlottesville VA Metro Area
276
246.612622
40060
44
1260668
Richmond VA Metro Area
277
246.612622
31340
186
257570
Lynchburg VA Metro Area
278
246.612622
40220
161
312837
Roanoke VA Metro Area
279
246.612622
13980
234
181249
Blacksburg-Christiansburg-Radford VA Metro Area
280
244.981422
28420
173
274295
Kennewick-Richland WA Metro Area
281
244.981422
42660
14
3671478
Seattle-Tacoma-Bellevue WA Metro Area
282
244.981422
14740
187
254183
Bremerton-Silverdale WA Metro Area
283
244.981422
44060
101
542073
Spokane-Spokane Valley WA Metro Area
284
244.981422
36500
180
265851
Olympia-Tumwater WA Metro Area
285
244.981422
13380
213
208351
Bellingham WA Metro Area
286
244.981422
49420
189
247687
Yakima WA Metro Area
287
235.204661
16620
200
223371
Charleston WV Metro Area
288
243.310211
24580
159
314531
Green Bay WI Metro Area
289
243.310211
11540
196
231497
Appleton WI Metro Area
290
243.310211
20740
255
165024
Eau Claire WI Metro Area
291
243.310211
31540
86
633787
Madison WI Metro Area
292
243.310211
33340
39
1572245
Milwaukee-Waukesha-West Allis WI Metro Area
293
243.310211
39540
223
195163
Racine WI Metro Area
294
243.310211
27500
261
161188
Janesville-Beloit WI Metro Area
295
243.310211
36780
248
169511
Oshkosh-Neenah WI Metro Area
296 rows × 5 columns
In [32]:
EDUCATION_MSA.to_csv("EDUCATION_MSA_MATH.csv")
In [ ]:
Content source: georgetown-analytics/neighborhood-change
Similar notebooks: