In [1]:
## (c) Veronica Helms
## Team Neighborhood
## Last updated: 7/30/2016

#######################################

#excel dependencies
import openpyxl
import pandas as pd
import os 
import csv
import us #link state FIPS codes to state names using US package (pip install us)#

path=r'/Users/veronicahelms/Desktop/Educat'

xls_file = os.path.join(path,r'math.xls')
data_xls = pd.read_excel(xls_file, index_col=None)
data_xls.to_csv('math.csv')

In [2]:
df = pd.read_csv('math.csv')

In [ ]:
df.head(5)

In [3]:
math = df[['Jurisdiction', 'Average scale score']]

In [7]:
math.rename(columns={"Jurisdiction": "name"}, inplace=True)  ## change title to MSA

In [8]:
math.head(5)


Out[8]:
name Average scale score
0 National public 239.854464
1 Alabama 230.977423
2 Alaska 236.328417
3 Arizona 237.611755
4 Arkansas 235.215470

In [ ]:


In [9]:
## import dependencies
import urllib2
import os
import openpyxl
import pandas as pd
import csv

# Creates directory for data - CBSA - FIPS CROSSWALK
CBSA = os.path.join(os.getcwd(),"CBSA") #folder name - if this doesnt exist then make it
if not os.path.exists(CBSA):
    os.makedirs(CBSA)

filename = CBSA + "/" + "FIPS_CBSA.csv"
url= "http://www.nber.org/cbsa-csa-fips-county-crosswalk/cbsa2fipsxw.csv"

f = urllib2.urlopen(url)
data = f.read()
with open(filename, "wb") as code:
    code.write(data)

In [10]:
df = pd.read_csv('FIPS_CBSA.csv', converters={'fipsstatecode': lambda x: str(x) ,
                                              'fipscountycode': lambda x: str(x) } )

In [11]:
df.rename(columns={"statename": "name"}, inplace=True)  ## change title to MSA

In [12]:
df [['fipsstatecode', "cbsacode", 'fipscountycode', 'name']] #view obs


Out[12]:
fipsstatecode cbsacode fipscountycode name
0 NaN NaN
1 01 33860 001 Alabama
2 01 19300 003 Alabama
3 01 13820 007 Alabama
4 01 13820 009 Alabama
5 01 11500 015 Alabama
6 01 46740 017 Alabama
7 01 13820 021 Alabama
8 01 21460 031 Alabama
9 01 22520 033 Alabama
10 01 45180 037 Alabama
11 01 18980 043 Alabama
12 01 37120 045 Alabama
13 01 42820 047 Alabama
14 01 33860 051 Alabama
15 01 23460 055 Alabama
16 01 20020 061 Alabama
17 01 46220 065 Alabama
18 01 20020 067 Alabama
19 01 20020 069 Alabama
20 01 42460 071 Alabama
21 01 13820 073 Alabama
22 01 22520 077 Alabama
23 01 19460 079 Alabama
24 01 12220 081 Alabama
25 01 26620 083 Alabama
26 01 33860 085 Alabama
27 01 26620 089 Alabama
28 01 10700 095 Alabama
29 01 33660 097 Alabama
... ... ... ... ...
1853 72 41980 091 Puerto Rico
1854 72 41980 095 Puerto Rico
1855 72 32420 097 Puerto Rico
1856 72 10380 099 Puerto Rico
1857 72 41980 101 Puerto Rico
1858 72 41980 103 Puerto Rico
1859 72 41980 105 Puerto Rico
1860 72 41980 107 Puerto Rico
1861 72 25020 109 Puerto Rico
1862 72 38660 111 Puerto Rico
1863 72 38660 113 Puerto Rico
1864 72 11640 115 Puerto Rico
1865 72 10380 117 Puerto Rico
1866 72 41980 119 Puerto Rico
1867 72 41900 121 Puerto Rico
1868 72 17640 123 Puerto Rico
1869 72 41900 125 Puerto Rico
1870 72 41980 127 Puerto Rico
1871 72 41980 129 Puerto Rico
1872 72 10380 131 Puerto Rico
1873 72 42180 133 Puerto Rico
1874 72 41980 135 Puerto Rico
1875 72 41980 137 Puerto Rico
1876 72 41980 139 Puerto Rico
1877 72 10380 141 Puerto Rico
1878 72 41980 143 Puerto Rico
1879 72 41980 145 Puerto Rico
1880 72 38660 149 Puerto Rico
1881 72 41980 151 Puerto Rico
1882 72 38660 153 Puerto Rico

1883 rows × 4 columns


In [13]:
CBSA = df[['fipsstatecode', "cbsacode", 'fipscountycode', 'name']]

In [14]:
CBSA.head(5)


Out[14]:
fipsstatecode cbsacode fipscountycode name
0 NaN NaN
1 01 33860 001 Alabama
2 01 19300 003 Alabama
3 01 13820 007 Alabama
4 01 13820 009 Alabama

In [15]:
FINAL_ED = pd.merge(math, CBSA, on='name')

In [16]:
FINAL_ED.head(5)


Out[16]:
name Average scale score fipsstatecode cbsacode fipscountycode
0 Alabama 230.977423 01 33860 001
1 Alabama 230.977423 01 19300 003
2 Alabama 230.977423 01 13820 007
3 Alabama 230.977423 01 13820 009
4 Alabama 230.977423 01 11500 015

In [17]:
FINAL_ED.drop_duplicates(['cbsacode'],)


Out[17]:
name Average scale score fipsstatecode cbsacode fipscountycode
0 Alabama 230.977423 01 33860 001
1 Alabama 230.977423 01 19300 003
2 Alabama 230.977423 01 13820 007
4 Alabama 230.977423 01 11500 015
5 Alabama 230.977423 01 46740 017
7 Alabama 230.977423 01 21460 031
8 Alabama 230.977423 01 22520 033
9 Alabama 230.977423 01 45180 037
10 Alabama 230.977423 01 18980 043
11 Alabama 230.977423 01 37120 045
12 Alabama 230.977423 01 42820 047
14 Alabama 230.977423 01 23460 055
15 Alabama 230.977423 01 20020 061
16 Alabama 230.977423 01 46220 065
19 Alabama 230.977423 01 42460 071
22 Alabama 230.977423 01 19460 079
23 Alabama 230.977423 01 12220 081
24 Alabama 230.977423 01 26620 083
27 Alabama 230.977423 01 10700 095
28 Alabama 230.977423 01 33660 097
32 Alabama 230.977423 01 45980 109
33 Alabama 230.977423 01 17980 113
39 Alaska 236.328417 02 11260 020
40 Alaska 236.328417 02 21820 090
41 Alaska 236.328417 02 27940 110
42 Alaska 236.328417 02 28540 130
44 Arizona 237.611755 04 43420 003
45 Arizona 237.611755 04 22380 005
46 Arizona 237.611755 04 37740 007
47 Arizona 237.611755 04 40940 009
... ... ... ... ... ...
1760 Wisconsin 243.310211 55 11540 015
1761 Wisconsin 243.310211 55 20740 017
1762 Wisconsin 243.310211 55 31540 021
1764 Wisconsin 243.310211 55 13180 027
1766 Wisconsin 243.310211 55 32860 033
1769 Wisconsin 243.310211 55 22540 039
1770 Wisconsin 243.310211 55 38420 043
1773 Wisconsin 243.310211 55 48020 055
1777 Wisconsin 243.310211 55 32980 069
1778 Wisconsin 243.310211 55 31820 071
1779 Wisconsin 243.310211 55 48140 073
1781 Wisconsin 243.310211 55 43020 078
1782 Wisconsin 243.310211 55 33340 079
1787 Wisconsin 243.310211 55 44620 097
1788 Wisconsin 243.310211 55 39540 101
1789 Wisconsin 243.310211 55 27500 105
1791 Wisconsin 243.310211 55 12660 111
1793 Wisconsin 243.310211 55 43100 117
1794 Wisconsin 243.310211 55 48580 127
1797 Wisconsin 243.310211 55 36780 139
1798 Wisconsin 243.310211 55 49220 141
1799 Wyoming 246.759764 56 29660 001
1800 Wyoming 246.759764 56 23940 005
1801 Wyoming 246.759764 56 40180 013
1802 Wyoming 246.759764 56 16940 021
1803 Wyoming 246.759764 56 16220 025
1804 Wyoming 246.759764 56 43260 033
1805 Wyoming 246.759764 56 40540 037
1807 Wyoming 246.759764 56 21740 041
1808 NaN NaN NaN

918 rows × 5 columns

EDUCATION = FINAL_ED[['Average scale score','cbsacode']]


In [18]:
EDUCAT = FINAL_ED[['Average scale score','cbsacode']]

In [19]:
EDUCAT.head(5)


Out[19]:
Average scale score cbsacode
0 230.977423 33860
1 230.977423 19300
2 230.977423 13820
3 230.977423 13820
4 230.977423 11500

In [20]:
#REMOVE DUPLICATES FROM CBSA FILE
EDUCAT2 = EDUCAT.drop_duplicates(['cbsacode'],)

In [21]:
#REMOVE MISSING VALUES
EDUCAT3 = EDUCAT2[pd.notnull(EDUCAT2['Average scale score'])]

In [22]:
EDUCAT3


Out[22]:
Average scale score cbsacode
0 230.977423 33860
1 230.977423 19300
2 230.977423 13820
4 230.977423 11500
5 230.977423 46740
7 230.977423 21460
8 230.977423 22520
9 230.977423 45180
10 230.977423 18980
11 230.977423 37120
12 230.977423 42820
14 230.977423 23460
15 230.977423 20020
16 230.977423 46220
19 230.977423 42460
22 230.977423 19460
23 230.977423 12220
24 230.977423 26620
27 230.977423 10700
28 230.977423 33660
32 230.977423 45980
33 230.977423 17980
39 236.328417 11260
40 236.328417 21820
41 236.328417 27940
42 236.328417 28540
44 237.611755 43420
45 237.611755 22380
46 237.611755 37740
47 237.611755 40940
... ... ...
1759 243.310211 24580
1760 243.310211 11540
1761 243.310211 20740
1762 243.310211 31540
1764 243.310211 13180
1766 243.310211 32860
1769 243.310211 22540
1770 243.310211 38420
1773 243.310211 48020
1777 243.310211 32980
1778 243.310211 31820
1779 243.310211 48140
1781 243.310211 43020
1782 243.310211 33340
1787 243.310211 44620
1788 243.310211 39540
1789 243.310211 27500
1791 243.310211 12660
1793 243.310211 43100
1794 243.310211 48580
1797 243.310211 36780
1798 243.310211 49220
1799 246.759764 29660
1800 246.759764 23940
1801 246.759764 40180
1802 246.759764 16940
1803 246.759764 16220
1804 246.759764 43260
1805 246.759764 40540
1807 246.759764 21740

917 rows × 2 columns


In [23]:
education = EDUCAT3

In [24]:
education


Out[24]:
Average scale score cbsacode
0 230.977423 33860
1 230.977423 19300
2 230.977423 13820
4 230.977423 11500
5 230.977423 46740
7 230.977423 21460
8 230.977423 22520
9 230.977423 45180
10 230.977423 18980
11 230.977423 37120
12 230.977423 42820
14 230.977423 23460
15 230.977423 20020
16 230.977423 46220
19 230.977423 42460
22 230.977423 19460
23 230.977423 12220
24 230.977423 26620
27 230.977423 10700
28 230.977423 33660
32 230.977423 45980
33 230.977423 17980
39 236.328417 11260
40 236.328417 21820
41 236.328417 27940
42 236.328417 28540
44 237.611755 43420
45 237.611755 22380
46 237.611755 37740
47 237.611755 40940
... ... ...
1759 243.310211 24580
1760 243.310211 11540
1761 243.310211 20740
1762 243.310211 31540
1764 243.310211 13180
1766 243.310211 32860
1769 243.310211 22540
1770 243.310211 38420
1773 243.310211 48020
1777 243.310211 32980
1778 243.310211 31820
1779 243.310211 48140
1781 243.310211 43020
1782 243.310211 33340
1787 243.310211 44620
1788 243.310211 39540
1789 243.310211 27500
1791 243.310211 12660
1793 243.310211 43100
1794 243.310211 48580
1797 243.310211 36780
1798 243.310211 49220
1799 246.759764 29660
1800 246.759764 23940
1801 246.759764 40180
1802 246.759764 16940
1803 246.759764 16220
1804 246.759764 43260
1805 246.759764 40540
1807 246.759764 21740

917 rows × 2 columns


In [25]:
#THE NAME OF THE FINAL DATASET IS education- 917 obs#

In [26]:
#MERGE TO ONLY INCLUDE TOP 100 CBSA'S BY POPULATION (Another Program)#

In [27]:
MSA = pd.read_csv('MSA_300.csv')
MSA.rename(columns={"CBSA": "cbsacode"}, inplace=True)  ## change title of CBSA code var ##
MSA.head(5)


Out[27]:
Unnamed: 0 Population cbsacode MSA Name
0 0 20092883 35620 New York-Newark-Jersey City NY-NJ-PA Metro Area
1 1 13262220 31080 Los Angeles-Long Beach-Anaheim CA Metro Area
2 2 9553810 16980 Chicago-Naperville-Elgin IL-IN-WI Metro Area
3 3 6954003 19100 Dallas-Fort Worth-Arlington TX Metro Area
4 4 6490180 26420 Houston-The Woodlands-Sugar Land TX Metro Area

In [30]:
EDUCATION_MSA = pd.merge(education, MSA, on='cbsacode')

In [31]:
EDUCATION_MSA


Out[31]:
Average scale score cbsacode Unnamed: 0 Population MSA Name
0 230.977423 33860 141 374430 Montgomery AL Metro Area
1 230.977423 19300 221 200111 Daphne-Fairhope-Foley AL Metro Area
2 230.977423 13820 49 1143772 Birmingham-Hoover AL Metro Area
3 230.977423 22520 288 147639 Florence-Muscle Shoals AL Metro Area
4 230.977423 20020 286 148095 Dothan AL Metro Area
5 230.977423 46220 193 235954 Tuscaloosa AL Metro Area
6 230.977423 19460 272 153084 Decatur AL Metro Area
7 230.977423 12220 269 154255 Auburn-Opelika AL Metro Area
8 230.977423 26620 118 441086 Huntsville AL Metro Area
9 230.977423 33660 127 415123 Mobile AL Metro Area
10 230.977423 17980 162 312731 Columbus GA-AL Metro Area
11 236.328417 11260 134 398892 Anchorage AK Metro Area
12 237.611755 38060 11 4489109 Phoenix-Mesa-Scottsdale AZ Metro Area
13 237.611755 29420 216 203361 Lake Havasu City-Kingman AZ Metro Area
14 237.611755 46060 53 1004516 Tucson AZ Metro Area
15 237.611755 39140 201 218844 Prescott AZ Metro Area
16 237.611755 49740 217 203247 Yuma AZ Metro Area
17 235.215470 22220 105 503046 Fayetteville-Springdale-Rogers AR-MO Metro Area
18 235.215470 22900 168 279592 Fort Smith AR-OK Metro Area
19 235.215470 32820 41 1344121 Memphis TN-MS-AR Metro Area
20 235.215470 30780 75 727777 Little Rock-North Little Rock-Conway AR Metro ...
21 235.215470 45500 287 147740 Texarkana TX-AR Metro Area
22 231.549345 41860 10 4594060 San Francisco-Oakland-Hayward CA Metro Area
23 231.549345 17020 199 224241 Chico CA Metro Area
24 231.549345 40900 27 2244397 Sacramento--Roseville--Arden-Arcade CA Metro Area
25 231.549345 23420 56 965974 Fresno CA Metro Area
26 231.549345 20940 237 179091 El Centro CA Metro Area
27 231.549345 12540 62 874589 Bakersfield CA Metro Area
28 231.549345 25260 279 150269 Hanford-Corcoran CA Metro Area
29 231.549345 31080 1 13262220 Los Angeles-Long Beach-Anaheim CA Metro Area
... ... ... ... ... ...
266 244.002472 32580 68 831073 McAllen-Edinburg-Mission TX Metro Area
267 244.002472 33260 258 163470 Midland TX Metro Area
268 244.002472 46340 202 218842 Tyler TX Metro Area
269 244.002472 29700 178 266673 Laredo TX Metro Area
270 242.562787 36260 87 631146 Ogden-Clearfield UT Metro Area
271 242.562787 39340 94 572798 Provo-Orem UT Metro Area
272 242.562787 41620 48 1153340 Salt Lake City UT Metro Area
273 242.562787 41100 275 151948 St. George UT Metro Area
274 243.250399 15540 205 215824 Burlington-South Burlington VT Metro Area
275 246.612622 16820 198 227738 Charlottesville VA Metro Area
276 246.612622 40060 44 1260668 Richmond VA Metro Area
277 246.612622 31340 186 257570 Lynchburg VA Metro Area
278 246.612622 40220 161 312837 Roanoke VA Metro Area
279 246.612622 13980 234 181249 Blacksburg-Christiansburg-Radford VA Metro Area
280 244.981422 28420 173 274295 Kennewick-Richland WA Metro Area
281 244.981422 42660 14 3671478 Seattle-Tacoma-Bellevue WA Metro Area
282 244.981422 14740 187 254183 Bremerton-Silverdale WA Metro Area
283 244.981422 44060 101 542073 Spokane-Spokane Valley WA Metro Area
284 244.981422 36500 180 265851 Olympia-Tumwater WA Metro Area
285 244.981422 13380 213 208351 Bellingham WA Metro Area
286 244.981422 49420 189 247687 Yakima WA Metro Area
287 235.204661 16620 200 223371 Charleston WV Metro Area
288 243.310211 24580 159 314531 Green Bay WI Metro Area
289 243.310211 11540 196 231497 Appleton WI Metro Area
290 243.310211 20740 255 165024 Eau Claire WI Metro Area
291 243.310211 31540 86 633787 Madison WI Metro Area
292 243.310211 33340 39 1572245 Milwaukee-Waukesha-West Allis WI Metro Area
293 243.310211 39540 223 195163 Racine WI Metro Area
294 243.310211 27500 261 161188 Janesville-Beloit WI Metro Area
295 243.310211 36780 248 169511 Oshkosh-Neenah WI Metro Area

296 rows × 5 columns


In [32]:
EDUCATION_MSA.to_csv("EDUCATION_MSA_MATH.csv")

In [ ]: