In [151]:
import geopandas as gpd
import pandas as pd
import pickle
import numpy as np

In [228]:
# load city lights dataframe
with open('data/geo/pickles/zonal_stats_m.pickle', 'rb') as f:
    df_cities_geo = pickle.load(f)
df_cities_geo.rename(columns={'adm0_a3':'ISO_A3'}, inplace=True)
df_cities = pd.DataFrame(df_cities_geo)

# load country lights dataframe
with open('data/geo/pickles/zonal_stats_c.pickle', 'rb') as f:
    df_countries_geo = pickle.load(f)
df_countries_geo.reset_index(inplace=True)
df_countries = pd.DataFrame(df_countries_geo)

# merge dataframes
df_merge = pd.merge(df_cities, df_countries, how='left', on='ISO_A3', suffixes=('m', 'c')); df_merge.head(3)


Out[228]:
ISO_A3 cartodb_id geometrym nameascii pop_max F101992m F101993m F101994m F121994m F121995m ... F162004c F162005c F162006c F162007c F162008c F162009c F182010c F182011c F182012c F182013c
0 AUS 1198 POLYGON ((138.513149 -34.991254, 138.504815 -3... Adelaide 1145000 60314.0 64088.0 62126.0 62264.0 64969.0 ... 1929061.0 1653849.0 1855688.0 2016436.0 1991068.0 2001535.0 2657549.0 2304520.0 2591313.0 2490423.0
1 USA 37873 POLYGON ((-111.785275 33.163921, -111.785275 3... Mesa 1085394 267889.0 256514.0 258932.0 271544.0 284599.0 ... 61624213.0 54883977.0 58134101.0 67727412.0 66596415.0 63256383.0 83859358.0 71072373.0 69517416.0 71832002.0
2 ARE 131 POLYGON ((55.100937 24.147646, 55.100937 24.13... Sharjah 1103027 346648.0 351177.0 339926.0 374813.0 382730.0 ... 782052.0 708632.0 798291.0 881028.0 877970.0 857934.0 1109724.0 1076830.0 1146627.0 1165915.0

3 rows × 81 columns
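
Before computing fractions, it is worth confirming that the left merge matched every city to a country row; a quick sanity-check sketch (an unmatched ISO_A3 would leave NaNs in the suffixed country columns):

In [ ]:
# sanity check: a left merge leaves NaN in the country columns for any
# city whose ISO_A3 found no match in the country dataframe
assert df_merge['F101992c'].notna().all(), 'some cities lack a country match'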


In [260]:
# compute fraction of light for each city within its country
df_cities_years = df_merge.iloc[:, 5:39]    # city-level annual lights (F101992m..F182013m)
np_cities_years = df_cities_years.to_numpy()
df_countries_years = df_merge.iloc[:, 47:]  # country-level annual lights (F101992c..F182013c)
np_countries_years = df_countries_years.to_numpy()
np_cities_light_fractions = np_cities_years / np_countries_years
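
Since each city's footprint lies inside its country, the fractions should land in [0, 1], and the division only makes sense if the city and country year columns line up positionally. A small sketch of both invariants:

In [ ]:
# sketch: check column alignment (strip the 'm'/'c' suffixes) and value range
assert [c[:7] for c in df_cities_years.columns] == \
       [c[:7] for c in df_countries_years.columns], 'year columns misaligned'
assert np.nanmax(np_cities_light_fractions) <= 1.0, 'a city outshines its country'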

In [275]:
# join with city name and country code
df_cities_years_cols = [x[:7] for x in df_cities_years.columns.values]  # strip 'm' suffix: 'F101992m' -> 'F101992'
df_cities_truncated = df_cities.iloc[:, :5]
df_cities_light_fractions = pd.DataFrame(np_cities_light_fractions, columns=df_cities_years_cols)
df_cities_light_fractions = df_cities_truncated.join(df_cities_light_fractions)
df_cities_light_fractions.drop(['cartodb_id', 'geometry', 'pop_max'], inplace=True, axis=1)
df_cities_light_fractions.rename(columns={'nameascii': 'city'}, inplace=True)
df_cities_light_fractions.head(3)


Out[275]:
ISO_A3 city F101992 F101993 F101994 F121994 F121995 F121996 F121997 F121998 ... F162004 F162005 F162006 F162007 F162008 F162009 F182010 F182011 F182012 F182013
0 AUS Adelaide 0.040712 0.038999 0.038227 0.037661 0.033461 0.034791 0.033779 0.033385 ... 0.032392 0.035174 0.033167 0.031126 0.032087 0.032131 0.025771 0.029548 0.026893 0.028003
1 USA Mesa 0.004626 0.004192 0.004260 0.004590 0.004301 0.004289 0.004533 0.004601 ... 0.005367 0.005948 0.005952 0.005300 0.005392 0.005644 0.004456 0.005139 0.005326 0.005178
2 ARE Sharjah 0.745420 0.732680 0.713109 0.745305 0.710744 0.692191 0.687522 0.669494 ... 0.690925 0.718230 0.707320 0.695539 0.704375 0.722402 0.646997 0.649524 0.649606 0.651584

3 rows × 36 columns


In [276]:
# pickle dataframe
df_cities_light_fractions.to_pickle('data/geo/pickles/cities_light_frac.pickle')
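
A cheap round-trip check that the pickle reloads intact (a sketch, not part of the original pipeline):

In [ ]:
# round-trip check: reload the pickle and compare with the in-memory frame
df_check = pd.read_pickle('data/geo/pickles/cities_light_frac.pickle')
pd.testing.assert_frame_equal(df_check, df_cities_light_fractions)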

In [ ]:
# load the 1992 country lights dataframe and rename its light column;
# df_input is assumed to be a city-level dataframe (with adm0_a3 codes)
# loaded in an earlier session
#with open('data/geo/pickles/zonal_stats_c92.pickle', 'rb') as f:
#    df_1992 = pickle.load(f)
#df_1992.rename(columns={'F101992': 'c_F101992'}, inplace=True)

# drop duplicate cities and rename columns
#df_input = pd.DataFrame(df_input)
#df_input = df_input.drop_duplicates(subset='nameascii')
#df_input.rename(columns={'adm0_a3': 'ISO_A3', 'nameascii': 'city'}, inplace=True)

# join the datasets on the ISO_A3 column
#df_join = pd.merge(df_input, df_1992, on='ISO_A3')

#df_join

In [ ]:
# normalize each city's light series by its country's total 1992 light
df_normalizer = df_join['c_F101992'].to_numpy().reshape(-1, 1)
df_drop_cols = df_join.drop(['ISO_A3', 'cartodb_id', 'geometry', 'pop_max', 'city', 'WB_A3', 'c_F101992'], axis=1)
light_years = df_drop_cols.columns.values
df_np = df_drop_cols.to_numpy()
df_np_normed = np.divide(df_np, df_normalizer)
df_tojoin = df_join[['WB_A3', 'city', 'pop_max']]
df_normed = pd.DataFrame(df_np_normed, columns=light_years)
df_rejoined = df_tojoin.join(df_normed, how='inner')
df_rejoined.head(3)
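
The division above broadcasts the (n, 1) normalizer across all year columns, so each entry becomes a city's light in year t relative to its country's total 1992 light. A toy example of the broadcasting:

In [ ]:
# toy example of the row-wise normalization used above
lights = np.array([[10., 20.], [30., 60.]])  # two cities x two years
norm = np.array([[100.], [300.]])            # each city's country 1992 total
print(lights / norm)                         # [[0.1 0.2], [0.1 0.2]]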

In [176]:
# load, clean, and normalize World Bank GDP data
wb = pd.read_csv('data/econ/wb.csv')

# alternative series: 'GDP at market prices (constant 2005 US$)'
label = 'GDP, PPP (constant 2011 international $)'
wb = wb[wb['Series Name'] == label].copy()  # copy to avoid chained-assignment warnings
wb.drop(['Country Name', 'Series Name', 'Series Code', '2014', '2015'], axis=1, inplace=True)
wb.rename(columns={'Country Code': 'WB_A3'}, inplace=True)
wb.dropna(axis=0, inplace=True)
wb = wb.set_index('WB_A3')
wb_normalizer = wb['1992'].to_numpy()
wb_normed = wb.divide(wb_normalizer, axis=0)  # GDP relative to each country's 1992 level
wb_normed = wb_normed.reset_index()
wb_1992 = wb.reset_index()[['WB_A3', '1992']]
wb_1992 = wb_1992.rename(columns={'1992': 'wb_1992'}); wb_1992.head(3)


Out[176]:
WB_A3 wb_1992
0 ALB 9.241349e+09
1 DZA 2.681550e+11
2 ATG 1.092734e+09
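
Since every series was divided by its own 1992 value, the '1992' column of wb_normed should be identically 1; a quick check sketch:

In [ ]:
# sketch: after normalization the 1992 column should be all ones
assert np.allclose(wb_normed['1992'].to_numpy(), 1.0)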

In [277]:
country_stats = pd.read_csv('data/country_stats.csv')
country_stats.rename(columns={'country': 'WB_A3'}, inplace=True)
beta_intercept = country_stats[['WB_A3','beta','intercept']]
beta_intercept.head(100)


Out[277]:
WB_A3 beta intercept
0 CHN 2.106011 -1.055173
1 ETH 0.762989 0.156017
2 OMN 0.260797 0.905775
3 BTN 0.485263 0.695238
4 VNM 0.359649 0.769938
5 EGY 0.875287 0.086420
6 MLI 0.308667 0.715733
7 MAR 0.516291 0.583528
8 DJI 0.311363 0.573508
9 BFA 0.585040 0.450565
10 BWA 0.499806 0.497423
11 SAU 0.734662 0.327779
12 COG 0.400989 0.703327
13 MYS 0.498236 0.463380
14 BOL 0.489625 0.607063
15 SLE 0.392889 0.621196
16 SEN 0.433899 0.554424
17 ALB 0.178532 0.697791
18 ARE 1.010802 0.114130
19 MOZ 0.724434 0.933208
20 HND 0.269812 0.667162
21 LAO 0.257198 1.128858
22 PER 0.686604 0.288633
23 JOR 0.936640 0.190353
24 IRN 0.669335 0.291526
25 CHL 0.542952 0.499104
26 GTM 0.317933 0.712941
27 ECU 0.370728 0.591036
28 CPV 0.895292 0.256561
29 LSO 0.423998 0.669042
... ... ... ...
70 GUY 0.368851 0.773972
71 NIC 0.397468 0.674422
72 BDI 0.171867 0.640075
73 NAM 0.741438 0.322613
74 SLV 0.385471 0.722142
75 KOR 1.404341 -0.388770
76 GRD 0.203077 0.905171
77 LCA 0.291475 0.791884
78 IRL 1.108894 0.394137
79 MWI 0.645996 0.573877
80 POL 0.394865 0.892755
81 CRI 0.505775 0.550680
82 ZAF 0.807949 0.304618
83 KIR 0.037727 1.134288
84 TZA 0.716174 0.312445
85 ESP 0.613172 0.490604
86 GNQ 0.345546 16.965988
87 CMR 0.522463 0.637246
88 ZWE 0.596383 0.343050
89 NGA 1.484300 0.086446
90 PHL 0.504959 0.484162
91 SLB 0.109842 0.921580
92 MEX 0.485721 0.613901
93 SYC 0.688008 0.572607
94 IRQ 1.922867 0.855745
95 BRB 0.574109 0.566662
96 CUB 0.257764 0.626746
97 CHE 0.332418 0.737247
98 VEN 0.549649 0.432586
99 UGA 0.721246 0.568564

100 rows × 3 columns
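
country_stats.csv arrives precomputed; beta and intercept presumably parameterize a per-country fit of normalized GDP against normalized lights estimated upstream. A hypothetical sketch of how such coefficients could be produced with a simple linear fit (toy values; the actual estimation procedure may differ):

In [ ]:
# hypothetical: estimate beta/intercept for one country via a linear fit
x = np.array([1.00, 1.05, 1.12, 1.20])  # normalized lights (toy values)
y = np.array([1.00, 1.08, 1.15, 1.26])  # normalized GDP (toy values)
beta, intercept = np.polyfit(x, y, 1)   # y ~ beta * x + intercept
print(beta, intercept)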


In [179]:
# join lights and country stats dataframes
df_plus_1992 = df_rejoined.merge(wb_1992, on='WB_A3', how='inner')
df = df_plus_1992.merge(beta_intercept, on='WB_A3', how='inner')
# pickle joined dataframe
df.to_pickle('data/cleaned_df_cities.pickle'); df.head(3)


Out[179]:
WB_A3 city pop_max F101992 F101993 F101994 F121994 F121995 F121996 F121997 ... F162007 F162008 F162009 F182010 F182011 F182012 F182013 wb_1992 beta intercept
0 AUS Adelaide 1145000 0.040712 0.043259 0.041935 0.042028 0.043854 0.044374 0.043695 ... 0.042365 0.043124 0.043410 0.046229 0.045963 0.047039 0.047074 4.876710e+11 0.891693 0.344698
1 AUS Brisbane 1860000 0.084469 0.085660 0.084832 0.098226 0.094979 0.096808 0.099221 ... 0.103671 0.106120 0.107497 0.115798 0.111103 0.114763 0.112809 4.876710e+11 0.891693 0.344698
2 AUS Perth 1532000 0.064119 0.066091 0.064022 0.071494 0.070122 0.071937 0.072184 ... 0.081374 0.080118 0.081378 0.091827 0.089461 0.091938 0.091964 4.876710e+11 0.891693 0.344698

3 rows × 40 columns
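
For downstream modeling it is often handier to melt the wide satellite-year columns into a long (city, year, light) table; a sketch, assuming the last four characters of each F-prefixed composite name are the year:

In [ ]:
# sketch: reshape the wide satellite-year columns into long format
id_cols = ['WB_A3', 'city', 'pop_max', 'wb_1992', 'beta', 'intercept']
df_long = df.melt(id_vars=id_cols, var_name='composite', value_name='light')
df_long['year'] = df_long['composite'].str[-4:].astype(int)  # 'F101992' -> 1992
df_long.head(3)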


In [181]:
# export the joined dataframe to CSV
df.to_csv('test.csv', encoding='latin-1')
