In [43]:
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import multiprocessing as mp
import numpy as np
import pandas as pd
import pysal as ps
from shapely.geometry import Point
import time
import xlrd
%matplotlib inline
mpl.rcParams['figure.figsize'] = 25,25 #set the default map size
mpl.rcParams['patch.linewidth'] = 0.5 #set default polygon line width
mpl.rcParams['markers.fillstyle'] = 'full' #set default marker fill style (original comment was copy-pasted from the line above)
In [2]:
def correlation_matrix(df, title):
    """Render an already-computed correlation matrix as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        A square correlation matrix, as produced by ``DataFrame.corr()``.
        Every call site in this notebook passes a precomputed matrix, so it
        is imaged directly; the original implementation called ``df.corr()``
        again here, which plotted the correlation OF the correlation
        matrix -- a bug.
    title : str
        Figure title.
    """
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    cmap = cm.get_cmap('jet', 100)
    cax = ax1.imshow(df.values, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    plt.title(title)
    # Pin tick positions explicitly and label from the frame itself so the
    # labels stay aligned with the data even if the variable order changes.
    # (The original hard-coded a label list with a leading "" entry, relying
    # on matplotlib's default tick placement.)
    ticks = np.arange(len(df.columns))
    ax1.set_xticks(ticks)
    ax1.set_yticks(ticks)
    ax1.set_xticklabels(df.columns, fontsize=6)
    ax1.set_yticklabels(df.columns, fontsize=6)
    # Colorbar without ticks -- values are read from cell colors only.
    cbar = fig.colorbar(cax, ticks=[])
    plt.show()
#http://datascience.stackexchange.com/questions/10459/calculation-and-visualization-of-correlation-matrix-with-pandas
In [3]:
# NOTE(review): absolute, user-specific paths -- a portable alternative is a
# configurable base directory (pathlib / environment variable).
data_directory = "/Users/jgaboardi/Dropbox/Fall_16/Smart_Cities/Exercises/Exercise_1/data/"
# MSA List
msa_list = "List1.xls"
# County Data
county_shapes = "US_county_2014.shp"
county_demographic = "nhgis0010_ds206_20145_2014_county.csv"
prepped_county = "US_Continental_Reproj_PNTCNT_joined.shp"
# Census Tract Data
tract_shapes = "US_tract_2014.shp"
tract_demographic = "nhgis0010_ds206_20145_2014_tract.csv"
# Twitter Data (Clean for Tampa MSA)
tweets = "tampa_msa_tweets_REPROJECTED.shp"
results_directory = "/Users/jgaboardi/Dropbox/Fall_16/Smart_Cities/Exercises/Exercise_1/results/"
In [4]:
# Initial
# Geographic CRS (WGS84) assumed for the raw shapefiles on read.
wgs = {'init': 'epsg:4326', 'no_defs': True}
# NAD83 / Florida West (ftUS) --> {'init': 'epsg:2237', 'no_defs': True}
# Projected CRS (US feet) used for all mapping/analysis below.
florida_west = 2237
In [5]:
# Load the OMB/Census delineation workbook listing every county in every
# CBSA (metropolitan/micropolitan statistical area).
t1 = time.time()
all_msa = pd.read_excel(data_directory+msa_list,
header=2,
usecols=['CBSA Code', 'CBSA Title',
'Metropolitan/Micropolitan Statistical Area',
'County/County Equivalent', 'State Name', 'FIPS State Code',
'FIPS County Code', 'Central/Outlying County'])
# Python 2 print statement; reports elapsed wall-clock time in minutes.
print round((time.time()-t1)/60, 8),\
"minutes to run."
In [6]:
# Subset the delineation table to the Tampa MSA and pull its CBSA code.
tampa_msa_df = all_msa[all_msa["CBSA Title"] == "Tampa-St. Petersburg-Clearwater, FL"]
# NOTE(review): the dtype of "CBSA Code" depends on how pandas parsed the
# spreadsheet; it must match the (string) CBSAFP field of the county
# shapefile for the equality filter below to find anything -- TODO confirm.
tampa_msa_code = tampa_msa_df["CBSA Code"].values[0]
print round((time.time()-t1)/60, 8),\
"minutes to run."
In [7]:
t1 = time.time()
# Read every U.S. county, narrow to Florida (STATEFP "12"), then to the
# counties whose CBSA code matches the Tampa MSA.
us_counties = gpd.read_file(data_directory+county_shapes,
crs=wgs)
florida_counties = us_counties[us_counties["STATEFP"] == "12"]
tampa_msa_counties = florida_counties[florida_counties["CBSAFP"] == tampa_msa_code]
# Reproject only the MSA subset to NAD83 / Florida West (US feet).
tampa_msa_counties = tampa_msa_counties.to_crs(epsg=florida_west)
print round((time.time()-t1)/60, 8),\
"minutes to run."
In [44]:
# Map of the four Tampa MSA county boundaries.
tampa_msa_counties.plot()
# Figure title.
plt.title('Tampa - St. Petersburg - Clearwater\nFlorida\nMetropolitan Statistical Area',
          family='Times New Roman',
          size=40,
          color='k',
          backgroundcolor='w',
          weight='bold')
# Map furniture (north arrow, scale hint, layer label).  Coordinates are in
# the projected CRS and must be changed for different spatial resolutions.
for note_text, note_y, note_size, note_alpha in (
        ('^ N ^', 1560000, 'xx-large', 0.75),
        ('<| ~ 5 miles |>', 1550000, 'large', 0.95),
        ('Counties', 1520000, 'xx-large', 0.95)):
    plt.annotate(note_text,
                 xy=(355000, note_y),
                 fontstyle='italic',
                 fontweight='heavy',
                 fontsize=note_size,
                 alpha=note_alpha)
plt.show()
In [9]:
#twitter_data = pd.read_csv(tweets)
#geometry = [Point(xy) for xy in zip(twitter_data.Longitude, twitter_data.Latitude)]
#twitter_data = gpd.GeoDataFrame(twitter_data,
# crs=wgs,
# geometry=geometry)
#twitter_data_reprojected = twitter_data.to_crs(epsg=florida_west)
In [10]:
#tampa_tweets = [Tampa_MSA_union['geometry'].intersection(tweet) for tweet in tampa['geometry']]
#tampa_tweets = gpd.GeoDataFrame(tampa_tweets, crs=Tampa_MSA_union.crs)
#tampa_tweets.columns = ['geometry']
#tampa_tweets[:5]
In [11]:
# NHGIS 2010-2014 ACS county-level demographic table.
us_county_data = pd.read_csv(data_directory+county_demographic)
In [12]:
# Attach demographics to the county geometries; merge() defaults to an
# inner join on GISJOIN, so counties lacking a demographic row are dropped.
tampa_msa_counties_Data = tampa_msa_counties.merge(us_county_data, on="GISJOIN")
In [13]:
# Age Ratio (20-39): share of total population (ABAQE001) aged 20-39.
# ABAQE008-013 are the male 20-39 bins; ABAQE032-037 the female bins.
county_age_20_39_cols = ["ABAQE008", "ABAQE009", "ABAQE010",
                         "ABAQE011", "ABAQE012", "ABAQE013",  # Males
                         "ABAQE032", "ABAQE033", "ABAQE034",
                         "ABAQE035", "ABAQE036", "ABAQE037"]  # Females
# skipna=False reproduces the NaN propagation of the original chained "+".
tampa_msa_counties_Data["Age_Ratio"] = (
    tampa_msa_counties_Data[county_age_20_39_cols].sum(axis=1, skipna=False)
    / tampa_msa_counties_Data["ABAQE001"])
In [14]:
# Education Ratio (Bachelor's Degree)
# ABC4E022 = bachelor's-degree holders; ABC4E001 = universe total
# (presumably population 25+ -- verify against the NHGIS codebook).
tampa_msa_counties_Data["Ed_Ratio"] = (tampa_msa_counties_Data["ABC4E022"]
/tampa_msa_counties_Data["ABC4E001"])
In [15]:
# Commute Time: share of workers (ABBTE001) with a commute under 25
# minutes; ABBTE002-006 are the sub-25-minute travel-time bins.
county_short_commute_cols = ["ABBTE002", "ABBTE003", "ABBTE004",
                             "ABBTE005", "ABBTE006"]
# skipna=False reproduces the NaN propagation of the original chained "+".
tampa_msa_counties_Data["Commute"] = (
    tampa_msa_counties_Data[county_short_commute_cols].sum(axis=1, skipna=False)
    / tampa_msa_counties_Data["ABBTE001"])
In [16]:
# ABDPE001 is median household income; give it a readable name, then echo
# the derived columns (the last expression is the displayed cell output).
tampa_msa_counties_Data = tampa_msa_counties_Data.rename(columns = {"ABDPE001":"Median_HH_Income"})
tampa_msa_counties_Data[["Age_Ratio","Ed_Ratio","Commute","Median_HH_Income"]]
Out[16]:
In [17]:
t1 = time.time()
us_tracts = gpd.read_file(data_directory+tract_shapes,
crs=wgs)
us_tracts = us_tracts.to_crs(epsg=florida_west)
florida_tracts = us_tracts[us_tracts["STATEFP"] == "12"]
hernando_tracts = florida_tracts[florida_tracts["COUNTYFP"] == "053"]
hillsborough_tracts = florida_tracts[florida_tracts["COUNTYFP"] == "057"]
pasco_tracts = florida_tracts[florida_tracts["COUNTYFP"] == "101"]
pinnellas_tracts = florida_tracts[florida_tracts["COUNTYFP"] == "103"]
tampa_mas_tracts = gpd.GeoDataFrame()
tampa_mas_tracts = tampa_mas_tracts.append(hernando_tracts)
tampa_mas_tracts = tampa_mas_tracts.append(hillsborough_tracts)
tampa_mas_tracts = tampa_mas_tracts.append(pasco_tracts)
tampa_mas_tracts = tampa_mas_tracts.append(pinnellas_tracts)
print round((time.time()-t1)/60, 8),\
"minutes to run."
In [45]:
# Map of the Tampa MSA census-tract boundaries.
tampa_mas_tracts.plot()
# Figure title.
plt.title('Tampa - St. Petersburg - Clearwater\nFlorida\nMetropolitan Statistical Area',
          family='Times New Roman',
          size=40,
          color='k',
          backgroundcolor='w',
          weight='bold')
# Map furniture (north arrow, scale hint, layer label).  Coordinates are in
# the projected CRS and must be changed for different spatial resolutions.
for note_text, note_y, note_size, note_alpha in (
        ('^ N ^', 1560000, 'xx-large', 0.75),
        ('<| ~ 5 miles |>', 1550000, 'large', 0.95),
        ("Census Tracts", 1520000, 'xx-large', 0.95)):
    plt.annotate(note_text,
                 xy=(355000, note_y),
                 fontstyle='italic',
                 fontweight='heavy',
                 fontsize=note_size,
                 alpha=note_alpha)
plt.show()
In [19]:
# NHGIS 2010-2014 ACS tract-level demographic table.
us_tract_data = pd.read_csv(data_directory+tract_demographic)
In [20]:
# Inner join on GISJOIN attaches demographics to the MSA tract geometries.
tampa_msa_tract_Data = tampa_mas_tracts.merge(us_tract_data, on="GISJOIN")
In [21]:
# Age Ratio (20-39): share of total population (ABAQE001) aged 20-39.
# ABAQE008-013 are the male 20-39 bins; ABAQE032-037 the female bins.
tract_age_20_39_cols = ["ABAQE008", "ABAQE009", "ABAQE010",
                        "ABAQE011", "ABAQE012", "ABAQE013",  # Males
                        "ABAQE032", "ABAQE033", "ABAQE034",
                        "ABAQE035", "ABAQE036", "ABAQE037"]  # Females
# skipna=False reproduces the NaN propagation of the original chained "+".
tampa_msa_tract_Data["Age_Ratio"] = (
    tampa_msa_tract_Data[tract_age_20_39_cols].sum(axis=1, skipna=False)
    / tampa_msa_tract_Data["ABAQE001"])
In [22]:
# Education Ratio (Bachelor's Degree)
# ABC4E022 = bachelor's-degree holders; ABC4E001 = universe total
# (presumably population 25+ -- verify against the NHGIS codebook).
tampa_msa_tract_Data["Ed_Ratio"] = (tampa_msa_tract_Data["ABC4E022"]
/ tampa_msa_tract_Data["ABC4E001"])
In [23]:
# Commute Time: share of workers (ABBTE001) with a commute under 25
# minutes; ABBTE002-006 are the sub-25-minute travel-time bins.
tract_short_commute_cols = ["ABBTE002", "ABBTE003", "ABBTE004",
                            "ABBTE005", "ABBTE006"]
# skipna=False reproduces the NaN propagation of the original chained "+".
tampa_msa_tract_Data["Commute"] = (
    tampa_msa_tract_Data[tract_short_commute_cols].sum(axis=1, skipna=False)
    / tampa_msa_tract_Data["ABBTE001"])
In [24]:
# ABDPE001 is median household income; rename it, then preview the derived
# columns (the last expression is the displayed cell output).
tampa_msa_tract_Data = tampa_msa_tract_Data.rename(columns = {"ABDPE001":"Median_HH_Income"})
tampa_msa_tract_Data[["Age_Ratio","Ed_Ratio","Commute","Median_HH_Income"]].head()
Out[24]:
In [25]:
# Tweets already clipped to the Tampa MSA.  Reuse the filename variable
# (`tweets`, defined in the data-paths cell) and the `florida_west` CRS
# constant; the original re-hardcoded both the literal path string and the
# magic number 2237.
only_tampa_MSA_tweets = gpd.read_file(data_directory+tweets)
only_tampa_MSA_tweets = only_tampa_MSA_tweets.to_crs(epsg=florida_west)
In [26]:
# Counties with a spatially-joined tweet point count (PNTCNT), produced
# externally.  Use the `florida_west` constant instead of the original
# magic number 2237 for consistency with the rest of the notebook.
county_for_analysis = gpd.read_file(data_directory+"tampa_msa_counties_TWEETS_joined.shp")
county_for_analysis = county_for_analysis.to_crs(epsg=florida_west)
# ABDPE001 is median household income.
county_for_analysis = county_for_analysis.rename(columns = {"ABDPE001":"Median_HH_Income"})
In [27]:
# Tweets per 1,000 residents (ABAQE001 = total population).
county_residents_thousands = county_for_analysis["ABAQE001"] / 1000
county_for_analysis["Tweets_1000"] = (county_for_analysis["PNTCNT"]
                                      / county_residents_thousands)
In [28]:
# Zero-population areas produce inf (and missing values NaN) in the derived
# ratios; coerce both to 0 before writing out.
county_for_analysis = county_for_analysis.replace([np.inf, -np.inf], np.nan)
county_for_analysis = county_for_analysis.fillna(0.)
# Write Out
# NOTE(review): shapefile field names are limited to 10 characters, so
# "Tweets_1000" will be truncated (e.g. to "Tweets_100") in the written
# file -- confirm any downstream readers of this shapefile.
county_for_analysis.to_file(results_directory+"Tampa_MSA_county_for_analysis.shp")
county_for_analysis[["Age_Ratio","Ed_Ratio","Commute","PNTCNT","Tweets_1000"]]
Out[28]:
In [46]:
# Choropleth of raw tweet counts per county (4 quantile classes) with the
# tweet points overlaid.
base = county_for_analysis.plot(column='PNTCNT',
                                cmap='OrRd',
                                scheme="quantiles",
                                k=4,
                                legend=True)
only_tampa_MSA_tweets.plot(ax=base,
                           marker='o',
                           color='green',
                           markersize=6,
                           alpha=0.5)
# Figure title.
plt.title('Tampa - St. Petersburg - Clearwater\nFlorida\nMetropolitan Statistical Area',
          family='Times New Roman',
          size=40,
          color='k',
          backgroundcolor='w',
          weight='bold')
# Map furniture; coordinates are in the projected CRS and must be changed
# for different spatial resolutions, etc.
for note_text, note_y, note_size, note_alpha in (
        ('^ N ^', 1560000, 'xx-large', 0.75),
        ('<| ~ 5 miles |>', 1550000, 'large', 0.95),
        ("Tweet Density\nby County\nTotal Count\nResidents", 1520000, 'xx-large', 0.95)):
    plt.annotate(note_text,
                 xy=(355000, note_y),
                 fontstyle='italic',
                 fontweight='heavy',
                 fontsize=note_size,
                 alpha=note_alpha)
plt.show()
In [48]:
# Choropleth of tweets per 1,000 residents by county (4 quantile classes)
# with the tweet points overlaid.
base = county_for_analysis.plot(column="Tweets_1000",
                                cmap="OrRd",
                                scheme="quantiles",
                                k=4,
                                legend=True)
only_tampa_MSA_tweets.plot(ax=base,
                           marker='o',
                           color='green',
                           markersize=6,
                           alpha=0.5)
# Figure title.
plt.title('Tampa - St. Petersburg - Clearwater\nFlorida\nMetropolitan Statistical Area',
          family='Times New Roman',
          size=40,
          color='k',
          backgroundcolor='w',
          weight='bold')
# Map furniture; coordinates are in the projected CRS and must be changed
# for different spatial resolutions, etc.
for note_text, note_y, note_size, note_alpha in (
        ('^ N ^', 1560000, 'xx-large', 0.75),
        ('<| ~ 5 miles |>', 1550000, 'large', 0.95),
        ("Tweet Density\nby County\nPer 1,000\nResidents", 1520000, 'xx-large', 0.95)):
    plt.annotate(note_text,
                 xy=(355000, note_y),
                 fontstyle='italic',
                 fontweight='heavy',
                 fontsize=note_size,
                 alpha=note_alpha)
plt.show()
In [31]:
# Tracts with a spatially-joined tweet point count (PNTCNT), produced
# externally.  Use the `florida_west` constant instead of the original
# magic number 2237 for consistency with the rest of the notebook.
tract_for_analysis = gpd.read_file(data_directory+"tampa_msa_tracts_TWEETS_joined.shp")
tract_for_analysis = tract_for_analysis.to_crs(epsg=florida_west)
# ABDPE001 is median household income.
tract_for_analysis = tract_for_analysis.rename(columns = {"ABDPE001":"Median_HH_Income"})
In [32]:
# Tweets per 1,000 residents (ABAQE001 = total population).
tract_residents_thousands = tract_for_analysis["ABAQE001"] / 1000
tract_for_analysis["Tweets_1000"] = (tract_for_analysis["PNTCNT"]
                                     / tract_residents_thousands)
In [33]:
# Zero-population tracts produce inf (and missing values NaN) in the
# derived ratios; coerce both to 0 before writing out.
tract_for_analysis = tract_for_analysis.replace([np.inf, -np.inf], np.nan)
tract_for_analysis = tract_for_analysis.fillna(0.)
# Write Out
# NOTE(review): shapefile field names are limited to 10 characters, so
# "Tweets_1000" will be truncated in the written file -- confirm readers.
tract_for_analysis.to_file(results_directory+"Tampa_MSA_tract_for_analysis.shp")
tract_for_analysis[["Age_Ratio","Ed_Ratio","Commute","PNTCNT","Tweets_1000"]].head()
Out[33]:
In [49]:
# Choropleth of raw tweet counts per tract (9 quantile classes) with the
# tweet points overlaid.
base = tract_for_analysis.plot(column="PNTCNT",
                               cmap="OrRd",
                               scheme="quantiles",
                               k=9,
                               legend=True)
only_tampa_MSA_tweets.plot(ax=base,
                           marker='o',
                           color='green',
                           markersize=6,
                           alpha=0.5)
# Figure title.
plt.title('Tampa - St. Petersburg - Clearwater\nFlorida\nMetropolitan Statistical Area',
          family='Times New Roman',
          size=40,
          color='k',
          backgroundcolor='w',
          weight='bold')
# Map furniture; coordinates are in the projected CRS and must be changed
# for different spatial resolutions, etc.
for note_text, note_y, note_size, note_alpha in (
        ('^ N ^', 1560000, 'xx-large', 0.75),
        ('<| ~ 5 miles |>', 1550000, 'large', 0.95),
        ("Tweet Density\nby Census Tract\nTotal Count\nResidents", 1520000, 'xx-large', 0.95)):
    plt.annotate(note_text,
                 xy=(355000, note_y),
                 fontstyle='italic',
                 fontweight='heavy',
                 fontsize=note_size,
                 alpha=note_alpha)
plt.show()
In [50]:
# Choropleth of tweets per 1,000 residents by tract (9 quantile classes)
# with the tweet points overlaid.
base = tract_for_analysis.plot(column="Tweets_1000",
                               cmap="OrRd",
                               scheme="quantiles",
                               k=9,
                               legend=True)
only_tampa_MSA_tweets.plot(ax=base,
                           marker='o',
                           color='green',
                           markersize=6,
                           alpha=0.5)
# Figure title.
plt.title('Tampa - St. Petersburg - Clearwater\nFlorida\nMetropolitan Statistical Area',
          family='Times New Roman',
          size=40,
          color='k',
          backgroundcolor='w',
          weight='bold')
# Map furniture; coordinates are in the projected CRS and must be changed
# for different spatial resolutions, etc.
for note_text, note_y, note_size, note_alpha in (
        ('^ N ^', 1560000, 'xx-large', 0.75),
        ('<| ~ 5 miles |>', 1550000, 'large', 0.95),
        ("Tweet Density\nby Census Tract\nPer 1,000\nResidents", 1520000, 'xx-large', 0.95)):
    plt.annotate(note_text,
                 xy=(355000, note_y),
                 fontstyle='italic',
                 fontweight='heavy',
                 fontsize=note_size,
                 alpha=note_alpha)
plt.show()
In [36]:
# Variables entering the correlation analyses below (this order fixes the
# row/column order of every correlation matrix).
corr_vars = ["PNTCNT","Tweets_1000","Median_HH_Income","Age_Ratio","Ed_Ratio","Commute"]
In [37]:
# Calculate Correlations
# DataFrame.corr() computes the same Pearson matrix as the original
# element-by-element double loop over np.corrcoef, already labeled by
# corr_vars.  (The data were fillna(0)-ed above, so pairwise-NaN handling
# differences are moot.)
tract_corr = tract_for_analysis[corr_vars].corr()
# Write Out
tract_corr.to_csv(results_directory+"Tampa_tract_correlation_matrix.csv")
tract_corr
Out[37]:
In [38]:
# Heatmap of the (precomputed) tract correlation matrix.
correlation_matrix(tract_corr, title="Tampa MSA Correlation - Tract")
In [39]:
# County-level correlations via DataFrame.corr() (Pearson), replacing the
# original element-by-element double loop over np.corrcoef -- identical
# values, already labeled by corr_vars.
county_corr = county_for_analysis[corr_vars].corr()
# Write Out
county_corr.to_csv(results_directory+"Tampa_county_correlation_matrix.csv")
county_corr
Out[39]:
In [40]:
# Heatmap of the (precomputed) county correlation matrix.
correlation_matrix(county_corr, title="Tampa MSA Correlation - County")
In [6]:
# Pre-joined national county layer (already carries the PNTCNT tweet
# counts); ABDPE001 is median household income.
us_county_analysis = gpd.read_file(data_directory+prepped_county)
us_county_analysis = us_county_analysis.rename(columns = {"ABDPE001":"Median_HH_Income"})
In [7]:
# National tweet point layer.
# NOTE(review): loaded but not referenced again in the visible cells.
US_tweets = gpd.read_file(data_directory+"US_Tweets.shp")
In [8]:
# Demographic ratios for every U.S. county, mirroring the Tampa-level
# derivations above.
# Age Ratio (20-39): male bins ABAQE008-013 plus female bins ABAQE032-037
# over total population ABAQE001.
us_age_20_39_cols = ["ABAQE008", "ABAQE009", "ABAQE010",
                     "ABAQE011", "ABAQE012", "ABAQE013",  # Males
                     "ABAQE032", "ABAQE033", "ABAQE034",
                     "ABAQE035", "ABAQE036", "ABAQE037"]  # Females
# skipna=False reproduces the NaN propagation of the original chained "+".
us_county_analysis["Age_Ratio"] = (
    us_county_analysis[us_age_20_39_cols].sum(axis=1, skipna=False)
    / us_county_analysis["ABAQE001"])
# Education Ratio (Bachelor's Degree)
us_county_analysis["Ed_Ratio"] = (us_county_analysis["ABC4E022"]
                                  / us_county_analysis["ABC4E001"])
# Commute Time (Less than 25 minutes)
us_short_commute_cols = ["ABBTE002", "ABBTE003", "ABBTE004",
                         "ABBTE005", "ABBTE006"]
us_county_analysis["Commute"] = (
    us_county_analysis[us_short_commute_cols].sum(axis=1, skipna=False)
    / us_county_analysis["ABBTE001"])
# Tweets per 1000 Residents
us_county_analysis["Tweets_1000"] = (us_county_analysis["PNTCNT"]
                                     / (us_county_analysis["ABAQE001"] / 1000))
In [9]:
# Zero-population counties produce inf (and missing values NaN) in the
# derived ratios; coerce both to 0 before writing out.
us_county_analysis = us_county_analysis.replace([np.inf, -np.inf], np.nan)
us_county_analysis = us_county_analysis.fillna(0.)
# Write Out
# NOTE(review): shapefile field names are limited to 10 characters, so
# "Tweets_1000" will be truncated (e.g. to "Tweets_100") on write.
us_county_analysis.to_file(results_directory+"US_county_for_analysis.shp")
us_county_analysis[["Age_Ratio","Ed_Ratio","Commute","PNTCNT","Tweets_1000"]].head()
Out[9]:
In [7]:
# National choropleth of raw tweet counts (9 quantile classes).
us_county_analysis.plot(column="PNTCNT",
cmap="OrRd",
scheme="quantiles",
k=9,
legend=True)
Out[7]:
In [9]:
# National choropleth of tweets per 1,000 residents (9 quantile classes).
# The column created above is "Tweets_1000"; the original plotted
# "Tweets_100", which only exists if the frame was re-read from a written
# shapefile (field names truncate to 10 characters).  Use the in-memory
# name so the cell runs as written.
us_county_analysis.plot(column="Tweets_1000",
                        cmap="OrRd",
                        scheme="quantiles",
                        k=9,
                        legend=True)
Out[9]:
In [13]:
# Calculate Correlations
US_corr = []
idx = -1
for var_1 in us_county_analysis[corr_vars]:
idx += 1
US_corr.append([])
for var_2 in corr_vars:
US_corr[idx].append(np.corrcoef(us_county_analysis[corr_vars][var_1],
us_county_analysis[corr_vars][var_2])[0][1])
# Correlation Matrix
US_corr = pd.DataFrame(np.array(US_corr),
columns=corr_vars,
index=corr_vars)
# Write Out
US_corr.to_csv(results_directory+"US_county_correlation_matrix.csv")
US_corr
Out[13]:
In [14]:
# Heatmap of the (precomputed) national county correlation matrix.
correlation_matrix(US_corr, title="U.S. Correlation - County")