In [ ]:
# Algorithm
1. Merge all data files (census, business type, and Flickr) using zip code as the primary key (a minimal merge sketch follows this list).
2. Run linear regression (at the end of the notebook).
3. Recast the problem as classification.
4. Try multiple classifiers.
5. Run the chosen classifier per business type and evaluate accuracy.
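A minimal sketch of step 1, assuming each file carries 'year' and 'zip_code' columns (file names and join keys are taken from the load and merge cells further down):

import pandas as pd
# inner-join the three sources on (year, zip_code)
df_art = pd.read_csv('art_rate.csv')          # Flickr-derived art activity
df_cen = pd.read_csv('census_filtered.csv')   # census features
df_biz = pd.read_csv('biz_growth.csv')        # business growth (the target)
df_merged = (df_art
             .merge(df_cen, how='inner', on=['year', 'zip_code'])
             .merge(df_biz, how='inner', on=['year', 'zip_code']))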
In [280]:
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import seaborn as sns
font = {'family' : 'Arial',
'weight' : 'bold',
'size' : 25}
matplotlib.rc('font', **font)
from census import Census
from us import states
In [2]:
#biz type
"""
11 Agriculture, Forestry, Fishing and Hunting
21 Mining, Quarrying, and Oil and Gas Extraction
22 Utilities
#23 Construction
31-33 Manufacturing
#42 Wholesale Trade
#44-45 Retail Trade
48-49 Transportation and Warehousing
#51 Information
#52 Finance and Insurance
#53 Real Estate and Rental and Leasing
#54 Professional, Scientific, and Technical Services
#55 Management of Companies and Enterprises
56 Administrative and Support and Waste Management and Remediation Services
61 Educational Services
62 Health Care and Social Assistance
71 Arts, Entertainment, and Recreation
#72 Accommodation and Food Services
81 Other Services (except Public Administration)
92 Public Administration
potential parameters: 23, 42, 45, 51, 52, 53, 54, 55, 72
"""
business_codes = [11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]
business_names = ['Agriculture/Fishing','Mining/Gas Extraction','Utilities','Construction',
'Manufacturing','Wholesale','Retail','Transportation','Media/Publishing',
'Finance','Real Estate','Law/Science/Ads','Management','Administrative',
'Education','Health','Arts','Accommodation/Food','Others','Public']
In [3]:
# zip code: this is from biz file
new_zip_codes = np.load('new_zip_codes.npy')
# flickr data -> no nan, no inf
df_art = pd.read_csv('art_rate.csv')
# census
df_cen = pd.read_csv('census_filtered.csv')
# predictor
df_biz = pd.read_csv('biz_growth.csv')
In [4]:
# zip code with city names
df_city = pd.read_csv('city_zip_df.csv')
df_city.head()
Out[4]:
In [5]:
df_cen.head()
Out[5]:
In [6]:
temp = []
for zip_code in df_cen['zip_code'].values:
temp.append(df_city.city[zip_code == df_city.zipcodes.values].values[0])
In [9]:
df_cen['city']=temp
In [10]:
# rename to prevent confusion
df_art = df_art.rename(columns={'growth':'art_growth'})
df_biz = df_biz.rename(columns={'growth':'biz_growth'})
In [11]:
df_biz.head()
Out[11]:
In [12]:
print df_art.columns
print df_cen.columns
print df_biz.columns
In [13]:
# putting things together: inner join
df_feat = pd.merge(df_art, df_cen, how='inner', on=['year', 'zip_code'])
df_feat.head()
Out[13]:
In [14]:
df_corr = df_feat
In [15]:
# converting nan -> 0
# converting inf -> 2
columns = df_feat.columns
for column in columns[:-1]:
for i in range(len(df_feat)):
if np.isinf((df_feat[column][i])):
df_corr[column].values[i]=2
if np.isnan(df_feat[column][i]):
df_corr[column].values[i]=0
In [16]:
df_corr.head()
Out[16]:
In [17]:
# check biz column value: if inf -> 2
df_biz.loc[np.isinf(df_biz['biz_growth'].values), 'biz_growth'] = 2
In [18]:
# merge with biz
df_all = pd.merge(df_corr, df_biz, how='inner', on=['year', 'zip_code'])
In [32]:
df_all.head()
Out[32]:
In [20]:
# reorder columns: features first, then art_growth, business_code, biz_growth, and city at the end
temp = df_all.ix[:,1:32]
temp['art_growth']=df_all['art_growth']
temp['business_code']=df_all['business_code']
temp['biz_growth']=df_all['biz_growth']
temp['city']=df_all['city']
df_all = temp
In [21]:
df_all.head()
Out[21]:
In [56]:
# save
df_all.to_csv('data_city.csv',encoding='utf-8',index=False)
In [5]:
df_all = pd.read_csv('data_city.csv')
In [6]:
# assign indices
df_all['biz_growth'].describe()
Out[6]:
In [7]:
plt.hist(df_all['biz_growth'],100)
Out[7]:
In [8]:
# create index: 3 classes (<= 0, (0, num], > num)
def assign_biz_multiclass1(num):
temp = []
for biz_growth in df_all['biz_growth'].values:
if biz_growth <= 0.0:
biz_index = 0
elif biz_growth > 0.0 and biz_growth <= num:
biz_index = 1
elif biz_growth > num:
biz_index = 2
temp.append(biz_index)
plt.hist(temp)
return temp
In [9]:
# create index: 4 classes (<= 0, (0, num], (num, 2*num], > 2*num)
def assign_biz_multiclass2(num):
temp = []
for biz_growth in df_all['biz_growth'].values:
if biz_growth <= 0.0:
biz_index = 0
elif biz_growth > 0.0 and biz_growth <= num:
biz_index = 1
elif biz_growth > num and biz_growth <= num*2:
biz_index = 2
elif biz_growth > num*2:
biz_index = 3
temp.append(biz_index)
plt.hist(temp)
return temp
In [10]:
def assign_biz_binary():
temp = []
for biz_growth in df_all['biz_growth'].values:
if biz_growth <= 0.0:
biz_index = 0
elif biz_growth > 0.0:
biz_index = 1
temp.append(biz_index)
plt.hist(temp)
return temp
In [11]:
temp = assign_biz_multiclass1(0.05)
df_all['biz_index']=pd.DataFrame(temp)
In [12]:
gb = df_all.groupby(('year','business_code'))
In [13]:
# compute average biz_growth per business type in a year, across zip_codes
ref_years = [2011, 2012]
biz_avr_by_zip = []
for ref_year in ref_years:
for biz_code in business_codes:
a = gb.get_group((ref_year,biz_code))
biz_avr_by_zip.append(np.average(a['biz_growth'].values))
In [14]:
df_av = pd.DataFrame({'2011':biz_avr_by_zip[0:len(business_codes)],'2012':biz_avr_by_zip[len(business_codes):]})
df_av.index = business_names
df_av
Out[14]:
In [15]:
df_av['biz_name'] = business_names
In [16]:
sns.barplot(x='biz_name',y='2012',data=df_av)
Out[16]:
In [18]:
x = df_av['2011'][:-1]
y = df_av['2012'][:-1]
plt.plot(df_av['2011'][:-1],df_av['2012'][:-1],'o',hold='on')
plt.plot(np.linspace(-.02,.08,100),np.linspace(-.02,.08,100),':')
# plt.axis('equal')
plt.axis([-0.03, 0.10, -0.05, 0.10])
sns.set_context("poster")
mu = 0.0025
sigma = 0.003
errors = sigma * np.random.randn(len(business_names)-1) + mu
for i in range(0,len(business_codes)-1):
biz_name = business_names[i]
plt.annotate(biz_name, (x[i]-errors[i],y[i]+errors[i]),fontsize=15)
plt.show()
In [19]:
from sklearn import linear_model, cross_validation
from sklearn.metrics import accuracy_score
temp = assign_biz_multiclass1(0.10)
df_all['biz_index']=pd.DataFrame(temp)
gb = df_all.groupby(('year','business_code'))
scores = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=test_size, random_state=0)
X_train = X_train_df.ix[:,2:32]
X_test = X_test_df.ix[:,2:32]
y_train = y_train_df.ix[:,-1]
y_test = y_test_df.ix[:,-1]
logreg = linear_model.LogisticRegression(multi_class='ovr',penalty='l1')#solver='lbfgs')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
score = accuracy_score(y_test, y_pred)
# print biz_name + ':' + str(score)
# rf_eval_feat(df_2011.columns[2:32],logreg,biz_name,score)
scores.append(score)
print sum(np.array(scores)>0.5)/float(len(business_codes))*100
In [20]:
# Selecting classes: choose thresholds based on the mean +/- SD of biz_growth (a sketch follows this cell)
df_all[df_all.year.values==2012].biz_growth.describe()
Out[20]:
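A minimal sketch of a mean +/- one-SD split of biz_growth into three classes (not run above; the column name biz_index_sd is hypothetical):

stats = df_all.loc[df_all.year.values == 2012, 'biz_growth'].describe()
mu, sd = stats['mean'], stats['std']
# class 0: below mu - sd, class 1: within one SD of the mean, class 2: above mu + sd
df_all['biz_index_sd'] = np.digitize(df_all['biz_growth'].values, [mu - sd, mu + sd])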
In [22]:
# plot feature importances (as a confirmation)
def rf_eval_feat(feature_names,clf,title,score):
fig = plt.figure(figsize=(15,10))
# sort importances
indices = np.argsort(clf.feature_importances_)
# plot as bar chart
plt.barh(np.arange(len(feature_names)), clf.feature_importances_[indices])
plt.yticks(np.arange(len(feature_names)) + 0.5, np.array(feature_names)[indices])
_ = plt.xlabel('Relative importance')
plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
return indices
In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=100),
AdaBoostClassifier(),
GaussianNB(),
LDA(),
QDA()]
In [24]:
df_all.head()
Out[24]:
In [25]:
temp = assign_biz_binary()
df_binom = df_all
df_binom['biz_index']=pd.DataFrame(temp)
gb = df_binom.groupby(('year','business_code'))
for name, clf in zip(names, classifiers):
scores = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scores.append(score)
print name+ ':' + biz_name + ':' +str(score)
print sum(np.array(scores)>0.5)/float(len(business_codes))*100
In [26]:
df_all.head()
Out[26]:
In [27]:
# 3 classes
temp = assign_biz_multiclass1(0.30)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
for name, clf in zip(names, classifiers):
scores = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scores.append(score)
print name+ ':' + biz_name + ':' +str(score)
print sum(np.array(scores)>0.33)/float(len(business_codes))*100
In [281]:
np.average(df_all.biz_index)
Out[281]:
In [29]:
temp = assign_biz_binary()
df_binom = df_all
df_binom['biz_index']=pd.DataFrame(temp)
gb = df_binom.groupby(('year','business_code'))
clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
scores = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scores.append(score)
# print biz_name + ':' +str(score)
# evaluate feature importance
rf_eval_feat(df_2011.columns[2:32],clf,biz_name,score)
print sum(np.array(scores)>0.6)/float(len(business_codes))*100
In [30]:
# 3 classes
temp = assign_biz_multiclass1(0.10)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
for name, clf in zip(names, classifiers):
scores = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scores.append(score)
print name+ ':' + biz_name + ':' +str(score)
print sum(np.array(scores)>0.6)/float(len(business_codes))*100
We're settling on 0.46 as the accuracy threshold.
In [ ]:
feature_names = df_2011.columns[2:32]
In [159]:
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
clf = RandomForestClassifier(max_depth=5, n_estimators=100)
scores = []
y_2013_list = []
art_importance = []
art_rank_list = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scores.append(score)
# prediction
X_2012 = df_2012.ix[:,2:32]
y_2013 = list(clf.predict(X_2012))
y_2013_list.append(y_2013)
# evaluate feature importance
indices = rf_eval_feat(df_2011.columns[2:32],clf,biz_name,score)
# check the contribution of art_growth
art_importance.append(clf.feature_importances_[-1])
art_rank = len(feature_names)-np.arange(len(feature_names))[feature_names[indices]=='art_growth'][0]
art_rank_list.append(art_rank)
# merge lists of list
import itertools
y_2013_merged = list(itertools.chain(*y_2013_list))
print sum(np.array(scores)>0.46)/float(len(business_codes))*100
In [46]:
# To build a confusion matrix we need the number of data points in each class (the support);
# per business type: cm = confusion_matrix(y_test, y_pred) (a sketch follows this cell)
df_multinom.columns
Out[46]:
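A minimal per-business confusion-matrix sketch, assuming clf and the X_train/X_test/y_train/y_test split from the preceding classifier cells:

from sklearn.metrics import confusion_matrix
y_pred = clf.fit(X_train, y_train).predict(X_test)
cm = confusion_matrix(y_test, y_pred)                    # rows: true class, columns: predicted class
per_class_recall = cm.diagonal().astype(float) / cm.sum(axis=1)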
In [203]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
temp = assign_biz_multiclass2(0.20)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
clf = RandomForestClassifier(max_depth=5, n_estimators=100)
vals = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
y_true = y_test
y_pred = clf.fit(X_train, y_train).predict(X_test)
# vals.append(metrics.f1_score(y_true, y_pred, average=None))
vals.append(metrics.precision_recall_fscore_support(y_test, y_pred, beta = 0.9, average=None))
In [213]:
# vals[i]: one business type
# vals[i][0]: precision per class (precision matters most here; maximum precision = no false positives;
#             a weighted-precision sketch follows this cell)
# vals[i][2]: F-beta score per class
# vals[i][3]: support per class (used as weights below)
vals_weighted = []
for i in range(len(vals)):
vals_weighted.append(sum(vals[i][2]*vals[i][3]/sum(vals[i][3])))
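If precision is the metric of interest (as noted above), the same support weighting can be applied to vals[i][0]; a minimal sketch:

# support-weighted precision per business type
precision_weighted = [sum(v[0] * v[3] / float(sum(v[3]))) for v in vals]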
In [284]:
# use bootstrap to evaluate the classifier (a minimal bootstrap sketch follows this cell)
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
clf = RandomForestClassifier(max_depth=5, n_estimators=100)
scores = []
y_2013_list = []
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
scores.append(score)
# prediction
X_2012 = df_2012.ix[:,2:32]
y_2013 = list(clf.predict(X_2012))
y_2013_list.append(y_2013)
# evaluate feature importance
indices = rf_eval_feat(df_2011.columns[2:32],clf,biz_name,score)
# check the contribution of art_growth
art_importance.append(clf.feature_importances_[-1])
art_rank = len(feature_names)-np.arange(len(feature_names))[feature_names[indices]=='art_growth'][0]
art_rank_list.append(art_rank)
# merge lists of list
import itertools
y_2013_merged = list(itertools.chain(*y_2013_list))
print sum(np.array(scores)>0.46)/float(len(business_codes))*100
Out[284]:
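The cell above reuses a single train/test split; a minimal bootstrap sketch for one business type, assuming X and y are the standardized features and labels left over from the last loop iteration:

rng = np.random.RandomState(0)
y_arr = np.asarray(y)
boot_scores = []
for _ in range(100):
    idx = rng.randint(0, len(y_arr), len(y_arr))      # sample rows with replacement
    oob = np.setdiff1d(np.arange(len(y_arr)), idx)    # out-of-bag rows for testing
    if len(oob) == 0:
        continue
    clf.fit(X[idx], y_arr[idx])
    boot_scores.append(clf.score(X[oob], y_arr[oob]))
print np.mean(boot_scores), np.std(boot_scores)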
In [282]:
vals[0][3]
Out[282]:
In [214]:
vals_weighted
Out[214]:
In [215]:
fig = plt.figure(figsize=(15,10))
# sort importances
indices = np.argsort(vals_weighted[:-1])
# plot as bar chart
plt.barh(np.arange(len(business_names[:-1])), np.array(vals_weighted)[indices])
plt.yticks(np.arange(len(business_names[:-1])) + 0.5, np.array(business_names)[indices])
_ = plt.xlabel('Weighted F-score (beta=0.9)')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
In [279]:
fig = plt.figure(figsize=(15,10))
# sort importances
indices = np.argsort(art_importance[:-1])
# plot as bar chart
plt.barh(np.arange(len(business_names[:-1])), np.array(art_importance)[indices])
plt.yticks(np.arange(len(business_names[:-1])) + 0.5, np.array(business_names)[indices])
_ = plt.xlabel('Relative importance')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
In [523]:
art_rank_list
Out[523]:
In [529]:
np.arange(len(business_names[:-10]))
Out[529]:
In [536]:
np.array(art_rank_list[:-10])
Out[536]:
In [35]:
fig = plt.figure(figsize=(15,10))
# sort importances (descending)
indices = np.array(art_rank_list[:-1]).argsort()[::-1]
# indices = np.argsort(art_rank_list[:-1])
# plot as bar chart
plt.barh(np.arange(len(business_names[:-1])), np.array(art_rank_list[:-1])[indices])
plt.yticks(np.arange(len(business_names[:-1])) + 0.5, np.array(business_names[:-1])[indices])
_ = plt.xlabel('Importance Rank')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
In [164]:
fig = plt.figure(figsize=(15,10))
# sort importances
# indices = np.arange(len(scores)-1)
indices = np.argsort(scores[:-1])
# plot as bar chart
# plt.barh(np.arange(len(business_codes)-1), scores[:-1])
# plt.yticks(np.arange(len(business_codes)-1) + 0.5, np.array(business_names)[:-1])
plt.barh(np.arange(len(business_codes)-1), np.array(scores)[indices])
plt.yticks(np.arange(len(business_codes)-1) + 0.5, np.array(business_names)[indices])
# plt.barh(np.arange(len(feature_names)), clf.feature_importances_[indices])
# plt.yticks(np.arange(len(feature_names)) + 0.5, np.array(feature_names)[indices])
# _ = plt.xlabel('Relative importance')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
sns.set_context("poster")
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
# plt.axvline(0.44, color='red', linewidth=2)
In [45]:
plt.hist(y_2013_merged)
Out[45]:
In [315]:
print business_codes
print business_names
In [316]:
np.array(business_codes)[np.array(business_names)=='Finance']
Out[316]:
In [32]:
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
# 0. Start with a single business type
biz_code = 52
biz_name = 'Finance'
# 1. Run random forest with all features.
clf = RandomForestClassifier(max_depth=5, n_estimators=100)
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
# 2. Compute accuracy.
score = clf.score(X_test, y_test)
# 3. Compute feature importance.
feature_names = df_2011.columns[2:32]
indices = rf_eval_feat(feature_names,clf,biz_name,score)
In [508]:
np.arange(len(feature_names))[feature_names[indices]=='art_growth']
Out[508]:
In [343]:
# so these are the 5 least important features
feature_names[indices[:5]]
Out[343]:
In [342]:
# 4. Pick top 5, and run it again.
"""
indices: ascending order
ex. indices[:5]: the first 5 lowest
so, top n features by index: indices[-n:]
"""
print indices
print indices[-5:]
In [348]:
np.shape(X[:,indices[-5:]])
Out[348]:
In [354]:
X[:,indices[-5:]]
Out[354]:
In [367]:
# 5. Compute accuracy and compare (top 5)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-5:]], y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
print clf.score(X_test, y_test)
In [382]:
# 6. Compare: loop over the top-n features (n = 1..20) and see how accuracy changes
indices = np.argsort(clf.feature_importances_)
scores = []
for i in range(1,21):
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-i:]], y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
plt.plot(scores,':o')
plt.show()
# 7. Loop over the rest of the features and repeat 2-3.
# 8. Plot the graph.
In [395]:
# 7. Loop over the rest of the features and repeat steps 2 and 3.
scores_all = []
df_feat_import = pd.DataFrame()
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score_all = clf.score(X_test, y_test)
scores_all.append(score_all)
scores = []
for i in range(1,21):
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-i:]], y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
df_feat_import[biz_name]=scores
fig = plt.figure(figsize=(15,10))
plt.plot(scores,':o')
plt.title(biz_name)
plt.ylim([np.average(scores)-.1, min(np.average(scores)+.2,1.0)])
In [40]:
# 8. The results depend on the train/test split. Repeat this 10 times to get standard errors (a summary sketch follows this cell).
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
n_runs = 10
df_feat_import_n = pd.DataFrame()
for n in range(n_runs):
scores_all = []
df_feat_import = pd.DataFrame()
for biz_code,biz_name in zip(business_codes,business_names):
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
score_all = clf.score(X_test, y_test)
scores_all.append(score_all)
scores = []
for i in range(1,21):
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-i:]], y, test_size=test_size, random_state=0)
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
df_feat_import['try']=range(1,21)
df_feat_import['num_feat']=i
df_feat_import[biz_name]=scores
df_feat_import_n = pd.concat([df_feat_import_n, df_feat_import],ignore_index=True)
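To get the standard errors mentioned at the top of the cell, one could summarize df_feat_import_n per number of top features; a minimal sketch, assuming the 'try' column indexes the number of top features and each business name is an accuracy column, as built above:

grouped = df_feat_import_n.groupby('try')
acc_mean = grouped[business_names].mean()   # mean accuracy over the 10 runs
acc_sem = grouped[business_names].sem()     # standard error over the 10 runs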
In [99]:
gb_year = df_all.groupby('year',as_index=False)
# df_2011 = gb_year.get_group(2011)
# df_2012 = gb_year.get_group(2012)
# gb_11 = df_2011.groupby(('city','business_code'),as_index=False)
# gb_12 = df_2012.groupby(('city','business_code'),as_index=False)
cities = df_all.city.unique()
In [47]:
def plot2D_city_biz(gb_year,year,business_names,cities):
# select a year
df = gb_year.get_group(year)
# groupby
gb = df.groupby(('city','business_code'),as_index=False)
# compute the average
df_ = gb.agg(np.average)
# simplify
df_ = df_[['city','business_code','year','biz_growth']]
city_biz = []
for city in cities:
temp = df_[df_['city']==city]
city_biz.append(list(temp.biz_growth.values))
city_biz_array = np.array(city_biz)
# plot
fig = plt.figure(figsize=(15,5))
sns.set()
sns.set_context("poster")
ax = sns.heatmap(city_biz_array, xticklabels=business_names, yticklabels=cities, vmin=-.4, vmax=.4)
In [53]:
plot2D_city_biz(gb_year,2011,business_names,cities)
In [54]:
plot2D_city_biz(gb_year,2012,business_names,cities)
In [100]:
year = 2011
df = gb_year.get_group(year)
# groupby
gb = df.groupby('city',as_index=False)
# compute the average
df_ = gb.agg(np.average)
# df_ = gb.get_group('Chicago')
df_.ix[:,3:33].head()
# df_.head()
Out[100]:
In [49]:
def plot2D_city_demo(gb_year,year,cities):
df = gb_year.get_group(year)
# groupby
gb = df.groupby('city',as_index=False)
# compute the average
df_ = gb.agg(np.average)
df_ = df_.ix[:,3:33]
city_demo = []
for i in range(len(cities)):
temp = df_.ix[i,:]
city_demo.append(list(temp.values))
city_demo_array = np.array(city_demo)
# plot
fig = plt.figure(figsize=(19,5))
sns.set()
sns.set_context("poster")
ax = sns.heatmap(city_demo_array, xticklabels=df_.columns, yticklabels=cities, vmin=-1,vmax=1)
In [57]:
plot2D_city_demo(gb_year,2011,cities)
In [58]:
plot2D_city_demo(gb_year,2012,cities)
In [101]:
df_all.biz_index.unique()
Out[101]:
In [102]:
df = df_all[['city', 'zip_code', 'business_code', 'biz_growth', 'biz_index','year']]
In [103]:
zip_codes = df.zip_code.unique()
In [341]:
# filter geojson file based on zip_codes
geo_file = 'zip_cities.json'
import json
with open(geo_file) as f:
data = json.load(f)
In [349]:
df_feat = pd.DataFrame(data['features'])
temp = pd.DataFrame()
for i in range(len(data['features'])):
if int(data['features'][i]['properties']['ZCTA5CE10']) in zip_codes:
temp = pd.concat([temp,df_feat.iloc[[i]]],ignore_index=True)
dict_temp = temp.to_dict(orient='records')
data['features'] = dict_temp
with open('zip_cities_final.json', 'w') as f:
json.dump(data, f)
In [351]:
import folium
m = folium.Map(location=[37.769959, -122.448679], zoom_start=9)
m.geo_json(geo_path='zip_cities_final.json')
m.create_map('test_.html')
In [356]:
data['features'][0]['properties']
Out[356]:
In [104]:
# pad 4-digit zip codes with a leading zero (a one-line alternative follows this cell)
temp = (df.zip_code.apply(str))
temp.unique()
zips = []
for zip_code in temp:
if len(zip_code)==4:
zip_code = '0' + zip_code
zips.append(zip_code)
# attach it to the original table
df['zips'] = zips
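A one-line alternative using pandas string methods (same assumption that zip codes have at most 5 digits):

df['zips'] = df.zip_code.astype(str).str.zfill(5)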
In [105]:
df.head()
Out[105]:
In [106]:
print business_codes
In [107]:
plt.hist(y_2013_merged)
Out[107]:
In [118]:
# test 2012 data
import folium
geo_path = 'zip_cities_final.json'
df_2012 = df[df.year==2012]
# df_2012['predict']=y_2013_merged
gb = df_2012.groupby('business_code')
# for biz_code in business_codes:
biz_code = 11
data = gb.get_group(biz_code)
# NOTE: looping over biz_code here does not work; each iteration overwrites the previous map (see create_map below)
m = folium.Map(location=[37.769959, -122.448679], zoom_start=9)
m.geo_json(geo_path=geo_path,
data=data,
columns=['zips', 'biz_index'],
threshold_scale=range(0,4),
key_on='feature.properties.ZCTA5CE10',
fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
legend_name='Business Growth Index')
m.create_map('test_bind_' + str(biz_code) + '.html')
In [132]:
# Bind data to Folium: this is for prediction data
import folium
df_2012 = df[df.year==2012]
df_2012['predict']=y_2013_merged
gb = df_2012.groupby(('year','business_code'))
# biz_code=11
# for biz_code in business_codes:
def create_map(biz_code,gb):
geo_path = 'zip_cities_final.json'
data = gb.get_group((2012,biz_code))
m = folium.Map(location=[37.769959, -122.448679], zoom_start=9)
m.geo_json(geo_path=geo_path,
data_out = 'data' + str(biz_code) + '.json',
data=data,
columns=['zips', 'predict'],
threshold_scale=range(0,4),
key_on='feature.properties.ZCTA5CE10',
fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
legend_name='Business Growth Index')
m.create_map('biz_map_' + str(biz_code) + '.html')
In [133]:
for i in range(len(business_codes)-1):
create_map(business_codes[i],gb)
In [149]:
cities
Out[149]:
In [152]:
import requests
# Google Maps API
GOOGLE_KEY = ''
# Convert city names to coordinates
city_names = ['Baltimore,+MD', 'Boston,+MA', 'Chicago,+IL', 'Detroit,+MI', 'Los+Angeles,+CA',
'New+York,+NY', 'Oakland,+CA', 'San+Francisco,+CA', 'Seattle,+WA', 'Washington,+DC']
lats2 = []
lngs2 = []
for city_name in city_names:
query_url = 'https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=%s' % (city_name, GOOGLE_KEY)
r = requests.get(query_url)
temp = r.json()
if len(temp['results'])==0:
lat = 'none'
lng = 'none'
else:
lat = temp['results'][0]['geometry']['location']['lat']
lng = temp['results'][0]['geometry']['location']['lng']
lats2.append(lat)
lngs2.append(lng)
In [157]:
city_coordinates = zip(lats2,lngs2)
city_coordinates
Out[157]:
In [195]:
# X: 2011-2012 data, Y: 2012-2013 data
# train and test decided by zip code
gb = df_all.groupby('year')
df_2011 = gb.get_group(2011)
df_2012 = gb.get_group(2012)
from sklearn import cross_validation
X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=0.2, random_state=0)
In [196]:
from sklearn import linear_model
# choose features
X_train = X_train_df.ix[:,2:32]
X_test = X_test_df.ix[:,2:32]
y_train = y_train_df.ix[:,-1]
y_test = y_test_df.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)
Out[196]:
In [197]:
gb = df_all.groupby(('year','zip_code'),as_index=False)
# gb = df2.groupby(('zip_code','year','business_code'),as_index=False)
gb_avr = gb.agg(np.average)
gb_avr.head()
Out[197]:
In [198]:
gb = gb_avr.groupby('year')
df_2011 = gb.get_group(2011)
df_2012 = gb.get_group(2012)
X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=0.4, random_state=0)
In [199]:
X_train = X_train_df.ix[:,2:32]
X_test = X_test_df.ix[:,2:32]
y_train = y_train_df.ix[:,-1]
y_test = y_test_df.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)
Out[199]:
In [226]:
# X
df_corr2 = gb_avr.ix[:,:-2]
# Y
gb_biz = df_biz.groupby('business_code')
# biz list
biz_codes = df_biz['business_code'].unique()
In [253]:
# they are not in the same order, so let's merge and separate.
for biz_code in biz_codes:
temp = gb_biz.get_group(biz_code)
df_temp = pd.merge(df_corr2, temp, how='inner', on=['year', 'zip_code'])
# split it by year
gb_temp = df_temp.groupby('year')
df_2011 = gb_temp.get_group(2011)
df_2012 = gb_temp.get_group(2012)
X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=0.4, random_state=0)
X_train = X_train_df.ix[:,2:32]
X_test = X_test_df.ix[:,2:32]
y_train = y_train_df.ix[:,-1]
y_test = y_test_df.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print 'business code:' + str(biz_code) + ' ' + str(regr.score(X_test, y_test))
In [206]:
# We use 2011-2012 data to predict the 2012-2013 business pattern
gb_avr_year = gb_avr.groupby('year')
df_2011 = gb_avr_year.get_group(2011)
df_2012 = gb_avr_year.get_group(2012)
Out[206]:
In [259]:
# PCA
from sklearn.decomposition import PCA
def plot_exp_var_ratio(X,N):
"""Create a bar plot of explained variance ratio of N components from the PCA results using data X"""
pca = PCA(n_components=N)
pca.fit(X)
print 'var: ' + str((pca.explained_variance_ratio_))
print 'score: '+str((pca.score(X)))
Y = pca.transform(X)
plt.bar(range(N),pca.explained_variance_ratio_)
plt.xticks(np.array(range(N))+0.4,range(1,N+1))
plt.ylabel('Explained Var Ratio')
plt.xlabel('pc #')
plt.show()
return Y
In [264]:
X = df_2011.ix[:,2:32]
X_PC = plot_exp_var_ratio(X,N)
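N is not defined in the cell above; a minimal usage sketch, assuming we inspect the first 10 components (the value 10 is an arbitrary choice) and using the PCA import added in the function cell:

N = 10  # assumed number of principal components to inspect
X_PC = plot_exp_var_ratio(df_2011.ix[:,2:32], N)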