In [6]:
# Routine by Chelsea X. Huang
# Import the libraries (more than this notebook strictly needs)
import numpy as np
import scipy as sp
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import re
import sklearn
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, f_regression
from sklearn.ensemble import GradientBoostingRegressor
In [7]:
# Define a function that pre-processes the data
def processing(df):
    # One-hot encode 'City Group'; keep a single dummy to avoid redundancy
    dummies_df = pd.get_dummies(df['City Group'])

    def add_CG(name):
        return 'CG_' + name

    dummies_df = dummies_df.rename(columns=add_CG)
    df = pd.concat([df, dummies_df.iloc[:, 0]], axis=1)

    # One-hot encode 'Type'; keep only the first three dummy columns
    dummies_df = pd.get_dummies(df['Type'])

    def add_Type(name):
        return 'Type_' + name

    dummies_df = dummies_df.rename(columns=add_Type)
    df = pd.concat([df, dummies_df.iloc[:, 0:3]], axis=1)

    # Put in the restaurant's age (in days) as a column
    def add_Age(string):
        age = datetime.datetime.now() - datetime.datetime.strptime(string, "%m/%d/%Y")
        return age.days

    df['Age'] = df['Open Date'].map(add_Age)

    # Drop identifiers, the raw categorical columns, and the target
    df = df.drop(['Id', 'Open Date', 'City', 'City Group', 'Type', 'revenue'], axis=1)

    # RobustScaler scales the features to the same order of magnitude
    # (it centers on the median and scales by the IQR, so it is less
    # sensitive to outliers than StandardScaler)
    scaler = RobustScaler().fit(df)
    df = scaler.transform(df)
    return df
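To sanity-check the function, here is a minimal sketch on a hypothetical two-row frame. The column names match the competition schema, but the values are made up; the point is only to see the shape of what `processing()` returns.
In [ ]:
# Hypothetical toy frame, just to illustrate the output of processing()
toy = pd.DataFrame({
    "Id": [0, 1],
    "Open Date": ["01/15/2010", "07/04/2005"],
    "City": ["Istanbul", "Ankara"],
    "City Group": ["Big Cities", "Other"],
    "Type": ["FC", "IL"],
    "P1": [3.0, 5.0],
    "revenue": [1.0e6, 2.0e6],
})
# Returns a scaled numpy array: P1, one CG dummy, the Type dummies, and Age
print(processing(toy))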
In [8]:
# Read the input files and apply a log transformation to the revenue target
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
y = np.log(np.array(df_train["revenue"]))

# Concatenate train and test so both get identical pre-processing
df = pd.concat([df_train, df_test], ignore_index=True)
df = processing(df)

# processing() returns a numpy array; the first 137 rows are the training set
df_train = df[0:137, :]
df_test = df[137:, :]
X = df_train
X_test = df_test

# Define the regressor
model = GradientBoostingRegressor(learning_rate=0.1, max_depth=1, random_state=0, loss='huber')
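The learning rate and depth above are fixed by hand. A small grid search could tune them instead; a sketch (the grid values here are illustrative, not the ones behind the submission):
In [ ]:
# Sketch: tune the boosting hyper-parameters by 5-fold cross-validation
from sklearn.model_selection import GridSearchCV

param_grid = {
    "learning_rate": [0.05, 0.1, 0.2],
    "max_depth": [1, 2, 3],
    "n_estimators": [50, 100, 200],
}
search = GridSearchCV(GradientBoostingRegressor(random_state=0, loss='huber'),
                      param_grid, cv=5, scoring='neg_mean_squared_error')
search.fit(X, y)
print(search.best_params_)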
In [ ]:
# SelectPercentile scores each feature against the target; with percentile=100
# nothing is dropped yet, so we use its p-values to build our own mask below.
selector = SelectPercentile(f_regression, percentile=100)
selector.fit(X, y)
# Convert p-values to scores and keep features whose normalized score exceeds 0.1
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
feature_index_0 = scores > 0.1
X = X[:, feature_index_0]
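Since `processing()` returns a bare numpy array, the column names are gone at this point. A quick sketch to at least count and locate the surviving features (mapping indices back to names would require saving `df.columns` before scaling):
In [ ]:
# Report how many features survive the p-value cut, and their column indices
kept = np.flatnonzero(feature_index_0)
print("kept %d of %d features: %s" % (kept.size, feature_index_0.size, kept))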
In [ ]:
# Fit the model on the training set
model.set_params(n_estimators=100)
model.fit(X, y)

# Predict on the test set and create the submission file
X_test = X_test[:, feature_index_0]
y_pred = model.predict(X_test)
samplesubmit = pd.read_csv("data/template.csv")
# Undo the log transform applied to revenue
samplesubmit["Prediction"] = np.exp(y_pred)
samplesubmit.to_csv("GBR_advanced.csv", index=False)
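Before trusting the submission, a cross-validated error estimate on the log-scale target gives a quick sanity check. A sketch using the `cross_val_score` imported above:
In [ ]:
# Sketch: 5-fold RMSE of the chosen configuration on the log-revenue target
cv_mse = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print("CV RMSE (log scale): %.3f +/- %.3f" % (np.sqrt(cv_mse).mean(), np.sqrt(cv_mse).std()))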