In [2]:
import pandas as pd
from scipy import stats
import numpy as np # linear algebra
import seaborn as sns
import matplotlib.pyplot as plt
color = sns.color_palette()
%matplotlib inline
In [3]:
# Read the properties table, pinning the dtype of the five columns listed
# below; every other column keeps the dtype pandas infers.
p = pd.read_csv(
    'D:\\data\\properties_2016.csv',
    dtype={
        'hashottuborspa': 'bool',
        'propertycountylandusecode': 'object',
        'propertyzoningdesc': 'object',
        'fireplaceflag': 'bool',
        'taxdelinquencyflag': 'object',
    },
)
p.head()
Out[3]:
In [4]:
# Show summary statistics for the properties table.
p.describe()
Out[4]:
In [5]:
# (rows, columns) of the properties table.
p.shape
Out[5]:
In [6]:
# Load the training table; later cells read its parcelid, logerror and
# transactiondate columns.
train=pd.read_csv('D:\\data\\train_2016.csv')
train.head()
Out[6]:
In [7]:
# (rows, columns) of the training table.
train.shape
Out[7]:
In [8]:
# Parse the transaction date (values that cannot be parsed become NaT because
# of errors='coerce') and derive the calendar month from the parsed column.
train = train.assign(
    transactiondate=lambda frame: pd.to_datetime(frame['transactiondate'], errors='coerce'),
    transaction_month=lambda frame: frame['transactiondate'].dt.month,
)
In [67]:
# Left-join the property features onto every training row by parcelid.
train_df = train.merge(p, how='left', on='parcelid')
train_df.head()
Out[67]:
In [53]:
# Fraction of rows (in frame order) used for fitting; the remaining rows form
# the hold-out set. The previous multiplier of 1 left the hold-out empty, so
# the prediction, preview and rmsle cells further down had no rows to work on.
# Set TRAIN_FRACTION = 1 to fit on every row (the hold-out is then empty again).
TRAIN_FRACTION = 0.8
size = int(train_df.shape[0] * TRAIN_FRACTION)

# Targets for the two partitions, split at the same row position as the features.
logerror_values = train_df['logerror'].values
y_train_df = logerror_values[:size]
y_test_df = logerror_values[size:]
In [54]:
# Remove the identifier, the target, the raw date and the two free-text columns
# so that only model features remain.
train_df = train_df.drop(
    ['parcelid', 'logerror', 'transactiondate', "propertycountylandusecode", "propertyzoningdesc"],
    axis=1,
)

# Encode the three flag columns as 1 (flag set) / 0 (anything else, including
# missing). The earlier version used the removed `.ix` indexer through chained
# assignment and discarded the result of `fillna(0)`, so missing values were
# never replaced.
cat_cols = ["hashottuborspa", "fireplaceflag", "taxdelinquencyflag"]
train_df['hashottuborspa'] = train_df['hashottuborspa'].eq(True).astype(int)
train_df['fireplaceflag'] = train_df['fireplaceflag'].eq(True).astype(int)
train_df['taxdelinquencyflag'] = train_df['taxdelinquencyflag'].eq('Y').astype(int)
for col in cat_cols:
    train_df[col] = train_df[col].astype('category')

# Impute missing numeric values with the column mean. numeric_only=True keeps
# the category columns out of the mean computation.
mean_values = train_df.mean(axis=0, numeric_only=True)
train_df = train_df.fillna(mean_values)
# Kept so the name stays defined; it previously received None from the
# in-place fillna call.
train_df_new = train_df

x_test_df = train_df[size:]
x_train_df = train_df[0:size]
In [55]:
# Remove the three flag columns from the training feature frame.
cat_cols = ["hashottuborspa", "fireplaceflag", "taxdelinquencyflag"]
train_df = train_df.drop(columns=cat_cols)
In [56]:
# Positional split of the feature frame at row `size`: the first `size` rows
# are used for fitting, the rest for evaluation.
x_train_df = train_df.iloc[:size]
x_test_df = train_df.iloc[size:]
In [65]:
from sklearn import ensemble

# random_state pins the forest's randomness so a re-run reproduces the same
# model; n_jobs=-1 uses all available cores instead of a fixed count of 17.
model = ensemble.RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(x_train_df, y_train_df)
Out[65]:
In [58]:
# Predict the target for the hold-out feature rows.
# NOTE(review): the name `re` would shadow the standard-library `re` module if
# that module were ever imported in this notebook.
re = model.predict(x_test_df)
In [60]:
# Print up to the first ten (target, prediction) pairs. Slicing instead of
# indexing 0..9 avoids an IndexError when the hold-out has fewer than ten rows.
for target, predicted in zip(y_test_df[:10], re[:10]):
    print('target:', target, 'predict:', predicted)
In [61]:
def rmsle(y, y_, convertExp=True):
    """Root mean squared logarithmic error between two value sequences.

    Parameters
    ----------
    y : array-like
        Reference values.
    y_ : array-like
        Predicted values, broadcast-compatible with ``y``.
    convertExp : bool, default True
        When true, both inputs are passed through ``np.exp`` before the
        ``log(1 + v)`` transform.

    Returns
    -------
    float
        ``sqrt(mean((log(1 + y) - log(1 + y_)) ** 2))``. Non-finite logarithms
        are replaced by ``np.nan_to_num`` before the difference is taken.
    """
    y = np.asarray(y, dtype=float)
    y_ = np.asarray(y_, dtype=float)
    if convertExp:
        # No trailing comma here: the earlier `y = np.exp(y),` wrapped the
        # array in a one-element tuple.
        y = np.exp(y)
        y_ = np.exp(y_)
    log1 = np.nan_to_num(np.log1p(y))
    log2 = np.nan_to_num(np.log1p(y_))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))
In [62]:
# Score the hold-out predictions against the hold-out targets.
rmsle(y_test_df, re)
Out[62]:
In [70]:
# Show the properties row(s) for one parcel id. `.loc` replaces the removed
# `.ix` indexer; with a boolean mask the two select the same rows.
p.loc[p.parcelid == 10754147]
Out[70]:
In [134]:
# Load the sample submission file; later cells read its ParcelId column and
# drop its six month columns.
sub_df=pd.read_csv('D:\\data\\sample_submission.csv')
sub_df.head()
Out[134]:
In [137]:
# Copy the submission id under the lower-case name used as the join key, then
# left-join the property features onto every submission row.
sub_df['parcelid'] = sub_df['ParcelId']
test = sub_df.merge(p, how='left', on='parcelid')
test.head()
Out[137]:
In [135]:
# (rows, columns) of the sample submission frame.
sub_df.shape
Out[135]:
In [138]:
# Column names of the six prediction periods in the submission file; they are
# placeholders in `test` and are removed together with the id and text columns.
drop_month = ['201610', '201611', '201612', '201710', '201711', '201712']
test = test.drop(
    ['parcelid', "propertycountylandusecode", "propertyzoningdesc"] + drop_month,
    axis=1,
)

# Encode the three flag columns as 1 (flag set) / 0 (anything else, including
# missing). The earlier version used the removed `.ix` indexer through chained
# assignment and discarded the result of `fillna(0)`.
cat_cols = ["hashottuborspa", "fireplaceflag", "taxdelinquencyflag"]
test['hashottuborspa'] = test['hashottuborspa'].eq(True).astype(int)
test['fireplaceflag'] = test['fireplaceflag'].eq(True).astype(int)
test['taxdelinquencyflag'] = test['taxdelinquencyflag'].eq('Y').astype(int)
for col in cat_cols:
    test[col] = test[col].astype('category')
In [139]:
# Impute missing numeric values with the column mean of `test` itself.
# numeric_only=True keeps the category columns out of the mean computation.
# NOTE(review): the training frame is imputed with its own means, so the two
# frames use different fill values; confirm this is intended.
mean = test.mean(axis=0, numeric_only=True)
test = test.fillna(mean)
test.shape
Out[139]:
In [140]:
# Remove the flag columns listed in cat_cols from the test feature frame.
test = test.drop(columns=cat_cols)
In [147]:
# Remove the submission id column so only feature columns remain.
test = test.drop(columns=['ParcelId'])
test.head()
Out[147]:
In [148]:
# One prediction pass per submission period, with transaction_month set to the
# value for that period.
# NOTE(review): 22-24 presumably stand for Oct-Dec 2017 counted from Jan 2016,
# but the training months come from `.dt.month` (1-12) - confirm the encoding.
month_pridict = [10, 11, 12, 22, 23, 24]
result = []
for m in month_pridict:
    test['transaction_month'] = m
    # Reorder to the training column order before predicting. In the training
    # frame transaction_month sits ahead of the property columns (it comes from
    # the left side of the merge), whereas here it is appended last, so passing
    # `test` unchanged would feed the model misaligned features.
    re = model.predict(test[x_train_df.columns])
    result.append(re)
In [158]:
# Re-read the sample submission and overwrite each period column with the
# prediction array stored at the same position in `result`.
sub = pd.read_csv('D:\\data\\sample_submission.csv')
for i, month_col in enumerate(drop_month):
    sub[month_col] = result[i]
In [153]:
#out=pd.DataFrame(X,columns=drop_month, index=sub_df['parcelid'].values)
#out.head()
#out.to_csv("D:\\data\\1.csv")
Out[153]:
In [161]:
# Write the submission without the index column, formatting floats to four
# decimal places.
sub.to_csv("D:\\data\\1.csv", index=False, float_format='%.4f')
In [ ]: