In [1]:
# Third-party imports for the whole notebook.
# Fixes: removed duplicate KNeighborsClassifier/GaussianNB imports and added
# the pandas/numpy/matplotlib imports that later cells rely on (pd, np, plt).
# NOTE(review): sklearn.cross_validation and sklearn.grid_search are the
# pre-0.18 module names; modern scikit-learn moved both into
# sklearn.model_selection (with a changed KFold/StratifiedKFold API).
import numpy as np                # np.std / np.arange / np.mean below
import pandas as pd               # all data-loading and frame-building cells
import matplotlib.pyplot as plt   # all plotting cells

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

import statsmodels.api as sm
import patsy
In [2]:
# Load the scraped Wadiz data sets (CSV snapshots dated 03-29); the first
# CSV column holds the row index in every file.
def _load_csv(name):
    # Small helper so every file is read with identical settings.
    return pd.read_csv(name, index_col=0)

wadiz_df_original = _load_csv('wadiz_df_0329_1.csv')
user_comment = _load_csv('user_data_all_0329.csv')
provider_comment = _load_csv('provider_data_all_0329.csv')
wadiz_df = _load_csv('wadiz_provider_analysis_0329.csv')
provider_comment_grammar = _load_csv('comment_analysis.csv')
In [3]:
# Drop rows where the grammar score is missing.
wadiz_df = wadiz_df[wadiz_df['provider_grammar_level'].notnull()]
# Parse the duration column: strip the trailing 24 characters
# (presumably a " days 00:00:00.000000000" timedelta suffix — TODO confirm)
# and keep the integer day count.
wadiz_df['date_duration'] = wadiz_df['date_duration'].str[:-24].astype(int)
The Category and Area variables were one-hot encoded (one dummy column per level), producing the `category_label_*` columns used below.
In [4]:
# Assemble the static-feature design matrix: funding target, campaign
# duration, the one-hot category/area dummies, and the grammar score.
# Plain column selection replaces the original pd.DataFrame([Series, ...]).T
# idiom, which stacks the Series row-wise and can upcast dtypes.
static_feature_cols = [
    'target', 'date_duration',
    # category dummies
    'category_label_art/photo/exhibit', 'category_label_book/movie',
    'category_label_education', 'category_label_environment',
    'category_label_figure/webtoon', 'category_label_game/comics',
    'category_label_life/fashion', 'category_label_music/concert',
    'category_label_share/public', 'category_label_sports',
    'category_label_tech/design', 'category_label_travel',
    # area dummies ("deagu"/"deajeon" spellings match the CSV headers)
    'category_label_busan', 'category_label_chungbuk',
    'category_label_chungnam', 'category_label_deagu',
    'category_label_deajeon', 'category_label_gwangju',
    'category_label_incheon', 'category_label_jeju',
    'category_label_jeonbuk', 'category_label_jeonnam',
    'category_label_kangwon', 'category_label_kyungbuk',
    'category_label_kyungki', 'category_label_kyungnam',
    'category_label_sejong', 'category_label_seoul',
    'category_label_ulsan',
    'provider_grammar_level',
]
x_static = wadiz_df[static_feature_cols]
y = wadiz_df['result']
In [5]:
static_reg_model = sm.OLS(y, x_static).fit()
In [6]:
static_reg_model.summary()
Out[6]:
In [7]:
# Drop the category/area dummies: keep only target, duration and the grammar
# score.  (Column selection replaces the pd.DataFrame([...]).T idiom.)
x_static_2 = wadiz_df[['target', 'date_duration', 'provider_grammar_level']]
static_reg_model_2 = sm.OLS(y, x_static_2).fit()
static_reg_model_2.summary()
Out[7]:
샘플의 개수에 비해 카테고리 더미 변수가 너무 많아, 이를 제거하자 R^2 값이 올라감. 하지만 grammar_level의 p-value > 0.05 이므로 95% 신뢰수준에서 grammar_level 계수는 통계적으로 유의하지 않음 (귀무가설을 기각하지 못함).
In [8]:
# Drop grammar_level as well (not significant at the 5% level in the
# previous fit).  Column selection replaces the pd.DataFrame([...]).T idiom.
x_static_3 = wadiz_df[['target', 'date_duration']]
static_reg_model_3 = sm.OLS(y, x_static_3).fit()
static_reg_model_3.summary()
Out[8]:
R^2 값은 조금 떨어지지만 target과 date_duration은 result값 예측 모형에 영향을 미침
In [9]:
# Univariate model: date_duration only.
# (Column selection replaces the pd.DataFrame([...]).T idiom.)
x_static_4 = wadiz_df[['date_duration']]
static_reg_model_4 = sm.OLS(y, x_static_4).fit()
static_reg_model_4.summary()
Out[9]:
In [10]:
# Univariate model: target only.
# Fix: the original comment was copy-pasted and wrongly said date_duration.
# (Column selection replaces the pd.DataFrame([...]).T idiom.)
x_static_5 = wadiz_df[['target']]
static_reg_model_5 = sm.OLS(y, x_static_5).fit()
static_reg_model_5.summary()
Out[10]:
In [11]:
# Influence plot (leverage vs. studentized residuals) for the
# target + duration model, to spot high-influence observations.
fig, ax = plt.subplots(figsize=(10, 8))
sm.graphics.influence_plot(static_reg_model_3, ax=ax, plot_alpha=0.3)
plt.show()
In [12]:
outlier_index = static_reg_model_3.outlier_test()[static_reg_model_3.outlier_test().ix[:, -1].abs() < 0.01].index
In [13]:
# Show the outlier-test rows flagged at the 0.01 level.
# Fixes: single outlier_test() call; .iloc replaces the deprecated .ix.
outlier_test_result = static_reg_model_3.outlier_test()
outlier_test_result[outlier_test_result.iloc[:, -1] < 0.01]
Out[13]:
In [14]:
# Split the projects into the flagged outliers and the remainder.
wadiz_outlier = wadiz_df.loc[outlier_index]
wadiz_no_outlier = wadiz_df.drop(outlier_index)
In [15]:
# Inspect the outlier projects (first six columns only).
wadiz_outlier.iloc[:, :6]
Out[15]:
In [16]:
wadiz_outlier['funding_rate']
Out[16]:
outlier들은 funding_rate값이 너무 높거나 너무 낮은 projects들임
In [17]:
# Refit the target + duration OLS model with the outliers removed.
# (Column selection replaces the pd.DataFrame([...]).T idiom.)
x_static_3_no_outlier = wadiz_no_outlier[['target', 'date_duration']]
y_no_outlier = wadiz_no_outlier['result']
no_outlier_model = sm.OLS(y_no_outlier, x_static_3_no_outlier).fit()
no_outlier_model.summary()
Out[17]:
In [18]:
reg_predict = no_outlier_model.predict(x_static_3_no_outlier)
In [19]:
# Predicted vs. actual result, plotted against the funding target.
# Fix: the original passed the whole two-column design matrix as x, which
# also scattered date_duration on an axis labeled 'Target'; plot only the
# target column.
plt.figure(figsize=(15, 10));
plt.plot(x_static_3_no_outlier['target'], reg_predict, 'v', markersize=10,
         markeredgewidth=1, markeredgecolor='r', markerfacecolor='None',
         label='prediction');
plt.plot(x_static_3_no_outlier['target'], y_no_outlier, 'o', markersize=10,
         markeredgewidth=1, markeredgecolor='g', markerfacecolor='None',
         label='real');
plt.legend(fontsize=20);
plt.xlabel('Target', fontsize=20);
plt.ylabel('Result', fontsize=20);
plt.xlim(-5000000, 42000000);
plt.ylim(-5000000, 40000000);
In [ ]:
In [ ]:
In [20]:
# Classification features (column selection replaces pd.DataFrame([...]).T)
# and the binary label.
# NOTE(review): `y` is rebound here from the regression target 'result' to
# the 'success' label used by every classifier below.
x_classification = wadiz_no_outlier[['target', 'date_duration', 'provider_grammar_level']]
y = wadiz_no_outlier['success']
In [21]:
re = RandomForestClassifier()
In [22]:
# 10-fold CV accuracy as a function of the number of trees (1..19).
# NOTE(review): no random_state is set, so the scores vary between runs.
x_re_list_1 = list(range(1, 20))
y_re_list_1 = [
    cross_val_score(RandomForestClassifier(n_estimators=n_trees),
                    x_classification, y, cv=10).mean()
    for n_trees in x_re_list_1
]
In [23]:
# Plot the accuracy curve against the majority-class ("always success")
# baseline.  Fix: compute the success ratio once (the original recomputed
# y.value_counts()[1] / len(y) inside the print call).
success_ratio = y.value_counts()[1] / len(y)
base_success_rate = round(success_ratio, 2)
figure = plt.figure(figsize=(10, 8))
plt.plot(x_re_list_1, y_re_list_1, 'o--', c='r', label='Accuracy')
plt.axhline(base_success_rate, ls='--', label='base_success_rate')
plt.legend(fontsize=15)
plt.xlabel('n_estimator', fontsize=15)
plt.ylabel('accuracy', fontsize=15)
print('base_success_rate :', round(success_ratio * 100, 2), '%')
print('max_accuracy :', round(max(y_re_list_1) * 100, 2), '%')
In [24]:
from sklearn.grid_search import GridSearchCV
In [25]:
# Helper that pretty-prints the top grid-search results.
from operator import itemgetter
def report(grid_scores, n_top=3):
    """Print the n_top highest-scoring grid-search configurations.

    grid_scores: iterable of records shaped like sklearn's old
        _CVScoreTuple: (parameters, mean_validation_score,
        cv_validation_scores) — field 1 is the mean score.
    n_top: number of best configurations to display.
    """
    ranked = sorted(grid_scores, key=lambda rec: rec[1], reverse=True)
    for rank, record in enumerate(ranked[:n_top], start=1):
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            record.mean_validation_score,
            np.std(record.cv_validation_scores)))
        print("Parameters: {0}".format(record.parameters))
        print("")
In [26]:
# Hyperparameter grid for the random forest.
# NOTE(review): min_samples_split=1 and the grid_scores_ attribute below are
# only valid in old scikit-learn (modern versions require min_samples_split
# >= 2 and expose cv_results_ instead) — consistent with the deprecated
# sklearn.grid_search import used here.
param_grid = {"max_depth": [5, 10, None],
"max_features": [1, 2, None],
"min_samples_split": [1, 3, 5],
"min_samples_leaf": [1, 3, 5, 10],
"n_estimators" : np.arange(3, 20)}
# run the grid search over the baseline forest `re` and report the top configs
grid_search = GridSearchCV(re, param_grid=param_grid)
grid_search.fit(x_classification, y)
report(grid_search.grid_scores_)
In [27]:
best_re = RandomForestClassifier(max_features=2, min_samples_split=1, n_estimators=15, max_depth=5, min_samples_leaf=10)
In [28]:
Stkfold = StratifiedKFold(y, n_folds=10)
In [29]:
# Cross-validated performance of the tuned forest.
# Fixes: run the (expensive) 10-fold cross-validation once and reuse the
# scores — the original ran it twice — and drop the stray trailing comma
# left over after the first print call.
best_re_scores = cross_val_score(best_re, x_classification, y, cv=Stkfold)
print('Mean of Score :', np.mean(best_re_scores))
print('Std of Score :', np.std(best_re_scores))
In [30]:
best_re.fit(x_classification, y)
Out[30]:
In [31]:
# Per-feature importances of the fitted forest, as a one-column table
# indexed by feature name.
feature_importance = pd.DataFrame(
    {'Importance': best_re.feature_importances_},
    index=x_classification.columns)
feature_importance
Out[31]:
In [32]:
knn = KNeighborsClassifier()
In [33]:
knn_model = knn.fit(x_classification, y)
In [34]:
Stkfold = StratifiedKFold(y, n_folds=10)
In [35]:
# KNN accuracy with 10-fold CV on all three features.
# Fixes: run cross-validation once and reuse the scores (the original ran
# it twice); drop the stray trailing comma after the first print.
knn_model = knn.fit(x_classification, y)
knn_scores = cross_val_score(knn_model, x_classification, y, cv=10)
knn_best_score_mean = knn_scores.mean()
knn_best_score_std = knn_scores.std()
print('KNN Best Score :', knn_best_score_mean)
print('KNN Best Score(Standard Error) :', knn_best_score_std)
예상보다 너무 낮은 결과가 나옴.
무엇인가가 잘못되었다는 것을 의미.
feature들을 재선택하여 분석
In [36]:
# Feature subset 1: drop `target`.
# Fixes: column selection instead of pd.DataFrame([...]).T; run CV once and
# reuse the scores; drop the stray trailing comma.
x_knn_1 = wadiz_no_outlier[['date_duration', 'provider_grammar_level']]
y_knn_1 = wadiz_no_outlier['success']
knn_model_1 = knn.fit(x_knn_1, y_knn_1)
knn_scores_1 = cross_val_score(knn_model_1, x_knn_1, y_knn_1, cv=10)
knn_best_score_mean_1 = knn_scores_1.mean()
knn_best_score_std_1 = knn_scores_1.std()
print('KNN Best Score :', knn_best_score_mean_1)
print('KNN Best Score(Standard Error) :', knn_best_score_std_1)
In [37]:
# Feature subset 2: drop `provider_grammar_level`.
# Fixes: column selection instead of pd.DataFrame([...]).T; run CV once and
# reuse the scores; drop the stray trailing comma.
x_knn_2 = wadiz_no_outlier[['target', 'date_duration']]
y_knn_2 = wadiz_no_outlier['success']
knn_model_2 = knn.fit(x_knn_2, y_knn_2)
knn_scores_2 = cross_val_score(knn_model_2, x_knn_2, y_knn_2, cv=10)
knn_best_score_mean_2 = knn_scores_2.mean()
knn_best_score_std_2 = knn_scores_2.std()
print('KNN Best Score :', knn_best_score_mean_2)
print('KNN Best Score(Standard Error) :', knn_best_score_std_2)
In [38]:
# Feature subset 3: drop `date_duration`.
# Fixes: column selection instead of pd.DataFrame([...]).T; run CV once and
# reuse the scores; drop the stray trailing comma.
x_knn_3 = wadiz_no_outlier[['target', 'provider_grammar_level']]
y_knn_3 = wadiz_no_outlier['success']
knn_model_3 = knn.fit(x_knn_3, y_knn_3)
knn_scores_3 = cross_val_score(knn_model_3, x_knn_3, y_knn_3, cv=10)
knn_best_score_mean_3 = knn_scores_3.mean()
knn_best_score_std_3 = knn_scores_3.std()
print('KNN Best Score :', knn_best_score_mean_3)
print('KNN Best Score(Standard Error) :', knn_best_score_std_3)
In [39]:
gnb = GaussianNB()
In [40]:
# GaussianNB accuracy with stratified 10-fold CV.
# Fix: run cross-validation once and derive both statistics from the same
# scores (the original ran the full CV twice).
gnb_scores = cross_val_score(gnb, x_classification, y, cv=Stkfold)
gnb_best_score_mean = gnb_scores.mean()
gnb_best_score_std = gnb_scores.std()
In [41]:
# Report the GaussianNB scores.
# Fix: drop the stray trailing comma after the first print (a Python-2
# leftover that builds a useless (None,) tuple).
print('GNB Best Score :', gnb_best_score_mean)
print('GNB Best Score(Standard Error) :', gnb_best_score_std)
샘플의 수가 작고(588개) 사용가능한 feature의 수도 한정적인 것이 낮은 성능의 가장 큰 이유임.
Project가 시작되기전 갖는 정보들로는 성공예측이 어려움.
In [ ]: