In [1]:
# Load every CSV export used by this analysis. Each file's first column is
# read back as the DataFrame index.
def _read_indexed_csv(path):
    """Read a CSV file, using its first column as the index."""
    return pd.read_csv(path, index_col=0)


wadiz_df_original = _read_indexed_csv('wadiz_df_0329_1.csv')
user_comment = _read_indexed_csv('user_data_all_0329.csv')
provider_comment = _read_indexed_csv('provider_data_all_0329.csv')
wadiz_df = _read_indexed_csv('wadiz_provider_analysis_0329.csv')
provider_comment_grammar = _read_indexed_csv('comment_analysis.csv')
In [2]:
# Compare the number of samples held by each DataFrame. The comment tables
# are counted by distinct project_id rather than by row.
user_project_count = user_comment['project_id'].nunique()
provider_project_count = provider_comment['project_id'].nunique()
grammar_project_count = provider_comment_grammar['project_id'].nunique()

print('Original DataFrame :', len(wadiz_df_original))
print('User comment :', user_project_count)
print('Provider comment :', provider_project_count)
print('Provider comment grammar check:', grammar_project_count)
print('Revised DataFrame :', len(wadiz_df))
In [3]:
# Drop rows whose grammar score is missing. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
wadiz_df = wadiz_df[wadiz_df['provider_grammar_level'].notnull()].copy()

# Convert date_duration from text to an integer by cutting off a fixed-width
# suffix and parsing what remains.
# NOTE(review): presumably the suffix is the " days hh:mm:ss.nnnnnnnnn" tail of
# a serialized timedelta -- confirm against the CSV.
DURATION_SUFFIX_LENGTH = 24

# The dtype guard keeps this cell re-runnable: once the column is numeric,
# slicing its values would raise a TypeError.
if not pd.api.types.is_numeric_dtype(wadiz_df['date_duration']):
    wadiz_df['date_duration'] = wadiz_df['date_duration'].apply(
        lambda text: int(text[:-DURATION_SUFFIX_LENGTH])
    )
In [4]:
# Kernel density estimate of funding_rate over all projects.
figure = plt.figure(figsize=(10,8));
# The curve is labelled explicitly so that plt.legend() below always has an
# entry to show, instead of depending on seaborn labelling it implicitly.
sns.kdeplot(wadiz_df['funding_rate'], label = 'funding_rate');
plt.xlim(-3, 10);
plt.xticks(fontsize=15);
plt.yticks(fontsize=15);
plt.legend(fontsize = 15);
plt.xlabel('funding_rate', fontsize=15);
plt.ylabel('distribution', fontsize = 15);
In [5]:
# Test whether each category's funding_rate distribution differs from the
# distribution over all samples.
# K-S : two-sample Kolmogorov-Smirnov test
# NOTE(review): [:-1] leaves out the last value returned by unique(); confirm
# that skipping that category is intended.
all_data = wadiz_df['funding_rate']  # loop-invariant, so built only once
for category_name in wadiz_df['category'].unique()[:-1]:
    category_data = wadiz_df.loc[wadiz_df['category'] == category_name, 'funding_rate']
    # Run the test once and reuse both outputs (index 0: statistic, 1: p-value).
    ks_result = sp.stats.ks_2samp(all_data, category_data)
    print('[all_sample vs {category_i}]'.format(category_i = category_name))
    print(' K-S statistic :', round(ks_result[0], 4))
    print(' p-value :', round(ks_result[1], 4))
In [6]:
# Number of projects recorded for each area value.
area_counts = wadiz_df['area'].value_counts()
print(area_counts)
In [7]:
# Analyze only Seoul and Gyeonggi ('kyungki'), the areas that account for the
# largest share of projects.
seoul_funding_rate = wadiz_df.loc[wadiz_df['area'] == 'seoul', 'funding_rate']
kyungki_funding_rate = wadiz_df.loc[wadiz_df['area'] == 'kyungki', 'funding_rate']

plt.figure(figsize=(10,8));
sns.kdeplot(seoul_funding_rate, label = 'seoul');
# Dashed line so the second curve stays distinguishable from the first.
sns.kdeplot(kyungki_funding_rate, label = 'kyungki', linestyle = '--');
plt.xlim(-2, 6);
plt.legend(fontsize = 15);
plt.xlabel('funding_rate', fontsize=15);
plt.ylabel('distribution', fontsize = 15);
In [8]:
# Compare the funding_rate samples of the two areas.
# First printed result  : two-sample Kolmogorov-Smirnov test
# Second printed result : two-sample t-test
seoul_dist = wadiz_df.loc[wadiz_df['area'] == 'seoul', 'funding_rate']
kyungki_dist = wadiz_df.loc[wadiz_df['area'] == 'kyungki', 'funding_rate']

print('seoul vs kyungki :')
for two_sample_test in (sp.stats.ks_2samp, sp.stats.ttest_ind):
    print(two_sample_test(seoul_dist, kyungki_dist))
In [35]:
# Kernel density estimates of date_duration for successful vs failed projects.
figure = plt.figure(figsize=(10,8))
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] == 1]['date_duration'], label = 'success')
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] == 0]['date_duration'], label = 'fail', c='r', linestyle='--')
plt.xlim(-50, 100);
# The x axis shows the date_duration column, so it is labelled with that name
# (the label previously read 'date_distribution').
plt.xlabel('date_duration', fontsize=15);
plt.ylabel('distribution', fontsize = 15);
plt.legend(fontsize = 15);
In [36]:
# Compare date_duration between successful and failed projects.
# First printed result  : two-sample Kolmogorov-Smirnov test
# Second printed result : two-sample t-test
success_duration = wadiz_df.loc[wadiz_df['success'] == 1, 'date_duration']
fail_duration = wadiz_df.loc[wadiz_df['success'] == 0, 'date_duration']

print('[success_duration vs fail_duration]')
for two_sample_test in (sp.stats.ks_2samp, sp.stats.ttest_ind):
    print(two_sample_test(success_duration, fail_duration))
In [11]:
# Kernel density estimates of target for successful vs failed projects.
figure = plt.figure(figsize=(10,8))
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] == 1]['target'], label = 'success');
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] == 0]['target'], label = 'fail');
plt.xticks(fontsize=15);
plt.yticks(fontsize=15);
plt.legend(fontsize = 15);
# The plotted column is 'target'; the label previously read 'duration'.
plt.xlabel('target', fontsize=15);
plt.ylabel('distribution', fontsize = 15);
In [12]:
# Split the target column into successful (success == 1) and failed
# (success == 0) projects for the tests that follow.
is_success = wadiz_df['success'] == 1
is_fail = wadiz_df['success'] == 0
success_target = wadiz_df.loc[is_success, 'target']
fail_target = wadiz_df.loc[is_fail, 'target']
In [13]:
# Normality test (target funding amount of the successful samples).
# The Shapiro-Wilk test is run once and both outputs are reused, so the work
# and any warning it emits are not duplicated.
success_target_shapiro = sp.stats.shapiro(success_target)
print('Shapiro test statistics :', success_target_shapiro[0])
print('Shapiro test p-value :', success_target_shapiro[1])
In [14]:
# Normality test (target funding amount of the failed samples).
# The Shapiro-Wilk test is run once and both outputs are reused, so the work
# and any warning it emits are not duplicated.
fail_target_shapiro = sp.stats.shapiro(fail_target)
print('Shapiro test statistics :', fail_target_shapiro[0])
print('Shapiro test p-value :', fail_target_shapiro[1])
두 샘플 모두 정규분포를 이루지 않아 (p-value < 0.05, 귀무가설 : 정규분포를 이룬다) t-test 대신 Mann-Whitney U test로 평균 차이 검정
In [15]:
# Distribution and location tests on target, successful vs failed projects.
# First printed result  : two-sample Kolmogorov-Smirnov test
# Second printed result : Mann-Whitney U test
print(sp.stats.ks_2samp(success_target, fail_target))
# The alternative is spelled out because SciPy's default has differed between
# releases; 'two-sided' is the hypothesis that matches "is there a difference".
print(sp.stats.mannwhitneyu(success_target, fail_target, alternative='two-sided'))
검정 결과 두 분포가 동일하다는 귀무가설은 기각되지 않았고 (p-value > 0.05), 평균 차이는 존재함 (p-value < 0.05)
In [16]:
# Print the mean and standard deviation of target for the successful and the
# failed projects, one line per group.
for mean_label, target_values in (
    ('성공 Project들의 target 평균 :', success_target),
    ('실패 Project들의 target 평균 :', fail_target),
):
    print(mean_label, np.mean(target_values),
          'Std :', np.std(target_values))
In [17]:
# Number of projects per month, printed under a heading line.
print('<월별 프로젝트 수>')
monthly_project_counts = wadiz_df['month'].value_counts()
print(monthly_project_counts)
In [18]:
# Kernel density estimates of month for successful vs failed projects.
plt.figure(figsize=(10,8))
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] ==1]['month'], label = 'success')
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] ==0]['month'], label = 'fail')
# range() excludes its stop value, so 13 is needed for the ticks to reach
# month 12 (range(1, 12) stopped at 11).
plt.xticks(range(1, 13), fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('month', fontsize=15)
plt.ylabel('distribution', fontsize = 15)
plt.legend(fontsize = 15)
print('<월별 성공/실패 분포>')
In [34]:
# Compare month between successful and failed projects.
# First printed result  : two-sample Kolmogorov-Smirnov test
# Second printed result : two-sample t-test
success_month = wadiz_df.loc[wadiz_df['success'] == 1, 'month']
fail_month = wadiz_df.loc[wadiz_df['success'] == 0, 'month']

print('[success_month vs fail_month]')
print('')
for two_sample_test in (sp.stats.ks_2samp, sp.stats.ttest_ind):
    print(two_sample_test(success_month, fail_month))
In [20]:
# Kernel density estimates of provider_grammar_level for successful vs failed
# projects.
success_grammar_level = wadiz_df.loc[wadiz_df['success'] == 1, 'provider_grammar_level']
fail_grammar_level = wadiz_df.loc[wadiz_df['success'] == 0, 'provider_grammar_level']

plt.figure(figsize=(10,8))
sns.kdeplot(success_grammar_level, label = 'success')
sns.kdeplot(fail_grammar_level, label = 'fail')
plt.xlabel('grammar_level', fontsize=15)
plt.ylabel('distribution', fontsize = 15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize = 15)
Out[20]:
In [21]:
# Add the natural log of provider_grammar_level as a new column. np.log is
# applied to the whole column at once instead of through a per-element lambda.
wadiz_df['log_grammar_level'] = np.log(wadiz_df['provider_grammar_level'])
In [22]:
# Log scaling: kernel density estimates of log_grammar_level for successful vs
# failed projects.
plt.figure(figsize=(10,8))
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] == 1]['log_grammar_level'], label = 'success')
sns.kdeplot(wadiz_df.loc[wadiz_df['success'] == 0]['log_grammar_level'], label = 'fail')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# The plotted column is the log-transformed one, so the axis says so; the
# label previously read 'grammar_level', the same as the untransformed plot.
plt.xlabel('log_grammar_level', fontsize=15)
plt.ylabel('distribution', fontsize = 15)
plt.legend(fontsize = 15)
Out[22]:
In [23]:
# Split provider_grammar_level into successful (success == 1) and failed
# (success == 0) projects for the tests that follow.
grammar_success_mask = wadiz_df['success'] == 1
grammar_fail_mask = wadiz_df['success'] == 0
success_grammar = wadiz_df.loc[grammar_success_mask, 'provider_grammar_level']
fail_grammar = wadiz_df.loc[grammar_fail_mask, 'provider_grammar_level']
In [24]:
# Split log_grammar_level into successful (success == 1) and failed
# (success == 0) projects for the tests that follow.
log_grammar_success_mask = wadiz_df['success'] == 1
log_grammar_fail_mask = wadiz_df['success'] == 0
success_log_grammar = wadiz_df.loc[log_grammar_success_mask, 'log_grammar_level']
fail_log_grammar = wadiz_df.loc[log_grammar_fail_mask, 'log_grammar_level']
In [25]:
# Normality test (grammar level of the successful samples).
# The Shapiro-Wilk test is run once and both outputs are reused, so the work
# and any warning it emits are not duplicated.
success_grammar_shapiro = sp.stats.shapiro(success_grammar)
print('Shapiro test statistics :', success_grammar_shapiro[0])
print('Shapiro test p-value :', success_grammar_shapiro[1])
In [26]:
# Normality test (grammar level of the failed samples).
# The Shapiro-Wilk test is run once and both outputs are reused, so the work
# and any warning it emits are not duplicated.
fail_grammar_shapiro = sp.stats.shapiro(fail_grammar)
print('Shapiro test statistics :', fail_grammar_shapiro[0])
print('Shapiro test p-value :', fail_grammar_shapiro[1])
In [27]:
# Compare provider_grammar_level between successful and failed projects.
# First printed result  : two-sample Kolmogorov-Smirnov test
# Second printed result : Mann-Whitney U test
print(sp.stats.ks_2samp(success_grammar, fail_grammar))
# The alternative is spelled out because SciPy's default has differed between
# releases; 'two-sided' is the hypothesis that matches "is there a difference".
print(sp.stats.mannwhitneyu(success_grammar, fail_grammar, alternative='two-sided'))
grammar_level을 변형시키지 않고 분석하면 정규분포가 성립하지 않음. Mann-Whitney U test를 실시하면 평균 차이가 있다는 결과가 나오고, 성공/실패 샘플의 두 분포는 서로 다른 분포임
In [28]:
# Normality test (log grammar level of the successful samples).
# The sample tested here is success_log_grammar; the header comment previously
# named the failed samples.
# The Shapiro-Wilk test is run once and both outputs are reused.
success_log_grammar_shapiro = sp.stats.shapiro(success_log_grammar)
print('Shapiro test statistics :', success_log_grammar_shapiro[0])
print('Shapiro test p-value :', success_log_grammar_shapiro[1])
In [29]:
# Normality test (log grammar level of the failed samples).
# The sample tested here is fail_log_grammar; the header comment previously
# named the successful samples.
# The Shapiro-Wilk test is run once and both outputs are reused.
fail_log_grammar_shapiro = sp.stats.shapiro(fail_log_grammar)
print('Shapiro test statistics :', fail_log_grammar_shapiro[0])
print('Shapiro test p-value :', fail_log_grammar_shapiro[1])
In [30]:
# Compare log_grammar_level between successful and failed projects.
# First printed result  : two-sample Kolmogorov-Smirnov test
# Second printed result : Mann-Whitney U test
print(sp.stats.ks_2samp(success_log_grammar, fail_log_grammar))
# The alternative is spelled out because SciPy's default has differed between
# releases; 'two-sided' is the hypothesis that matches "is there a difference".
print(sp.stats.mannwhitneyu(success_log_grammar, fail_log_grammar, alternative='two-sided'))
In [ ]:
In [ ]: