final_df1

  • 별점분포를 그대로 사용한 피쳐

In [1]:
# Base table: read the CSV and discard the listed columns in a single expression.
# NOTE(review): this file is read from '../resource/' while every other path in
# the notebook uses './resource/' -- confirm the directory is intentional.
df = pd.read_csv('../resource/preprocess_df.csv').drop(
    ['title', 'director', 'actors', 'film_rate', 'genre', 'nation'],
    axis=1,
)
# Two further tables that later cells concatenate column-wise with `df`.
sparse_df = pd.read_csv('./resource/tfidf_df.csv')
expect_df = pd.read_csv('./resource/expect_df.csv')

In [2]:
# Prefix each rating-bucket column label ('0.5' ... '5') with 'star', then
# replace missing values with 3.
# Original author's note (translated): NaN values for Lee Dong-jin are imputed
# with Lee Dong-jin's average rating of 3.
# NOTE(review): fillna(3) is applied to every column of `df`, not only to the
# Lee Dong-jin rating column -- confirm no other column can contain NaN.
df = df.rename(columns={
    label: 'star' + label
    for label in ('0.5', '1', '1.5', '2', '2.5', '3', '3.5', '4', '4.5', '5')
}).fillna(3)

In [3]:
# Place the columns of `sparse_df` to the right of the columns of `df`.
# Rows are matched on index labels, so this presumes both frames share the
# same index -- TODO confirm the two CSVs have the same number of rows.
final_df1 = pd.concat((df, sparse_df), axis='columns')
# Preview the first rows as the cell output.
final_df1.head()


Out[3]:
rating(y) avg_rating lee_rating eval_count wish_count cmt_count run_time year star0.5 star1 ... 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509
0 3.5 4.22683 4.5 13025 9796 2585 128 2015 7 10 ... 0 0 0 0 0 0 0 0 0 0
1 2.5 2.99629 3.0 58122 3166 965 121 2013 1312 2238 ... 0 0 0 0 0 0 0 0 0 0
2 4.0 3.90119 3.0 66296 33565 1539 108 2013 228 316 ... 0 0 0 0 0 0 0 0 0 0
3 2.0 2.62241 3.0 67031 1079 712 104 2013 3615 4063 ... 0 0 0 0 0 0 0 0 0 0
4 3.0 3.31175 3.0 68174 9510 2439 119 2014 787 1612 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1528 columns


In [4]:
# Write final_df1 to CSV; index=False keeps the row index out of the file.
final_df1.to_csv(path_or_buf='./resource/final_df1.csv', index=False)

final_df2

  • 별점분포를 기대값(std, skew)로 표현한 피쳐

In [5]:
# Keep every row and all columns from the first one through 'year' (label
# slices with .loc include the end label). `.ix` was deprecated in pandas 0.20
# and removed in pandas 1.0; `.loc` is the label-based equivalent.
df1 = df.loc[:, :'year']

# Concatenate the base columns, the columns of `expect_df`, and the columns of
# `sparse_df` side by side, fill any remaining NaN with 3, then discard the
# 'mean' column.
# NOTE(review): fillna(3) here covers every column of the combined frame,
# including those coming from `expect_df` and `sparse_df` -- confirm that 3 is
# a sensible fill value for them.
final_df2 = (
    pd.concat([df1, expect_df, sparse_df], axis=1)
    .fillna(3)
    .drop('mean', axis=1)
)
# Preview the first rows as the cell output.
final_df2.head()


Out[5]:
rating(y) avg_rating lee_rating eval_count wish_count cmt_count run_time year std skew ... 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509
0 3.5 4.22683 4.5 13025 9796 2585 128 2015 0.625521 -1.002260 ... 0 0 0 0 0 0 0 0 0 0
1 2.5 2.99629 3.0 58122 3166 965 121 2013 1.059023 -0.347552 ... 0 0 0 0 0 0 0 0 0 0
2 4.0 3.90119 3.0 66296 33565 1539 108 2013 0.895549 -1.052706 ... 0 0 0 0 0 0 0 0 0 0
3 2.0 2.62241 3.0 67031 1079 712 104 2013 1.155476 -0.127879 ... 0 0 0 0 0 0 0 0 0 0
4 3.0 3.31175 3.0 68174 9510 2439 119 2014 1.016681 -0.452095 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1520 columns


In [6]:
# Write final_df2 to CSV; index=False keeps the row index out of the file.
final_df2.to_csv(path_or_buf='./resource/final_df2.csv', index=False)

In [ ]: