final_df1

별점분포를 그대로 사용한 피쳐



In [1]:

    
# Imported in this cell because no earlier import cell is visible in the notebook;
# without it a fresh kernel (Restart & Run All) fails with NameError on `pd`.
import pandas as pd

# Load the preprocessed table and drop six columns right away.
# The drop is chained (not in-place) so re-running this cell always yields the same `df`.
df = pd.read_csv('../resource/preprocess_df.csv').drop(
    columns=['title', 'director', 'actors', 'film_rate', 'genre', 'nation']
)

# NOTE(review): the file above is read from '../resource' while the two below are
# read from './resource' — confirm that both directories are intended.
# Presumably TF-IDF features, judging by the file name — TODO confirm.
sparse_df = pd.read_csv('./resource/tfidf_df.csv')
expect_df = pd.read_csv('./resource/expect_df.csv')



In [2]:

    
# Prefix the rating-bucket columns ('0.5' ... '5') with 'star', then fill missing
# values with 3. Per the original author's note, the fill targets missing
# Lee Dong-jin ratings, imputed with his average rating of 3 stars.
# NOTE(review): fillna(3) is applied to the whole frame, so a NaN in any other
# column is also replaced by 3 — confirm that this is intended.
df = df.rename(
    columns={
        bucket: 'star' + bucket
        for bucket in ('0.5', '1', '1.5', '2', '2.5', '3', '3.5', '4', '4.5', '5')
    }
).fillna(3)



In [3]:

    
# Feature set 1: place the columns of `sparse_df` to the right of `df`.
# Rows are aligned on the index of each frame.
final_df1 = pd.concat(
    [df, sparse_df],
    axis='columns',
)
# Preview the first rows as the cell output.
final_df1.head()









    Out[3]:






  
    
      
      rating(y)
      avg_rating
      lee_rating
      eval_count
      wish_count
      cmt_count
      run_time
      year
      star0.5
      star1
      ...
      1500
      1501
      1502
      1503
      1504
      1505
      1506
      1507
      1508
      1509
    
  
  
    
      0
      3.5
      4.22683
      4.5
      13025
      9796
      2585
      128
      2015
      7
      10
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      2.5
      2.99629
      3.0
      58122
      3166
      965
      121
      2013
      1312
      2238
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      4.0
      3.90119
      3.0
      66296
      33565
      1539
      108
      2013
      228
      316
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      2.0
      2.62241
      3.0
      67031
      1079
      712
      104
      2013
      3615
      4063
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      3.0
      3.31175
      3.0
      68174
      9510
      2439
      119
      2014
      787
      1612
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1528 columns



In [4]:

    
# Write feature set 1 to disk; the row index is not written.
final_df1.to_csv(
    path_or_buf='./resource/final_df1.csv',
    index=False,
)

final_df2

별점분포를 기대값(std, skew)으로 표현한 피쳐



In [5]:

    
# Keep every column from the first one up to and including 'year' (label slice).
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the same
# label-based, end-inclusive column slice.
df1 = df.loc[:, :'year']

# Feature set 2: join `df1`, `expect_df` and `sparse_df` column-wise, replace any
# NaN with 3, and drop the 'mean' column. Chained instead of in-place so that
# re-running the cell cannot fail on an already-dropped column.
# NOTE(review): fillna(3) also applies to the columns coming from `expect_df`
# and `sparse_df` — confirm that 3 is a sensible fill value there.
final_df2 = (
    pd.concat([df1, expect_df, sparse_df], axis=1)
    .fillna(3)
    .drop(columns='mean')
)
# Preview the first rows as the cell output.
final_df2.head()









    Out[5]:






  
    
      
      rating(y)
      avg_rating
      lee_rating
      eval_count
      wish_count
      cmt_count
      run_time
      year
      std
      skew
      ...
      1500
      1501
      1502
      1503
      1504
      1505
      1506
      1507
      1508
      1509
    
  
  
    
      0
      3.5
      4.22683
      4.5
      13025
      9796
      2585
      128
      2015
      0.625521
      -1.002260
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      2.5
      2.99629
      3.0
      58122
      3166
      965
      121
      2013
      1.059023
      -0.347552
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      4.0
      3.90119
      3.0
      66296
      33565
      1539
      108
      2013
      0.895549
      -1.052706
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      2.0
      2.62241
      3.0
      67031
      1079
      712
      104
      2013
      1.155476
      -0.127879
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      3.0
      3.31175
      3.0
      68174
      9510
      2439
      119
      2014
      1.016681
      -0.452095
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1520 columns



In [6]:

    
# Write feature set 2 to disk; the row index is not written.
final_df2.to_csv(
    path_or_buf='./resource/final_df2.csv',
    index=False,
)



In [ ]:

	rating(y)	avg_rating	lee_rating	eval_count	wish_count	cmt_count	run_time	year	star0.5	star1	...
0	3.5	4.22683	4.5	13025	9796	2585	128	2015	7	10	...
1	2.5	2.99629	3.0	58122	3166	965	121	2013	1312	2238	...
2	4.0	3.90119	3.0	66296	33565	1539	108	2013	228	316	...
3	2.0	2.62241	3.0	67031	1079	712	104	2013	3615	4063	...
4	3.0	3.31175	3.0	68174	9510	2439	119	2014	787	1612	...