In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from six.moves.urllib.request import urlopen
import tensorflow as tf
import numpy as np
import pandas as pd
In [2]:
########################################
## load the data
########################################
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
song = pd.read_csv('./input/songs.csv')
song_extra = pd.read_csv('./input/song_extra_info.csv')
member = pd.read_csv('./input/members.csv')
In [3]:
tf.__version__
Out[3]:
In [4]:
from sklearn.preprocessing import LabelEncoder
def encode_str(train_data, test_data):
    # Fit one encoder on the union of train and test values so both
    # splits share a single, consistent integer mapping.
    data_encoder = LabelEncoder()
    data_encoder.fit(pd.concat([train_data, test_data]))
    t_train_data = data_encoder.transform(train_data)
    t_test_data = data_encoder.transform(test_data)
    return t_train_data, t_test_data

def generate_encoded_data(data_raw, data_test_raw):
    # Encode both splits and return the vocabulary size (max id + 1),
    # which is what identity feature columns need as num_buckets.
    data, data_test = encode_str(data_raw, data_test_raw)
    data_cnt = int(max(data.max(), data_test.max()) + 1)
    return data, data_test, data_cnt
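A quick, hypothetical smoke test of encode_str on toy Series (not part of the original run), showing that both splits end up in one shared integer mapping:

In [ ]:
# Hypothetical smoke test for encode_str on toy data.
a = pd.Series(['x', 'y', 'x'])
b = pd.Series(['y', 'z'])
enc_a, enc_b = encode_str(a, b)
print(enc_a)  # [0 1 0]
print(enc_b)  # [1 2]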
In [5]:
song['language'] = song['language'].fillna(0).astype(str)
In [6]:
song.head()
Out[6]:
In [7]:
song.columns
Out[7]:
In [8]:
song.dtypes
Out[8]:
In [9]:
print('genre_ids unique count :', len(song['genre_ids'].unique()))
print('artist_name unique count :', len(song['artist_name'].unique()))
print('composer unique count :', len(song['composer'].unique()))
print('lyricist unique count :', len(song['lyricist'].unique()))
print('language unique count :', len(song['language'].unique()))
In [10]:
song_length = tf.feature_column.numeric_column('song_length')
genre_ids = tf.feature_column.categorical_column_with_hash_bucket('genre_ids', hash_bucket_size=1000)
artist_name = tf.feature_column.categorical_column_with_hash_bucket('artist_name', hash_bucket_size=200000)
composer = tf.feature_column.categorical_column_with_hash_bucket('composer', hash_bucket_size=300000)
lyricist = tf.feature_column.categorical_column_with_hash_bucket('lyricist', hash_bucket_size=100000)
language = tf.feature_column.categorical_column_with_hash_bucket('language', hash_bucket_size=10)
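The hash_bucket_size values above trade table size against hash collisions. As a back-of-the-envelope gauge (a sketch, not from the original notebook), the birthday approximation estimates how many distinct values end up sharing a bucket; the unique counts plugged in below are placeholders that should come from the prints in In [9]:

In [ ]:
# Rough collision estimate for a hash bucket column: with n unique
# values spread over b buckets, about n - b * (1 - (1 - 1/b) ** n)
# values share a bucket with some other value.
def expected_collisions(n_unique, n_buckets):
    occupied = n_buckets * (1 - (1 - 1 / n_buckets) ** n_unique)
    return n_unique - occupied

# Placeholder unique counts; substitute the real ones from In [9].
for name, n, b in [('genre_ids', 600, 1000),
                   ('artist_name', 45000, 200000)]:
    print(name, round(expected_collisions(n, b)))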
In [11]:
# merge() defaults to an inner join, so rows whose song_id is missing
# from songs.csv are dropped from both splits.
train_merged = train.merge(song, on='song_id')
test_merged = test.merge(song, on='song_id')
In [12]:
train_merged.columns
Out[12]:
In [13]:
train_merged.dtypes
Out[13]:
In [14]:
# Jointly integer-encode user (msno) and song ids across train and test.
uid_raw = train_merged.msno
sid_raw = train_merged.song_id
uid_test_raw = test_merged.msno
sid_test_raw = test_merged.song_id
uid, uid_test = encode_str(uid_raw, uid_test_raw)
sid, sid_test = encode_str(sid_raw, sid_test_raw)
train_merged['msno_no'] = uid
train_merged['song_no'] = sid
test_merged['msno_no'] = uid_test
test_merged['song_no'] = sid_test
In [15]:
msno = tf.feature_column.categorical_column_with_identity('msno_no', num_buckets=1000000)
song_id = tf.feature_column.categorical_column_with_identity('song_no', num_buckets=1000000)
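categorical_column_with_identity requires every id to fall in [0, num_buckets). A small sanity check (a sketch; note that generate_encoded_data from In [4] already computes the exact count, which could replace the hard-coded 1000000):

In [ ]:
# Sanity check: encoded ids must fit inside the identity columns' buckets.
msno_cnt = int(max(uid.max(), uid_test.max()) + 1)
song_cnt = int(max(sid.max(), sid_test.max()) + 1)
print(msno_cnt, song_cnt)
assert msno_cnt <= 1000000 and song_cnt <= 1000000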
In [16]:
train_merged['source_system_tab'].unique()
Out[16]:
In [17]:
source_system_tab = tf.feature_column.categorical_column_with_vocabulary_list(
    'source_system_tab',
    ['explore', 'my library', 'search', 'discover', 'radio', 'none', 'nan',
     'listen with', 'notification', 'null', 'settings']
)
In [18]:
train_merged['source_screen_name'].unique()
Out[18]:
In [19]:
source_screen_name = tf.feature_column.categorical_column_with_vocabulary_list(
    'source_screen_name',
    ['Explore', 'Local playlist more', 'None', 'My library',
     'Online playlist more', 'Album more', 'Discover Feature', 'Unknown',
     'Discover Chart', 'Radio', 'Artist more', 'Search',
     'Others profile more', 'Search Trends', 'Discover Genre',
     'My library_Search', 'Search Home', 'Discover New',
     'Self profile more', 'Concert', 'Payment', 'none', 'nan']
)
In [20]:
train_merged['source_type'].unique()
Out[20]:
In [21]:
train_merged.head()
Out[21]:
In [22]:
source_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'source_type',
    ['online-playlist', 'local-playlist', 'local-library',
     'top-hits-for-artist', 'album', 'none', 'nan', 'song-based-playlist',
     'radio', 'song', 'listen-with', 'artist', 'topic-article-playlist',
     'my-daily-playlist']
)
In [23]:
# Wide (linear) side: just the numeric song_length for now.
base_columns = [
    song_length,
]
# Deep side: low-cardinality categoricals as one-hot indicators,
# high-cardinality ids and names as dense embeddings.
deep_columns = [
    tf.feature_column.indicator_column(source_system_tab),
    tf.feature_column.indicator_column(source_screen_name),
    tf.feature_column.indicator_column(source_type),
    tf.feature_column.indicator_column(language),
    tf.feature_column.indicator_column(genre_ids),
    tf.feature_column.embedding_column(msno, dimension=128),
    tf.feature_column.embedding_column(song_id, dimension=128),
    tf.feature_column.embedding_column(artist_name, dimension=32),
    tf.feature_column.embedding_column(composer, dimension=32),
    tf.feature_column.embedding_column(lyricist, dimension=32),
]
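The embedding sizes (128 for the id columns, 32 for the rest) are hand-picked. One common rule of thumb, offered here only as a point of comparison and not as the notebook's method, sets the dimension near the fourth root of the vocabulary size:

In [ ]:
# Rule-of-thumb comparison: dimension ~= vocab_size ** 0.25.
# Vocabulary sizes below are the bucket counts chosen earlier.
for name, vocab_size in [('msno', 1000000), ('song_id', 1000000),
                         ('artist_name', 200000), ('composer', 300000),
                         ('lyricist', 100000)]:
    print(name, int(round(vocab_size ** 0.25)))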
In [24]:
import tempfile
model_dir = tempfile.mkdtemp()
# Hold out the last 30% for evaluation. Use the merged frame's length:
# the inner join above can shrink the data, so len(train) would put the
# split point in the wrong place.
train_test_index = int(len(train_merged) * 0.7)
m = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=base_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[64, 128, 256, 512]
)
In [ ]:
def input_fn(df_data, num_epochs, shuffle):
    # Drop rows with any missing value; the feature columns above
    # assume fully populated inputs.
    df_data = df_data.dropna(how='any', axis=0)
    labels = df_data['target'].astype(int)
    df_data = df_data.drop('target', axis=1)
    print(df_data.dtypes)
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=1000,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5
    )
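Before committing to a 30000-step run, it can help to pull a single batch from the input function. This sketch (not in the original run) assumes the TF 1.x session and queue-runner machinery that pandas_input_fn uses under the hood:

In [ ]:
# Hypothetical smoke test: materialize one batch before training.
fn = input_fn(train_merged[:5000], num_epochs=1, shuffle=False)
features, labels = fn()
with tf.Session() as sess:
    # pandas_input_fn tracks epochs with a local variable.
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    l_batch = sess.run(labels)
    coord.request_stop()
    coord.join(threads)
print(l_batch.shape)  # expect (1000,) given batch_size=1000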
In [ ]:
m.train(input_fn=input_fn(train_merged[:train_test_index], num_epochs=None, shuffle=True),
        steps=30000)
In [ ]:
results = m.evaluate(
input_fn=input_fn(train_merged[train_test_index:], num_epochs=1, shuffle=False),
steps=None
)
print('model directory = %s' % model_dir)
for key in sorted(results):
    print('%s : %s' % (key, results[key]))
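The notebook stops after evaluation. As a sketch of the natural next step (the 'id' and 'target' column names follow the KKBox competition's submission format and are assumptions here), the test split could be scored and written out like this:

In [ ]:
# Hypothetical next step: score the test set and write a submission file.
# input_fn expects a 'target' column, so attach a dummy one.
test_input = test_merged.copy()
test_input['target'] = 0
preds = m.predict(input_fn=input_fn(test_input, num_epochs=1, shuffle=False))
probs = [p['probabilities'][1] for p in preds]
# input_fn drops NaN rows, so align ids the same way before writing.
# Caveat: input_fn uses num_threads=5, which may not preserve row order;
# a single-threaded variant would be safer for prediction.
kept = test_input.dropna(how='any', axis=0)
submission = pd.DataFrame({'id': kept['id'], 'target': probs})
submission.to_csv('submission.csv', index=False)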
In [ ]: