In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from six.moves.urllib.request import urlopen

import tensorflow as tf
import numpy as np
import pandas as pd


/home/voyageth/develop/anaconda3/envs/kaggle/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)

In [2]:
########################################
## load the data
########################################
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
song = pd.read_csv('./input/songs.csv')
song_extra = pd.read_csv('./input/song_extra_info.csv')
member = pd.read_csv('./input/members.csv')

In [3]:
tf.__version__


Out[3]:
'1.4.0'

In [4]:
from sklearn.preprocessing import LabelEncoder

def encode_str(train_data, test_data):
    data_encoder = LabelEncoder()
    data_encoder.fit(train_data.append(test_data))
    t_train_data = data_encoder.transform(train_data)
    t_test_data = data_encoder.transform(test_data)
    return t_train_data, t_test_data

def generate_encoded_data(data_raw, data_test_raw):
    data, data_test = encode_str(data_raw, data_test_raw)
    data_cnt = int(max(data.max(), data_test.max()) + 1)
    return data, data_test, data_cnt

In [5]:
song['language'] = song['language'].fillna(0).astype(str)

In [6]:
song.head()


Out[6]:
song_id song_length genre_ids artist_name composer lyricist language
0 CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E= 247640 465 張信哲 (Jeff Chang) 董貞 何啟弘 3.0
1 o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU= 197328 444 BLACKPINK TEDDY| FUTURE BOUNCE| Bekuh BOOM TEDDY 31.0
2 DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0= 231781 465 SUPER JUNIOR NaN NaN 31.0
3 dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE= 273554 465 S.H.E 湯小康 徐世珍 3.0
4 W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o= 140329 726 貴族精選 Traditional Traditional 52.0

In [7]:
song.columns


Out[7]:
Index(['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',
       'lyricist', 'language'],
      dtype='object')

In [8]:
song.dtypes


Out[8]:
song_id        object
song_length     int64
genre_ids      object
artist_name    object
composer       object
lyricist       object
language       object
dtype: object

In [9]:
print('genre_ids unique count :', len(song['genre_ids'].unique()))
print('artist_name unique count :', len(song['artist_name'].unique()))
print('composer unique count :', len(song['composer'].unique()))
print('lyricist unique count :', len(song['lyricist'].unique()))
print('language unique count :', len(song['language'].unique()))


genre_ids unique count : 1046
artist_name unique count : 222363
composer unique count : 329825
lyricist unique count : 110927
language unique count : 11

In [10]:
song_length = tf.feature_column.numeric_column('song_length')
genre_ids = tf.feature_column.categorical_column_with_hash_bucket('genre_ids', hash_bucket_size=1000)
artist_name = tf.feature_column.categorical_column_with_hash_bucket('artist_name', hash_bucket_size=200000)
composer = tf.feature_column.categorical_column_with_hash_bucket('composer', hash_bucket_size=300000)
lyricist = tf.feature_column.categorical_column_with_hash_bucket('lyricist', hash_bucket_size=100000)
language = tf.feature_column.categorical_column_with_hash_bucket('language', hash_bucket_size=10)

In [11]:
train_merged = train.merge(song, on='song_id')
test_merged = test.merge(song, on='song_id')

In [12]:
train_merged.columns


Out[12]:
Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language'],
      dtype='object')

In [13]:
train_merged.dtypes


Out[13]:
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                 int64
song_length            int64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              object
dtype: object

In [14]:
uid_raw = train_merged.msno
sid_raw = train_merged.song_id

uid_test_raw = test_merged.msno
sid_test_raw = test_merged.song_id

uid, uid_test = encode_str(uid_raw, uid_test_raw)
sid, sid_test = encode_str(sid_raw, sid_test_raw)

train_merged['msno_no'] = uid
train_merged['song_no'] = sid
test_merged['msno_no'] = uid_test
test_merged['song_no'] = sid_test

In [15]:
msno = tf.feature_column.categorical_column_with_identity('msno_no', num_buckets=1000000)
song_id = tf.feature_column.categorical_column_with_identity('song_no', num_buckets=1000000)

In [16]:
train_merged['source_system_tab'].unique()


Out[16]:
array(['explore', 'my library', 'discover', 'search', 'radio',
       'listen with', 'notification', nan, 'null', 'settings'], dtype=object)

In [17]:
source_system_tab = tf.feature_column.categorical_column_with_vocabulary_list('source_system_tab', 
    ['explore', 'my library', 'search', 'discover', 'radio', 'none', 'nan', 
     'listen with', 'notification', 'null', 'settings']
)

In [18]:
train_merged['source_screen_name'].unique()


Out[18]:
array(['Explore', 'Local playlist more', 'Online playlist more', nan,
       'Discover Chart', 'My library', 'Discover Genre', 'Radio',
       'Album more', 'Search', 'Discover Feature', 'Unknown',
       'Others profile more', 'My library_Search', 'Artist more',
       'Search Trends', 'Search Home', 'Concert', 'Discover New',
       'Self profile more', 'Payment'], dtype=object)

In [19]:
source_screen_name = tf.feature_column.categorical_column_with_vocabulary_list('source_screen_name', 
    ['Explore', 'Local playlist more', 'None', 'My library',
       'Online playlist more', 'Album more', 'Discover Feature', 'Unknown',
       'Discover Chart', 'Radio', 'Artist more', 'Search',
       'Others profile more', 'Search Trends', 'Discover Genre',
       'My library_Search', 'Search Home', 'Discover New',
       'Self profile more', 'Concert', 'Payment', 'none', 'nan']
)

In [20]:
train_merged['source_type'].unique()


Out[20]:
array(['online-playlist', 'local-playlist', 'local-library',
       'song-based-playlist', 'radio', 'top-hits-for-artist', 'song',
       'artist', 'listen-with', 'my-daily-playlist', 'album', nan,
       'topic-article-playlist'], dtype=object)

In [21]:
train_merged.head()


Out[21]:
msno song_id source_system_tab source_screen_name source_type target song_length genre_ids artist_name composer lyricist language msno_no song_no
0 FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg= BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik= explore Explore online-playlist 1 206471 359 Bastille Dan Smith| Mark Crew NaN 52.0 9176 86872
1 e5Ezre9HPuPos+CXQXtmo32E/hHIZTMmo6jG3yRf6UA= BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik= my library Local playlist more local-playlist 1 206471 359 Bastille Dan Smith| Mark Crew NaN 52.0 22652 86872
2 pouJqjNRmZOnRNzzMWWkamTKkIGHyvhl/jo4HgbncnM= BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik= discover Online playlist more online-playlist 0 206471 359 Bastille Dan Smith| Mark Crew NaN 52.0 28978 86872
3 sSexP400TJOZRhx3JB+0s9cqrCnqrlV51B9njoKR1II= BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik= discover Online playlist more online-playlist 0 206471 359 Bastille Dan Smith| Mark Crew NaN 52.0 30456 86872
4 hKdGiUKHVqKkXGHLrc+EzdSW6q0ERAJ2Cs7/L1N0Ae4= BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik= discover NaN online-playlist 0 206471 359 Bastille Dan Smith| Mark Crew NaN 52.0 24383 86872

In [22]:
source_type = tf.feature_column.categorical_column_with_vocabulary_list('source_type', 
    ['online-playlist', 'local-playlist', 'local-library',
       'top-hits-for-artist', 'album', 'none', 'nan', 'song-based-playlist', 'radio',
       'song', 'listen-with', 'artist', 'topic-article-playlist',
       'my-daily-playlist']
)

In [23]:
base_columns = [
    song_length,
]

deep_columns = [
    tf.feature_column.indicator_column(source_system_tab),
    tf.feature_column.indicator_column(source_screen_name),
    tf.feature_column.indicator_column(source_type),
    tf.feature_column.indicator_column(language),
    tf.feature_column.indicator_column(genre_ids),
    
    tf.feature_column.embedding_column(msno, dimension=128),
    tf.feature_column.embedding_column(song_id, dimension=128),
    tf.feature_column.embedding_column(artist_name, dimension=32),
    tf.feature_column.embedding_column(composer, dimension=32),
    tf.feature_column.embedding_column(lyricist, dimension=32),
]

In [24]:
import tempfile
model_dir = tempfile.mkdtemp()
train_test_index = int(len(train) * 0.7)
m = tf.estimator.DNNLinearCombinedClassifier(
    model_dir = model_dir,
    linear_feature_columns=base_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[64, 128, 256, 512]
)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp7qm3ngdb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9073f05c50>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

In [ ]:
def input_fn(df_data, num_epochs, shuffle):
    df_data = df_data.dropna(how='any', axis=0)
    labels = df_data['target'].astype(int)
    df_data = df_data.drop('target', axis=1)
    print(df_data.dtypes)
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=1000,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5
    )

In [ ]:
m.train(input_fn=input_fn(train_merged[:train_test_index], num_epochs=None, shuffle=True),steps=30000)


msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
song_length            int64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              object
msno_no                int64
song_no                int64
dtype: object
INFO:tensorflow:Create CheckpointSaverHook.

In [ ]:
results = m.evaluate(
    input_fn=input_fn(train_merged[train_test_index:], num_epochs=1, shuffle=False),
    steps=None
)
print('model directory = %s' % model_dir)
for key in sorted(results):
    print('%s : %s' % (key, results[key]))

In [ ]: