In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from six.moves.urllib.request import urlopen
import tensorflow as tf
import numpy as np
import pandas as pd
In [2]:
########################################
## load the data
########################################
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
song = pd.read_csv('./input/songs.csv')
song_extra = pd.read_csv('./input/song_extra_info.csv')
member = pd.read_csv('./input/members.csv')
In [3]:
tf.__version__
Out[3]:
In [4]:
from sklearn.preprocessing import LabelEncoder
def encode_str(train_data, test_data):
    # Fit one encoder on the union of train and test values so both
    # splits share a single, consistent integer mapping.
    data_encoder = LabelEncoder()
    data_encoder.fit(pd.concat([train_data, test_data]))
    t_train_data = data_encoder.transform(train_data)
    t_test_data = data_encoder.transform(test_data)
    return t_train_data, t_test_data

def generate_encoded_data(data_raw, data_test_raw):
    # Encode both splits and return the vocabulary size (max id + 1),
    # which is what identity feature columns need as num_buckets.
    data, data_test = encode_str(data_raw, data_test_raw)
    data_cnt = int(max(data.max(), data_test.max()) + 1)
    return data, data_test, data_cnt
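A quick, hypothetical smoke test of encode_str on toy Series (not part of the original run), showing that both splits end up in one shared integer mapping:

In [ ]:
# Hypothetical smoke test for encode_str on toy data.
a = pd.Series(['x', 'y', 'x'])
b = pd.Series(['y', 'z'])
enc_a, enc_b = encode_str(a, b)
print(enc_a)  # [0 1 0]
print(enc_b)  # [1 2]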
In [5]:
song['language'] = song['language'].fillna(0).astype(str)
In [6]:
song.head()
Out[6]:
In [7]:
song.columns
Out[7]:
In [8]:
song.dtypes
Out[8]:
In [9]:
print('genre_ids unique count :', len(song['genre_ids'].unique()))
print('artist_name unique count :', len(song['artist_name'].unique()))
print('composer unique count :', len(song['composer'].unique()))
print('lyricist unique count :', len(song['lyricist'].unique()))
print('language unique count :', len(song['language'].unique()))
In [10]:
song_length = tf.feature_column.numeric_column('song_length')
genre_ids = tf.feature_column.categorical_column_with_hash_bucket('genre_ids', hash_bucket_size=1000)
artist_name = tf.feature_column.categorical_column_with_hash_bucket('artist_name', hash_bucket_size=200000)
composer = tf.feature_column.categorical_column_with_hash_bucket('composer', hash_bucket_size=300000)
lyricist = tf.feature_column.categorical_column_with_hash_bucket('lyricist', hash_bucket_size=100000)
language = tf.feature_column.categorical_column_with_hash_bucket('language', hash_bucket_size=10)
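The hash_bucket_size values above trade table size against hash collisions. As a back-of-the-envelope gauge (a sketch, not from the original notebook), the birthday approximation estimates how many distinct values end up sharing a bucket; the unique counts plugged in below are placeholders that should come from the prints in In [9]:

In [ ]:
# Rough collision estimate for a hash bucket column: with n unique
# values spread over b buckets, about n - b * (1 - (1 - 1/b) ** n)
# values share a bucket with some other value.
def expected_collisions(n_unique, n_buckets):
    occupied = n_buckets * (1 - (1 - 1 / n_buckets) ** n_unique)
    return n_unique - occupied

# Placeholder unique counts; substitute the real ones from In [9].
for name, n, b in [('genre_ids', 600, 1000),
                   ('artist_name', 45000, 200000)]:
    print(name, round(expected_collisions(n, b)))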
In [11]:
# merge() defaults to an inner join, so rows whose song_id is missing
# from songs.csv are dropped from both splits.
train_merged = train.merge(song, on='song_id')
test_merged = test.merge(song, on='song_id')
In [12]:
train_merged.columns
Out[12]:
In [13]:
train_merged.dtypes
Out[13]:
In [14]:
# Jointly integer-encode user (msno) and song ids across train and test.
uid_raw = train_merged.msno
sid_raw = train_merged.song_id
uid_test_raw = test_merged.msno
sid_test_raw = test_merged.song_id
uid, uid_test = encode_str(uid_raw, uid_test_raw)
sid, sid_test = encode_str(sid_raw, sid_test_raw)
train_merged['msno_no'] = uid
train_merged['song_no'] = sid
test_merged['msno_no'] = uid_test
test_merged['song_no'] = sid_test
In [15]:
msno = tf.feature_column.categorical_column_with_identity('msno_no', num_buckets=1000000)
song_id = tf.feature_column.categorical_column_with_identity('song_no', num_buckets=1000000)
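categorical_column_with_identity requires every id to fall in [0, num_buckets). A small sanity check (a sketch; note that generate_encoded_data from In [4] already computes the exact count, which could replace the hard-coded 1000000):

In [ ]:
# Sanity check: encoded ids must fit inside the identity columns' buckets.
msno_cnt = int(max(uid.max(), uid_test.max()) + 1)
song_cnt = int(max(sid.max(), sid_test.max()) + 1)
print(msno_cnt, song_cnt)
assert msno_cnt <= 1000000 and song_cnt <= 1000000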
In [16]:
train_merged['source_system_tab'].unique()
Out[16]:
In [17]:
source_system_tab = tf.feature_column.categorical_column_with_vocabulary_list(
    'source_system_tab',
    ['explore', 'my library', 'search', 'discover', 'radio', 'none', 'nan',
     'listen with', 'notification', 'null', 'settings']
)
In [18]:
train_merged['source_screen_name'].unique()
Out[18]:
In [19]:
source_screen_name = tf.feature_column.categorical_column_with_vocabulary_list(
    'source_screen_name',
    ['Explore', 'Local playlist more', 'None', 'My library',
     'Online playlist more', 'Album more', 'Discover Feature', 'Unknown',
     'Discover Chart', 'Radio', 'Artist more', 'Search',
     'Others profile more', 'Search Trends', 'Discover Genre',
     'My library_Search', 'Search Home', 'Discover New',
     'Self profile more', 'Concert', 'Payment', 'none', 'nan']
)
In [20]:
train_merged['source_type'].unique()
Out[20]:
In [21]:
train_merged.head()
Out[21]:
In [22]:
source_type = tf.feature_column.categorical_column_with_vocabulary_list(
    'source_type',
    ['online-playlist', 'local-playlist', 'local-library',
     'top-hits-for-artist', 'album', 'none', 'nan', 'song-based-playlist',
     'radio', 'song', 'listen-with', 'artist', 'topic-article-playlist',
     'my-daily-playlist']
)
In [23]:
# Wide (linear) side: just the numeric song_length for now.
base_columns = [
    song_length,
]
# Deep side: low-cardinality categoricals as one-hot indicators,
# high-cardinality ids and names as dense embeddings.
deep_columns = [
    tf.feature_column.indicator_column(source_system_tab),
    tf.feature_column.indicator_column(source_screen_name),
    tf.feature_column.indicator_column(source_type),
    tf.feature_column.indicator_column(language),
    tf.feature_column.indicator_column(genre_ids),
    tf.feature_column.embedding_column(msno, dimension=128),
    tf.feature_column.embedding_column(song_id, dimension=128),
    tf.feature_column.embedding_column(artist_name, dimension=32),
    tf.feature_column.embedding_column(composer, dimension=32),
    tf.feature_column.embedding_column(lyricist, dimension=32),
]
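The embedding sizes (128 for the id columns, 32 for the rest) are hand-picked. One common rule of thumb, offered here only as a point of comparison and not as the notebook's method, sets the dimension near the fourth root of the vocabulary size:

In [ ]:
# Rule-of-thumb comparison: dimension ~= vocab_size ** 0.25.
# Vocabulary sizes below are the bucket counts chosen earlier.
for name, vocab_size in [('msno', 1000000), ('song_id', 1000000),
                         ('artist_name', 200000), ('composer', 300000),
                         ('lyricist', 100000)]:
    print(name, int(round(vocab_size ** 0.25)))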
In [24]:
import tempfile
model_dir = tempfile.mkdtemp()
# Hold out the last 30% for evaluation. Use the merged frame's length:
# the inner join above can shrink the data, so len(train) would put the
# split point in the wrong place.
train_test_index = int(len(train_merged) * 0.7)
m = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=base_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[64, 128, 256, 512]
)
In [ ]:
def input_fn(df_data, num_epochs, shuffle):
    # Drop rows with any missing value; the feature columns above
    # assume fully populated inputs.
    df_data = df_data.dropna(how='any', axis=0)
    labels = df_data['target'].astype(int)
    df_data = df_data.drop('target', axis=1)
    print(df_data.dtypes)
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=1000,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=5
    )
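Before committing to a 30000-step run, it can help to pull a single batch from the input function. This sketch (not in the original run) assumes the TF 1.x session and queue-runner machinery that pandas_input_fn uses under the hood:

In [ ]:
# Hypothetical smoke test: materialize one batch before training.
fn = input_fn(train_merged[:5000], num_epochs=1, shuffle=False)
features, labels = fn()
with tf.Session() as sess:
    # pandas_input_fn tracks epochs with a local variable.
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    l_batch = sess.run(labels)
    coord.request_stop()
    coord.join(threads)
print(l_batch.shape)  # expect (1000,) given batch_size=1000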
In [ ]:
m.train(input_fn=input_fn(train_merged[:train_test_index], num_epochs=None, shuffle=True),
        steps=30000)
In [ ]:
results = m.evaluate(
input_fn=input_fn(train_merged[train_test_index:], num_epochs=1, shuffle=False),
steps=None
)
print('model directory = %s' % model_dir)
for key in sorted(results):
    print('%s : %s' % (key, results[key]))
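The notebook stops after evaluation. As a sketch of the natural next step (the 'id' and 'target' column names follow the KKBox competition's submission format and are assumptions here), the test split could be scored and written out like this:

In [ ]:
# Hypothetical next step: score the test set and write a submission file.
# input_fn expects a 'target' column, so attach a dummy one.
test_input = test_merged.copy()
test_input['target'] = 0
preds = m.predict(input_fn=input_fn(test_input, num_epochs=1, shuffle=False))
probs = [p['probabilities'][1] for p in preds]
# input_fn drops NaN rows, so align ids the same way before writing.
# Caveat: input_fn uses num_threads=5, which may not preserve row order;
# a single-threaded variant would be safer for prediction.
kept = test_input.dropna(how='any', axis=0)
submission = pd.DataFrame({'id': kept['id'], 'target': probs})
submission.to_csv('submission.csv', index=False)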
In [ ]: