Classify a movie genre based on its plot.
https://www.kaggle.com/c/miia4200-20191-p2-moviegenreclassification/overview
Input: The text of the movie's plot
Output: Probability that the movie belongs to each genre
We thank Professor Fabio Gonzalez, Ph.D. and his student John Arevalo for providing this dataset.
In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
In [2]:
# Download the training and testing tables. Both files are read with the
# same options: UTF-8 text, with the first CSV column used as the row index.
csv_options = {'encoding': 'UTF-8', 'index_col': 0}
dataTraining = pd.read_csv(
    'https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTraining.zip',
    **csv_options
)
dataTesting = pd.read_csv(
    'https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTesting.zip',
    **csv_options
)
In [18]:
# Preview the first rows of the training table.
dataTraining.head()
Out[18]:
In [4]:
# Preview the first rows of the testing table.
dataTesting.head()
Out[4]:
In [5]:
# Build a bag-of-words document-term matrix from the training plots,
# keeping at most 1000 vocabulary terms.
training_plots = dataTraining['plot']
vect = CountVectorizer(max_features=1000)
X_dtm = vect.fit_transform(training_plots)
# Display the matrix dimensions as (documents, terms).
X_dtm.shape
Out[5]:
In [6]:
# Show the first 50 vocabulary terms learned by the vectorizer.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on releases that do not provide the new one.
_get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# get_feature_names_out() returns an array; convert to a plain list of str so
# the printed output has the same form as before.
print([str(term) for term in _get_names()[:50]])
In [7]:
# The raw 'genres' column stores each movie's genres as the text of a Python
# literal. It is parsed with ast.literal_eval, which only accepts literals,
# rather than eval, which would execute any expression found in the
# downloaded CSV.
import ast


def _parse_genres(raw):
    """Return the parsed genres for one cell of the 'genres' column."""
    # Values that are not strings have already been parsed; passing them
    # through lets this cell be re-run without failing.
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw


dataTraining['genres'] = dataTraining['genres'].map(_parse_genres)
# Encode the per-movie genre collections as a binary indicator matrix.
le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])
In [8]:
# Display the binarized genre matrix produced by the MultiLabelBinarizer.
y_genres
Out[8]:
In [9]:
# Hold out 33% of the training rows for validation; the fixed seed keeps the
# split reproducible.
split_parts = train_test_split(
    X_dtm,
    y_genres,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train_genres, y_test_genres = split_parts
In [10]:
# Base learner: a 100-tree random forest limited to depth 10, using all
# available cores, with a fixed seed.
base_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)
# One-vs-rest wrapper around the base forest for the multi-label genre target.
clf = OneVsRestClassifier(base_forest)
In [11]:
# Fit the one-vs-rest classifier on the training split.
clf.fit(X_train, y_train_genres)
Out[11]:
In [12]:
# Predict genre probabilities for the held-out split.
y_pred_genres = clf.predict_proba(X_test)
In [13]:
# Macro-averaged ROC AUC of the predicted probabilities on the held-out split.
roc_auc_score(y_test_genres, y_pred_genres, average='macro')
Out[13]:
In [14]:
# Vectorize the test plots with the vocabulary already fitted on the
# training plots (transform only, no refit).
X_test_dtm = vect.transform(dataTesting['plot'])
# Output column labels, one per genre, each prefixed with 'p_'.
cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']
# The classifier was trained on the binarizer's output, so its probability
# columns follow le.classes_. Check that the hard-coded labels match that
# order, otherwise probabilities would be silently attributed to the wrong
# genres.
expected_cols = ['p_' + str(genre) for genre in le.classes_]
if cols != expected_cols:
    raise ValueError(
        'Hard-coded genre columns do not match the fitted label order: '
        'expected %r, got %r' % (expected_cols, cols)
    )
# Predict genre probabilities for every test movie.
y_pred_test_genres = clf.predict_proba(X_test_dtm)
In [15]:
# Assemble the predictions into a table indexed like the testing data, with
# one labelled probability column per genre.
res = pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols)
In [16]:
# Preview the first rows of the prediction table.
res.head()
Out[16]:
In [17]:
# Write the predictions to a CSV file, labelling the index column 'ID'.
res.to_csv('pred_genres_text_RF.csv', index_label='ID')