In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# One theme call: "ticks" axes style plus seaborn's remapped colour shorthands.
sns.set(style="ticks", color_codes=True)

data_frame = pd.read_csv("movie_metadata.csv")

# Show every column of the raw file when a frame is displayed.
pd.options.display.max_columns = len(data_frame.columns)

# The IMDb link is not useful as a feature. The title could be, but it would
# need semantic analysis of its words to extract a meaningful representation,
# so both columns are dropped for now.
data_frame = data_frame.drop(['movie_title', 'movie_imdb_link'], axis=1)
data_frame.head()
In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
# Impute and encode every column of data_frame:
#   * numeric columns      -> missing values replaced by the column mean
#   * plot_keywords/genres -> '|'-separated labels turned into a binary
#                             indicator vector per row (one array per cell)
#   * other columns        -> missing values become 'UNKNOWN', then each
#                             distinct value is mapped to an integer code
for column_name in data_frame.columns:
    column = data_frame[column_name]

    # Numeric columns (bool, int, unsigned int, float, complex).
    if column.dtype.kind in 'biufc':
        # Mean value imputation, could be changed...
        # The result is assigned back to the frame instead of calling
        # .update() on data_frame[column_name]: that selection is not
        # guaranteed to be a view (and never is under pandas Copy-on-Write),
        # so an in-place update on it can silently leave the frame untouched.
        data_frame[column_name] = column.fillna(column.mean())
        continue

    # Multilabel columns
    if column_name in ('plot_keywords', 'genres'):
        mlb = MultiLabelBinarizer()
        # Missing rows get the single label 'UNKNOWN'; fillna detects every
        # kind of missing value, unlike an identity comparison with np.nan.
        label_lists = column.fillna('UNKNOWN').str.split('|').tolist()
        binarized = mlb.fit_transform(label_lists)
        # Quick visual check of the indicator matrix (rows x labels).
        plt.imshow(binarized)
        # One indicator array per row; index is set explicitly so the
        # assignment aligns with the frame whatever its index is.
        data_frame[column_name] = pd.Series(list(binarized), index=column.index)
    # Label columns
    else:
        le = LabelEncoder()
        encoded = le.fit_transform(column.fillna('UNKNOWN'))
        data_frame[column_name] = pd.Series(encoded, index=column.index)

data_frame.head()
In [ ]:
# Residual plot of imdb_score against gross, with marginal distributions.
# NOTE: sns.set() re-applies the whole default seaborn theme, not only the
# font scale, so the "ticks" style selected at the top of the notebook is
# replaced from this cell onwards.
sns.set(font_scale=2)
# x/y are passed by keyword because recent seaborn releases treat the first
# positional argument as `data`; `height` is the current name of the former
# `size` parameter (renamed in seaborn 0.9).
g = sns.jointplot(
    x='gross',
    y='imdb_score',
    data=data_frame,
    kind="resid",
    height=12,
    color="#10275F",
)
# Leave room above the axes for the figure-level title.
plt.subplots_adjust(top=0.95)
g.fig.suptitle('Imdb_Score and Gross', size=20, weight='bold')
sns.set(font_scale=1)
# The shape of this plot differs from the result shown in the Kaggle kernel.
In [ ]:
# Heatmap of the pairwise Pearson correlation between movie features.
# plot_keywords and genres hold one indicator array per cell, which cannot be
# cast to float, so they are left out of the correlation matrix.
new_dataframe = data_frame.drop(['plot_keywords', 'genres'], axis=1)

# A named context is required here: with context=None seaborn returns the
# current settings unchanged and the font_scale argument is ignored.
with sns.plotting_context("notebook", font_scale=1.25):
    f, ax = plt.subplots(figsize=(20, 20))
    ax.set_title('Pearson Correlation of Movie Features', {'weight': 'bold', 'size': 20})
    sns.heatmap(
        new_dataframe.astype(float).corr(),
        linewidths=0.25,
        vmax=1.0,
        square=True,
        annot=True,
        ax=ax,
    )
In [ ]: