IMDB Movie Analysis



In [ ]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# One call only: every sns.set() re-applies its defaults for the arguments it
# is not given, so a second call would override options chosen by the first.
sns.set(style="ticks", color_codes=True)

data_frame = pd.read_csv("movie_metadata.csv")
# Let pandas display every column of the raw file (count taken before the drop).
pd.options.display.max_columns = data_frame.shape[1]
# movie_imdb_link is not useful as a feature at all. movie_title could be used,
# but only after semantic analysis of its words to extract a meaningful
# representation, so both columns are dropped for now.
data_frame = data_frame.drop(['movie_title', 'movie_imdb_link'], axis=1)
data_frame.head()



In [ ]:

    
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

# Impute missing values and encode every column of data_frame.
#   * numeric columns    -> missing values replaced by the column mean
#   * plot_keywords/genres -> '|'-separated labels turned into one 0/1
#                             indicator array per row (MultiLabelBinarizer)
#   * every other column -> integer class ids (LabelEncoder)
# Results are written back with data_frame[column_name] = ...; the chained form
# data_frame[column_name].update(...) operates on a temporary object and leaves
# data_frame unchanged when pandas Copy-on-Write is active.
MULTILABEL_COLUMNS = ('plot_keywords', 'genres')

for column_name in data_frame.columns:
    column = data_frame[column_name]

    # Numeric columns
    if column.dtype.kind in 'biufc':
        # Mean value imputation, could be changed...
        data_frame[column_name] = column.fillna(column.mean())
        continue

    # Multilabel columns
    if column_name in MULTILABEL_COLUMNS:
        mlb = MultiLabelBinarizer()
        # Missing entries become the single placeholder label 'UNKNOWN'.
        keywords = [labels.split('|') for labels in column.fillna('UNKNOWN')]
        binarized = mlb.fit_transform(keywords)
        # One figure per column so the second matrix does not overplot the first.
        plt.figure()
        plt.imshow(binarized)
        plt.title(column_name)
        # list(...) splits the 2-D matrix into one 1-D array per row; reuse the
        # column's own index so rows stay aligned with data_frame.
        data_frame[column_name] = pd.Series(list(binarized), index=column.index)
    # Label columns
    else:
        le = LabelEncoder()
        data_frame[column_name] = le.fit_transform(column.fillna('UNKNOWN'))

data_frame.head()



In [ ]:

    
# Figures for representing some features
# plotting_context scopes the larger fonts to this figure only; calling
# sns.set(font_scale=...) instead would also reset the globally configured
# seaborn style back to its defaults for every later cell.
with sns.plotting_context("notebook", font_scale=2):
    # x/y are passed by keyword and the figure size via `height` (the keyword
    # that replaced `size`), as required by current seaborn releases.
    g = sns.jointplot(x='gross', y='imdb_score', data=data_frame, kind="resid",
                      height=12, color="#10275F")
    # Leave room above the axes for the figure-level title.
    g.fig.subplots_adjust(top=0.95)
    g.fig.suptitle('Imdb_Score and Gross', size=20, weight='bold')
# This is different from the kaggle kernel result (as in shape)



In [ ]:

    
# Showing the pearson correlation of features
# The context name is given explicitly: when no context is passed,
# plotting_context() returns the current parameters and font_scale is not applied.
with sns.plotting_context("notebook", font_scale=1.25):
    f, ax = plt.subplots(figsize=(20, 20))
    ax.set_title('Pearson Correlation of Movie Features', {'weight': 'bold', 'size': 20})
    # plot_keywords and genres are encoded as arrays, which requires more care
    # to plot in this way, so they are left out of the correlation matrix.
    new_dataframe = data_frame.drop(['plot_keywords', 'genres'], axis=1)
    sns.heatmap(new_dataframe.astype(float).corr(), linewidths=0.25, vmax=1.0,
                square=True, annot=True, ax=ax)



In [ ]: