Bar graphs are useful for displaying relationships between categorical data and at least one numerical variable. seaborn.countplot is a bar plot in which the dependent variable is the number of occurrences of each value of the independent (categorical) variable.
dataset: IMDB 5000 Movie Dataset
In [67]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# default figure size (width, height) in inches for the plots below
plt.rcParams.update({'figure.figsize': (20.0, 10.0)})
In [68]:
# read the IMDB 5000 movie metadata from a path relative to the working directory
movie_csv_path = '../../../datasets/movie_metadata.csv'
df = pd.read_csv(movie_csv_path)
In [69]:
# preview the first five rows of the raw data
df.head(n=5)
Out[69]:
For the bar plot, let's look at the number of movies in each category, allowing each movie to be counted more than once.
In [70]:
# One-hot encode each movie's pipe-delimited genre string.
# get_dummies splits on "|" and matches whole genre names, so a genre whose name
# is contained in another (e.g. "Music" inside "Musical") is not over-counted the
# way a substring test on the raw string (`cat in s`) would be. Missing genre
# values yield an all-zero row instead of raising.
genre_flags = df.genres.str.get_dummies(sep="|")

# Keep the genre names in a sorted list rather than a set, so the column order
# (and the bar order of the plots built from it) is the same on every run.
categories = sorted(genre_flags.columns)

# drop other columns, keeping one 0/1 indicator column per genre
df = pd.concat(
    [df[['director_name', 'genres', 'duration']], genre_flags[categories]],
    axis=1,
)
df.head()
Out[70]:
In [71]:
# runtime (in minutes) above which a movie is labelled "long"
LONG_MOVIE_MINUTES = 100

# convert from wide to long format: one row per (movie, genre) pair
df = pd.melt(df,
             id_vars=['duration'],
             value_vars=list(categories),
             var_name='Category',
             value_name='Count')

# remove null classifications (genre flags that are 0); .copy() makes the
# filtered frame independent so that adding a column below does not raise
# pandas' SettingWithCopyWarning
df = df.loc[df.Count > 0].copy()

# add an indicator whether a movie is short (0) or long (1).
# NOTE: a missing duration compares False and is therefore labelled 0.
df['islong'] = (df.duration > LONG_MOVIE_MINUTES).astype(int)

# To draw the bars in descending order of frequency, pass
# order=df.Category.value_counts().index to sns.countplot.
In [72]:
# preview the first five rows of the long-format data
df.head(n=5)
Out[72]:
Basic plot
In [73]:
# one vertical bar per genre; bar height is the number of rows with that genre
p = sns.countplot(x='Category', data=df)
Color by a category
In [74]:
# split each genre's bar into one bar per value of the `islong` indicator
p = sns.countplot(x='Category', hue='islong', data=df)
Make the plot horizontal
In [75]:
# putting the categorical variable on `y` instead of `x` draws horizontal bars
p = sns.countplot(y='Category', hue='islong', data=df)
Saturation
In [76]:
# draw the bar colours at half of their original saturation
p = sns.countplot(y='Category', hue='islong', data=df,
                  saturation=0.5)
Various palettes
In [77]:
# named palette: 'deep'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='deep', saturation=0.9)
In [78]:
# named palette: 'muted'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='muted', saturation=0.9)
In [79]:
# named palette: 'pastel'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='pastel', saturation=0.9)
In [80]:
# named palette: 'bright'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='bright', saturation=0.9)
In [81]:
# named palette: 'dark'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='dark', saturation=0.9)
In [82]:
# named palette: 'colorblind'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='colorblind', saturation=0.9)
In [83]:
# Custom palette given explicitly as RGB tuples scaled to the 0-1 range.
# Every channel divides by 255.0 so the result is a float even where `/` is
# integer division (Python 2), in which case 50/255 would silently become 0.
custom_palette = (
    (50 / 255.0, 132 / 255.0, 191 / 255.0),
    (255 / 255.0, 232 / 255.0, 0 / 255.0),
)
p = sns.countplot(data=df,
                  y='Category',
                  hue='islong',
                  saturation=.9,
                  palette=custom_palette)
In [84]:
# named palette: 'Dark2'
p = sns.countplot(y='Category', hue='islong', data=df,
                  palette='Dark2', saturation=0.9)
In [85]:
# print the docstring of sns.color_palette for reference
help(sns.color_palette)
In [ ]:
In [ ]:
In [86]:
# print the docstring of sns.countplot for reference
help(sns.countplot)
In [87]:
# rc overrides applied through seaborn: light-blue axes background, no grid,
# larger axis/tick labels, and the same large figure size used earlier
title_plot_rc = {
    'axes.facecolor': '#ccddff',
    'axes.grid': False,
    'axes.labelsize': 30,
    'figure.figsize': (20.0, 10.0),
    'xtick.labelsize': 25,
    'ytick.labelsize': 20,
}
sns.set(rc=title_plot_rc)

p = sns.countplot(x='Category', data=df)
# overlay the caption at data coordinates x=9, y=2000
plt.text(9, 2000, "Color Palettes", color='black', fontsize=95, fontstyle='italic')
Out[87]:
In [88]:
# save the figure that owns the axes returned by the last countplot call
output_fig = p.get_figure()
output_fig.savefig('../../figures/colors.png')