In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import matplotlib
%matplotlib inline
In [2]:
# Peek at the first lines of the raw CSV with the shell `head` command
# before parsing it with pandas.
!head ../data/vgsales.csv
#!more ../data/vgsales.csv
In [3]:
# Load the raw sales table from the CSV file.
# error_bad_lines=False together with warn_bad_lines=True asks pandas to skip
# malformed rows and warn about each one instead of aborting the load.
# NOTE(review): both flags were deprecated in pandas 1.3 in favour of
# on_bad_lines="warn"; switch over when the environment is upgraded.
data = pd.read_csv("../data/vgsales.csv",
                   warn_bad_lines = True,
                   error_bad_lines = False,
                   verbose = True)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print would add a stray "None" line.
data.info()
print("\n\n data shape:  %s" % (data.shape,))
In [4]:
# Drop the first and the last column (positions 0 and 11) and treat Year as a
# categorical variable.
# NOTE(review): this cell is not idempotent -- assuming the 12-column layout
# the positions imply, re-running it on the already trimmed frame indexes past
# the remaining columns. Re-run the loading cell first.
data = data.drop(data.columns[[0, 11]], axis = 1)
data["Year"] = data["Year"].astype('category')
# info() prints its own report and returns None, so it is not wrapped in print.
data.info()
print(data.shape)
# print data.head()
In [5]:
def count_missing(x):
    """Return the fraction of missing (null) entries in ``x``.

    Parameters
    ----------
    x : pandas.Series
        Values to inspect; anything exposing ``isnull()`` and ``len()`` in the
        same way works.

    Returns
    -------
    float
        Number of null entries divided by the total number of entries.
        NaN is returned for empty input, where the ratio is undefined
        (the previous implementation raised ZeroDivisionError there).
    """
    n_total = len(x)
    if n_total == 0:
        return float("nan")
    # Vectorized count of nulls instead of a Python-level sum over the Series.
    return x.isnull().sum() / float(n_total)
# Report the share of missing entries in every column of the frame.
print("Missing Value Statistics")
print(data.apply(count_missing, axis = 'index'))
In [6]:
# Consistency check: show the rows whose global sales figure falls short of
# the sum of the four regional figures by more than 0.02.
# NOTE(review): the 0.02 slack presumably absorbs rounding in the source
# data -- confirm.
data.loc[
    data['Global_Sales'].lt(
        data['NA_Sales']
        .add(data['EU_Sales'])
        .add(data['JP_Sales'])
        .add(data['Other_Sales'])
        .sub(0.02)
    )
]
Out[6]:
In [7]:
# Descriptive statistics for every column (include='all' also covers the
# non-numeric ones). The 0.5 percentile is the median; 0.997 is reported in
# addition to the quartiles.
data.describe(include = 'all', percentiles = [0.25, 0.5, 0.75, 0.997])
Out[7]:
In [8]:
# Skewness of each numeric column. numeric_only=True makes the column
# selection explicit: the frame also holds text and categorical columns, which
# older pandas dropped silently and current pandas rejects with a TypeError.
data.skew(axis = 0, numeric_only = True)
Out[8]:
In [9]:
# Kurtosis of each numeric column. numeric_only=True makes the column
# selection explicit: the frame also holds text and categorical columns, which
# older pandas dropped silently and current pandas rejects with a TypeError.
data.kurt(axis = 0, numeric_only = True)
Out[9]:
In [10]:
# Box plot of the frame's columns to eyeball spread and outliers.
data.plot(kind = 'box')
Out[10]:
In [11]:
# Keep only the rows in which every sales figure (the four regions and the
# global total) is at most 1, then summarise that subset.
# NOTE(review): presumably meant to look at the bulk of the data without the
# extreme values -- confirm the threshold of 1.
data2 = data.loc[
    (data[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] <= 1).all(axis = 1)
]
data2.describe(include = 'all', percentiles = [0.25, 0.5, 0.75, 0.997])
Out[11]:
In [12]:
# Box plot of the filtered subset (data2) on an 8x8 inch figure.
data2.boxplot(figsize = (8,8))
Out[12]:
In [13]:
# Histogram of the NA_Sales column using 50 bins.
data.NA_Sales.hist(bins = 50)
Out[13]:
In [14]:
# Distinct values found in the Genre column.
data.Genre.unique()
Out[14]:
In [15]:
# Number of records per genre, left in unsorted order.
genre_cnt = data.Genre.value_counts(sort = False)
print(genre_cnt)
# The same counts expressed as a share of all rows in the frame.
genre_freq = genre_cnt / float(len(data))
print(genre_freq)
In [16]:
# Bar chart of how many records fall into each genre (genre_cnt comes from
# value_counts() on the Genre column).
fig, ax = plt.subplots(figsize=(8,5))
genre_cnt.plot(kind = 'bar', ax = ax, rot = 90)
# Label through the Axes object so the calls cannot land on another figure.
ax.set_title('Genre Distribution', fontsize = 15)
ax.set_xlabel('Genre', fontsize = 15)
# The bar heights are record counts, not sales figures.
ax.set_ylabel('Number of Records', fontsize = 15)
Out[16]:
In [17]:
# Stacked bar chart: global sales summed per year, one segment per platform.
fig, ax = plt.subplots(figsize=(18,15))
# Global_Sales is selected before aggregating so that only this column is
# summed. Summing the whole frame also touches the text and categorical
# columns, which is wasted work and is rejected by current pandas.
(data.groupby(['Year', 'Platform'])['Global_Sales']
     .sum()
     .unstack()
     .plot(kind = 'bar', ax = ax, stacked = True, colormap = 'Paired'))
ax.set_title('Global Sales by Year and Platform', fontsize = 15)
ax.set_ylabel('Global Sales', fontsize = 15)
plt.show()
In [18]:
# Share of records per genre, most frequent genre first.
# The Counter is kept under its own name so that `genre` is not rebound from
# a Counter to a list within the same cell.
genre_counter = Counter(data['Genre'].dropna().tolist())
total = sum(genre_counter.values())
genre = genre_counter.most_common() # pass N here to keep only the N most frequent
N = len(genre)
print("%d categories with total %d records" % (N, total))
genre_name = [name for name, cnt in genre]
# Convert the absolute counts into fractions of all non-missing records.
genre_counts = [cnt / float(total) for name, cnt in genre]
fig, ax = plt.subplots(figsize=(18,15))
sns.barplot(x = genre_name, y = genre_counts, ax = ax)
ax.set_title("Top-%d Genre" % (N), fontsize = 15)
ax.set_xlabel("Genre", fontsize = 15)
# The y axis shows the fraction of records; it previously repeated the title.
ax.set_ylabel("Fraction of Records", fontsize = 15)
ticks = plt.setp(ax.get_xticklabels(), fontsize = 15, rotation = 60)
In [19]:
# Total global sales per platform, drawn as a bar chart.
platforms = data['Platform'].unique()
# One total per platform, in the same order as `platforms`.
platform_sales = [
    data.loc[data['Platform'] == platform, 'Global_Sales'].dropna().sum()
    for platform in platforms
]
fig, ax = plt.subplots(figsize = (18, 15))
sns.barplot(x = platforms, y = platform_sales, ax = ax, palette = sns.color_palette("PuBu", 10))
ax.set_title("Platform Sales", fontsize = 15)
ax.set_xlabel("Platform Category", fontsize = 15)
ax.set_ylabel("Total Sales", fontsize = 15)
ticks = plt.setp(ax.get_xticklabels(), fontsize = 15, rotation = 60)
In [20]:
# Heat map of the mean Global_Sales for every platform/genre combination;
# combinations without any record are filled with 0.
# The aggregation is given by name ('mean'), which pandas resolves to the same
# built-in aggregation as the np.mean callable without the deprecation warning
# newer versions emit for the callable.
table_sales = pd.pivot_table(data, values = ['Global_Sales'], index = ['Platform'], columns = ['Genre'], aggfunc = 'mean', fill_value = 0, margins = False)
fig, ax = plt.subplots(figsize = (18, 15))
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
sns.heatmap(table_sales, linewidth = .5, annot = True, vmin = 0.01, fmt = '.2f', cmap = 'PuBu', ax = ax)
ax.set_title("Platform-Genre Sales", fontsize = 15)
ticks_x = plt.setp(ax.get_xticklabels(), rotation = 60)
# NOTE(review): looks like leftover debug output -- remove if not needed.
print(ax.get_xticklabels())
In [21]:
# Heat map of how many records exist for every platform/genre combination
# (aggfunc='count'); combinations without any record are filled with 0.
table_sales = pd.pivot_table(data, values = ['Global_Sales'], index = ['Platform'], columns = ['Genre'], aggfunc = 'count', fill_value = 0, margins = False)
fig, ax = plt.subplots(figsize = (18, 15))
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
sns.heatmap(table_sales, linewidth = .5, annot = True, vmin = 0, fmt = '2.0f', cmap = 'PuBu', ax = ax)
# The cells are record counts, so the title no longer says "Sales".
ax.set_title("Platform-Genre Counts", fontsize = 15)
# ticks_y = plt.setp(ax.get_yticklabels(), fontsize = 15)
# ticks = plt.setp(ax.get_xticklabels(), fontsize = 15, rotation = 60)
Out[21]:
In [22]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
# Word cloud built from the game names of the "Sports" genre.
# NOTE(review): the indentation of this cell is not visible in this export, so
# it is unclear which of the statements below sit inside the `if` -- confirm
# against the notebook before restructuring.
stopwords = set(STOPWORDS)
# Walk over every distinct value of the Genre column.
for genre in data.Genre.unique():
# print data.Name[data.Genre == genre].to_string()
# Only the "Sports" genre is singled out here.
if genre == "Sports":
# White background, at most 200 words, fonts capped at size 40; random_state is
# fixed at 42 (presumably for a reproducible layout -- confirm).
wc = WordCloud(background_color = "white", max_font_size = 40, max_words = 200, stopwords = stopwords, random_state = 42)
# The cloud text is the string rendering of the Name entries of this genre.
wc.generate(data.Name[data.Genre == genre].to_string())
# Show the cloud as an image, titled with the genre and without axes.
plt.imshow(wc)
plt.title(genre)
plt.axis("off")
plt.show()
In [23]:
# Pie chart of the ten publishers with the highest total global sales.
fig, ax = plt.subplots(figsize = (8, 8))
# Global_Sales is selected before aggregating so that only this column is
# summed. Summing the whole frame also touches the text and categorical
# columns, which is wasted work and is rejected by current pandas.
publisher = data.groupby('Publisher')['Global_Sales'].sum()
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
publisher.sort_values(ascending = False).head(10).plot.pie(ax = ax)
# Hide the default y label that pandas puts next to the pie.
ax.set_ylabel("")
plt.tight_layout()
# print publisher