In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mplc
import matplotlib.pyplot as plt
from bokeh import mpl
from bokeh.charts import Bar, Scatter, defaults, Histogram
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
In [2]:
# Configure Bokeh so that subsequent show() calls display inside this notebook.
output_notebook()
In [3]:
# Load the apps dataset from the CSV one directory above the notebook.
# - parse_dates turns `app_updated` into datetimes (needed later for `.dt.month`).
# - thousands=',' lets comma-grouped figures be parsed as numbers.
# The frame is assigned directly; no empty placeholder is needed first.
apps = pd.read_csv(
    "../gapps.csv",
    index_col=None,
    parse_dates=['app_updated'],
    thousands=',',
)
apps.head()
Out[3]:
In [4]:
# Set the default size used by the bokeh.charts plots built below
# (presumably in pixels -- confirm against the bokeh.charts defaults docs).
defaults.width = 800
defaults.height = 420
In [5]:
# Summary statistics for the columns of the loaded dataset.
apps.describe()
Out[5]:
In [6]:
# create series with genre and counts
# Count the apps in each genre and keep the tally as a one-column DataFrame
# (genre labels stay in the index for now).
genre = apps["app_genre"].value_counts().to_frame()
genre.head()
Out[6]:
In [7]:
# Move the genre labels out of the index into a regular column, then give
# both columns presentation-friendly names for the chart below.
genre = genre.reset_index()
genre.columns = ["Categories", "Counts"]
genre.head()
Out[7]:
In [8]:
# Bar chart: one bar per category, bar height taken from the "Counts" column.
plot = Bar(
    genre,
    "Categories",
    values="Counts",
    color="wheat",
    title="App distribution with categories"
)
show(plot)
In [9]:
# Number of apps per developer name, most frequent first.
devs_apps_count = apps["developer_name"].value_counts()
devs_apps_count.head()
Out[9]:
In [10]:
# convert series to dataframe
devs_apps_count = pd.DataFrame(devs_apps_count)
devs_apps_count = devs_apps_count.reset_index()
devs_apps_count.head()
Out[10]:
In [11]:
# Name the columns, keep only the first 20 rows (value_counts already put the
# most prolific developers first), and chart them.
devs_apps_count.columns = ["Developers", "Count"]
devs_apps_count = devs_apps_count.head(20)
plot = Bar(
    devs_apps_count,
    "Developers",
    values="Count",
    color="wheat",
    title="Top 20 Developers with highest number of apps uploaded"
)
show(plot)
In [ ]:
# developer | average min downloads | average max downloads
# dev_average_downloads = apps[["developer_name", "app_downloads_min", "app_downloads_max"]]
# dev_average_downloads.head()
In [ ]:
# from bokeh.charts import Histogram
# p = Histogram(dev_average_downloads.app_downloads_min, title="Download distribution")
# show(p)
In [ ]:
# dev_average_downloads = dev_average_downloads.groupby('developer_name')
In [ ]:
# average_downloads = pd.DataFrame()
# for n, g in dev_average_downloads:
# data = {"developer": n}
# lower = g.app_downloads_min.sum()
# higher = g.app_downloads_max.sum()
# data["downloads"] = np.mean((lower, higher))
# average_downloads.append(data, ignore_index=True)
In [ ]:
# ["2.3": "2.2", "2.3.3": "4.0", "2.1", "4.0.3", "3.0", "1.6", "4.1", "1.5", "2.0", "4.3", "3.2"]
# df = apps.app_min_os.value_counts()[:14]
# df = pd.DataFrame(df)
# df = df.reset_index()
# df.columns = ["version", "count"]
# df = df.set_index("version")
# #df[["version"]] = df[["version"]].astype(float)
# df
In [12]:
# Quick summary of the minimum-OS column before working with it.
apps.app_min_os.describe()
Out[12]:
In [13]:
# Render matplotlib/seaborn figures inline in the notebook output.
%matplotlib inline
In [14]:
# Frequency table of the first 14 entries of the minimum-OS value counts,
# indexed by version label. The labels are left as-is (not cast to float).
df = apps["app_min_os"].value_counts()[:14].to_frame().reset_index()
df.columns = ["version", "count"]
df = df.set_index("version")
df
Out[14]:
In [15]:
# Per-genre means of rating and rating count, with the genre restored as a
# regular column after the group-by.
df = (
    apps[["app_rating", "app_genre", "app_rating_count"]]
    .groupby("app_genre")
    .mean()
    .reset_index()
)
# Express the mean rating count in thousands to match the chart's axis label.
df["app_rating_count"] = df["app_rating_count"] / 1000
df
Out[15]:
In [16]:
# Scatter of mean rating against mean rating count, one colour per genre.
plot = Scatter(
    df,
    x='app_rating',
    y='app_rating_count',
    color='app_genre',
    xlabel='Mean Rating',
    ylabel='Mean Rating Count (in Thousands)',
    legend='top_left',
    title='Frequency Distribution of Categories with Rating'
)
In [17]:
# Display the scatter chart built in the previous cell.
show(plot)
In [18]:
# Mean minimum-downloads and mean rating for each content rating.
df = (
    apps[["app_content_rating", "app_downloads_min", "app_rating"]]
    .groupby("app_content_rating")
    .mean()
    .reset_index()
)
# Scale the download means down by a factor of 1000 before relabelling.
df["app_downloads_min"] = df["app_downloads_min"] / 1000
df.columns = ["Content Rating", "Mean Downloads", "Mean App Rating"]
df
Out[18]:
In [19]:
# Scatter of mean app rating against mean downloads, coloured by content rating.
plot = Scatter(
    df,
    x='Mean App Rating',
    y='Mean Downloads',
    color='Content Rating',
    legend='top_left',
    title="Relation of Content Rating with Downloads and App Rating"
)
In [20]:
# Display the scatter chart built in the previous cell.
show(plot)
In [21]:
# Keep only the minimum-OS values that parse as numbers.
# pd.to_numeric(errors="coerce") turns unparseable entries into NaN, which are
# then dropped. It replaces Series.convert_objects(convert_numeric=True),
# which is deprecated and absent from current pandas releases.
df = pd.to_numeric(apps.app_min_os, errors="coerce").dropna()
In [22]:
# Default matplotlib figure size (width, height) for the plots that follow.
plt.rcParams['figure.figsize'] = (12.0, 6.0)
In [23]:
# Kernel density estimate of the numeric minimum-OS values, drawn with the
# area under the curve shaded.
sns.kdeplot(df, shade=True)
Out[23]:
In [24]:
# Histogram of all app ratings, split into 10 bins.
df = apps[["app_rating"]]
p = Histogram(
    df,
    xlabel="Rating",
    ylabel="Count",
    bins=10,
    title="App Rating Histogram"
)
show(p)
In [25]:
# Compare the rating distributions of two genres.
df = apps[["app_rating", "app_genre"]]
# Keep only rows belonging to either genre, then discard rows with missing values.
df = df[df["app_genre"].isin(["Educational", "Books & Reference"])]
df = df.dropna()
p = Histogram(
    df,
    values="app_rating",
    color="app_genre",
    legend=True,
    xlabel="Rating",
    ylabel="Count",
    bins=10,
    title="Educational and Books & Reference comparision"
)
In [26]:
# Display the histogram built in the previous cell.
show(p)
In [27]:
# All rows whose developer name is exactly "Disney".
disney = apps.loc[apps["developer_name"] == "Disney"]
disney.head()
Out[27]:
In [28]:
# Price/rating pairs for the Disney apps, with the price collapsed to a
# two-level label: "Free" or "Paid".
# .copy() makes dfree an independent frame, so the assignments below neither
# write through to `disney`/`apps` nor raise SettingWithCopyWarning.
dfree = disney[["app_price", "app_rating"]].copy()
# .loc replaces the removed .ix indexer; both selections here are label-based.
# NOTE(review): assumes app_price holds strings with "0" meaning free -- confirm.
dfree.loc[dfree["app_price"] == "0", "app_price"] = "Free"
# Everything not marked "Free" by the line above is treated as paid.
dfree.loc[dfree["app_price"] != "Free", "app_price"] = "Paid"
dfree
Out[28]:
In [29]:
# Rating histogram for the Disney apps, coloured by the Free/Paid label.
p = Histogram(
    dfree,
    values="app_rating",
    color="app_price",
    legend=True,
    bins=10,
    title="Ratings on Free and Paid by Disney"
)
In [30]:
# Display the histogram built in the previous cell.
show(p)
In [31]:
# Number of apps last updated in each calendar month, ordered by month number.
df = apps["app_updated"].dt.month.value_counts().sort_index().to_frame()
df
Out[31]:
In [32]:
# Replace the numeric month index with three-letter month names.
cat = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Look each label up from the row's own month number (1-12) instead of
# assigning the full list positionally: a blanket `df.index = cat` fails with a
# length mismatch whenever some month has no updates in the data.
# int() guards against the month numbers arriving as floats.
df.index = [cat[int(month) - 1] for month in df.index]
df = df.reset_index()
df.columns = ["Month", "Count"]
df
Out[32]:
In [33]:
# Bar chart of update counts, one bar per month.
p = Bar(
    df,
    "Month",
    values="Count",
    xlabel="Month",
    ylabel="Number of Updates",
    title="App updates by Month of Year"
)
In [34]:
# Display the bar chart built in the previous cell.
show(p)
In [ ]:
### Work in Progress
In [ ]: