In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mplc
import matplotlib.pyplot as plt
from bokeh import mpl
from bokeh.charts import Bar, Scatter, defaults, Histogram
from bokeh.plotting import figure, show
from bokeh.io import output_notebook


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [2]:
output_notebook()


BokehJS successfully loaded.

Playing with google play store datasets using pandas, bokeh and jupyter

@idwaker

Data Sample


In [3]:
apps = pd.DataFrame()
apps = pd.read_csv("../gapps.csv", index_col=None, parse_dates=['app_updated'], thousands=',')
apps.head()


Out[3]:
app_downloads_max app_updated app_rating_count app_name developer_address developer_email app_content_rating app_downloads_min app_genre developer_name developer_website app_rating app_price app_size app_link id app_min_os
0 5000000 2015-09-29 31139 Disney Junior Play 500 South Buena Vista Street\nBurbank, CA 9152... support@disneymobile.com Everyone 1000000 Educational Disney http://help.disney.com 3.7 0 36M https://play.google.com/store/apps/details?id=... com.disney.disneyjuniorplay_goo 4.0
1 5000000 2015-08-12 33760 Animals Farm For Kids Poland,\nPoznań 61-697 os. Wichrowe Wzgórze 28... kidsgamesprojects@gmail.com Everyone 1000000 Educational Kids Games Projects http://kidsgameprojects.com 4.1 0 29M https://play.google.com/store/apps/details?id=... pl.kidsgameprojects.com.AnimalsFarmForKids 2.3
2 50000 2014-10-13 199 Pet Link DELUXE NaN phanhaiduong80@gmail.com Everyone 10000 Board phanhaiduong80 NaN 4.2 0 4.1M https://play.google.com/store/apps/details?id=... com.fungalaxy.petlinkdeluxe 2.1
3 50000 2015-06-08 131 Aesops Fables stories for kids 610, Shiromani complex,\nAbove kandoi bhogilal... pratikmachchar@gmail.com Everyone 10000 Books & Reference Pratik Machchar http://vyaap.com 4.3 0 9.2M https://play.google.com/store/apps/details?id=... com.pratik.mobileapps.aesopfables.shortstories... 2.3.3
4 5000 2015-05-25 16 Маша и Медведь: Кто икнул? Bachemer Str. 210\n50935 Köln games@mashabear.ru Everyone 1000 Educational tekkon IPM GmbH http://games.mashabear.ru/ 3.4 $1.00 21M https://play.google.com/store/apps/details?id=... air.ru.mashabear.hiccup 2.2

Notes

  • app distribution with categories
  • top 20 app developers according to number of apps published
  • top 20 app developers according to aggregated mean downloads
  • top 20 app developers according to average rating of all apps
  • top 20 app developers with highest number of rating counts
  • top 20 most expensive apps
  • average ratings comparision between 2 or more categories

In [4]:
defaults.width = 800
defaults.height = 420

In [5]:
apps.describe()


Out[5]:
app_downloads_max app_rating_count app_downloads_min app_rating
count 7.922800e+04 75333.000000 7.922800e+04 75333.000000
mean 2.463210e+06 14616.394515 5.946835e+05 4.053591
std 6.674247e+07 253880.087074 1.394023e+07 0.597489
min 5.000000e+00 1.000000 1.000000e+00 0.000000
25% 1.000000e+03 17.000000 5.000000e+02 3.800000
50% 1.000000e+04 125.000000 5.000000e+03 4.100000
75% 1.000000e+05 1002.000000 5.000000e+04 4.400000
max 5.000000e+09 31881332.000000 1.000000e+09 5.000000

In [6]:
# create series with genre and counts
genre = apps.app_genre.value_counts()

# convert series to dataframe
genre = pd.DataFrame(genre)
genre.head()


Out[6]:
app_genre
Personalization 8472
Books & Reference 7303
Sports 5266
Tools 5161
Entertainment 4797

In [7]:
genre = genre.reset_index()
genre.columns = ["Categories", "Counts"]
genre.head()


Out[7]:
Categories Counts
0 Personalization 8472
1 Books & Reference 7303
2 Sports 5266
3 Tools 5161
4 Entertainment 4797

In [8]:
plot = Bar(genre, "Categories", values="Counts", color="wheat", 
           title="App distribution with categories")
show(plot)


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [9]:
devs_apps_count = apps.developer_name.value_counts()
devs_apps_count.head()


Out[9]:
Oceanhouse Media, Inc.    73
Google Inc.               71
MobiSystems               71
ZT.art                    69
romeLab                   65
Name: developer_name, dtype: int64

In [10]:
# convert series to dataframe
devs_apps_count = pd.DataFrame(devs_apps_count)
devs_apps_count = devs_apps_count.reset_index()
devs_apps_count.head()


Out[10]:
index developer_name
0 Oceanhouse Media, Inc. 73
1 Google Inc. 71
2 MobiSystems 71
3 ZT.art 69
4 romeLab 65

In [11]:
devs_apps_count.columns = ["Developers", "Count"]
devs_apps_count = devs_apps_count.head(20)
plot = Bar(devs_apps_count, "Developers", values="Count", color="wheat",
          title="Top 20 Developers with highest number of apps uploaded")
show(plot)


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [ ]:
# developer | average min downloads | average max downloads
# dev_average_downloads = apps[["developer_name", "app_downloads_min", "app_downloads_max"]]
# dev_average_downloads.head()

In [ ]:
# from bokeh.charts import Histogram
# p = Histogram(dev_average_downloads.app_downloads_min, title="Download distribution")
# show(p)

In [ ]:
# dev_average_downloads = dev_average_downloads.groupby('developer_name')

In [ ]:
# average_downloads = pd.DataFrame()

# for n, g in dev_average_downloads:
#     data = {"developer": n}
#     lower = g.app_downloads_min.sum()
#     higher = g.app_downloads_max.sum()
#     data["downloads"] = np.mean((lower, higher))
#     average_downloads.append(data, ignore_index=True)

In [ ]:
# ["2.3": "2.2", "2.3.3": "4.0", "2.1", "4.0.3", "3.0", "1.6", "4.1", "1.5", "2.0", "4.3", "3.2"]
# df = apps.app_min_os.value_counts()[:14]
# df = pd.DataFrame(df)
# df = df.reset_index()
# df.columns = ["version", "count"]
# df = df.set_index("version")
# #df[["version"]] = df[["version"]].astype(float)
# df

In [12]:
apps.app_min_os.describe()


Out[12]:
count     79874
unique       70
top         2.3
freq      15745
Name: app_min_os, dtype: object

In [13]:
%matplotlib inline

In [14]:
df = apps.app_min_os.value_counts()[:14]
df = pd.DataFrame(df)
df = df.reset_index()
df.columns = ["version", "count"]
df = df.set_index("version")
# #df[["version"]] = df[["version"]].astype(float)
df


Out[14]:
count
version
2.3 15745
2.2 13996
2.3.3 10285
4.0 9149
2.1 7299
4.0.3 4350
3.0 3928
1.6 3541
Varies with device 2599
4.1 2491
1.5 2177
2.0 851
4.3 636
3.2 587

In [15]:
df = apps[["app_rating", "app_genre", "app_rating_count"]]
df = df.groupby('app_genre').mean()
df = df.reset_index()
df[["app_rating_count"]] = df[["app_rating_count"]] / 1000
df


Out[15]:
app_genre app_rating app_rating_count
0 Action 4.011181 104.134200
1 Adventure 4.097447 28.195745
2 Arcade 4.041518 59.328449
3 Board 4.120513 13.869318
4 Books & Reference 4.173396 1.924583
5 Business 3.997228 6.026630
6 Card 4.037453 29.860734
7 Casino 4.206897 31.942126
8 Casual 3.929950 55.386864
9 Comics 4.105493 2.125908
10 Communication 4.021654 108.735026
11 Education 4.040881 2.530699
12 Educational 3.948350 1.991588
13 Entertainment 4.005387 10.359418
14 Finance 4.007984 2.669704
15 Health & Fitness 4.002206 5.344012
16 Libraries & Demo 4.065567 1.117482
17 Lifestyle 4.046205 4.349274
18 Media & Video 3.848709 14.090607
19 Medical 4.007736 0.263632
20 Music 3.991935 2.353355
21 Music & Audio 4.107416 14.102717
22 News & Magazines 4.033912 6.457827
23 Personalization 4.204609 3.927539
24 Photography 4.007900 41.435332
25 Productivity 4.083363 20.985893
26 Puzzle 4.066897 17.188863
27 Racing 3.934063 139.357741
28 Role Playing 4.100704 38.847852
29 Shopping 3.974662 29.719652
30 Simulation 3.804110 52.400362
31 Social 4.058929 84.588328
32 Sports 4.061368 6.603193
33 Strategy 4.137278 112.142562
34 Tools 4.046820 13.443610
35 Transportation 3.953732 1.060029
36 Travel & Local 3.981950 14.982025
37 Trivia 4.041410 42.324718
38 Weather 4.026106 2.673707
39 Word 4.105648 34.428100

In [16]:
plot = Scatter(df, x='app_rating', y='app_rating_count', color='app_genre', xlabel='Mean Rating', 
               ylabel='Mean Rating Count (in Thousands)', legend='top_left', title='Frequency Distribution of Categories with Rating')


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [17]:
show(plot)



In [18]:
df = apps[["app_content_rating", "app_downloads_min", "app_rating"]]
df = df.groupby('app_content_rating').mean()
df = df.reset_index()
df[["app_downloads_min"]] = df[["app_downloads_min"]] / 1000
df.columns = ["Content Rating", "Mean Downloads", "Mean App Rating"]
df


Out[18]:
Content Rating Mean Downloads Mean App Rating
0 Adults only 18+ 463.273077 3.900000
1 Everyone 598.845695 4.098742
2 Everyone 10+ 1710.798305 4.136165
3 Mature 17+ 921.755529 4.126981
4 Teen 3148.191777 4.094936
5 Unrated 71.411608 3.907865

In [19]:
plot = Scatter(df, x='Mean App Rating', y='Mean Downloads', color='Content Rating',
              legend='top_left', title="Relation of Content Rating with Downloads and App Rating")


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [20]:
show(plot)



In [21]:
df = apps.app_min_os.convert_objects(convert_numeric=True).dropna()


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':

In [22]:
plt.rcParams['figure.figsize'] = (12.0, 6.0)

Kernel Density Plot for Minimum os requirement on apps


In [23]:
sns.kdeplot(df, shade=True)


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x106503908>

App Rating Histogram


In [24]:
df = apps[["app_rating"]]
p = Histogram(df, xlabel="Rating", ylabel="Count", bins=10, title="App Rating Histogram")
show(p)



In [25]:
df = apps[["app_rating", "app_genre"]]
df = df[(df["app_genre"] == "Educational") | (df["app_genre"] == "Books & Reference")]
df = df.dropna()
# df.head()
p = Histogram(df, values="app_rating", color="app_genre", legend=True,
              xlabel="Rating", ylabel="Count", bins=10, title="Educational and Books & Reference comparision")


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [26]:
show(p)



In [27]:
disney = apps[apps["developer_name"] == "Disney"]
disney.head()


Out[27]:
app_downloads_max app_updated app_rating_count app_name developer_address developer_email app_content_rating app_downloads_min app_genre developer_name developer_website app_rating app_price app_size app_link id app_min_os
0 5000000 2015-09-29 31139 Disney Junior Play 500 South Buena Vista Street\nBurbank, CA 9152... support@disneymobile.com Everyone 1000000 Educational Disney http://help.disney.com 3.7 0 36M https://play.google.com/store/apps/details?id=... com.disney.disneyjuniorplay_goo 4.0
560 100000 2015-05-14 1532 Where's My Mickey? XL 500 South Buena Vista Street\nBurbank, CA 9152... support@disneymobile.com Everyone 50000 Puzzle Disney http://help.disney.com 4.2 $2.99 83M https://play.google.com/store/apps/details?id=... com.disney.wheresmymickey_tab_goo 2.3.3
570 100000 2013-12-04 1264 Disney Fairies: Lost & Found 500 South Buena Vista Street\nBurbank, CA 9152... support@disneymobile.com Everyone 50000 Puzzle Disney http://help.disney.com 3.6 $0.99 88M https://play.google.com/store/apps/details?id=... com.disney.lostandfound 2.3.3
573 5000000 2014-01-14 80202 Where's My Water? T-Mo Edition 500 South Buena Vista Street\nBurbank, CA 9152... support@disneymobile.com Everyone 1000000 Puzzle Disney http://help.disney.com 4.4 0 49M https://play.google.com/store/apps/details?id=... com.disney.WMW_TMobile 4.0
580 100000000 2015-05-21 1300424 Where's My Water? 2 500 South Buena Vista Street\nBurbank, CA 9152... support@disneymobile.com Everyone 50000000 Puzzle Disney http://help.disney.com 4.1 0 69M https://play.google.com/store/apps/details?id=... com.disney.wheresmywater2_goo 2.3.3

In [28]:
dfree = disney[["app_price", "app_rating"]]
dfree.ix[dfree.app_price == "0", "app_price"] = "Free"
dfree.ix[dfree.app_price != "Free", "app_price"] = "Paid"
dfree


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/pandas/core/indexing.py:426: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Out[28]:
app_price app_rating
0 Free 3.7
560 Paid 4.2
570 Paid 3.6
573 Free 4.4
580 Free 4.1
608 Paid 4.1
609 Paid 4.6
610 Paid 4.7
1035 Free 3.7
1036 Free 4.4
1191 Free 4.2
1192 Free 4.4
1193 Free 4.2
1194 Free 3.8
1195 Free 4.1
1196 Free 3.9
1197 Free 4.3
1233 Paid 4.2
1481 Paid 4.4
1770 Paid 3.5
1772 Paid 4.3
2831 Free 4.0
3559 Free 4.0
5377 Free 4.3
5861 Free 4.4
46455 Free 4.5
60829 Paid 3.5
65047 Free 4.1

In [29]:
p = Histogram(dfree, values="app_rating", color="app_price", legend=True, bins=10,
             title="Ratings on Free and Paid by Disney")


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [30]:
show(p)



In [31]:
df = apps.app_updated.dt.month.value_counts()
df = df.sort_index()
df = pd.DataFrame(df)
df


Out[31]:
app_updated
1 4091
2 3774
3 4412
4 4516
5 4861
6 5544
7 6759
8 7678
9 8525
10 11140
11 14284
12 4290

In [32]:
cat = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
df.index = cat
df = df.reset_index()
df.columns = ["Month", "Count"]
df


Out[32]:
Month Count
0 Jan 4091
1 Feb 3774
2 Mar 4412
3 Apr 4516
4 May 4861
5 Jun 5544
6 Jul 6759
7 Aug 7678
8 Sep 8525
9 Oct 11140
10 Nov 14284
11 Dec 4290

In [33]:
p = Bar(df, "Month", values="Count", xlabel="Month", ylabel="Number of Updates", 
        title="App updates by Month of Year")


/Users/diwaker/miniconda3/envs/notebook/lib/python3.5/site-packages/bokeh/charts/_attributes.py:78: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  df = df.sort(columns=columns)

In [34]:
show(p)



In [ ]:
### Work in Progress

In [ ]: