In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# original author's machine; consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# Load the cleaned app data and drop fully duplicated rows in one step.
app = pd.read_pickle('/Users/krystal/Desktop/app_clean.p').drop_duplicates()
app.head()
Out[2]:
In [97]:
# Number of rows in the de-duplicated frame.
len(app)
Out[97]:
Categorical Variables
In this part, a frequency table is built for each categorical variable.
Category
In [31]:
def frequecy_table(var_name, data=None):
    """Build a frequency table for one categorical column.

    Parameters
    ----------
    var_name : str
        Name of the column to tabulate.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``app`` frame is used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, most frequent first, with three columns:
        ``'index'`` (the value), ``var_name`` (how often it occurs) and
        ``'percentage'`` (its count divided by the total count, as a
        fraction between 0 and 1). Missing values are not counted, because
        ``value_counts`` drops them by default.
    """
    frame = app if data is None else data
    counts = frame[var_name].value_counts()
    # Name both columns explicitly. A bare reset_index() produces
    # ('index', var_name) on older pandas but (var_name, 'count') on
    # pandas >= 2.0, which would make the percentage below divide the
    # labels instead of the counts.
    table = counts.rename_axis('index').reset_index(name=var_name)
    table['percentage'] = table[var_name] / table[var_name].sum()
    return table


# Correctly spelled alias; the original (misspelled) name is kept for callers.
frequency_table = frequecy_table
In [32]:
# Counts and shares of each distinct value in the 'category' column.
frequecy_table('category')
Out[32]:
Multiple Languages
In [33]:
# Counts and shares of each distinct value in the 'multiple languages' column.
frequecy_table('multiple languages')
Out[33]:
Price
In [34]:
# Counts and shares of each distinct value in the 'price' column.
frequecy_table('price')
Out[34]:
We can see that most apps are free; as a result, we may discard the variable 'price' in the following analysis.
Multiple Devices
In [35]:
# Counts and shares of each distinct value in the 'multiple devices' column.
frequecy_table('multiple devices')
Out[35]:
We can see that most apps support multiple devices; as a result, we may discard the variable 'multiple devices' in the following analysis.
Continuous Variables
In this part, the mean, variance, median, range, minimum and maximum are calculated for every continuous variable, and a density plot is drawn for each one. Only non-empty, strictly positive values are included in these calculations.
Current Rating
In [71]:
def statistics(var_name, data=None):
    """Summarise the strictly positive values of one numeric column.

    Entries that are missing (``None``/NaN/``pd.NA``), equal to the empty
    string, or not greater than zero after conversion with ``float`` are
    ignored.

    Parameters
    ----------
    var_name : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``app`` frame is used.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``'mean'``, ``'var'``
        (population variance, i.e. ``ddof=0``), ``'median'``, ``'range'``
        (max - min), ``'min'`` and ``'max'``.

    Raises
    ------
    ValueError
        If an entry cannot be converted with ``float``, or if no positive
        value remains after filtering.
    """
    frame = app if data is None else data
    values = np.array(
        [
            float(raw)
            for raw in frame[var_name]
            # pd.isna goes first: float(None) raises TypeError and
            # ``pd.NA != ''`` cannot be evaluated as a boolean.
            if not pd.isna(raw) and raw != '' and float(raw) > 0
        ],
        dtype=float,
    )
    if values.size == 0:
        # Fail with a readable message instead of numpy's
        # "zero-size array to reduction operation" error.
        raise ValueError(
            'no positive numeric values found in column %r' % (var_name,)
        )

    lowest, highest = values.min(), values.max()
    summary = {
        'mean': values.mean(),
        'var': values.var(),
        'median': np.median(values),
        'range': highest - lowest,
        'min': lowest,
        'max': highest,
    }
    return pd.DataFrame([summary])
In [72]:
# Summary statistics of the positive, non-empty 'current rating' values.
statistics('current rating')
Out[72]:
In [92]:
def plot_density(var_name, data=None):
    """Draw a kernel density plot of the strictly positive values of a column.

    Entries that are missing (``None``/NaN/``pd.NA``), equal to the empty
    string, or not greater than zero after conversion with ``float`` are
    ignored.

    Parameters
    ----------
    var_name : str
        Name of the column to plot; also used for the legend and the title.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``app`` frame is used.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If an entry cannot be converted with ``float``, or if fewer than two
        distinct positive values remain (a density cannot be estimated).
    """
    frame = app if data is None else data
    values = [
        float(raw)
        for raw in frame[var_name]
        # pd.isna goes first: float(None) raises TypeError and
        # ``pd.NA != ''`` cannot be evaluated as a boolean.
        if not pd.isna(raw) and raw != '' and float(raw) > 0
    ]
    if len(set(values)) < 2:
        # Check up front so the failure is readable rather than an error
        # raised from inside the plotting / KDE machinery.
        raise ValueError(
            'need at least two distinct positive values in column %r '
            'to estimate a density' % (var_name,)
        )

    # Draw on an explicit Axes so the plot never lands on a leftover figure.
    _fig, ax = plt.subplots()
    pd.Series(values, name=var_name).plot(kind="density", ax=ax)
    ax.legend(labels=[var_name], loc='upper left')
    ax.set_title('Distribution of %s' % (var_name))
    ax.set_xlabel(var_name)
    plt.show()
In [93]:
# Density plot of the positive, non-empty 'current rating' values.
plot_density('current rating')
Current Reviews
In [73]:
# Summary statistics of the positive, non-empty 'current reviews' values.
statistics('current reviews')
Out[73]:
In [94]:
# Density plot of the positive, non-empty 'current reviews' values.
plot_density('current reviews')
Overall Rating
In [74]:
# Summary statistics of the positive, non-empty 'overall rating' values.
statistics('overall rating')
Out[74]:
In [95]:
# Density plot of the positive, non-empty 'overall rating' values.
plot_density('overall rating')
Overall Reviews
In [75]:
# Summary statistics of the positive, non-empty 'overall reviews' values.
statistics('overall reviews')
Out[75]:
In [96]:
# Density plot of the positive, non-empty 'overall reviews' values.
plot_density('overall reviews')
We can see that only the variables 'current reviews' and 'overall reviews' approximately follow a normal distribution.
In [ ]: