In [11]:
#Import libraries 
import pandas as pd
import numpy as np 
import os
import json 
os.getcwd()


Out[11]:
'C:\\dev\\Classes\\Gitclone\\09-12-2017-GW-Arlington-Class-Repository-DATA\\Class_Activities\\04-Pandas'

In [4]:
#Import data 
# json.load
# json.loads (it use of only strings)

json_path = "purchase_data.json"
p_data_df = pd.read_json(json_path)
p_data_df.head()
p_data_df.columns
#p_data_df.dtypes

#data = pd.read_json("http://localhost:8840/edit/Homework/Week04/Instructions/HeroesOfPymoli/purchase_data.json")
#ata = json.load(open('http://localhost:8840/edit/Homework/Week04/Instructions/HeroesOfPymoli/purchase_data.json', 'r'))


Out[4]:
Index(['Age', 'Gender', 'Item ID', 'Item Name', 'Price', 'SN'], dtype='object')

In [ ]:
#total number of players
#TotalPlayers = p_data_df["SN"].count()
TotalPlayers = p_data_df["SN"].unique()
TotalUniquePlayers = len(TotalPlayers)

In [ ]:
#Purchasing Analysis (Total)
#Number of Unique Items
UniqueItems = p_data_df["Item ID"].unique()
NumberUniqueItems = len(UniqueItems)

#Average Purchase Price
averagePurchase = p_data_df["Price"].mean()

#Total Number of Purchases
TotalNumberPurchases = len(p_data_df["Price"])

#Total Revenue
TotalRevenue = sum(p_data_df["Price"])

#p_data_df.describe()
#p_data_df.counts()
#type(p_data_df)

Gender Demographics

Percentage and Count of Male Players

Percentage and Count of Female Players

Percentage and Count of Other / Non-Disclosed


In [16]:
#demographics = p_data_df["Gender"].value_counts()
#demographics

total_gender = p_data_df["Gender"].count()
male = p_data_df["Gender"].value_counts()['Male']
female = p_data_df["Gender"].value_counts()['Female']
non_gender_specific = total_gender - male - female
male_percent = (male/total_gender) * 100
female_percent = (female/total_gender) * 100
non_gender_specific_percent = (non_gender_specific/total_gender) * 100

#Everything to a table 
GenderDemographics = pd.DataFrame({"Total Gender": [total_gender],
                                   "Total Male": [male],
                                   "% attended Male": [male_percent],
                                   "Total Female": [female],
                                   "% attended Female": [female_percent],
                                   "% Non Gender Specific":[non_gender_specific],
                                   "% Non Gender Specific Percent":[non_gender_specific_percent],
                                   
                                 })
GenderDemographics
#percentage = pd.DataFrame.((demographics/NumberUniqueItems)*100)
#percentage


#genderdemographics = p_data_df.groupby("Gender").count()
#genderdemographics
#How to change female / male / other to rows 
#gendercount = p_data_df(gender["Age"].value_counts())
#gendercount.heads()
#genderpercentagefemale = TotalUniquePlayers /


Out[16]:
% Non Gender Specific % Non Gender Specific Percent % attended Female % attended Male Total Female Total Gender Total Male
0 1 1.282051 16.666667 82.051282 13 78 64
  1. Purchasing Analysis (Gender) #How to combine them in one single table? The below each broken by gender
  • Purchase Count (.count)
  • Average Purchase Price (.mean)
  • Total Purchase Value (.sum)
  • Normalized Totals (?????)

In [ ]:
genderdf = p_data_df.groupby("Gender")
genderPurchasing = pd.DataFrame(genderdf["Price"].value_counts())
genderPurchasing = pd.DataFrame(genderdf["Price"].count())
genderPurchasingAverage = pd.DataFrame(genderdf["Price"].mean())
genderPurchasingTotal = pd.DataFrame(genderdf["Price"].sum())
genderPurchasing

Age Demographics

The below each broken into bins of 4 years (i.e. <10, 10-14, 15-19, etc.) Purchase Count Average Purchase Price Total Purchase Value Normalized Totals


In [9]:
agedf = p_data_df.groupby("Age")
agePurchasing = pd.DataFrame(agedf["Price"].value_counts())


Out[9]:
Price
count 78.0
mean 1.0
std 0.0
min 1.0
25% 1.0
50% 1.0
75% 1.0
max 1.0

Top Spenders

Identify the the top 5 spenders in the game by total purchase value, then list (in a table): SN Purchase Count Average Purchase Price Total Purchase Value


In [5]:
spenders = p_data_df.groupby("SN")
#Why meand and sum are the same? 
#spenders.head()
topSpenders = pd.DataFrame(spenders["Price"].count())
topSpenders.head()
topSpendersmeand = pd.DataFrame(spenders["Price"].mean())
topSpendersmeand.head()
#genderPurchasing = pd.DataFrame(genderdf["Price"].value_counts())


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-b141b11344df> in <module>()
      2 #Why meand and sum are the same?
      3 #spenders.head()
----> 4 topSpenders = pd.DataFrame(spenders["Price"] > 4)
      5 topSpenders.head()
      6 topSpendersmeand = pd.DataFrame(spenders["Price"].mean())

TypeError: '>' not supported between instances of 'SeriesGroupBy' and 'int'
# by trying to filter by the max I did this topSpenders = pd.DataFrame(spenders["Price"] > 4) = '>' not supported between instances of 'SeriesGroupBy' and 'int'

In [ ]:
#Final Report 
#print ("The total number of players for Pymoli game is " + str(TotalUniquePlayers))
#print ("The total number of unique items for Pymoli game is " + str(NumberUniqueItems))
#print ("The average Purchase price for Pymoli game is (USD) " + str(averagePurchase))
#print ("The total revenue  is (USD) " + str(TotalRevenue))