In [1]:
# Sanity check: run this cell to confirm the kernel executes code (it should display 3).
1+2
Out[1]:
In [3]:
# Uncomment next command if you need to install a missing module
#!pip install statsmodels
import matplotlib.pyplot as plt
import pandas as pd
try:
import statsmodels.api as sm
except:
!pip install statsmodels
import numpy as np
%matplotlib inline
In [4]:
# Show the version string of the Python interpreter running this kernel.
import sys
print(sys.version)
In [5]:
import sys
if sys.platform.startswith('linux'):
!ls
elif sys.platform.startswith('freebsd'):
!ls
elif sys.platform.startswith('darwin'):
!ls
elif sys.platform.startswith('win'):
!dir
To download the data, we will use !wget (on DataScientistWorkbench)
In [6]:
if sys.platform.startswith('linux'):
!wget -O /resources/customer_dbase_sel.csv http://analytics.romanko.ca/data/customer_dbase_sel.csv
customer_dbase_sel.csv: We have downloaded an extract from the IBM SPSS sample dataset with customer data, customer_dbase_sel.csv, which contains customer-specific data such as age, income, credit card spending, commute type and time, etc. Dataset source
In [7]:
# Read the customer extract straight from its URL into a DataFrame.
url = "http://analytics.romanko.ca/data/customer_dbase_sel.csv"
df = pd.read_csv(filepath_or_buffer=url)

## Alternative on DataScientistWorkbench: read the copy in /resources
#df = pd.read_csv("/resources/customer_dbase_sel.csv")

# Preview the first 5 rows of the dataset
df.head(n=5)
Out[7]:
In [8]:
# Summarize the data: descriptive statistics of the columns in df
df.describe()
Out[8]:
In [9]:
# Number of rows and columns in the data, as a (rows, columns) tuple
df.shape
Out[9]:
In [10]:
# Display the column names (the labels available for selection below)
df.columns
Out[10]:
In [11]:
# Label each customer as high-income (1) when annual_income exceeds 30000
# and as low-income (0) otherwise, using a vectorized comparison.
earns_above_threshold = df['annual_income'] > 30000
df['income_category'] = earns_above_threshold.astype('int64')

# Show the income next to its label for the first rows.
df.loc[:, ['annual_income', 'income_category']].head()
Out[11]:
In [12]:
# Keep only the four columns that are explored visually below.
viz = df.loc[:, ['cardspent', 'debtinc', 'carditems', 'commutetime']]
viz.head(n=5)
Out[12]:
In [13]:
# Descriptive statistics for the four columns selected into viz
viz.describe()
Out[13]:
Drop NaN (Not-a-Number) observations:
In [14]:
# Remove the rows whose commutetime is missing, then count what is left.
commute_observed = df[['commutetime']].dropna()
commute_observed.count()
Out[14]:
Print observations with NaN commutetime:
In [15]:
# Boolean mask marking the rows where commutetime is NaN, then print those rows.
commute_missing = np.isnan(df["commutetime"])
print(df.loc[commute_missing])
In [16]:
# One histogram per column of the viz subset.
hist_axes = viz.hist()
plt.show()
In [17]:
# Histogram of the amount spent on the primary credit card.
df.hist(column='cardspent')
plt.show()
In [18]:
# Histogram of the commute time.
df.hist(column='commutetime')
plt.show()
In [19]:
from scipy import stats
A confidence interval tells us how close we think the sample mean is to the true (population) mean, at a stated level of confidence.
We compute mean mu, standard deviation sigma and the number of observations N in our sample of the debt-to-income ratio:
In [20]:
# Select the column as a Series (single brackets) so that mean and standard
# deviation are plain floats that %-formatting accepts. Reducing the
# one-column DataFrame can instead give a one-element Series, depending on
# the pandas version.
# ddof=0 keeps the population standard deviation that np.std uses by default.
mu, sigma = df['debtinc'].mean(), df['debtinc'].std(ddof=0)
print("mean = %G, st. dev = %g" % (mu, sigma))
In [21]:
# Sample size for the confidence interval. Only non-missing debt-to-income
# values are counted, because pandas reductions such as mean/std skip NaN by
# default; len() would also count rows with a missing value.
N = int(df['debtinc'].count())
N
Out[21]:
The 95% confidence interval for the mean of N draws from a Normal distribution with mean mu and standard deviation sigma is
In [22]:
# Standard error of the mean: sample standard deviation scaled by sqrt(N).
standard_error = sigma / np.sqrt(N)

# Central 95% interval of a Normal distribution centred on mu with that spread.
conf_int = stats.norm.interval(0.95, loc=mu, scale=standard_error)
conf_int
Out[22]:
In [23]:
# The % formatting must be applied to the string inside the print() call:
# print() returns None, so formatting its result raises a TypeError in Python 3.
# Each bound is flattened to a plain float first, because conf_int may hold
# either scalars or one-element arrays depending on how mu/sigma were computed.
lower, upper = (float(np.ravel(bound)[0]) for bound in conf_int)
print("95%% confidence interval for the mean of debt to income ratio = [%g %g]" % (lower, upper))
Select columns by name:
In [24]:
# Analysis frame restricted to gender and the two measures compared below.
adf = df.loc[:, ['gender', 'cardspent', 'debtinc']]
print(adf.loc[:, 'gender'])
Compute means for cardspent and debtinc for the male and female populations:
In [25]:
# Group the rows by gender and print the per-group mean of each measure.
gender_data = adf.groupby(by='gender')
print(gender_data.mean())
Compute mean for cardspent for female population only:
In [26]:
# Mean card spending over the rows whose gender is 'Female'
# (row mask and column picked in a single .loc lookup).
adf.loc[adf['gender'] == 'Female', 'cardspent'].mean()
Out[26]:
We have seen above that the mean cardspent and debtinc in the male and female populations were different. To test if this is significant, we do a 2-sample t-test with scipy.stats.ttest_ind():
In [27]:
# Card spending split by gender.
female_card = adf.loc[adf['gender'] == 'Female', 'cardspent']
male_card = adf.loc[adf['gender'] == 'Male', 'cardspent']

# Two-sample t-test on the two groups: tc is the statistic, pc the p-value.
tc, pc = stats.ttest_ind(female_card, male_card)
print("t-test: t = %g p = %g" % (tc, pc))
In the case of amount spent on primary credit card, we conclude that men tend to charge more on their primary card (p-value = 2e-6 < 0.05, statistically significant).
In [28]:
# Debt-to-income ratio split by gender.
female_debt = adf[adf['gender'] == 'Female']['debtinc']
male_debt = adf[adf['gender'] == 'Male']['debtinc']

# Two-sample t-test on the two groups: td is the statistic, p_debt the p-value.
# The p-value must not be called `pd`: that name is the pandas module, and
# rebinding it breaks every later (or re-run) cell that calls pd.<something>.
td, p_debt = stats.ttest_ind(female_debt, male_debt)
print("t-test: t = %g p = %g" % (td, p_debt))
In the case of debt-to-income ratio, we conclude that there is no significant difference between men and women (p-value = 0.758 > 0.05, not statistically significant).
In [29]:
# Box plots of card spending, one box per gender, without grid lines and
# without drawing the outlier points.
boxplot_options = {
    'column': 'cardspent',
    'by': 'gender',
    'grid': False,
    'showfliers': False,
}
adf.boxplot(**boxplot_options)
plt.show()
In [30]:
gend = ['Female', 'Male']

# Local, seeded generator: the jitter is the same on every run and the global
# NumPy random state is left untouched.
jitter_rng = np.random.default_rng(0)

# Non-missing card spending per gender, reused for both the points and the
# boxes so that the two layers always describe the same observations.
card_by_gender = []
for position, label in enumerate(gend, start=1):
    y = adf.loc[adf['gender'] == label, 'cardspent'].dropna()
    card_by_gender.append(y)
    # Add some random "jitter" around the box position on the x-axis
    x = jitter_rng.normal(position, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.2)

# Boxes are drawn at positions 1..len(gend), matching the jittered points.
plt.boxplot(card_by_gender, labels=gend)
plt.ylabel("cardspent")
plt.ylim((-50, 850))
plt.show()
In [31]:
# Scatter plot of annual income against age, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df['age'], df['annual_income'])
ax.set_xlabel("Age")
ax.set_ylabel("Income")
plt.show()