In [3]:
#I recommend installing Anaconda for ease:
#https://www.continuum.io/downloads
#certain packages i.e. seaborn, do not come with the standard installation. A simple search for
#how to 'install seaborn with anaconda' will easily find the installation instructions.
#i.e. in this case:
#http://seaborn.pydata.org/installing.html
#conda install seaborn
In [ ]:
#With regards to the data, and the study during which this data was collected, rights belong to:
#Friedmann, Peter. Criminal Justice Drug Abuse Treatment Studies (CJ-DATS): Step 'N Out, 2002-2006 [United States].
#ICPSR30221-v1. Ann Arbor, MI: Inter-university Consortium for Political and Social Research [distributor], 2011-07-27.
#They made the entire dataset openly available at:
#http://doi.org/10.3886/ICPSR30221.v1
In [4]:
#indicate the directory where your data folder, in this case the file 'stepnout.csv', is located
#in this case, it sits in a folder on the dropbox cloud.
cd Dropbox/Data visualization/Course 5/_32ec92f8b8c6d83a374ae0989bec1447_StepNOut
In [252]:
#import the packages we will use
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import statsmodels.api as sm
import scipy.stats
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.linear_model import LassoLarsCV
from sklearn.cluster import KMeans
%matplotlib inline
sns.set(style="whitegrid", color_codes=True)
In [192]:
#load the full dataset
full_dataset = pd.read_csv('stepnout.csv')
In [231]:
#show the first ten rows of the dataset to get an idea of the variables and their values
full_dataset.head(10)
Out[231]:
In [8]:
#rename all columns to small caps
full_dataset.rename(columns = lambda x: x.lower(), inplace=True)
In [9]:
#same result, different way of doing it
#full_dataset.columns = map(str.lower, full_dataset.columns)
In [10]:
#same result, different way of doing it
#df = df.rename(columns=lambda x: x.replace('$', ''))
In [11]:
#try to get all columns, one sees there are 691
full_dataset.columns
Out[11]:
In [12]:
#print all columns/variables
print(full_dataset.describe().to_string())
In [13]:
#initial exploratory data analysis
#frequency distributions for categorical variables
#graphical representations
#calculations of center and spread for quantitative variables
In [14]:
#we select 17 variables of interest
reduced_dataset = full_dataset[['cond', 'csex', 'age', 'clive', 'majsup', 'allarrests', 'anyarrest', 'alldrugs', 'anydrugs', 'allcrimes', 'anycrime',
'arrest_9mo', 'reincarc_9mo', 'num_arrest', 'num_reincarc', 'violent_charge',
'property_charge']]
In [15]:
#we take a look at the first 30 rows
reduced_dataset.head(30)
Out[15]:
In [16]:
#we check the length of the dataset
print(len(reduced_dataset))
In [17]:
#we confirm the number of columns
print(len(reduced_dataset.columns))
In [18]:
#we rename some of the columns to give them more meaningful names
reduced_dataset.rename(columns = {'csex': 'sex', 'clive': 'living_situation', 'majsup': 'support'}, inplace=True)
In [19]:
#convert all variables to numeric ones
In [193]:
#mark all variables as numeric data, and signify, for the relevant ones, that they are categorical
#rather than quantitative variables
#errors='coerce' tells pandas to return invalid values as NaN rather than as the input values themselves
reduced_dataset['cond'] = pd.to_numeric(reduced_dataset['cond'], errors='coerce').astype('category')
reduced_dataset['sex'] = pd.to_numeric(reduced_dataset['sex'], errors='coerce').astype('category')
reduced_dataset['age'] = pd.to_numeric(reduced_dataset['age'], errors='coerce')
reduced_dataset['living_situation'] = pd.to_numeric(reduced_dataset['living_situation'], errors='coerce').astype('category')
reduced_dataset['support'] = pd.to_numeric(reduced_dataset['support'], errors='coerce').astype('category')
reduced_dataset['allarrests'] = pd.to_numeric(reduced_dataset['allarrests'], errors='coerce')
reduced_dataset['anyarrest'] = pd.to_numeric(reduced_dataset['anyarrest'], errors='coerce').astype('category')
reduced_dataset['alldrugs'] = pd.to_numeric(reduced_dataset['alldrugs'], errors='coerce')
reduced_dataset['anydrugs'] = pd.to_numeric(reduced_dataset['anydrugs'], errors='coerce').astype('category')
reduced_dataset['allcrimes'] = pd.to_numeric(reduced_dataset['allcrimes'], errors='coerce')
reduced_dataset['anycrime'] = pd.to_numeric(reduced_dataset['anycrime'], errors='coerce').astype('category')
reduced_dataset['arrest_9mo'] = pd.to_numeric(reduced_dataset['arrest_9mo'], errors='coerce').astype('category')
reduced_dataset['reincarc_9mo'] = pd.to_numeric(reduced_dataset['reincarc_9mo'], errors='coerce').astype('category')
reduced_dataset['num_arrest'] = pd.to_numeric(reduced_dataset['num_arrest'], errors='coerce')
reduced_dataset['num_reincarc'] = pd.to_numeric(reduced_dataset['num_reincarc'], errors='coerce')
reduced_dataset['violent_charge'] = pd.to_numeric(reduced_dataset['violent_charge'], errors='coerce').astype('category')
reduced_dataset['property_charge'] = pd.to_numeric(reduced_dataset['property_charge'], errors='coerce').astype('category')
In [21]:
#now let's look at the counts and frequency distributions for these variables
#for this, we will aim to get an idea of the shape, center, and spread of these variables
#we will analyze the shape visually by checking for modality and skewness
#we will check for measure of center such as mean, median and mode
#we will check the spread through the standard deviation
In [194]:
reduced_dataset['cond'].value_counts(sort=False, normalize = True)
Out[194]:
In [148]:
reduced_dataset['cond'].describe()
Out[148]:
In [24]:
#rules for visualizing data:
#for visualizing a variable:
#if it is categorical we use a bar chart i.e. sns's countplot function
#if it is quantitative, we can combine a kernel density estimate and a histogram with sns's
#distplot function
#for visualizing two variables:
# C-C: bivariate bar graph with sns factorplot
# C-Q: bivariate bar graph with sns factorplot
# Q-Q: scatterplot with sns regplot
In [25]:
#given that the study condition is a categorical variable, we use a count plot to visualize it.
sns.countplot(x='cond', data=reduced_dataset)
Out[25]:
In [26]:
#we check the counts per each value of the variable. sort=False tells pandas not to sort the results
#by values. normalize = True tells it to return the relative frequencies rather than the absolute counts
reduced_dataset['sex'].value_counts(sort=False, normalize=True)
Out[26]:
In [27]:
#print all age values as a list to look through them
lis = [x for x in reduced_dataset['age']]
print(lis)
In [28]:
#the describe request gives us the count, mean, std, min, max, as well as the quartiles for the
#respective value distribution
reduced_dataset['age'].dropna().describe()
Out[28]:
In [29]:
#given that age is a quantitative variable, we use a distplot to visualize it.
sns.distplot(reduced_dataset['age'].dropna())
Out[29]:
In [32]:
#We use a regplot to plot two quantitative variables, age and allcrimes, while also having a regression line suggesting
#any association present
sns.regplot(x='age', y='allcrimes', data=reduced_dataset)
Out[32]:
In [33]:
#categorical explanatory variable 'sex' and quantitative response variable 'allcrimes'
sns.factorplot(x='sex', y='allcrimes', data=reduced_dataset, kind='bar', ci=None)
Out[33]:
In [38]:
sns.factorplot(x='sex', y='allcrimes', kde='bar', data=reduced_dataset)
Out[38]:
In [39]:
#before starting to manipulate the dataset itself, we make a copy, and will work on the copy
#rather than the original reduced dataset
rdc = reduced_dataset.copy()
In [40]:
ct1 = rdc.groupby('anyarrest').size()*100/len(rdc['anyarrest'])
print(ct1)
In [41]:
rdc['anyarrest'].value_counts(sort=False, dropna=False, normalize=True)
Out[41]:
In [42]:
rdc['cond'].isnull().value_counts()
Out[42]:
In [310]:
rdc['anyarrest'].isnull().sum()
Out[310]:
In [46]:
rdc['anyarrest'] = rdc['anyarrest'].replace(11, np.nan)
In [47]:
rdc['anyarrest'].value_counts()
Out[47]:
In [48]:
living_dic = {1: 1, 2:2, 3:1, 4:1, 5:1, 6:1, 7:1, 8:2}
In [49]:
rdc['living_situation'] = rdc['living_situation'].map(living_dic)
In [152]:
sns.swarmplot('living_situation', data=reduced_dataset)
plt.xlabel('Living Situation')
plt.title('Distribution of the study sample by type of living arrangement')
Out[152]:
In [142]:
sns.factorplot(x='living_situation', y='allcrimes', data=rdc)
Out[142]:
In [149]:
sns.countplot('living_situation', data=rdc)
plt.xlabel('Living Situation')
plt.title('Distribution of the study sample by type of living arrangement')
Out[149]:
In [140]:
rdc['age'].describe()
Out[140]:
In [51]:
def age_group(x):
if x < 30:
return 20
elif x < 40:
return 30
elif x < 50:
return 40
elif x < 61:
return 50
In [52]:
rdc['age'] = rdc['age'].map(age_group)
In [53]:
rdc['age']
Out[53]:
In [141]:
#describing the dataset/the variables of the dataset, after we group the values in the dataset
#by age category
rdc.groupby('age').describe()
Out[141]:
In [55]:
#quartile split
rdc['age_unprocessed'] = reduced_dataset['age']
print('Age - 4 categories - quartiles')
age_binned = rdc['age_binned'] = pd.qcut(rdc['age_unprocessed'], 4, labels = ["1=0%tile", "2=25%tile",
"3=50%tile", "4=75%tile"]).value_counts(sort=False, dropna=True)
print(age_binned)
In [56]:
#Hypothesis testing:
#1. specify the null hypothesis and the alternate hypothesis
#2. choose a sample
#3. assess the evidence
#4. draw conclusions
#Definition: assessing evidence provided by the data in favor or against each hypothesis
#about the population
#a result is statistically significant if it is unlikely to have occurred by chance
#p value is also the type 1 error rate: the number of times we would be wrong in rejecting the
#null hypothesis when it is true
#p=0.03: if we reject the null hypothesis, we would be correct 97/100 times.
#Bivariate statistical tools:
#ANOVA; chi-square; correlation coefficient
#ANOVA F Test: are the differences among the sample means due to true differences among the
#population means, or merely due to sampling variability
#F is the variation among samples means divided by the variation within groups
#for explanatory variables with multiples levels, F test and p value do not tell us why the
#group means are not equal, or how. there are many ways in which this can be the case.
#before performing these analyses, one needs to use the .dropna() function to include only
#valid data
In [57]:
#we will test the null hypothesis that the study condition (subject or control) and number of crimes are not related.
test1_data = rdc[['cond', 'allcrimes']].dropna()
len(test1_data)
Out[57]:
In [58]:
test1 = smf.ols(formula='allcrimes ~ C(cond)', data=test1_data).fit()
print(test1.summary())
In [59]:
#now we examine the means and stds
grouped1_mean = test1_data.groupby('cond').mean()
print(grouped1_mean)
In [60]:
grouped1_std = test1_data.groupby('cond').std()
print(grouped1_std)
In [61]:
#we repeat the same analysis with arrests
test2_data = rdc[['cond', 'allarrests']].dropna()
test2 = smf.ols(formula = 'allarrests ~ C(cond)', data=test2_data).fit()
print(test2.summary())
In [62]:
grouped2_mean = test2_data.groupby('cond').mean()
print(grouped2_mean)
In [63]:
grouped2_std = test2_data.groupby('cond').std()
print(grouped2_std)
In [64]:
#we now try to use the grouped age variable as well
rdc['age'] = rdc['age'].astype('category')
test3_data = rdc[['age', 'allcrimes']].dropna()
test3 = smf.ols(formula = 'allcrimes ~ C(age)', data=rdc).fit()
print(test3.summary())
In [65]:
#given that we have an explanatory categorical variable with multiple levels, we use the
#tuckey hsd test
tuckey1 = multi.MultiComparison(test3_data['allcrimes'], test3_data['age'])
res1 = tuckey1.tukeyhsd()
print(res1.summary())
In [66]:
#We will now test two other hypotheses:
#Hypothesis(0)(a): the study condition (0 or 1) and the committing of a crime are independent
#i.e. there is no relationship between them
#Hypothesis(0)(b): there is no relationship between age and being arrested during the study
#period
In [67]:
#contingency table of observed counts
#when creating contingency tables, we put the response variable first (therefore vertical in
#table), and the explanatory variable second, therefore horizontal at the top of the table.
ct1 = pd.crosstab(rdc['anycrime'], rdc['cond'])
print(ct1)
In [68]:
#column percentages
colsum = ct1.sum(axis=0)
colpct = ct1/colsum
print(colpct)
In [69]:
#chi square test
print('chi-square value, p value, expected counts')
cs1 = scipy.stats.chi2_contingency(ct1)
print(cs1)
In [70]:
#now the second hypothesis test
rdc['age'] = rdc['age'].astype('category')
rdc['anyarrest'] = rdc['anyarrest'].astype('category')
In [71]:
#contingency table of observed counts
ct2 = pd.crosstab(rdc['anyarrest'], rdc['age'])
print(ct2)
In [72]:
#column percentages
colsum2 = ct2.sum(axis=0)
colpct2 = ct2/colsum2
print(colpct2)
In [73]:
#chi square test
print('chi-square value, p value, expected counts')
cs2 = scipy.stats.chi2_contingency(ct2)
print(cs2)
In [74]:
#We therefore cannot reject the null hypothesis, but given that the explanatory variable has several levels,
#we cannot know why the null hypothesis was not rejected
#we therefore perform a 2 by 2 comparison
In [75]:
sub1 = rdc.copy()
recode1 = {20:20, 30:30}
sub1['comp1v'] = sub1['age'].map(recode1)
In [153]:
ct3 = pd.crosstab(sub1['cond'], sub1['comp1v'])
print(ct3)
In [ ]:
#column percentages
colsum3 = ct3.sum(axis=0)
colpct3 = ct3 / colsum3
print(colpct3)
In [ ]:
#chi square test
print('chi square value, p value, expected values')
cs3 = scipy.stats.chi2_contingency(ct3)
print(cs3)
In [ ]:
recode2 = {20:20, 40:40}
sub1['comp2v'] = sub1['age'].map(recode2)
In [ ]:
ct4 = pd.crosstab(sub1['anyarrest'], sub1['comp2v'])
print(ct4)
In [ ]:
colsum4 = ct4.sum(axis=0)
colpct4 = ct4/colsum4
print(colpct4)
In [ ]:
print('chi square value, p value, expected values')
cs4 = scipy.stats.chi2_contingency(ct4)
print(cs4)
In [ ]:
recode3 = {20:20, 50:50}
sub1['compv3'] = sub1['age'].map(recode3)
In [ ]:
ct5 = pd.crosstab(sub1['anyarrest'], sub1['compv3'])
print(ct5)
In [ ]:
colsum5 = ct5.sum(axis=0)
colpct5 = ct5/colsum5
print(colpct5)
In [ ]:
print('chi square value, p value, expected values')
cs5 = scipy.stats.chi2_contingency(ct5)
print(cs5)
In [ ]:
recode4 = {30:30, 40:40}
sub1['compv4'] = sub1['age'].map(recode4)
In [ ]:
ct6 = pd.crosstab(sub1['anyarrest'], sub1['compv4'])
print(ct6)
In [ ]:
colsum6 = ct6.sum(axis=0)
colpct6 = ct6 / colsum6
print(colpct6)
In [ ]:
print('chi square value, p value, expected values')
cs6 = scipy.stats.chi2_contingency(ct6)
print(cs6)
In [ ]:
recode6 = {30:30, 50:50}
sub1['compv6'] = sub1['age'].map(recode6)
In [ ]:
ct7 = pd.crosstab(sub1['anyarrest'], sub1['compv6'])
print(ct7)
In [ ]:
colsum7 = ct7.sum(axis=0)
colpct7 = ct7/colsum7
print(colpct7)
In [ ]:
print('chi square value, p value, expected values')
cs7 = scipy.stats.chi2_contingency(ct7)
print(cs7)
In [ ]:
recode7 = {40:40, 50:50}
sub1['compv7'] = sub1['age'].map(recode7)
In [ ]:
ct8 = pd.crosstab(sub1['anyarrest'], sub1['compv7'])
print(ct8)
In [ ]:
colsum8 = ct8.sum(axis=0)
colpct8 = ct8 / colsum8
print(colpct8)
In [ ]:
print('chi square value, p value, expected values')
cs8 = scipy.stats.chi2_contingency(ct8)
print(cs8)
In [ ]:
#now we will test whether there is a relationship between two quantitative variables, age_unprocessed and allcrimes
#for this we use the pearson correlation test
#r, going from -1 to 1 only tells us whether the two variables are linearly related. they may be related in nonlinear ways
#therefore it's always important to look at r in parallel with a scatterplot of the two variables
#r squared is a measure of how much variability in one variable can be explained by the other variable
#to calculate the pearson coefficient we need to remove all missing values
In [ ]:
rdc.columns
In [154]:
scat1 = sns.regplot(x='age_unprocessed', y='allcrimes', fit_reg=True, data=rdc)
plt.xlabel('Age')
plt.ylabel('Number of crimes')
plt.title('Relationship between age and number of crimes')
scat1
Out[154]:
In [ ]:
data_pearson_test = rdc[['age_unprocessed', 'allcrimes']].dropna()
In [ ]:
print('association between age and number of crimes')
print(scipy.stats.pearsonr(data_pearson_test['age_unprocessed'],
data_pearson_test['allcrimes']))
In [ ]:
#a moderator is a third variable that affects the direction and/or strength between your explanatory and response variables
#the question is, is our response variable associated with our explanatory variable, for each level of our third variable?
In [ ]:
#let's see if gender is a moderator variable for test group -> allcrimes
In [ ]:
sub11 = rdc[['cond', 'sex', 'allcrimes']].dropna()
In [ ]:
rdc['sex'].value_counts()
In [ ]:
sub12 = sub11[sub11['sex']==1]
sub13 = sub11[sub11['sex']==2]
In [ ]:
model_12 = smf.ols(formula='allcrimes ~ C(cond)', data=sub12).fit()
print(model_12.summary())
In [ ]:
model_13 = smf.ols(formula='allcrimes ~ C(cond)', data=sub13).fit()
print(model_13.summary())
In [ ]:
sub12.groupby('cond').mean()
In [ ]:
sub12.groupby('cond').std()
In [ ]:
sub13.groupby('cond').mean()
In [ ]:
sub13.groupby('cond').std()
In [ ]:
sns.factorplot(x='cond', y='allcrimes', kind='bar', data=sub12)
In [ ]:
sns.factorplot(x='cond', y='allcrimes', kind='bar', data=sub13)
In [ ]:
#we would test for moderator variables with the chi square test the same way
#divide the population into the sublevels of the third variables
#conduct a chi square test for each to see if the relationship is statistically significant for each level
#we would visualize it with a linegraph factorplot(kind='point')
In [ ]:
#to know if your data is obersevational or experimental, you ask if the explanatory variable was manipulated or not
#data reporting tells you what is happening, but data analysis tells you why it is happening
In [ ]:
#randomization works best as your sample size approaches infinity. for small sizes, imbalances in the groups
#can occur. if you check randomized studies, one of first steps is to check for imbalances between groups
#on covariates. this is also why we can conclude that variables are associated, but hardly that one causes the other
#statistical control: include unbalanced covariates as additional explanatory variables in the study.
#In a true experiment, 3 conditions:
#1. only one variable is manipulated
#2. we have a control group
#3. random assignment
#In theory, in this case one can determine causality
#Quasi experiment:
#1. only one variable is manipulated
#2. control group
#3. no random assignment; groups pre selected. i.e. drug users study.
#To improve a quasi experimental design: add confounding variables; have a control group; use a pre-test/post-test design
#confounder=control variable=covariate=third variable=lurking variable
In [ ]:
#identifying a confounding variable does not allow to establish causation, just to get closer to a causal connection.
#due to infinite number of possible lurking variables, observational studies cannot rly establish causation
#a lurking of confounding variable is a third variable that is associated with both the explanatory and response
#variables.
#i.e. x=firefighters; y=damage caused by a fire. plot would suggest more firefighters cuases more fire damage.
#in reality there is a third lurking variable that influences both, seriousness of the fire.
#In a study we want to demonstrate that our statistical relationship is valid even after controlling for confounders.
In [ ]:
#Linear regression:
#multivariate linear regression for quantitative response variable
#logistic regression for binary categorical response variable
#Assumptions:
#Normality: residuals from our linear regression model are normally distributed. if they are not,
#our model may be misspecified.
#Linearity: association between explanatory and response variable is linear
#Homoscedasticity (or assumption of constant variance): variability in the response variable is the same at all levels
#of the explanatory variable. i.e. if spread of residuals values increases as you move along x axis, assumption is
#false.
#Independence: observations are not correlated with each other. Longitudinal data can violate this assumption, as well
#as hierarchical nesting/clustering data i.e. looking at students by classes. this assumption is the most serious
#to be violated, and also cannot be fixed by modifying the variables. the data structure itself is the problem.
#We have to contend with:
#Multicollinearity: explanatory variables are highly correlated with each other. this can mess up your parameter estimates
#or make them highly unstable. Signs: 1. highly associated variable not significant. 2. negative regression coefficient
#that should be positive. 3. taking out an explanatory variable drastically changes the results.
#Outliers: can affect your regression line, meaning it will not fit the data as well as it should.
In [ ]:
#multiple regression model allows us to find the relationship between one explanatory variable and the
#reponse variable, while controlling (holding constant at 0) all the other variables.
#categorical sex (1 and 2); age restricted to 18-25 group: each variable needs to include a meaningful
#value of 0, so as to make it easier to interpret the coefficients
#for a categorical variable, we can just recode one of the values to be 0
#for a quantitative variable, we have to center it. Centering = subtracting the mean of a variable
#from the value of the variable. We are therefore recoding it so that its mean=0.
#Note: do not center the response variable.
In [ ]:
#We will create a multiple regression model, investigating the relationship between the study group 'cond', 'age', 'sex',
#and the quantitative response variable 'allcrimes'. We will then do the same for 'allarrests'.
#we will first center the explanatory variables. for categorical variables, one of the categories needs to be 0, for
#quantitative variables, we need to subtract the mean from each value.
In [81]:
rdc_linear = rdc[['cond', 'age_unprocessed', 'sex', 'allcrimes', 'allarrests']].dropna()
len(rdc_linear)
Out[81]:
In [83]:
rdc_linear['age_unprocessed_c'] = rdc['age_unprocessed']-rdc['age_unprocessed'].mean()
print(rdc_linear['age_unprocessed_c'].mean())
In [84]:
rdc_linear['sex'].value_counts()
Out[84]:
In [85]:
recode20 = {1:1, 2:0}
rdc_linear['sex_c'] = rdc_linear['sex'].map(recode20)
In [92]:
model20 = smf.ols(formula = 'allcrimes ~ C(cond)', data=rdc_linear).fit()
print(model20.summary())
In [93]:
model21 = smf.ols(formula = 'allcrimes ~ C(cond)+age_unprocessed_c', data=rdc_linear).fit()
print(model21.summary())
In [94]:
model22 = smf.ols(formula = 'allcrimes ~ C(cond)+age_unprocessed_c+C(sex)', data=rdc_linear).fit()
print(model22.summary())
In [95]:
model23 = smf.ols(formula = 'allcrimes ~ C(cond)+age_unprocessed_c + I(age_unprocessed_c**2)+C(sex)', data=rdc_linear).fit()
print(model23.summary())
In [97]:
model24 = smf.ols(formula = 'allarrests ~ C(cond)', data=rdc_linear).fit()
print(model24.summary())
In [99]:
model25 = smf.ols(formula = 'allarrests ~ C(cond)+age_unprocessed_c', data=rdc_linear).fit()
print(model25.summary())
In [103]:
model26 = smf.ols(formula = 'allarrests ~ C(cond)+age_unprocessed_c+C(sex)', data=rdc_linear).fit()
print(model26.summary())
In [102]:
model27 = smf.ols(formula = 'allarrests ~ C(cond)+age_unprocessed_c + I(age_unprocessed_c**2)+C(sex)', data=rdc_linear).fit()
print(model27.summary())
In [105]:
#group means and sd
print('Mean')
ds1 = rdc_linear.groupby('cond').mean()
print(ds1)
In [106]:
print('Standard Deviation')
ds2 = rdc_linear.groupby('cond').std()
print(ds2)
In [ ]:
#For each response variable, allcrimes and allarrests, we choose the model that gave us the highest overall
#explanatory power, and run further tests to check for evidence of model misspecification.
#If model is correctly specified, residuals are not correlated with explanatory variables.
#If data fails to meet regression assumptions, or misses explanatory variables, we have model misspecification.
#Intercept = value of response variable when all explanatory variables are held constant at 0
In [110]:
#Q-Q plot for normality
fig4 = sm.qqplot(model23.resid, line='r')
#red line represents residuals we would expect if model residuals were normally distributed
#our residuals below deviate significantly from red line, especially at lower and higher quantiles, meaning they do not
#follow a normal distribution. This means curvilinear association we saw is not fully explained by our model. We could add
#more explanatory variables.
In [ ]:
#normalizing or standardizing values makes them fit a standard normal distribution
In [111]:
#simple plot of residuals
stdres = pd.DataFrame(model23.resid_pearson)
plt.plot(stdres, 'o', ls='None')
l = plt.axhline(y=0, color='r')
plt.ylabel('Standardized Residual')
plt.xlabel('Observation Number')
#resid_pearson normalizes our model's residuals
#ls='none' means points will not be connected
#we expect most residuals to fall within 2sd of the mean. More than 2 are outliers, and more than 3 extreme outliers.
#if more than 1% of our observations have standardized residuals with an absolute value greater than 2.5, or more than 5%
#have one greater than or equal to 2, there is evidence that the fit of the model is poor. largest cause of this is ommission
#of important explanatory variables in our model.
Out[111]:
In [113]:
#additional regression diagnostic plots
#fig1 = plt.figure(figsize(12,8))
fig1 = sm.graphics.plot_regress_exog(model23, 'age_unprocessed_c')
In [114]:
#leverage plot
fig3 = sm.graphics.influence_plot(model23, size=8)
print(fig3)
#we see that we have extreme outliers, but they are low leverage, meaning they do not have an undue influence on our
#estimation of the regression model.
#we also have high leverage observations, but they are not outliers.
#we have no observations that are both high leverage and outliers.
In [ ]:
#now we will apply logistic regression models to the binary response variables anycrime and anyarrest
#the data management has already been performed
In [158]:
rdc_logistic['anycrime'] = pd.to_numeric(rdc_logistic['anycrime'], errors='coerce')
rdc_logistic['anyarrest'] = pd.to_numeric(rdc_logistic['anyarrest'], errors='coerce')
In [159]:
rdc_logistic['cond'] = rdc_logistic['cond'].astype('category')
lreg1 = smf.logit(formula='anyarrest ~ C(cond)', data=rdc_logistic).fit()
print(lreg1.summary())
#equation would be anyarrest = -1.0259-0.1268*cond
In [164]:
#however, for logistic regression it makes much more sense to calculate the odds ratio
#if OR=1, the model is not statistically significant
#if OR<1, the response variable becomes less likely as the explanatory one increases
#if OR>1, the response variable becomes more likely as the explanatory one increases
print('Odds Ratios')
print(np.exp(lreg1.params))
#Study subjects in the treatment group (cond=1) are 0.88 times as likely to have had an arrest since release on parole
#as study subjects in the control group
In [163]:
# odd ratios with 95% confidence intervals
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print (np.exp(conf))
#we have 95% confidence that the population odds ratio will be between 0.57 and 1.35.
In [ ]:
#Machine learning encompasses a wide range of statistical methods. These can be used for:
#1. Describe associations
#2. Search for patterns
#3. Make predictions
#We typically do not use ML with hypotheses in mind. instead we learn from the data
#We learn from the test set.
#Accuracy = test error rate. The rate at which it correctly classifies or estimates.
#Goal is to minimize test error rate.
#Linear regression: accuracy = mean squared error
#Variance = change in parameter estimates across different data sets
#Bias = how far off model estimated values are from true values
#ideally we want low variance and low bias, but they are negatively associated. As one decreases, the other increases.
#Generally, complexity of model leads to high variance and low bias
#Simple models will have lower variance, but also be more biased.
#Logistic regression: accuracy = how well the model classifies observations
In [ ]:
#Supervised Prediction includes:
#Linear regression
#Pattern recognition
#Discriminant analysis
#Multivariate function estimation
#Supervised ML techniques
#Decision trees
#Like linear regression, decision trees are designed for supervised prediction problems.
#Root node, and terminal nodes or leaves.
#Growing the tree process: binary splits maximize correct classification; all cut points are tested; subgroups showing
#similar outcomes are generated
#Validating the tree: cross validation guards against overfit. A random subset is tested and only 'branches' that improve
#the classification are retained
#Selected sub tree is the lowest probability of misclassification
#Trees allow the handling of many variables that cannot be done as efficiently in linear regression. They can also uncover
#constellations of variables that can predict high or low rates of the response variable.
In [ ]:
#Strengths of decision trees
#Can select from a large number of variables those and their interactions that are most important in determining the
#target or response variable to be explained
#They are easy to interpret and visualize, especially when the tree is small
#Can handle large data sets well and can predict both binary, categorical target variables and also quantitative ones
#Limitations: small changes in the data can lead to different splits and this can undermine the interpretability of the model
#and decision trees are not very reproducible on future data
In [201]:
reduced_dataset_clean = reduced_dataset.dropna()
len(reduced_dataset_clean)
Out[201]:
In [205]:
predictors = reduced_dataset_clean.ix[:, reduced_dataset_clean.columns != 'allcrimes']
In [206]:
predictors
Out[206]:
In [207]:
targets = reduced_dataset_clean['allcrimes']
In [208]:
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size= 0.4)
In [209]:
print(pred_train.shape, pred_test.shape, tar_train.shape, tar_test.shape)
In [210]:
classifier = DecisionTreeClassifier()
classifier = classifier.fit(pred_train, tar_train)
In [213]:
predictions = classifier.predict(pred_test)
In [215]:
sklearn.metrics.confusion_matrix(tar_test, predictions)
Out[215]:
In [216]:
sklearn.metrics.accuracy_score(tar_test, predictions)
Out[216]:
In [ ]:
#Random forests
#Forests of trees
#Splits on only ONE variable in each node. Variable with largest association with Target among
#candidate variables. Only among variables randomly selected to be tested for that node.
#First a subset of explanatory variables is selected at random
#Next the node is split with the Best variable of the subset. After this node is split, a new list of subset variables
#is selected at random to split on the next node.
#typical k fold values: 5 or 10
In [219]:
classifier2 = RandomForestClassifier(n_estimators=25)
classifier2 = classifier.fit(pred_train, tar_train)
In [221]:
predictions2 = classifier2.predict(pred_test)
In [222]:
sklearn.metrics.confusion_matrix(tar_test, predictions)
Out[222]:
In [223]:
sklearn.metrics.accuracy_score(tar_test, predictions)
Out[223]:
In [224]:
#fit an extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
#display the relative importance of each attribute
print(model.feature_importances_)
In [227]:
trees = range(25)
accuracy = np.zeros(25)
In [228]:
for idx in range(len(trees)):
classifier = RandomForestClassifier(n_estimators = idx + 1)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)
accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)
In [229]:
plt.cla()
plt.plot(trees, accuracy)
Out[229]:
In [230]:
#Lasso regression
#penalized regression method
#supervised learning method
#shrinkage and selection method
#shrinkage = constraints on parameters that shrinks coefficients to 0
#selection = identifies most imp. variables associated with response variable
#Can increase prediction accuracy and improve model interpretability vs standard OLS
#When lambda=0, it becomes OLS regression
#Bias increases and variance decreases as lambda increases
#in Lasso regression, penalty is not fair if variables are not on the same scale
#standardize all predictor variables to have means equal to 0 and sd = 1
#Lasso regression has several algorithms, among them LAR (least angle regression-)
#sklearn library refers to the penalty term as 'alpha'
In [ ]:
#Limitations of lasso regression
#1. Selection of variables is 100% statistically driven
#2. If predictors are correlated, lasso arbitrarily selects one
#3. Estimating p values is not straightforward
#4. Different selection methods or statistical softwares can provide different results
#5. No guarantee that selected model is not overfitted nor that it's the best model
#All regression models can produce meaningless models without human intervention
#Best approach is a combination of ML, human intervention, and independent application
In [233]:
predictors.columns
Out[233]:
In [240]:
predictors['cond'] = preprocessing.scale(predictors['cond'].astype('float64'))
predictors['sex'] = preprocessing.scale(predictors['sex'].astype('float64'))
predictors['age'] = preprocessing.scale(predictors['age'].astype('float64'))
predictors['living_situation'] = preprocessing.scale(predictors['living_situation'].astype('float64'))
predictors['support'] = preprocessing.scale(predictors['support'].astype('float64'))
predictors['allarrests'] = preprocessing.scale(predictors['allarrests'].astype('float64'))
predictors['anyarrest'] = preprocessing.scale(predictors['anyarrest'].astype('float64'))
predictors['alldrugs'] = preprocessing.scale(predictors['alldrugs'].astype('float64'))
predictors['anydrugs'] = preprocessing.scale(predictors['anydrugs'].astype('float64'))
predictors['anycrime'] = preprocessing.scale(predictors['anycrime'].astype('float64'))
predictors['arrest_9mo'] = preprocessing.scale(predictors['arrest_9mo'].astype('float64'))
predictors['reincarc_9mo'] = preprocessing.scale(predictors['reincarc_9mo'].astype('float64'))
predictors['num_arrest'] = preprocessing.scale(predictors['num_arrest'].astype('float64'))
predictors['num_reincarc'] = preprocessing.scale(predictors['num_reincarc'].astype('float64'))
predictors['violent_charge'] = preprocessing.scale(predictors['violent_charge'].astype('float64'))
predictors['property_charge'] = preprocessing.scale(predictors['property_charge'].astype('float64'))
In [243]:
#split data into train and test tests
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets,
test_size=.3, random_state=123)
In [246]:
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)
In [247]:
# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))
Out[247]:
In [248]:
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
Out[248]:
In [249]:
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
Out[249]:
In [250]:
# MSE from training and test data
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)
In [251]:
# R-square from training and test data
rsquared_train=model.score(pred_train,tar_train)
rsquared_test=model.score(pred_test,tar_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
In [ ]:
#Cluster analysis
#Unsupervised learning method = no response variable included in the analysis
#Goal: to have less variance within clusters, and more between clusters
#Can also be used as a method of data reduction, to reduce number of variables
#to the number of categorical variables equal to the clusters produced
#Canonical discriminant analysis:
#creates a smaller number of variables
#linear combinations of clustering variables
#canonical variables are ordered by proportion of variance accounted for
#majority of variance is accounted for by first few canonical variables
In [255]:
clustervar = predictors.copy()
In [257]:
# split data into train and test sets
clus_train, clus_test = train_test_split(clustervar, test_size=.3, random_state=123)
In [258]:
# k-means cluster analysis for 1-9 clusters
from scipy.spatial.distance import cdist
clusters=range(1,10)
meandist=[]
for k in clusters:
model=KMeans(n_clusters=k)
model.fit(clus_train)
clusassign=model.predict(clus_train)
meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
/ clus_train.shape[0])
In [259]:
"""
Plot average distance from observations from the cluster centroid
to use the Elbow Method to identify number of clusters to choose
"""
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
Out[259]:
In [260]:
# Interpret 3 cluster solution
model3=KMeans(n_clusters=3)
model3.fit(clus_train)
clusassign=model3.predict(clus_train)
In [261]:
# plot clusters
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=model3.labels_,)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 3 Clusters')
plt.show()
In [288]:
"""
BEGIN multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# create a unique identifier variable from the index for the
# cluster training data to merge with the cluster assignment variable
clus_train.reset_index(level=0, drop=True)
# create a list that has the new index variable
cluslist=list(clus_train['index'])
# create a list of cluster assignments
labels=list(model3.labels_)
# combine index variable list with cluster assignment list into a dictionary
newlist=dict(zip(cluslist, labels))
newlist
# convert newlist dictionary to a dataframe
newclus=pd.DataFrame.from_dict(newlist, orient= 'index')
newclus
# rename the cluster assignment column
newclus.columns = ['cluster']
# now do the same for the cluster assignment variable
# create a unique identifier variable from the index for the
# cluster assignment dataframe
# to merge with cluster training data
newclus.reset_index(level=0, drop=True)
# merge the cluster assignment dataframe with the cluster training variable dataframe
# by the index variable
merged_train=pd.merge(clus_train, newclus, left_index=True, right_index=True)
merged_train.head(n=100)
# cluster frequencies
merged_train.cluster.value_counts()
Out[288]:
In [289]:
"""
END multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# FINALLY calculate clustering variable means by cluster
clustergrp = merged_train.groupby('cluster').mean()
print ("Clustering variable means by cluster")
print(clustergrp)
In [307]:
# validate clusters in training data by examining cluster differences in GPA using ANOVA
# first have to merge GPA with clustering variables and cluster assignment data
allarrests_data['targets'] = pd.DataFrame(targets)
# split GPA data into train and test sets
arrests_train, arrests_test = train_test_split(allarrests_data, test_size=.3, random_state=123)
arrests_train1=pd.DataFrame(arrests_train)
arrests_train1.reset_index(level=0, inplace=True)
merged_train_all=pd.merge(arrests_train1, merged_train, left_index=True, right_index=True)
sub1 = merged_train_all[['targets', 'cluster']].dropna()
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
allarrests_mod = smf.ols(formula='targets ~ C(cluster)', data=sub1).fit()
print (allarrests_mod.summary())
print ('means for all arrests by cluster')
m1= sub1.groupby('cluster').mean()
print (m1)
print ('standard deviations for all arrests by cluster')
m2= sub1.groupby('cluster').std()
print (m2)
mc1 = multi.MultiComparison(sub1['targets'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
In [ ]:
In [ ]: