Labs in R

All of the labs, R version. (There is no Lab 1.)


Lab 2

Help


In [ ]:
# help()
# help(function)
# help(package='package-name')

Packages


In [ ]:
# install
# install.packages('package-name')

# already installed with conda
#install.packages("foreign")

# new installs
#install.packages("Rcmdr", dependencies = TRUE, repos="http://cran.rstudio.com/") # in conda?
#install.packages("nortest", repos="http://cran.rstudio.com/")
#install.packages("sas7bdat", repos="http://cran.rstudio.com/")
#install.packages("Hmisc", repos="http://cran.rstudio.com/")
#install.packages("pastecs", repos="http://cran.rstudio.com/")
# import
# library('package-name')

library(foreign)
library(nortest)
library(sas7bdat)
library(Hmisc)
library(pastecs)

Working space


In [ ]:
ls()
# rm(list=ls())
# setwd()
getwd()

Read data


In [ ]:
# import from Excel: via a tab-separated text file
#fichierTexte <- read.table("data/labo2/SR_Data.txt", header = TRUE)

# import DBF (DBase)
fichierDBF <- read.dbf("data/labo2/SR_Data.dbf")

# import SPSS
#fichierSPSS <- read.spss("data/labo2/Data_SPSS.sav", to.data.frame=TRUE)

# import SAS
#fichierSAS <- read.sas7bdat("data/labo2/tableau1.sas7bdat", debug=FALSE)

head(fichierDBF)

Table structure


In [ ]:
# show variable names
names(fichierDBF)
# indexes start at 1

# delete variable
fichierDBF$Shape_Leng <- NULL

# rename variable
names(fichierDBF)[1] <- "POPTOT"

# create variable
fichierDBF$km <- fichierDBF$Shape_Area / 1000000
fichierDBF$HabKm2 <- fichierDBF$POPTOT / fichierDBF$km

head(fichierDBF)

In [ ]:
# new table from a subset
names(fichierDBF)
ZScores <- fichierDBF[, c(12:15)]
names(ZScores)

Normality


In [ ]:
#install.packages("moments", repos="http://cran.rstudio.com/")
library(moments)

Skewness


In [ ]:
skewness(fichierDBF)

Kurtosis


In [ ]:
kurtosis(fichierDBF)

Kolmogorov-Smirnov


In [ ]:
#lillie.test(Tableau1$HabKm2)
# sapply(fichierDBF[18:20],lillie.test)
sapply(fichierDBF[18],lillie.test)

In [ ]:
#ks.test(x, y) # two sample

#m <- mean(fichierDBF[[18]])
#s <- sd(fichierDBF[[18]])

#ks.test(fichierDBF[[18]], "pnorm", m, s)

Shapiro-Wilk


In [ ]:
sapply(fichierDBF[18],shapiro.test)  # sapply(fichierDBF[18:20],shapiro.test)

Transformations

Square root


In [ ]:
fichierDBF$SqrtDens <- sqrt(fichierDBF$HabKm2)
fichierDBF$SqrtImg <- sqrt(fichierDBF$IMMREC_PCT)

Logarithmic


In [ ]:
# log(0) = -Inf, so add 1 when the variable can contain zeros
fichierDBF$LogDens <- log(fichierDBF$HabKm2)
fichierDBF$LogImg <- log(fichierDBF$IMMREC_PCT+1)

summary(fichierDBF)

Centering and scaling (z-scores)


In [ ]:
ZScores$INDICE_PAU <- scale(fichierDBF[1], center = TRUE, scale = TRUE)
ZScores$Dist_Min <- scale(fichierDBF[2], center = TRUE, scale = TRUE)
ZScores$N_1000 <- scale(fichierDBF[3], center = TRUE, scale = TRUE)
ZScores$Dist_Moy_3 <- scale(fichierDBF[4], center = TRUE, scale = TRUE)

#help(sapply)
sapply(ZScores,mean)
sapply(ZScores,sd)

Descriptive statistics


In [ ]:
summary(fichierDBF)

In [ ]:
sapply(fichierDBF, mean)
sapply(fichierDBF, sd)
sapply(fichierDBF, min)
sapply(fichierDBF, max)
sapply(fichierDBF, median)
sapply(fichierDBF, range)
sapply(fichierDBF, quantile)

In [ ]:
# Hmisc::describe
describe(fichierDBF)

In [ ]:
# pastecs::stat.desc
stat.desc(fichierDBF, basic=TRUE, norm=TRUE)

Histograms


In [ ]:
hist(fichierDBF$HabKm2, main="Histogramme", xlab="Habitants au km2", ylab="Effectif", breaks=10, col='lightblue')

In [ ]:
hist(fichierDBF$SqrtDens, main="Histogramme", xlab="Habitants au km2 (racine)", ylab="Effectif", breaks=10, col='gold')

In [ ]:
hist(fichierDBF$LogDens, main="Histogramme", xlab="Habitants au km2 (log)", ylab="Effectif", breaks=10, col='coral')

Histogram with normal curve


In [ ]:
x <- fichierDBF$HabKm2
h<-hist(x, breaks=10, col="lightblue", xlab="Habitants au km2", ylab="Effectif", 
main="Histogramme avec courbe normale")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)

In [ ]:
x <- fichierDBF$SqrtDens
h<-hist(x, breaks=10, col="red", xlab="Habitants au km2 (racine)", ylab = "Effectif",
main="Histogramme avec courbe normale")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)

Lab 3


In [ ]:
# install
#install.packages('doBy', repos="http://cran.rstudio.com/")
#install.packages('gmodels', repos="http://cran.rstudio.com/")
#install.packages('scatterplot3d', repos="http://cran.rstudio.com/")

# import
library(foreign)
library(nortest)
library(sas7bdat)
library(Hmisc)
library(pastecs)
library(ggplot2)
library(doBy)
library(gmodels)
library(scatterplot3d)

# data
Tableau1 <- read.sas7bdat("data/labo3/tableau1.sas7bdat", debug=FALSE)
names(Tableau1)

TableauKhi2 <- read.sas7bdat("data/labo3/khi2.sas7bdat", debug=FALSE)
names(TableauKhi2)

Basic histograms


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=10, xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

breaks = number of bars (bins)


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=20, xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

density = shading density of the bars (e.g., hatching)


In [ ]:
hist(Tableau1$IMMREC_PCT, density=20, breaks=20, xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

col = colours


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=20, col="red", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme") 
hist(Tableau1$IMMREC_PCT, breaks=20, col="lightyellow", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme") 
hist(Tableau1$IMMREC_PCT, breaks=20, col="lightsalmon", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme") 
hist(Tableau1$IMMREC_PCT, breaks=20, col="lightgreen", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

ylim = y-axis limits


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=20, ylim=c(0, 80), col="lightgreen", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

prob = density (proportion) instead of counts


In [ ]:
hist(Tableau1$IMMREC_PCT, col="lightgray", breaks=20, xlab="Immigrants récents (%)", ylab = "Proportion", main="Histogramme", prob=TRUE)

Histograms with normal curve

y = density (proportion)


In [ ]:
m <- mean(Tableau1$IMMREC_PCT)
std <- sd(Tableau1$IMMREC_PCT)
hist(Tableau1$IMMREC_PCT, col="lightyellow", breaks=20, prob=TRUE, xlab="Immigrants récents (%)", ylab = "Proportion", main="Histogramme avec la courbe normale")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)

y = counts


In [ ]:
x <- Tableau1$IMMREC_PCT
h<-hist(x, breaks=20, col="lightyellow", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme avec la courbe normale") 
xfit<-seq(min(x),max(x),length=40) 
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x)) 
yfit <- yfit*diff(h$mids[1:2])*length(x) 
lines(xfit, yfit, col="darkblue", lwd=2)

Scatter plots


In [ ]:
plot(Tableau1$IMMREC_PCT, Tableau1$FAIBREVPCT, xlab="Immigrants récents (%)", ylab = "Faible revenu (%)", main="Nuage de points")

Scatter plot with regression line


In [ ]:
plot(Tableau1$IMMREC_PCT, Tableau1$FAIBREVPCT, xlab="Immigrants récents (%)", ylab = "Faible revenu (%)", main="Nuage de points avec droite de régression")
abline(lsfit(Tableau1$IMMREC_PCT, Tableau1$FAIBREVPCT))

Scatter plot matrix


In [ ]:
pairs(~MONOPCT+MENAGE1PCT+TX_CHOM+FAIBREVPCT,data=Tableau1, 
      main="Matrice de nuages de points")

3D scatter plots


In [ ]:
scatterplot3d(Tableau1$MONOPCT, Tableau1$TX_CHOM, Tableau1$FAIBREVPCT, main="Nuage de points 3D")
scatterplot3d(Tableau1$MONOPCT, Tableau1$TX_CHOM, Tableau1$FAIBREVPCT, main="Nuage de points 3D", xlab="Familles monoparentales (%)", ylab="Taux de chômage", zlab="Faible revenu (%)");

Correlation matrix

Pearson


In [ ]:
rcorr(cbind(Tableau1$MONOPCT,Tableau1$MENAGE1PCT,Tableau1$TX_CHOM,Tableau1$FAIBREVPCT,Tableau1$Dist_Min,Tableau1$N_1000), type="pearson")

Spearman


In [ ]:
rcorr(cbind(Tableau1$MONOPCT,Tableau1$MENAGE1PCT,Tableau1$TX_CHOM,Tableau1$FAIBREVPCT,Tableau1$Dist_Min,Tableau1$N_1000), type="spearman")

Simple linear regression


In [ ]:
reg <- lm(TX_CHOM ~ FAIBREVPCT, data = Tableau1)
summary(reg)

names(Tableau1)

Contingency table


In [ ]:
names(TableauKhi2)

Levels of the nominal variables


In [ ]:
# sex
table(TableauKhi2$SEX)
TableauKhi2$SEX <- factor(TableauKhi2$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
table(TableauKhi2$SEX)

# transport mode
table(TableauKhi2$Mode)
TableauKhi2$Mode <- factor(TableauKhi2$Mode, levels = c(0:4), labels = c("Auto (conducteur)", "Auto (passager)", "Transport en commun", "Transport actif", "Autres"))
table(TableauKhi2$Mode)

# distance
table(TableauKhi2$DIST)
TableauKhi2$DIST <- factor(TableauKhi2$DIST, levels = c(1:7), labels = c("Moins de 5 km", "5 à 9,9 km","10 à 14,9 km", "15 à 19,9 km", "20 à 24,9 km", "25 à 29,9 km", "30 km et plus"))
table(TableauKhi2$DIST)

Contingency tables


In [ ]:
CrossTable(TableauKhi2$SEX, TableauKhi2$Mode, chisq=TRUE, expected=TRUE, resid=TRUE, format="SPSS")
CrossTable(TableauKhi2$SEX, TableauKhi2$DIST, chisq=TRUE, expected=TRUE, resid=TRUE, format="SPSS")
CrossTable(TableauKhi2$Mode, TableauKhi2$DIST, chisq=TRUE, expected=TRUE, resid=TRUE, format="SPSS")

Lab 4


In [ ]:
# import
library(foreign)
library(nortest)
library(sas7bdat)
library(doBy)

# data
MTL <- read.sas7bdat("data/labo4/mtl_ttest.sas7bdat", debug=FALSE)
TOR <- read.sas7bdat("data/labo4/tor_ttest.sas7bdat", debug=FALSE)
VAN <- read.sas7bdat("data/labo4/van_ttest.sas7bdat", debug=FALSE)
TROISRMR <- read.sas7bdat("data/labo4/troisrmr_anova.sas7bdat", debug=FALSE)
names(MTL)
names(TOR)
names(VAN)
names(TROISRMR)

In [ ]:
# levels (labels)
table(MTL$SEX)
table(TOR$SEX)
table(VAN$SEX)
MTL$SEX <- factor(MTL$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
TOR$SEX <- factor(TOR$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
VAN$SEX <- factor(VAN$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
table(MTL$SEX)
table(TOR$SEX)
table(VAN$SEX)

TROISRMR$CMA <- factor(TROISRMR$CMA, levels = c(462,535,933), labels = c("Montréal", "Toronto", "Vancouver"))
table(TROISRMR$CMA)

T-test: comparison of means

F test

Checking the equality of the variances


In [ ]:
var.test(TOTINC ~ SEX, alternative='two.sided', conf.level=.95, data=MTL)

Interpretation

  • p-value < 2.2e-16
    • p < 0.05, so use the Satterthwaite method (see the sketch below)
  • true ratio of variances is not equal to 1
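
A minimal sketch (not from the original lab, assuming MTL is loaded and labelled as above) of this decision rule: read the p-value off the var.test result and pass the matching var.equal to t.test.


In [ ]:
# Sketch: choose the t-test variant from the F-test p-value (assumes MTL as above)
ftest <- var.test(TOTINC ~ SEX, data = MTL)
equal_var <- ftest$p.value >= 0.05   # TRUE -> pooled method, FALSE -> Satterthwaite (Welch)
t.test(TOTINC ~ SEX, alternative = 'two.sided', conf.level = .95, var.equal = equal_var, data = MTL)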

Satterthwaite method

Variances not equal: p < 0.05

  • var.equal=FALSE

In [ ]:
t.test(TOTINC~SEX, alternative='two.sided', conf.level=.95, var.equal=FALSE, data=MTL)

Interpretation

  • t = -27.088
  • p-value < 2.2e-16

Pooled method

Variances equal: p >= 0.05

  • var.equal=TRUE

In [ ]:
t.test(TOTINC~SEX, alternative='two.sided', conf.level=.95, var.equal=TRUE, data=MTL)
boxplot(TOTINC~SEX, data = MTL, col = "coral", main="Boites à moustache (RMR de Montréal)", xlab="Sexe", ylab="Revenu total")
boxplot(LogTotInc~SEX, data = MTL, col = "coral", main="Boites à moustache (RMR de Montréal)", xlab="Sexe", ylab="Revenu total (log)")

Interpretation

  • t = -27.783
  • p-value < 2.2e-16

Analysis of the results

Give the dataset context and the values, then compare the means across the two levels of the qualitative variable, e.g. "the difference between the means (x) is, moreover, significant (t = 27.09; p < 0.001)". The sketch below pulls these numbers directly out of the test object.
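
A hedged sketch (assuming MTL as above): statistic, p.value and estimate are standard components of the htest object returned by t.test, so the reported values need not be copied by hand.


In [ ]:
# Sketch: extract the values quoted in the write-up from the t.test result
tt <- t.test(TOTINC ~ SEX, alternative = 'two.sided', conf.level = .95, var.equal = FALSE, data = MTL)
tt$estimate                                  # the two group means
diff(tt$estimate)                            # difference between the means
c(t = unname(tt$statistic), p = tt$p.value)  # test statistic and p-value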

ANOVA: analysis of variance

Mean by group


In [ ]:
# doBy
summaryBy(GROSRT ~ CMA, TROISRMR, FUN=c(mean), na.rm=TRUE)

Boxplot

Visualizing the ANOVA


In [ ]:
boxplot(GROSRT ~ CMA, data = TROISRMR, col = "lightyellow", main="Boites à moustache", xlab="Région métropolitaine", ylab="Loyer ($)")  # analysis of variance: F test

ANOVA


In [ ]:
anova.aov <- aov(GROSRT ~ CMA, data = TROISRMR)
summary(anova.aov)

Interpretation

  • CMA Sum Sq = explained variance (between groups)
  • Residuals Sum Sq = unexplained variance (within groups)
  • CMA Df = degrees of freedom of the explained (between-group) variance
  • Residuals Df = degrees of freedom of the unexplained (within-group) variance
  • CMA F value = observed F
  • CMA Pr(>F) = p-value associated with the F value
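
A minimal sketch (assuming anova.aov from the cell above) of where these quantities sit in the summary object; the last line is simply the explained share of the total sum of squares and should agree with the R squared computed further down.


In [ ]:
# Sketch: pull the ANOVA table apart (assumes anova.aov defined above)
tab <- summary(anova.aov)[[1]]        # the ANOVA table as a data frame
tab$"Sum Sq"                          # explained (inter) then residual (intra) sums of squares
tab$Df                                # degrees of freedom
tab$"F value"[1]                      # observed F
tab$"Sum Sq"[1] / sum(tab$"Sum Sq")   # share of the variance explained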

F test

Null hypothesis H0: the group means are equal (the between-group variance is no larger than the within-group variance).

  • k = number of groups
  • n = number of observations

  • numerator df (explained variance, between groups) in the Fisher table
    • k - 1
  • denominator df (unexplained variance, within groups) in the Fisher table
    • n - k

Computing the theoretical F

  • theoretical F
  • p associated with the theoretical F; significance thresholds:
    • 95%: p = 0.05
    • 99%: p = 0.01
    • 99.9%: p = 0.001

In [ ]:
f_theorique <- qf(0.99, 2, 8379)
f_theorique
# qt() for the Student t table, for the coefficient of ...
# (see other courses)

Interpretation

  • observed F > theoretical F
    • the means are statistically different
    • H0 rejected
  • observed F < theoretical F
    • the group means are not significantly different
    • H0 not rejected (see the comparison sketch below)
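
A sketch of this comparison (assuming TROISRMR and anova.aov as above): compute k and n from the data instead of hard-coding the degrees of freedom, then compare the observed F with the theoretical F.


In [ ]:
# Sketch: degrees of freedom from the data, then the decision rule
k <- nlevels(TROISRMR$CMA)                        # number of groups
n <- sum(!is.na(TROISRMR$GROSRT))                 # number of observations
f_theorique <- qf(0.99, df1 = k - 1, df2 = n - k) # theoretical F at the 99% level
f_observe <- summary(anova.aov)[[1]]$"F value"[1] # observed F
f_observe > f_theorique                           # TRUE -> H0 rejected, the means differ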

Computing R squared

To obtain the coefficient of determination


In [ ]:
anova.r2 <- lm(GROSRT ~ CMA, data = TROISRMR)
summary(anova.r2)

Interpretation

  • Multiple R-squared = coefficient of determination
    • the qualitative variable explains x% of the variation of the quantitative variable
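
For completeness, a small sketch (assuming anova.r2 from the cell above) that extracts the coefficient of determination from the summary object instead of reading it off the printout.


In [ ]:
# Sketch: R-squared as a number (assumes anova.r2 defined above)
summary(anova.r2)$r.squared
summary(anova.r2)$adj.r.squared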

Tukey test

Pairwise comparison of the group means


In [ ]:
TukeyHSD(anova.aov)

Lab 5


In [ ]:
# install
#install.packages("MASS", repos="http://cran.rstudio.com/")      ## Tests de normalité supp.
#install.packages("car", repos="http://cran.rstudio.com/")      ## Companion to Applied Regression

In [4]:
# import
library(foreign)
library(MASS)
library(sas7bdat)
library(pastecs)
library(car)

# data
MTL <- read.sas7bdat("data/labo5/pauvretemtl.sas7bdat", debug=FALSE)
names(MTL)


Out[4]:
  1. 'srnom'
  2. 'FAIBREVPCT'
  3. 'SqrtChom'
  4. 'MONOPCT'
  5. 'menage1per'
  6. 'SqrtImmig'
  7. 'pasecol1524'
  8. 'tpspartiel'

In [5]:
# univariate statistics
summary(MTL)


Out[5]:
     srnom       FAIBREVPCT        SqrtChom        MONOPCT     
 0001.00:  1   Min.   : 1.232   Min.   :0.000   Min.   : 0.00  
 0002.00:  1   1st Qu.:19.761   1st Qu.:2.568   1st Qu.:16.05  
 0003.00:  1   Median :28.699   Median :2.925   Median :21.23  
 0004.00:  1   Mean   :29.982   Mean   :2.995   Mean   :21.38  
 0005.00:  1   3rd Qu.:39.803   3rd Qu.:3.416   3rd Qu.:26.18  
 0006.00:  1   Max.   :82.642   Max.   :6.887   Max.   :51.28  
 (Other):500                                                   
   menage1per       SqrtImmig      pasecol1524      tpspartiel   
 Min.   : 3.943   Min.   :0.000   Min.   : 0.00   Min.   :30.65  
 1st Qu.:28.587   1st Qu.:1.454   1st Qu.:24.51   1st Qu.:41.26  
 Median :38.598   Median :1.961   Median :32.66   Median :45.47  
 Mean   :37.674   Mean   :2.074   Mean   :32.66   Mean   :45.61  
 3rd Qu.:46.752   3rd Qu.:2.543   3rd Qu.:40.93   3rd Qu.:49.65  
 Max.   :72.632   Max.   :5.078   Max.   :68.75   Max.   :69.79  
                                                                 

In [11]:
# multiple linear regression
ols <- lm(
    FAIBREVPCT ~ 
    SqrtChom + 
    MONOPCT + 
    menage1per + 
    SqrtImmig + 
    pasecol1524 + 
    tpspartiel, 
    data=MTL)
summary(ols)


Out[11]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = MTL)

Residuals:
    Min      1Q  Median      3Q     Max 
-20.405  -4.290  -0.586   3.498  34.343 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -36.10961    2.20183 -16.400  < 2e-16 ***
SqrtChom      7.97771    0.57049  13.984  < 2e-16 ***
MONOPCT       0.60825    0.04703  12.933  < 2e-16 ***
menage1per    0.20132    0.02517   7.997 8.94e-15 ***
SqrtImmig     2.58049    0.37394   6.901 1.57e-11 ***
pasecol1524   0.10986    0.02818   3.899  0.00011 ***
tpspartiel    0.27767    0.05336   5.204 2.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.326 on 499 degrees of freedom
Multiple R-squared:  0.8059,	Adjusted R-squared:  0.8035 
F-statistic: 345.2 on 6 and 499 DF,  p-value: < 2.2e-16

In [10]:
## standardized coefficients
CoefStand <- lm(
    scale(FAIBREVPCT) ~ 
    scale(SqrtChom) + 
    scale(MONOPCT) + 
    scale(menage1per) + 
    scale(SqrtImmig) + 
    scale(pasecol1524) + 
    scale(tpspartiel), 
    data = MTL
)
summary(CoefStand)


Out[10]:
Call:
lm(formula = scale(FAIBREVPCT) ~ scale(SqrtChom) + scale(MONOPCT) + 
    scale(menage1per) + scale(SqrtImmig) + scale(pasecol1524) + 
    scale(tpspartiel), data = MTL)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.42971 -0.30057 -0.04107  0.24509  2.40635 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -1.646e-16  1.971e-02   0.000  1.00000    
scale(SqrtChom)     3.899e-01  2.788e-02  13.984  < 2e-16 ***
scale(MONOPCT)      3.343e-01  2.585e-02  12.933  < 2e-16 ***
scale(menage1per)   1.825e-01  2.282e-02   7.997 8.94e-15 ***
scale(SqrtImmig)    1.714e-01  2.484e-02   6.901 1.57e-11 ***
scale(pasecol1524)  9.386e-02  2.407e-02   3.899  0.00011 ***
scale(tpspartiel)   1.268e-01  2.436e-02   5.204 2.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4433 on 499 degrees of freedom
Multiple R-squared:  0.8059,	Adjusted R-squared:  0.8035 
F-statistic: 345.2 on 6 and 499 DF,  p-value: < 2.2e-16

In [12]:
## multicollinearity?
### VIF values
vif(ols)
vif(ols) > 5 # multicollinearity problem (VIF > 5)?


Out[12]:
SqrtChom      1.99767242206301
MONOPCT       1.7169166500618
menage1per    1.33888028140516
SqrtImmig     1.58542766716237
pasecol1524   1.48920558001708
tpspartiel    1.52538045714979
Out[12]:
SqrtChom      FALSE
MONOPCT       FALSE
menage1per    FALSE
SqrtImmig     FALSE
pasecol1524   FALSE
tpspartiel    FALSE

In [13]:
## Diagnostic plots and Cook's distance
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0))
plot(ols, las = 1)
par(opar)



In [14]:
## Histogram of the residuals and check of their normality
m <- mean(residuals(ols))
std <- sd(residuals(ols))
hist(residuals(ols), col="lightyellow", breaks=20, prob=TRUE, xlab="Résidus OLS", ylab = "Proportion", main="Histogramme avec la courbe normale")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)
stat.desc(residuals(ols), basic=TRUE, norm=TRUE)


Out[14]:
nbr.val        506
nbr.null       0
nbr.na         0
min            -20.4047979910667
max            34.3433376307782
range          54.748135621845
sum            -7.40241201668823e-14
median         -0.586204938060121
mean           -1.46830543304696e-16
SE.mean        0.27955974003622
CI.mean.0.95   0.549243372142321
var            39.5457460140541
std.dev        6.28854084935878
coef.var       -42828560787309032
skewness       0.676033517801706
skew.2SE       3.11329159768941
kurtosis       2.13343255710525
kurt.2SE       4.92204145621073
normtest.W     0.972936774972511
normtest.p     4.69730411019801e-08

In [16]:
# Flag outliers: Cook's distance > 4/n or 8/n
nobs <- NROW(na.omit(residuals(ols)))            # number of observations in the dataset
cook <- cooks.distance(ols)                      # Cook's distance
ypredit <- fitted.values(ols)                    # Y values predicted by the model
res <- residuals(ols)                            # residuals (Y - predicted Y)
res_std <- rstandard(ols)                        # standardized residuals

a <- cbind(MTL, cook, ypredit, res, res_std)
a <- a[order(-cook), ]
a[a$cook > 4/nobs, ]    # observations with Cook's distance > 4/n
a[a$cook > 8/nobs, ]    # observations with Cook's distance > 8/n


Out[16]:
[Flattened table output: the observations flagged by the Cook's distance > 4/n filter, sorted by decreasing cook, with the columns srnom, FAIBREVPCT, SqrtChom, MONOPCT, menage1per, SqrtImmig, pasecol1524, tpspartiel, cook, ypredit, res, res_std.]
Out[16]:
[Flattened table output: the observations flagged by the Cook's distance > 8/n filter, same columns.]

In [17]:
# Data table without the outliers (Cook > 8/n)
dataSansOutliers <- a[a$cook  < 8/nobs, ]
dataSansOutliers$cook <- NULL
dataSansOutliers$ypredit <- NULL
dataSansOutliers$res <- NULL
dataSansOutliers$res_std <- NULL
head(dataSansOutliers)


Out[17]:
[Flattened table output: the first rows of dataSansOutliers, with the columns srnom, FAIBREVPCT, SqrtChom, MONOPCT, menage1per, SqrtImmig, pasecol1524, tpspartiel.]

In [18]:
# New regression model without the outliers
ols2 <- lm(FAIBREVPCT ~ SqrtChom+MONOPCT+menage1per+SqrtImmig+pasecol1524+tpspartiel, data = dataSansOutliers)
summary(ols2)
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0))
plot(ols2, las = 1)
par(opar)


Out[18]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = dataSansOutliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.4311  -3.7694  -0.2836   3.3556  14.3676 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -38.07012    1.98689 -19.161  < 2e-16 ***
SqrtChom      8.62818    0.56045  15.395  < 2e-16 ***
MONOPCT       0.62153    0.04414  14.082  < 2e-16 ***
menage1per    0.13012    0.02321   5.605 3.52e-08 ***
SqrtImmig     2.37947    0.34582   6.881 1.87e-11 ***
pasecol1524   0.13829    0.02593   5.333 1.49e-07 ***
tpspartiel    0.30772    0.04772   6.448 2.77e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.404 on 480 degrees of freedom
Multiple R-squared:  0.842,	Adjusted R-squared:  0.8401 
F-statistic: 426.5 on 6 and 480 DF,  p-value: < 2.2e-16

In [19]:
# Comparison of the two models: coefficients
summary(ols)
summary(ols2)


Out[19]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = MTL)

Residuals:
    Min      1Q  Median      3Q     Max 
-20.405  -4.290  -0.586   3.498  34.343 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -36.10961    2.20183 -16.400  < 2e-16 ***
SqrtChom      7.97771    0.57049  13.984  < 2e-16 ***
MONOPCT       0.60825    0.04703  12.933  < 2e-16 ***
menage1per    0.20132    0.02517   7.997 8.94e-15 ***
SqrtImmig     2.58049    0.37394   6.901 1.57e-11 ***
pasecol1524   0.10986    0.02818   3.899  0.00011 ***
tpspartiel    0.27767    0.05336   5.204 2.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.326 on 499 degrees of freedom
Multiple R-squared:  0.8059,	Adjusted R-squared:  0.8035 
F-statistic: 345.2 on 6 and 499 DF,  p-value: < 2.2e-16
Out[19]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = dataSansOutliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.4311  -3.7694  -0.2836   3.3556  14.3676 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -38.07012    1.98689 -19.161  < 2e-16 ***
SqrtChom      8.62818    0.56045  15.395  < 2e-16 ***
MONOPCT       0.62153    0.04414  14.082  < 2e-16 ***
menage1per    0.13012    0.02321   5.605 3.52e-08 ***
SqrtImmig     2.37947    0.34582   6.881 1.87e-11 ***
pasecol1524   0.13829    0.02593   5.333 1.49e-07 ***
tpspartiel    0.30772    0.04772   6.448 2.77e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.404 on 480 degrees of freedom
Multiple R-squared:  0.842,	Adjusted R-squared:  0.8401 
F-statistic: 426.5 on 6 and 480 DF,  p-value: < 2.2e-16

In [20]:
# Comparison of the two models: VIF
vif(ols)
vif(ols2)


Out[20]:
SqrtChom      1.99767242206301
MONOPCT       1.7169166500618
menage1per    1.33888028140516
SqrtImmig     1.58542766716237
pasecol1524   1.48920558001708
tpspartiel    1.52538045714979
Out[20]:
SqrtChom      2.14092431277651
MONOPCT       1.76657445847508
menage1per    1.4393672286109
SqrtImmig     1.70721014765762
pasecol1524   1.62084718042594
tpspartiel    1.5256792012903

In [21]:
# Comparison of the two histograms
m <- mean(residuals(ols))
std <- sd(residuals(ols))
hist(residuals(ols), col="lightyellow", breaks=20, prob=TRUE, xlab="Résidus OLS", ylab = "Proportion", main="Modèle de départ")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)
stat.desc(residuals(ols), basic=TRUE, norm=TRUE)

m <- mean(residuals(ols2))
std <- sd(residuals(ols2))
hist(residuals(ols2), col="lightyellow", breaks=20, prob=TRUE, xlab="Résidus OLS", ylab = "Proportion", main="Modèle sans les outliers")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)
stat.desc(residuals(ols2), basic=TRUE, norm=TRUE)


Out[21]:
nbr.val        506
nbr.null       0
nbr.na         0
min            -20.4047979910667
max            34.3433376307782
range          54.748135621845
sum            -7.40241201668823e-14
median         -0.586204938060121
mean           -1.46830543304696e-16
SE.mean        0.27955974003622
CI.mean.0.95   0.549243372142321
var            39.5457460140541
std.dev        6.28854084935878
coef.var       -42828560787309032
skewness       0.676033517801706
skew.2SE       3.11329159768941
kurtosis       2.13343255710525
kurt.2SE       4.92204145621073
normtest.W     0.972936774972511
normtest.p     4.69730411019801e-08
Out[21]:
nbr.val        487
nbr.null       0
nbr.na         0
min            -15.4310542378812
max            14.3676352481711
range          29.7986894860523
sum            2.59237076249974e-14
median         -0.283621129995047
mean           5.31495051027719e-17
SE.mean        0.243344138468579
CI.mean.0.95   0.478136476019468
var            28.8383720570564
std.dev        5.37013706129149
coef.var       101038326714568336
skewness       0.165991949575083
skew.2SE       0.750028230078793
kurtosis       0.0971118490001404
kurt.2SE       0.219841661800126
normtest.W     0.994250731001488
normtest.p     0.0634160048608441

In [22]:
# Comparison of the normality of the residuals
stat.desc(residuals(ols), basic=TRUE, norm=TRUE)
stat.desc(residuals(ols2), basic=TRUE, norm=TRUE)


Out[22]:
nbr.val        506
nbr.null       0
nbr.na         0
min            -20.4047979910667
max            34.3433376307782
range          54.748135621845
sum            -7.40241201668823e-14
median         -0.586204938060121
mean           -1.46830543304696e-16
SE.mean        0.27955974003622
CI.mean.0.95   0.549243372142321
var            39.5457460140541
std.dev        6.28854084935878
coef.var       -42828560787309032
skewness       0.676033517801706
skew.2SE       3.11329159768941
kurtosis       2.13343255710525
kurt.2SE       4.92204145621073
normtest.W     0.972936774972511
normtest.p     4.69730411019801e-08
Out[22]:
nbr.val        487
nbr.null       0
nbr.na         0
min            -15.4310542378812
max            14.3676352481711
range          29.7986894860523
sum            2.59237076249974e-14
median         -0.283621129995047
mean           5.31495051027719e-17
SE.mean        0.243344138468579
CI.mean.0.95   0.478136476019468
var            28.8383720570564
std.dev        5.37013706129149
coef.var       101038326714568336
skewness       0.165991949575083
skew.2SE       0.750028230078793
kurtosis       0.0971118490001404
kurt.2SE       0.219841661800126
normtest.W     0.994250731001488
normtest.p     0.0634160048608441

Lab 6

StatCan IVT files

.IVT = Beyond 20/20 files

Beyond 20/20 is software that provides:

  • reading of Statistics Canada data
  • cross-tabulations with more than 2 dimensions
  • metadata on the variables

Export to Excel

  1. select the relevant rows
  2. right-click + show
    • e.g., for the percentage aged 0 to 14 and 65 and over: pick the rows, pick the total
  3. drag the column label (e.g., Géographie) to the row panel
  4. repeat steps 1 and 2
    • e.g., Montréal = 462
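
Once the table is exported, it can be read back into R. The sketch below is only an illustration: the readxl package is not part of this lab's package list and the file name is a hypothetical placeholder.


In [ ]:
# Hypothetical example: read a table exported from Beyond 20/20 (placeholder file name)
# install.packages("readxl", repos="http://cran.rstudio.com/")
library(readxl)
export <- read_excel("data/labo6/export_beyond2020.xlsx", sheet = 1)
head(export)

# if the table was exported as CSV instead:
# export <- read.csv("data/labo6/export_beyond2020.csv", header = TRUE)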

In [23]:
# import
library(foreign)
library(MASS)
library(pastecs)
library(car)
library(gmodels)
library(sas7bdat)

# data
RMR <- read.sas7bdat("data/labo6/rmrmtl06.sas7bdat", debug=FALSE)
names(RMR)


Out[23]:
  1. 'SRIDU'
  2. 'FRPCT_ApIm'
  3. 'Pop65PCT'
  4. 'TxChom'
  5. 'MonoPct'
  6. 'Menage1'
  7. 'ImmigRec'
  8. 'MinorVisib'
  9. 'EmplAtypiq'
  10. 'AucuneDipl'
  11. 'BacEtPlus'
  12. 'ILE_MTL'
  13. 'Zones'
  14. 'DistCBD_KM'
  15. 'SqrtTxChom'

In [24]:
# univariate statistics
summary(RMR)


Out[24]:
     SRIDU       FRPCT_ApIm       Pop65PCT          TxChom      
 0001.00:  1   Min.   : 1.00   Min.   : 2.620   Min.   : 0.000  
 0002.00:  1   1st Qu.: 7.50   1st Qu.: 9.037   1st Qu.: 4.700  
 0003.00:  1   Median :15.65   Median :12.605   Median : 6.500  
 0004.00:  1   Mean   :17.95   Mean   :13.763   Mean   : 7.342  
 0005.00:  1   3rd Qu.:25.45   3rd Qu.:17.185   3rd Qu.: 9.300  
 0006.00:  1   Max.   :85.40   Max.   :56.570   Max.   :29.100  
 (Other):854                                                    
    MonoPct         Menage1         ImmigRec        MinorVisib   
 Min.   : 0.00   Min.   : 6.25   Min.   : 0.000   Min.   : 0.00  
 1st Qu.:13.71   1st Qu.:19.83   1st Qu.: 1.188   1st Qu.: 5.07  
 Median :18.32   Median :31.33   Median : 3.345   Median :13.12  
 Mean   :19.00   Mean   :31.90   Mean   : 4.830   Mean   :16.80  
 3rd Qu.:23.39   3rd Qu.:43.66   3rd Qu.: 6.692   3rd Qu.:23.23  
 Max.   :50.00   Max.   :75.00   Max.   :28.850   Max.   :84.78  
                                                                 
   EmplAtypiq      AucuneDipl       BacEtPlus        ILE_MTL      
 Min.   :28.80   Min.   : 0.000   Min.   : 3.88   Min.   :0.0000  
 1st Qu.:39.36   1st Qu.: 8.258   1st Qu.:15.31   1st Qu.:0.0000  
 Median :42.66   Median :13.535   Median :23.71   Median :1.0000  
 Mean   :43.44   Mean   :14.266   Mean   :27.95   Mean   :0.5884  
 3rd Qu.:46.83   3rd Qu.:19.250   3rd Qu.:38.03   3rd Qu.:1.0000  
 Max.   :71.68   Max.   :42.270   Max.   :79.87   Max.   :1.0000  
                                                                  
     Zones        DistCBD_KM       SqrtTxChom   
 Min.   :1.00   Min.   : 0.000   Min.   :0.000  
 1st Qu.:1.00   1st Qu.: 5.303   1st Qu.:2.168  
 Median :1.00   Median : 9.775   Median :2.550  
 Mean   :1.93   Mean   :13.092   Mean   :2.632  
 3rd Qu.:3.00   3rd Qu.:19.349   3rd Qu.:3.050  
 Max.   :4.00   Max.   :57.504   Max.   :5.394  
                                                

In [25]:
# dummy variables
# Zones = 1 -> Montréal
# Zones = 2 -> Laval
# Zones = 3 -> Couronne Nord
# Zones = 4 -> Couronne Sud

RMR$Montreal     <- ifelse(RMR$Zones == 1, 1, 0)
RMR$Laval        <- ifelse(RMR$Zones == 2, 1, 0)
RMR$CouronneNord <- ifelse(RMR$Zones == 3, 1, 0)
RMR$CouronneSud  <- ifelse(RMR$Zones == 4, 1, 0)

In [26]:
## verification
CrossTable(RMR$Zones, RMR$Montreal)
CrossTable(RMR$Zones, RMR$Laval)
CrossTable(RMR$Zones, RMR$CouronneNord)
CrossTable(RMR$Zones, RMR$CouronneSud)


 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$Montreal 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |         0 |       506 |       506 | 
             |   208.284 |   145.716 |           | 
             |     0.000 |     1.000 |     0.588 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.588 |           | 
-------------|-----------|-----------|-----------|
           2 |        73 |         0 |        73 | 
             |    61.393 |    42.951 |           | 
             |     1.000 |     0.000 |     0.085 | 
             |     0.206 |     0.000 |           | 
             |     0.085 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           3 |       116 |         0 |       116 | 
             |    97.557 |    68.251 |           | 
             |     1.000 |     0.000 |     0.135 | 
             |     0.328 |     0.000 |           | 
             |     0.135 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           4 |       165 |         0 |       165 | 
             |   138.766 |    97.081 |           | 
             |     1.000 |     0.000 |     0.192 | 
             |     0.466 |     0.000 |           | 
             |     0.192 |     0.000 |           | 
-------------|-----------|-----------|-----------|
Column Total |       354 |       506 |       860 | 
             |     0.412 |     0.588 |           | 
-------------|-----------|-----------|-----------|

 

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$Laval 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |       506 |         0 |       506 | 
             |     3.984 |    42.951 |           | 
             |     1.000 |     0.000 |     0.588 | 
             |     0.643 |     0.000 |           | 
             |     0.588 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           2 |         0 |        73 |        73 | 
             |    66.803 |   720.197 |           | 
             |     0.000 |     1.000 |     0.085 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.085 |           | 
-------------|-----------|-----------|-----------|
           3 |       116 |         0 |       116 | 
             |     0.913 |     9.847 |           | 
             |     1.000 |     0.000 |     0.135 | 
             |     0.147 |     0.000 |           | 
             |     0.135 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           4 |       165 |         0 |       165 | 
             |     1.299 |    14.006 |           | 
             |     1.000 |     0.000 |     0.192 | 
             |     0.210 |     0.000 |           | 
             |     0.192 |     0.000 |           | 
-------------|-----------|-----------|-----------|
Column Total |       787 |        73 |       860 | 
             |     0.915 |     0.085 |           | 
-------------|-----------|-----------|-----------|

 

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$CouronneNord 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |       506 |         0 |       506 | 
             |    10.641 |    68.251 |           | 
             |     1.000 |     0.000 |     0.588 | 
             |     0.680 |     0.000 |           | 
             |     0.588 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           2 |        73 |         0 |        73 | 
             |     1.535 |     9.847 |           | 
             |     1.000 |     0.000 |     0.085 | 
             |     0.098 |     0.000 |           | 
             |     0.085 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           3 |         0 |       116 |       116 | 
             |   100.353 |   643.647 |           | 
             |     0.000 |     1.000 |     0.135 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.135 |           | 
-------------|-----------|-----------|-----------|
           4 |       165 |         0 |       165 | 
             |     3.470 |    22.256 |           | 
             |     1.000 |     0.000 |     0.192 | 
             |     0.222 |     0.000 |           | 
             |     0.192 |     0.000 |           | 
-------------|-----------|-----------|-----------|
Column Total |       744 |       116 |       860 | 
             |     0.865 |     0.135 |           | 
-------------|-----------|-----------|-----------|

 

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$CouronneSud 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |       506 |         0 |       506 | 
             |    23.048 |    97.081 |           | 
             |     1.000 |     0.000 |     0.588 | 
             |     0.728 |     0.000 |           | 
             |     0.588 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           2 |        73 |         0 |        73 | 
             |     3.325 |    14.006 |           | 
             |     1.000 |     0.000 |     0.085 | 
             |     0.105 |     0.000 |           | 
             |     0.085 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           3 |       116 |         0 |       116 | 
             |     5.284 |    22.256 |           | 
             |     1.000 |     0.000 |     0.135 | 
             |     0.167 |     0.000 |           | 
             |     0.135 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           4 |         0 |       165 |       165 | 
             |   133.343 |   561.657 |           | 
             |     0.000 |     1.000 |     0.192 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.192 |           | 
-------------|-----------|-----------|-----------|
Column Total |       695 |       165 |       860 | 
             |     0.808 |     0.192 |           | 
-------------|-----------|-----------|-----------|

 

In [27]:
# interaction variables (with the distance to the CBD)
RMR$Pop65PCT_Dist <- RMR$Pop65PCT * RMR$DistCBD_KM
RMR$Menag1_Dist   <- RMR$Menage1 * RMR$DistCBD_KM

In [ ]:
## model 1: no dummy or interaction variables

Modele1 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT, 
    data=RMR
)

### model 1 without outliers
nobs <- NROW(na.omit(residuals(Modele1)))    # number of observations used by the model
cook <- cooks.distance(Modele1)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]

Modele1_Final <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT, 
    data=DataSansOutliers
)
summary(Modele1_Final)

# multicollinearity?
vif(Modele1_Final)
vif(Modele1_Final) > 5

In [ ]:
## model 2: dummy variable for the Island of Montréal

Modele2 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT + 
    Montreal, 
    data=RMR
)

### model 2 without outliers
nobs <- NROW(na.omit(residuals(Modele2)))    # number of observations used by the model
cook <- cooks.distance(Modele2)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]
Modele2_Final <- lm(FRPCT_ApIm ~ SqrtTxChom+MonoPct+Menage1+MinorVisib+EmplAtypiq+AucuneDipl+Pop65PCT+Montreal, data = DataSansOutliers)
summary(Modele2_Final)

### multicollinearity?
vif(Modele2_Final)
vif(Modele2_Final) > 5

In [ ]:
## model 3: Zones dummies (Montréal left out as the reference)

Modele3 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT + 
    Laval + 
    CouronneNord + 
    CouronneSud, 
    data=RMR
)

### model 3 without outliers
nobs <- NROW(na.omit(residuals(Modele3)))    # number of observations used by the model
cook <- cooks.distance(Modele3)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]
Modele3_Final <- lm(FRPCT_ApIm ~ SqrtTxChom+MonoPct+Menage1+MinorVisib+EmplAtypiq+AucuneDipl+Pop65PCT+Laval+CouronneNord+CouronneSud, data = DataSansOutliers)
summary(Modele3_Final)

### multicollinearity?
vif(Modele3_Final)
vif(Modele3_Final) > 5

In [ ]:
## model 4: model 3 + distance variable + interaction variable

Modele4 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT + 
    Laval + 
    CouronneNord + 
    CouronneSud + 
    DistCBD_KM + 
    Menag1_Dist, 
    data=RMR
)

### model 4 without outliers
nobs <- NROW(na.omit(residuals(Modele4)))    # number of observations used by the model
cook <- cooks.distance(Modele4)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]
Modele4_Final <- lm(FRPCT_ApIm ~ SqrtTxChom+MonoPct+Menage1+MinorVisib+EmplAtypiq+AucuneDipl+Pop65PCT+Laval+CouronneNord+CouronneSud+DistCBD_KM+Menag1_Dist, data = DataSansOutliers)
summary(Modele4_Final)

### multicollinearity?
vif(Modele4_Final)
vif(Modele4_Final) > 5