Labs in R

All of the labs, R version. (There is no Lab 1.)


Lab 2

Help


In [ ]:
# help()
# help(function)
# help(package='package-name')

Packages


In [ ]:
# install
# install.packages('package-name')

# already installed with conda
#install.packages("foreign")

# new installs
#install.packages("Rcmdr", dependencies = TRUE, repos="http://cran.rstudio.com/") # in conda?
#install.packages("nortest", repos="http://cran.rstudio.com/")
#install.packages("sas7bdat", repos="http://cran.rstudio.com/")
#install.packages("Hmisc", repos="http://cran.rstudio.com/")
#install.packages("pastecs", repos="http://cran.rstudio.com/")
# import
# library('package-name')

library(foreign)
library(nortest)
library(sas7bdat)
library(Hmisc)
library(pastecs)

Working space


In [ ]:
ls()
# rm(list=ls())
# setwd()
getwd()

Read data


In [ ]:
# import from Excel: via a tab-separated text file
#fichierTexte <- read.table("data/labo2/SR_Data.txt", header = TRUE)

# import DBF (DBase)
fichierDBF <- read.dbf("data/labo2/SR_Data.dbf")

# import SPSS
#fichierSPSS <- read.spss("data/labo2/Data_SPSS.sav", to.data.frame=TRUE)

# import SAS
#fichierSAS <- read.sas7bdat("data/labo2/tableau1.sas7bdat", debug=FALSE)

head(fichierDBF)

Table structure


In [ ]:
# show variable names
names(fichierDBF)
# indexes start at 1

# delete variable
fichierDBF$Shape_Leng <- NULL

# rename variable
names(fichierDBF)[1] <- "POPTOT"

# create variable
fichierDBF$km <- fichierDBF$Shape_Area / 1000000
fichierDBF$HabKm2 <- fichierDBF$POPTOT / fichierDBF$km

head(fichierDBF)

In [ ]:
# new table from a subset
names(fichierDBF)
ZScores <- fichierDBF[, c(12:15)]
names(ZScores)

Normality


In [ ]:
#install.packages("moments", repos="http://cran.rstudio.com/")
library(moments)

Skewness


In [ ]:
skewness(fichierDBF)

Kurtosis


In [ ]:
kurtosis(fichierDBF)

Kolmogorov-Smirnov


In [ ]:
#lillie.test(Tableau1$HabKm2)
# sapply(fichierDBF[18:20],lillie.test)
sapply(fichierDBF[18],lillie.test)

In [ ]:
#ks.test(x, y) # two sample

#m <- mean(fichierDBF[[18]])
#s <- sd(fichierDBF[[18]])

#ks.test(fichierDBF[[18]], "pnorm", m, s)

Shapiro-Wilk


In [ ]:
sapply(fichierDBF[18],shapiro.test)  # sapply(fichierDBF[18:20],shapiro.test)

Transformations

Square root


In [ ]:
fichierDBF$SqrtDens <- sqrt(fichierDBF$HabKm2)
fichierDBF$SqrtImg <- sqrt(fichierDBF$IMMREC_PCT)

Logarithmic


In [ ]:
# log(0) = -Inf, so add 1 when the variable can contain zeros
fichierDBF$LogDens <- log(fichierDBF$HabKm2)
fichierDBF$LogImg <- log(fichierDBF$IMMREC_PCT+1)

summary(fichierDBF)

Centering and scaling (z-scores)


In [ ]:
ZScores$INDICE_PAU <- scale(fichierDBF[1], center = TRUE, scale = TRUE)
ZScores$Dist_Min <- scale(fichierDBF[2], center = TRUE, scale = TRUE)
ZScores$N_1000 <- scale(fichierDBF[3], center = TRUE, scale = TRUE)
ZScores$Dist_Moy_3 <- scale(fichierDBF[4], center = TRUE, scale = TRUE)

#help(sapply)
sapply(ZScores,mean)
sapply(ZScores,sd)

Descriptive statistics


In [ ]:
summary(fichierDBF)

In [ ]:
sapply(fichierDBF, mean)
sapply(fichierDBF, sd)
sapply(fichierDBF, min)
sapply(fichierDBF, max)
sapply(fichierDBF, median)
sapply(fichierDBF, range)
sapply(fichierDBF, quantile)

In [ ]:
# Hmisc::describe
describe(fichierDBF)

In [ ]:
# pastecs::stat.desc
stat.desc(fichierDBF, basic=TRUE, norm=TRUE)

Histograms


In [ ]:
hist(fichierDBF$HabKm2, main="Histogramme", xlab="Habitants au km2", ylab="Effectif", breaks=10, col='lightblue')

In [ ]:
hist(fichierDBF$SqrtDens, main="Histogramme", xlab="Habitants au km2 (racine)", ylab="Effectif", breaks=10, col='gold')

In [ ]:
hist(fichierDBF$LogDens, main="Histogramme", xlab="Habitants au km2 (log)", ylab="Effectif", breaks=10, col='coral')

Histogram with normal curve


In [ ]:
x <- fichierDBF$HabKm2
h<-hist(x, breaks=10, col="lightblue", xlab="Habitants au km2", ylab="Effectif", 
main="Histogramme avec courbe normale")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)

In [ ]:
x <- fichierDBF$SqrtDens
h<-hist(x, breaks=10, col="red", xlab="Habitants au km2 (racine)", ylab = "Effectif",
main="Histogramme avec courbe normale")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)

Lab 3


In [ ]:
# install
#install.packages('doBy', repos="http://cran.rstudio.com/")
#install.packages('gmodels', repos="http://cran.rstudio.com/")
#install.packages('scatterplot3d', repos="http://cran.rstudio.com/")

# import
library(foreign)
library(nortest)
library(sas7bdat)
library(Hmisc)
library(pastecs)
library(ggplot2)
library(doBy)
library(gmodels)
library(scatterplot3d)

# data
Tableau1 <- read.sas7bdat("data/labo3/tableau1.sas7bdat", debug=FALSE)
names(Tableau1)

TableauKhi2 <- read.sas7bdat("data/labo3/khi2.sas7bdat", debug=FALSE)
names(TableauKhi2)

Basic histograms


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=10, xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

breaks = number of bars (bins)


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=20, xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

density = shading density of the bars (e.g., hatching)


In [ ]:
hist(Tableau1$IMMREC_PCT, density=20, breaks=20, xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

col = colours


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=20, col="red", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme") 
hist(Tableau1$IMMREC_PCT, breaks=20, col="lightyellow", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme") 
hist(Tableau1$IMMREC_PCT, breaks=20, col="lightsalmon", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme") 
hist(Tableau1$IMMREC_PCT, breaks=20, col="lightgreen", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

ylim = y-axis limits


In [ ]:
hist(Tableau1$IMMREC_PCT, breaks=20, ylim=c(0, 80), col="lightgreen", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme")

prob = density (proportion) instead of counts


In [ ]:
hist(Tableau1$IMMREC_PCT, col="lightgray", breaks=20, xlab="Immigrants récents (%)", ylab = "Proportion", main="Histogramme", prob=TRUE)

Histograms with normal curve

y = density (proportion)


In [ ]:
m <- mean(Tableau1$IMMREC_PCT)
std <- sd(Tableau1$IMMREC_PCT)
hist(Tableau1$IMMREC_PCT, col="lightyellow", breaks=20, prob=TRUE, xlab="Immigrants récents (%)", ylab = "Proportion", main="Histogramme avec la courbe normale")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)

y = counts


In [ ]:
x <- Tableau1$IMMREC_PCT
h<-hist(x, breaks=20, col="lightyellow", xlab="Immigrants récents (%)", ylab = "Effectif", main="Histogramme avec la courbe normale") 
xfit<-seq(min(x),max(x),length=40) 
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x)) 
yfit <- yfit*diff(h$mids[1:2])*length(x) 
lines(xfit, yfit, col="darkblue", lwd=2)

Scatter plots


In [ ]:
plot(Tableau1$IMMREC_PCT, Tableau1$FAIBREVPCT, xlab="Immigrants récents (%)", ylab = "Faible revenu (%)", main="Nuage de points")

Scatter plot with regression line


In [ ]:
plot(Tableau1$IMMREC_PCT, Tableau1$FAIBREVPCT, xlab="Immigrants récents (%)", ylab = "Faible revenu (%)", main="Nuage de points avec droite de régression")
abline(lsfit(Tableau1$IMMREC_PCT, Tableau1$FAIBREVPCT))

Scatter plot matrix


In [ ]:
pairs(~MONOPCT+MENAGE1PCT+TX_CHOM+FAIBREVPCT,data=Tableau1, 
      main="Matrice de nuages de points")

3D scatter plots


In [ ]:
scatterplot3d(Tableau1$MONOPCT, Tableau1$TX_CHOM, Tableau1$FAIBREVPCT, main="Nuage de points 3D")
scatterplot3d(Tableau1$MONOPCT, Tableau1$TX_CHOM, Tableau1$FAIBREVPCT, main="Nuage de points 3D", xlab="Familles monoparentales (%)", ylab="Taux de chômage", zlab="Faible revenu (%)");

Correlation matrix

Pearson


In [ ]:
rcorr(cbind(Tableau1$MONOPCT,Tableau1$MENAGE1PCT,Tableau1$TX_CHOM,Tableau1$FAIBREVPCT,Tableau1$Dist_Min,Tableau1$N_1000), type="pearson")

Spearman


In [ ]:
rcorr(cbind(Tableau1$MONOPCT,Tableau1$MENAGE1PCT,Tableau1$TX_CHOM,Tableau1$FAIBREVPCT,Tableau1$Dist_Min,Tableau1$N_1000), type="spearman")

Simple linear regression


In [ ]:
reg <- lm(TX_CHOM ~ FAIBREVPCT, data = Tableau1)
summary(reg)

names(Tableau1)

Contingency table


In [ ]:
names(TableauKhi2)

Levels of the nominal variables


In [ ]:
# sex
table(TableauKhi2$SEX)
TableauKhi2$SEX <- factor(TableauKhi2$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
table(TableauKhi2$SEX)

# transport mode
table(TableauKhi2$Mode)
TableauKhi2$Mode <- factor(TableauKhi2$Mode, levels = c(0:4), labels = c("Auto (conducteur)", "Auto (passager)", "Transport en commun", "Transport actif", "Autres"))
table(TableauKhi2$Mode)

# distance
table(TableauKhi2$DIST)
TableauKhi2$DIST <- factor(TableauKhi2$DIST, levels = c(1:7), labels = c("Moins de 5 km", "5 à 9,9 km","10 à 14,9 km", "15 à 19,9 km", "20 à 24,9 km", "25 à 29,9 km", "30 km et plus"))
table(TableauKhi2$DIST)

Contingency tables


In [ ]:
CrossTable(TableauKhi2$SEX, TableauKhi2$Mode, chisq=TRUE, expected=TRUE, resid=TRUE, format="SPSS")
CrossTable(TableauKhi2$SEX, TableauKhi2$DIST, chisq=TRUE, expected=TRUE, resid=TRUE, format="SPSS")
CrossTable(TableauKhi2$Mode, TableauKhi2$DIST, chisq=TRUE, expected=TRUE, resid=TRUE, format="SPSS")

Lab 4


In [ ]:
# import
library(foreign)
library(nortest)
library(sas7bdat)
library(doBy)

# data
MTL <- read.sas7bdat("data/labo4/mtl_ttest.sas7bdat", debug=FALSE)
TOR <- read.sas7bdat("data/labo4/tor_ttest.sas7bdat", debug=FALSE)
VAN <- read.sas7bdat("data/labo4/van_ttest.sas7bdat", debug=FALSE)
TROISRMR <- read.sas7bdat("data/labo4/troisrmr_anova.sas7bdat", debug=FALSE)
names(MTL)
names(TOR)
names(VAN)
names(TROISRMR)

In [ ]:
# levels (labels)
table(MTL$SEX)
table(TOR$SEX)
table(VAN$SEX)
MTL$SEX <- factor(MTL$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
TOR$SEX <- factor(TOR$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
VAN$SEX <- factor(VAN$SEX, levels = c(1,2), labels = c("Homme", "Femme"))
table(MTL$SEX)
table(TOR$SEX)
table(VAN$SEX)

TROISRMR$CMA <- factor(TROISRMR$CMA, levels = c(462,535,933), labels = c("Montréal", "Toronto", "Vancouver"))
table(TROISRMR$CMA)

T-test: comparison of means

F test

Checking the equality of the variances


In [ ]:
var.test(TOTINC ~ SEX, alternative='two.sided', conf.level=.95, data=MTL)

Interpretation

  • p-value < 2.2e-16
    • p < 0.05, so use the Satterthwaite method (see the sketch below)
  • true ratio of variances is not equal to 1
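
A minimal sketch (not from the original lab, assuming MTL is loaded and labelled as above) of this decision rule: read the p-value off the var.test result and pass the matching var.equal to t.test.


In [ ]:
# Sketch: choose the t-test variant from the F-test p-value (assumes MTL as above)
ftest <- var.test(TOTINC ~ SEX, data = MTL)
equal_var <- ftest$p.value >= 0.05   # TRUE -> pooled method, FALSE -> Satterthwaite (Welch)
t.test(TOTINC ~ SEX, alternative = 'two.sided', conf.level = .95, var.equal = equal_var, data = MTL)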

Satterthwaite method

Variances not equal: p < 0.05

  • var.equal=FALSE

In [ ]:
t.test(TOTINC~SEX, alternative='two.sided', conf.level=.95, var.equal=FALSE, data=MTL)

Interpretation

  • t = -27.088
  • p-value < 2.2e-16

Pooled method

Variances equal: p >= 0.05

  • var.equal=TRUE

In [ ]:
t.test(TOTINC~SEX, alternative='two.sided', conf.level=.95, var.equal=TRUE, data=MTL)
boxplot(TOTINC~SEX, data = MTL, col = "coral", main="Boites à moustache (RMR de Montréal)", xlab="Sexe", ylab="Revenu total")
boxplot(LogTotInc~SEX, data = MTL, col = "coral", main="Boites à moustache (RMR de Montréal)", xlab="Sexe", ylab="Revenu total (log)")

Interpretation

  • t = -27.783
  • p-value < 2.2e-16

Analysis of the results

Give the dataset context and the values, then compare the means across the two levels of the qualitative variable, e.g. "the difference between the means (x) is, moreover, significant (t = 27.09; p < 0.001)". The sketch below pulls these numbers directly out of the test object.
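
A hedged sketch (assuming MTL as above): statistic, p.value and estimate are standard components of the htest object returned by t.test, so the reported values need not be copied by hand.


In [ ]:
# Sketch: extract the values quoted in the write-up from the t.test result
tt <- t.test(TOTINC ~ SEX, alternative = 'two.sided', conf.level = .95, var.equal = FALSE, data = MTL)
tt$estimate                                  # the two group means
diff(tt$estimate)                            # difference between the means
c(t = unname(tt$statistic), p = tt$p.value)  # test statistic and p-value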

ANOVA: analysis of variance

Mean by group


In [ ]:
# doBy
summaryBy(GROSRT ~ CMA, TROISRMR, FUN=c(mean), na.rm=TRUE)

Boxplot

Visualizing the ANOVA


In [ ]:
boxplot(GROSRT ~ CMA, data = TROISRMR, col = "lightyellow", main="Boites à moustache", xlab="Région métropolitaine", ylab="Loyer ($)")  # analysis of variance: F test

ANOVA


In [ ]:
anova.aov <- aov(GROSRT ~ CMA, data = TROISRMR)
summary(anova.aov)

Interpretation

  • CMA Sum Sq = explained variance (between groups)
  • Residuals Sum Sq = unexplained variance (within groups)
  • CMA Df = degrees of freedom of the explained (between-group) variance
  • Residuals Df = degrees of freedom of the unexplained (within-group) variance
  • CMA F value = observed F
  • CMA Pr(>F) = p-value associated with the F value
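
A minimal sketch (assuming anova.aov from the cell above) of where these quantities sit in the summary object; the last line is simply the explained share of the total sum of squares and should agree with the R squared computed further down.


In [ ]:
# Sketch: pull the ANOVA table apart (assumes anova.aov defined above)
tab <- summary(anova.aov)[[1]]        # the ANOVA table as a data frame
tab$"Sum Sq"                          # explained (inter) then residual (intra) sums of squares
tab$Df                                # degrees of freedom
tab$"F value"[1]                      # observed F
tab$"Sum Sq"[1] / sum(tab$"Sum Sq")   # share of the variance explained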

F test

Null hypothesis H0: the group means are equal (the between-group variance is no larger than the within-group variance).

  • k = number of groups
  • n = number of observations

  • numerator df (explained variance, between groups) in the Fisher table
    • k - 1
  • denominator df (unexplained variance, within groups) in the Fisher table
    • n - k

Computing the theoretical F

  • theoretical F
  • p associated with the theoretical F; significance thresholds:
    • 95%: p = 0.05
    • 99%: p = 0.01
    • 99.9%: p = 0.001

In [ ]:
f_theorique <- qf(0.99, 2, 8379)
f_theorique
# qt() for the Student t table, for the coefficient of ...
# (see other courses)

Interpretation

  • observed F > theoretical F
    • the means are statistically different
    • H0 rejected
  • observed F < theoretical F
    • the group means are not significantly different
    • H0 not rejected (see the comparison sketch below)
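
A sketch of this comparison (assuming TROISRMR and anova.aov as above): compute k and n from the data instead of hard-coding the degrees of freedom, then compare the observed F with the theoretical F.


In [ ]:
# Sketch: degrees of freedom from the data, then the decision rule
k <- nlevels(TROISRMR$CMA)                        # number of groups
n <- sum(!is.na(TROISRMR$GROSRT))                 # number of observations
f_theorique <- qf(0.99, df1 = k - 1, df2 = n - k) # theoretical F at the 99% level
f_observe <- summary(anova.aov)[[1]]$"F value"[1] # observed F
f_observe > f_theorique                           # TRUE -> H0 rejected, the means differ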

Computing R squared

To obtain the coefficient of determination


In [ ]:
anova.r2 <- lm(GROSRT ~ CMA, data = TROISRMR)
summary(anova.r2)

Interpretation

  • Multiple R-squared = coefficient of determination
    • the qualitative variable explains x% of the variation of the quantitative variable
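
For completeness, a small sketch (assuming anova.r2 from the cell above) that extracts the coefficient of determination from the summary object instead of reading it off the printout.


In [ ]:
# Sketch: R-squared as a number (assumes anova.r2 defined above)
summary(anova.r2)$r.squared
summary(anova.r2)$adj.r.squared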

Tukey test

Pairwise comparison of the group means


In [ ]:
TukeyHSD(anova.aov)

Lab 5


In [ ]:
# install
#install.packages("MASS", repos="http://cran.rstudio.com/")      ## Tests de normalité supp.
#install.packages("car", repos="http://cran.rstudio.com/")      ## Companion to Applied Regression

In [4]:
# import
library(foreign)
library(MASS)
library(sas7bdat)
library(pastecs)
library(car)

# data
MTL <- read.sas7bdat("data/labo5/pauvretemtl.sas7bdat", debug=FALSE)
names(MTL)


Out[4]:
  1. 'srnom'
  2. 'FAIBREVPCT'
  3. 'SqrtChom'
  4. 'MONOPCT'
  5. 'menage1per'
  6. 'SqrtImmig'
  7. 'pasecol1524'
  8. 'tpspartiel'

In [5]:
# univariate statistics
summary(MTL)


Out[5]:
     srnom       FAIBREVPCT        SqrtChom        MONOPCT     
 0001.00:  1   Min.   : 1.232   Min.   :0.000   Min.   : 0.00  
 0002.00:  1   1st Qu.:19.761   1st Qu.:2.568   1st Qu.:16.05  
 0003.00:  1   Median :28.699   Median :2.925   Median :21.23  
 0004.00:  1   Mean   :29.982   Mean   :2.995   Mean   :21.38  
 0005.00:  1   3rd Qu.:39.803   3rd Qu.:3.416   3rd Qu.:26.18  
 0006.00:  1   Max.   :82.642   Max.   :6.887   Max.   :51.28  
 (Other):500                                                   
   menage1per       SqrtImmig      pasecol1524      tpspartiel   
 Min.   : 3.943   Min.   :0.000   Min.   : 0.00   Min.   :30.65  
 1st Qu.:28.587   1st Qu.:1.454   1st Qu.:24.51   1st Qu.:41.26  
 Median :38.598   Median :1.961   Median :32.66   Median :45.47  
 Mean   :37.674   Mean   :2.074   Mean   :32.66   Mean   :45.61  
 3rd Qu.:46.752   3rd Qu.:2.543   3rd Qu.:40.93   3rd Qu.:49.65  
 Max.   :72.632   Max.   :5.078   Max.   :68.75   Max.   :69.79  
                                                                 

In [11]:
# multiple linear regression
ols <- lm(
    FAIBREVPCT ~ 
    SqrtChom + 
    MONOPCT + 
    menage1per + 
    SqrtImmig + 
    pasecol1524 + 
    tpspartiel, 
    data=MTL)
summary(ols)


Out[11]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = MTL)

Residuals:
    Min      1Q  Median      3Q     Max 
-20.405  -4.290  -0.586   3.498  34.343 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -36.10961    2.20183 -16.400  < 2e-16 ***
SqrtChom      7.97771    0.57049  13.984  < 2e-16 ***
MONOPCT       0.60825    0.04703  12.933  < 2e-16 ***
menage1per    0.20132    0.02517   7.997 8.94e-15 ***
SqrtImmig     2.58049    0.37394   6.901 1.57e-11 ***
pasecol1524   0.10986    0.02818   3.899  0.00011 ***
tpspartiel    0.27767    0.05336   5.204 2.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.326 on 499 degrees of freedom
Multiple R-squared:  0.8059,	Adjusted R-squared:  0.8035 
F-statistic: 345.2 on 6 and 499 DF,  p-value: < 2.2e-16

In [10]:
## standardized coefficients
CoefStand <- lm(
    scale(FAIBREVPCT) ~ 
    scale(SqrtChom) + 
    scale(MONOPCT) + 
    scale(menage1per) + 
    scale(SqrtImmig) + 
    scale(pasecol1524) + 
    scale(tpspartiel), 
    data = MTL
)
summary(CoefStand)


Out[10]:
Call:
lm(formula = scale(FAIBREVPCT) ~ scale(SqrtChom) + scale(MONOPCT) + 
    scale(menage1per) + scale(SqrtImmig) + scale(pasecol1524) + 
    scale(tpspartiel), data = MTL)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.42971 -0.30057 -0.04107  0.24509  2.40635 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -1.646e-16  1.971e-02   0.000  1.00000    
scale(SqrtChom)     3.899e-01  2.788e-02  13.984  < 2e-16 ***
scale(MONOPCT)      3.343e-01  2.585e-02  12.933  < 2e-16 ***
scale(menage1per)   1.825e-01  2.282e-02   7.997 8.94e-15 ***
scale(SqrtImmig)    1.714e-01  2.484e-02   6.901 1.57e-11 ***
scale(pasecol1524)  9.386e-02  2.407e-02   3.899  0.00011 ***
scale(tpspartiel)   1.268e-01  2.436e-02   5.204 2.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4433 on 499 degrees of freedom
Multiple R-squared:  0.8059,	Adjusted R-squared:  0.8035 
F-statistic: 345.2 on 6 and 499 DF,  p-value: < 2.2e-16

In [12]:
## multicollinearity?
### VIF values
vif(ols)
vif(ols) > 5 # multicollinearity problem (VIF > 5)?


Out[12]:
SqrtChom      1.99767242206301
MONOPCT       1.7169166500618
menage1per    1.33888028140516
SqrtImmig     1.58542766716237
pasecol1524   1.48920558001708
tpspartiel    1.52538045714979
Out[12]:
SqrtChom      FALSE
MONOPCT       FALSE
menage1per    FALSE
SqrtImmig     FALSE
pasecol1524   FALSE
tpspartiel    FALSE

In [13]:
## Diagnostic plots and Cook's distance
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0))
plot(ols, las = 1)
par(opar)



In [14]:
## Histogram of the residuals and check of their normality
m <- mean(residuals(ols))
std <- sd(residuals(ols))
hist(residuals(ols), col="lightyellow", breaks=20, prob=TRUE, xlab="Résidus OLS", ylab = "Proportion", main="Histogramme avec la courbe normale")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)
stat.desc(residuals(ols), basic=TRUE, norm=TRUE)


Out[14]:
nbr.val        506
nbr.null       0
nbr.na         0
min            -20.4047979910667
max            34.3433376307782
range          54.748135621845
sum            -7.40241201668823e-14
median         -0.586204938060121
mean           -1.46830543304696e-16
SE.mean        0.27955974003622
CI.mean.0.95   0.549243372142321
var            39.5457460140541
std.dev        6.28854084935878
coef.var       -42828560787309032
skewness       0.676033517801706
skew.2SE       3.11329159768941
kurtosis       2.13343255710525
kurt.2SE       4.92204145621073
normtest.W     0.972936774972511
normtest.p     4.69730411019801e-08

In [16]:
# Flag outliers: Cook's distance > 4/n or 8/n
nobs <- NROW(na.omit(residuals(ols)))            # number of observations in the dataset
cook <- cooks.distance(ols)                      # Cook's distance
ypredit <- fitted.values(ols)                    # Y values predicted by the model
res <- residuals(ols)                            # residuals (Y - predicted Y)
res_std <- rstandard(ols)                        # standardized residuals

a <- cbind(MTL, cook, ypredit, res, res_std)
a <- a[order(-cook), ]
a[a$cook > 4/nobs, ]    # observations with Cook's distance > 4/n
a[a$cook > 8/nobs, ]    # observations with Cook's distance > 8/n


Out[16]:
[Flattened table output: the observations flagged by the Cook's distance > 4/n filter, sorted by decreasing cook, with the columns srnom, FAIBREVPCT, SqrtChom, MONOPCT, menage1per, SqrtImmig, pasecol1524, tpspartiel, cook, ypredit, res, res_std.]
Out[16]:
[Flattened table output: the observations flagged by the Cook's distance > 8/n filter, same columns.]

In [17]:
# Data table without the outliers (Cook > 8/n)
dataSansOutliers <- a[a$cook  < 8/nobs, ]
dataSansOutliers$cook <- NULL
dataSansOutliers$ypredit <- NULL
dataSansOutliers$res <- NULL
dataSansOutliers$res_std <- NULL
head(dataSansOutliers)


Out[17]:
[Flattened table output: the first rows of dataSansOutliers, with the columns srnom, FAIBREVPCT, SqrtChom, MONOPCT, menage1per, SqrtImmig, pasecol1524, tpspartiel.]

In [18]:
# New regression model without the outliers
ols2 <- lm(FAIBREVPCT ~ SqrtChom+MONOPCT+menage1per+SqrtImmig+pasecol1524+tpspartiel, data = dataSansOutliers)
summary(ols2)
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0))
plot(ols2, las = 1)
par(opar)


Out[18]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = dataSansOutliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.4311  -3.7694  -0.2836   3.3556  14.3676 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -38.07012    1.98689 -19.161  < 2e-16 ***
SqrtChom      8.62818    0.56045  15.395  < 2e-16 ***
MONOPCT       0.62153    0.04414  14.082  < 2e-16 ***
menage1per    0.13012    0.02321   5.605 3.52e-08 ***
SqrtImmig     2.37947    0.34582   6.881 1.87e-11 ***
pasecol1524   0.13829    0.02593   5.333 1.49e-07 ***
tpspartiel    0.30772    0.04772   6.448 2.77e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.404 on 480 degrees of freedom
Multiple R-squared:  0.842,	Adjusted R-squared:  0.8401 
F-statistic: 426.5 on 6 and 480 DF,  p-value: < 2.2e-16

In [19]:
# Comparison of the two models: coefficients
summary(ols)
summary(ols2)


Out[19]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = MTL)

Residuals:
    Min      1Q  Median      3Q     Max 
-20.405  -4.290  -0.586   3.498  34.343 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -36.10961    2.20183 -16.400  < 2e-16 ***
SqrtChom      7.97771    0.57049  13.984  < 2e-16 ***
MONOPCT       0.60825    0.04703  12.933  < 2e-16 ***
menage1per    0.20132    0.02517   7.997 8.94e-15 ***
SqrtImmig     2.58049    0.37394   6.901 1.57e-11 ***
pasecol1524   0.10986    0.02818   3.899  0.00011 ***
tpspartiel    0.27767    0.05336   5.204 2.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.326 on 499 degrees of freedom
Multiple R-squared:  0.8059,	Adjusted R-squared:  0.8035 
F-statistic: 345.2 on 6 and 499 DF,  p-value: < 2.2e-16
Out[19]:
Call:
lm(formula = FAIBREVPCT ~ SqrtChom + MONOPCT + menage1per + SqrtImmig + 
    pasecol1524 + tpspartiel, data = dataSansOutliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.4311  -3.7694  -0.2836   3.3556  14.3676 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -38.07012    1.98689 -19.161  < 2e-16 ***
SqrtChom      8.62818    0.56045  15.395  < 2e-16 ***
MONOPCT       0.62153    0.04414  14.082  < 2e-16 ***
menage1per    0.13012    0.02321   5.605 3.52e-08 ***
SqrtImmig     2.37947    0.34582   6.881 1.87e-11 ***
pasecol1524   0.13829    0.02593   5.333 1.49e-07 ***
tpspartiel    0.30772    0.04772   6.448 2.77e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.404 on 480 degrees of freedom
Multiple R-squared:  0.842,	Adjusted R-squared:  0.8401 
F-statistic: 426.5 on 6 and 480 DF,  p-value: < 2.2e-16

In [20]:
# Comparison of the two models: VIF
vif(ols)
vif(ols2)


Out[20]:
SqrtChom      1.99767242206301
MONOPCT       1.7169166500618
menage1per    1.33888028140516
SqrtImmig     1.58542766716237
pasecol1524   1.48920558001708
tpspartiel    1.52538045714979
Out[20]:
SqrtChom      2.14092431277651
MONOPCT       1.76657445847508
menage1per    1.4393672286109
SqrtImmig     1.70721014765762
pasecol1524   1.62084718042594
tpspartiel    1.5256792012903

In [21]:
# Comparison of the two histograms
m <- mean(residuals(ols))
std <- sd(residuals(ols))
hist(residuals(ols), col="lightyellow", breaks=20, prob=TRUE, xlab="Résidus OLS", ylab = "Proportion", main="Modèle de départ")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)
stat.desc(residuals(ols), basic=TRUE, norm=TRUE)

m <- mean(residuals(ols2))
std <- sd(residuals(ols2))
hist(residuals(ols2), col="lightyellow", breaks=20, prob=TRUE, xlab="Résidus OLS", ylab = "Proportion", main="Modèle sans les outliers")
curve(dnorm(x, mean=m, sd=std), col="darkblue", lwd=2, add=TRUE)
stat.desc(residuals(ols2), basic=TRUE, norm=TRUE)


Out[21]:
nbr.val        506
nbr.null       0
nbr.na         0
min            -20.4047979910667
max            34.3433376307782
range          54.748135621845
sum            -7.40241201668823e-14
median         -0.586204938060121
mean           -1.46830543304696e-16
SE.mean        0.27955974003622
CI.mean.0.95   0.549243372142321
var            39.5457460140541
std.dev        6.28854084935878
coef.var       -42828560787309032
skewness       0.676033517801706
skew.2SE       3.11329159768941
kurtosis       2.13343255710525
kurt.2SE       4.92204145621073
normtest.W     0.972936774972511
normtest.p     4.69730411019801e-08
Out[21]:
nbr.val        487
nbr.null       0
nbr.na         0
min            -15.4310542378812
max            14.3676352481711
range          29.7986894860523
sum            2.59237076249974e-14
median         -0.283621129995047
mean           5.31495051027719e-17
SE.mean        0.243344138468579
CI.mean.0.95   0.478136476019468
var            28.8383720570564
std.dev        5.37013706129149
coef.var       101038326714568336
skewness       0.165991949575083
skew.2SE       0.750028230078793
kurtosis       0.0971118490001404
kurt.2SE       0.219841661800126
normtest.W     0.994250731001488
normtest.p     0.0634160048608441

In [22]:
# Comparison of the normality of the residuals
stat.desc(residuals(ols), basic=TRUE, norm=TRUE)
stat.desc(residuals(ols2), basic=TRUE, norm=TRUE)


Out[22]:
nbr.val        506
nbr.null       0
nbr.na         0
min            -20.4047979910667
max            34.3433376307782
range          54.748135621845
sum            -7.40241201668823e-14
median         -0.586204938060121
mean           -1.46830543304696e-16
SE.mean        0.27955974003622
CI.mean.0.95   0.549243372142321
var            39.5457460140541
std.dev        6.28854084935878
coef.var       -42828560787309032
skewness       0.676033517801706
skew.2SE       3.11329159768941
kurtosis       2.13343255710525
kurt.2SE       4.92204145621073
normtest.W     0.972936774972511
normtest.p     4.69730411019801e-08
Out[22]:
nbr.val        487
nbr.null       0
nbr.na         0
min            -15.4310542378812
max            14.3676352481711
range          29.7986894860523
sum            2.59237076249974e-14
median         -0.283621129995047
mean           5.31495051027719e-17
SE.mean        0.243344138468579
CI.mean.0.95   0.478136476019468
var            28.8383720570564
std.dev        5.37013706129149
coef.var       101038326714568336
skewness       0.165991949575083
skew.2SE       0.750028230078793
kurtosis       0.0971118490001404
kurt.2SE       0.219841661800126
normtest.W     0.994250731001488
normtest.p     0.0634160048608441

Lab 6

StatCan IVT files

.IVT = Beyond 20/20 files

Beyond 20/20 is software that provides:

  • reading of Statistics Canada data
  • cross-tabulations with more than 2 dimensions
  • metadata on the variables

Export to Excel

  1. select the relevant rows
  2. right-click + show
    • e.g., for the percentage aged 0 to 14 and 65 and over: pick the rows, pick the total
  3. drag the column label (e.g., Géographie) to the row panel
  4. repeat steps 1 and 2
    • e.g., Montréal = 462
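
Once the table is exported, it can be read back into R. The sketch below is only an illustration: the readxl package is not part of this lab's package list and the file name is a hypothetical placeholder.


In [ ]:
# Hypothetical example: read a table exported from Beyond 20/20 (placeholder file name)
# install.packages("readxl", repos="http://cran.rstudio.com/")
library(readxl)
export <- read_excel("data/labo6/export_beyond2020.xlsx", sheet = 1)
head(export)

# if the table was exported as CSV instead:
# export <- read.csv("data/labo6/export_beyond2020.csv", header = TRUE)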

In [23]:
# import
library(foreign)
library(MASS)
library(pastecs)
library(car)
library(gmodels)
library(sas7bdat)

# data
RMR <- read.sas7bdat("data/labo6/rmrmtl06.sas7bdat", debug=FALSE)
names(RMR)


Out[23]:
  1. 'SRIDU'
  2. 'FRPCT_ApIm'
  3. 'Pop65PCT'
  4. 'TxChom'
  5. 'MonoPct'
  6. 'Menage1'
  7. 'ImmigRec'
  8. 'MinorVisib'
  9. 'EmplAtypiq'
  10. 'AucuneDipl'
  11. 'BacEtPlus'
  12. 'ILE_MTL'
  13. 'Zones'
  14. 'DistCBD_KM'
  15. 'SqrtTxChom'

In [24]:
# univariate statistics
summary(RMR)


Out[24]:
     SRIDU       FRPCT_ApIm       Pop65PCT          TxChom      
 0001.00:  1   Min.   : 1.00   Min.   : 2.620   Min.   : 0.000  
 0002.00:  1   1st Qu.: 7.50   1st Qu.: 9.037   1st Qu.: 4.700  
 0003.00:  1   Median :15.65   Median :12.605   Median : 6.500  
 0004.00:  1   Mean   :17.95   Mean   :13.763   Mean   : 7.342  
 0005.00:  1   3rd Qu.:25.45   3rd Qu.:17.185   3rd Qu.: 9.300  
 0006.00:  1   Max.   :85.40   Max.   :56.570   Max.   :29.100  
 (Other):854                                                    
    MonoPct         Menage1         ImmigRec        MinorVisib   
 Min.   : 0.00   Min.   : 6.25   Min.   : 0.000   Min.   : 0.00  
 1st Qu.:13.71   1st Qu.:19.83   1st Qu.: 1.188   1st Qu.: 5.07  
 Median :18.32   Median :31.33   Median : 3.345   Median :13.12  
 Mean   :19.00   Mean   :31.90   Mean   : 4.830   Mean   :16.80  
 3rd Qu.:23.39   3rd Qu.:43.66   3rd Qu.: 6.692   3rd Qu.:23.23  
 Max.   :50.00   Max.   :75.00   Max.   :28.850   Max.   :84.78  
                                                                 
   EmplAtypiq      AucuneDipl       BacEtPlus        ILE_MTL      
 Min.   :28.80   Min.   : 0.000   Min.   : 3.88   Min.   :0.0000  
 1st Qu.:39.36   1st Qu.: 8.258   1st Qu.:15.31   1st Qu.:0.0000  
 Median :42.66   Median :13.535   Median :23.71   Median :1.0000  
 Mean   :43.44   Mean   :14.266   Mean   :27.95   Mean   :0.5884  
 3rd Qu.:46.83   3rd Qu.:19.250   3rd Qu.:38.03   3rd Qu.:1.0000  
 Max.   :71.68   Max.   :42.270   Max.   :79.87   Max.   :1.0000  
                                                                  
     Zones        DistCBD_KM       SqrtTxChom   
 Min.   :1.00   Min.   : 0.000   Min.   :0.000  
 1st Qu.:1.00   1st Qu.: 5.303   1st Qu.:2.168  
 Median :1.00   Median : 9.775   Median :2.550  
 Mean   :1.93   Mean   :13.092   Mean   :2.632  
 3rd Qu.:3.00   3rd Qu.:19.349   3rd Qu.:3.050  
 Max.   :4.00   Max.   :57.504   Max.   :5.394  
                                                

In [25]:
# dummy variables
# Zones = 1 -> Montréal
# Zones = 2 -> Laval
# Zones = 3 -> Couronne Nord
# Zones = 4 -> Couronne Sud

RMR$Montreal     <- ifelse(RMR$Zones == 1, 1, 0)
RMR$Laval        <- ifelse(RMR$Zones == 2, 1, 0)
RMR$CouronneNord <- ifelse(RMR$Zones == 3, 1, 0)
RMR$CouronneSud  <- ifelse(RMR$Zones == 4, 1, 0)

In [26]:
## verification
CrossTable(RMR$Zones, RMR$Montreal)
CrossTable(RMR$Zones, RMR$Laval)
CrossTable(RMR$Zones, RMR$CouronneNord)
CrossTable(RMR$Zones, RMR$CouronneSud)


 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$Montreal 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |         0 |       506 |       506 | 
             |   208.284 |   145.716 |           | 
             |     0.000 |     1.000 |     0.588 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.588 |           | 
-------------|-----------|-----------|-----------|
           2 |        73 |         0 |        73 | 
             |    61.393 |    42.951 |           | 
             |     1.000 |     0.000 |     0.085 | 
             |     0.206 |     0.000 |           | 
             |     0.085 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           3 |       116 |         0 |       116 | 
             |    97.557 |    68.251 |           | 
             |     1.000 |     0.000 |     0.135 | 
             |     0.328 |     0.000 |           | 
             |     0.135 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           4 |       165 |         0 |       165 | 
             |   138.766 |    97.081 |           | 
             |     1.000 |     0.000 |     0.192 | 
             |     0.466 |     0.000 |           | 
             |     0.192 |     0.000 |           | 
-------------|-----------|-----------|-----------|
Column Total |       354 |       506 |       860 | 
             |     0.412 |     0.588 |           | 
-------------|-----------|-----------|-----------|

 

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$Laval 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |       506 |         0 |       506 | 
             |     3.984 |    42.951 |           | 
             |     1.000 |     0.000 |     0.588 | 
             |     0.643 |     0.000 |           | 
             |     0.588 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           2 |         0 |        73 |        73 | 
             |    66.803 |   720.197 |           | 
             |     0.000 |     1.000 |     0.085 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.085 |           | 
-------------|-----------|-----------|-----------|
           3 |       116 |         0 |       116 | 
             |     0.913 |     9.847 |           | 
             |     1.000 |     0.000 |     0.135 | 
             |     0.147 |     0.000 |           | 
             |     0.135 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           4 |       165 |         0 |       165 | 
             |     1.299 |    14.006 |           | 
             |     1.000 |     0.000 |     0.192 | 
             |     0.210 |     0.000 |           | 
             |     0.192 |     0.000 |           | 
-------------|-----------|-----------|-----------|
Column Total |       787 |        73 |       860 | 
             |     0.915 |     0.085 |           | 
-------------|-----------|-----------|-----------|

 

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$CouronneNord 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |       506 |         0 |       506 | 
             |    10.641 |    68.251 |           | 
             |     1.000 |     0.000 |     0.588 | 
             |     0.680 |     0.000 |           | 
             |     0.588 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           2 |        73 |         0 |        73 | 
             |     1.535 |     9.847 |           | 
             |     1.000 |     0.000 |     0.085 | 
             |     0.098 |     0.000 |           | 
             |     0.085 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           3 |         0 |       116 |       116 | 
             |   100.353 |   643.647 |           | 
             |     0.000 |     1.000 |     0.135 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.135 |           | 
-------------|-----------|-----------|-----------|
           4 |       165 |         0 |       165 | 
             |     3.470 |    22.256 |           | 
             |     1.000 |     0.000 |     0.192 | 
             |     0.222 |     0.000 |           | 
             |     0.192 |     0.000 |           | 
-------------|-----------|-----------|-----------|
Column Total |       744 |       116 |       860 | 
             |     0.865 |     0.135 |           | 
-------------|-----------|-----------|-----------|

 

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  860 

 
             | RMR$CouronneSud 
   RMR$Zones |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
           1 |       506 |         0 |       506 | 
             |    23.048 |    97.081 |           | 
             |     1.000 |     0.000 |     0.588 | 
             |     0.728 |     0.000 |           | 
             |     0.588 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           2 |        73 |         0 |        73 | 
             |     3.325 |    14.006 |           | 
             |     1.000 |     0.000 |     0.085 | 
             |     0.105 |     0.000 |           | 
             |     0.085 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           3 |       116 |         0 |       116 | 
             |     5.284 |    22.256 |           | 
             |     1.000 |     0.000 |     0.135 | 
             |     0.167 |     0.000 |           | 
             |     0.135 |     0.000 |           | 
-------------|-----------|-----------|-----------|
           4 |         0 |       165 |       165 | 
             |   133.343 |   561.657 |           | 
             |     0.000 |     1.000 |     0.192 | 
             |     0.000 |     1.000 |           | 
             |     0.000 |     0.192 |           | 
-------------|-----------|-----------|-----------|
Column Total |       695 |       165 |       860 | 
             |     0.808 |     0.192 |           | 
-------------|-----------|-----------|-----------|

 

In [27]:
# interaction variables (with the distance to the CBD)
RMR$Pop65PCT_Dist <- RMR$Pop65PCT * RMR$DistCBD_KM
RMR$Menag1_Dist   <- RMR$Menage1 * RMR$DistCBD_KM

In [ ]:
## model 1: no dummy or interaction variables

Modele1 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT, 
    data=RMR
)

### model 1 without outliers
nobs <- NROW(na.omit(residuals(Modele1)))    # number of observations used by the model
cook <- cooks.distance(Modele1)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]

Modele1_Final <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT, 
    data=DataSansOutliers
)
summary(Modele1_Final)

# multicollinearity?
vif(Modele1_Final)
vif(Modele1_Final) > 5

In [ ]:
## model 2: dummy variable for the Island of Montréal

Modele2 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT + 
    Montreal, 
    data=RMR
)

### model 2 without outliers
nobs <- NROW(na.omit(residuals(Modele2)))    # number of observations used by the model
cook <- cooks.distance(Modele2)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]
Modele2_Final <- lm(FRPCT_ApIm ~ SqrtTxChom+MonoPct+Menage1+MinorVisib+EmplAtypiq+AucuneDipl+Pop65PCT+Montreal, data = DataSansOutliers)
summary(Modele2_Final)

### multicollinearity?
vif(Modele2_Final)
vif(Modele2_Final) > 5

In [ ]:
## model 3: Zones dummies (Montréal left out as the reference)

Modele3 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT + 
    Laval + 
    CouronneNord + 
    CouronneSud, 
    data=RMR
)

### model 3 without outliers
nobs <- NROW(na.omit(residuals(Modele3)))    # number of observations used by the model
cook <- cooks.distance(Modele3)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]
Modele3_Final <- lm(FRPCT_ApIm ~ SqrtTxChom+MonoPct+Menage1+MinorVisib+EmplAtypiq+AucuneDipl+Pop65PCT+Laval+CouronneNord+CouronneSud, data = DataSansOutliers)
summary(Modele3_Final)

### multicollinearity?
vif(Modele3_Final)
vif(Modele3_Final) > 5

In [ ]:
## model 4: model 3 + distance variable + interaction variable

Modele4 <- lm(
    FRPCT_ApIm ~ 
    SqrtTxChom + 
    MonoPct + 
    Menage1 + 
    MinorVisib + 
    EmplAtypiq + 
    AucuneDipl + 
    Pop65PCT + 
    Laval + 
    CouronneNord + 
    CouronneSud + 
    DistCBD_KM + 
    Menag1_Dist, 
    data=RMR
)

### model 4 without outliers
nobs <- NROW(na.omit(residuals(Modele4)))    # number of observations used by the model
cook <- cooks.distance(Modele4)              # Cook's distance
DataSansOutliers <- cbind(RMR, cook)         # merge the two tables
DataSansOutliers <- DataSansOutliers[DataSansOutliers$cook < 8/nobs, ]
Modele4_Final <- lm(FRPCT_ApIm ~ SqrtTxChom+MonoPct+Menage1+MinorVisib+EmplAtypiq+AucuneDipl+Pop65PCT+Laval+CouronneNord+CouronneSud+DistCBD_KM+Menag1_Dist, data = DataSansOutliers)
summary(Modele4_Final)

### multicollinearity?
vif(Modele4_Final)
vif(Modele4_Final) > 5