In [1]:
version
(for windows) https://www.r-statistics.com/2015/06/a-step-by-step-screenshots-tutorial-for-upgrading-r-on-windows/
In [2]:
# Install (only when missing) and then attach the packages this notebook needs.
# requireNamespace() checks for availability without attaching; library() is
# used for attaching so that a failed install stops the notebook right here
# instead of surfacing later as a "could not find function" error.
for (pkg in c('data.table', 'ggplot2')) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = 'https://cloud.r-project.org/')
  }
  library(pkg, character.only = TRUE)
}
https://github.com/Rdatatable/data.table/wiki/Installation#openmp-enabled-compiler-for-mac
In [3]:
# shows the current working directory (wd)
getwd()
In [1]:
'C:\Users\me\github\data.csv'
In [2]:
'C:\\Users\\me\\github\\data.csv'
In [3]:
'C:/Users/me/github/data.csv'
In [4]:
# Directory that holds the data -- adapt this path for your system.
filepath <- 'C:/Users/ngeorge/Documents/GitHub/preprocess_lending_club_data/full_data/'
# Read the gzip-compressed CSV (this takes a while), treating empty fields as
# NA, then convert the resulting data.frame to a data.table.
accepted_def <- read.csv(
  gzfile(paste0(filepath, 'accepted_2007_to_2016.csv.gz')),
  na.strings = ''
)
acc_dt <- as.data.table(accepted_def)
In [5]:
# that's a lot of observations
dim(acc_dt)
In [6]:
# and a lot of columns
names(acc_dt)
In [7]:
# Show the structure of every column; list.len is raised to ncol(acc_dt) so
# that str() does not cut the listing short.
str(acc_dt, list.len=ncol(acc_dt))
In [8]:
# Histogram of the raw dti column. The outliers distort this plot.
hist(acc_dt[, dti])
In [9]:
# Adapted from: http://stackoverflow.com/questions/4787332/how-to-remove-outliers-from-a-dataset
#
# Mask outliers using the 1.5 * IQR rule.
#
# Arguments:
#   x     - numeric vector.
#   na.rm - passed to quantile(); with the default TRUE, NAs in x are ignored
#           when computing the quartiles (and remain NA in the result).
#   ...   - further arguments forwarded to quantile() (e.g. `type`).
#
# Returns a vector the same length as x in which every value below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR has been replaced by NA.
remove_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
  # Derive the IQR from the quartiles just computed instead of calling IQR():
  # IQR() never received `...`, so a non-default `type` used to produce fences
  # built from two different quantile definitions.
  H <- 1.5 * (qnt[2] - qnt[1])
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}
In [10]:
# Replace dti values outside the 1.5 * IQR fences with NA (see remove_outliers).
dti_no_outliers <- remove_outliers(acc_dt[, dti])
In [11]:
# Histogram of dti with the outliers masked out.
hist(dti_no_outliers)
A B C D E F G
https://stat.ethz.ch/R-manual/R-devel/library/base/html/sort.html
https://stat.ethz.ch/R-manual/R-devel/library/base/html/unique.html
In [12]:
#TODO: print the unique values for the grade column
| grade | Avg_Interest_Rate |
|---|---|
| A | 7.129947 |
| B | 10.626637 |
| C | 13.918715 |
| D | 17.502870 |
| E | 20.574477 |
| F | 24.230820 |
| G | 26.653138 |
http://www.statmethods.net/management/sorting.html
http://www.r-tutor.com/elementary-statistics/numerical-measures/mean
http://stackoverflow.com/questions/12353820/sort-rows-in-data-table
In [13]:
# TODO: print out average interest rates for each grade, and store the result as a data table
In [14]:
# TODO: sort by grade
In [15]:
# TODO: permanently change the order of the data table with average interest rates and print it
# hint: use the setkey() function (find it in the data table cheat sheet)
https://stat.ethz.ch/R-manual/R-devel/library/graphics/html/barplot.html
In [16]:
# TODO: show barplot of interest rates for each grade
https://www.r-bloggers.com/summarising-data-using-box-and-whisker-plots/
In [17]:
# TODO: make boxplot of the interest rate vs grade
http://www.statmethods.net/stats/anova.html
http://www.gardenersown.co.uk/education/lectures/r/anova.htm
In [18]:
# TODO: do ANOVA on interest rates and grades
In [19]:
# OPTIONAL TODO: plot the ANOVA fit
# Warning: this takes some time (at least a minute or two) and will produce many plots.
# we're looking for homoscedasticity here: https://en.wikipedia.org/wiki/Homoscedasticity
# if the residuals are very biased to one side or the other, the analysis may not be trustworthy
In [20]:
# TODO: print out summary of the ANOVA fit
http://www.gardenersown.co.uk/education/lectures/r/anova.htm
In [21]:
# TODO: perform a Tukey HSD test
# here we're looking at the p adj column, which is the p-value of the pairwise comparisons. If this is less than 0.05,
# we can say it's a statistically significant difference
In [ ]: