In [1]:
# Sanity check: confirm the kernel evaluates a simple expression
2 + 2
Out[1]:
In [2]:
# Pkg.add("")
In [3]:
using DataFrames
In [4]:
using Gadfly
In [5]:
using StatsBase
In [6]:
using HypothesisTests
In [7]:
using Distributions
In [8]:
#using Plotly
In [9]:
#include("plotly_credentials.jl")
In [8]:
# Read CCS.csv into the DataFrame `df`; the trailing semicolon suppresses the cell output
df = readtable("CCS.csv");
In [10]:
# Preview the first six rows of the DataFrame
first(df, 6)
Out[10]:
In [11]:
# Making sure there are no NA-values
# Looking at the data types
# showcols(df)
In [12]:
# Store the row and column counts of the DataFrame;
# nrows is reused as the loop bound by the recoding cells further down
(nrows, ncols) = size(df)
Out[12]:
In [13]:
# Look up a single entry: row 3, column 4
df[3, 4]
Out[13]:
In [14]:
# The same entry, addressing column 4 by its name :Var1
df[3, :Var1]
Out[14]:
In [15]:
# Select rows 3 to 5 with all columns
df[3:5, :]
Out[15]:
In [16]:
# Select rows 3 to 5, picking columns 2 and 4 by position
df[3:5, [2, 4]]
Out[16]:
In [17]:
# Select rows 3 to 5, picking columns :Cat1 and :Var1 by name
df[3:5, [:Cat1, :Var1]]
Out[17]:
In [18]:
# Select rows 2, 5 and 99 together with columns 2 through 4
df[[2, 5, 99], 2:4]
Out[18]:
In [19]:
# Recode the infection category stored in Cat1:
#   "A" -> "Minor infection"
#   "B" -> "Major infection"
# NA entries and any other codes are left untouched.
for row in 1:nrows
    code = df[row, :Cat1]
    isna(code) && continue  # skip missing entries before comparing
    if code == "A"
        df[row, :Cat1] = "Minor infection"
    elseif code == "B"
        df[row, :Cat1] = "Major infection"
    end
end
In [21]:
# Recode Cat2 into gender labels:
#   "C", "X", "R" -> "Female"
#   "L", "B", "F" -> "Male"
# NA entries and any other codes are left untouched.
for row in 1:nrows
    code = df[row, :Cat2]
    isna(code) && continue  # skip missing entries before comparing
    if code in ("C", "X", "R")
        df[row, :Cat2] = "Female"
    elseif code in ("L", "B", "F")
        df[row, :Cat2] = "Male"
    end
end
In [22]:
# Correcting the age: every value in Var1 is shifted down by 5.
# The dotted operator broadcasts the scalar over the whole column; plain
# `vector - scalar` is deprecated in Julia 0.6 and an error from 0.7 onwards.
df[:Var1] = df[:Var1] .- 5
Out[22]:
In [23]:
# Replace the generic column names with descriptive ones, one rename per pair
for (from, to) in ((:Cat1, :Infection),
                   (:Cat2, :Gender),
                   (:Var1, :Age),
                   (:Var2, :HbA1c),
                   (:Var3, :CRP))
    rename!(df, from, to)
end
df  # keep the renamed DataFrame as the value displayed by the cell
Out[23]:
In [24]:
# Number of rows in each infection category
# (these counts are used for the categorical data analysis)
groups = by(df, :Infection, d -> DataFrame(N = nrow(d)))
Out[24]:
In [25]:
# Number of rows for each gender
gender = by(df, :Gender, d -> DataFrame(N = nrow(d)))
Out[25]:
In [26]:
# Mean of the Age column
mean(df[:Age])
Out[26]:
In [27]:
# Median of the Age column
median(df[:Age])
Out[27]:
In [28]:
# Standard deviation of the Age column
std(df[:Age])
Out[28]:
In [29]:
# Summary statistics for the Age column
describe(df[:Age])
In [30]:
# Summary statistics for the HbA1c column
describe(df[:HbA1c])
In [31]:
# Summary statistics for the CRP column
describe(df[:CRP])
In [32]:
# Gadfly box plot: Age grouped by type of infection
plot(
    df,
    x = "Infection",
    y = "Age",
    Geom.boxplot,
    Guide.title("Age analysis by type of infection"),
    Guide.xlabel("Type of infection"),
    Guide.ylabel("Age")
)
Out[32]:
In [33]:
# Gadfly box plot: Age grouped by gender, drawn in orange
plot(
    df,
    x = "Gender",
    y = "Age",
    Geom.boxplot,
    Guide.title("Age analysis by gender"),
    Guide.xlabel("Gender"),
    Guide.ylabel("Age"),
    Theme(default_color = colorant"orange")
)
Out[33]:
In [34]:
# Gadfly density plot: Age, one curve per type of infection
plot(
    df,
    x = "Age",
    color = "Infection",
    Geom.density,
    Guide.title("Age distribution by type of infection"),
    Guide.xlabel("Age"),
    Guide.ylabel("Distribution")
)
Out[34]:
In [35]:
# Gadfly density plot: Age, one curve per gender
plot(
    df,
    x = "Age",
    color = "Gender",
    Geom.density,
    Guide.title("Age distribution by gender"),
    Guide.xlabel("Age"),
    Guide.ylabel("Distribution")
)
Out[35]:
In [36]:
# Gadfly box plot: HbA1c grouped by type of infection
plot(
    df,
    x = "Infection",
    y = "HbA1c",
    Geom.boxplot,
    Guide.title("HbA1c analysis by type of infection"),
    Guide.xlabel("Type of infection"),
    Guide.ylabel("HbA1c")
)
Out[36]:
In [37]:
# Gadfly density plot: HbA1c, one curve per type of infection
plot(
    df,
    x = "HbA1c",
    color = "Infection",
    Geom.density,
    Guide.title("HbA1c distribution by type of infection"),
    Guide.xlabel("HbA1c"),
    Guide.ylabel("Distribution")
)
Out[37]:
In [38]:
# Gadfly box plot: HbA1c grouped by gender, drawn in orange.
# The y-axis label names the plotted HbA1c column (not Age).
plot(
    df,
    x = "Gender",
    y = "HbA1c",
    Geom.boxplot,
    Guide.title("HbA1c analysis by gender"),
    Guide.xlabel("Gender"),
    Guide.ylabel("HbA1c"),
    Theme(default_color = colorant"orange")
)
Out[38]:
In [39]:
# Gadfly box plot: CRP grouped by type of infection
plot(
    df,
    x = "Infection",
    y = "CRP",
    Geom.boxplot,
    Guide.title("CRP analysis by type of infection"),
    Guide.xlabel("Type of infection"),
    Guide.ylabel("CRP")
)
Out[39]:
In [40]:
# Gadfly box plot: CRP grouped by gender, drawn in orange
plot(
    df,
    x = "Gender",
    y = "CRP",
    Geom.boxplot,
    Guide.title("CRP analysis by gender"),
    Guide.xlabel("Gender"),
    Guide.ylabel("CRP"),
    Theme(default_color = colorant"orange")
)
Out[40]:
In [41]:
# Split the data into one DataFrame per infection category and one per gender.
# Rows are selected with an element-wise comparison against the recoded labels;
# the trailing semicolon on the last line suppresses the cell output.
minor = df[df[:Infection] .== "Minor infection", :]
major = df[df[:Infection] .== "Major infection", :]
female = df[df[:Gender] .== "Female", :]
male = df[df[:Gender] .== "Male", :];
In [42]:
# Gender counts within the minor-infection group
by(minor, :Gender, d -> DataFrame(N = nrow(d)))
Out[42]:
In [43]:
# Gender counts within the major-infection group
by(major, :Gender, d -> DataFrame(N = nrow(d)))
Out[43]:
In [44]:
# Fisher's exact test on the 2x2 contingency table of infection type by gender,
# using FisherExactTest from the HypothesisTests package.
# The counts are entered by hand (presumably read off the two per-group
# gender counts above; verify against their output):
#                     Female   Male
# Minor infection       29      31
# Major infection       31      29
FisherExactTest(29, 31, 31, 29)
Out[44]:
In [45]:
# Checking distributions with the one-sample Kolmogorov-Smirnov test from the
# HypothesisTests package: each column is compared against a Normal distribution
# whose mean and standard deviation are computed from that same column.
# NOTE(review): the reference parameters are estimated from the sample under test;
# confirm this is acceptable for the reported p-values (a Lilliefors-type
# correction may be what is wanted here).
ExactOneSampleKSTest(df[:Age], Normal(mean(df[:Age]), std(df[:Age])))
Out[45]:
In [46]:
ExactOneSampleKSTest(df[:HbA1c], Normal(mean(df[:HbA1c]), std(df[:HbA1c])))
Out[46]:
In [47]:
ExactOneSampleKSTest(df[:CRP], Normal(mean(df[:CRP]), std(df[:CRP])))
Out[47]:
In [48]:
# Nonparametric two-group comparison (Mann-Whitney U test):
# Age in the minor-infection group versus the major-infection group
MannWhitneyUTest(minor[:Age], major[:Age])
Out[48]:
In [49]:
# Mann-Whitney U test: HbA1c, minor versus major infection
MannWhitneyUTest(minor[:HbA1c], major[:HbA1c])
Out[49]:
In [50]:
# Variance of HbA1c in each infection group, shown before the equal-variance t-test
var(minor[:HbA1c]), var(major[:HbA1c])
Out[50]:
In [51]:
# Parametric alternative: equal-variance t-test on HbA1c, minor versus major infection
EqualVarianceTTest(minor[:HbA1c], major[:HbA1c])
Out[51]:
In [52]:
# Descriptive statistics for HbA1c in the minor-infection group
describe(minor[:HbA1c])
In [53]:
# Descriptive statistics for HbA1c in the major-infection group
describe(major[:HbA1c])
In [54]:
# Mann-Whitney U test: CRP, minor versus major infection
MannWhitneyUTest(minor[:CRP], major[:CRP])
Out[54]:
In [55]:
# Mann-Whitney U test: Age, female versus male
MannWhitneyUTest(female[:Age], male[:Age])
Out[55]:
In [56]:
# Mann-Whitney U test: HbA1c, female versus male
MannWhitneyUTest(female[:HbA1c], male[:HbA1c])
Out[56]:
In [57]:
# Mann-Whitney U test: CRP, female versus male
MannWhitneyUTest(female[:CRP], male[:CRP])
Out[57]:
In [ ]: