In [84]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
In [85]:
# Read data/cars.csv into `df` and list its column names.
df <- read.csv("data/cars.csv")
colnames(df)
Out[85]:
In [86]:
# Preview the first three rows.
head(df, 3)
Out[86]:
In [87]:
# Count the distinct values in the `name` column.
length(unique(df$name))
Out[87]:
In [88]:
# Carry forward only the listed subset of the raw columns into `cleaned_df`.
cleaned_df <- df %>%
  select(
    name, model, price, type,
    Engine.Size..cc., Fuel.Type, Gross.Weight..kg., Ground.Clearance..mm.,
    Max.Power..bhp...rpm., Mileage.in.City..kmpl., Mileage.on.Highway..kmpl.,
    No..of.Cylinders, Seat.Capacity, Transmission.Type, brand
  )
In [89]:
# Give every column a short snake_case name; the order matches the column
# order of `cleaned_df`.
names(cleaned_df) <- c(
  "name", "model", "price", "type", "engine", "fuel", "weight", "clearance",
  "max_power_bhp_rpm", "mileage_city", "mileage_highway", "cylinders",
  "seats", "transmission_type", "brand"
)
In [90]:
# Show the current column names of `cleaned_df`.
colnames(cleaned_df)
Out[90]:
In [91]:
# Preview the first three rows.
head(cleaned_df, 3)
Out[91]:
In [92]:
# List the distinct raw transmission descriptions.
unique(cleaned_df$transmission_type)
Out[92]:
In [93]:
# Return the first space-delimited token of `row`.
# Yields NA when `row` is NA or splits into no tokens (e.g. an empty string).
getNumGear <- function(row) {
  tokens <- unlist(strsplit(row, split = " "))
  tokens[1]
}
In [94]:
# Copy the transmission descriptions into a one-column data frame.
# The column is addressed by position (`transmission[, 1]`) in the lines below.
transmission <- data.frame(as.character(cleaned_df[,"transmission_type"]))
head(transmission)
Out[94]:
In [95]:
# Force the column to character; data.frame() may have stored it as a factor,
# depending on the R version / stringsAsFactors setting.
transmission[,1] <- as.character(transmission[,1])
In [96]:
# Check the class of the column.
class(transmission[,1])
Out[96]:
In [97]:
# Look at the first value.
transmission[1,1]
Out[97]:
In [98]:
# Strip leading/trailing whitespace from the transmission descriptions.
# trimws() is vectorised, so it is applied to the column directly instead of
# going through apply(), which would first coerce the data frame to a matrix.
transmission[, 1] <- trimws(transmission[, 1])
In [99]:
head(transmission)
Out[99]:
In [100]:
# Gear count = first word of each transmission description (one value per row).
gearNumbers <- apply(transmission, 1, getNumGear)
In [101]:
# Rows that produced no first word (NA) are given 4 gears.
gearNumbers <- replace(gearNumbers, is.na(gearNumbers), 4)
In [102]:
table(gearNumbers)
Out[102]:
In [103]:
# Entries whose first word is "Automatic" carry no gear count; use 5.
gearNumbers <- replace(gearNumbers, gearNumbers == "Automatic", 5)
In [104]:
table(gearNumbers)
Out[104]:
In [105]:
# Same treatment for entries whose first word is "Manual".
gearNumbers <- replace(gearNumbers, gearNumbers == "Manual", 5)
In [106]:
table(gearNumbers)
Out[106]:
In [107]:
# The vector is still character at this point; convert it to numeric.
gearNumbers <- as.numeric(gearNumbers)
In [108]:
class(gearNumbers)
Out[108]:
In [109]:
table(gearNumbers)
Out[109]:
In [110]:
# Store the result as the new `gears` column.
cleaned_df[["gears"]] <- gearNumbers
In [111]:
head(cleaned_df)
Out[111]:
In [112]:
# Classify a single transmission description:
#   NA or exactly "Manual" -> "Manual"
#   exactly "Automatic"    -> "Automatic"
#   anything else          -> its third space-delimited word
#                             (NA when there are fewer than three words)
getTransmissionType <- function(row) {
  if (is.na(row) || row == "Manual") {
    "Manual"
  } else if (row == "Automatic") {
    "Automatic"
  } else {
    words <- unlist(strsplit(row, split = " "))
    words[3]
  }
}
In [113]:
# Quick check of the NA branch of getTransmissionType().
getTransmissionType(NA)
Out[113]:
In [114]:
# The rebuild of `transmission` is commented out; the existing object is inspected as is.
#transmission <- data.frame(as.character(cleaned_df[,"transmission_type"]))
head(transmission)
Out[114]:
In [115]:
# Distinct rows of `transmission`.
unique(transmission)
Out[115]:
In [116]:
# Classify every row of `transmission` with getTransmissionType().
transmissionType <- apply(transmission, 1, getTransmissionType)
In [117]:
head(transmissionType, n = 6)
Out[117]:
In [118]:
table(transmissionType)
Out[118]:
In [119]:
# Store the classification as the new `transmission` column.
cleaned_df[["transmission"]] <- transmissionType
head(cleaned_df, n = 6)
Out[119]:
In [120]:
# Return the text that precedes the first "@" in `row`, with surrounding
# whitespace removed. NA input gives NA.
getMaxBHP <- function(row) {
  parts <- unlist(strsplit(row, split = "@"))
  trimws(parts[1])
}
In [121]:
# Split `max_power_bhp_rpm` at "@" into two new columns, `bhp` and `rpm`.
cleaned_df <- separate(
  cleaned_df,
  col = max_power_bhp_rpm,
  into = c("bhp", "rpm"),
  sep = "@"
)
In [122]:
head(cleaned_df, n = 3)
Out[122]:
In [123]:
# Frequency of each fuel value.
table(cleaned_df$fuel)
Out[123]:
In [124]:
# Frequency of each ground-clearance value.
table(cleaned_df$clearance)
Out[124]:
In [125]:
# Frequency of each body-type label.
table(cleaned_df$type)
Out[125]:
In [126]:
# Collapse the misspelled sedan labels into "Sedan".
cleaned_df$type[cleaned_df$type %in% c("Seadan", "Sedna", "Sedans", "Seadn")] <- "Sedan"
table(cleaned_df$type)
Out[126]:
In [127]:
# Collapse the hatchback spelling/case variants into "Hatchback".
cleaned_df$type[cleaned_df$type %in% c("Hatchback", "Hatchabck", "hatchback")] <- "Hatchback"
table(cleaned_df$type)
Out[127]:
In [128]:
# Display only: the trimmed labels are not written back to the column.
unique(trimws(cleaned_df$type))
Out[128]:
In [129]:
# Normalise the upper-case "VAN" label.
cleaned_df$type[cleaned_df$type %in% "VAN"] <- "Van"
In [130]:
unique(cleaned_df$type)
Out[130]:
In [131]:
# Models whose body type is missing.
cleaned_df$model[is.na(cleaned_df$type)]
Out[131]:
In [132]:
# Distinct body-type labels.
unique(cleaned_df$type)
Out[132]:
In [133]:
# Frequency of each engine value.
table(cleaned_df$engine)
Out[133]:
In [134]:
# Frequency of each cylinders value.
table(cleaned_df$cylinders)
Out[134]:
In [135]:
# Column types and a sample of values for the whole data frame.
str(cleaned_df)
In [136]:
# Work on the cylinders column as character so labels can be compared and replaced.
cleaned_df[["cylinders"]] <- as.character(cleaned_df[["cylinders"]])
In [137]:
unique(cleaned_df$cylinders)
Out[137]:
In [138]:
# Rows where the column holds " Yes" instead of a count.
cleaned_df %>% filter(cylinders == " Yes")
Out[138]:
In [139]:
# Replace the " Yes" entries with " 4".
cleaned_df$cylinders[cleaned_df$cylinders %in% " Yes"] <- " 4"
In [140]:
table(cleaned_df$cylinders)
Out[140]:
In [141]:
# Frequency of each city-mileage value.
table(cleaned_df$mileage_city)
Out[141]:
In [142]:
# Frequency of each highway-mileage value.
table(cleaned_df$mileage_highway)
Out[142]:
In [143]:
# Frequency of each seats value.
table(cleaned_df$seats)
Out[143]:
In [144]:
# Work on the seats column as character so labels can be compared and replaced.
cleaned_df[["seats"]] <- as.character(cleaned_df[["seats"]])
In [145]:
# Rows where the column holds " Yes" instead of a count.
cleaned_df %>% filter(seats == " Yes")
Out[145]:
In [146]:
# Rows where the column holds " No" instead of a count.
cleaned_df %>% filter(seats == " No")
Out[146]:
In [147]:
# Both " Yes" and " No" entries are replaced with " 5".
cleaned_df$seats[cleaned_df$seats %in% c(" Yes", " No")] <- " 5"
In [148]:
table(cleaned_df$seats)
Out[148]:
In [149]:
# Frequency of each brand.
table(cleaned_df$brand)
Out[149]:
Refine
In [150]:
head(cleaned_df)
Out[150]:
In [151]:
# Group the rows by car name so that one row can be picked per name.
by_name <- group_by(cleaned_df, name)
In [152]:
# Keep, for every name, the row with the lowest price (which.min() returns the
# first position on ties), then drop the grouping so that `filtered_df` is a
# plain, ungrouped table for the steps that follow.
# NOTE(review): which.min() needs `price` to be numeric - confirm the column type.
filtered_df <- ungroup(slice(by_name, which.min(price)))
In [153]:
head(filtered_df)
Out[153]:
Mileage: fill NA values with the column mean
In [154]:
# Convert both mileage columns to numeric. Going through as.character() first
# makes a factor column yield its labels rather than its integer codes; values
# that cannot be parsed become NA.
filtered_df[["mileage_city"]] <- as.numeric(as.character(filtered_df[["mileage_city"]]))
filtered_df[["mileage_highway"]] <- as.numeric(as.character(filtered_df[["mileage_highway"]]))
In [155]:
table(filtered_df$mileage_city)
Out[155]:
In [156]:
# Replace missing mileage values with the mean of the non-missing values of
# the same column.
filtered_df$mileage_city[is.na(filtered_df$mileage_city)] <-
  mean(filtered_df$mileage_city, na.rm = TRUE)
filtered_df$mileage_highway[is.na(filtered_df$mileage_highway)] <-
  mean(filtered_df$mileage_highway, na.rm = TRUE)
In [157]:
# Column means after the fill.
print(mean(filtered_df$mileage_city))
print(mean(filtered_df$mileage_highway))
In [158]:
table(filtered_df$mileage_city)
Out[158]:
In [159]:
# Distinct engine values.
unique(filtered_df$engine)
Out[159]:
In [160]:
# Models whose engine value is missing.
filtered_df$model[is.na(filtered_df$engine)]
Out[160]:
In [161]:
# Frequency of each engine value.
table(filtered_df$engine)
Out[161]:
In [162]:
# Convert the engine column to numeric first, then fill the gaps. Filling
# before converting can fail on a factor column (1198 need not be an existing
# level) and would miss values that only turn into NA during the conversion.
# This is the same convert-then-fill order used for the mileage columns.
filtered_df$engine <- as.numeric(as.character(filtered_df$engine))
In [163]:
# Missing engine sizes are set to 1198.
filtered_df$engine[is.na(filtered_df$engine)] <- 1198
In [186]:
# Encode the fuel label as a number: " Petrol" -> 0, " Diesel" -> 1, " CNG" -> 2.
# Any other value, including NA, has no entry in the lookup and ends up as 0.
filtered_df$fuel_type <- unname(
  c(" Petrol" = 0, " Diesel" = 1, " CNG" = 2)[as.character(filtered_df$fuel)]
)
filtered_df$fuel_type[is.na(filtered_df$fuel_type)] <- 0
In [187]:
unique(filtered_df$fuel_type)
Out[187]:
In [188]:
# Write the cleaned table to data/cars.tidy.csv without a row-name column.
write.csv(
  x = filtered_df,
  file = "data/cars.tidy.csv",
  row.names = FALSE
)
In [ ]: