In [84]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)

In [85]:
df <- read.csv("data/cars.csv")
colnames(df)


Out[85]:
  1. 'name'
  2. 'model'
  3. 'url'
  4. 'price'
  5. 'type'
  6. 'ABS'
  7. 'Acceleration..0.100.kmph.'
  8. 'Air.Conditioner'
  9. 'Audio.Controls.on.Streeing.Wheel'
  10. 'Audio.System..with.remote.'
  11. 'Bluetooth.Connectivity'
  12. 'Body.Coloured.Bumpers'
  13. 'Boot.Space..litres.'
  14. 'Brakes.Front'
  15. 'Brakes.Rear'
  16. 'CD.Player'
  17. 'Central.Locking'
  18. 'Clean.Air.Filter'
  19. 'Cruise.Control'
  20. 'Door.Ajar.Warning'
  21. 'Driver.Seatbelt.Warning'
  22. 'Dual.SRS.Airbags..D.P.'
  23. 'EBD'
  24. 'Engine.Size..cc.'
  25. 'Engine.Type'
  26. 'Front.Track'
  27. 'Front.Tyres'
  28. 'Fuel.Supply.System'
  29. 'Fuel.Tank.Capacity..litres.'
  30. 'Fuel.Type'
  31. 'Full.Wheel.Caps'
  32. 'Gross.Weight..kg.'
  33. 'Ground.Clearance..mm.'
  34. 'Heater'
  35. 'Immobiliser'
  36. 'Kerb.Weight..kg.'
  37. 'Keyless.Entry'
  38. 'Max.Power..bhp...rpm.'
  39. 'Max.Torque..Nm...rpm.'
  40. 'Mileage.in.City..kmpl.'
  41. 'Mileage.on.Highway..kmpl.'
  42. 'No..of.Cylinders'
  43. 'Overall.Length.mm..x.Width.mm..x.Height.mm.'
  44. 'Panoramic.Sunroof'
  45. 'Passenger.Airbags'
  46. 'Power.Steering'
  47. 'Power.Windows..Front.'
  48. 'Power.Windows..Rear.'
  49. 'Rain.Sensing.Wipers'
  50. 'Rear.Track'
  51. 'Rear.Tyres'
  52. 'Seat.Capacity'
  53. 'Speakers'
  54. 'Tilt.Function'
  55. 'Top.Speed..kmph.'
  56. 'Traction.Control'
  57. 'Transmission.Type'
  58. 'Tubeless.Tyres'
  59. 'Turning.Circle.Radius..metres.'
  60. 'USB...Auxiliary.Input'
  61. 'Wheelbase..mm.'
  62. 'brand'

In [86]:
head(df, 3)


Out[86]:
namemodelurlpricetypeABSAcceleration..0.100.kmph.Air.ConditionerAudio.Controls.on.Streeing.WheelAudio.System..with.remote.Bluetooth.ConnectivityBody.Coloured.BumpersBoot.Space..litres.Brakes.FrontBrakes.RearCD.PlayerCentral.LockingClean.Air.FilterCruise.ControlDoor.Ajar.WarningDriver.Seatbelt.WarningDual.SRS.Airbags..D.P.EBDEngine.Size..cc.Engine.TypeFront.TrackFront.TyresFuel.Supply.SystemFuel.Tank.Capacity..litres.Fuel.TypeFull.Wheel.CapsGross.Weight..kg.Ground.Clearance..mm.HeaterImmobiliserKerb.Weight..kg.Keyless.EntryMax.Power..bhp...rpm.Max.Torque..Nm...rpm.Mileage.in.City..kmpl.Mileage.on.Highway..kmpl.No..of.CylindersOverall.Length.mm..x.Width.mm..x.Height.mm.Panoramic.SunroofPassenger.AirbagsPower.SteeringPower.Windows..Front.Power.Windows..Rear.Rain.Sensing.WipersRear.TrackRear.TyresSeat.CapacitySpeakersTilt.FunctionTop.Speed..kmph.Traction.ControlTransmission.TypeTubeless.TyresTurning.Circle.Radius..metres.USB...Auxiliary.InputWheelbase..mm.brand
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)http://carzoom.in/car-specification/ashok-leyland-stile-le-8-str-diesel/749990MPV No 18.7 Manual No No No Yes 500 Disc Drum No No Yes No No No No No1461 Turbo-intercooled, Common Rail Diesel EngineNA 165 R14 CRDi 50 Diesel Yes 2000180 Yes Yes 1426 No 75 @ 3300 185 @ 1750-2750 16.2 20.7 4 4400 x 1700 x 1860 No No Yes No No NoNA 165 R14 8 No Yes 140 No 5 Speed Manual Yes5.2 No 2725Ashok
2Ashok Leyland StileAshok Leyland Stile LS 8-STR (Diesel)http://carzoom.in/car-specification/ashok-leyland-stile-ls-8-str-diesel/799990MPV No 18.7 Manual No No No Yes 500 Disc Drum No Yes No No No No No No1461 Turbo-intercooled, Common Rail Diesel EngineNA 165 R14 CRDi 50 Diesel No 2000180 Yes Yes 1426 No 75 @ 3300 185 @ 1750-2750 16.2 20.7 4 4400 x 1700 x 1860 No No Yes Yes No NoNA 165 R14 8 No Yes 140 No 5 Speed Manual Yes5.2 No 2725Ashok
3Ashok Leyland StileAshok Leyland Stile LX 8-STR (Diesel)http://carzoom.in/car-specification/ashok-leyland-stile-lx-8-str-diesel/829990MPV No 18.7 Manual No No No Yes 500 Disc Drum No Yes Yes No No No No No1461 Turbo-intercooled, Common Rail Diesel EngineNA 165 R14 CRDi 50 Diesel No 2000180 Yes Yes 1426 No 75 @ 3300 185 @ 1750-2750 16.2 20.7 4 4400 x 1700 x 1860 No No Yes Yes No NoNA 165 R14 8 No Yes 140 No 5 Speed Manual Yes5.2 No 2725Ashok

In [87]:
length(unique(df$name))


Out[87]:
203

Rows to Consider

consider the base model for each vehicle

Columns to consider

  • name
  • price
  • type
  • Engine.Size..cc.
  • Fuel.Type
  • Gross.Weight..kg.
  • Ground.Clearance..mm.
  • Max.Power..bhp...rpm.
  • Mileage.in.City..kmpl.
  • Mileage.on.Highway..kmpl.
  • No..of.Cylinders
  • Seat.Capacity
  • Transmission.Type
  • brand

In [88]:
cleaned_df <- select(df, name, model, price, type, Engine.Size..cc., Fuel.Type, Gross.Weight..kg.,
       Ground.Clearance..mm., Max.Power..bhp...rpm., Mileage.in.City..kmpl., Mileage.on.Highway..kmpl.,
      No..of.Cylinders, Seat.Capacity, Transmission.Type, brand)

In [89]:
colnames(cleaned_df) <- c("name", "model", "price", "type", "engine", "fuel", "weight",
                         "clearance", "max_power_bhp_rpm", "mileage_city", "mileage_highway",
                         "cylinders", "seats", "transmission_type", "brand")

In [90]:
colnames(cleaned_df)


Out[90]:
  1. 'name'
  2. 'model'
  3. 'price'
  4. 'type'
  5. 'engine'
  6. 'fuel'
  7. 'weight'
  8. 'clearance'
  9. 'max_power_bhp_rpm'
  10. 'mileage_city'
  11. 'mileage_highway'
  12. 'cylinders'
  13. 'seats'
  14. 'transmission_type'
  15. 'brand'

In [91]:
head(cleaned_df, 3)


Out[91]:
namemodelpricetypeenginefuelweightclearancemax_power_bhp_rpmmileage_citymileage_highwaycylindersseatstransmission_typebrand
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)749990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok
2Ashok Leyland StileAshok Leyland Stile LS 8-STR (Diesel)799990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok
3Ashok Leyland StileAshok Leyland Stile LX 8-STR (Diesel)829990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok

In [92]:
unique(cleaned_df$transmission_type)


Out[92]:
  1. 5 Speed Manual
  2. 6 Speed Automatic
  3. 6 Speed Manual
  4. 7 Speed Automatic
  5. Automatic
  6. 8 Speed Automatic
  7. 7 Speed
  8. NA
  9. 5 Speed Automatic
  10. Manual
  11. 4 Speed Automatic
  12. 4 Speed Manual

In [93]:
getNumGear <- function(row) {
    return(unlist(strsplit(row, split = " "))[1])
}

In [94]:
transmission <- data.frame(as.character(cleaned_df[,"transmission_type"]))
head(transmission)


Out[94]:
as.character.cleaned_df....transmission_type...
1 5 Speed Manual
2 5 Speed Manual
3 5 Speed Manual
4 5 Speed Manual
5 5 Speed Manual
6 5 Speed Manual

In [95]:
transmission[,1] <- as.character(transmission[,1])

In [96]:
class(transmission[,1])


Out[96]:
'character'

In [97]:
transmission[1,1]


Out[97]:
' 5 Speed Manual'

In [98]:
transmission[,1] <- apply(transmission, 2, function(x) trimws(x))

In [99]:
head(transmission)


Out[99]:
as.character.cleaned_df....transmission_type...
15 Speed Manual
25 Speed Manual
35 Speed Manual
45 Speed Manual
55 Speed Manual
65 Speed Manual

In [100]:
gearNumbers <- apply(transmission, 1, function(x) getNumGear(x))

In [101]:
gearNumbers[is.na(gearNumbers)] <- 4

In [102]:
table(gearNumbers)


Out[102]:
gearNumbers
        4         5         6         7         8 Automatic    Manual 
       27       513       141        74        72         5         1 

In [103]:
gearNumbers[gearNumbers=="Automatic"] = 5

In [104]:
table(gearNumbers)


Out[104]:
gearNumbers
     4      5      6      7      8 Manual 
    27    518    141     74     72      1 

In [105]:
gearNumbers[gearNumbers=="Manual"] = 5

In [106]:
table(gearNumbers)


Out[106]:
gearNumbers
  4   5   6   7   8 
 27 519 141  74  72 

In [107]:
gearNumbers <- as.numeric(gearNumbers)

In [108]:
class(gearNumbers)


Out[108]:
'numeric'

In [109]:
table(gearNumbers)


Out[109]:
gearNumbers
  4   5   6   7   8 
 27 519 141  74  72 

In [110]:
cleaned_df$gears <- gearNumbers

In [111]:
head(cleaned_df)


Out[111]:
namemodelpricetypeenginefuelweightclearancemax_power_bhp_rpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgears
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)749990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok5
2Ashok Leyland StileAshok Leyland Stile LS 8-STR (Diesel)799990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok5
3Ashok Leyland StileAshok Leyland Stile LX 8-STR (Diesel)829990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok5
4Ashok Leyland StileAshok Leyland Stile LS 7-STR (Diesel)849990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 7 5 Speed ManualAshok5
5Ashok Leyland StileAshok Leyland Stile LS 7-STR Alloy (Diesel)879990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 7 5 Speed ManualAshok5
6Ashok Leyland StileAshok Leyland Stile LX 7-STR (Diesel)899990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 7 5 Speed ManualAshok5

In [112]:
getTransmissionType <- function(row) {
    if (is.na(row) || row == "Manual") {
        return("Manual")
    }
    
    if (row == "Automatic") {
        return("Automatic")
    }
    
    return(unlist(strsplit(row, split = " "))[3])
        
}

In [113]:
getTransmissionType(NA)


Out[113]:
'Manual'

In [114]:
#transmission <- data.frame(as.character(cleaned_df[,"transmission_type"]))
head(transmission)


Out[114]:
as.character.cleaned_df....transmission_type...
15 Speed Manual
25 Speed Manual
35 Speed Manual
45 Speed Manual
55 Speed Manual
65 Speed Manual

In [115]:
unique(transmission)


Out[115]:
as.character.cleaned_df....transmission_type...
15 Speed Manual
86 Speed Automatic
96 Speed Manual
127 Speed Automatic
22Automatic
268 Speed Automatic
367 Speed
67NA
1915 Speed Automatic
194Manual
2634 Speed Automatic
4364 Speed Manual

In [116]:
transmissionType <- apply(transmission, 1, function(x) getTransmissionType(x))

In [117]:
head(transmissionType)


Out[117]:
  1. 'Manual'
  2. 'Manual'
  3. 'Manual'
  4. 'Manual'
  5. 'Manual'
  6. 'Manual'

In [118]:
table(transmissionType)


Out[118]:
transmissionType
Automatic    Manual 
      245       587 

In [119]:
cleaned_df$transmission <- transmissionType
head(cleaned_df)


Out[119]:
namemodelpricetypeenginefuelweightclearancemax_power_bhp_rpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)749990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
2Ashok Leyland StileAshok Leyland Stile LS 8-STR (Diesel)799990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
3Ashok Leyland StileAshok Leyland Stile LX 8-STR (Diesel)829990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
4Ashok Leyland StileAshok Leyland Stile LS 7-STR (Diesel)849990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 7 5 Speed ManualAshok5Manual
5Ashok Leyland StileAshok Leyland Stile LS 7-STR Alloy (Diesel)879990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 7 5 Speed ManualAshok5Manual
6Ashok Leyland StileAshok Leyland Stile LX 7-STR (Diesel)899990MPV1461 Diesel 2000180 75 @ 3300 16.2 20.7 4 7 5 Speed ManualAshok5Manual

In [120]:
getMaxBHP <- function(row) {
    return(trimws(unlist(strsplit(row, split = "@"))[1]))
}

In [121]:
cleaned_df <- cleaned_df %>% 
                separate(max_power_bhp_rpm, c("bhp", "rpm"), sep = "@")


Warning message:
: Too few values at 1 locations: 454

In [122]:
head(cleaned_df, 3)


Out[122]:
namemodelpricetypeenginefuelweightclearancebhprpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)749990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
2Ashok Leyland StileAshok Leyland Stile LS 8-STR (Diesel)799990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
3Ashok Leyland StileAshok Leyland Stile LX 8-STR (Diesel)829990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual

In [123]:
table(cleaned_df$fuel)


Out[123]:
    CNG  Diesel     LPG  Petrol 
     13     441       3     367 

In [124]:
table(cleaned_df$clearance)


Out[124]:
  100   109   110   114   117   118   120   123   124   125   128   131   134 
   10     3     1     1     1     1     9     1     2     2     6     5     2 
  135   136   140   142   145   146   147   150   151   152   154   155   157 
   12     4     4     8    11     2     1    11     2     1    13    12     1 
  158   159   160   161   163   165   167   168   170   172   174   175   176 
    4    11    29    19    10   147     3    42    93     9    19    27     1 
  177   179   180   181   183   184   185   186   187   189   190   195   200 
    1     3    47     3     3     9    24     9     2    10    14     2    29 
  201 204.8   205   206   209   210   212   215   220   223   227   238 239.8 
    2     1    34     3     1    13     4     6     5     2     5     2     1 
  240 295.5  2501 
    1     6     2 

In [125]:
table(cleaned_df$type)


Out[125]:
   Cabriolet  Convertible        Coupe    Hatchabck    hatchback    Hatchback 
           1            4           33            1            1          213 
    Hatchback‎       Luxury Luxury Sedan          MPV          MUV       Saloon 
           7            2            1           47           32            1 
Saloon Sedan       Seadan        Seadn        Sedan       Sedans        Sedna 
           1            1            1          286            1            1 
 Sport Coupe     Supercar          SUV          Van          VAN 
           1            4          188            1            4 

In [126]:
cleaned_df$type[cleaned_df$type == "Seadan" ] = "Sedan"
cleaned_df$type[cleaned_df$type == "Sedna" | cleaned_df$type == "Sedans" | cleaned_df$type == "Seadn"] = "Sedan"
table(cleaned_df$type)


Out[126]:
   Cabriolet  Convertible        Coupe    Hatchabck    hatchback    Hatchback 
           1            4           33            1            1          213 
    Hatchback‎       Luxury Luxury Sedan          MPV          MUV       Saloon 
           7            2            1           47           32            1 
Saloon Sedan       Seadan        Seadn        Sedan       Sedans        Sedna 
           1            0            0          290            0            0 
 Sport Coupe     Supercar          SUV          Van          VAN 
           1            4          188            1            4 

In [127]:
cleaned_df$type[cleaned_df$type == "Hatchback" | cleaned_df$type == "Hatchabck" | 
                cleaned_df$type == "hatchback" | cleaned_df$type == "Hatchback"] = "Hatchback"
table(cleaned_df$type)


Out[127]:
   Cabriolet  Convertible        Coupe    Hatchabck    hatchback    Hatchback 
           1            4           33            0            0          215 
    Hatchback‎       Luxury Luxury Sedan          MPV          MUV       Saloon 
           7            2            1           47           32            1 
Saloon Sedan       Seadan        Seadn        Sedan       Sedans        Sedna 
           1            0            0          290            0            0 
 Sport Coupe     Supercar          SUV          Van          VAN 
           1            4          188            1            4 

In [128]:
unique(trimws(cleaned_df$type))


Out[128]:
  1. 'MPV'
  2. 'Sedan'
  3. 'Coupe'
  4. 'Cabriolet'
  5. 'SUV'
  6. 'Luxury'
  7. 'Luxury Sedan'
  8. 'Convertible'
  9. 'Hatchback‎'
  10. 'Hatchback'
  11. 'MUV'
  12. 'Sport Coupe'
  13. 'Supercar'
  14. 'Saloon'
  15. 'Saloon Sedan'
  16. 'NA'
  17. 'VAN'
  18. 'Van'

In [129]:
cleaned_df$type[cleaned_df$type == "VAN" ] = "Van"

In [130]:
unique(cleaned_df$type)


Out[130]:
  1. MPV
  2. Sedan
  3. Coupe
  4. Cabriolet
  5. SUV
  6. Luxury
  7. Luxury Sedan
  8. Convertible
  9. Hatchback‎
  10. Hatchback
  11. MUV
  12. Sport Coupe
  13. Supercar
  14. Saloon
  15. Saloon Sedan
  16. NA
  17. Van

In [131]:
cleaned_df$model[is.na(cleaned_df$type)]


Out[131]:
Porsche Boxster GTS

In [132]:
unique(cleaned_df$type)


Out[132]:
  1. MPV
  2. Sedan
  3. Coupe
  4. Cabriolet
  5. SUV
  6. Luxury
  7. Luxury Sedan
  8. Convertible
  9. Hatchback‎
  10. Hatchback
  11. MUV
  12. Sport Coupe
  13. Supercar
  14. Saloon
  15. Saloon Sedan
  16. NA
  17. Van

In [133]:
table(cleaned_df$engine)


Out[133]:
   3  624  793  796  814  936  995  998  999 1086 1120 1149 1172 1193 1196 1197 
   3    8    4   13    5    4    3   24    2    3    9    1    2   13    6   39 
1198 1199 1248 1298 1364 1368 1373 1390 1395 1396 1399 1405 1448 1461 1493 1496 
  38   17   59    2   15   10   10    2    2   17   11    7    1   70    4    8 
1497 1498 1499 1582 1586 1591 1595 1596 1598 1797 1798 1948 1968 1969 1984 1991 
   6   43    5    8    4    4    2    4   25    3   12    2   26    4    8    7 
1995 1997 1998 1999 2143 2149 2179 2199 2200 2231 2354 2359 2393 2400 2477 2489 
  17    3    4    2    9    2   52    3    1    2    1    2    4    2    2    4 
2496 2498 2499 2523 2650 2696 2953 2956 2967 2979 2982 2987 2993 2995 2996 2997 
   2    1   10    6    1    3    2    8    7    5    6    3   13    5    1    1 
2999 3436 3496 3498 3597 3604 3800 3993 4134 4163 4297 4367 4395 4499 4663 4735 
   1    5    1    2    1    1    1    3    3    4    1    3    4    1    1    3 
4806 4999 5000 5204 5461 5935 5972 5998 5999 6208 6262 6299 6592 
   1    2   10    4    4    4    1    4    1    1    1    1    2 

In [134]:
table(cleaned_df$cylinders)


Out[134]:
  10   12    2    3    4    5    6    8  Yes 
   4    8   13   86  609   10   44   41    1 

In [135]:
str(cleaned_df)


'data.frame':	833 obs. of  18 variables:
 $ name             : Factor w/ 203 levels "Ashok Leyland Stile",..: 1 1 1 1 1 1 1 2 3 4 ...
 $ model            : Factor w/ 832 levels "Ashok Leyland Stile LE 8-STR (Diesel)",..: 1 4 7 3 2 6 5 8 9 10 ...
 $ price            : num  749990 799990 829990 849990 879990 ...
 $ type             : Factor w/ 23 levels "Cabriolet","Convertible",..: 10 10 10 10 10 10 10 16 16 3 ...
 $ engine           : int  1461 1461 1461 1461 1461 1461 1461 5935 5935 5935 ...
 $ fuel             : Factor w/ 4 levels " CNG"," Diesel",..: 2 2 2 2 2 2 2 4 4 4 ...
 $ weight           : Factor w/ 134 levels " 1185"," 1210",..: 68 68 68 68 68 68 68 NA 67 NA ...
 $ clearance        : num  180 180 180 180 180 180 180 120 110 120 ...
 $ bhp              : chr  " 75 " " 75 " " 75 " " 75 " ...
 $ rpm              : chr  " 3300" " 3300" " 3300" " 3300" ...
 $ mileage_city     : Factor w/ 161 levels " 10"," 10.0",..: 67 67 67 67 67 67 67 NA 122 134 ...
 $ mileage_highway  : Factor w/ 285 levels " 10.0"," 10.1",..: 198 198 198 198 198 198 198 262 26 278 ...
 $ cylinders        : Factor w/ 9 levels " 10"," 12"," 2",..: 5 5 5 5 5 5 NA 5 2 2 ...
 $ seats            : Factor w/ 13 levels " 10"," 2"," 4",..: 9 9 9 6 6 6 6 3 3 2 ...
 $ transmission_type: Factor w/ 11 levels " 4 Speed Automatic",..: 4 4 4 4 4 4 4 5 6 6 ...
 $ brand            : Factor w/ 38 levels "Ashok","Aston",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gears            : num  5 5 5 5 5 5 5 6 6 6 ...
 $ transmission     : chr  "Manual" "Manual" "Manual" "Manual" ...

In [136]:
cleaned_df$cylinders <- as.character(cleaned_df$cylinders)

In [137]:
unique(cleaned_df$cylinders)


Out[137]:
  1. ' 4'
  2. 'NA'
  3. ' 12'
  4. ' 8'
  5. ' 6'
  6. ' 10'
  7. ' 3'
  8. ' 2'
  9. ' 5'
  10. ' Yes'

In [138]:
filter(cleaned_df, cylinders == " Yes")


Out[138]:
namemodelpricetypeenginefuelweightclearancebhprpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1Tata ZestTata Zest XM (Petrol)544315Hatchback1193 PetrolNA175 88.8 5000 13.2 17.6 Yes 5 5 Speed ManualTata5Manual

In [139]:
cleaned_df$cylinders[cleaned_df$cylinders == " Yes"] = " 4"

In [140]:
table(cleaned_df$cylinders)


Out[140]:
 10  12   2   3   4   5   6   8 
  4   8  13  86 610  10  44  41 

In [141]:
table(cleaned_df$mileage_city)


Out[141]:
    10   10.0   10.1   10.2  10.34   10.5   10.6   10.7   10.8   10.9   11.0 
     1      2      1      2      1      2      1      2      4      2      3 
  11.2   11.3  11.38   11.4   11.5   11.8   11.9     12    1.2   12.0  12.02 
     2      4      8      3      1      2      2      4      1     20      3 
 12.08   12.2   12.3  12.36   12.4  12.82     13   13.0   13.1   13.2   13.3 
     3      1      5      1      2      1      2      5      1      6      9 
 13.35   13.4  13.45   13.5   13.7   13.8     14   14.0  14.02   14.1  14,12 
     1      3      3     14      1      1      1      5      2      1      1 
 14.12  14.21  14.25   14.3  14.44  14.54   14.7     15   15.0   15.1  15.16 
     2      1      1      2      3      2      3      5     15      5      2 
 15.22  15.24  15.54   15.6   15.7  15.77   15.9   16.0  16.04  16.08   16.1 
     3      2      1      5      5      2     10      3      1      5     10 
  16.2  16.24   16.3  16.36   16.8     17   17.0  17.08   17.1   17.2  17.25 
    15      3      9      1      6      4     14      7      5      9      8 
  17.4   17.5   17.9   18.0  18.04  18.08   18.2   18.3   18.4   18.5   18.6 
     1      4      1     10      1      4      4      3      9     10      6 
  18.9   19.0  19.03   19.2   19.5   19.7   19.9    2.0  20.04   20.2  20.32 
     9     12      2      5      1      2      2      2      3      1      9 
  20.4   20.8   20.9     21   21.0     22   22.0   22.3   22.4   23.6   23.8 
     6      1      4      5      4      1      1      1      4      2      4 
  25.0  25.02   28.0  28.29    4.0    4.5   4.85      5    5.0    5.3    5.5 
     1      1      1      3      2      1      1      3      1      1      1 
   5.6    5.7    5.8    6.0   6.12   6.13    6.2   6.28    6.3    6.4    6.5 
     2      1      1      3      1      1      1      1      2      1      1 
   6.9      7    7.0    7.1   7.14   7.16    7.2   7.32   7.69    7.7    7.9 
     1      1      1      1      1      2      3      2      1      1      1 
     8    8.0   8.03   8.19    8.2    8.4   8.45    8.5   8.69    8.9      9 
     1      2      2      2      1      3      1      1      1      2      4 
   9.0   9.02   9.12    9.2   9.25    9.5    9.8 
     3      7      1      1      1      5      2 

In [142]:
table(cleaned_df$mileage_highway)


Out[142]:
  10.0   10.1  10.10   10.2   10.3  10.34  10.37   10.4  10.41   10.7  10.75 
     3      1      1      1      3      1      2      1      1      1      2 
 10.77   10.9  10.98   11.1  11.11  11.23  11.33   11.4  11.49   11.5   11.6 
     1      1      1      2      1      1      1      3      2      2      3 
  11.7   11.8  11.81   11.9  11.96     12   12.0  12.05  12.07  12.12   12.2 
     2      1      1      2      2      1      7      7      1      1      1 
 12.35  12.39   12.4  12.45   12.5  12.55  12.67   12.8     13   13.0  13.05 
     1      2      2      1      2      5      2      1      3      2      1 
 13.08   13.1  13.15   13.2  13.22  13.24  13.25   13.3  13.32  13.49   13.5 
     1      3      1      1      1      2      1      3      3      1      4 
 13.53  13.55  13.58   13.7   13.8  13.85   13.9  13.93     14   14,0   14.0 
     1      4      9      7      2      1      4      4      4      1      7 
 14.02  14.07  14.09   14.1  14.16   14.2  14.28   14.3   14.4  14.49   14.5 
     5      3      3      3      1      1      1      4      4      1      1 
 14.53  14.56  14.57   14.6  14.69   14.7  14.74  14.79   14.8  14.81   14.9 
     5      1      1      1      2     10      1      1      1      1      2 
 14.94     15   15.0  15.04  15.05   15.1  15.16  15.33  15.35  15.42   15.5 
     1      2      5      1      3      8      8      4      1      3      5 
  15.6  15.62  15.64   15.7  15.73   15.8  15.96     16   16.0  16.02  16.09 
     3      1      1      4      2      5      3      1      2      3      3 
 16.12  16.15   16.2  16.26   16.3  16.36   16.4  16.47   16.5  16.51  16.55 
     1      1      5      1      4      1      1      3      2      3      2 
  16.6  16.60  16.66  16.73  16.78   16.8  16.95  16.96     17   17.0  17.01 
     1      1      1      1      6      3      7      1      2      2      3 
  17.1  17.15  17.19   17.2   17.3  17.32   17.4  17.43   17.5  17.54   17.6 
     6      1      1      3      6      1      1      1      3      2      4 
 17.67  17.68   17.7  17.71  17.72   17.9  17.97     18   18.0   18.1  18.19 
     1      1      2      5      1      3      3      5      3      2      2 
  18.2   18.3  18.44   18.5  18.56   18.6  18.78  18.88   18.9     19   19.0 
    10      2      2      6      2      9      2      3      8      1      3 
 19.01  19.08   19.1  19.12  19.27   19.3  19.33  19.34   19.4  19.49   19.5 
     4      1     11      2      1      7      4      1      6      3      6 
 19.59   19.6  19.64   19.7  19.72  19.81  19.87  19.98  19.99  20.14   20.3 
     2      2      4      1      2      2      5      5      4      3      1 
 20.38   20.4  20.45   20.5  20.51  20.58   20.6  20.62  20.63  20.64   20.7 
     4     11      1     11      3      1     11      1      2      3      7 
 20.73  20.77   20.8  20.85   20.9     21  21.04   21.1  21.14  21.15  21.19 
     5      1      4      1      1      6      4      6      4      2      2 
  21.2   21.3  21.43   21.5  21.64  21.66  21.79   21.9  22.01  22.07  22.08 
     7      1      3      3      5      3      1      1      1      1      1 
  22.1   22.3  22.54   22.7  22.71  22.74  22.77   23.0  23.08   23.1   23.2 
     8      2      5      5      5      7      2      5      9      7      3 
  23.4   23.5  23.54  23.59   23.7   23.9   24.0  24.07   24.2   24.4   24.6 
     3      5      1     10      2      7      4      5      6      5      2 
  24.8     25   25.0   25.1   25.2  25.44   25.8  26.21   26.6  27.62   27.8 
     1      1      5      1      4      4      5      4      1      4      2 
  29.0   30.0  30.46  32.36    4.0     42    5.9    6.4      7    7.5   7.51 
     1      1      3      1      2      2      3      1      1      1      1 
  7.63   7.75   7.81      8    8.0   8.06   8.08    8.1    8.3    8.4    8.5 
     1      1      1      1      4      1      1      1      1      1      2 
   8.6    8.7      9    9.0    9.2   9.24   9.26    9.4    9.5    9.6 
     1      2      2      4      1      1      1      1      2      1 

In [143]:
table(cleaned_df$seats)


Out[143]:
    10      2      4      5      6      7  7 & 8  7 & 9      8  8 & 7      9 
     9     37     29    572      9    102     19      9     21      1      2 
    No    Yes 
     2      5 

In [144]:
cleaned_df$seats <- as.character(cleaned_df$seats)

In [145]:
filter(cleaned_df, seats == " Yes")


Out[145]:
namemodelpricetypeenginefuelweightclearancebhprpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1BMW 6 SeriesBMW 6 Series 640d Design Pure Experience(Diesel)12190000Coupe2993 Diesel 2410124 313 4400 14.54 17.54 6 Yes 8 Speed AutomaticBMW8Automatic
2Mercedes-Benz CLAMercedes-Benz CLA 200 CDI Sport (Diesel)3590000Sedan2143 Diesel 2005NA 135 3600 4400 13.0 17.9 4 Yes 7 Speed AutomaticMercedes-Benz7Automatic
3Tata NanoTata GenX Nano XTA289000Hatchback624 PetrolNA180 37.5 5250 - 5750 20.4 23.9 2 Yes 4 Speed AutomaticTata4Automatic
4Toyota EtiosToyota Etios V SP (Petrol)673030Sedan1496 PetrolNA170 88.77 5600 13.5 16.78 4 Yes 5 Speed ManualToyota5Manual
5Volvo V40Volvo V40 D3 R-Design (Diesel)2770000Hatchback1984 Diesel 1980145 150 3500 23.6 27.8 5 Yes 6 Speed AutomaticVolvo6Automatic

In [146]:
filter(cleaned_df, seats == " No")


Out[146]:
namemodelpricetypeenginefuelweightclearancebhprpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1Maruti Suzuki Alto 800Maruti Suzuki Alto 800 Lx (Petrol)284435Hatchback796 Petrol 1285160 47.3 6000 17.0 22.74 3 No 5 Speed ManualMaruti5Manual
2Maruti Suzuki Wagon-RMaruti Wagon R LX (Petrol)362917Hatchback998 Petrol 1350165 67.07 6200 17.08 20.5 3 No 5 Speed ManualMaruti5Manual

In [147]:
cleaned_df$seats[cleaned_df$seats == " Yes"] = " 5"
cleaned_df$seats[cleaned_df$seats == " No"] = " 5"

In [148]:
table(cleaned_df$seats)


Out[148]:
    10      2      4      5      6      7  7 & 8  7 & 9      8  8 & 7      9 
     9     37     29    579      9    102     19      9     21      1      2 

In [149]:
table(cleaned_df$brand)


Out[149]:
        Ashok         Aston          Audi       Bentley           BMW 
            7             7            37             4            32 
          Car     Chevrolet        Datsun       Ferrari          Fiat 
           13            41             5             4            23 
        Force          Ford       Gumpert         Honda       Hyundai 
            4            30             1            36            56 
        Isuzu        Jaguar   Lamborghini          Land      Mahindra 
            1            13             1            16            43 
       Maruti      Mercedes Mercedes-Benz          Mini    Mitsubishi 
           97             2            30             6             2 
          New        Nissan       Porsche       Renault         Rolls 
           25            42            10            44             1 
  Rolls-Royce           San         Skoda     Ssangyong          Tata 
            1             1            27             4            86 
       Toyota    Volkswagen         Volvo 
           37            32            12 

Refine

  • brand
  • seat capacity
  • num cylinders
  • type

In [150]:
head(cleaned_df)


Out[150]:
namemodelpricetypeenginefuelweightclearancebhprpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)749990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
2Ashok Leyland StileAshok Leyland Stile LS 8-STR (Diesel)799990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
3Ashok Leyland StileAshok Leyland Stile LX 8-STR (Diesel)829990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
4Ashok Leyland StileAshok Leyland Stile LS 7-STR (Diesel)849990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 7 5 Speed ManualAshok5Manual
5Ashok Leyland StileAshok Leyland Stile LS 7-STR Alloy (Diesel)879990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 7 5 Speed ManualAshok5Manual
6Ashok Leyland StileAshok Leyland Stile LX 7-STR (Diesel)899990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 7 5 Speed ManualAshok5Manual

In [151]:
by_name <- group_by(cleaned_df, name)

In [152]:
filtered_df <- slice(by_name,which.min(price))

In [153]:
head(filtered_df)


Out[153]:
namemodelpricetypeenginefuelweightclearancebhprpmmileage_citymileage_highwaycylindersseatstransmission_typebrandgearstransmission
1Ashok Leyland StileAshok Leyland Stile LE 8-STR (Diesel)749990MPV1461 Diesel 2000180 75 3300 16.2 20.7 4 8 5 Speed ManualAshok5Manual
2Aston Martin RapideAston Martin Rapide LUXE (Petrol)3.5e+07Sedan5935 PetrolNA120 350 6000NA 7 4 4 6 Speed AutomaticAston6Automatic
3Aston Martin Rapide SAston Martin Rapide S (Petrol)4.4e+07Sedan5935 Petrol 1990110 550 6750 5.6 11.9 12 4 6 Speed ManualAston6Manual
4Aston Martin V12 VantageAston Martin V12 Vantage Coupe (Petrol)3.5e+07Coupe5935 PetrolNA120 510 6500 7 9 12 2 6 Speed ManualAston6Manual
5Aston Martin V8 VantageAston Martin V8 Vantage Coupe (Petrol)13500000Coupe4735 PetrolNA120 313 7000 5 8 8 2 6 Speed ManualAston6Manual
6Aston Martin VanquishAston Martin Vanquish V12 (Petrol)38500000Sedan5935 PetrolNA147 564 6750 4.0 8.0 4 2 6 Speed AutomaticAston6Automatic

mileage fill na with mean


In [154]:
filtered_df$mileage_city <- as.numeric(as.character(filtered_df$mileage_city))
filtered_df$mileage_highway <- as.numeric(as.character(filtered_df$mileage_highway))


Warning message:
In eval(expr, envir, enclos): NAs introduced by coercion

In [155]:
table(filtered_df$mileage_city)


Out[155]:
  1.2     2     4     5   5.3   5.5   5.6   5.7   5.8     6  6.13   6.2   6.4 
    1     2     2     1     1     1     2     1     1     3     1     1     1 
  6.5   6.9     7  7.14  7.16   7.2  7.32   7.7   7.9     8  8.03  8.19   8.4 
    1     1     2     1     1     2     1     1     1     2     1     2     2 
  8.9     9   9.2   9.5   9.8    10  10.5  10.7  10.8  10.9  11.3 11.38  11.4 
    1     2     1     1     1     2     1     1     1     1     1     1     2 
 11.8  11.9    12 12.02 12.08  12.2  12.3 12.82    13  13.2  13.3  13.4 13.45 
    1     2     4     1     1     1     2     1     3     1     2     1     1 
 13.5  13.7    14  14.1 14.21 14.25  14.3 14.44 14.54  14.7    15  15.1 15.16 
    4     1     1     1     1     1     1     1     1     1     4     1     1 
15.54  15.6  15.7 15.77  15.9 16.08  16.2 16.24  16.3  16.8    17 17.08  17.1 
    1     1     1     1     2     1     2     1     2     1     2     2     1 
 17.2  17.9 18.08    19 19.03  20.2  20.4  23.6    28 
    1     1     1     2     1     1     1     1     1 

In [156]:
filtered_df$mileage_city[is.na(filtered_df$mileage_city)] = mean(filtered_df$mileage_city, na.rm=T)
filtered_df$mileage_highway[is.na(filtered_df$mileage_highway)] = mean(filtered_df$mileage_highway, na.rm=T)

In [157]:
print(mean(filtered_df$mileage_city))
print(mean(filtered_df$mileage_highway))


[1] 11.83425
[1] 15.35149

In [158]:
table(filtered_df$mileage_city)


Out[158]:
     1.2        2        4        5      5.3      5.5      5.6      5.7 
       1        2        2        1        1        1        2        1 
     5.8        6     6.13      6.2      6.4      6.5      6.9        7 
       1        3        1        1        1        1        1        2 
    7.14     7.16      7.2     7.32      7.7      7.9        8     8.03 
       1        1        2        1        1        1        2        1 
    8.19      8.4      8.9        9      9.2      9.5      9.8       10 
       2        2        1        2        1        1        1        2 
    10.5     10.7     10.8     10.9     11.3    11.38     11.4     11.8 
       1        1        1        1        1        1        2        1 
11.83425     11.9       12    12.02    12.08     12.2     12.3    12.82 
      83        2        4        1        1        1        2        1 
      13     13.2     13.3     13.4    13.45     13.5     13.7       14 
       3        1        2        1        1        4        1        1 
    14.1    14.21    14.25     14.3    14.44    14.54     14.7       15 
       1        1        1        1        1        1        1        4 
    15.1    15.16    15.54     15.6     15.7    15.77     15.9    16.08 
       1        1        1        1        1        1        2        1 
    16.2    16.24     16.3     16.8       17    17.08     17.1     17.2 
       2        1        2        1        2        2        1        1 
    17.9    18.08       19    19.03     20.2     20.4     23.6       28 
       1        1        2        1        1        1        1        1 

In [159]:
unique(filtered_df$engine)


Out[159]:
  1. 1461
  2. 5935
  3. 4735
  4. 1968
  5. 2967
  6. 1984
  7. 2995
  8. 4163
  9. 3993
  10. 5998
  11. 1995
  12. 2993
  13. 1499
  14. 2979
  15. 4395
  16. 2231
  17. 1998
  18. 1399
  19. 1199
  20. 995
  21. 2499
  22. 1198
  23. 4499
  24. 5999
  25. 4297
  26. 6262
  27. 1368
  28. 1172
  29. 2650
  30. 1596
  31. 1498
  32. 1196
  33. 1497
  34. 1997
  35. NA
  36. 1797
  37. 814
  38. 1197
  39. 1086
  40. 2199
  41. 2359
  42. 2999
  43. 2179
  44. 1999
  45. 5204
  46. 2523
  47. 3
  48. 1493
  49. 2489
  50. 998
  51. 796
  52. 1373
  53. 2393
  54. 1298
  55. 1248
  56. 1586
  57. 2143
  58. 1595
  59. 1991
  60. 5461
  61. 2987
  62. 3496
  63. 3498
  64. 6208
  65. 1496
  66. 1598
  67. 2477
  68. 1396
  69. 1798
  70. 2496
  71. 3436
  72. 6592
  73. 1149
  74. 1395
  75. 2696
  76. 1193
  77. 1405
  78. 624
  79. 2956
  80. 1948
  81. 2982
  82. 1390
  83. 3597
  84. 1969

In [160]:
filtered_df$model[is.na(filtered_df$engine)]


Out[160]:
  1. Honda jazz – Expected Specification
  2. Tata Bolt Revotron 1.2T XE (Petrol)
  3. Volkswagen Taigun -Expected Specification

In [161]:
table(filtered_df$engine)


Out[161]:
   3  624  796  814  995  998 1086 1149 1172 1193 1196 1197 1198 1199 1248 1298 
   1    1    2    1    1    5    1    1    1    3    2    8    9    4    2    1 
1368 1373 1390 1395 1396 1399 1405 1461 1493 1496 1497 1498 1499 1586 1595 1596 
   4    2    1    1    1    1    2    7    1    3    3    3    2    1    1    1 
1598 1797 1798 1948 1968 1969 1984 1991 1995 1997 1998 1999 2143 2179 2199 2231 
   8    1    2    1    6    2    5    3    7    1    2    1    5   10    1    1 
2359 2393 2477 2489 2496 2499 2523 2650 2696 2956 2967 2979 2982 2987 2993 2995 
   1    2    1    1    1    2    3    1    1    1    3    2    1    2    8    3 
2999 3436 3496 3498 3597 3993 4163 4297 4395 4499 4735 5204 5461 5935 5998 5999 
   1    3    1    2    1    3    3    1    2    1    1    1    1    4    4    1 
6208 6262 6592 
   1    1    2 

In [162]:
filtered_df$engine[is.na(filtered_df$engine)] = 1198

In [163]:
filtered_df$engine <- as.numeric(as.character(filtered_df$engine))

In [186]:
filtered_df$fuel_type = -1
filtered_df$fuel_type[filtered_df$fuel == ' Petrol'] = 0
filtered_df$fuel_type[filtered_df$fuel == ' Diesel'] = 1
filtered_df$fuel_type[filtered_df$fuel == ' CNG'] = 2
filtered_df$fuel_type[filtered_df$fuel_type == -1] = 0

In [187]:
unique(filtered_df$fuel_type)


Out[187]:
  1. 1
  2. 0
  3. 2

In [188]:
write.csv(filtered_df, file = "data/cars.tidy.csv", row.names = FALSE)

In [ ]: