In [1]:
import datetime
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import scipy
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# apply seaborn's "white" axes style to every plot drawn below
sns.set_style("white")
In [3]:
# Load the raw listings from disk; the three timestamp columns are
# converted to datetimes while reading.
cars = pd.read_csv(
    "autos.csv",
    encoding="Latin1",
    parse_dates=["dateCrawled", "dateCreated", "lastSeen"],
)
In [3]:
# Data cleaning: build one boolean mask per rule, then keep the rows that
# satisfy all of them.
# registration year strictly between 1900 and 2017
plausible_year = cars.yearOfRegistration.gt(1900) & cars.yearOfRegistration.lt(2017)
# price strictly between 500 and 500000
plausible_price = cars.price.gt(500) & cars.price.lt(500000)
# drop listings flagged with notRepairedDamage == "ja" (working cars only for now)
not_damaged = cars.notRepairedDamage.ne("ja")
cars = cars[plausible_year & plausible_price & not_damaged]
In [4]:
# Bucket the mileage into 50,000 km bands; the last label absorbs every
# band from the fourth one upwards.
mileage_labels = ("low", "medium", "med-high", "high")
mileage_bands = [
    min(int(math.floor(km / 50000)), len(mileage_labels) - 1)
    for km in cars.kilometer
]
cars = cars.assign(mileage_cat=[mileage_labels[band] for band in mileage_bands])
In [5]:
# age is a better feature than year of registration
# here we use the number of whole days between the first day of the
# registration month and the day the listing was created
#
# The previous implementation parsed year+month with "%Y%M"; "%M" is the
# strptime code for *minutes*, so the registration month was silently ignored.
# It also mixed a local-time timestamp with a UTC-based one and looped over
# rows with iterrows(). The dates are now assembled and subtracted vectorised.
#
# Months outside 1..12 (e.g. a 0 placeholder for "unknown", if present) cannot
# form a valid date, so they are clamped into range instead of raising.
registration_month = cars.monthOfRegistration.clip(lower=1, upper=12)
registration_date = pd.to_datetime(
    pd.DataFrame(
        {
            "year": cars.yearOfRegistration,
            "month": registration_month,
            "day": 1,
        }
    )
)
# .dt.days floors towards negative infinity, like timedelta(...).days did before
cars = cars.assign(age=(cars.dateCreated - registration_date).dt.days)
In [17]:
# Keep only rows where age, powerPS and kilometer are all strictly positive:
#   age > 0        -> not registered in the future
#   powerPS > 0    -> a power figure is present
#   kilometer > 0  -> a mileage figure is present
all_positive = cars[["age", "powerPS", "kilometer"]].gt(0).all(axis=1)
cars = cars[all_positive]
In [ ]:
In [18]:
# save the modified csv
# index=False: the row index carries no information here, and writing it would
# make it come back as a spurious "Unnamed: 0" column when the file is re-read
cars.to_csv("autos.mod.csv", index=False)
In [2]:
# to start with cleaned & modified data:
# parse the same timestamp columns as the raw load so that both entry points
# into the notebook produce identical dtypes
cars = pd.read_csv(
    "autos.mod.csv",
    parse_dates=["dateCrawled", "dateCreated", "lastSeen"],
)
In [7]:
# number of listings per distinct offerType value
cars["offerType"].value_counts()
Out[7]:
In [8]:
# scatter of price against age (in days), one colour per brand
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
# lmplot is figure-level and opens its own figure, so no plt.figure() call is
# made beforehand (it would only produce an extra empty figure).
sns.lmplot(x="age", y="price", data=cars, fit_reg=False, hue="brand")
# restrict the x axis of the plot that was just drawn
plt.xlim(0, 50000)
Out[8]:
In [9]:
# the 20 most frequent values of the model column, with their counts
cars["model"].value_counts().head(20)
Out[9]:
In [10]:
# overall depreciation: linear regression of price on mileage and
# registration year, fitted across all listings
from sklearn import linear_model
depreciation_features = cars[["kilometer", "yearOfRegistration"]]
clf = linear_model.LinearRegression()
clf.fit(depreciation_features, y=cars.price)
Out[10]:
In [11]:
# fitted coefficients, one per feature in the order passed to fit():
# kilometer first, then yearOfRegistration
clf.coef_
Out[11]:
In [ ]:
# compare depreciation per model
In [12]:
# histogram of the registration years across all listings
cars["yearOfRegistration"].hist()
Out[12]:
In [13]:
# price vs. registration year for model "golf", coloured by mileage category
# (x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments)
sns.lmplot(x="yearOfRegistration", y="price", data=cars[cars.model == "golf"], fit_reg=False, hue="mileage_cat")
Out[13]:
In [14]:
# price vs. registration year for model "1er", coloured by mileage category
# (x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments)
sns.lmplot(x="yearOfRegistration", y="price", data=cars[cars.model == "1er"], fit_reg=False, hue="mileage_cat")
Out[14]:
In [15]:
# price vs. registration year for model "3er", coloured by mileage category
# (x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments)
sns.lmplot(x="yearOfRegistration", y="price", data=cars[cars.model == "3er"], fit_reg=False, hue="mileage_cat")
Out[15]:
In [17]:
# price vs. age (in days) for model "3er", coloured by mileage category
# (x/y passed by keyword: seaborn >= 0.12 rejects them as positional arguments)
sns.lmplot(x="age", y="price", data=cars[cars.model == "3er"], fit_reg=False, hue="mileage_cat")
Out[17]:
In [16]:
# number of "3er" listings per registration year, split by mileage category
listings_3er = cars[cars.model == "3er"]
sns.countplot(data=listings_3er, x="yearOfRegistration", hue="mileage_cat")
Out[16]:
In [69]:
# write function for fit parameters for one model
# run function for all models > 100 entries
# test accuracy for each
# see how good my accuracy is, maybe also depending on input data
In [4]:
# fit the price model on the "golf" listings only
import importlib
import main
importlib.reload(main)  # re-import main so that edits to the module take effect
is_golf = cars.model == "golf"
main.fit_params(cars.loc[is_golf, ["powerPS", "kilometer", "age"]], cars.price[is_golf])
In [12]:
# fit the price model on the "3er" listings only
import importlib
import main
importlib.reload(main)  # re-import main so that edits to the module take effect
is_3er = cars.model == "3er"
main.fit_params(cars.loc[is_3er, ["powerPS", "kilometer", "age"]], cars.price[is_3er])
In [10]:
# fit the price model on the "1er" listings only
import importlib
import main
importlib.reload(main)  # re-import main so that edits to the module take effect
is_1er = cars.model == "1er"
main.fit_params(cars.loc[is_1er, ["powerPS", "kilometer", "age"]], cars.price[is_1er])
All in all, this seems to work OK. We can see how the model overfits without regularization and how regularization fixes that. The depreciation curves nicely show how differently the car models lose value.
In [ ]: