Analisamos a tabela fipe de preço médio de veículos.
In [1]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
import numpy
import pandas
import seaborn
from matplotlib import pyplot
%matplotlib inline
from scipy import stats
pyplot.style.use(['ggplot', 'seaborn-colorblind', 'seaborn-talk'])
pyplot.xkcd()
Out[1]:
Carrega os dados da tabela de preços mais recente. Separa modelos e versões de modelos.
In [2]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
prices = pandas.read_csv('../dat/fipe_price_list_201710.csv.gz', index_col=[0])
# Major maker
major_makers = ['Mercedes-Benz', 'BMW', 'Audi', 'VW - Volkswagen', 'Fiat',
'GM - Chevrolet', 'Kia Motors', 'Hyundai', 'Ford', 'Citroën',
'Renault']
prices['maker_name_major'] = prices['maker_name'].map(lambda x: x
if x in major_makers else
'_outros_')
# Model and version
model_version = prices.model_name.str.split(' ', 1)
prices['model_name'] = model_version.str[0]
prices['model_version'] = model_version.str[1]
In [3]:
prices.head()
Out[3]:
In [4]:
prices.describe()
Out[4]:
In [5]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
prices_by_models = prices.groupby(['maker_name', 'model_name'])\
['price'].agg(['min', 'mean', 'max', 'std'])
prices_by_models.reset_index(level=1, inplace=True)
prices_by_models.head()
Out[5]:
In [6]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
models_by_maker = prices_by_models.pivot_table(index=['maker_name'],
values='model_name',
aggfunc='count')
models_by_maker.sort_values(by='model_name', ascending=False, inplace=True)
models_by_maker.head()
Out[6]:
In [7]:
ax = models_by_maker.plot.bar(legend=False, figsize=[11, 5])
ax.figure.tight_layout()
In [8]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
prices_by_models = prices.groupby(['maker_name', 'model_name', 'build_year'])\
['price'].agg(['min', 'mean', 'max', 'std'])
prices_by_models.reset_index(level=[2, 1], inplace=True)
prices_by_models.head()
Out[8]:
In [9]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
models_by_maker = prices_by_models.pivot_table(index=['maker_name'],
values='model_name',
columns=['build_year'],
aggfunc='count')
models_by_maker.head()
Out[9]:
In [10]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
models_by_maker_today = models_by_maker[32000].sort_values(ascending=False)
ax = models_by_maker_today[models_by_maker_today>0].plot.bar(legend=False,
figsize=[11, 5])
ax.figure.tight_layout()
In [11]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
prices_by_models_major = prices.groupby(['maker_name_major', 'model_name',
'build_year'])['price'].agg(['min', 'mean',
'max', 'std'])
prices_by_models_major.reset_index(level=2, inplace=True)
prices_by_models_major.head()
Out[11]:
In [12]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
models_by_maker_major = prices_by_models_major\
[prices_by_models_major.build_year < 32000].pivot_table(
index='build_year', columns=['maker_name_major'], values='mean',
aggfunc='count')
models_by_maker_major.head()
Out[12]:
In [13]:
colors = pyplot.cm.tab20c(numpy.linspace(0, 1, 11))
models_by_maker_major.plot.area(figsize=[11, 5], color=colors)
Out[13]:
In [14]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ptable_versions = prices.pivot_table(index=['maker_name'], values='model_name',
aggfunc=len)
ptable_versions.sort_values(by='model_name', ascending=False, inplace=True)
In [15]:
ax = ptable_versions.plot.bar(legend=False, figsize=[11, 5])
ax.figure.tight_layout()
Hoje, considerando-se apenas carros novos (0 km):
In [16]:
ptable_versions_new = prices[prices.build_year==32000].pivot_table(
index=['maker_name'], values='model_name', aggfunc=len)
ptable_versions_new.sort_values(by='model_name', ascending=False, inplace=True)
In [17]:
ax = ptable_versions_new.plot.bar(legend=False, figsize=[11, 5])
ax.figure.tight_layout()
Ao longo do tempo, quantas versões de carros há no mercado?
In [18]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ptable_model_version = prices.pivot_table(index=['maker_name', 'model_name'],
columns='build_year',
values='model_version', aggfunc=len)
In [19]:
ptable_model_version.head()
Out[19]:
In [20]:
version_timeseries = ptable_model_version.sum()
version_timeseries[version_timeseries.index < 32000].plot.line(figsize=[11, 5])
Out[20]:
In [21]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
bins = numpy.arange(0, 300e3, 1e3)
ax = prices['price'].plot.hist(bins=bins, legend=False, figsize=[11, 5],
linewidth=0, normed=True, histtype='stepfilled')
In [22]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ax = prices['price'].plot.hist(bins=bins, legend=False, figsize=[11, 5],
linewidth=0, normed=True, histtype='stepfilled')
mean, median = prices.price.mean(), prices.price.median()
mode = prices.price.mode().values[0]
ax.axvline(mean)
ax.text(mean+2500, 2e-5, 'Media={:.0f}'.format(mean))
ax.axvline(median)
ax.text(median+2500, 1.8e-5, 'Mediana={:.0f}'.format(median))
ax.axvline(mode)
ax.text(mode+2500, 1.6e-5, 'Moda={:.0f}'.format(mode))
mu, loc, scale = stats.invgauss.fit(prices['price'])
x = 0.5 * (bins[1:] + bins[:-1]) # Center of the bins
pdf = stats.invgauss.pdf(x, mu, loc, scale)
ax.plot(x, pdf, 'r')
ax.annotate('{:.0f}'.format(scale/mu), xy=(scale/mu, 2.45e-5), xycoords='data',
xytext=(30, 10), textcoords='offset points',
fontsize='12', fontstretch='ultra-condensed',
ha='center', va='bottom',
arrowprops=dict(arrowstyle="-",
connectionstyle='arc3,rad=0.3',
color='#999999')
)
Out[22]:
In [23]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ax = prices['price'][prices.build_year == 32000].plot.hist(
bins=bins, legend=False, figsize=[11, 5], linewidth=0,
histtype='stepfilled')
In [24]:
prices.where(prices.build_year < 32000)\
.plot.scatter('build_year', 'price', figsize=[11, 5])
Out[24]:
In [25]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ax = prices.boxplot(column='price', by='build_year', rot=45, figsize=[11, 5],
grid=False)
ax.set_ylim([0, 5e5])
ax.figure.suptitle('')
ax.figure.tight_layout()
In [26]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ptable = prices.pivot_table(values='price', index=['maker_name', 'model_name',
'fuel_type'],
columns=['build_year'])
ptable.describe()
Out[26]:
In [27]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
fig = pyplot.figure(figsize=[9, 200])
ax = seaborn.heatmap(ptable, square=False, cmap='viridis', linewidths=0.1,
linecolor='#cccccc', vmin=0, vmax=200e3,
cbar_kws=dict(orientation='horizontal', extend='max',
pad=0.0045, aspect=40))
fig.tight_layout()
In [28]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
def get_almost_new(df):
if not pandas.isnull(df[2018]):
return df[2018]
elif not pandas.isnull(df[2017]):
return df[2017]
elif not pandas.isnull(df[2016]):
return df[2016]
else:
return numpy.nan
prices_ptable = prices.where(prices.build_year >= 2016)\
.pivot_table(values='price', columns=['build_year'],
index=['maker_name', 'model_name', 'fipe_code'], aggfunc=sum)
prices_ptable['almostnew'] = [get_almost_new(prices_ptable.iloc[n]) for n
in range(len(prices_ptable))]
prices_ptable['rel_devaluation'] = 1 - (prices_ptable['almostnew'] /
prices_ptable[32000])
ax = prices_ptable.plot.scatter([32000], 'rel_devaluation', logx=True,
figsize=[11, 5], by='maker_name')
ax.set_xlabel('')
ax.figure.tight_layout()
In [29]:
#...:....|....:....|....:....|....:....|....:....|....:....|....:....|....:....|
ax = prices_ptable.boxplot(column='rel_devaluation', by='maker_name', rot=90,
grid=False, figsize=[11, 5])
ax.figure.suptitle('')
ax.figure.tight_layout()
In [ ]: