In [2]:
%pylab inline
import os
import pandas as pd
In [4]:
# Alphabetical listing of the files shipped in the competition data directory.
# sorted() already returns a list, so no extra list() wrapper is needed.
sorted(os.listdir('data/competition_data/'))
Out[4]:
In [28]:
# Load the training quotes and convert the quote_date column to datetimes
# in the same expression, then preview the first five rows.
train_set = pd.read_csv('data/competition_data/train_set.csv').assign(
    quote_date=lambda frame: pd.to_datetime(frame['quote_date'])
)
train_set.head()
Out[28]:
In [25]:
# Per-supplier row counts (value_counts sorts most frequent first), plus a
# positional rank and the running total / running share of all rows.
series = train_set['supplier'].value_counts()
df = series.to_frame('count').assign(
    index=lambda frame: range(len(frame)),
    cum_sum=lambda frame: frame['count'].cumsum(),
    cum_frac=lambda frame: frame['cum_sum'] / frame['count'].sum(),
)
df
Out[25]:
In [55]:
# quote_date values at both tails and at the median.
train_set['quote_date'].quantile(
    q=[0.005, 0.01, 0.05, 0.5, 0.95, 0.99, 0.995],
)
Out[55]:
In [59]:
# Summary statistics for annual_usage with extra tail percentiles.
(
    train_set['annual_usage']
    .describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
)
Out[59]:
In [73]:
# 100-bin histogram of annual_usage. The y-axis is capped at 100,
# presumably so that sparsely populated bins remain visible.
(
    train_set['annual_usage']
    .plot.hist(bins=100)
    .set_ylim(0, 100)
)
# Alternative view (left disabled): a log-scaled x axis via plt.xscale('log').
Out[73]:
In [75]:
# Summary statistics for min_order_quantity with extra tail percentiles.
(
    train_set['min_order_quantity']
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)
Out[75]:
In [78]:
# 100-bin histogram of min_order_quantity with the y-axis capped at 500.
(
    train_set['min_order_quantity']
    .plot.hist(bins=100)
    .set_ylim(0, 500)
)
Out[78]:
In [81]:
# Relative frequency of each bracket_pricing value (normalize=True yields
# fractions of the non-null rows rather than raw counts).
(
    train_set['bracket_pricing']
    .value_counts(normalize=True)
)
Out[81]:
In [84]:
# Summary statistics for quantity with extra tail percentiles.
(
    train_set['quantity']
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)
Out[84]:
In [89]:
# 100-bin histogram of quantity with the y-axis capped at 10000.
(
    train_set['quantity']
    .plot.hist(bins=100)
    .set_ylim(0, 10000)
)
# Alternative view (left disabled): exact per-value counts via
# train_set['quantity'].value_counts().
Out[89]:
In [9]:
# Load the tube table and preview its first five rows.
tube = pd.read_csv('data/competition_data/tube.csv')
tube.head()
Out[9]:
In [ ]:
In [ ]: