In [1]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (bare names plus `plt`), which hides where names come from.
# Consider `%matplotlib inline` with explicit `import numpy as np` /
# `import matplotlib.pyplot as plt` once it is confirmed that no cell relies on
# the injected bare names.
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [2]:
# List the files available under ./competition_data/ (the CSVs loaded below).
%ls ./competition_data/


bill_of_materials.csv  comp_other.csv         test_set.csv
comp_adaptor.csv       comp_sleeve.csv        train_set.csv
comp_boss.csv          comp_straight.csv      tube.csv
comp_elbow.csv         comp_tee.csv           tube_end_form.csv
comp_float.csv         comp_threaded.csv      type_component.csv
comp_hfl.csv           components.csv         type_connection.csv
comp_nut.csv           specs.csv              type_end_form.csv

In [3]:
# Read the training set into a DataFrame, then display its (rows, columns).
train_df = pd.read_csv(filepath_or_buffer='./competition_data/train_set.csv')
train_df.shape


Out[3]:
(30213, 8)

In [4]:
# Preview the first five training rows.
train_df.head(n=5)


Out[4]:
tube_assembly_id supplier quote_date annual_usage min_order_quantity bracket_pricing quantity cost
0 TA-00002 S-0066 2013-07-07 0 0 Yes 1 21.905933
1 TA-00002 S-0066 2013-07-07 0 0 Yes 2 12.341214
2 TA-00002 S-0066 2013-07-07 0 0 Yes 5 6.601826
3 TA-00002 S-0066 2013-07-07 0 0 Yes 10 4.687770
4 TA-00002 S-0066 2013-07-07 0 0 Yes 25 3.541561

In [5]:
# Inspect the dtype pandas inferred for every training column.
# NOTE(review): quote_date comes back as `object` (see output), i.e. it was
# not parsed as a datetime by read_csv.
train_df.dtypes


Out[5]:
tube_assembly_id       object
supplier               object
quote_date             object
annual_usage            int64
min_order_quantity      int64
bracket_pricing        object
quantity                int64
cost                  float64
dtype: object

In [6]:
# Summary statistics (count/mean/std/quartiles/min/max) for the numeric
# training columns.
train_df.describe()


Out[6]:
annual_usage min_order_quantity quantity cost
count 30213.000000 30213.000000 30213.000000 30213.000000
mean 120.369377 2.084699 38.389369 13.433317
std 1590.331872 12.742776 70.761392 28.663200
min 0.000000 0.000000 1.000000 0.503553
25% 0.000000 0.000000 2.000000 3.878190
50% 0.000000 0.000000 10.000000 6.521146
75% 2.000000 0.000000 40.000000 13.431781
max 150000.000000 535.000000 2500.000000 1000.000000

In [7]:
# Read the test set into a DataFrame.
test_df = pd.read_csv(filepath_or_buffer='./competition_data/test_set.csv')

In [8]:
# Preview the first five test rows.
test_df.head(n=5)


Out[8]:
id tube_assembly_id supplier quote_date annual_usage min_order_quantity bracket_pricing quantity
0 1 TA-00001 S-0066 2013-06-23 0 0 Yes 1
1 2 TA-00001 S-0066 2013-06-23 0 0 Yes 2
2 3 TA-00001 S-0066 2013-06-23 0 0 Yes 5
3 4 TA-00001 S-0066 2013-06-23 0 0 Yes 10
4 5 TA-00001 S-0066 2013-06-23 0 0 Yes 25

In [9]:
# Summary statistics for the cost column on its own.
train_df['cost'].describe()


Out[9]:
count    30213.000000
mean        13.433317
std         28.663200
min          0.503553
25%          3.878190
50%          6.521146
75%         13.431781
max       1000.000000
Name: cost, dtype: float64

In [10]:
# Box plot of the training-set cost column.
# Drawn on an explicit Figure/Axes and given a title and axis label so the
# figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.boxplot(train_df.cost)
ax.set_title('Distribution of cost (train set)')
ax.set_ylabel('cost')
plt.show()



In [11]:
# Shape of the subset of training rows whose cost exceeds 200.
train_df.loc[train_df['cost'] > 200].shape


Out[11]:
(125, 8)

In [12]:
# Shape of the subset of training rows whose cost exceeds 100.
train_df.loc[train_df['cost'] > 100].shape


Out[12]:
(399, 8)

In [13]:
# Histogram of the training-set cost column using 100 bins over [0, 100].
# range=(0, 100) makes the histogram ignore values outside that interval, so
# rows with cost above 100 are not drawn; the title says so explicitly.
fig, ax = plt.subplots()
ax.hist(train_df.cost, bins=100, range=(0, 100))
ax.set_title('Histogram of cost, restricted to [0, 100]')
ax.set_xlabel('cost')
ax.set_ylabel('number of rows')
plt.show()



In [14]:
# Show the row(s) carrying the highest cost in the training set.
# The maximum is looked up from the column instead of comparing floats against
# a hand-copied literal (1000, the max reported by describe()), so the cell
# keeps working if the data changes.
train_df[train_df.cost == train_df.cost.max()]


Out[14]:
tube_assembly_id supplier quote_date annual_usage min_order_quantity bracket_pricing quantity cost
10364 TA-06161 S-0026 2008-06-23 1 1 No 1 1000

In [15]:
# Histogram of the quantity column: 100 bins over the interval [0, 100].
train_df['quantity'].plot(bins=100, range=(0, 100), kind='hist')


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x10829d510>

In [38]:
# Keep a handle on the quantity column for the frequency analysis below.
# NOTE(review): this cell's execution count (38) does not follow the previous
# cell's (15); confirm the notebook still runs top-to-bottom on a fresh kernel.
quantity_feature = train_df['quantity']

In [51]:
# Count how many rows carry each distinct quantity value
# (most frequent first, raw counts rather than proportions).
quantity_feature_val_counts = quantity_feature.value_counts(
    normalize=False, sort=True, ascending=False
)

In [59]:
# Total number of rows whose quantity is one of the listed values.
# .loc makes it explicit that the integers are index labels (quantity values),
# not positions.
quantity_feature_val_counts.loc[
    [1, 5, 10, 2, 25, 50, 100, 250, 20]
].sum()


Out[59]:
26815

In [60]:
# Fraction of training rows whose quantity is one of the listed values.
# Computed from the data instead of dividing two hand-copied literals
# (26815 / 30213.0), so it cannot drift from the loaded file. The mean of a
# boolean mask is the share of True rows; float() keeps the displayed value a
# plain Python float.
float(train_df.quantity.isin([1, 5, 10, 2, 25, 50, 100, 250, 20]).mean())


Out[60]:
0.8875318571475854

In [ ]: