In [1]:
%pylab inline
import pandas as pd
In [2]:
# List the files available under ./competition_data/ before loading anything.
%ls ./competition_data/
In [3]:
# Load the training set into a DataFrame (dtypes are left to read_csv's
# inference), then display its (n_rows, n_columns) shape as the cell output.
train_df = pd.read_csv('./competition_data/train_set.csv')
train_df.shape
Out[3]:
In [4]:
# Preview the first rows of the training set (head() defaults to 5 rows).
train_df.head()
Out[4]:
In [5]:
# Show the dtype pandas inferred for each training-set column.
train_df.dtypes
Out[5]:
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# training set; with default arguments describe() reports numeric columns.
train_df.describe()
Out[6]:
In [7]:
# Load the test set into its own DataFrame, kept separate from train_df.
test_df = pd.read_csv('./competition_data/test_set.csv')
In [8]:
# Preview the first rows of the test set (head() defaults to 5 rows).
test_df.head()
Out[8]:
In [9]:
# Summary statistics for the `cost` column of the training set.
train_df['cost'].describe()
Out[9]:
In [10]:
# Box plot of the training-set `cost` column. Uses an explicit Axes and adds
# a title and axis label so the figure is readable on its own when skimming.
fig, ax = plt.subplots()
ax.boxplot(train_df['cost'])
ax.set_ylabel('cost')
ax.set_title('Distribution of cost (training set)')
plt.show()
In [11]:
# Shape of the subset of training rows whose cost is strictly above 200.
train_df.loc[train_df['cost'].gt(200)].shape
Out[11]:
In [12]:
# Shape of the subset of training rows whose cost is strictly above 100.
train_df.loc[train_df['cost'].gt(100)].shape
Out[12]:
In [13]:
# Histogram of `cost`, restricted to the 0-100 range (100 bins); values
# outside that range are not drawn. Labels are added so the figure is
# self-explanatory.
fig, ax = plt.subplots()
ax.hist(train_df['cost'], bins=100, range=(0, 100))
ax.set_xlabel('cost')
ax.set_ylabel('number of rows')
ax.set_title('Cost distribution, 0-100 (training set)')
plt.show()
In [14]:
# Rows of the training set whose cost is exactly 1000.
train_df.loc[train_df['cost'].eq(1000)]
Out[14]:
In [15]:
# Histogram of `quantity`, restricted to the 0-100 range (100 bins).
# Labelled, and finished with plt.show() like the other plotting cells so the
# cell renders only the figure rather than also echoing the Axes repr.
quantity_ax = train_df['quantity'].plot(kind='hist', bins=100, range=(0, 100))
quantity_ax.set_xlabel('quantity')
quantity_ax.set_title('Quantity distribution, 0-100 (training set)')
plt.show()
Out[15]:
In [38]:
# Keep a handle on the `quantity` column of the training set for the
# frequency analysis in the following cells.
quantity_feature = train_df['quantity']
In [51]:
# Number of rows per distinct quantity value; the resulting Series is indexed
# by the quantity value itself and sorted by count, most frequent first.
quantity_feature_val_counts = quantity_feature.value_counts()
In [59]:
# Total number of rows whose quantity is one of the hand-picked values below.
# The counts Series is indexed by quantity value, so this is a label lookup.
# reindex(..., fill_value=0) is used instead of plain `[]` so that a quantity
# absent from the data contributes 0 rather than raising KeyError (or
# producing NaN, depending on the pandas version).
common_quantities = [1, 5, 10, 2, 25, 50, 100, 250, 20]
quantity_feature_val_counts.reindex(common_quantities, fill_value=0).sum()
Out[59]:
In [60]:
# Share of training rows whose quantity is one of the selected values.
# Computed from the data instead of dividing pasted literals, so the figure
# stays correct if the data changes or the notebook is re-run.
# NOTE(review): assumes the original 26815 was the sum of the selected counts
# and 30213 the number of training rows - confirm against the earlier outputs.
selected_quantities = [1, 5, 10, 2, 25, 50, 100, 250, 20]
selected_rows = quantity_feature_val_counts.reindex(selected_quantities, fill_value=0).sum()
# float() keeps this a true division even under Python 2 integer semantics.
selected_rows / float(len(quantity_feature))
Out[60]:
In [ ]: