In [15]:
%pylab inline
from soln.dataset import load_raw_data
from itertools import islice
import pandas as pd
# Load the project's raw data through the soln.dataset helper. It is indexed
# by table name further down (raw['tube']), so it is presumably a mapping of
# table name -> DataFrame — TODO confirm.
# NOTE(review): the following cells read `train_set`, which no visible cell
# defines; it presumably came from `raw` in a since-deleted cell. Define it
# explicitly so the notebook survives Restart Kernel -> Run All.
raw = load_raw_data()
In [2]:
# Summary statistics (count, mean, std, quartiles, ...) of the target column.
train_set.cost.describe()
Out[2]:
In [3]:
# Histogram of log(1 + cost) in 100 bins. np.log1p is the dedicated,
# numerically stable form of log(x + 1) and keeps precision for small costs.
np.log1p(train_set['cost']).hist(bins=100)
Out[3]:
In [4]:
# Arithmetic mean of the cost column.
train_set.cost.mean()
Out[4]:
In [5]:
# Summary of the supplier column (describe() output depends on its dtype).
train_set.supplier.describe()
Out[5]:
In [6]:
# Number of training rows per distinct supplier, most frequent first.
train_set.supplier.value_counts()
Out[6]:
In [41]:
# Read the tube table straight from the competition CSV and preview the
# first five rows.
tube_df = pd.read_csv('data/competition_data/tube.csv')
tube_df.head(5)
Out[41]:
In [26]:
# Number of tubes per distinct material_id, most frequent first.
tube_df.material_id.value_counts()
Out[26]:
In [32]:
# Join the tube columns onto the training rows (default inner join on the
# shared tube_assembly_id key), then look at a ten-row window of the result.
train_set_ext = train_set.merge(tube_df, on='tube_assembly_id')
train_set_ext.iloc[1000:1010]
Out[32]:
In [36]:
# Tiny throwaway frame with two integer columns, built row by row.
df = pd.DataFrame(
    [(1, 10), (2, 20), (3, 30)],
    columns=['a', 'b'],
)
df
Out[36]:
In [38]:
# Distinct values of column 'a' collected into a Python set.
set(df.a)
Out[38]:
In [43]:
# Rows of the merged frame for a single assembly. The mask is built from
# train_set_ext itself: a boolean Series taken from train_set is aligned by
# index label, and the merge result does not keep train_set's index, so a
# train_set mask could select the wrong rows or fail to align.
train_set_ext[train_set_ext['tube_assembly_id'] == 'TA-00034']
Out[43]:
In [53]:
# Diameter value(s) recorded in the tube table for assembly TA-00034,
# returned as a NumPy array.
tube_df.loc[tube_df['tube_assembly_id'] == 'TA-00034', 'diameter'].values
Out[53]:
In [46]:
# Distinct diameters among the merged rows of assembly TA-00034. The mask is
# taken from train_set_ext, the frame being filtered; a mask from train_set
# is aligned on index labels that the merge result does not share, so it
# could pick the wrong rows or fail to align.
train_set_ext.loc[
    train_set_ext['tube_assembly_id'] == 'TA-00034', 'diameter'
].unique()
Out[46]:
In [9]:
# Read the specs table from the competition CSV and show rows 10 through 14.
specs_df = pd.read_csv('data/competition_data/specs.csv')
specs_df.iloc[10:15]
Out[9]:
In [29]:
# Read the bill-of-materials table from the competition CSV and preview the
# first five rows.
bill_df = pd.read_csv('data/competition_data/bill_of_materials.csv')
bill_df.head(5)
Out[29]:
In [14]:
# Tubes whose bend_radius is exactly 9999.
# NOTE(review): 9999 looks like a missing-value sentinel — TODO confirm.
tube_df.loc[tube_df['bend_radius'] == 9999]
In [51]:
# Histogram (100 bins) of bend_radius, restricted to values below 9999.
tube_df.loc[tube_df['bend_radius'] < 9999, 'bend_radius'].hist(bins=100)
Out[51]:
In [49]:
# Summary statistics of bend_radius over the whole tube table (9999 rows
# included).
tube_df['bend_radius'].describe()
Out[49]:
In [19]:
# Frequency of each end-form code on both ends of the tube, using the tube
# table taken from the loaded raw data.
# print is written as a function call with a single argument so the cell runs
# unchanged on Python 2 and Python 3 (the statement form is a SyntaxError on 3).
tube = raw['tube']
print(tube.end_a.value_counts())
print(tube.end_x.value_counts())
In [ ]: