In [47]:
%pylab inline
import pandas as pd
from sklearn import linear_model
In [9]:
# Load the training quotes; quote_date is parsed as a date column at read time.
train_df = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(train_df.shape)
print(train_df.dtypes)
# Fixed random_state so the previewed rows are the same on every run.
print(train_df.sample(n=10, random_state=0))
In [6]:
# Rows without bracket pricing that ask for more than one unit and whose
# quantity differs from the minimum order quantity. The enclosing brackets
# already let the condition span several lines, so no backslashes are needed.
train_df[
    (train_df['bracket_pricing'] == 'No')
    & (train_df['quantity'] > 1)
    & (train_df['min_order_quantity'] != train_df['quantity'])
]
Out[6]:
In [10]:
# Keep only the rows whose bracket_pricing flag is 'No'.
train_df_no_bracket = train_df[train_df['bracket_pricing'] == 'No']
In [32]:
# Compare the number of distinct tube_assembly_id values with the row count
# of the non-bracket subset.
# print(...) with a single argument works on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(train_df_no_bracket.tube_assembly_id.unique().size)
print(train_df_no_bracket.shape)
In [18]:
# Load the bill-of-materials table and preview its size, column types and a
# reproducible sample of rows.
bill_materials_df = pd.read_csv('./competition_data/bill_of_materials.csv')
# print(...) with a single argument works on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(bill_materials_df.shape)
print(bill_materials_df.dtypes)
# Fixed random_state so the previewed rows are the same on every run.
print(bill_materials_df.sample(10, random_state=0))
In [42]:
# Load the tube table from the competition data directory.
tube_df = pd.read_csv(filepath_or_buffer='./competition_data/tube.csv')
In [44]:
# Re-index each table by tube_assembly_id so the three frames can be aligned
# on that key. The three re-indexing steps are independent of one another.
tube_df_indexed_by_id = tube_df.set_index('tube_assembly_id')
bill_materials_df_indexed_by_id = bill_materials_df.set_index('tube_assembly_id')
train_df_no_bracket_indexed_by_id = train_df_no_bracket.set_index('tube_assembly_id')

# Place the frames side by side (axis=1), keeping only index labels present
# in all three (join='inner'). A DataFrame.join-based variant was tried
# earlier and is left disabled:
# model_df = train_df_no_bracket_indexed_by_id.join(bill_materials_df_indexed_by_id)
model_df = pd.concat(
    [
        train_df_no_bracket_indexed_by_id,
        bill_materials_df_indexed_by_id,
        tube_df_indexed_by_id,
    ],
    axis=1,
    join='inner',
)
In [46]:
# Inspect the combined frame: dimensions first, then per-column dtypes.
# print(...) with a single argument works on both Python 2 and Python 3; the
# bare print statement is a SyntaxError on Python 3.
print(model_df.shape)
# print(model_df.head())
print(model_df.dtypes)
In [52]:
# Feature matrix: the selected columns of model_df as a plain array.
model_X = model_df[
    ['annual_usage', 'diameter', 'wall', 'length', 'num_bends']
].values
# Target vector: the cost column as a plain array.
model_y = model_df['cost'].values
# model_X
In [55]:
# Fit an ordinary least-squares model; fit() returns the estimator itself,
# so construction and fitting can be chained.
lin_reg = linear_model.LinearRegression().fit(model_X, model_y)
# Score on the very same data the model was fitted on, so this is an
# in-sample figure rather than a held-out estimate.
lin_reg.score(model_X, model_y)
Out[55]: