In [40]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import sklearn.model_selection
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.metrics
In [41]:
# Path of the spreadsheet that holds the animal measurements.
flying_animal_data_filename = "data/report-201707170745.xls"

# Column labels as they appear in the spreadsheet.
SPECIES_LBL = "Species"
FLIES_LBL = "flies"                       # compared against 'YES' when filtering
MASS_LBL = "Mass - average - g"
# Label of the derived column (square root of MASS_LBL) added after loading.
SQRT_MASS_LBL = "sqrt mass"
LENGTH_LBL = "Length - average - mm"
WINGSPAN_LBL = "Wingspan - average - mm"  # regression target
In [42]:
# Load the raw spreadsheet into a DataFrame.  The path is handed straight to
# pandas: an .xls workbook is a binary file, so wrapping it in a text-mode
# open(..., 'r') handle is unnecessary and breaks on Python 3 / Windows.
df = pd.read_excel(flying_animal_data_filename)
In [43]:
# Report how many rows the raw spreadsheet contains.  The %-format form prints
# the same text as a Python 2 print statement and also parses on Python 3.
print("Dataset Rows %s" % len(df))
# Keep only rows flagged 'YES' in the flies column, then drop every row that
# has a missing value in any column.  The explicit copy() makes the result an
# independent frame, so the column assignment below is not a write into a
# slice of df (the pattern pandas reports as SettingWithCopyWarning).
flying_animals = df[df[FLIES_LBL] == 'YES'].dropna().copy()
# Derived feature: square root of the average-mass column.
flying_animals[SQRT_MASS_LBL] = np.sqrt(flying_animals[MASS_LBL])
print("Flying animal Rows %s" % len(flying_animals))
# Hold out 20% of the flying animals for testing; the fixed random_state keeps
# the split identical from run to run.
train_flying_animals, test_flying_animals = sklearn.model_selection.train_test_split(
    flying_animals, test_size=0.2, random_state=34)
In [44]:
# Look at the extremes: the two heaviest animals in the training split.
# DataFrame.sort(columns=...) was deprecated in pandas 0.17 and later removed;
# sort_values(by=...) is its replacement and yields the same ordering.
train_flying_animals.sort_values(by=MASS_LBL, ascending=False).head(n=2)
Out[44]:
In [45]:
# Scatter of raw average mass against average wingspan (training split only).
train_flying_animals.plot.scatter(x=MASS_LBL, y=WINGSPAN_LBL, figsize=(15, 8))
Out[45]:
In [46]:
# Scatter of sqrt(mass) against average wingspan (training split only).
train_flying_animals.plot.scatter(x=SQRT_MASS_LBL, y=WINGSPAN_LBL, figsize=(15, 8))
Out[46]:
In [47]:
# Scatter of average body length against average wingspan (training split only).
train_flying_animals.plot.scatter(x=LENGTH_LBL, y=WINGSPAN_LBL, figsize=(15, 8))
Out[47]:
In [48]:
# Model with two features: fit a linear regression that predicts the wingspan
# column from the sqrt(mass) and length columns of the training split.
feature_names = [SQRT_MASS_LBL, LENGTH_LBL]
output_name = WINGSPAN_LBL
model = sklearn.linear_model.LinearRegression()
model.fit(train_flying_animals[feature_names], train_flying_animals[output_name])
# LinearRegression.score returns the coefficient of determination (R^2).
# Comparing the train and held-out test values shows how well the fit
# generalizes.  The %-format form prints the same text as a Python 2 print
# statement and also parses on Python 3.
print("Train Score %s" % model.score(
    train_flying_animals[feature_names], train_flying_animals[output_name]))
print("Test Score %s" % model.score(
    test_flying_animals[feature_names], test_flying_animals[output_name]))
In [54]:
# Fitted coefficients, one per entry of feature_names and in the same order
# (sqrt mass, then length).  The parenthesized form prints identically on
# Python 2 and is valid syntax on Python 3.
print(model.coef_)
In [53]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: