In [185]:
import pandas as pd
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
import statsmodels.formula.api as smf # package we'll be using for linear regression
%matplotlib inline
In [186]:
# Load the height/weight CSV into a DataFrame (path is relative to the notebook).
df = pd.read_csv('../data/heights_weights_genders.csv')
# Preview the first three rows as the cell output.
df.head(3)
Out[186]:
In [187]:
# Eyeball the relationship: Weight on the y-axis against Height on the x-axis.
df.plot.scatter(x="Height", y="Weight")
Out[187]:
In [188]:
# Fit an ordinary-least-squares line of Weight on Height.
lm = smf.ols(formula="Weight~Height", data=df).fit()
# Read the coefficients by label rather than by position, so the two values
# cannot be silently swapped if the parameter order ever differs.
intercept = lm.params["Intercept"]
height = lm.params["Height"]  # slope of the fitted line
# Show the fitted parameters as the cell output; a bare expression in the
# middle of a cell is evaluated but never displayed.
lm.params
In [189]:
# Predict a weight from the fitted regression line.
def simplest_predictor(user_height, height, intercept):
    """Evaluate the line ``height * user_height + intercept``.

    Every argument is passed through ``float()`` first, so numeric strings
    (such as the text returned by ``input()``) are accepted.

    Parameters
    ----------
    user_height : the height to predict for.
    height : the slope of the line.
    intercept : the intercept of the line.

    Returns
    -------
    float
        The predicted weight.
    """
    slope = float(height)
    offset = float(intercept)
    x = float(user_height)
    prediction = slope * x + offset
    return prediction
In [190]:
# Ask for a height and report the regression-based estimate.
user_height = input("Please enter your height in inches: ")
predicted_weight = simplest_predictor(user_height, height, intercept)
print("Your weight will probably be around", predicted_weight, "pounds.")
In [191]:
# But how could I do that without using the built-in formulas?
# Build a small lookup table: for every whole inch from 54 to 78, store the
# median Weight of the rows whose Height falls in [inch, inch + 1).
# The lower bound is inclusive so that a Height of exactly `inch` lands in its
# own bin instead of being excluded from every bin.
inches = range(54, 79)
ref_dict = {
    inch: df.loc[(df["Height"] >= inch) & (df["Height"] < inch + 1), "Weight"].median()
    for inch in inches
}
In [192]:
# Function to print out the corresponding weight value
def simple_predictor(height, ref=None):
    """Print the reference weight for a height given in inches.

    The height is truncated to a whole inch and looked up in a table whose
    keys are whole inches and whose values are weights.

    Parameters
    ----------
    height : str or number
        Height in inches; anything ``float()`` accepts.
    ref : dict, optional
        Lookup table mapping whole inches to a weight. Defaults to the
        notebook-level ``ref_dict``.

    Returns
    -------
    None
        The result (or an apology) is printed, not returned.
    """
    table = ref_dict if ref is None else ref

    # Parse inside the guard so non-numeric input gets the friendly message
    # instead of an uncaught ValueError.
    try:
        height = float(height)
    except (TypeError, ValueError):
        print("Sorry, we don't understand your input.")
        return
    if height != height:  # NaN parses as a float but is not a usable height
        print("Sorry, we don't understand your input.")
        return

    if height > 78 or height < 54:
        print("Sorry, we don't have enough data to compute your weight.")
        return

    # Keys are whole inches, so 70.5 must be looked up as 70; indexing with
    # the raw float would miss the table for every fractional height.
    weight = table.get(int(height))
    if weight is None or weight != weight:  # bin missing, or its value is NaN
        print("Sorry, we don't have enough data to compute your weight.")
        return

    print("Your weight is probably around", round(weight), "pounds.")
In [193]:
# Input the data
# Keep the raw answer under its own name: `height` already holds the
# regression slope passed to simplest_predictor, and overwriting it here would
# break that cell if it is re-run afterwards.
height_input = input("Please enter your Height in Inches: ")
simple_predictor(height_input)
In [210]:
# The lookup-table predictor only echoes the data it was built from, so it is
# useless wherever there is no data. Let's try to find another way.
# One row per whole inch: the median Weight, the inch divided by that Weight
# ('divided'), and the previous row's 'divided' minus this row's ('intercept').
df_ref = (
    pd.DataFrame(ref_dict, index=['Weight'])
    .transpose()
    .assign(divided=lambda t: t.index / t['Weight'])
    .assign(intercept=lambda t: t['divided'].shift(1) - t['divided'])
)
df_ref
Out[210]:
In [211]:
# Let's create a 'magic number' out of the 'divided' column.
# 'divided' is Height / Weight, so unknown inputs are estimated with the
# formula: Weight = Height / divided (plus the median 'intercept' as an offset).
magicnumber = float(df_ref['divided'].median())
magicintercept = float(df_ref['intercept'].median())
Out[211]:
In [212]:
def simple_predictor_2(height2, magicnumber, intercept=None):
    """Estimate a weight as ``height2 / magicnumber + intercept``.

    Parameters
    ----------
    height2 : str or number
        The height to predict for; converted with ``float()``.
    magicnumber : number
        Height-to-weight ratio to divide by.
    intercept : number, optional
        Offset added to the result. When omitted, the notebook-level
        ``magicintercept`` is used, which keeps existing two-argument calls
        working while letting callers pass the offset explicitly instead of
        relying on hidden global state.

    Returns
    -------
    float
        The estimated weight.

    Raises
    ------
    ValueError
        If ``height2`` cannot be converted to a float.
    ZeroDivisionError
        If ``magicnumber`` is zero.
    """
    height2 = float(height2)
    offset = magicintercept if intercept is None else intercept
    return height2 / magicnumber + offset
In [213]:
# Ask for a height and report the ratio-based estimate.
height2 = input("Please enter your height in inches: ")
estimated_weight = simple_predictor_2(height2, magicnumber)
print("Your weight is probably around", estimated_weight, "pounds.")