In [ ]:
import pandas as pd
%matplotlib inline
In [ ]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
In [ ]:
df
In [ ]:
pd.get_dummies(df['key'],prefix='key')
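In [ ]:
# A common follow-up (a sketch, not part of the original notebook): join the dummy
# columns back onto the toy frame so 'data1' and the key_* indicators sit side by side.
df.join(pd.get_dummies(df['key'], prefix='key'))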
In [ ]:
df = pd.read_csv('data/ontime_reports_may_2015_ny.csv')
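In [ ]:
# Optional quick look at the load (assumes the on-time CSV contains the DEP_DELAY
# and ORIGIN columns used below).
print(df.shape)
df.head()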
In [ ]:
#count number of NaNs in column
df['DEP_DELAY'].isnull().sum()
In [ ]:
#calculate the fraction of rows this represents out of the total number of instances
df['DEP_DELAY'].isnull().sum()/len(df)
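In [ ]:
# Equivalent shorthand: the mean of the boolean isnull() mask is the fraction of
# missing values directly.
df['DEP_DELAY'].isnull().mean()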
In [ ]:
#filter DEP_DELAY NaNs
df = df[pd.notnull(df['DEP_DELAY'])]
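In [ ]:
# An equivalent way to drop those rows is dropna with a subset (shown commented out
# as an alternative sketch); then verify that no NaNs remain.
# df = df.dropna(subset=['DEP_DELAY'])
df['DEP_DELAY'].isnull().sum()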
In [ ]:
#code whether delay or not delayed
df['IS_DELAYED'] = df['DEP_DELAY'].apply(lambda x: 1 if x > 0 else 0)
In [ ]:
#Let's check that our column was created properly
df[['DEP_DELAY','IS_DELAYED']]
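In [ ]:
# Optional sanity check: how many flights fall into each class of the new 0/1 column.
df['IS_DELAYED'].value_counts()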
In [ ]:
###Dummy variables create a separate 0/1 indicator column for each distinct category value
In [ ]:
pd.get_dummies(df['ORIGIN'],prefix='origin')
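In [ ]:
# To use the origin dummies in a model, one common pattern (a sketch, not from the
# original notebook; 'model_df' is a hypothetical name) is to concatenate them with
# the target column.
model_df = pd.concat([df['IS_DELAYED'], pd.get_dummies(df['ORIGIN'], prefix='origin')], axis=1)
model_df.head()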
In [ ]:
# Normalize the data attributes for the Iris dataset
# Example from Jump Start Scikit Learn https://machinelearningmastery.com/jump-start-scikit-learn/
from sklearn.datasets import load_iris
from sklearn import preprocessing

# load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# normalize the data attributes (by default this rescales each row/sample to unit L2 norm)
normalized_X = preprocessing.normalize(X)
In [ ]:
# list() is needed to materialize the pairs under Python 3, where zip is lazy;
# show just the first few (original row, normalized row) pairs
list(zip(X, normalized_X))[:5]
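In [ ]:
# preprocessing.normalize (with its default L2 norm) rescales each row/sample to unit
# length rather than scaling each feature; a quick check of the row norms shows this.
import numpy as np
np.linalg.norm(normalized_X, axis=1)[:10]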