In [1]:
from IPython.display import Image
%matplotlib inline
In [2]:
# Version check to handle API changes introduced in scikit-learn 0.18
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
In [3]:
import pandas as pd
from io import StringIO
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''
# If you are using Python 2.7, you need
# to convert the string to unicode:
# csv_data = unicode(csv_data)
df = pd.read_csv(StringIO(csv_data))
df
Out[3]:
In [4]:
df.isnull().sum()
Out[4]:
In [5]:
df.dropna()
Out[5]:
In [6]:
df.dropna(axis=1)
Out[6]:
In [7]:
# only drop rows where all columns are NaN
df.dropna(how='all')
Out[7]:
In [8]:
# drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)
Out[8]:
In [9]:
# only drop rows where NaN appears in specific columns (here: 'C')
df.dropna(subset=['C'])
Out[9]:
In [10]:
from sklearn.preprocessing import Imputer
# replace each NaN with the mean of its column (axis=0 imputes column-wise)
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df)
imputed_data = imr.transform(df.values)
imputed_data
Out[10]:
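Note: Imputer was removed in scikit-learn 0.22. A minimal equivalent sketch for scikit-learn 0.20 and later uses SimpleImputer, which always imputes column-wise, so there is no axis argument:
In [ ]:
import numpy as np
from sklearn.impute import SimpleImputer
# mean-impute the missing entries, column by column
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imr.fit_transform(df.values)
imputed_data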
In [11]:
df.values
Out[11]:
In [12]:
Image(filename='./images/10_01.png', width=400)
Out[12]:
In [13]:
Image(filename='./images/10_02.png', width=400)
Out[13]:
In [14]:
import pandas as pd
df = pd.DataFrame([['green', 'M', 10.1, 'class1'],
                   ['red', 'L', 13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
df
Out[14]:
In [15]:
size_mapping = {'XL': 3,
                'L': 2,
                'M': 1}
df['size'] = df['size'].map(size_mapping)
df
Out[15]:
In [16]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
df['size'].map(inv_size_mapping)
Out[16]:
In [17]:
import numpy as np
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
class_mapping
Out[17]:
In [18]:
df['classlabel'] = df['classlabel'].map(class_mapping)
df
Out[18]:
In [19]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df
Out[19]:
In [21]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y
Out[21]:
In [22]:
class_le.inverse_transform(y)
Out[22]:
In [23]:
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X
Out[23]:
In [24]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()
Out[24]:
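On scikit-learn 0.20 and later, the categorical_features argument was removed. A minimal sketch of the same encoding using ColumnTransformer (the result may come back as a sparse matrix depending on version and density):
In [ ]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# one-hot encode column 0 (color) and pass the other columns through unchanged
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                       remainder='passthrough')
ct.fit_transform(X)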
In [25]:
# An even more convenient way to create those dummy features via one-hot
# encoding is to use the get_dummies function implemented in pandas.
# Applied to a DataFrame, get_dummies will only convert string columns
# and leave all other columns unchanged:
pd.get_dummies(df[['price', 'color', 'size']])
Out[25]:
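To avoid introducing perfectly collinear dummy columns (a potential problem for unregularized models), get_dummies can drop the first level of each encoded column; a small sketch:
In [ ]:
# drop the first dummy level per categorical column
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True)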
In [23]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()
Out[23]:
In [26]:
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
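For classification tasks it is often preferable to preserve the class proportions in both splits. An alternative sketch using the stratify argument (available since scikit-learn 0.17; the original notebook used the unstratified split above):
In [ ]:
# stratified split: class frequencies in y are preserved in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)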
In [27]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)
In [28]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
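Both scalers implement simple formulas: standardization transforms each feature to x' = (x - mean) / std (zero mean, unit variance), while min-max normalization rescales to x' = (x - min) / (max - min), i.e. into the range [0, 1].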
A visual example:
In [29]:
ex = pd.DataFrame([0, 1, 2, 3, 4, 5])
# standardize
ex[1] = (ex[0] - ex[0].mean()) / ex[0].std(ddof=0)
# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and StandardScaler
# use ddof=0 (population standard deviation)
# normalize
ex[2] = (ex[0] - ex[0].min()) / (ex[0].max() - ex[0].min())
ex.columns = ['input', 'standardized', 'normalized']
ex
Out[29]:
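As a quick sanity check (a sketch), the scikit-learn scalers reproduce the manual calculations above on the same toy column:
In [ ]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
vals = ex[['input']].values.astype(float)
# should match the 'standardized' and 'normalized' columns above
print(StandardScaler().fit_transform(vals).ravel())
print(MinMaxScaler().fit_transform(vals).ravel())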
In [47]:
df_titanic = pd.read_csv('datasets/titanic_kaggle.csv', encoding='utf-8')
In [48]:
df_titanic.head()
Out[48]:
In [49]:
# overview of the features: dtypes and non-null counts
df_titanic.info()
In [50]:
# check missing values in each feature
df_titanic.isnull().sum()
Out[50]:
In [51]:
# drop columns that won't be useful for analysis or prediction
df_titanic = df_titanic.drop(['PassengerId','Name','Ticket'], axis=1)
In [56]:
df_titanic['Age'] = df_titanic['Age'].fillna(df_titanic['Age'].mean())
df_titanic.isnull().sum()
Out[56]:
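Mean imputation is sensitive to outliers; for a skewed feature such as Age, the median is a common, more robust alternative (a sketch, not what was run above):
In [ ]:
# df_titanic['Age'] = df_titanic['Age'].fillna(df_titanic['Age'].median())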
In [58]:
# Cabin has too many missing values; drop it
df_titanic = df_titanic.drop(['Cabin'], axis=1)
In [60]:
df_titanic.info()
In [69]:
# find unique values
df_titanic["Embarked"].unique()
Out[69]:
In [68]:
# find frequency of each
df_titanic["Embarked"].value_counts()
Out[68]:
In [73]:
# null values in Embarked
df_titanic["Embarked"].isnull().sum()
Out[73]:
In [72]:
# fill missing values with the most frequent value in Embarked ('S')
df_titanic['Embarked'] = df_titanic['Embarked'].fillna("S")
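The hard-coded 'S' is simply the most frequent value found above; a sketch that derives it programmatically instead:
In [ ]:
# fill with the modal value of the column
most_frequent = df_titanic['Embarked'].value_counts().idxmax()
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(most_frequent)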
In [74]:
# verify no null values remain in Embarked
df_titanic["Embarked"].isnull().sum()
Out[74]:
In [75]:
# all null values
df_titanic.isnull().sum()
Out[75]:
In [77]:
# convert gender from male and female to binary values 0 and 1
gender_mapping = {'male': 0, 'female': 1}
df_titanic['Sex'] = df_titanic['Sex'].map(gender_mapping)
In [80]:
df_titanic["Sex"].value_counts()
Out[80]:
In [86]:
# one-hot encode the remaining string column (Embarked);
# get_dummies leaves the numeric columns unchanged
df_titanic = pd.get_dummies(df_titanic)
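Since Sex was already mapped to integers, Embarked is the only string column left; an equivalent call that is explicit about which column to encode (a sketch, shown for reference only):
In [ ]:
# df_titanic = pd.get_dummies(df_titanic, columns=['Embarked'])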
In [103]:
df_titanic.info()
In [89]:
# split into train and test frames
df_train, df_test = train_test_split(df_titanic, test_size=0.3, random_state=0)
In [90]:
# split into feature matrix X and target vector y (Survived is the first column)
X_train, y_train = df_train.iloc[:, 1:].values, df_train.iloc[:, 0].values
X_test, y_test = df_test.iloc[:, 1:].values, df_test.iloc[:, 0].values
In [91]:
# standardize values
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
In [105]:
# train a logistic regression classifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))
In [32]:
Image(filename='./images/04_12.png', width=500)
Out[32]:
In [33]:
Image(filename='./images/04_13.png', width=500)
Out[33]:
In [40]:
from sklearn.linear_model import LogisticRegression
# Note: judging by the In [] numbers, this cell and the following ones were
# executed against the standardized Wine data from In [26]-[28] above, before
# the Titanic cells. On scikit-learn >= 0.22, the L1 penalty also requires
# passing solver='liblinear'.
lr = LogisticRegression(penalty='l1', C=0.1)
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))
In [41]:
lr.intercept_
Out[41]:
In [42]:
lr.coef_
Out[42]:
In [37]:
import matplotlib.pyplot as plt
fig = plt.figure()
ax = plt.subplot(111)
colors = ['blue', 'green', 'red', 'cyan',
          'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue',
          'gray', 'indigo', 'orange']
weights, params = [], []
for c in np.arange(-4, 6):
    lr = LogisticRegression(penalty='l1', C=10**c, random_state=0)
    lr.fit(X_train_std, y_train)
    # collect the weights of the second class (class 2 vs. rest)
    weights.append(lr.coef_[1])
    params.append(10**c)
weights = np.array(weights)
for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
plt.legend(loc='upper left')
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)
# plt.savefig('./figures/l1_path.png', dpi=300)
plt.show()
Raschka, Sebastian. Python machine learning. Birmingham, UK: Packt Publishing, 2015. Print.