In [15]:
%matplotlib inline
import pyspark
import matplotlib.pyplot as plt
import numpy as np
import scipy
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
import pandas as pd
In [16]:
sc = pyspark.SparkContext("local", "task")
In [17]:
from math import isnan
from operator import or_
def parsePoints(df):
    """Convert the Titanic training DataFrame into a list of LabeledPoint.

    Features (in order): age, passenger class, name length, sex (1.0 male /
    0.0 otherwise), siblings/spouses aboard, parents/children aboard, fare.
    Rows with NaN in any feature are skipped entirely.

    :param df: pandas DataFrame with columns Survived, Age, Pclass, Name,
               Sex, SibSp, Parch, Fare (as in Kaggle's train.csv).
    :return: list of pyspark.mllib.regression.LabeledPoint
    """
    data = list()
    # `range` (not `xrange`) works on both Python 2 and 3.
    for i in range(len(df)):
        survived = float(df["Survived"][i])
        # BUG FIX: the original bound `age_` but used an undefined `age`
        # in the feature list, raising NameError on every call.
        age = float(df["Age"][i])
        # Example of imputing missing ages instead of dropping the row:
        # age = 23.0 if isnan(age) else age
        pClass = float(df["Pclass"][i])
        nameL = float(len(df["Name"][i]))
        sex = 1.0 if df["Sex"][i] == "male" else 0.0
        sibSp = float(df["SibSp"][i])
        parch = float(df["Parch"][i])
        fare = float(df["Fare"][i])
        features = [age, pClass, nameL, sex, sibSp, parch, fare]
        # Drop rows with any missing feature value; `any` + generator is
        # equivalent to the original reduce(or_, map(isnan, ...)) but also
        # works on Python 3, where `reduce` is no longer a builtin.
        if any(isnan(f) for f in features):
            continue
        p = LabeledPoint(survived, features)
        data.append(p)
    return data
# Load the Titanic training set and convert it to LabeledPoints
# (parsePoints drops rows containing NaN features).
df = pd.read_csv("./train.csv")
data = parsePoints(df)
print "N:", len(data)
# 50/50 train/test split of the parallelized data, with a fixed seed
# for reproducibility.  NOTE: `2l` is a Python 2 long literal; this
# cell (and its print statements) is Python 2 only.
train, test = sc.parallelize(data).randomSplit([0.5, 0.5], seed=2l)
# Report class balance in each split (label 1.0 = survived, 0.0 = did not).
print "Train positive:", train.filter(lambda p: p.label == 1.0).count()
print "Train negative:", train.filter(lambda p: p.label == 0.0).count()
print "Test positive:", test.filter(lambda p: p.label == 1.0).count()
print "Test negative:", test.filter(lambda p: p.label == 0.0).count()
In [18]:
### data: [(real, predicted)]
def roc_curve(data):
    """Plot a ROC curve, with the AUC annotated on the figure.

    :param data: list of (real, predicted) pairs — true binary labels
                 paired with classifier scores.
    """
    # Local import: inside this scope the sklearn function deliberately
    # shadows this function's own name.
    from sklearn.metrics import roc_curve, auc
    labels = [real for real, _ in data]
    scores = [pred for _, pred in data]
    fpr, tpr, thresholds = roc_curve(labels, scores)
    area = auc(fpr, tpr)
    # Draw everything through the Axes API instead of the pyplot
    # state machine; the axes created here is the current one either way.
    fig = plt.figure(figsize=(7, 5))
    ax = fig.add_subplot(111)
    ax.plot(fpr, tpr, color='lightblue', lw=2, label='ROC curve')
    ax.plot([0, 1], [0, 1], color='black', lw=2, linestyle='dotted', label='random guessing')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic (ROC)')
    ax.legend(loc="lower right")
    ax.annotate('AUC = %0.2f' % area, xy=(0.35, 0.6))
    plt.show()
In [19]:
# Scatter the first feature (age — see the feature order in parsePoints)
# against the survival label to eyeball class separability.
xs = train.map(lambda p: p.features[0]).collect()
ys = train.map(lambda p: p.label).collect()
plt.figure()
# Labels are 0.0/1.0; pad the y-axis so the markers are not clipped.
plt.ylim([-0.1, 1.1])
plt.plot(xs, ys, "x")
plt.show()
All tasks should be done using Spark and MLlib where possible.
Try different strategies for handling missing (NaN) values (see the parsePoints function) in the age column and compare results.
In [ ]: