In [1]:
# Display the SparkContext to confirm Spark is available.
# NOTE(review): `sc` is not created in this notebook; presumably it is
# injected by the PySpark shell/kernel -- confirm before a fresh run.
sc


Out[1]:
<pyspark.context.SparkContext at 0x104f85890>

In [11]:
# Load the Titanic train and test CSV files as RDDs of text lines.
file_rdd = sc.textFile('/data/titanic/train.csv')
testfile_rdd = sc.textFile('/data/titanic/test.csv')

In [34]:
# Print the dataset README.
# Every line read from the file already ends with a newline; strip it before
# printing, otherwise `print` appends a second one and the whole README comes
# out double-spaced.
with open('/data/titanic/README.txt') as readme:
    for line in readme:
        print(line.rstrip('\r\n'))


VARIABLE DESCRIPTIONS:

survival        Survival

                (0 = No; 1 = Yes)

pclass          Passenger Class

                (1 = 1st; 2 = 2nd; 3 = 3rd)

name            Name

sex             Sex

age             Age

sibsp           Number of Siblings/Spouses Aboard

parch           Number of Parents/Children Aboard

ticket          Ticket Number

fare            Passenger Fare

cabin           Cabin

embarked        Port of Embarkation

                (C = Cherbourg; Q = Queenstown; S = Southampton)



SPECIAL NOTES:

Pclass is a proxy for socio-economic status (SES)

 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower



Age is in Years; Fractional if Age less than One (1)

 If the Age is Estimated, it is in the form xx.5



With respect to the family relation variables (i.e. sibsp and parch)

some relations were ignored.  The following are the definitions used

for sibsp and parch.



Sibling:  Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic

Spouse:   Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)

Parent:   Mother or Father of Passenger Aboard Titanic

Child:    Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic



Other family relatives excluded from this study include cousins,

nephews/nieces, aunts/uncles, and in-laws.  Some children travelled

only with a nanny, therefore parch=0 for them.  As well, some

travelled with very close friends or neighbors in a village, however,

the definitions do not support such relations.


In [36]:
# Peek at the header line and the first two data rows with take(3)
file_rdd.take(3)


Out[36]:
[u'survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked',
 u'0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S',
 u'1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C']

In [15]:
# Build a header-free RDD by set difference: first isolate the header line
# (the only line expected to contain the leading column names), then subtract
# it from the full file and cache the result for reuse.
heading = file_rdd.filter(lambda row: 'survived,pclass,' in row)
passengers_rdd = (
    file_rdd
    .subtract(heading)
    .cache()
)

In [9]:
# Alternatively, drop the header directly: keep every line that does NOT
# contain the header's leading column names. (The predicate must be negated;
# with a plain `in` the RDD would contain only the header line.)
passengers_rdd = file_rdd.filter(lambda x: 'survived,pclass,' not in x)


Out[9]:
[u'1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C']

In [20]:
# Count how many passengers are in the file (the training file).
# For reference: according to Yahoo there were 2225 people on the Titanic,
# of whom 1522 died.
passenger_count = passengers_rdd.count()

In [23]:
# Count survivors and casualties: key every row by its first CSV field (the
# `survived` flag, '1' or '0') and count the rows per key. Each report line
# shows the absolute count and its share of all passengers, rounded to 2 places.
d = passengers_rdd.map(lambda row: (row.split(',')[0], None)).countByKey()
for label, flag in (("Survivors : ", '1'), ("Casualties: ", '0')):
    print(label + str(d[flag]) + ", " + str(round(float(d[flag]) / passenger_count, 2)))


Survivors : 342, 0.38
Casualties: 549, 0.62

In [25]:
# Plot the outcome counts held in `d` as a bar chart (one bar per key).

import matplotlib.pyplot as plt

# keys() and values() of an unmodified dict iterate in the same order,
# so each bar lines up with its tick label.
plt.bar(range(len(d)), d.values(), align='center')
plt.xticks(range(len(d)), d.keys())

plt.show()



In [48]:
def is_number(s):
    """Tell whether ``s`` can be converted to a float.

    Args:
        s: Any value; typically a raw CSV field (string).

    Returns:
        True if ``float(s)`` succeeds, False otherwise. Note that strings
        such as 'nan' or 'inf' are accepted by ``float`` and therefore
        count as numbers here.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

In [53]:
# Histogram of ages as a dict {age: number of passengers}.
# The row is split naively on commas; in the sample rows the quoted name
# contains a comma, so the age ends up at index 5. Rows whose age field is
# not numeric (e.g. empty) are dropped.
ages = (
    passengers_rdd
    .map(lambda row: row.split(',')[5])
    .filter(is_number)
    .map(lambda age: (float(age), None))
    .countByKey()
)

In [54]:
# The result is an aggregated dict: age -> number of passengers of that age
ages


Out[54]:
defaultdict(<type 'int'>, {40.5: 2, 1.0: 7, 2.0: 10, 3.0: 6, 4.0: 10, 5.0: 4, 0.75: 2, 7.0: 3, 8.0: 4, 9.0: 8, 10.0: 2, 11.0: 4, 12.0: 1, 13.0: 2, 14.0: 6, 15.0: 5, 16.0: 17, 17.0: 13, 18.0: 26, 19.0: 25, 20.0: 15, 21.0: 24, 22.0: 27, 23.0: 15, 24.0: 30, 25.0: 23, 26.0: 18, 27.0: 18, 28.0: 25, 29.0: 20, 30.0: 25, 31.0: 17, 32.0: 18, 33.0: 15, 34.0: 15, 35.0: 18, 36.0: 22, 37.0: 6, 38.0: 11, 39.0: 14, 40.0: 13, 41.0: 6, 42.0: 13, 43.0: 5, 44.0: 9, 45.0: 12, 46.0: 3, 47.0: 9, 48.0: 9, 49.0: 6, 50.0: 10, 51.0: 7, 52.0: 6, 53.0: 1, 54.0: 8, 55.0: 2, 56.0: 4, 57.0: 2, 58.0: 5, 59.0: 2, 60.0: 4, 61.0: 3, 62.0: 4, 63.0: 2, 64.0: 2, 65.0: 3, 66.0: 1, 70.0: 2, 71.0: 2, 74.0: 1, 80.0: 1, 0.92: 1, 55.5: 1, 30.5: 2, 36.5: 1, 14.5: 1, 0.83: 2, 23.5: 1, 0.42: 1, 6.0: 3, 20.5: 1, 45.5: 2, 24.5: 1, 70.5: 1, 34.5: 1, 0.67: 1, 28.5: 2, 32.5: 2})

In [56]:
# Collect the individual ages on the driver (not advisable if the dataset
# were very large). Same naive comma split as above: index 5 is the age in
# rows whose quoted name contains one comma; non-numeric ages are skipped.
individual_ages = (
    passengers_rdd
    .map(lambda row: row.split(',')[5])
    .filter(is_number)
    .map(lambda age: float(age))
    .collect()
)

In [58]:
# Histogram of the collected passenger ages, using matplotlib's default binning.
plt.hist(individual_ages)


Out[58]:
(array([ 54,  46, 177, 169, 118,  70,  45,  24,   9,   2]),
 array([  0.42 ,   8.378,  16.336,  24.294,  32.252,  40.21 ,  48.168,
         56.126,  64.084,  72.042,  80.   ]),
 <a list of 10 Patch objects>)

In [77]:
# Survival rate by gender
def map_supervivencia(line):
    """Map one CSV line to a ``(gender, (survived, died))`` pair.

    The line is split naively on commas: field 0 is the survival flag and
    field 4 is used as the gender (the quoted name in the sample rows
    contains a comma, which shifts every later column by one).

    Args:
        line: One raw text line of the passengers file.

    Returns:
        ``(gender, (1, 0))`` when the survival flag is ``'1'``,
        ``(gender, (0, 1))`` otherwise.

    Raises:
        IndexError: if the line has fewer than five comma-separated pieces.
    """
    fields = line.split(',')
    gender = fields[4]
    tally = (1, 0) if fields[0] == '1' else (0, 1)
    return (gender, tally)

def reduce_supervivencia(a, b):
    """Add two ``(survived, died)`` tallies position by position.

    Args:
        a: First tally; only positions 0 and 1 are read.
        b: Second tally; only positions 0 and 1 are read.

    Returns:
        A 2-tuple ``(a[0] + b[0], a[1] + b[1])``.
    """
    survived = a[0] + b[0]
    died = a[1] + b[1]
    return (survived, died)
    

# Per-key (survived, died) totals: map every row to a (key, tally) pair and
# sum the tallies of rows that share a key.
supervivencia_genero = passengers_rdd.map(map_supervivencia).reduceByKey(reduce_supervivencia)

In [88]:
# Bring the per-gender tallies to the driver and print, for each gender, the
# (survived, died) tuple plus the survived share rounded to two decimals.
surv = supervivencia_genero.collect()
for k in surv:
    label, tally = k[0], k[1]
    share = round(float(tally[0]) / sum(tally), 2)
    print(label + " survivors: " + str(tally) + "  (" + str(share) + ")")


female survivors: (233, 81)  (0.74)
male survivors: (109, 468)  (0.19)

In [89]:
# Survival rate by passenger class
def map_supervivencia(line):
    """Map one CSV line to a ``(class, (survived, died))`` pair.

    NOTE: this redefines (shadows) the gender-keyed function of the same
    name defined earlier in the notebook.

    The line is split on commas: field 0 is the survival flag and field 1
    is the passenger class.

    Args:
        line: One raw text line of the passengers file.

    Returns:
        ``(clase, (1, 0))`` when the survival flag is ``'1'``,
        ``(clase, (0, 1))`` otherwise.

    Raises:
        IndexError: if the line contains no comma at all.
    """
    fields = line.split(',')
    clase = fields[1]
    tally = (1, 0) if fields[0] == '1' else (0, 1)
    return (clase, tally)

def reduce_supervivencia(a, b):
    """Combine two ``(survived, died)`` tallies by summing each position.

    Args:
        a: First tally; only positions 0 and 1 are read.
        b: Second tally; only positions 0 and 1 are read.

    Returns:
        A 2-tuple ``(a[0] + b[0], a[1] + b[1])``.
    """
    survived_total = a[0] + b[0]
    died_total = a[1] + b[1]
    return (survived_total, died_total)
    

# Per-key (survived, died) totals: map every row to a (key, tally) pair and
# sum the tallies of rows that share a key.
supervivencia_clase = passengers_rdd.map(map_supervivencia).reduceByKey(reduce_supervivencia)

In [91]:
# Bring the per-class tallies to the driver, order them by class, and print
# for each class the (survived, died) tuple plus the survived share rounded
# to two decimals.
surv = supervivencia_clase.collect()
surv.sort()
for k in surv:
    label, tally = k[0], k[1]
    share = round(float(tally[0]) / sum(tally), 2)
    print(label + " survivors: " + str(tally) + "  (" + str(share) + ")")


1 survivors: (136, 80)  (0.63)
2 survivors: (87, 97)  (0.47)
3 survivors: (119, 372)  (0.24)

In [ ]:
# MLlib (Spark's machine-learning library) -- section not written yet