In [1]:
# Display the SparkContext bound to `sc`. It is not created in the visible part
# of this notebook — presumably the PySpark kernel/shell provides it; confirm.
sc


Out[1]:
<pyspark.context.SparkContext at 0x104f85890>

In [11]:
# Load the Titanic train and test CSV files as RDDs (one element per text line).
file_rdd, testfile_rdd = map(
    sc.textFile,
    ('/data/titanic/train.csv', '/data/titanic/test.csv'),
)

In [34]:
# Print the dataset README. Every line read from the file still carries its own
# line ending, so strip it before printing; otherwise the output is double-spaced.
with open('/data/titanic/README.txt') as readme:
    for line in readme:
        print(line.rstrip('\r\n'))


VARIABLE DESCRIPTIONS:

survival        Survival

                (0 = No; 1 = Yes)

pclass          Passenger Class

                (1 = 1st; 2 = 2nd; 3 = 3rd)

name            Name

sex             Sex

age             Age

sibsp           Number of Siblings/Spouses Aboard

parch           Number of Parents/Children Aboard

ticket          Ticket Number

fare            Passenger Fare

cabin           Cabin

embarked        Port of Embarkation

                (C = Cherbourg; Q = Queenstown; S = Southampton)



SPECIAL NOTES:

Pclass is a proxy for socio-economic status (SES)

 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower



Age is in Years; Fractional if Age less than One (1)

 If the Age is Estimated, it is in the form xx.5



With respect to the family relation variables (i.e. sibsp and parch)

some relations were ignored.  The following are the definitions used

for sibsp and parch.



Sibling:  Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic

Spouse:   Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)

Parent:   Mother or Father of Passenger Aboard Titanic

Child:    Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic



Other family relatives excluded from this study include cousins,

nephews/nieces, aunts/uncles, and in-laws.  Some children travelled

only with a nanny, therefore parch=0 for them.  As well, some

travelled with very close friends or neighbors in a village, however,

the definitions do not support such relations.


In [36]:
# Look at a few lines of the file.
# TODO(exercise): replace __COMPLETAR__ with an RDD action; the recorded output
# below shows the header line followed by the first two passenger rows.
file_rdd.__COMPLETAR__


Out[36]:
[u'survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked',
 u'0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S',
 u'1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C']

In [15]:
# Build a new RDD without the header: first isolate the header line, then
# obtain the passengers RDD by subtracting it from the full file RDD.
# TODO(exercise): fill in the filter predicate and the name of the RDD method.
heading = file_rdd.filter(lambda x: __COMPLETAR__)
passengers_rdd = file_rdd.__COMPLETAR__(heading).cache()

In [9]:
# Alternatively, filter the header out directly.
# TODO(exercise): replace __COMPLETAR__ with the predicate passed to filter.
# NOTE(review): unlike the subtraction-based version, this one does not call
# .cache() on the resulting RDD.
passengers_rdd = file_rdd.filter(__COMPLETAR__)


Out[9]:
[u'1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C']

In [20]:
# Count how many passengers are in the file (the training file).
# According to Yahoo, 2225 people were aboard the Titanic, of whom 1522 died.
# TODO(exercise): replace __COMPLETAR__ with an RDD action that returns a number
# (the value is used as a divisor when the survival ratios are printed).
passenger_count = passengers_rdd.__COMPLETAR__

In [23]:
# Count how many passengers survived.
# TODO(exercise): replace __COMPLETAR__ with a chain that yields a dict-like
# object keyed by the survival flag as a string ('1' = survived, '0' = died);
# it is indexed with those keys below and its keys()/values() are plotted later.
d = passengers_rdd.__COMPLETAR__
# Each line shows the absolute count and its share of all passengers (2 decimals).
print("Survivors : " + str(d['1']) + ", " + str(round(float(d['1'])/passenger_count, 2)))
print("Casualties: " + str(d['0']) + ", " + str(round(float(d['0'])/passenger_count, 2)))


Survivors : 342, 0.38
Casualties: 549, 0.62

In [25]:
# Plot the counts held in `d` as a bar chart, one bar per key.

import matplotlib.pyplot as plt

bar_positions = range(len(d))
plt.bar(bar_positions, d.values(), align='center')
plt.xticks(bar_positions, d.keys())

plt.show()



In [48]:
def is_number(s):
    """Report whether ``s`` can be converted to a float.

    Args:
        s: The value to probe (for example a text field or a number).

    Returns:
        True if ``float(s)`` succeeds, False otherwise.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; ValueError: text that is not
        # a number (including ''); OverflowError: integer too large for a float.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False
    return True

In [53]:
# TODO(exercise): fill in the filter predicate, the map function and the final
# action. The recorded output of `ages` is a defaultdict mapping each age
# (as a float) to the number of times it occurs.
ages = passengers_rdd.filter(lambda x: __COMPLETAR__).map(lambda x: __COMPLETAR__).__COMPLETAR__

In [54]:
# We now have an aggregated dict.
ages


Out[54]:
defaultdict(<type 'int'>, {40.5: 2, 1.0: 7, 2.0: 10, 3.0: 6, 4.0: 10, 5.0: 4, 0.75: 2, 7.0: 3, 8.0: 4, 9.0: 8, 10.0: 2, 11.0: 4, 12.0: 1, 13.0: 2, 14.0: 6, 15.0: 5, 16.0: 17, 17.0: 13, 18.0: 26, 19.0: 25, 20.0: 15, 21.0: 24, 22.0: 27, 23.0: 15, 24.0: 30, 25.0: 23, 26.0: 18, 27.0: 18, 28.0: 25, 29.0: 20, 30.0: 25, 31.0: 17, 32.0: 18, 33.0: 15, 34.0: 15, 35.0: 18, 36.0: 22, 37.0: 6, 38.0: 11, 39.0: 14, 40.0: 13, 41.0: 6, 42.0: 13, 43.0: 5, 44.0: 9, 45.0: 12, 46.0: 3, 47.0: 9, 48.0: 9, 49.0: 6, 50.0: 10, 51.0: 7, 52.0: 6, 53.0: 1, 54.0: 8, 55.0: 2, 56.0: 4, 57.0: 2, 58.0: 5, 59.0: 2, 60.0: 4, 61.0: 3, 62.0: 4, 63.0: 2, 64.0: 2, 65.0: 3, 66.0: 1, 70.0: 2, 71.0: 2, 74.0: 1, 80.0: 1, 0.92: 1, 55.5: 1, 30.5: 2, 36.5: 1, 14.5: 1, 0.83: 2, 23.5: 1, 0.42: 1, 6.0: 3, 20.5: 1, 45.5: 2, 24.5: 1, 70.5: 1, 34.5: 1, 0.67: 1, 28.5: 2, 32.5: 2})

In [56]:
# Get the individual ages (not recommended if the dataset were very large).
# TODO(exercise): replace __COMPLETAR__ with a chain that brings the age values
# back to the driver; the result is passed straight to plt.hist below.
individual_ages = passengers_rdd.__COMPLETAR__

In [58]:
# Histogram of the individual ages; no bin count is passed, so matplotlib's
# default binning is used.
plt.hist(individual_ages)


Out[58]:
(array([ 54,  46, 177, 169, 118,  70,  45,  24,   9,   2]),
 array([  0.42 ,   8.378,  16.336,  24.294,  32.252,  40.21 ,  48.168,
         56.126,  64.084,  72.042,  80.   ]),
 <a list of 10 Patch objects>)

In [77]:
# Compute the survival rate by gender with map-reduce.
def map_supervivencia(line):
    """Map step of the survival-by-gender job (exercise placeholder).

    Applied to every element of ``passengers_rdd`` through ``map``; the result
    is fed to ``reduceByKey``, so it has to be a (key, value) pair.

    NOTE(review): judging by the cell that prints the result, the key is
    presumably the gender string and the value a pair of counts whose first
    element is the number of survivors — confirm when filling this in.
    """
    __COMPLETAR___

def reduce_supervivencia(a, b):
    """Reduce step of the survival-by-gender job (exercise placeholder).

    Passed to ``reduceByKey`` to combine two values that share the same key.

    NOTE(review): presumably adds the two count pairs element-wise — confirm
    when filling this in.
    """
    __COMPLETAR___
    

# Map every passenger line to a keyed pair, then combine the values per key.
supervivencia_genero = passengers_rdd.map(map_supervivencia).reduceByKey(reduce_supervivencia)

In [88]:
# Bring the per-key results to the driver and print each one together with the
# ratio of its first count to the total of its counts (rounded to 2 decimals).
surv = supervivencia_genero.collect()
for gender, counts in surv:
    survival_rate = round(float(counts[0]) / sum(counts), 2)
    print(gender + " survivors: " + str(counts) + "  (" + str(survival_rate) + ")")


female survivors: (233, 81)  (0.74)
male survivors: (109, 468)  (0.19)

In [ ]:
# Compute the survival rate by class