In [115]:
def output( filename, text ):
    "Writes a file"
    f = open('./answers/'+str(filename), 'w')
    f.write(str(text))
    f.close()
    return

import pandas as pd
data = pd.read_csv('./data/titanic.csv', index_col='PassengerId')

In [116]:
data


Out[116]:
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S
6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
7 0 1 McCarthy, Mr. Timothy J male 54 0 0 17463 51.8625 E46 S
8 0 3 Palsson, Master. Gosta Leonard male 2 3 1 349909 21.0750 NaN S
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 2 347742 11.1333 NaN S
10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 0 237736 30.0708 NaN C
11 1 3 Sandstrom, Miss. Marguerite Rut female 4 1 1 PP 9549 16.7000 G6 S
12 1 1 Bonnell, Miss. Elizabeth female 58 0 0 113783 26.5500 C103 S
13 0 3 Saundercock, Mr. William Henry male 20 0 0 A/5. 2151 8.0500 NaN S
14 0 3 Andersson, Mr. Anders Johan male 39 1 5 347082 31.2750 NaN S
15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14 0 0 350406 7.8542 NaN S
16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55 0 0 248706 16.0000 NaN S
17 0 3 Rice, Master. Eugene male 2 4 1 382652 29.1250 NaN Q
18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31 1 0 345763 18.0000 NaN S
20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
21 0 2 Fynney, Mr. Joseph J male 35 0 0 239865 26.0000 NaN S
22 1 2 Beesley, Mr. Lawrence male 34 0 0 248698 13.0000 D56 S
23 1 3 McGowan, Miss. Anna "Annie" female 15 0 0 330923 8.0292 NaN Q
24 1 1 Sloper, Mr. William Thompson male 28 0 0 113788 35.5000 A6 S
25 0 3 Palsson, Miss. Torborg Danira female 8 3 1 349909 21.0750 NaN S
26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38 1 5 347077 31.3875 NaN S
27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C
28 0 1 Fortune, Mr. Charles Alexander male 19 3 2 19950 263.0000 C23 C25 C27 S
29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
30 0 3 Todoroff, Mr. Lalio male NaN 0 0 349216 7.8958 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
862 0 2 Giles, Mr. Frederick Edward male 21 1 0 28134 11.5000 NaN S
863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48 0 0 17466 25.9292 D17 S
864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.5500 NaN S
865 0 2 Gill, Mr. John William male 24 0 0 233866 13.0000 NaN S
866 1 2 Bystrom, Mrs. (Karolina) female 42 0 0 236852 13.0000 NaN S
867 1 2 Duran y More, Miss. Asuncion female 27 1 0 SC/PARIS 2149 13.8583 NaN C
868 0 1 Roebling, Mr. Washington Augustus II male 31 0 0 PC 17590 50.4958 A24 S
869 0 3 van Melkebeke, Mr. Philemon male NaN 0 0 345777 9.5000 NaN S
870 1 3 Johnson, Master. Harold Theodor male 4 1 1 347742 11.1333 NaN S
871 0 3 Balkic, Mr. Cerin male 26 0 0 349248 7.8958 NaN S
872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47 1 1 11751 52.5542 D35 S
873 0 1 Carlsson, Mr. Frans Olof male 33 0 0 695 5.0000 B51 B53 B55 S
874 0 3 Vander Cruyssen, Mr. Victor male 47 0 0 345765 9.0000 NaN S
875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28 1 0 P/PP 3381 24.0000 NaN C
876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15 0 0 2667 7.2250 NaN C
877 0 3 Gustafsson, Mr. Alfred Ossian male 20 0 0 7534 9.8458 NaN S
878 0 3 Petroff, Mr. Nedelio male 19 0 0 349212 7.8958 NaN S
879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S
880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56 0 1 11767 83.1583 C50 C
881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25 0 1 230433 26.0000 NaN S
882 0 3 Markun, Mr. Johann male 33 0 0 349257 7.8958 NaN S
883 0 3 Dahlberg, Miss. Gerda Ulrika female 22 0 0 7552 10.5167 NaN S
884 0 2 Banfield, Mr. Frederick James male 28 0 0 C.A./SOTON 34068 10.5000 NaN S
885 0 3 Sutehall, Mr. Henry Jr male 25 0 0 SOTON/OQ 392076 7.0500 NaN S
886 0 3 Rice, Mrs. William (Margaret Norton) female 39 0 5 382652 29.1250 NaN Q
887 0 2 Montvila, Rev. Juozas male 27 0 0 211536 13.0000 NaN S
888 1 1 Graham, Miss. Margaret Edith female 19 0 0 112053 30.0000 B42 S
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30.0000 C148 C
891 0 3 Dooley, Mr. Patrick male 32 0 0 370376 7.7500 NaN Q

891 rows × 11 columns


In [117]:
malesCount = data[(data.Sex=="male")].Sex.count()
femalesCount = data[(data.Sex=="female")].Sex.count()
print malesCount
print femalesCount


577
314

In [118]:
total = len(data)
check01 = femalesCount + malesCount == total
print "Q1 Check =", check01
print "Q1 = [",malesCount,femalesCount,"]"
output('q1_1.txt',str(malesCount)+" "+str(femalesCount))


Q1 Check = True
Q1 = [ 577 314 ]

In [119]:
survivedCount = data[(data.Survived==1)].Survived.count()
deadCount = data[(data.Survived==0)].Survived.count()
print survivedCount
print deadCount


342
549

In [120]:
check02 = survivedCount + deadCount == total
percent = float(survivedCount)/float(total)*100.0
print "Q2 Check =", check02, survivedCount, total
print "Q2 = [",(float(percent)),"]"
output('q1_2.txt',str(percent))


Q2 Check = True 342 891
Q2 = [ 38.3838383838 ]

In [121]:
firstClassCount = data[(data.Pclass==1)].Pclass.count()
print firstClassCount


216

In [122]:
percent = float(firstClassCount)/float(total)*100.0
print "Q3 = [",(float(percent)),"]"
output('q1_3.txt',str(percent))


Q3 = [ 24.2424242424 ]

In [123]:
meanAge = data.Age.mean()
medianAge = data.Age.median()
print "Q4_mean = [",meanAge,"]"
print "Q4_median = [",medianAge,"]"
output('q1_4.txt',str(meanAge)+" "+str(medianAge))


Q4_mean = [ 29.6991176471 ]
Q4_median = [ 28.0 ]

In [124]:
import numpy as np
a = np.corrcoef(data.SibSp, data.Parch)[0, 1]
a
print "Q5 = [",a,"]"
output('q1_5.txt',str(a))


Q5 = [ 0.41483769862 ]

In [ ]:


In [125]:
import scipy.stats as sc
b = sc.pearsonr(data.SibSp, data.Parch)
b
print "Q5 = [",b,"]"


Q5 = [ (0.41483769862015613, 2.2418236681398336e-38) ]

In [126]:
import re
del data['Pclass']
del data['SibSp']
del data['Parch']
del data['Ticket']
females = data[(data.Sex=="female")]

In [132]:
females['Surname'] = females.Name.str.split(',', expand=True)[0]
females['Names'] = females.Name.str.split(',', expand=True)[1]
females['Origin'] = females.Names.str.split('.', expand=True)[0]
females['Names'] = females.Names.str.split('.', expand=True)[1]
females['NamesD'] = females.Names.str.contains('(')===true and females.Names.str.split('(', expand=True)[1] or females.Names.str.split('(', expand=True)[0]
females


C:\Users\Kirill.Minenko\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
C:\Users\Kirill.Minenko\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Users\Kirill.Minenko\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
C:\Users\Kirill.Minenko\Anaconda2\lib\site-packages\ipykernel\__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
---------------------------------------------------------------------------
error                                     Traceback (most recent call last)
<ipython-input-132-40d9b16aec5f> in <module>()
      3 females['Origin'] = females.Names.str.split('.', expand=True)[0]
      4 females['Names'] = females.Names.str.split('.', expand=True)[1]
----> 5 females['NamesD'] = females.Names.str.contains('(') and females.Names.str.split('(', expand=True)[1] or females.Names.str.split('(', expand=True)[0]
      6 females

C:\Users\Kirill.Minenko\Anaconda2\lib\site-packages\pandas\core\strings.py in contains(self, pat, case, flags, na, regex)
   1246     def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
   1247         result = str_contains(self._data, pat, case=case, flags=flags,
-> 1248                               na=na, regex=regex)
   1249         return self._wrap_result(result)
   1250 

C:\Users\Kirill.Minenko\Anaconda2\lib\site-packages\pandas\core\strings.py in str_contains(arr, pat, case, flags, na, regex)
    203             flags |= re.IGNORECASE
    204 
--> 205         regex = re.compile(pat, flags=flags)
    206 
    207         if regex.groups > 0:

C:\Users\Kirill.Minenko\Anaconda2\lib\re.pyc in compile(pattern, flags)
    192 def compile(pattern, flags=0):
    193     "Compile a regular expression pattern, returning a pattern object."
--> 194     return _compile(pattern, flags)
    195 
    196 def purge():

C:\Users\Kirill.Minenko\Anaconda2\lib\re.pyc in _compile(*key)
    249         p = sre_compile.compile(pattern, flags)
    250     except error, v:
--> 251         raise error, v # invalid expression
    252     if not bypass_cache:
    253         if len(_cache) >= _MAXCACHE:

error: unbalanced parenthesis

In [ ]: