Import necessary packages


In [1]:
import exploringShipLogbooks
import zipfile

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import exploringShipLogbooks.wordcount as wc

from exploringShipLogbooks.basic_utils import clean_data
from exploringShipLogbooks.basic_utils import remove_undesired_columns

In [2]:
import rpy2
from rpy2.robjects import pandas2ri
# Switch on rpy2's pandas conversion layer. The object returned by the R call
# in the load cell is later used through pandas methods (to_pickle, head, drop),
# so it is presumably converted to a pandas DataFrame by this -- TODO confirm
# for the installed rpy2 version.
pandas2ri.activate()

Import data using R


In [3]:
# Locate the SPSS export inside the installed package's `data` directory.
data_path = op.join(exploringShipLogbooks.__path__[0], 'data')
filename = op.join(data_path, 'tastdb-exp-2010.sav')

# The path is pasted into an R string literal, where a backslash starts an
# escape sequence; hand R a forward-slash path instead (R accepts forward
# slashes on every platform, so this is a no-op on POSIX paths).
r_filename = filename.replace('\\', '/')

# Read the .sav file with R's `foreign::read.spss`, asking for a data frame.
data = rpy2.robjects.r('foreign::read.spss("%s", to.data.frame=TRUE)' % r_filename)


/Applications/miniconda3/lib/python3.5/site-packages/rpy2/robjects/functions.py:106: UserWarning: re-encoding from latin1

  res = super(Function, self).__call__(*new_args, **new_kwargs)
  • Save the data loaded from R so it can be used on a Windows computer

In [6]:
# Build the target with op.join: plain string concatenation omitted the path
# separator, so the pickle was written beside the data directory
# ('.../datatastdb-exp-2010') instead of inside it.
data.to_pickle(op.join(data_path, 'tastdb-exp-2010'))

In [7]:
# Persist the frame in an HDF5 store under the key 'df'. The context manager
# closes the store on exit, so the data is flushed to disk and no open file
# handle is left behind in the kernel.
with pd.HDFStore(op.join(data_path, 'tastdb-exp-2010.h5')) as store:
    store['df'] = data

In [8]:
# Reload the frame from the HDF5 store (key 'df'); this is the entry point
# when R/rpy2 is not available. The store is closed as soon as the frame has
# been read instead of staying open for the rest of the session.
with pd.HDFStore(op.join(data_path, 'tastdb-exp-2010.h5')) as store:
    data = store['df']

In [9]:
# Preview the first rows of the raw table to sanity-check the load.
data.head()


Out[9]:
voyageid evgreen shipname placcons constreg yrcons placreg regisreg yrreg national ... adlt2imp chil2imp male2imp feml2imp filter_. ncartot pctemb slastot pctdis datepl
1 1 1 Pastora de Lima ... Alicante Spain NaN Alicante Spain NaN Portugal ... NaN NaN NaN NaN Selected NaN NaN NaN NaN Invoice date at port of departure
2 2 1 Tibério ... Alicante Spain NaN Alicante Spain NaN Spain ... NaN NaN NaN NaN Not Selected NaN NaN NaN NaN Invoice date at port of departure
3 3 1 Paquete Real ... Alicante Spain NaN Alicante Spain NaN Spain ... NaN NaN NaN NaN Selected NaN NaN NaN NaN Guia de despacho (added)
4 4 1 Bom Caminho ... Alicante Spain NaN Alicante Spain NaN Spain ... NaN NaN NaN NaN Selected NaN NaN NaN NaN Invoice date at port of departure
5 5 1 Benigretta ... Alicante Spain NaN Alicante Spain NaN Spain ... NaN NaN NaN NaN Not Selected NaN NaN NaN NaN Invoice date at port of departure

5 rows × 279 columns


In [10]:
# List every column name as a NumPy array, to pick the fields worth keeping.
data.columns.values


Out[10]:
array(['voyageid', 'evgreen', 'shipname', 'placcons', 'constreg', 'yrcons',
       'placreg', 'regisreg', 'yrreg', 'national', 'rig', 'tonnage',
       'tontype', 'guns', 'ownera', 'ownerb', 'ownerc', 'ownerd', 'ownere',
       'ownerf', 'ownerg', 'ownerh', 'owneri', 'ownerj', 'ownerk',
       'ownerl', 'ownerm', 'ownern', 'ownero', 'ownerp', 'natinimp',
       'tonmod', 'fate', 'resistance', 'fate2', 'fate3', 'fate4',
       'embport', 'embreg', 'embport2', 'embreg2', 'arrport', 'regarr',
       'arrport2', 'regarr2', 'portdep', 'nppretra', 'plac1tra', 'regem1',
       'plac2tra', 'regem2', 'plac3tra', 'regem3', 'majbuypt', 'npafttra',
       'npprior', 'sla1port', 'regdis1', 'adpsale1', 'regdis2', 'adpsale2',
       'regdis3', 'majselpt', 'portret', 'retrnreg', 'retrnreg1',
       'ptdepimp', 'deptregimp', 'deptregimp1', 'mjbyptimp', 'majbyimp',
       'majbyimp1', 'mjslptimp', 'mjselimp', 'mjselimp1', 'datedepa',
       'datedepb', 'datedepc', 'd1slatra', 'd1slatrb', 'd1slatrc',
       'dlslatra', 'dlslatrb', 'dlslatrc', 'datarr32', 'datarr33',
       'datarr34', 'datarr36', 'datarr37', 'datarr38', 'datarr39',
       'datarr40', 'datarr41', 'ddepam', 'ddepamb', 'ddepamc', 'datarr43',
       'datarr44', 'datarr45', 'Date_dep', 'Date_buy', 'Date_leftAfr',
       'Date_land1', 'Date_land2', 'Date_land3', 'Date_depam', 'Date_end',
       'voyage', 'yeardep', 'yearaf', 'yearam', 'year5', 'year10',
       'year25', 'year100', 'voy1imp', 'voy2imp', 'captaina', 'captainb',
       'captainc', 'crew1', 'crew2', 'crew3', 'crew4', 'crew5', 'crew',
       'saild1', 'saild2', 'saild3', 'saild4', 'saild5', 'crewdied',
       'ndesert', 'slintend', 'slinten2', 'ncar13', 'ncar15', 'ncar17',
       'tslavesp', 'tslavesd', 'sladafri', 'sladvoy', 'slaarriv', 'slas32',
       'slas36', 'slas39', 'sladamer', 'xmimpflag', 'slaximp', 'slamimp',
       'men1', 'women1', 'boy1', 'girl1', 'male1', 'female1', 'adult1',
       'child1', 'infant1', 'men4', 'women4', 'boy4', 'girl4', 'male4',
       'female4', 'adult4', 'child4', 'infant4', 'men5', 'women5', 'boy5',
       'girl5', 'male5', 'female5', 'adult5', 'child5', 'infant5', 'men2',
       'women2', 'boy2', 'girl2', 'male2', 'female2', 'adult2', 'child2',
       'infant2', 'men3', 'women3', 'boy3', 'girl3', 'male3', 'female3',
       'adult3', 'child3', 'infant3', 'men6', 'women6', 'boy6', 'girl6',
       'male6', 'female6', 'adult6', 'child6', 'infant6', 'tslmtimp',
       'vymrtimp', 'vymrtrat', 'jamcaspr', 'sourcea', 'sourceb', 'sourcec',
       'sourced', 'sourcee', 'sourcef', 'sourceg', 'sourceh', 'sourcei',
       'sourcej', 'sourcek', 'sourcel', 'sourcem', 'sourcen', 'sourceo',
       'sourcep', 'sourceq', 'sourcer', 'adlt1imp', 'chil1imp', 'male1imp',
       'feml1imp', 'slavema1', 'slavemx1', 'slavmax1', 'chilrat1',
       'malrat1', 'menrat1', 'womrat1', 'boyrat1', 'girlrat1', 'adlt3imp',
       'chil3imp', 'male3imp', 'feml3imp', 'slavema3', 'slavemx3',
       'slavmax3', 'chilrat3', 'malrat3', 'menrat3', 'womrat3', 'boyrat3',
       'girlrat3', 'slavema7', 'slavemx7', 'slavmax7', 'men7', 'women7',
       'boy7', 'girl7', 'adult7', 'child7', 'male7', 'female7', 'menrat7',
       'womrat7', 'boyrat7', 'girlrat7', 'malrat7', 'chilrat7', 'adlt2imp',
       'chil2imp', 'male2imp', 'feml2imp', 'filter_.', 'ncartot', 'pctemb',
       'slastot', 'pctdis', 'datepl'], dtype=object)

In [11]:
# Raw columns to keep.
desired_columns = ['portdep', 'portret', 'shipname', 'rig', 'national', 'yeardep']

# Explicit raw-name -> logbook-name mapping. Renaming by name (rather than
# assigning a positional list to `data.columns`) keeps the labels correct even
# if the column order of the source table changes.
column_names = {
    'shipname': 'ShipName',
    'national': 'Nationality',
    'rig': 'ShipType',
    'portdep': 'VoyageFrom',
    'portret': 'VoyageTo',
    'yeardep': 'Year',
}

# NOTE: `data` is rebound to the reduced, renamed frame here, so the raw table
# has to be reloaded before this cell can be run a second time.
undesired_columns = remove_undesired_columns(data, desired_columns)
data = data.drop(undesired_columns, axis=1).rename(columns=column_names)

# Project helper; presumably normalises the values (the preview below shows
# lower-cased strings) -- verify against basic_utils.clean_data.
logbook_data = clean_data(data)

In [12]:
# Preview the cleaned frame with its six renamed columns.
logbook_data.head()


Out[12]:
ShipName Nationality ShipType VoyageFrom VoyageTo Year
1 pastora de lima portugal bergantim rio de janeiro alicante 1816
2 tibério spain bergantim bahia, port unspecified alicante 1816
3 paquete real spain bergantim bahia, port unspecified alicante 1816
4 bom caminho spain bergantim bahia, port unspecified alicante 1816
5 benigretta spain galera alicante alicante 1817

One-hot encoding


In [13]:
from exploringShipLogbooks.basic_utils import encode_data_df

In [16]:
# Encode the cleaned frame with the project helper. The second argument names
# the target classifier; presumably it selects the encoding scheme -- verify
# against basic_utils.encode_data_df.
encoded_data_df = encode_data_df(logbook_data, 'Naive Bayes')

In [18]:
# Convert the encoded frame to a plain NumPy array (column labels are dropped).
classification_array = np.array(encoded_data_df)

In [19]:
# Preview the encoded frame: indicator columns plus the numeric Year column.
encoded_data_df.head()


Out[19]:
's-graveland . aaron frigate aartshertogin maria christina abbott devereux abby abecerraje abeille abencerrage ... sète tenerife texel vannes vlissingen wales whitehaven williamsburg zeeland Year
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1816
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1816
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1816
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1816
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1817

5 rows × 9545 columns


In [ ]: