This notebook was used to develop the one-hot encoder function for the exploringShipLogbooks package.

Import necessary packages


In [1]:
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import exploringShipLogbooks.wordcount as wc

from exploringShipLogbooks.basic_utils import clean_data
from exploringShipLogbooks.basic_utils import extract_logbook_data
from exploringShipLogbooks.basic_utils import remove_undesired_columns
from IPython.display import display
from sklearn import preprocessing

Load logbook data


In [2]:
# Load the CLIWOC ship-logbook dataset via the project helper.
# (Presumably reads a zipped CSV -- zipfile is imported above; confirm in
# basic_utils. The DtypeWarning emitted below is expected for this
# mixed-type CSV; pass dtype options or low_memory=False if it matters.)
logbook_data = extract_logbook_data('CLIWOC15.csv')


/Applications/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2825: DtypeWarning: Columns (5,6,7,8,11,13,18,19,23,24,25,26,28,29,30,34,35,38,43,44,46,73,77,81,82,84,85,87,88,94,96,97,98,99,111,114,116,119,120,122,124,125,127,129,131,133,135,137,140) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

Search for logbook entries that mention slaves


In [3]:
# Free-text memo columns to search for mentions of slavery.
columns = ['CargoMemo', 'LifeOnBoardMemo', 'OtherRem', 'EncRem']

# Key words (English / Dutch / Spanish / French) that flag a log entry as
# slavery-related. Note: the original list contained 'slaaf' twice; the
# duplicate has been removed (membership checks make it redundant).
key_words = ['slave', 'slaves', 'slaaf', 'slaven', 'meisjesslaaf', 'manslaaf',
             'manslaven', 'slavenjong', 'jongensslaaf', 'meidslaaf', 'servant',
             'slavenmeid', 'vrouwslaaf', 'vrouwslaven', 'slavenhandel',
             'esclavo', 'esclavos', 'esclave', 'esclaves']

# Index of entries whose memo columns mention any key word
# (exact return type depends on wc.count_key_words -- presumably a
# boolean mask or row index; confirm in wordcount module).
mentions_key_words = wc.count_key_words(logbook_data, columns, key_words)
slave_index = mentions_key_words

In [4]:
# Keep only the voyages whose memo fields mentioned the slavery key words.
# (Assumes slave_index is a boolean mask or label index -- TODO confirm
# against wc.count_key_words.)
logbook_data = logbook_data.loc[slave_index]

Drop undesired columns and clean the data


In [5]:
# Columns of voyage metadata to retain for classification; everything else
# is dropped before cleaning.
keep_columns = ['VoyageFrom', 'VoyageTo', 'ShipName', 'ShipType',
                'Company', 'Nationality', 'WarsAndFights', 'Year']

# remove_undesired_columns returns the complement of keep_columns present
# in the frame; drop those and run the project's cleaning step.
drop_columns = remove_undesired_columns(logbook_data, keep_columns)
logbook_data = clean_data(logbook_data.drop(drop_columns, axis=1))

In [6]:
logbook_data.head()


Out[6]:
VoyageFrom VoyageTo ShipName ShipType Company Nationality Year WarsAndFights
43039 madeira accraw badger snow rn british 1752 0
72306 texel st. eustatius maarsen oorlogsschip adm dutch 1760 0
78339 axim suriname pollux fregat mer dutch 1785 0
78346 rotterdam west afrika pollux fregat mer dutch 1785 0
78350 axim suriname pollux fregat mer dutch 1785 0

One hot encoding


In [7]:
from exploringShipLogbooks.basic_utils import encode_data
from exploringShipLogbooks.basic_utils import encode_data_df

In [8]:
# One-hot encode the cleaned logbook data; returns the encoded array and
# the fitted encoder. (The 'Naive Bayes' flag presumably tailors the
# output for that classifier -- confirm in basic_utils.)
encoded_data, encoder = encode_data(logbook_data, 'Naive Bayes')

In [10]:
# DataFrame variant of the same encoding (keeps column labels).
encoded_data_df = encode_data_df(logbook_data, 'Naive Bayes')

In [11]:
# Materialize the encoded DataFrame as a plain numpy array for use as the
# classifier input matrix.
classification_array = np.array(encoded_data_df)

TODO(review): stray note — "unknown, desconocido"; possibly candidate key words or placeholder values still to be handled. Clarify intent or remove.

Example of preprocessing data using a fake data set


In [12]:
# Toy demonstration of sklearn's LabelEncoder on a small city list.
label_encoder = preprocessing.LabelEncoder()
cities = ["paris", "paris", "tokyo", "amsterdam"]
label_encoder.fit(cities)

# classes_ holds the sorted unique labels learned during fit.
print(list(label_encoder.classes_))

# Encode labels to integer codes (result is not displayed: not the last
# expression of the cell), then map codes back to their string labels.
label_encoder.transform(["tokyo", "tokyo", "paris"])
list(label_encoder.inverse_transform([2, 2, 1]))


['amsterdam', 'paris', 'tokyo']
Out[12]:
['tokyo', 'tokyo', 'paris']

In [13]:
def preprocessing_data(df_input):
    """Label-encode the values of a column into integer codes.

    Parameters
    ----------
    df_input : array-like of labels (e.g. a pandas Series)
        Values to encode.

    Returns
    -------
    numpy.ndarray
        Integer codes, one per input value; note the fitted encoder is
        local to this call and is not returned.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_values = encoder.fit_transform(df_input)
    return encoded_values

In [ ]: