In [1]:
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import exploringShipLogbooks.wordcount as wc
from exploringShipLogbooks.basic_utils import clean_data
from exploringShipLogbooks.basic_utils import extract_logbook_data
from exploringShipLogbooks.basic_utils import remove_undesired_columns
from IPython.display import display
from sklearn import preprocessing
In [2]:
# Load the CLIWOC15 logbook CSV through the project's extract_logbook_data helper.
# Presumably this yields a pandas DataFrame (later cells call .loc, .drop and
# .head on the result) — TODO confirm against basic_utils.
logbook_data = extract_logbook_data('CLIWOC15.csv')
In [3]:
# Free-text logbook columns that are scanned for the key words below.
columns = [
    'CargoMemo',
    'LifeOnBoardMemo',
    'OtherRem',
    'EncRem',
]

# Search terms, kept in their original order.
# NOTE(review): 'slaaf' appears twice in this list; whether the repeat matters
# depends on how wc.count_key_words treats duplicates — confirm before removing.
key_words = [
    'slave',
    'slaves',
    'slaaf',
    'slaven',
    'meisjesslaaf',
    'manslaaf',
    'manslaven',
    'slavenjong',
    'jongensslaaf',
    'meidslaaf',
    'servant',
    'slavenmeid',
    'vrouwslaaf',
    'vrouwslaven',
    'slavenhandel',
    'slaaf',
    'esclavo',
    'esclavos',
    'esclave',
    'esclaves',
]

# The helper's result is used further down as a row selector for logbook_data;
# slave_index is simply a second name bound to that same object.
mentions_key_words = wc.count_key_words(
    logbook_data,
    columns,
    key_words,
)
slave_index = mentions_key_words
In [4]:
# Keep only the rows selected by slave_index (the result of the key-word search).
# NOTE(review): this rebinds logbook_data to the subset, so re-running this cell
# without re-running the load cell applies the selector to already-filtered data.
logbook_data = logbook_data.loc[slave_index]
In [5]:
# Columns to keep for the rest of the notebook.
desired_columns = [
    'VoyageFrom',
    'VoyageTo',
    'ShipName',
    'ShipType',
    'Company',
    'Nationality',
    'WarsAndFights',
    'Year',
]

# The helper's result is passed straight to DataFrame.drop as column labels;
# presumably it is every column of logbook_data not named above — TODO confirm.
undesired_columns = remove_undesired_columns(logbook_data, desired_columns)

# Drop those columns, then run the project's clean_data step on what remains.
logbook_data = logbook_data.drop(undesired_columns, axis=1)
logbook_data = clean_data(logbook_data)
In [6]:
# Preview the first rows of the filtered, column-pruned and cleaned data.
logbook_data.head()
Out[6]:
In [7]:
from exploringShipLogbooks.basic_utils import encode_data
from exploringShipLogbooks.basic_utils import encode_data_df
In [8]:
# Encode the cleaned logbook data with the project's encode_data helper, passing
# 'Naive Bayes' as the mode argument. The call returns two values, unpacked here
# as the encoded data and the encoder object that produced it.
encoded_data, encoder = encode_data(logbook_data, 'Naive Bayes')
In [10]:
# Same inputs as the encode_data call, routed through encode_data_df instead.
# Judging by the helper's name the result is a DataFrame — TODO confirm.
encoded_data_df = encode_data_df(logbook_data, 'Naive Bayes')
In [11]:
# Convert the encoded data into a NumPy array.
# NOTE(review): if the `unknown, desconocido` text that follows this statement
# belongs to this code cell, it fails on a fresh kernel: neither name is defined
# or imported anywhere in the visible notebook. Confirm whether it is a stray note.
classification_array = np.array(encoded_data_df)
unknown, desconocido
In [12]:
# Minimal LabelEncoder demonstration on toy city names (not on the logbook data).
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

# classes_ holds the distinct labels learned by fit().
print(list(le.classes_))

# Only a cell's last expression is rendered automatically, so a bare
# le.transform(...) in the middle of the cell would be computed and dropped;
# display() makes the encoded integers visible.
display(le.transform(["tokyo", "tokyo", "paris"]))

# Map integer codes back to their labels; shown as the cell's output.
list(le.inverse_transform([2, 2, 1]))
Out[12]:
In [13]:
def preprocessing_data(df_input):
    """Label-encode ``df_input`` with a newly created ``LabelEncoder``.

    A fresh encoder is fitted on every call and is not returned, so the
    caller only receives the output of ``fit_transform``.

    NOTE(review): scikit-learn documents ``LabelEncoder`` as operating on
    1-D label arrays; confirm callers pass a single column rather than a
    whole DataFrame.
    """
    return preprocessing.LabelEncoder().fit_transform(df_input)
In [ ]: