Notebook used to develop code

  • output from classification is data frame with slave_logs (maybe rename that column?) indicating:
    • cliwoc_data (unclassified) = 0
    • cliwoc_data (no slaves) = 1
    • cliwoc_data (slaves) = 2
    • slave_data = 3
    • classified as slave log = 4
    • classified as non slave log = 5

In [1]:
classifier_algorithm = "Decision Tree"

In [14]:
import collections
import exploringShipLogbooks

import numpy as np
import os.path as op
import pandas as pd
import exploringShipLogbooks.wordcount as wc

from fuzzywuzzy import fuzz
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree

from exploringShipLogbooks.basic_utils import clean_data
from exploringShipLogbooks.basic_utils import encode_data_df
from exploringShipLogbooks.basic_utils import extract_logbook_data
from exploringShipLogbooks.fuzz_replacement import fuzzy_wuzzy_classification
from exploringShipLogbooks.basic_utils import isolate_columns
from exploringShipLogbooks.basic_utils import isolate_training_data

from exploringShipLogbooks.config import *

Load and clean data

Load CLIWOC ship logs


In [4]:
# extract data from zip file
cliwoc_data = extract_logbook_data('CLIWOC15.csv')


/Applications/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2825: DtypeWarning: Columns (5,6,7,8,11,13,18,19,23,24,25,26,28,29,30,34,35,38,43,44,46,73,77,81,82,84,85,87,88,94,96,97,98,99,111,114,116,119,120,122,124,125,127,129,131,133,135,137,140) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

In [5]:
label_encoding = preprocessing.LabelEncoder().fit(cliwoc_data['LogbookIdent']).classes_
cliwoc_data['LogbookIdent'] = preprocessing.LabelEncoder().fit_transform(cliwoc_data['LogbookIdent'])

Find definite slave data in CLIWOC data set

  • These logs will be used to test the classifier

In [6]:
# extract logs that mention slaves
slave_mask = wc.count_key_words(cliwoc_data, text_columns, slave_words)
print('Found ', len(slave_mask[slave_mask]), ' logs that mention slaves')


Found  464  logs that mention slaves

Clean CLIWOC data


In [7]:
# find indices of ship names that are "non-slave" ships before dropping ship name column
non_slave_log_locations = isolate_training_data(cliwoc_data, {'ShipName': non_slave_ships})
print('Found ', len(non_slave_log_locations[non_slave_log_locations==True]), ' logs that are non-slave ships')


Found  5417  logs that are non-slave ships

In [8]:
cliwoc_data['slave_logs'] = np.zeros(len(cliwoc_data))
slave_log_locations = cliwoc_data['LogbookIdent'].isin(list(cliwoc_data['LogbookIdent']
                                                            [slave_mask].unique()))
  • cliwoc_data (unclassified) = 0
  • cliwoc_data (no slaves) = 1
  • cliwoc_data (slaves) = 2
  • slave_data = 3

In [9]:
cliwoc_data.loc[non_slave_log_locations,'slave_logs'] = 1
cliwoc_data.loc[slave_log_locations,'slave_logs'] = 2

In [10]:
cliwoc_data = cliwoc_data.sort_values('LogbookIdent', ascending=True)
cliwoc_data_all = cliwoc_data.set_index('LogbookIdent', drop= False).copy()
cliwoc_data = cliwoc_data.set_index('LogbookIdent', drop = False)
cliwoc_data = cliwoc_data.drop_duplicates('LogbookIdent')

In [11]:
# uncomment this if looking at ship names for manual review
#desired_columns.append('ShipName')

In [12]:
# remove undesired columns
cliwoc_data = isolate_columns(cliwoc_data, desired_columns)

Load Slave Voyages data


In [16]:
data_path = op.join(exploringShipLogbooks.__path__[0], 'data')
file_name = data_path + '/tastdb-exp-2010'
slave_voyage_logs = pd.read_pickle(file_name)

In [17]:
year_ind = ~(slave_voyage_logs['yeardep'].isnull())
slave_voyage_logs = slave_voyage_logs[year_ind]

In [18]:
cliwoc_ind = (slave_voyage_logs['yeardep']>cliwoc_data['Year'].min()) & (slave_voyage_logs['yeardep']<cliwoc_data['Year'].max())
slave_voyage_logs = slave_voyage_logs[cliwoc_ind]

Clean Slave voyages data


In [19]:
slave_voyage_desired_cols = list(slave_voyage_conversions.keys())
slave_voyage_logs = isolate_columns(slave_voyage_logs, slave_voyage_desired_cols)

slave_voyage_logs.rename(columns=slave_voyage_conversions, inplace=True)
#slave_voyage_logs.columns = ['Nationality', 'ShipType', 'VoyageFrom', 'VoyageTo', 'Year']

In [20]:
slave_voyage_logs['slave_logs'] = 3
slave_voyage_indices = range(len(slave_voyage_logs)) + (cliwoc_data.tail(1).index[0]+1)
slave_voyage_logs = slave_voyage_logs.set_index(slave_voyage_indices)

Join data sets


In [21]:
all_data = pd.concat([cliwoc_data, slave_voyage_logs])
#all_data = cliwoc_data.append(slave_voyage_logs)
all_data = clean_data(all_data)

# cleanup
#del cliwoc_data, slave_voyage_logs

In [22]:
all_data.head()


Out[22]:
Nationality ShipType VoyageFrom VoyageTo Year slave_logs
LogbookIdent
0 dutch nan fishing grounds nan 1785 0
1 dutch nan fishing grounds nan 1789 0
2 spanish falucho barcelona cartagena 1847 0
3 spanish mistico cadiz melilla 1849 0
4 spanish bergantín barcelona marañón 1848 1

Test of fuzzywuzzy method


In [23]:
all_data_test = all_data.copy()

In [24]:
fuzz_columns = ['Nationality', 'ShipType', 'VoyageFrom', 'VoyageTo']

In [25]:
for col in fuzz_columns:
    all_data = fuzzy_wuzzy_classification(all_data, col)

Encode data

  • Must encode data before separating, otherwise values that do not occur in a subset will be encoded differently

In [26]:
from sklearn.preprocessing import LabelEncoder
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                if is_instance(X[col][0], str):
                    output[col] = LabelEncoder().fit_transform(output[col])
                else:
                    output[col] = X[col]
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [27]:
if classifier_algorithm == "Decision Tree":
    all_data = MultiColumnLabelEncoder().fit_transform(all_data)
elif classifier_algorithm == "Naive Bayes":
    all_data = encode_data_df(all_data)
    all_data['no_data'] = all_data['nan'].apply(lambda x: x.any(), axis=1).astype(int)
    all_data = all_data.drop('nan', axis=1)
else:
    raise KeyError("Please enter a valid classification type (Decision Trees or Naive Bayes)")

Extract training data, and create list of classes


In [28]:
unclassified_logs = all_data[all_data['slave_logs']==0]
#unclassified_logs = unclassified_logs.drop('slave_logs', axis=1)

validation_set_1 = all_data[all_data['slave_logs']==2]
#validation_set_1 = validation_set_1.drop('slave_logs', axis=1)

# reserve first 20% of slave_voyage_logs as validation set
validation_set_2_indices = range(slave_voyage_indices.min(),
                                 slave_voyage_indices.min() + round(len(slave_voyage_indices)*.2))
validation_set_2 = all_data.iloc[validation_set_2_indices]
#validation_set_2 = validation_set_2.drop('slave_logs', axis=1)

training_logs_pos = all_data.drop(validation_set_2_indices)
training_logs_pos = training_logs_pos[training_logs_pos['slave_logs']==3]
#training_logs_pos = training_logs_pos.drop('slave_logs', axis=1)

# note! This relies on cliwoc data being first in all_data
# could make more robust later
training_logs_neg = all_data[all_data['slave_logs']==1]
#training_logs_neg = training_logs_neg.drop('slave_logs', axis=1)

# cleanup
#del all_data
  • left this code so we can check if there are any null values in each dataframe

In [29]:
def finding_null_values(df):
    return df.isnull().sum()[df.isnull().sum()>0]

In [30]:
repeat_multiplier = round(len(training_logs_pos)/len(training_logs_neg))

# create list of classes for training data (0 is for non-slave, 1 is for slave)
# index matches training_data
classes = np.zeros(len(training_logs_neg)).repeat(repeat_multiplier)
#classes = np.append(classes, np.ones(len(training_logs_pos)))
classes = np.append(classes, np.ones(len(training_logs_pos)))

# join training data
neg_rep = pd.concat([training_logs_neg]*repeat_multiplier)
training_data = pd.concat([neg_rep, training_logs_pos], ignore_index = True)

# convert to numpy array
columns = list(training_data.columns)
columns.remove('slave_logs')
training_data = training_data.as_matrix(columns)

Fit training data to classifier

  • note! first column of numpy array is index! do not include in classification!

In [31]:
if classifier_algorithm == "Decision Tree":
    classifier = MultinomialNB(alpha = 1.0, class_prior = None, fit_prior = True)
    classifier.fit(training_data[::,1::], classes)
elif classifier_algorithm == "Naive Bayes":
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(training_data[::,1::], classes)
else: 
    raise KeyError("Please enter a valid classification type (Decision Trees or Naive Bayes)")

Test classifier

  • check if slave logs from cliwoc data were classified correctly (want mostly classified as 1)
  • compare first column with slave_index

In [32]:
def validation_test(classifier, validation_set, expected_class):
    """
    input classifer object, validation set (data frame), and expected class 
    of validation set (i.e. 1 or 0). Prints successful classification rate.
    """
    columns = list(validation_set.columns)
    columns.remove('slave_logs')
    validation_set = validation_set.as_matrix(columns)
    predictions = classifier.predict(validation_set[::,1::])
    
    counts = collections.Counter(predictions)
    percent_correct = (counts[expected_class]/(len(predictions))* 100)
                       
    print('Validation set was classified as', expected_class,
          round(percent_correct,2), '% of the time')

In [33]:
def predict_class(classifier, data_subset):
    """
    Predict class of data, and append predictions to data frame
    """
    try:
        # drop old predictions before reclassifying (if they exist)
        data_subset = data_subset.drop('predictions', axis = 1)
        data_to_classify = data_subset.copy()
    except:
        data_to_classify = data_subset.copy()
        pass
    
    # convert to numpy and classify
    columns = list(data_to_classify.columns)
    columns.remove('slave_logs')
    data_matrix = data_to_classify.as_matrix(columns)
    predictions = classifier.predict(data_matrix[::,1::])
    
    # revalue slave_log ID column to indicate classification
    data_to_classify['slave_logs'] = predictions + 4
    
    # print statstics
    counts = collections.Counter(predictions)
    
    for key in counts:
        percent = (counts[key]/(len(predictions))* 100)
        print(round(percent, 2), 'of data was classified as ', key)
        
    # update slave_log columns

    return data_to_classify

In [34]:
print('Testing validation data from slave logs data set')
validation_test(classifier, validation_set_2, 1)

print('Testing validation data from cliwoc data set:')
validation_test(classifier, validation_set_1, 1)


Testing validation data from slave logs data set
Validation set was classified as 1 94.55 % of the time
Testing validation data from cliwoc data set:
Validation set was classified as 1 23.21 % of the time

In [35]:
unclassified_logs = predict_class(classifier, unclassified_logs)


64.72 of data was classified as  0.0
35.28 of data was classified as  1.0

In [36]:
unclassified_logs.head()


Out[36]:
Nationality ShipType VoyageFrom VoyageTo Year slave_logs
LogbookIdent
0 4 78 160 288 123 4
1 4 78 160 288 127 4
2 15 42 35 84 185 5
3 15 77 63 262 187 4
9 4 78 374 34 0 5

try decision trees plotting

  • Following lines of code do not currently work, we need to install graphviz

In [37]:
# export PDF with decision tree
from sklearn.externals.six import StringIO  
import os
import pydot 

dot_data = StringIO()
tree.export_graphviz(new_classifier, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("test.pdf")


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-37-a4068616a441> in <module>()
      2 from sklearn.externals.six import StringIO
      3 import os
----> 4 import pydot
      5 
      6 dot_data = StringIO()

ImportError: No module named 'pydot'

In [ ]: