Notebook used to develop code

output from classification is data frame with slave_logs (maybe rename that column?) indicating:
- cliwoc_data (unclassified) = 0
- cliwoc_data (no slaves) = 1
- cliwoc_data (slaves) = 2
- slave_data = 3
- classified as slave log = 4
- classified as non slave log = 5



In [1]:

    
classifier_algorithm = "Decision Tree"



In [14]:

    
import collections
import exploringShipLogbooks

import numpy as np
import os.path as op
import pandas as pd
import exploringShipLogbooks.wordcount as wc

from fuzzywuzzy import fuzz
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree

from exploringShipLogbooks.basic_utils import clean_data
from exploringShipLogbooks.basic_utils import encode_data_df
from exploringShipLogbooks.basic_utils import extract_logbook_data
from exploringShipLogbooks.fuzz_replacement import fuzzy_wuzzy_classification
from exploringShipLogbooks.basic_utils import isolate_columns
from exploringShipLogbooks.basic_utils import isolate_training_data

from exploringShipLogbooks.config import *

Load and clean data

Load CLIWOC ship logs



In [4]:

    
# extract data from zip file
cliwoc_data = extract_logbook_data('CLIWOC15.csv')









    



/Applications/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2825: DtypeWarning: Columns (5,6,7,8,11,13,18,19,23,24,25,26,28,29,30,34,35,38,43,44,46,73,77,81,82,84,85,87,88,94,96,97,98,99,111,114,116,119,120,122,124,125,127,129,131,133,135,137,140) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):



In [5]:

    
label_encoding = preprocessing.LabelEncoder().fit(cliwoc_data['LogbookIdent']).classes_
cliwoc_data['LogbookIdent'] = preprocessing.LabelEncoder().fit_transform(cliwoc_data['LogbookIdent'])

Find definite slave data in CLIWOC data set

These logs will be used to test the classifier



In [6]:

    
# extract logs that mention slaves
slave_mask = wc.count_key_words(cliwoc_data, text_columns, slave_words)
print('Found ', len(slave_mask[slave_mask]), ' logs that mention slaves')









    



Found  464  logs that mention slaves

Clean CLIWOC data



In [7]:

    
# find indices of ship names that are "non-slave" ships before dropping ship name column
non_slave_log_locations = isolate_training_data(cliwoc_data, {'ShipName': non_slave_ships})
print('Found ', len(non_slave_log_locations[non_slave_log_locations==True]), ' logs that are non-slave ships')









    



Found  5417  logs that are non-slave ships



In [8]:

    
cliwoc_data['slave_logs'] = np.zeros(len(cliwoc_data))
slave_log_locations = cliwoc_data['LogbookIdent'].isin(list(cliwoc_data['LogbookIdent']
                                                            [slave_mask].unique()))

cliwoc_data (unclassified) = 0
cliwoc_data (no slaves) = 1
cliwoc_data (slaves) = 2
slave_data = 3



In [9]:

    
cliwoc_data.loc[non_slave_log_locations,'slave_logs'] = 1
cliwoc_data.loc[slave_log_locations,'slave_logs'] = 2



In [10]:

    
cliwoc_data = cliwoc_data.sort_values('LogbookIdent', ascending=True)
cliwoc_data_all = cliwoc_data.set_index('LogbookIdent', drop= False).copy()
cliwoc_data = cliwoc_data.set_index('LogbookIdent', drop = False)
cliwoc_data = cliwoc_data.drop_duplicates('LogbookIdent')



In [11]:

    
# uncomment this if looking at ship names for manual review
#desired_columns.append('ShipName')



In [12]:

    
# remove undesired columns
cliwoc_data = isolate_columns(cliwoc_data, desired_columns)

Load Slave Voyages data



In [16]:

    
data_path = op.join(exploringShipLogbooks.__path__[0], 'data')
file_name = data_path + '/tastdb-exp-2010'
slave_voyage_logs = pd.read_pickle(file_name)



In [17]:

    
year_ind = ~(slave_voyage_logs['yeardep'].isnull())
slave_voyage_logs = slave_voyage_logs[year_ind]



In [18]:

    
cliwoc_ind = (slave_voyage_logs['yeardep']>cliwoc_data['Year'].min()) & (slave_voyage_logs['yeardep']<cliwoc_data['Year'].max())
slave_voyage_logs = slave_voyage_logs[cliwoc_ind]

Clean Slave voyages data



In [19]:

    
slave_voyage_desired_cols = list(slave_voyage_conversions.keys())
slave_voyage_logs = isolate_columns(slave_voyage_logs, slave_voyage_desired_cols)

slave_voyage_logs.rename(columns=slave_voyage_conversions, inplace=True)
#slave_voyage_logs.columns = ['Nationality', 'ShipType', 'VoyageFrom', 'VoyageTo', 'Year']



In [20]:

    
slave_voyage_logs['slave_logs'] = 3
slave_voyage_indices = range(len(slave_voyage_logs)) + (cliwoc_data.tail(1).index[0]+1)
slave_voyage_logs = slave_voyage_logs.set_index(slave_voyage_indices)

Join data sets



In [21]:

    
all_data = pd.concat([cliwoc_data, slave_voyage_logs])
#all_data = cliwoc_data.append(slave_voyage_logs)
all_data = clean_data(all_data)

# cleanup
#del cliwoc_data, slave_voyage_logs



In [22]:

    
all_data.head()









    Out[22]:






  
    
      
      Nationality
      ShipType
      VoyageFrom
      VoyageTo
      Year
      slave_logs
    
    
      LogbookIdent
      
      
      
      
      
      
    
  
  
    
      0
      dutch
      nan
      fishing grounds
      nan
      1785
      0
    
    
      1
      dutch
      nan
      fishing grounds
      nan
      1789
      0
    
    
      2
      spanish
      falucho
      barcelona
      cartagena
      1847
      0
    
    
      3
      spanish
      mistico
      cadiz
      melilla
      1849
      0
    
    
      4
      spanish
      bergantín
      barcelona
      marañón
      1848
      1

Test of fuzzywuzzy method



In [23]:

    
all_data_test = all_data.copy()



In [24]:

    
fuzz_columns = ['Nationality', 'ShipType', 'VoyageFrom', 'VoyageTo']



In [25]:

    
for col in fuzz_columns:
    all_data = fuzzy_wuzzy_classification(all_data, col)

Encode data

Must encode data before separating, otherwise values that do not occur in a subset will be encoded differently



In [26]:

    
from sklearn.preprocessing import LabelEncoder
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                if is_instance(X[col][0], str):
                    output[col] = LabelEncoder().fit_transform(output[col])
                else:
                    output[col] = X[col]
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)



In [27]:

    
if classifier_algorithm == "Decision Tree":
    all_data = MultiColumnLabelEncoder().fit_transform(all_data)
elif classifier_algorithm == "Naive Bayes":
    all_data = encode_data_df(all_data)
    all_data['no_data'] = all_data['nan'].apply(lambda x: x.any(), axis=1).astype(int)
    all_data = all_data.drop('nan', axis=1)
else:
    raise KeyError("Please enter a valid classification type (Decision Trees or Naive Bayes)")

Extract training data, and create list of classes



In [28]:

    
unclassified_logs = all_data[all_data['slave_logs']==0]
#unclassified_logs = unclassified_logs.drop('slave_logs', axis=1)

validation_set_1 = all_data[all_data['slave_logs']==2]
#validation_set_1 = validation_set_1.drop('slave_logs', axis=1)

# reserve first 20% of slave_voyage_logs as validation set
validation_set_2_indices = range(slave_voyage_indices.min(),
                                 slave_voyage_indices.min() + round(len(slave_voyage_indices)*.2))
validation_set_2 = all_data.iloc[validation_set_2_indices]
#validation_set_2 = validation_set_2.drop('slave_logs', axis=1)

training_logs_pos = all_data.drop(validation_set_2_indices)
training_logs_pos = training_logs_pos[training_logs_pos['slave_logs']==3]
#training_logs_pos = training_logs_pos.drop('slave_logs', axis=1)

# note! This relies on cliwoc data being first in all_data
# could make more robust later
training_logs_neg = all_data[all_data['slave_logs']==1]
#training_logs_neg = training_logs_neg.drop('slave_logs', axis=1)

# cleanup
#del all_data

left this code so we can check if there are any null values in each dataframe



In [29]:

    
def finding_null_values(df):
    return df.isnull().sum()[df.isnull().sum()>0]



In [30]:

    
repeat_multiplier = round(len(training_logs_pos)/len(training_logs_neg))

# create list of classes for training data (0 is for non-slave, 1 is for slave)
# index matches training_data
classes = np.zeros(len(training_logs_neg)).repeat(repeat_multiplier)
#classes = np.append(classes, np.ones(len(training_logs_pos)))
classes = np.append(classes, np.ones(len(training_logs_pos)))

# join training data
neg_rep = pd.concat([training_logs_neg]*repeat_multiplier)
training_data = pd.concat([neg_rep, training_logs_pos], ignore_index = True)

# convert to numpy array
columns = list(training_data.columns)
columns.remove('slave_logs')
training_data = training_data.as_matrix(columns)

Fit training data to classifier

note! first column of numpy array is index! do not include in classification!



In [31]:

    
if classifier_algorithm == "Decision Tree":
    classifier = MultinomialNB(alpha = 1.0, class_prior = None, fit_prior = True)
    classifier.fit(training_data[::,1::], classes)
elif classifier_algorithm == "Naive Bayes":
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(training_data[::,1::], classes)
else: 
    raise KeyError("Please enter a valid classification type (Decision Trees or Naive Bayes)")

Test classifier

check if slave logs from cliwoc data were classified correctly (want mostly classified as 1)
compare first column with slave_index



In [32]:

    
def validation_test(classifier, validation_set, expected_class):
    """
    input classifer object, validation set (data frame), and expected class 
    of validation set (i.e. 1 or 0). Prints successful classification rate.
    """
    columns = list(validation_set.columns)
    columns.remove('slave_logs')
    validation_set = validation_set.as_matrix(columns)
    predictions = classifier.predict(validation_set[::,1::])
    
    counts = collections.Counter(predictions)
    percent_correct = (counts[expected_class]/(len(predictions))* 100)
                       
    print('Validation set was classified as', expected_class,
          round(percent_correct,2), '% of the time')



In [33]:

    
def predict_class(classifier, data_subset):
    """
    Predict class of data, and append predictions to data frame
    """
    try:
        # drop old predictions before reclassifying (if they exist)
        data_subset = data_subset.drop('predictions', axis = 1)
        data_to_classify = data_subset.copy()
    except:
        data_to_classify = data_subset.copy()
        pass
    
    # convert to numpy and classify
    columns = list(data_to_classify.columns)
    columns.remove('slave_logs')
    data_matrix = data_to_classify.as_matrix(columns)
    predictions = classifier.predict(data_matrix[::,1::])
    
    # revalue slave_log ID column to indicate classification
    data_to_classify['slave_logs'] = predictions + 4
    
    # print statstics
    counts = collections.Counter(predictions)
    
    for key in counts:
        percent = (counts[key]/(len(predictions))* 100)
        print(round(percent, 2), 'of data was classified as ', key)
        
    # update slave_log columns

    return data_to_classify



In [34]:

    
print('Testing validation data from slave logs data set')
validation_test(classifier, validation_set_2, 1)

print('Testing validation data from cliwoc data set:')
validation_test(classifier, validation_set_1, 1)









    



Testing validation data from slave logs data set
Validation set was classified as 1 94.55 % of the time
Testing validation data from cliwoc data set:
Validation set was classified as 1 23.21 % of the time



In [35]:

    
unclassified_logs = predict_class(classifier, unclassified_logs)









    



64.72 of data was classified as  0.0
35.28 of data was classified as  1.0



In [36]:

    
unclassified_logs.head()









    Out[36]:






  
    
      
      Nationality
      ShipType
      VoyageFrom
      VoyageTo
      Year
      slave_logs
    
    
      LogbookIdent
      
      
      
      
      
      
    
  
  
    
      0
      4
      78
      160
      288
      123
      4
    
    
      1
      4
      78
      160
      288
      127
      4
    
    
      2
      15
      42
      35
      84
      185
      5
    
    
      3
      15
      77
      63
      262
      187
      4
    
    
      9
      4
      78
      374
      34
      0
      5

try decision trees plotting

Following lines of code do not currently work, we need to install graphviz



In [37]:

    
# export PDF with decision tree
from sklearn.externals.six import StringIO  
import os
import pydot 

dot_data = StringIO()
tree.export_graphviz(new_classifier, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("test.pdf")









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-37-a4068616a441> in <module>()
      2 from sklearn.externals.six import StringIO
      3 import os
----> 4 import pydot
      5 
      6 dot_data = StringIO()

ImportError: No module named 'pydot'



In [ ]:

	Nationality	ShipType	VoyageFrom	VoyageTo	Year	slave_logs
LogbookIdent
0	dutch	nan	fishing grounds	nan	1785	0
1	dutch	nan	fishing grounds	nan	1789	0
2	spanish	falucho	barcelona	cartagena	1847	0
3	spanish	mistico	cadiz	melilla	1849	0
4	spanish	bergantín	barcelona	marañón	1848	1