In [1]:
classifier_algorithm = "Decision Tree"
In [14]:
import collections
import exploringShipLogbooks
import numpy as np
import os.path as op
import pandas as pd
import exploringShipLogbooks.wordcount as wc
from fuzzywuzzy import fuzz
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from exploringShipLogbooks.basic_utils import clean_data
from exploringShipLogbooks.basic_utils import encode_data_df
from exploringShipLogbooks.basic_utils import extract_logbook_data
from exploringShipLogbooks.fuzz_replacement import fuzzy_wuzzy_classification
from exploringShipLogbooks.basic_utils import isolate_columns
from exploringShipLogbooks.basic_utils import isolate_training_data
from exploringShipLogbooks.config import *
In [4]:
# extract data from zip file
cliwoc_data = extract_logbook_data('CLIWOC15.csv')
In [5]:
# encode each logbook identifier as a sequential integer
label_encoding = preprocessing.LabelEncoder().fit(cliwoc_data['LogbookIdent']).classes_
cliwoc_data['LogbookIdent'] = preprocessing.LabelEncoder().fit_transform(
    cliwoc_data['LogbookIdent'])
In [6]:
# extract logs that mention slaves
slave_mask = wc.count_key_words(cliwoc_data, text_columns, slave_words)
print('Found', slave_mask.sum(), 'logs that mention slaves')
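For reference, count_key_words scans the text columns for the given key words and returns a boolean mask. A minimal sketch of that idea (this body is an assumption for illustration, not the package's actual implementation):

# illustrative sketch only -- not the actual count_key_words implementation
def count_key_words_sketch(df, text_columns, key_words):
    # mark rows whose text columns mention any of the key words
    mask = pd.Series(False, index=df.index)
    for col in text_columns:
        text = df[col].fillna('').astype(str).str.lower()
        for word in key_words:
            mask |= text.str.contains(word.lower(), regex=False)
    return mask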
In [7]:
# find indices of ship names that are "non-slave" ships before dropping ship name column
non_slave_log_locations = isolate_training_data(cliwoc_data, {'ShipName': non_slave_ships})
print('Found', non_slave_log_locations.sum(), 'logs that are non-slave ships')
In [8]:
cliwoc_data['slave_logs'] = np.zeros(len(cliwoc_data))
slave_log_locations = cliwoc_data['LogbookIdent'].isin(
    cliwoc_data['LogbookIdent'][slave_mask].unique())
In [9]:
# label the logs: 0 = unclassified (default), 1 = known non-slave ship,
# 2 = log text mentions slaves
cliwoc_data.loc[non_slave_log_locations, 'slave_logs'] = 1
cliwoc_data.loc[slave_log_locations, 'slave_logs'] = 2
In [10]:
cliwoc_data = cliwoc_data.sort_values('LogbookIdent', ascending=True)
cliwoc_data_all = cliwoc_data.set_index('LogbookIdent', drop=False).copy()
cliwoc_data = cliwoc_data.set_index('LogbookIdent', drop=False)
cliwoc_data = cliwoc_data.drop_duplicates('LogbookIdent')
In [11]:
# uncomment this if looking at ship names for manual review
#desired_columns.append('ShipName')
In [12]:
# remove undesired columns
cliwoc_data = isolate_columns(cliwoc_data, desired_columns)
In [16]:
data_path = op.join(exploringShipLogbooks.__path__[0], 'data')
file_name = op.join(data_path, 'tastdb-exp-2010')
slave_voyage_logs = pd.read_pickle(file_name)
In [17]:
# drop voyages with no recorded departure year
year_ind = ~(slave_voyage_logs['yeardep'].isnull())
slave_voyage_logs = slave_voyage_logs[year_ind]
In [18]:
# keep only voyages within the year range of the CLIWOC data
cliwoc_ind = ((slave_voyage_logs['yeardep'] > cliwoc_data['Year'].min()) &
              (slave_voyage_logs['yeardep'] < cliwoc_data['Year'].max()))
slave_voyage_logs = slave_voyage_logs[cliwoc_ind]
In [19]:
slave_voyage_desired_cols = list(slave_voyage_conversions.keys())
slave_voyage_logs = isolate_columns(slave_voyage_logs, slave_voyage_desired_cols)
slave_voyage_logs.rename(columns=slave_voyage_conversions, inplace=True)
#slave_voyage_logs.columns = ['Nationality', 'ShipType', 'VoyageFrom', 'VoyageTo', 'Year']
In [20]:
slave_voyage_logs['slave_logs'] = 3
# re-index the slave voyage logs to continue after the last cliwoc index
slave_voyage_indices = np.arange(len(slave_voyage_logs)) + (cliwoc_data.tail(1).index[0] + 1)
slave_voyage_logs = slave_voyage_logs.set_index(slave_voyage_indices)
In [21]:
all_data = pd.concat([cliwoc_data, slave_voyage_logs])
#all_data = cliwoc_data.append(slave_voyage_logs)
all_data = clean_data(all_data)
# cleanup
#del cliwoc_data, slave_voyage_logs
In [22]:
all_data.head()
Out[22]:
In [23]:
all_data_test = all_data.copy()
In [24]:
fuzz_columns = ['Nationality', 'ShipType', 'VoyageFrom', 'VoyageTo']
In [25]:
for col in fuzz_columns:
all_data = fuzzy_wuzzy_classification(all_data, col)
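fuzzy_wuzzy_classification merges near-duplicate spellings of the same category value. The underlying fuzzywuzzy similarity score illustrates why (toy strings, for illustration only):

# fuzz.ratio scores string similarity from 0 to 100; variant spellings
# of the same port or nationality score high and can be merged
print(fuzz.ratio('Amsterdam', 'Amsterdamm'))  # high score: likely the same port
print(fuzz.ratio('Amsterdam', 'Liverpool'))   # low score: different ports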
In [26]:
from sklearn.preprocessing import LabelEncoder
class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns  # list of column names to encode

    def fit(self, X, y=None):
        return self  # nothing to fit; encoders are created in transform

    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns are specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                if isinstance(X[col].iloc[0], str):
                    output[col] = LabelEncoder().fit_transform(output[col])
                else:
                    output[col] = X[col]
        else:
            for colname, col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
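A quick sanity check of the encoder on a toy frame (hypothetical data, just to show the interface):

toy = pd.DataFrame({'Nationality': ['British', 'Dutch', 'British'],
                    'Year': [1780, 1781, 1782]})
print(MultiColumnLabelEncoder(columns=['Nationality', 'Year']).fit_transform(toy))
# 'Nationality' is label-encoded to integers; 'Year' is already numeric,
# so it passes through unchanged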
In [27]:
if classifier_algorithm == "Decision Tree":
    all_data = MultiColumnLabelEncoder().fit_transform(all_data)
elif classifier_algorithm == "Naive Bayes":
    all_data = encode_data_df(all_data)
    all_data['no_data'] = all_data['nan'].apply(lambda x: x.any(), axis=1).astype(int)
    all_data = all_data.drop('nan', axis=1)
else:
    raise KeyError("Please enter a valid classifier algorithm "
                   "(Decision Tree or Naive Bayes)")
In [28]:
unclassified_logs = all_data[all_data['slave_logs']==0]
#unclassified_logs = unclassified_logs.drop('slave_logs', axis=1)
validation_set_1 = all_data[all_data['slave_logs']==2]
#validation_set_1 = validation_set_1.drop('slave_logs', axis=1)
# reserve first 20% of slave_voyage_logs as validation set
validation_set_2_indices = range(slave_voyage_indices.min(),
                                 slave_voyage_indices.min()
                                 + round(len(slave_voyage_indices) * 0.2))
validation_set_2 = all_data.iloc[validation_set_2_indices]
#validation_set_2 = validation_set_2.drop('slave_logs', axis=1)
training_logs_pos = all_data.drop(validation_set_2_indices)
training_logs_pos = training_logs_pos[training_logs_pos['slave_logs']==3]
#training_logs_pos = training_logs_pos.drop('slave_logs', axis=1)
# note! This relies on cliwoc data being first in all_data
# could make more robust later
training_logs_neg = all_data[all_data['slave_logs']==1]
#training_logs_neg = training_logs_neg.drop('slave_logs', axis=1)
# cleanup
#del all_data
In [29]:
def finding_null_values(df):
    """Return the per-column count of null values, for columns that have any."""
    null_counts = df.isnull().sum()
    return null_counts[null_counts > 0]
In [30]:
repeat_multiplier = round(len(training_logs_pos) / len(training_logs_neg))

# create list of classes for training data (0 is non-slave, 1 is slave)
# index matches training_data
classes = np.zeros(len(training_logs_neg)).repeat(repeat_multiplier)
classes = np.append(classes, np.ones(len(training_logs_pos)))

# join training data, repeating the negative examples to balance the classes
neg_rep = pd.concat([training_logs_neg] * repeat_multiplier)
training_data = pd.concat([neg_rep, training_logs_pos], ignore_index=True)

# convert to numpy array
columns = list(training_data.columns)
columns.remove('slave_logs')
training_data = training_data.as_matrix(columns)
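The repetition above balances the two classes before training. With hypothetical counts, the arithmetic looks like this:

# hypothetical counts, for illustration only
n_pos, n_neg = 30000, 500
multiplier = round(n_pos / n_neg)  # 60
print(multiplier * n_neg, 'repeated negative rows vs', n_pos, 'positive rows')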
In [31]:
if classifier_algorithm == "Decision Tree":
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(training_data[:, 1:], classes)
elif classifier_algorithm == "Naive Bayes":
    classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    classifier.fit(training_data[:, 1:], classes)
else:
    raise KeyError("Please enter a valid classifier algorithm "
                   "(Decision Tree or Naive Bayes)")
In [32]:
def validation_test(classifier, validation_set, expected_class):
    """
    Input classifier object, validation set (data frame), and expected class
    of validation set (i.e. 1 or 0). Prints successful classification rate.
    """
    columns = list(validation_set.columns)
    columns.remove('slave_logs')
    validation_set = validation_set.as_matrix(columns)
    predictions = classifier.predict(validation_set[:, 1:])
    counts = collections.Counter(predictions)
    percent_correct = counts[expected_class] / len(predictions) * 100
    print('Validation set was classified as', expected_class,
          round(percent_correct, 2), '% of the time')
In [33]:
def predict_class(classifier, data_subset):
    """
    Predict class of data, and append predictions to data frame.
    """
    try:
        # drop old predictions before reclassifying (if they exist)
        data_subset = data_subset.drop('predictions', axis=1)
    except (KeyError, ValueError):
        pass
    data_to_classify = data_subset.copy()

    # convert to numpy and classify
    columns = list(data_to_classify.columns)
    columns.remove('slave_logs')
    data_matrix = data_to_classify.as_matrix(columns)
    predictions = classifier.predict(data_matrix[:, 1:])

    # revalue slave_log ID column to indicate classification
    data_to_classify['slave_logs'] = predictions + 4

    # print statistics
    counts = collections.Counter(predictions)
    for key in counts:
        percent = counts[key] / len(predictions) * 100
        print(round(percent, 2), '% of data was classified as', key)

    return data_to_classify
In [34]:
print('Testing validation data from slave logs data set:')
validation_test(classifier, validation_set_2, 1)
print('Testing validation data from cliwoc data set:')
validation_test(classifier, validation_set_1, 1)
In [35]:
unclassified_logs = predict_class(classifier, unclassified_logs)
In [36]:
unclassified_logs.head()
Out[36]:
In [37]:
# export PDF with the decision tree (only meaningful when
# classifier is a DecisionTreeClassifier)
from sklearn.externals.six import StringIO
import pydot
dot_data = StringIO()
tree.export_graphviz(classifier, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("test.pdf")
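Note: recent pydot releases (1.2 and later) return a list from graph_from_dot_data, in which case the graph must be unpacked first, e.g. graph = pydot.graph_from_dot_data(dot_data.getvalue())[0].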
In [ ]: