In [1]:
# Silence all warnings (e.g. scikit-learn deprecation notices) for this
# notebook by replacing warnings.warn with a no-op.
import warnings

def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn; swallows every warning."""
    return None

warnings.warn = warn
In [2]:
import os
print(os.getcwd())
#os.chdir('../blocking/')
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
from numpy import genfromtxt
In [3]:
# Classifier and evaluation imports.
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Explicit imports instead of wildcards so it is clear where names come
# from (only SVC and precision_recall_fscore_support are used below).
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support
In [4]:
# Load the blocked test pairs and derive binary labels automatically:
# a pair is a match (1) when the two scientific names agree after
# trimming whitespace and lowercasing.
t = pd.read_csv("../learning/test.csv", encoding="ISO-8859-1", index_col='_id')
# Vectorized replacement for the old per-row iterrows loop; also tolerant
# of missing names (NaN compares as non-match instead of raising).
left_names = t['ltable_scientific_name'].str.strip().str.lower()
right_names = t['rtable_scientific_name'].str.strip().str.lower()
t['label'] = (left_names == right_names).astype(int)
matches = int(t['label'].sum())
nonmatches = len(t) - matches
#cols_to_keep version 2 -- the similarity features used downstream
cols_to_keep = ['name_name_jac_qgm_3_qgm_3',
                'name_name_jac_dlm_dc0_dlm_dc0',
                'countries_countries_jac_qgm_3_qgm_3',
                'countries_countries_cos_dlm_dc0_dlm_dc0',
                'countries_countries_jac_dlm_dc0_dlm_dc0',
                'countries_countries_mel',
                'countries_countries_lev_dist',
                'countries_countries_lev_sim',
                'countries_countries_nmw',
                'countries_countries_sw',
                'country_count_country_count_exm',
                'country_count_country_count_anm',
                'country_count_country_count_lev_dist',
                'country_count_country_count_lev_sim',
                'status_match',
                'country_overlap',
                'country_count_sim']
# Persist the reduced feature matrix (features + label) for the learners.
tr = t[cols_to_keep + ['label']]
tr.to_csv('../learning/test_reduced.csv')
In [5]:
# Reload the reduced train/test/unlabeled feature matrices as raw numpy
# arrays (missing values will be filtered below rather than imputed).
train = genfromtxt('../learning/train_reduced.csv', delimiter=',')
test = genfromtxt('../learning/test_reduced.csv', delimiter=',')
unlabeled_r = genfromtxt('../learning/unlabeled_reduced.csv', delimiter=',')

# Strip the header row — genfromtxt parses the column labels as NaNs.
train = train[1:]
print('train', train.shape)
test = test[1:]
print('test', test.shape)
unlabeled_r = unlabeled_r[1:]
print('unlabeled', unlabeled_r.shape)

# Drop every row that contains a NaN (any missing feature value).
train = train[~np.isnan(train).any(axis=1)]
test = test[~np.isnan(test).any(axis=1)]
unlabeled_r = unlabeled_r[~np.isnan(unlabeled_r).any(axis=1)]
print('test', test.shape)
print(test[6, :])

# Layout: column 0 is the row id, the last column is the label, and
# everything in between is the feature vector.
yTrue = test[:, -1]   # test labels
print(yTrue.shape)
print(yTrue[6])
Xtest = test[:, 1:-1]   # test features
print(Xtest.shape)
print(Xtest[6])
y = train[:, -1]        # train labels
X = train[:, 1:-1]      # train features
unlabeled_X = unlabeled_r[:, 1:]   # unlabeled has no label column
In [6]:
# Double check that the tuned model is good on the held-out test set.
# NOTE: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
# SVC tuning grid kept for reference — the best parameters found were
# kernel='rbf', C=1000, gamma=0.001 (hard-coded below):
#tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                     'C': [1, 10, 100, 1000]},
#                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
#clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10, scoring='f1')
# decision_function_shape=None is no longer accepted by modern
# scikit-learn; 'ovr' is the default and has no effect on binary predict.
clf = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma=0.001,
          kernel='rbf', max_iter=-1, probability=False, random_state=None,
          shrinking=True, tol=0.001, verbose=False)
clf.fit(X, y)
yPred = clf.predict(Xtest)
# Weighted-average precision/recall/F1 across both classes.
prec, rec, f1, support = precision_recall_fscore_support(yTrue, yPred, average='weighted')
print('prec', prec)
print('rec', rec)
print('f1', f1)
In [7]:
# Import the reduced gold-standard matrix so the final classifier can be
# trained on the full hand-labeled set.
gold = genfromtxt('../learning/gold_reduced.csv', delimiter=',')
gold = gold[1:]                              # strip header row (parsed as NaNs)
gold = gold[~np.isnan(gold).any(axis=1)]     # drop rows with missing values
gold_y = gold[:, -1]     # last column is the label
gold_X = gold[:, 1:-1]   # features: everything between id and label
In [8]:
# Load the full unlabeled candidate set (with metadata columns) and drop
# any row missing the id or a similarity feature — this mirrors the NaN
# filtering applied to the numeric matrix loaded earlier.
unlabeled = em.read_csv_metadata("../learning/unlabeled.csv", encoding="ISO-8859-1", key='_id')
required_cols = ['_id',
                 'name_name_jac_qgm_3_qgm_3',
                 'name_name_jac_dlm_dc0_dlm_dc0',
                 'countries_countries_jac_qgm_3_qgm_3',
                 'countries_countries_cos_dlm_dc0_dlm_dc0',
                 'countries_countries_jac_dlm_dc0_dlm_dc0',
                 'countries_countries_mel',
                 'countries_countries_lev_dist',
                 'countries_countries_lev_sim',
                 'countries_countries_nmw',
                 'countries_countries_sw',
                 'country_count_country_count_exm',
                 'country_count_country_count_anm',
                 'country_count_country_count_lev_dist',
                 'country_count_country_count_lev_sim',
                 'status_match',
                 'country_overlap',
                 'country_count_sim']
df = unlabeled.dropna(subset=required_cols)
In [9]:
# Train the final model on the full gold standard, then label the
# unlabeled candidates.
# BUG FIX: the linear-kernel SVC below was previously constructed but
# never assigned, so clf.fit() silently reused the earlier rbf(C=1000)
# model instead of the intended linear(C=100) one.
clf = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma='auto',
          kernel='linear', max_iter=-1, probability=False,
          random_state=None, shrinking=True, tol=0.001, verbose=False)
clf.fit(gold_X, gold_y)
prediction = clf.predict(unlabeled_X)
In [10]:
#unlabeled['label'] = prediction
#unlabeled
#len(dropped)
In [11]:
# Attach predicted labels and keep only predicted matches.
# BUG FIX: the old code built a fresh RangeIndex DataFrame from
# `prediction` and assigned it as a column — pandas aligns that on df's
# original (dropna-gapped) index, scrambling/NaN-ing the labels.  Assign
# the raw prediction array positionally instead.
# NOTE(review): assumes unlabeled_reduced.csv rows line up one-to-one
# with df after both NaN drops — verify upstream.
df = df.copy()
df['label'] = prediction
df = df[df.label != 0]
In [12]:
# Number of predicted matching pairs that survive the filter.
len(df)
Out[12]:
In [13]:
# Read the two original source tables as py_entitymatching frames.
# NOTE(review): rl and ar are not referenced anywhere later in this
# notebook — confirm whether these loads are still needed.
rl = em.read_csv_metadata("../finalRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
ar = em.read_csv_metadata("../finalArkives.csv", encoding="ISO-8859-1", key='id')
In [14]:
# Hand-labeled gold pairs (full column set) to merge with the predictions.
C = pd.read_csv('../learning/gold.csv', encoding="ISO-8859-1")
In [15]:
# Merge the model-labeled pairs with the hand-labeled gold pairs.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
df = pd.concat([df, C], ignore_index=True)
In [16]:
# Drop gold rows labeled as non-matches (label == 0) after the merge.
df = df.loc[df['label'] != 0]
In [17]:
len(df)
Out[17]:
In [18]:
# Substring patterns identifying the similarity-feature columns to
# remove now that matching is done ('Unnamed' also catches index columns
# created by CSV round-trips; substring match covers ltable_/rtable_
# prefixed variants).
cols_to_drop = ['id.1',
                'Unnamed',
                'name_name_jac_qgm_3_qgm_3',
                'name_name_jac_dlm_dc0_dlm_dc0',
                'countries_countries_jac_qgm_3_qgm_3',
                'countries_countries_cos_dlm_dc0_dlm_dc0',
                'countries_countries_jac_dlm_dc0_dlm_dc0',
                'countries_countries_mel',
                'countries_countries_lev_dist',
                'countries_countries_lev_sim',
                'countries_countries_nmw',
                'countries_countries_sw',
                'country_count_country_count_exm',
                'country_count_country_count_anm',
                'country_count_country_count_lev_dist',
                'country_count_country_count_lev_sim',
                'status_match',
                'country_overlap',
                'country_count_sim'
                ]
# Collect every matching column, then drop them in one pass.
# FIX: the old per-column df.drop(n, 1) used the positional-axis form
# removed in pandas 2.0, and could try to drop the same column twice
# when it matched two patterns.
drop_cols = [n for n in df.columns if any(c in n for c in cols_to_drop)]
df = df.drop(columns=drop_cols)
df.columns
Out[18]:
In [19]:
# Show the columns remaining after the feature-column drop.
df.columns
Out[19]:
In [20]:
# Keep only pairs whose scientific names agree exactly (case- and
# whitespace-insensitive); `count` records how many survive.
# Vectorized replacement for the old iterrows loop that dropped rows
# one at a time inside the iteration.
lnames = df['ltable_scientific_name'].astype(str).str.lower().str.strip()
rnames = df['rtable_scientific_name'].astype(str).str.lower().str.strip()
name_match = lnames == rnames
count = int(name_match.sum())
df = df[name_match]
len(df)
Out[20]:
In [21]:
# The two name columns are now identical, so drop the left copy.
# (Non-inplace reassignment: equivalent here and avoids the pandas
# inplace anti-pattern.)
df = df.drop(columns='ltable_scientific_name')
In [22]:
# Peek at the merged, name-filtered table.
df.head()
Out[22]:
In [23]:
# Keep the right-table name as the canonical scientific_name column
# (non-inplace reassignment; equivalent for this notebook).
df = df.rename(columns={'rtable_scientific_name': 'scientific_name'})
In [ ]:
In [24]:
# Sanity check: the genus should match in every remaining row — the
# printed count is expected to equal len(df) (148 at time of writing).
# Vectorized replacement for the old iterrows loop; no rows are dropped
# here (the commented-out drop in the original was intentionally off).
genus_match = (df['ltable_genus'].astype(str).str.lower().str.strip()
               == df['rtable_genus'].astype(str).str.lower().str.strip())
count = int(genus_match.sum())
print(count)
In [25]:
# Genus matches everywhere, so collapse the duplicated genus columns and
# drop the per-table ids that are meaningless after the merge.
# Single chained, non-inplace expression instead of four inplace calls.
df = (df.drop(columns=['ltable_genus', 'ltable_id', 'rtable_id'])
        .rename(columns={'rtable_genus': 'genus'}))
In [26]:
# Final column set before export.
df.columns
Out[26]:
In [27]:
# Write the final labeled table.  NOTE(review): the index is written as
# an unnamed first column — pass index=False if that is unwanted.
df.to_csv('labeled.csv')
In [ ]: