In [1]:
#Ignore scikit learn deprication warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [2]:
import os
print(os.getcwd())
#os.chdir('../blocking/')
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
from numpy import genfromtxt


C:\Users\aparn\Desktop\cs638project\analysis

In [3]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import *
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *

In [4]:
t = pd.read_csv("../learning/test.csv", encoding="ISO-8859-1", index_col='_id')
#Automatically label the testing data
matches = 0;
nonmatches = 0;
labels = []
for index, row in t.iterrows():
    if row['rtable_scientific_name'].strip().lower() == row['ltable_scientific_name'].strip().lower():
        labels.append(1)
        matches = matches + 1
    else:
        labels.append(0)
        nonmatches = nonmatches + 1
        
t['label'] = labels

#cols_to_keep version 2
cols_to_keep = ['name_name_jac_qgm_3_qgm_3',
                'name_name_jac_dlm_dc0_dlm_dc0', 
                'countries_countries_jac_qgm_3_qgm_3',
                'countries_countries_cos_dlm_dc0_dlm_dc0',
                'countries_countries_jac_dlm_dc0_dlm_dc0', 
                'countries_countries_mel',
                'countries_countries_lev_dist', 
                'countries_countries_lev_sim',
                'countries_countries_nmw', 
                'countries_countries_sw',
                'country_count_country_count_exm', 
                'country_count_country_count_anm',
                'country_count_country_count_lev_dist',
                'country_count_country_count_lev_sim', 
                'status_match',
                'country_overlap', 
                'country_count_sim']

tr = t[cols_to_keep + ['label']]
tr.to_csv('../learning/test_reduced.csv')

In [5]:
# Awesome sauce.  Now let's fill in missing values rather than dropping them
# reload train
train = genfromtxt('../learning/train_reduced.csv',delimiter=',')
test = genfromtxt('../learning/test_reduced.csv',delimiter=',')
unlabeled_r = genfromtxt('../learning/unlabeled_reduced.csv',delimiter=',')


# get rid of first row (I think this is just column labels)
train = train[1:,:]
print('train', train.shape)

test = test[1:,:]
print('test', test.shape)


unlabeled_r = unlabeled_r[1:,:]
print('unlabeled', unlabeled_r.shape)



##Drop all nans:
train = train[~np.isnan(train).any(axis=1)]
test = test[~np.isnan(test).any(axis=1)]
unlabeled_r = unlabeled_r[~np.isnan(unlabeled_r).any(axis=1)]

print('test', test.shape)
print(test[6,:])


yTrue = test[:,test.shape[1]-1] # label
print(yTrue.shape)
print(yTrue[6])

Xtest = test[:,1:test.shape[1]-1]  # data
print(Xtest.shape)
print(Xtest[6])

y = train[:,train.shape[1]-1] # label
X = train[:,1:train.shape[1]-1]  # data

unlabeled_X = unlabeled_r[:,1:unlabeled_r.shape[1]]


train (279, 19)
test (120, 19)
unlabeled (2179, 18)
test (59, 19)
[  9.06326700e+06   1.79487179e-01   3.33333333e-01   0.00000000e+00
   0.00000000e+00   0.00000000e+00   3.53131313e-01   2.30000000e+01
   8.00000000e-02  -1.20000000e+01   2.00000000e+00   0.00000000e+00
   5.00000000e-01   1.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   1.00000000e+00   0.00000000e+00]
(59,)
0.0
(59, 17)
[  0.17948718   0.33333333   0.           0.           0.           0.35313131
  23.           0.08       -12.           2.           0.           0.5
   1.           0.           0.           0.           1.        ]

In [6]:
# Double check that model is good
from sklearn.grid_search import GridSearchCV


# SVC tuning
#tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
#clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10,
#                       scoring='f1')
clf = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
clf.fit(X,y)

# Get parameters from model
#params = clf.get_params()
#sv = clf.support_vectors
#nv = clf.n_support_
#a  = clf.dual_coef_
#b  = clf._intercept_
#cs = clf.classes_



yPred = clf.predict(Xtest)
prec, rec, f1, whatever = precision_recall_fscore_support(yTrue, yPred,average='weighted')
print('prec', prec)
print('rec', rec)
print('f1', f1)


prec 0.894292632307
rec 0.898305084746
f1 0.894741910632

In [7]:
#import gold reduced and train the classifier on the full thing
gold = genfromtxt('../learning/gold_reduced.csv',delimiter=',')


# get rid of first row (I think this is just column labels)
gold = gold[1:,:]

##Drop all nans:
gold = gold[~np.isnan(gold).any(axis=1)]

gold_y = gold[:,gold.shape[1]-1] # label
gold_X = gold[:,1:gold.shape[1]-1]  # data

In [8]:
unlabeled = em.read_csv_metadata("../learning/unlabeled.csv", encoding="ISO-8859-1", key='_id')
df = unlabeled.dropna(subset = ['_id', 'name_name_jac_qgm_3_qgm_3', 'name_name_jac_dlm_dc0_dlm_dc0',
       'countries_countries_jac_qgm_3_qgm_3',
       'countries_countries_cos_dlm_dc0_dlm_dc0',
       'countries_countries_jac_dlm_dc0_dlm_dc0', 'countries_countries_mel',
       'countries_countries_lev_dist', 'countries_countries_lev_sim',
       'countries_countries_nmw', 'countries_countries_sw',
       'country_count_country_count_exm', 'country_count_country_count_anm',
       'country_count_country_count_lev_dist',
       'country_count_country_count_lev_sim', 'status_match',
       'country_overlap', 'country_count_sim'])

#labeled.head()
#print(test[2])
#labeled_r.columns


Metadata file is not present in the given path; proceeding to read the csv file.

In [9]:
#train model on gold
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
clf.fit(gold_X,gold_y)

prediction = clf.predict(unlabeled_X)

In [10]:
#unlabeled['label'] = prediction
#unlabeled
#len(dropped)

In [11]:
add = pd.DataFrame(prediction, columns=['label'])
df['label'] = add
df = df[df.label != 0]

In [12]:
len(df)


Out[12]:
603

In [13]:
# Read in csv as dataframe
rl = em.read_csv_metadata("../finalRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows

ar = em.read_csv_metadata("../finalArkives.csv", encoding="ISO-8859-1", key='id')


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.

In [14]:
C = pd.read_csv('../learning/gold.csv', encoding="ISO-8859-1")

In [15]:
df = df.append(C, ignore_index=True)

In [16]:
df = df[df.label != 0]

In [17]:
len(df)


Out[17]:
664

In [18]:
cols_to_drop = ['id.1', 
                'Unnamed',
                'name_name_jac_qgm_3_qgm_3',
                'name_name_jac_dlm_dc0_dlm_dc0', 
                'countries_countries_jac_qgm_3_qgm_3',
                'countries_countries_cos_dlm_dc0_dlm_dc0',
                'countries_countries_jac_dlm_dc0_dlm_dc0', 
                'countries_countries_mel',
                'countries_countries_lev_dist', 
                'countries_countries_lev_sim',
                'countries_countries_nmw', 
                'countries_countries_sw',
                'country_count_country_count_exm', 
                'country_count_country_count_anm',
                'country_count_country_count_lev_dist',
                'country_count_country_count_lev_sim', 
                'status_match',
                'country_overlap', 
                'country_count_sim'
               ]

for n in df.columns:
    for c in cols_to_drop:
        if c in n:
            df = df.drop(n,1)
df.columns


Out[18]:
Index(['_id', 'ltable_id', 'rtable_id', 'ltable_name', 'ltable_genus',
       'ltable_family', 'ltable_ecology', 'ltable_countries',
       'ltable_threat_paragraph', 'ltable_conservation_paragraph',
       'ltable_pop_trend', 'ltable_status', 'ltable_country_count',
       'ltable_scientific_name', 'rtable_scientific_name', 'rtable_name',
       'rtable_kingdom', 'rtable_phylum', 'rtable_class', 'rtable_order',
       'rtable_family', 'rtable_genus', 'rtable_size', 'rtable_threats',
       'rtable_conservation', 'rtable_threat_keywords',
       'rtable_conservation_keywords', 'rtable_status', 'rtable_countries',
       'rtable_country_count', 'rtable_tCount', 'label'],
      dtype='object')

In [19]:
df.columns


Out[19]:
Index(['_id', 'ltable_id', 'rtable_id', 'ltable_name', 'ltable_genus',
       'ltable_family', 'ltable_ecology', 'ltable_countries',
       'ltable_threat_paragraph', 'ltable_conservation_paragraph',
       'ltable_pop_trend', 'ltable_status', 'ltable_country_count',
       'ltable_scientific_name', 'rtable_scientific_name', 'rtable_name',
       'rtable_kingdom', 'rtable_phylum', 'rtable_class', 'rtable_order',
       'rtable_family', 'rtable_genus', 'rtable_size', 'rtable_threats',
       'rtable_conservation', 'rtable_threat_keywords',
       'rtable_conservation_keywords', 'rtable_status', 'rtable_countries',
       'rtable_country_count', 'rtable_tCount', 'label'],
      dtype='object')

In [20]:
count = 0
for index, row in df.iterrows():
    ltable = str(row['ltable_scientific_name']).lower().strip()
    rtable = str(row['rtable_scientific_name']).lower().strip()

    if ltable == rtable:
        count = count + 1
    
    else:
        df = df.drop(index)
len(df)


Out[20]:
148

In [21]:
df.drop('ltable_scientific_name', axis=1, inplace=True)
#df.rename(columns={'ltable_scientific_name':'scientific_name'}, inplace=True)

In [22]:
df.head()


Out[22]:
_id ltable_id rtable_id ltable_name ltable_genus ltable_family ltable_ecology ltable_countries ltable_threat_paragraph ltable_conservation_paragraph ... rtable_size rtable_threats rtable_conservation rtable_threat_keywords rtable_conservation_keywords rtable_status rtable_countries rtable_country_count rtable_tCount label
7 1522478 2798 192 caterpillar slug Laevicaulis Veronicellidae NaN South Africa (KwaZulu-Natal); NaN NaN ... Extended length: up to 90 mm (2) The caterpillar slug is threatened by habitat loss and degradation as a result of ongoing urbani... Although there are currently no conservation measures directly targeting the caterpillar slug in... loss;environment; Endangered ['India', 'Russia', 'Malaysia', 'China', 'Indonesia'] 5 2 1.0
17 8689373 6618 1097 catalina mahogany Cercocarpus Rosaceae NaN United States (California); NaN NaN ... Height: 3 - 7 m (2)Trunk diameter: c. 20 cm (2) Historically a major threat to the Catalina mahogany was the introduction of <strong>herbivores<... Conservation efforts began in the 1970s with a detailed inventory of the remaining Catalina maho... loss;invasive; Critically Endangered ['Ukraine', 'Morocco', 'Russia', 'Hungary'] 4 2 1.0
21 7512846 1511 949 lorenz von liburnaus woolly lemur, western avahi, western woolly lemur Avahi Indriidae Terrestrial Madagascar; \r\r\r\r\n The major threat is forest destruction due to annual burning that creates new catt... ['\n This species is listed on Appendix I of CITES. ', <span lang="EN-CA">This species is kno... ... 700 â?? 900 g (2) 15 species of lemur have become extinct since sea-faring humans arrived on Madagascarâ??s shore... The western woolly lemur is confirmed in only two protected areas Ankarafantsika Nature Reserve ... hunting; protected; Critically Endangered ['Australia'] 1 1 1.0
23 1191527 4277 150 bluelegged mantella, tular golden frog, tular mantella, tulear golden frog Mantella Mantellidae Terrestrial; Freshwater Madagascar; \r\r\r\r\n The main threat to this species is habitat loss due to grazing and fire, and in so... ['\n It occurs in Parque Nacional de Isalo. Trade in this species needs to be very carefully ... ... 1 â?? 3 g (3) Several thousand blue-legged mantellas are thought to be collected every year from some regions ... Listing on Appendix II of the Convention on International Trade in Endangered Species provides t... loss; Endangered ['Taiwan', 'China', 'Vietnam'] 3 1 1.0
25 4646125 20 587 malagasy giant jumping rat, malagasy giant rat Hypogeomys Nesomyidae Terrestrial Madagascar; \r\r\r\r\n The historical decline of this species has been partly through climatic change lea... ['\n The new Menabe-Antimena protected area has temporary protection order and covers the ent... ... 1 â?? 1.5 kg (2) Like many of Madagascarâ??s unique species the Malagasy giant rat is thought to have become hig... This large rodent is in urgent need of conservation and its future remains highly uncertain. The... loss;pet; captive breeding;protected; Endangered ['Canada'] 1 2 1.0

5 rows × 31 columns


In [23]:
df.rename(columns={'rtable_scientific_name':'scientific_name'}, inplace=True)

In [ ]:


In [24]:
#sanity check to make sure genus matches in all instances. Looking for 148
count = 0
for index, row in df.iterrows():
    ltable = str(row['ltable_genus']).lower().strip()
    rtable = str(row['rtable_genus']).lower().strip()
    
    if ltable == rtable:
        count = count + 1
    
    #else:
        #df = df.drop(index)
print(count)


148

In [25]:
#yay. lets drop stuff
df.drop('ltable_genus', axis=1, inplace=True)
df.rename(columns={'rtable_genus':'genus'}, inplace=True)
df.drop('ltable_id', axis=1, inplace=True)
df.drop('rtable_id', axis=1, inplace=True)

In [26]:
df.columns


Out[26]:
Index(['_id', 'ltable_name', 'ltable_family', 'ltable_ecology',
       'ltable_countries', 'ltable_threat_paragraph',
       'ltable_conservation_paragraph', 'ltable_pop_trend', 'ltable_status',
       'ltable_country_count', 'scientific_name', 'rtable_name',
       'rtable_kingdom', 'rtable_phylum', 'rtable_class', 'rtable_order',
       'rtable_family', 'genus', 'rtable_size', 'rtable_threats',
       'rtable_conservation', 'rtable_threat_keywords',
       'rtable_conservation_keywords', 'rtable_status', 'rtable_countries',
       'rtable_country_count', 'rtable_tCount', 'label'],
      dtype='object')

In [27]:
df.to_csv('labeled.csv')

In [ ]: