In [1]:
# Silence all warnings (e.g. scikit-learn deprecation notices) for this
# notebook by replacing warnings.warn with a no-op.
import warnings

def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn; swallows every warning."""
    return None

warnings.warn = warn
In [2]:
import os
print(os.getcwd())
#os.chdir('../blocking/')
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
from numpy import genfromtxt
In [3]:
# Classifier and evaluation imports.
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Explicit imports instead of wildcards so it is clear where names come
# from (only SVC and precision_recall_fscore_support are used below).
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support
In [4]:
# Load the blocked test pairs and derive binary labels automatically:
# a pair is a match (1) when the two scientific names agree after
# trimming whitespace and lowercasing.
t = pd.read_csv("../learning/test.csv", encoding="ISO-8859-1", index_col='_id')
# Vectorized replacement for the old per-row iterrows loop; also tolerant
# of missing names (NaN compares as non-match instead of raising).
left_names = t['ltable_scientific_name'].str.strip().str.lower()
right_names = t['rtable_scientific_name'].str.strip().str.lower()
t['label'] = (left_names == right_names).astype(int)
matches = int(t['label'].sum())
nonmatches = len(t) - matches
#cols_to_keep version 2 -- the similarity features used downstream
cols_to_keep = ['name_name_jac_qgm_3_qgm_3',
                'name_name_jac_dlm_dc0_dlm_dc0',
                'countries_countries_jac_qgm_3_qgm_3',
                'countries_countries_cos_dlm_dc0_dlm_dc0',
                'countries_countries_jac_dlm_dc0_dlm_dc0',
                'countries_countries_mel',
                'countries_countries_lev_dist',
                'countries_countries_lev_sim',
                'countries_countries_nmw',
                'countries_countries_sw',
                'country_count_country_count_exm',
                'country_count_country_count_anm',
                'country_count_country_count_lev_dist',
                'country_count_country_count_lev_sim',
                'status_match',
                'country_overlap',
                'country_count_sim']
# Persist the reduced feature matrix (features + label) for the learners.
tr = t[cols_to_keep + ['label']]
tr.to_csv('../learning/test_reduced.csv')
In [5]:
# Reload the reduced train/test/unlabeled feature matrices as raw numpy
# arrays (missing values will be filtered below rather than imputed).
train = genfromtxt('../learning/train_reduced.csv', delimiter=',')
test = genfromtxt('../learning/test_reduced.csv', delimiter=',')
unlabeled_r = genfromtxt('../learning/unlabeled_reduced.csv', delimiter=',')

# Strip the header row — genfromtxt parses the column labels as NaNs.
train = train[1:]
print('train', train.shape)
test = test[1:]
print('test', test.shape)
unlabeled_r = unlabeled_r[1:]
print('unlabeled', unlabeled_r.shape)

# Drop every row that contains a NaN (any missing feature value).
train = train[~np.isnan(train).any(axis=1)]
test = test[~np.isnan(test).any(axis=1)]
unlabeled_r = unlabeled_r[~np.isnan(unlabeled_r).any(axis=1)]
print('test', test.shape)
print(test[6, :])

# Layout: column 0 is the row id, the last column is the label, and
# everything in between is the feature vector.
yTrue = test[:, -1]   # test labels
print(yTrue.shape)
print(yTrue[6])
Xtest = test[:, 1:-1]   # test features
print(Xtest.shape)
print(Xtest[6])
y = train[:, -1]        # train labels
X = train[:, 1:-1]      # train features
unlabeled_X = unlabeled_r[:, 1:]   # unlabeled has no label column
In [6]:
# Double check that the tuned model is good on the held-out test set.
# NOTE: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
# SVC tuning grid kept for reference — the best parameters found were
# kernel='rbf', C=1000, gamma=0.001 (hard-coded below):
#tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                     'C': [1, 10, 100, 1000]},
#                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
#clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=10, scoring='f1')
# decision_function_shape=None is no longer accepted by modern
# scikit-learn; 'ovr' is the default and has no effect on binary predict.
clf = SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma=0.001,
          kernel='rbf', max_iter=-1, probability=False, random_state=None,
          shrinking=True, tol=0.001, verbose=False)
clf.fit(X, y)
yPred = clf.predict(Xtest)
# Weighted-average precision/recall/F1 across both classes.
prec, rec, f1, support = precision_recall_fscore_support(yTrue, yPred, average='weighted')
print('prec', prec)
print('rec', rec)
print('f1', f1)
In [7]:
# Import the reduced gold-standard matrix so the final classifier can be
# trained on the full hand-labeled set.
gold = genfromtxt('../learning/gold_reduced.csv', delimiter=',')
gold = gold[1:]                              # strip header row (parsed as NaNs)
gold = gold[~np.isnan(gold).any(axis=1)]     # drop rows with missing values
gold_y = gold[:, -1]     # last column is the label
gold_X = gold[:, 1:-1]   # features: everything between id and label
In [8]:
# Load the full unlabeled candidate set (with metadata columns) and drop
# any row missing the id or a similarity feature — this mirrors the NaN
# filtering applied to the numeric matrix loaded earlier.
unlabeled = em.read_csv_metadata("../learning/unlabeled.csv", encoding="ISO-8859-1", key='_id')
required_cols = ['_id',
                 'name_name_jac_qgm_3_qgm_3',
                 'name_name_jac_dlm_dc0_dlm_dc0',
                 'countries_countries_jac_qgm_3_qgm_3',
                 'countries_countries_cos_dlm_dc0_dlm_dc0',
                 'countries_countries_jac_dlm_dc0_dlm_dc0',
                 'countries_countries_mel',
                 'countries_countries_lev_dist',
                 'countries_countries_lev_sim',
                 'countries_countries_nmw',
                 'countries_countries_sw',
                 'country_count_country_count_exm',
                 'country_count_country_count_anm',
                 'country_count_country_count_lev_dist',
                 'country_count_country_count_lev_sim',
                 'status_match',
                 'country_overlap',
                 'country_count_sim']
df = unlabeled.dropna(subset=required_cols)
In [9]:
# Train the final model on the full gold standard, then label the
# unlabeled candidates.
# BUG FIX: the linear-kernel SVC below was previously constructed but
# never assigned, so clf.fit() silently reused the earlier rbf(C=1000)
# model instead of the intended linear(C=100) one.
clf = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma='auto',
          kernel='linear', max_iter=-1, probability=False,
          random_state=None, shrinking=True, tol=0.001, verbose=False)
clf.fit(gold_X, gold_y)
prediction = clf.predict(unlabeled_X)
In [10]:
#unlabeled['label'] = prediction
#unlabeled
#len(dropped)
In [11]:
# Attach predicted labels and keep only predicted matches.
# BUG FIX: the old code built a fresh RangeIndex DataFrame from
# `prediction` and assigned it as a column — pandas aligns that on df's
# original (dropna-gapped) index, scrambling/NaN-ing the labels.  Assign
# the raw prediction array positionally instead.
# NOTE(review): assumes unlabeled_reduced.csv rows line up one-to-one
# with df after both NaN drops — verify upstream.
df = df.copy()
df['label'] = prediction
df = df[df.label != 0]
In [12]:
# Number of predicted matching pairs that survive the filter.
len(df)
Out[12]:
In [13]:
# Read the two original source tables as py_entitymatching frames.
# NOTE(review): rl and ar are not referenced anywhere later in this
# notebook — confirm whether these loads are still needed.
rl = em.read_csv_metadata("../finalRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
ar = em.read_csv_metadata("../finalArkives.csv", encoding="ISO-8859-1", key='id')
In [14]:
# Hand-labeled gold pairs (full column set) to merge with the predictions.
C = pd.read_csv('../learning/gold.csv', encoding="ISO-8859-1")
In [15]:
# Merge the model-labeled pairs with the hand-labeled gold pairs.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
df = pd.concat([df, C], ignore_index=True)
In [16]:
# Drop gold rows labeled as non-matches (label == 0) after the merge.
df = df.loc[df['label'] != 0]
In [17]:
len(df)
Out[17]:
In [18]:
# Substring patterns identifying the similarity-feature columns to
# remove now that matching is done ('Unnamed' also catches index columns
# created by CSV round-trips; substring match covers ltable_/rtable_
# prefixed variants).
cols_to_drop = ['id.1',
                'Unnamed',
                'name_name_jac_qgm_3_qgm_3',
                'name_name_jac_dlm_dc0_dlm_dc0',
                'countries_countries_jac_qgm_3_qgm_3',
                'countries_countries_cos_dlm_dc0_dlm_dc0',
                'countries_countries_jac_dlm_dc0_dlm_dc0',
                'countries_countries_mel',
                'countries_countries_lev_dist',
                'countries_countries_lev_sim',
                'countries_countries_nmw',
                'countries_countries_sw',
                'country_count_country_count_exm',
                'country_count_country_count_anm',
                'country_count_country_count_lev_dist',
                'country_count_country_count_lev_sim',
                'status_match',
                'country_overlap',
                'country_count_sim'
                ]
# Collect every matching column, then drop them in one pass.
# FIX: the old per-column df.drop(n, 1) used the positional-axis form
# removed in pandas 2.0, and could try to drop the same column twice
# when it matched two patterns.
drop_cols = [n for n in df.columns if any(c in n for c in cols_to_drop)]
df = df.drop(columns=drop_cols)
df.columns
Out[18]:
In [19]:
# Show the columns remaining after the feature-column drop.
df.columns
Out[19]:
In [20]:
# Keep only pairs whose scientific names agree exactly (case- and
# whitespace-insensitive); `count` records how many survive.
# Vectorized replacement for the old iterrows loop that dropped rows
# one at a time inside the iteration.
lnames = df['ltable_scientific_name'].astype(str).str.lower().str.strip()
rnames = df['rtable_scientific_name'].astype(str).str.lower().str.strip()
name_match = lnames == rnames
count = int(name_match.sum())
df = df[name_match]
len(df)
Out[20]:
In [21]:
# The two name columns are now identical, so drop the left copy.
# (Non-inplace reassignment: equivalent here and avoids the pandas
# inplace anti-pattern.)
df = df.drop(columns='ltable_scientific_name')
In [22]:
# Peek at the merged, name-filtered table.
df.head()
Out[22]:
In [23]:
# Keep the right-table name as the canonical scientific_name column
# (non-inplace reassignment; equivalent for this notebook).
df = df.rename(columns={'rtable_scientific_name': 'scientific_name'})
In [ ]:
In [24]:
# Sanity check: the genus should match in every remaining row — the
# printed count is expected to equal len(df) (148 at time of writing).
# Vectorized replacement for the old iterrows loop; no rows are dropped
# here (the commented-out drop in the original was intentionally off).
genus_match = (df['ltable_genus'].astype(str).str.lower().str.strip()
               == df['rtable_genus'].astype(str).str.lower().str.strip())
count = int(genus_match.sum())
print(count)
In [25]:
# Genus matches everywhere, so collapse the duplicated genus columns and
# drop the per-table ids that are meaningless after the merge.
# Single chained, non-inplace expression instead of four inplace calls.
df = (df.drop(columns=['ltable_genus', 'ltable_id', 'rtable_id'])
        .rename(columns={'rtable_genus': 'genus'}))
In [26]:
# Final column set before export.
df.columns
Out[26]:
In [27]:
# Write the final labeled table.  NOTE(review): the index is written as
# an unnamed first column — pass index=False if that is unwanted.
df.to_csv('labeled.csv')
In [ ]: