In [16]:
import os
print(os.getcwd())
os.chdir('../blocking/')
In [17]:
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
In [3]:
# video series tutorial: https://www.youtube.com/watch?v=RlQuVL6-qe8&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=4
from sklearn.datasets import load_iris
iris = load_iris()
type(iris)
#print(iris.data) # each row is a flower, each column a feature
#print(iris.feature_names)
print(iris.target) # lists the label
# features should be numeric, as well as label
# should be in NumPy array
# features and response should have specific shapes
# observations by features
print(iris.data.shape)
# same length as the first dimension above (one response for each observation)
print(iris.target.shape)
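In [ ]:
# Quick sanity check of the requirements above: features and labels should be
# numeric NumPy arrays whose first dimensions agree.
assert isinstance(iris.data, np.ndarray) and isinstance(iris.target, np.ndarray)
assert np.issubdtype(iris.data.dtype, np.floating) # numeric features
assert iris.data.shape[0] == iris.target.shape[0] # one label per observation
print("input checks passed")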
In [4]:
X = iris.data
y = iris.target
In [5]:
# Begin machine learning
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
In [6]:
#print(knn)
knn.fit(X,y) # fit the training data
knn.predict([[3,5,4,2]]) # predict label of a new tuple (note: scikit-learn expects a 2D array)
Out[6]:
In [7]:
X_new = [[3,5,2,4],[3,6,2,3]] # new test data
knn.predict(X_new)
Out[7]:
In [8]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
# fit the model with the data
logreg.fit(X,y)
logreg.predict(X_new)
Out[8]:
In [9]:
# COMPARING MODELS
# evaluation procedure #1: train and test on the entire dataset (hint: don't do this)
yPredLog = logreg.predict(X)
len(yPredLog)
# compare predicted labels with true labels
from sklearn import metrics
print(metrics.accuracy_score(y,yPredLog)) # ~96% training accuracy
yPredKNN = knn.predict(X)
print(metrics.accuracy_score(y,yPredKNN)) # 100% training accuracy (k=1 memorizes the training data)
kn3n = KNeighborsClassifier(n_neighbors=3)
kn3n.fit(X,y) # fit the training data
yPredKN3N = kn3n.predict(X)
print(metrics.accuracy_score(y,yPredKN3N)) # ~96% training accuracy
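In [ ]:
# Why k=1 scores 100% on its own training data: each training point's nearest
# neighbor in X is itself (distance 0), so predict() just returns the stored label.
# A minimal check of that claim:
dist, ind = knn.kneighbors(X, n_neighbors=1)
print((dist == 0).all()) # every training point sits at distance 0 from its nearest neighbor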
In [10]:
# evaluation procedure #2: train/test split
### like minute 13 of https://www.youtube.com/watch?v=0pP4EwWJgIU&index=5&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A
from sklearn.model_selection import train_test_split # sklearn.cross_validation is deprecated/removed in newer versions
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=.4)
x_train # 60% in train set (no general rule to choose these values)
# the split is random, so repeating it gives different answers; this is close in spirit to
# cross-validation, which systematically rotates the held-out fold (leave-one-out CV is the
# extreme case that holds out one tuple at a time)
print(x_train.shape)
print(x_test.shape)
In [11]:
# fit train data
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
yPred_logCVish = logreg.predict(x_test)
print(metrics.accuracy_score(y_test,yPred_logCVish))
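In [ ]:
# The split is random, so the accuracy above moves around from run to run; a small
# sketch of that variance using different seeds:
for seed in range(5):
    x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=.4, random_state=seed)
    acc = metrics.accuracy_score(y_te, LogisticRegression().fit(x_tr, y_tr).predict(x_te))
    print(seed, acc)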
In [12]:
###### evaluation procedure #3: CV (repeated train/test split); https://www.youtube.com/watch?v=6dbrR-WymjI&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=7
# This is the method we'll be using for stage 4.
### Drawback of train/test procedure
# Accuracy changes a lot depending on which observations fall into test/train sets
### How CV overcomes
# splits the data into K folds and rotates which fold is held out, so every observation gets tested exactly once
### How to use
# k can be any number, but k=10 is generally recommended
# use stratified sampling
# - each response class should be represented with equal proportions in each of the K folds
# - scikit-learn implements this automatically
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())
# search for an optimal value of K for KNN
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)
In [13]:
import matplotlib.pyplot as plt
%matplotlib inline
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[13]:
In [14]:
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
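In [ ]:
# The manual K search above can also be done with GridSearchCV, which runs the same
# 10-fold CV over a parameter grid and tracks the best setting; a minimal sketch:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': list(range(1, 31))},
                    cv=10, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)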
In [ ]:
## USE THE CROSS-VALIDATION WORKFLOW ABOVE FOR CLASSIFYING OUR DATA
In [26]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
# decision tree
from sklearn.tree import DecisionTreeClassifier
print(cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='accuracy').mean())
# random forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, max_depth=10)
print(cross_val_score(forest, X, y, cv=10, scoring='accuracy').mean())
# support vector machine
from sklearn.svm import LinearSVC
svm = LinearSVC()
print(cross_val_score(svm, X, y, cv=10, scoring='accuracy').mean())
# naive bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
print(cross_val_score(gnb, X, y, cv=10, scoring='accuracy').mean())
# KNN
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
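In [ ]:
# Same comparison as above in one loop, also printing the standard deviation across
# folds, since two means within a standard deviation of each other are hard to rank:
models = [('logreg', LogisticRegression()),
          ('forest', RandomForestClassifier(n_estimators=10, max_depth=10)),
          ('svm', LinearSVC()),
          ('nb', GaussianNB()),
          ('knn', KNeighborsClassifier(n_neighbors=20))]
for name, model in models:
    s = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(name, s.mean(), s.std())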
In [48]:
from numpy import genfromtxt
#myData = genfromtxt('train.csv',delimiter=',')
#myData
df = pd.read_csv('train_reduced.csv', sep=',')
df2 = df.dropna()
df2.head()
Out[48]:
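In [ ]:
# Sketch of plugging df2 into the CV workflow above. 'label' is a placeholder for
# whatever the gold column in train_reduced.csv is actually called, so the block is
# guarded rather than assumed to run as-is.
label_col = 'label' # placeholder: replace with the real label column name
if label_col in df2.columns:
    X_ours = df2.drop(label_col, axis=1).select_dtypes(include=[np.number]).values
    y_ours = df2[label_col].values
    print(cross_val_score(LogisticRegression(), X_ours, y_ours, cv=10, scoring='accuracy').mean())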
In [ ]:
# BELOW THIS IS SCRATCH CODE (kept only for grabbing segments)
In [ ]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RF', random_state=0)
nb = em.NBMatcher(name='NB')
lg = em.LogRegMatcher(name='LogReg')
ln = em.LinRegMatcher(name='LinReg')
# select the best ML matcher using CV (note: K, the labeled feature-vector table, is created elsewhere)
result = em.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
                           exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                           target_attr='gold', metric='precision', random_state=0)
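In [ ]:
# Rough sketch of the step after select_matcher: train the winning matcher (say rf)
# on a train split of K and score it on the held-out split. Assumes K carries the
# same '_id'/'ltable.id'/'rtable.id'/'gold' columns used above.
IJ = em.split_train_test(K, train_proportion=0.7, random_state=0)
I, J = IJ['train'], IJ['test']
rf.fit(table=I, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], target_attr='gold')
P = rf.predict(table=J, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
               target_attr='predicted', append=True, inplace=False)
eval_result = em.eval_matches(P, 'gold', 'predicted')
em.print_eval_summary(eval_result)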
In [ ]:
### BEGIN PREVIOUS BLOCKING CODE (kept here just for grabbing code segments)
#Read in Redlist
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')
# Read in csv as dataframe
rl = em.read_csv_metadata("finalRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
rl.head()
In [ ]:
# CME
em.get_key(rl) # get key
# Note: the id column of the redlist csv doesn't match what appears in the dataframe (rl):
# in the csv, id values jump around because they are row numbers from the original scraped
# data csv. Apparently the library can take such a column as the key and enumerate rows in order.
In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
ar = em.read_csv_metadata("finalArkive.csv", encoding="ISO-8859-1", key="id")
# glance at first few rows
ar.head()
In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # count rows without a parenthetical, as a sanity check
# remove parenthesized contents from common_name
for index, row in ar.iterrows():
    name = str(row['common_name'])
    pstart = name.find("(")
    if pstart != -1:
        # keep only the text before the first parenthesis
        noparens = name[0:pstart]
        ar.loc[index, 'common_name'] = noparens
    else:
        error = error + 1
print(error)
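In [ ]:
# Vectorized equivalent of the loop above (a no-op at this point, since the loop
# already ran): keep only the text before the first '('.
ar['common_name'] = ar['common_name'].astype(str).str.split('(').str[0]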
In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom
# build a combined name field: normalized scientific_name, plus common_name when it adds anything new
nameColumn = []
for index, row in ar.iterrows():
    nickname = str(row['scientific_name']) # look here if errors start happening
    nickname = re.sub(r'[^A-Za-z0-9;,\s]+', '', nickname).lower()
    common = str(row['common_name'])
    common = re.sub(r'[^A-Za-z0-9;,\s]+', '', common).lower()
    if common in nickname:
        nameColumn.append(nickname)
    else:
        nameColumn.append(nickname + "; " + common)
ar['name'] = nameColumn
ar.head()
In [ ]:
# Lowercase redlist name field
# author: @andrewedstrom
# normalize: strip unusual characters and lowercase the name field
for index, row in rl.iterrows():
    n = str(row['name'])
    n = re.sub(r'[^A-Za-z0-9;,\s]+', '', n).lower()
    rl.loc[index, 'name'] = n
rl.head()
In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # count rows without a parenthetical, as a sanity check
# remove parenthesized contents from genus
for index, row in ar.iterrows():
    genus = str(row['genus'])
    pstart = genus.find("(")
    if pstart != -1:
        # keep only the text before the first parenthesis
        noparens = genus[0:pstart]
        ar.loc[index, 'genus'] = noparens
    else:
        error = error + 1
print(error)
In [ ]:
#rl.to_csv('finalRedlist.csv')
#ar.to_csv('finalArkive.csv')
In [ ]:
# Rule-based blocking over input tables
# first get features that can be used
feature_table = em.get_features_for_blocking(rl, ar)
In [ ]:
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']
In [ ]:
# CME -- I'm realizing now that we may want to start with the attribute equivalence
# blocker for names and see where that leaves us. On my TO-DO.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(rl, ar, 'name', 'name',
                         l_output_attrs=['name', 'genus'],
                         r_output_attrs=['name', 'genus'])
C0
# surprisingly, this yields zero candidate pairs -- exact equality on name is too strict
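In [ ]:
# Diagnosing the zero matches above: count how many cleaned names are exactly equal
# across the two tables. If this prints 0, equality blocking on name cannot work.
print(len(set(rl['name']) & set(ar['name'])))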
In [ ]:
# Do overlap blocking on name to get a candidate set of pairs
ob = em.OverlapBlocker()
C0 = ob.block_tables(rl, ar, 'name', 'name',
                     l_output_attrs=['name', 'genus', 'family'],
                     r_output_attrs=['name', 'genus', 'family'], overlap_size=2)
C0
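In [ ]:
# Worth checking that blocking isn't dropping true matches. py_entitymatching's
# debug_blocker reports likely-matching pairs that are NOT in the candidate set:
dbg = em.debug_blocker(C0, rl, ar, output_size=50)
dbg.head()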
In [ ]:
#Only keep candidate pairs which share genus
ab = em.AttrEquivalenceBlocker()
C1 = ab.block_candset(C0, 'genus', 'genus')
C1
#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
#C1 = ab.block_tables(rl, ar,'genus', 'genus',
# l_output_attrs=['name', 'genus'],
# r_output_attrs=['name', 'genus'])
In [ ]:
C2 = equivB.block_tables(rl, ar, 'family', 'family',
                         l_output_attrs=['name', 'genus', 'family'],
                         r_output_attrs=['name', 'genus', 'family'])
C3 = ab.block_candset(C2, 'genus', 'genus')
C3
In [ ]:
C = em.combine_blocker_outputs_via_union([C1, C3])
C
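In [ ]:
# Next step toward the matcher cells above: sample the unioned candidate set and
# label it by hand to create the 'gold' column, using the library's helpers.
S = em.sample_table(C, 450)
# G = em.label_table(S, label_column_name='gold') # opens an interactive labeling GUI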
In [ ]:
em.to_csv_metadata(C, './candidate_set.csv')
In [ ]:
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)
In [ ]:
C