In [16]:
import os
print(os.getcwd())
os.chdir('../blocking/')


/Users/tammi/Desktop/CS 638 Stuffs/cs638project/blocking

In [17]:
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re

In [3]:
# video series tutorial: https://www.youtube.com/watch?v=RlQuVL6-qe8&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=4

from sklearn.datasets import load_iris
iris = load_iris()
type(iris)
#print(iris.data) # each row is a flower, each column a feature

#print(iris.feature_names)
print(iris.target) # lists the label

# features should be numeric, as well as label
# should be in NumPy array

# features and response should have specific shapes

# observations by features
print(iris.data.shape)

# same magnitude as first dim of above (one response for each observation)
print(iris.target.shape)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
(150, 4)
(150,)

In [4]:
X = iris.data
y = iris.target

In [5]:
# Begin machine learning

# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)

In [6]:
#print(knn)
knn.fit(X,y) # fit the training data
knn.predict([3,5,4,2]) # predict label of new tuples


/Users/tammi/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[6]:
array([2])

In [7]:
X_new = [[3,5,2,4],[3,6,2,3]] # new test data
knn.predict(X_new)


Out[7]:
array([0, 0])

In [8]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

# fit the model with the data
logreg.fit(X,y) #
logreg.predict(X_new)


Out[8]:
array([2, 0])

In [9]:
# COMPARING SHIT
# evaluation procedure #1: train and test on the entire dataset (hint: don't do this)
yPredLog = logreg.predict(X)
len(yPredLog)

# compare predicted labels with true labels
from sklearn import metrics
print(metrics.accuracy_score(y,yPredLog)) # 96% correct with train accuracy

yPredKNN = knn.predict(X)
print(metrics.accuracy_score(y,yPredKNN)) # 100% correct with train accuracy

kn3n = KNeighborsClassifier(n_neighbors=3)
kn3n.fit(X,y) # fit the training data
yPredKN3N = kn3n.predict(X)
print(metrics.accuracy_score(y,yPredKN3N)) # 96% correct with train accuracy


0.96
1.0
0.96

In [10]:
# evaluation procedure #2: train/test split 
### like minute 13 of https://www.youtube.com/watch?v=0pP4EwWJgIU&index=5&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=.4)
x_train # 60% in train set (no general rule to choose these values)

# data is split in random way, so this is essentially cross-validation, but... not quite... because I think CV uses one tuple
# at a time as the test set?  need to check that out
print(x_train.shape)
print(x_test.shape)


(90, 4)
(60, 4)

In [11]:
# fit train data
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
yPred_logCVish = logreg.predict(x_test)
print(metrics.accuracy_score(y_test,yPred_logCVish)) #


0.9

In [12]:
###### evaluation procedure #3: CV (repeated train/test split); https://www.youtube.com/watch?v=6dbrR-WymjI&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=7
# This is the method we'll be using for stage 4.

### Drawback of train/test procedure
# Accuracy changes a lot depending on which observations fall into test/train sets

### How CV overcomes
# tests all possible permutations of data split

### How to use
# k can be any number, but k=10 is generally recommended
# use stratified sampling
# - each response class should be represented with equal proportions in each of the K folds
# - scikit-learn implements this automatically

from sklearn.cross_validation import cross_val_score

# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)

# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())

# search for an optimal value of K for KNN
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)


[ 1.          0.93333333  1.          1.          0.86666667  0.93333333
  0.93333333  1.          1.          1.        ]
0.966666666667
[0.95999999999999996, 0.95333333333333337, 0.96666666666666656, 0.96666666666666656, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.96666666666666679, 0.96666666666666679, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.97333333333333338, 0.98000000000000009, 0.97333333333333338, 0.98000000000000009, 0.96666666666666656, 0.96666666666666656, 0.97333333333333338, 0.95999999999999996, 0.96666666666666656, 0.95999999999999996, 0.96666666666666656, 0.95333333333333337, 0.95333333333333337, 0.95333333333333337]

In [13]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')


Out[13]:
<matplotlib.text.Text at 0x10badb898>

In [14]:
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())


0.98
0.953333333333

In [ ]:
## USE THE THING BEFORE THIS FOR CLASSIFYING OUR DATA

In [26]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())

# decision tree


# random forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, max_depth=10)
print(cross_val_score(forest, X, y, cv=10, scoring='accuracy').mean())

# support vector machine
from sklearn.svm import LinearSVC
svm = LinearSVC()
print(cross_val_score(svm, X, y, cv=10, scoring='accuracy').mean())

# naive bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
print(cross_val_score(svm, X, y, cv=10, scoring='accuracy').mean())

# KNN
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())


0.953333333333
0.953333333333
0.966666666667
0.966666666667
0.98

In [48]:
from numpy import genfromtxt
#myData = genfromtxt('train.csv',delimiter=',')
#myData
df = pd.read_csv('train_reduced.csv', sep=',')
df2 = df.dropna()
df2.head()


Out[48]:
_id ltable_id rtable_id ltable_Unnamed: 0 ltable_name ltable_genus ltable_family ltable_ecology ltable_countries ltable_threat_paragraph ... rtable_genus rtable_size rtable_threats rtable_conservation rtable_threat_keywords rtable_conservation_keywords rtable_tCount rtable_cCount rtable_block label
12 8954717 2852 1131 2852 umbeluzi cycad Encephalartos Zamiaceae Terrestrial Mozambique; Swaziland; \r\r\n Illegal removal of plants from the wild has been recorded, even within reserves.\r\r\n... ... Encephalartos Stem height: up to 4.5 m (2) The few remaining wild specimens of <em>E. latifrons</em> are so widely scattered that seed-prod... Although E. latifrons is scarce in the wild numerous specimens are known to occur in public and ... loss;environment; cites; 2 1 yes 0
13 9100153 5818 1149 5818 delacours langur Trachypithecus Cercopithecidae Terrestrial Viet Nam; \r\r\n Hunting for the purposes of traditional "medicine" is the primary threat facing this s... ... Trachypithecus Male head-to-body length: 57 â?? 62 cm (2)Female head-to-body length: 57 â?? 59 cm (2)Mail tail ... With as few as 270 to 300 estimated individuals remaining in 19 isolated populations and 14 of t... Four areas where Delacourâ??s langurs are protected include: Cuc Phuong National Park Pu Luong N... loss;hunting;environment; protected; 3 1 yes 1
16 10724422 7512 1354 7512 giant ditch frog, mountain chicken Leptodactylus Leptodactylidae Terrestrial Guadeloupe; Martinique; Saint Kitts and Nevis; Dominica; Montserrat; \r\r\n The species is consumed by humans and is prized for its meat (both subsistence and com... ... Leptodactylus c. 700 g (3) An unfortunate victim of hunting disease natural disasters and habitat loss the mountain chicken... Hunting of the mountain chicken was banned on Dominica in the late 1990s although a three month ... loss;hunting;disease; captive breeding; 3 1 yes 1
22 8714228 7728 1100 7728 cave catfish Clarias Clariidae Freshwater Namibia; \r\r\n Depletion of ground water; the cave lake has been used as a water supply in an otherwi... ... Clarias Length: 16.1 cm (2) The cave catfish is found only within a single cave in Namibia and is therefore at inherent risk... The protection of the Aigamas Cave is vital in the conservation of this species and the present ... 0 0 yes 1
23 9647999 7529 1218 7529 blackeared golden mantella, blackeared mantella Mantella Mantellidae Terrestrial; Freshwater Madagascar; \r\r\n The area where this species occurs is severely threatened, with its forest habitat rec... ... Mantella Length: 20-26 mm (2) The golden frog is very popular in the pet-trade and over-collection of individuals is still car... All mantella frogs are listed on Appendix II of the Convention on International Trade in Endange... pet; cites;captive breeding; 1 2 yes 0

5 rows × 34 columns


In [ ]:


In [1]:



/Users/tammi/Desktop/CS 638 Stuffs/cs638project/learning

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
#BELOW THIS IS SHIT

In [ ]:
# Create a set of ML-matchers
dt = mg.DTMatcher(name='DecisionTree', random_state=0)
svm = mg.SVMMatcher(name='SVM')
rf = mg.RFMatcher(name='RF', random_state=0)
nb = mg.NBMatcher(name='NB')
lg = mg.LogRegMatcher(name='LogReg')
ln = mg.LinRegMatcher(name='LinReg')

# select the best ML matcher using CV
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold', metric='precision', random_state=0)

In [ ]:


In [ ]:
### BEGIN PREVIOUS BLOCKING CODE (just have this here for grabbing code segments)
#Read in Redlist

import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')

# Read in csv as dataframe
rl = em.read_csv_metadata("finalRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
rl.head()

In [ ]:
# CME
em.get_key(rl) # get key

# Note: The id column of the  redlist csv doesn't actually look like the object created here (rl).  In the csv,
# id numbers jump around since they represent original row number of the original scraped data csv.  Apparently
# this library can detect what we want and enumerate things in order if you give it a key feature like this.

In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
ar = em.read_csv_metadata("finalArkive.csv", encoding="ISO-8859-1", key="id")

# glance at first few rows
ar.head()

In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    genus = str(row['common_name'])

    pstart = genus.find("(")
    if pstart !=-1:        
        # extract full common/nick name
        noparens = genus[0:pstart]
        ar.loc[index, 'common_name'] = noparens
    else:
        error = error + 1

print(error)

In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom

# remove parenthesis contents
nameColumn = []
for index, row in ar.iterrows():
    nickname = str(row['scientific_name']) #look here if errors start happening
    nickname = re.sub('[^A-Za-z0-9;,\s]+', '', nickname).lower()
    common = str(row['common_name'])
    common = re.sub('[^A-Za-z0-9;,\s]+', '', common).lower()
    if common in nickname: 
        nameColumn.append(nickname)
    else:
        nameColumn.append((nickname + "; " + common))
ar['name'] = nameColumn
ar.head()

In [ ]:
# Lowercase redlist name field
# author: @andrewedstrom
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
for index, row in rl.iterrows():
    n = str(row['name'])
    n = re.sub('[^A-Za-z0-9;,\s]+', '', n).lower()

    rl.loc[index, 'name'] = n.lower()

rl.head()

In [ ]:


In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    genus = str(row['genus'])

    pstart = genus.find("(")
    if pstart !=-1:        
        # extract full common/nick name
        noparens = genus[0:pstart]
        ar.loc[index, 'genus'] = noparens
    else:
        error = error + 1

print(error)

In [ ]:
#rl.to_csv('finalRedlist.csv')
#ar.to_csv('finalArkive.csv')

In [ ]:
# Rule-based blocking over input tables
# first get features that can be used
feature_table = em.get_features_for_blocking(rl, ar)

In [ ]:
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']

In [ ]:


In [ ]:
# CME -- I'm realizing now that we may want to start with the attribute equivalence
# blocker for names and see where that leaves us.  On my TO-DO.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(rl, ar, 'name', 'name',
                    l_output_attrs=['name', 'genus'], 
                    r_output_attrs=['name', 'genus'])
C0

# lolwat this reveals zero matches?

In [ ]:
#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
ob = em.OverlapBlocker()
C0 = ob.block_tables(rl, ar,'name', 'name', 
                     l_output_attrs=['name', 'genus', 'family'], 
                     r_output_attrs=['name', 'genus', 'family'], overlap_size=2)
C0

In [ ]:
#Only keep candidate pairs which share genus
ab = em.AttrEquivalenceBlocker()
C1 = ob.block_candset(C0, 'genus', 'genus')
C1

#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
#C1 = ab.block_tables(rl, ar,'genus', 'genus', 
#                     l_output_attrs=['name', 'genus'], 
#                     r_output_attrs=['name', 'genus'])

In [ ]:
C2 = equivB.block_tables(rl, ar, 'family', 'family',
                    l_output_attrs=['name', 'genus' ,'family'], 
                    r_output_attrs=['name', 'genus', 'family'])

C3 = ob.block_candset(C2, 'genus', 'genus')
C3

In [ ]:
C = em.combine_blocker_outputs_via_union([C1, C3])
C

In [ ]:
em.to_csv_metadata(C, './candidate_set.csv')

In [ ]:
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)

In [ ]:
C

In [ ]: