In [16]:
import os
print(os.getcwd())
os.chdir('../blocking/')
In [17]:
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
In [3]:
# video series tutorial: https://www.youtube.com/watch?v=RlQuVL6-qe8&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=4
from sklearn.datasets import load_iris
iris = load_iris()
type(iris)
#print(iris.data) # each row is a flower, each column a feature
#print(iris.feature_names)
print(iris.target) # lists the label
# features should be numeric, as well as label
# should be in NumPy array
# features and response should have specific shapes
# observations by features
print(iris.data.shape)
# same length as the first dimension above (one response for each observation)
print(iris.target.shape)
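In [ ]:
# Quick sanity check of the requirements above: features and labels should be
# numeric NumPy arrays whose first dimensions agree.
assert isinstance(iris.data, np.ndarray) and isinstance(iris.target, np.ndarray)
assert np.issubdtype(iris.data.dtype, np.floating) # numeric features
assert iris.data.shape[0] == iris.target.shape[0] # one label per observation
print("input checks passed")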
In [4]:
X = iris.data
y = iris.target
In [5]:
# Begin machine learning
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
In [6]:
#print(knn)
knn.fit(X,y) # fit the training data
knn.predict([[3,5,4,2]]) # predict label of a new tuple (note: scikit-learn expects a 2D array)
Out[6]:
In [7]:
X_new = [[3,5,2,4],[3,6,2,3]] # new test data
knn.predict(X_new)
Out[7]:
In [8]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
# fit the model with the data
logreg.fit(X,y)
logreg.predict(X_new)
Out[8]:
In [9]:
# COMPARING MODELS
# evaluation procedure #1: train and test on the entire dataset (hint: don't do this)
yPredLog = logreg.predict(X)
len(yPredLog)
# compare predicted labels with true labels
from sklearn import metrics
print(metrics.accuracy_score(y,yPredLog)) # ~96% training accuracy
yPredKNN = knn.predict(X)
print(metrics.accuracy_score(y,yPredKNN)) # 100% training accuracy (k=1 memorizes the training data)
kn3n = KNeighborsClassifier(n_neighbors=3)
kn3n.fit(X,y) # fit the training data
yPredKN3N = kn3n.predict(X)
print(metrics.accuracy_score(y,yPredKN3N)) # ~96% training accuracy
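In [ ]:
# Why k=1 scores 100% on its own training data: each training point's nearest
# neighbor in X is itself (distance 0), so predict() just returns the stored label.
# A minimal check of that claim:
dist, ind = knn.kneighbors(X, n_neighbors=1)
print((dist == 0).all()) # every training point sits at distance 0 from its nearest neighbor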
In [10]:
# evaluation procedure #2: train/test split
### like minute 13 of https://www.youtube.com/watch?v=0pP4EwWJgIU&index=5&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A
from sklearn.model_selection import train_test_split # sklearn.cross_validation is deprecated/removed in newer versions
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=.4)
x_train # 60% in train set (no general rule to choose these values)
# the split is random, so repeating it gives different answers; this is close in spirit to
# cross-validation, which systematically rotates the held-out fold (leave-one-out CV is the
# extreme case that holds out one tuple at a time)
print(x_train.shape)
print(x_test.shape)
In [11]:
# fit train data
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
yPred_logCVish = logreg.predict(x_test)
print(metrics.accuracy_score(y_test,yPred_logCVish))
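In [ ]:
# The split is random, so the accuracy above moves around from run to run; a small
# sketch of that variance using different seeds:
for seed in range(5):
    x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=.4, random_state=seed)
    acc = metrics.accuracy_score(y_te, LogisticRegression().fit(x_tr, y_tr).predict(x_te))
    print(seed, acc)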
In [12]:
###### evaluation procedure #3: CV (repeated train/test split); https://www.youtube.com/watch?v=6dbrR-WymjI&list=PL5-da3qGB5ICeMbQuqbbCOQWcS6OYBr5A&index=7
# This is the method we'll be using for stage 4.
### Drawback of train/test procedure
# Accuracy changes a lot depending on which observations fall into test/train sets
### How CV overcomes
# splits the data into K folds and rotates which fold is held out, so every observation gets tested exactly once
### How to use
# k can be any number, but k=10 is generally recommended
# use stratified sampling
# - each response class should be represented with equal proportions in each of the K folds
# - scikit-learn implements this automatically
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())
# search for an optimal value of K for KNN
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)
In [13]:
import matplotlib.pyplot as plt
%matplotlib inline
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
Out[13]:
In [14]:
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
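In [ ]:
# The manual K search above can also be done with GridSearchCV, which runs the same
# 10-fold CV over a parameter grid and tracks the best setting; a minimal sketch:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': list(range(1, 31))},
                    cv=10, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)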
In [ ]:
## USE THE CROSS-VALIDATION WORKFLOW ABOVE FOR CLASSIFYING OUR DATA
In [26]:
# logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
# decision tree
from sklearn.tree import DecisionTreeClassifier
print(cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='accuracy').mean())
# random forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, max_depth=10)
print(cross_val_score(forest, X, y, cv=10, scoring='accuracy').mean())
# support vector machine
from sklearn.svm import LinearSVC
svm = LinearSVC()
print(cross_val_score(svm, X, y, cv=10, scoring='accuracy').mean())
# naive bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
print(cross_val_score(gnb, X, y, cv=10, scoring='accuracy').mean())
# KNN
knn = KNeighborsClassifier(n_neighbors=20)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
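In [ ]:
# Same comparison as above in one loop, also printing the standard deviation across
# folds, since two means within a standard deviation of each other are hard to rank:
models = [('logreg', LogisticRegression()),
          ('forest', RandomForestClassifier(n_estimators=10, max_depth=10)),
          ('svm', LinearSVC()),
          ('nb', GaussianNB()),
          ('knn', KNeighborsClassifier(n_neighbors=20))]
for name, model in models:
    s = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(name, s.mean(), s.std())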
In [48]:
from numpy import genfromtxt
#myData = genfromtxt('train.csv',delimiter=',')
#myData
df = pd.read_csv('train_reduced.csv', sep=',')
df2 = df.dropna()
df2.head()
Out[48]:
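In [ ]:
# Sketch of plugging df2 into the CV workflow above. 'label' is a placeholder for
# whatever the gold column in train_reduced.csv is actually called, so the block is
# guarded rather than assumed to run as-is.
label_col = 'label' # placeholder: replace with the real label column name
if label_col in df2.columns:
    X_ours = df2.drop(label_col, axis=1).select_dtypes(include=[np.number]).values
    y_ours = df2[label_col].values
    print(cross_val_score(LogisticRegression(), X_ours, y_ours, cv=10, scoring='accuracy').mean())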
In [ ]:
# BELOW THIS IS SCRATCH CODE (kept only for grabbing segments)
In [ ]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RF', random_state=0)
nb = em.NBMatcher(name='NB')
lg = em.LogRegMatcher(name='LogReg')
ln = em.LinRegMatcher(name='LinReg')
# select the best ML matcher using CV (note: K, the labeled feature-vector table, is created elsewhere)
result = em.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
                           exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                           target_attr='gold', metric='precision', random_state=0)
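In [ ]:
# Rough sketch of the step after select_matcher: train the winning matcher (say rf)
# on a train split of K and score it on the held-out split. Assumes K carries the
# same '_id'/'ltable.id'/'rtable.id'/'gold' columns used above.
IJ = em.split_train_test(K, train_proportion=0.7, random_state=0)
I, J = IJ['train'], IJ['test']
rf.fit(table=I, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], target_attr='gold')
P = rf.predict(table=J, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
               target_attr='predicted', append=True, inplace=False)
eval_result = em.eval_matches(P, 'gold', 'predicted')
em.print_eval_summary(eval_result)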
In [ ]:
### BEGIN PREVIOUS BLOCKING CODE (kept here just for grabbing code segments)
#Read in Redlist
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')
# Read in csv as dataframe
rl = em.read_csv_metadata("finalRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
rl.head()
In [ ]:
# CME
em.get_key(rl) # get key
# Note: the id column of the redlist csv doesn't match what appears in the dataframe (rl):
# in the csv, id values jump around because they are row numbers from the original scraped
# data csv. Apparently the library can take such a column as the key and enumerate rows in order.
In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
ar = em.read_csv_metadata("finalArkive.csv", encoding="ISO-8859-1", key="id")
# glance at first few rows
ar.head()
In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # count rows without a parenthetical, as a sanity check
# remove parenthesized contents from common_name
for index, row in ar.iterrows():
    name = str(row['common_name'])
    pstart = name.find("(")
    if pstart != -1:
        # keep only the text before the first parenthesis
        noparens = name[0:pstart]
        ar.loc[index, 'common_name'] = noparens
    else:
        error = error + 1
print(error)
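In [ ]:
# Vectorized equivalent of the loop above (a no-op at this point, since the loop
# already ran): keep only the text before the first '('.
ar['common_name'] = ar['common_name'].astype(str).str.split('(').str[0]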
In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom
# build a combined name field: normalized scientific_name, plus common_name when it adds anything new
nameColumn = []
for index, row in ar.iterrows():
    nickname = str(row['scientific_name']) # look here if errors start happening
    nickname = re.sub(r'[^A-Za-z0-9;,\s]+', '', nickname).lower()
    common = str(row['common_name'])
    common = re.sub(r'[^A-Za-z0-9;,\s]+', '', common).lower()
    if common in nickname:
        nameColumn.append(nickname)
    else:
        nameColumn.append(nickname + "; " + common)
ar['name'] = nameColumn
ar.head()
In [ ]:
# Lowercase redlist name field
# author: @andrewedstrom
# normalize: strip unusual characters and lowercase the name field
for index, row in rl.iterrows():
    n = str(row['name'])
    n = re.sub(r'[^A-Za-z0-9;,\s]+', '', n).lower()
    rl.loc[index, 'name'] = n
rl.head()
In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # count rows without a parenthetical, as a sanity check
# remove parenthesized contents from genus
for index, row in ar.iterrows():
    genus = str(row['genus'])
    pstart = genus.find("(")
    if pstart != -1:
        # keep only the text before the first parenthesis
        noparens = genus[0:pstart]
        ar.loc[index, 'genus'] = noparens
    else:
        error = error + 1
print(error)
In [ ]:
#rl.to_csv('finalRedlist.csv')
#ar.to_csv('finalArkive.csv')
In [ ]:
# Rule-based blocking over input tables
# first get features that can be used
feature_table = em.get_features_for_blocking(rl, ar)
In [ ]:
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']
In [ ]:
# CME -- I'm realizing now that we may want to start with the attribute equivalence
# blocker for names and see where that leaves us. On my TO-DO.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(rl, ar, 'name', 'name',
                         l_output_attrs=['name', 'genus'],
                         r_output_attrs=['name', 'genus'])
C0
# surprisingly, this yields zero candidate pairs -- exact equality on name is too strict
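In [ ]:
# Diagnosing the zero matches above: count how many cleaned names are exactly equal
# across the two tables. If this prints 0, equality blocking on name cannot work.
print(len(set(rl['name']) & set(ar['name'])))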
In [ ]:
# Do overlap blocking on name to get a candidate set of pairs
ob = em.OverlapBlocker()
C0 = ob.block_tables(rl, ar, 'name', 'name',
                     l_output_attrs=['name', 'genus', 'family'],
                     r_output_attrs=['name', 'genus', 'family'], overlap_size=2)
C0
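In [ ]:
# Worth checking that blocking isn't dropping true matches. py_entitymatching's
# debug_blocker reports likely-matching pairs that are NOT in the candidate set:
dbg = em.debug_blocker(C0, rl, ar, output_size=50)
dbg.head()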
In [ ]:
#Only keep candidate pairs which share genus
ab = em.AttrEquivalenceBlocker()
C1 = ab.block_candset(C0, 'genus', 'genus')
C1
#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
#C1 = ab.block_tables(rl, ar,'genus', 'genus',
# l_output_attrs=['name', 'genus'],
# r_output_attrs=['name', 'genus'])
In [ ]:
C2 = equivB.block_tables(rl, ar, 'family', 'family',
                         l_output_attrs=['name', 'genus', 'family'],
                         r_output_attrs=['name', 'genus', 'family'])
C3 = ab.block_candset(C2, 'genus', 'genus')
C3
In [ ]:
C = em.combine_blocker_outputs_via_union([C1, C3])
C
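In [ ]:
# Next step toward the matcher cells above: sample the unioned candidate set and
# label it by hand to create the 'gold' column, using the library's helpers.
S = em.sample_table(C, 450)
# G = em.label_table(S, label_column_name='gold') # opens an interactive labeling GUI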
In [ ]:
em.to_csv_metadata(C, './candidate_set.csv')
In [ ]:
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)
In [ ]:
C