In [24]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [25]:
import nltk
import sklearn
import sklearn_crfsuite
In [31]:
def parse_file(filename):
    """Read a CoNLL-style file (one 'tag<TAB>token' pair per line,
    blank line between sentences) into a list of [(token, tag), ...] sentences."""
    sentences = []
    s = []
    with open(filename, 'r') as f:
        for line in f:
            if line.strip():
                tag, token = line.strip().split('\t')
                s.append((token, tag))
            else:
                sentences.append(s)
                s = []
    if s:  # keep the last sentence if the file does not end with a blank line
        sentences.append(s)
    return sentences
In [92]:
train_sents = parse_file("pos_train.conll")
test_sents = parse_file("pos_test.conll")
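A quick sanity check on the parsed data; the exact output depends on the contents of pos_train.conll and pos_test.conll:
In [ ]:
# Illustrative check: corpus sizes and the first few (token, tag) pairs.
print(len(train_sents), "training sentences,", len(test_sents), "test sentences")
print(train_sents[0][:5])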
In [120]:
%%time
import re
def is_url(s):
    # John Gruber's URL-matching pattern:
    # https://gist.github.com/gruber/249502#gistcomment-6465
    return bool(re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s))
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
def word2features(sent, i):
    """Feature dictionary for the i-th token of a sentence.
    Only the token itself is used; the gold tag is never read here."""
    word = sent[i][0]
    features = {
        'word.lower()': word.lower(),
        # character suffixes of length 1-4
        'word[-1:]': word.lower()[-1:],
        'word[-2:]': word.lower()[-2:],
        'word[-3:]': word.lower()[-3:],
        'word[-4:]': word.lower()[-4:],
        # orthographic shape
        'word.istitle()': word.istitle(),
        'word.isupper()': word.isupper(),
        'number': is_number(word),
        # Twitter-specific markers
        'mention': word.startswith('@') and len(word) > 1,
        'hashtag': word.startswith('#') and len(word) > 1,
        'url': is_url(word),
    }
    return features
def get_features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def get_tags(sent):
    return [tag for token, tag in sent]

def get_tokens(sent):
    return [token for token, tag in sent]
X_train = [get_features(s) for s in train_sents]
y_train = [get_tags(s) for s in train_sents]
X_test = [get_features(s) for s in test_sents]
y_test = [get_tags(s) for s in test_sents]
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
)
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)
correct = 0
total = 0
for st, sp in zip(y_test, y_pred):
    for tt, tp in zip(st, sp):
        if tt == tp:
            correct += 1
        total += 1
print("token accuracy:", correct / total)
In [ ]: