In [24]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [25]:
import nltk
import sklearn
import sklearn_crfsuite

In [31]:
def parse_file(filename):
    f = open(filename, 'r')
    raw = f.readlines()
    sentences = []
    s = []
    for line in raw:
        if line.strip():
            tag, token = line.strip().split('\t')
            s.append((token, tag))
        else:
            sentences.append(s)
            s = []
    return sentences

In [92]:
train_sents = parse_file("pos_train.conll")
test_sents = parse_file("pos_test.conll")

In [120]:
%%time

import re
def is_url(s):
    # https://gist.github.com/gruber/249502#gistcomment-6465
    if re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s):
        return True
    else: 
        return False

    
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

    
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'word[-4:]': word.lower()[-4:],
        'mention': word.startswith('@') and len(word) > 1,
        'hashtag': word.startswith('#') and len(word) > 1,
        'url': is_url(word),
        'word.lower()': word.lower(),
        'number': is_number(word),
        'word[-3:]': word.lower()[-3:],
        'word[-2:]': word.lower()[-2:],
        'word[-1:]': word.lower()[-1:],
        'word.istitle()': word.istitle(),
        'word.isupper()': word.isupper(),      
    }
                
    return features


def get_features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def get_tags(sent):
    return [tag for token, tag in sent]

def get_tokens(sent):
    return [token for token, tag in sent]


X_train = [get_features(s) for s in train_sents]
y_train = [get_tags(s) for s in train_sents]

X_test = [get_features(s) for s in test_sents]
y_test = [get_tags(s) for s in test_sents]


crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1,
    c2=0.1,
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)
same = 0
sum = 0
for st, sp in zip(y_test, y_pred):
    for tt, tp in zip(st, sp):
        if tt == tp:
            same += 1
        sum += 1
print("perc: ", same/sum)


perc:  0.8537020517395183
CPU times: user 9.13 s, sys: 4 ms, total: 9.13 s
Wall time: 9.15 s

In [ ]: