In [1]:
import pandas as pd
import nltk
from pattern.nl import parse, split

In [2]:
# Load the 'vragen met toelichting' sheet of the Excel workbook into a DataFrame.
# NOTE(review): `sheetname` is the keyword accepted by older pandas releases;
# newer releases name it `sheet_name` -- confirm against the installed pandas.
data_vraag = pd.read_excel('Overzicht-vragen-en-clusters-150721.xlsx', sheetname='vragen met toelichting')

In [3]:
# Normalise gender: copy the 'Man of vrouw' column, then map both missing
# values and the combined 'man, vrouw' entry onto the single label 'other'.
data_vraag['Gender'] = (
    data_vraag['Man of vrouw']
    .fillna('other')
    .replace('man, vrouw', 'other')
)

In [4]:
# Append title and text, separated by a single space.
# A missing title or body is treated as an empty string: concatenating a string
# with NaN yields NaN, which would discard the part of the question that IS
# present and hand a non-string value to the tokeniser later on.
data_vraag['FullText'] = data_vraag['Titel'].fillna('') + ' ' + data_vraag['Inhoud'].fillna('')

In [5]:
# Stop list used to filter tokens: NLTK's Dutch and English stop words, plus
# punctuation tokens, the pronouns 'we'/'wij', and a handful of Unicode symbols
# (curly quotes, dashes, bullet, euro sign, ellipsis, one private-use glyph).
stopwords = nltk.corpus.stopwords.words('dutch')
stopwords.extend(nltk.corpus.stopwords.words('english'))
stopwords.extend([
    '', '.', ',', '?', '(', ')', ',', ':', "'", u'``', u"''", ';', '-',
    '!', '%', '&', '...', '=', 'we', 'wij',
])
stopwords.extend([
    u'\u2019', u'\u2018', u'\u2013', u'\u2022',
    u'\u2014', u'\uf02d', u'\u20ac', u'\u2026',
])

In [6]:
# Part-of-speech tags whose words cleanup() keeps; entries that are commented
# out are tags that get dropped. The labels look like Penn Treebank tags --
# TODO confirm against the tag set the pattern.nl parser emits.
keepTags = [
    # None, u'(', u')', u',', u'.', u'<notranslation>damesbladen', # extras
    # u'CC', # coordinating conjunction
    # u'CD', # cardinal (numbers)
    # u'DT', # determiner (de, het)
    u'FW', # foreign word
    # u'IN', # preposition / subordinating conjunction
    u'JJ', # adjectives -- # u'JJR', u'JJS',
    # u'MD', # Modal verb
    u'NN', u'NNP', u'NNPS', u'NNS', # Nouns
    # u'PRP', # Pronouns -- # u'PRP$',
    u'RB', # adverb
    u'RP', # particle (Penn Treebank lists RP as particle, not adverb)
    # u'SYM', # Symbol
    # u'TO', # infinitival to
    # u'UH', # interjection
    u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ' # Verb forms
]

In [7]:
def cleanup(sentText):
    """Turn a raw text into a list of lower-cased content words.

    The text is parsed and split into sentences with pattern.nl; a word is
    kept when its part-of-speech tag is listed in ``keepTags`` and its
    lower-cased form is not in ``stopwords``.

    Parameters
    ----------
    sentText : str or unicode
        Text to tokenise. A missing value (``None`` or NaN, as pandas passes
        for empty cells) is accepted and produces an empty result.

    Returns
    -------
    list
        Lower-cased words in their original order, duplicates preserved.
    """
    # Series.apply hands NaN to this function for empty cells; the parser only
    # understands strings, so treat "no text" as "no words" instead of crashing.
    if pd.isnull(sentText):
        return []
    return [
        word.lower()
        for sentence in split(parse(sentText))
        for word, tag in sentence.tagged
        if tag in keepTags and word.lower() not in stopwords
    ]

In [ ]:


In [8]:
# Tokenise every question: run cleanup() element-wise over the combined text
# and store the resulting word lists in a new 'SentToks' column.
full_texts = data_vraag['FullText']
data_vraag['SentToks'] = full_texts.map(cleanup)

In [10]:
# Write the DataFrame in its current state to 'preprocessedData.pkl'
# (path is relative to the working directory).
data_vraag.to_pickle('preprocessedData.pkl')

In [ ]:


In [10]:
# Take the first question as a worked example of the cleanup step.
# .iloc[0] selects by position; plain [0] is a label lookup and only means
# "first row" while the frame still has its default 0..n-1 index.
origSent = data_vraag['FullText'].iloc[0]
cleanSent = cleanup(origSent)

In [14]:
# Show the example sentence before and after cleanup.
# Every print call takes exactly one argument, so the output is the same under
# Python 2's print statement and Python 3's print function.
separator = '======================================================='
print('Cleanup process transforms a sentence from this format:')
print(separator)
print(origSent)
print(separator)
print('to a bag-of-words like this:')
print(separator)
print(cleanSent)


Cleanup process transforms a sentence from this format:
=======================================================
Hoe kan je, binnen de context van mondiale veranderingsprocessen, inclusieve ontwikkeling bevorderen? Is het opportuun om te blijven focussen op Nederland, in deze tijden van massale migratie, wereldwijde instabiliteit en economische onzekerheid? 2015 is uitgeroepen tot 'het Europees Jaar voor Ontwikkeling'. Hoe kan Nederland bijdragen aan inclusieve mondiale ontwikkeling? 
=======================================================
to a bag-of-words like this:
=======================================================
[u'context', u'mondiale', u'veranderingsprocessen', u'inclusieve', u'ontwikkeling', u'bevorderen', u'opportuun', u'focussen', u'nederland', u'tijden', u'massale', u'migratie', u'wereldwijde', u'instabiliteit', u'economische', u'onzekerheid', u'uitgeroepen', u'europees', u'jaar', u'ontwikkeling', u'nederland', u'bijdragen', u'inclusieve', u'mondiale', u'ontwikkeling']