In [1]:
import pandas as pd
import nltk
from import parse, split
In [2]:
# Load the questions sheet from the overview workbook.
# The sheet is given positionally: the keyword was renamed from `sheetname` to
# `sheet_name` between pandas releases, while the second positional argument
# selects the sheet under either spelling.
data_vraag = pd.read_excel('Overzicht-vragen-en-clusters-150721.xlsx', 'vragen met toelichting')
In [3]:
# Clean genders: copy the raw 'Man of vrouw' column into 'Gender', mapping both
# missing values and the combined 'man, vrouw' label to 'other'.
data_vraag['Gender'] = (
    data_vraag['Man of vrouw']
    .fillna('other')
    .replace('man, vrouw', 'other')
)
In [4]:
# Append title and text into a single column, separated by one space.
# Missing titles or bodies are replaced by empty strings first; otherwise a single
# absent cell would turn the whole concatenation into NaN (a float, not text).
data_vraag['FullText'] = data_vraag['Titel'].fillna('') + ' ' + data_vraag['Inhoud'].fillna('')
In [5]:
# Stopword list used to filter the cleaned tokens.
stopwords = (
    # NLTK's built-in lists for both languages
    nltk.corpus.stopwords.words('dutch')
    + nltk.corpus.stopwords.words('english')
    # empty token, punctuation tokens and the pronouns 'we' / 'wij'
    + ['','.',',','?','(',')',',',':',"'",u'``',u"''",';','-','!','%','&','...','=','we','wij']
    # typographic symbols: curly quotes, dashes, bullet, a private-use glyph,
    # euro sign and horizontal ellipsis
    + [u'\u2019',u'\u2018',u'\u2013',u'\u2022',u'\u2014',u'\uf02d',u'\u20ac',u'\u2026']
)
In [6]:
# Part-of-speech tags (Penn Treebank names) whose words are kept when cleaning text.
# Tags that are commented out are deliberately excluded; they are left in place to
# document which categories were considered.
keepTags = [
    # None, u'(', u')', u',', u'.', u'<notranslation>damesbladen',  # extras
    # u'CC',   # coordinating conjunction
    # u'CD',   # cardinal (numbers)
    # u'DT',   # determiner (de, het)
    u'FW',     # foreign word
    # u'IN',   # preposition / subordinating conjunction
    u'JJ',     # adjectives -- # u'JJR', u'JJS',
    # u'MD',   # modal verb
    u'NN', u'NNP', u'NNPS', u'NNS',  # nouns
    # u'PRP',  # pronouns -- # u'PRP$',
    u'RB',     # adverb
    u'RP',     # particle
    # u'SYM',  # symbol
    # u'TO',   # infinitival to
    # u'UH',   # interjection
    u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ',  # verb forms
]
In [7]:
def cleanup(sentText):
    """Reduce a piece of text to a list of content words.

    The text is parsed and split into sentences with the notebook's `parse` and
    `split` helpers (NOTE(review): the import line for these is incomplete in the
    notebook -- confirm which module they come from). Every word whose
    part-of-speech tag is listed in `keepTags` is collected, and words found in
    `stopwords` are then dropped.

    Words are lower-cased before the stopword check because the stopword entries
    are lower case; without this, capitalised stopwords would slip through.

    Parameters
    ----------
    sentText : the text to clean (may contain several sentences).

    Returns
    -------
    list
        The kept words, lower-cased, in their original order (duplicates retained).
    """
    words = []
    for s in split(parse(sentText)):
        for word, tag in s.tagged:
            if tag in keepTags:
                # Collect the word itself -- without this the result is always empty.
                words.append(word.lower())
    return [w for w in words if w not in stopwords]
In [8]:
# Run every question's full text through `cleanup` and store the resulting
# word list alongside it.
full_text_column = data_vraag['FullText']
data_vraag['SentToks'] = full_text_column.apply(cleanup)
In [10]:
In [10]:
# Pick the row labelled 0 as a worked example and clean its text.
origSent = data_vraag.loc[0, 'FullText']
cleanSent = cleanup(origSent)
In [14]:
# Show the example text before and after cleaning.
# Each print call takes a single parenthesised argument, which prints the same
# text under the Python 2 print statement and the Python 3 print function.
print('Cleanup process transforms a sentence from this format:')
print('=======================================================')
print(origSent)
print('=======================================================')
print('to a bag-of-words like this:')
print('=======================================================')
print(cleanSent)