In [1]:
# retrieve dictionary
import os
import urllib.request

# The archive path is defined once and used for the cache check, the download
# target and the final verification, so the three can never disagree.
sentiws_url = 'http://wortschatz.informatik.uni-leipzig.de/download/SentiWS_v1.8c.zip'
archive = 'senti_ws/SentiWS_v1.8c.zip'

# exist_ok makes re-running this cell harmless when the folder is already there.
os.makedirs('senti_ws', exist_ok=True)

if not os.path.isfile(archive):
    # Download to a temporary name and rename only on success, so a failed or
    # interrupted transfer can never be mistaken for a valid archive.
    partial = archive + '.part'
    try:
        urllib.request.urlretrieve(sentiws_url, partial)
        os.replace(partial, archive)
    except OSError as error:  # urllib's URLError/HTTPError derive from OSError
        print('Download failed:', error)
        if os.path.isfile(partial):
            os.remove(partial)

if os.path.isfile(archive):
    print( 'Dictionary successfully retrieved.' )
else:
    print( 'Dictionary not retrieved!' )


Dictionary successfully retrieved.

In [2]:
# unzip
import zipfile

# Extract with the standard library instead of shelling out to `unzip`:
# no external tool is needed, re-running simply overwrites the extracted
# files, and a missing or corrupt archive raises here instead of passing
# unnoticed.
with zipfile.ZipFile(archive) as bundle:
    bundle.extractall('senti_ws')

# Paths of the two word lists read by the following cells.
positive_file = 'senti_ws/SentiWS_v1.8c_Positive.txt'
negative_file = 'senti_ws/SentiWS_v1.8c_Negative.txt'

In [3]:
# Load the positive word list into a DataFrame.
import pandas as pd

# The file is tab-separated without a header row, so the three column names
# are supplied here. reset_index() exposes the row number as an 'index'
# column, which a later cell uses as the merge key.
pos = (
    pd.read_csv(positive_file, names=['base', 'score', 'examples'], sep='\t')
    .reset_index()
)
pos.head()


Out[3]:
index base score examples
0 0 Abmachung|NN 0.004 Abmachungen
1 1 Abschluß|NN 0.004 Abschlüße,Abschlußs,Abschlußes,Abschlüßen
2 2 Abstimmung|NN 0.004 Abstimmungen
3 3 Agilität|NN 0.004 NaN
4 4 Aktivität|NN 0.004 Aktivitäten

In [4]:
# Split each comma-separated `examples` string into one column per inflected
# form (shorter rows are padded with missing values), then expose the row
# number as an 'index' column so the result can be merged back onto `pos`.
words = pos['examples'].str.split(',', expand=True).reset_index()

In [5]:
# positive words
# Reshape the lexicon from one row per base form (with its inflections spread
# over several columns) into one row per word form, each carrying the score
# of the row it came from.
m = pos.merge(words, on='index')  # 'index' is the row number both frames share
m = m.set_index(['index', 'score'])
# `axis` is passed by keyword; newer pandas rejects it as a positional argument.
m = m.drop(['examples'], axis=1)
# The remaining columns are 'base' plus one column per inflection. stack()
# turns them into rows; the explicit dropna() discards the padding cells of
# rows with fewer inflections regardless of how stack() itself treats them.
m = m.stack().dropna().to_frame().reset_index()
m = m.drop(['index', 'level_2'], axis=1)
m = m.rename(columns={0: 'word'})
# Base forms look like 'Abmachung|NN'; keep only the text before the '|'.
m['word'] = m['word'].str.split('|').str[0]
positives = m

In [6]:
# negative words
# Same pipeline as for the positive list, applied to the negative file.
# NOTE: `pos`, `words` and `m` are rebound here and hold the negative data
# from this point on.
pos = pd.read_csv(negative_file, sep='\t', names=['base', 'score', 'examples'])
pos = pos.reset_index()
# One column per inflected form, plus the row number as an 'index' merge key.
words = pos['examples'].str.split(',', expand=True)
words = words.reset_index()
m = pos.merge(words, on='index')
m = m.set_index(['index', 'score'])
# `axis` is passed by keyword; newer pandas rejects it as a positional argument.
m = m.drop(['examples'], axis=1)
# stack() turns 'base' and the inflection columns into rows; the explicit
# dropna() discards the padding cells of rows with fewer inflections
# regardless of how stack() itself treats them.
m = m.stack().dropna().to_frame().reset_index()
m = m.drop(['index', 'level_2'], axis=1)
m = m.rename(columns={0: 'word'})
# Base forms look like 'Abmachung|NN'; keep only the text before the '|'.
m['word'] = m['word'].str.split('|').str[0]
negatives = m

In [78]:
# combine positives and negatives
# pd.concat is used because DataFrame.append is no longer available in newer
# pandas. Row labels of both inputs are carried over unchanged (so labels
# repeat), which is exactly what append produced.
dictionary = pd.concat([positives, negatives])

In [79]:
# average redundant words ('doubles')
# Collapse every word to a single row holding the mean of all its scores.
# Grouping handles any number of repetitions (not just exactly two), needs a
# single pass over the data, and does not depend on how value_counts() names
# its output. sort=False keeps words in order of first appearance; the column
# order (score, word) is restored explicitly and the result gets a fresh
# 0..n-1 row index.
dictionary = (
    dictionary
    .groupby('word', sort=False)['score']
    .mean()
    .reset_index()
    [['score', 'word']]
)

In [81]:
# DataFrame to SQLite
import sqlite3 as lite

db_filename = 'dictionary_german.sqlite'
con = lite.connect(db_filename)
try:
    # No `flavor` argument: pandas dropped that parameter, and a plain sqlite3
    # connection is written as SQLite without it. if_exists='replace' drops
    # and recreates the 'dict' table, so re-running the cell is safe.
    dictionary.to_sql('dict', con, if_exists='replace')
finally:
    # Release the database file even if the write fails.
    con.close()