In [1]:
# retrieve dictionary
# Download the SentiWS archive into ./senti_ws unless it is already there.
import os

data_dir = 'senti_ws'
archive = 'senti_ws/SentiWS_v1.8c.zip'

# Create the target directory only when missing, so re-running the cell is quiet.
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)

# Check the path we actually download to; checking the bare file name in the
# working directory would trigger a fresh download on every run.
if not os.path.isfile(archive):
    status = os.system('wget -O ' + archive + ' http://wortschatz.informatik.uni-leipzig.de/download/SentiWS_v1.8c.zip')
    # A failed `wget -O` may leave an empty or partial file behind; remove it
    # so the check below does not report a broken download as a success.
    if status != 0 and os.path.isfile(archive):
        os.remove(archive)

if os.path.isfile(archive):
    print('Dictionary successfully retrieved.')
else:
    print('Dictionary not retrieved!')
In [2]:
# unzip
# Extract the archive into ./senti_ws. `-o` overwrites existing files without
# asking, so re-running this cell does not stop at unzip's "replace?" prompt.
unzip_status = os.system('unzip -o ' + archive + ' -d senti_ws')
if unzip_status != 0:
    # os.system only returns a status code; surface failures instead of
    # letting the later read_csv calls fail with a less obvious error.
    print('unzip failed with exit status %s' % unzip_status)

# Paths of the two word lists contained in the archive.
positive_file = 'senti_ws/SentiWS_v1.8c_Positive.txt'
negative_file = 'senti_ws/SentiWS_v1.8c_Negative.txt'
In [3]:
# Load the positive word list into a DataFrame.
# The file is read as tab-separated with explicitly supplied column names;
# reset_index() then exposes the row number as a regular 'index' column.
import pandas as pd

pos = pd.read_csv(
    positive_file,
    sep='\t',
    names=['base', 'score', 'examples'],
).reset_index()
pos.head()
Out[3]:
In [4]:
# Split the comma-separated 'examples' string into one column per entry
# (columns 0, 1, 2, ...) and expose the row number as an 'index' column.
words = pos['examples'].str.split(',', expand=True).reset_index()
In [5]:
# positive words
# Reshape to long format: one row per word (the base entry plus every entry
# from 'examples'), each carrying the score of the line it came from.
m = pos.merge(words)  # joins on the only shared column, 'index'
m = m.set_index(['index', 'score'])
m = m.drop(['examples'], axis=1)  # keyword axis: positional form is rejected by newer pandas
# stack() turns the 'base', 0, 1, ... columns into rows; the explicit dropna()
# keeps the result independent of how stack() treats the missing cells that
# the expand=True split created for lines with few examples.
m = m.stack().dropna().to_frame().reset_index()
m = m.drop(['index', 'level_2'], axis=1)
m = m.rename(columns={0: 'word'})
# Keep only the part before the first '|' (vectorised instead of building a
# pd.Series per row).
m['word'] = m['word'].str.split('|').str[0]
positives = m
In [6]:
# negative words
# Same reshaping as for the positive list, but with its own variable names so
# that `pos`, `words` and `m` from the positive cells are not overwritten
# (re-running those cells afterwards would otherwise operate on negative data).
neg = pd.read_csv(negative_file, sep='\t', names=['base', 'score', 'examples'])
neg = neg.reset_index()

# One column per comma-separated entry of 'examples', plus an 'index' column.
neg_words = neg['examples'].str.split(',', expand=True).reset_index()

neg_long = (
    neg.merge(neg_words)                    # joins on the shared 'index' column
    .set_index(['index', 'score'])
    .drop(['examples'], axis=1)             # keyword axis: positional form is rejected by newer pandas
    .stack()                                # 'base', 0, 1, ... columns -> rows
    .dropna()                               # do not depend on stack()'s handling of missing cells
    .to_frame()
    .reset_index()
    .drop(['index', 'level_2'], axis=1)
    .rename(columns={0: 'word'})
)
# Keep only the part before the first '|' (vectorised instead of building a
# pd.Series per row).
neg_long['word'] = neg_long['word'].str.split('|').str[0]
negatives = neg_long
In [78]:
# combine positives and negatives
# pd.concat replaces DataFrame.append, which is no longer available in
# current pandas. The row labels of both inputs are kept as they are, exactly
# as append did, so downstream cells see the same frame.
dictionary = pd.concat([positives, negatives])
In [79]:
# average redundant words ('doubles')
# Collapse every word to a single row holding the mean of its scores. This
# covers words occurring any number of times (not only exactly twice) and does
# not depend on the frame's row labels or on how value_counts() names its
# columns. sort=False keeps words in order of first appearance.
dictionary = (
    dictionary
    .groupby('word', sort=False)['score']
    .mean()
    .reset_index()
)[['score', 'word']]  # restore the original column order: score, word
In [81]:
# DataFrame to SQLite
# Write the dictionary to table 'dict' of a local SQLite file, replacing the
# table if it already exists.
import sqlite3 as lite

db_filename = 'dictionary_german.sqlite'
con = lite.connect(db_filename)
try:
    # No `flavor` argument: current pandas no longer accepts it, and a plain
    # sqlite3 connection is handled as SQLite without it.
    dictionary.to_sql('dict', con, if_exists='replace')
finally:
    # Close the connection even when the write fails.
    con.close()