notebook.community

Edit and run



In [1]:

    
# retrieve dictionary
# Downloads the SentiWS v1.8c archive into ./senti_ws unless it is already there,
# then reports whether the archive is available.
import os

# Pure-Python directory creation: silent and harmless when the directory
# already exists, so the cell can be re-run.
os.makedirs('senti_ws', exist_ok=True)

archive = 'senti_ws/SentiWS_v1.8c.zip'

# Check the same path that wget writes to; otherwise the guard never matches
# and the archive is downloaded again on every run.
# `wget -O` creates the output file even if the download fails, so an empty
# file is treated as "not downloaded yet".
if not (os.path.isfile(archive) and os.path.getsize(archive) > 0):
    os.system('wget -O senti_ws/SentiWS_v1.8c.zip http://wortschatz.informatik.uni-leipzig.de/download/SentiWS_v1.8c.zip')

if os.path.isfile(archive) and os.path.getsize(archive) > 0:
    print( 'Dictionary successfully retrieved.' )
else:
    print( 'Dictionary not retrieved!' )









    



Dictionary successfully retrieved.



In [2]:

    
# unzip
# Extract the downloaded archive into ./senti_ws and record the paths of the
# two word lists used by the following cells.
import zipfile

# Standard-library extraction instead of shelling out to `unzip`: no external
# tool needed, no overwrite prompt when the cell is re-run, and a missing or
# corrupt archive raises here instead of failing silently.
with zipfile.ZipFile(archive) as zf:
    zf.extractall('senti_ws')

positive_file = 'senti_ws/SentiWS_v1.8c_Positive.txt'
negative_file = 'senti_ws/SentiWS_v1.8c_Negative.txt'



In [3]:

    
# Load the positive word list into a DataFrame.
import pandas as pd

# The file is tab-separated without a header row. Columns:
#   base     - base form with a '|'-separated tag (e.g. 'Abmachung|NN')
#   score    - sentiment weight
#   examples - comma-separated inflected forms (may be missing)
# reset_index() exposes the row number as an 'index' column.
pos = (
    pd.read_csv(positive_file, sep='\t', names=['base', 'score', 'examples'])
      .reset_index()
)
pos.head()









    Out[3]:






  
    
      
      index
      base
      score
      examples
    
  
  
    
      0
      0
      Abmachung|NN
      0.004
      Abmachungen
    
    
      1
      1
      Abschluß|NN
      0.004
      Abschlüße,Abschlußs,Abschlußes,Abschlüßen
    
    
      2
      2
      Abstimmung|NN
      0.004
      Abstimmungen
    
    
      3
      3
      Agilität|NN
      0.004
      NaN
    
    
      4
      4
      Aktivität|NN
      0.004
      Aktivitäten



In [4]:

    
# Spread the comma-separated inflected forms over one column per form
# (columns 0, 1, 2, ...) and expose the row number as an 'index' column.
words = pos['examples'].str.split(',', expand=True).reset_index()



In [5]:

    
# positive word
# Build a long table with one row per word form (base form and every inflected
# form) carrying the score of its dictionary entry. Result columns: score, word.
m = pos.merge(words, on='index')          # attach the split-out inflections to each entry
m = m.set_index(['index', 'score'])
m = m.drop(columns=['examples'])          # raw comma-separated string is no longer needed
# stack() turns the 'base' column and the inflection columns into rows; the
# explicit dropna() removes the padding cells of entries with fewer inflections
# without relying on stack()'s own NaN handling, which varies across pandas versions.
m = m.stack().dropna().to_frame().reset_index()
m = m.drop(columns=['index', 'level_2'])  # row number and source-column label
m = m.rename(columns={0: 'word'})
# Base forms look like 'Abmachung|NN'; keep only the part before the first '|'.
m['word'] = m['word'].str.split('|').str[0]
positives = m
#positives.head()



In [6]:

    
# negative words
# Same processing as for the positive list, applied to the negative file.
# Note: `pos`, `words` and `m` are reused here and now hold the negative data.
pos = pd.read_csv(negative_file, sep='\t', names=['base','score','examples'])
pos = pos.reset_index()
# one column per comma-separated inflected form
words = pos['examples'].str.split(',', expand=True)
words = words.reset_index()
m = pos.merge(words, on='index')          # attach the split-out inflections to each entry
m = m.set_index(['index', 'score'])
m = m.drop(columns=['examples'])          # raw comma-separated string is no longer needed
# stack() turns the 'base' column and the inflection columns into rows; the
# explicit dropna() removes the padding cells of entries with fewer inflections
# without relying on stack()'s own NaN handling, which varies across pandas versions.
m = m.stack().dropna().to_frame().reset_index()
m = m.drop(columns=['index', 'level_2'])  # row number and source-column label
m = m.rename(columns={0: 'word'})
# Base forms carry a '|'-separated tag; keep only the part before the first '|'.
m['word'] = m['word'].str.split('|').str[0]
negatives = m
#negatives.head()



In [78]:

    
# combine positives and negatives
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The index is intentionally not reset, so the row labels are the same as
# those the former append() call produced.
dictionary = pd.concat([positives, negatives])



In [79]:

    
# average redundant words ('doubles')
# Collapse every word that appears more than once (for instance a form listed
# under several entries) into a single row holding the mean of its scores.
# Words that appear once keep their score, since the mean of one value is the
# value itself. sort=False keeps words in order of first appearance; the final
# column selection restores the (score, word) column order.
dictionary = (
    dictionary
    .groupby('word', sort=False, as_index=False)['score']
    .mean()
    [['score', 'word']]
)



In [81]:

    
# DataFrame to SQLite
# Write the combined dictionary to table 'dict' of a local SQLite file,
# replacing the table if it already exists.
import sqlite3 as lite

db_filename = 'dictionary_german.sqlite'
con = lite.connect(db_filename)
try:
    # A plain sqlite3 connection is accepted directly by to_sql; the old
    # `flavor` argument no longer exists in pandas and must not be passed.
    dictionary.to_sql('dict', con, if_exists='replace')
finally:
    # Close the connection even if the write fails.
    con.close()

	index	base	score	examples
0	0	Abmachung\|NN	0.004	Abmachungen
1	1	Abschluß\|NN	0.004	Abschlüße,Abschlußs,Abschlußes,Abschlüßen
2	2	Abstimmung\|NN	0.004	Abstimmungen
3	3	Agilität\|NN	0.004	NaN
4	4	Aktivität\|NN	0.004	Aktivitäten