In [5]:
import sqlite3
import pandas as pd
from scipy.stats import chi2_contingency

# Open one SQLite connection per database file, in this order: the one the
# topic distributions are read from, then the one the emotion vectors are
# read from.
cnx_lda, cnx_sentiment = map(
    sqlite3.connect,
    ("1_31_LDA.db", "2016-01_sentiments_annotated.db"),
)

In [6]:
# get topic distribution over stories
# The raw table is bound to a descriptive name rather than `_`: in
# IPython/Jupyter `_` is the "last output" variable, and assigning to it
# shadows that feature and hides which table a later cell is looking at.
lda_raw = pd.read_sql("SELECT * FROM [1_31_LDA]", cnx_lda)

# topic columns are labelled with the strings '0' .. '99'
topics = [str(i) for i in range(100)]

# The last row of the table is read as the per-topic lemmas rather than as
# a story (presumably written there by the LDA export -- confirm upstream).
topics_lemmas = lda_raw.iloc[-1][topics]

# Topic weights indexed by story id, without that trailing lemma row.
# set_index() returns a new frame, so no column slice is mutated in place.
df_lda = lda_raw[topics].set_index(lda_raw['story_id']).iloc[:-1]

In [7]:
# get emotion vectors
# The raw table is bound to a descriptive name rather than `_`: in
# IPython/Jupyter `_` is the "last output" variable, and assigning to it
# shadows that feature and hides which table a later cell is looking at.
sentiments_raw = pd.read_sql("SELECT * FROM [2016-01_sentiments_annotated.db]", cnx_sentiment)

# Column order matters: controversy() reads position 0 as the negative count
# and position 2 as the positive count.
# set_index() returns a new frame, so no column slice is mutated in place.
df_emotions = (
    sentiments_raw[['negative', 'ambiguous', 'positive']]
    .set_index(sentiments_raw['story_id'])
)

In [8]:
def controversy(topic, cutoff_topic=.1, df_emotions=df_emotions, df_lda=df_lda, mode="custom"):
    """Score how evenly negative and positive emotions are split for a topic.

    Stories whose weight for `topic` is strictly greater than `cutoff_topic`
    are selected; the emotion vectors of those stories are then compared.

    Parameters
    ----------
    topic : str
        Column label of the topic in `df_lda`. It is concatenated into the
        printed message, so it must be a string.
    cutoff_topic : float
        Minimum (exclusive) topic weight for a story to count for the topic.
    df_emotions : pandas.DataFrame
        Indexed by story id. Position 0 of each row is summed as the negative
        count and position 2 as the positive count (with the default frame
        these are the 'negative' and 'positive' columns). The default is the
        module-level frame as it exists when this `def` is executed.
    df_lda : pandas.DataFrame
        Indexed by story id, one column per topic. A weight may be None
        (skipped) or anything `float()` accepts. The default is bound at
        definition time, like `df_emotions`.
    mode : str
        "custom": ratio of the smaller to the larger of the summed negative
        and positive counts, or 0 when either sum is below 5.
        "chi2": 1 minus the p-value returned by `chi2_contingency` on the
        selected emotion vectors.

    Returns
    -------
    tuple
        `(score, story_ids)`. `story_ids` is the set of selected index values
        of `df_lda`. `score` is 0 when two or fewer selected stories have an
        emotion vector with a positive sum.

    Raises
    ------
    ValueError
        If `mode` is neither "chi2" nor "custom" and a score has to be
        computed (previously this surfaced as an UnboundLocalError on
        `score`).
    """
    # retrieve all relevant story ids for given topic
    # Only this topic's column is walked; building a full row Series per
    # story (iterrows) is unnecessary work repeated for every topic.
    story_ids = set(
        story_id
        for story_id, weight in zip(df_lda.index, df_lda[topic])
        if weight is not None and float(weight) > cutoff_topic
    )

    # Emotion rows are matched through str(index value). Normalise the
    # selected ids the same way so the match also works when the LDA index is
    # not made of strings; the returned set keeps the original values.
    wanted_ids = set(str(story_id) for story_id in story_ids)

    # retrieve all emotions vectors for relevant stories
    # (rows whose values do not sum to something positive are ignored)
    emotion_vectors = [
        list(values)
        for story_id, values in zip(df_emotions.index, df_emotions.values)
        if str(story_id) in wanted_ids and values.sum() > 0
    ]

    if len(emotion_vectors) <= 2:
        print("topic " + topic + ": not enough stories with emotions vectors in that topic")
        return 0, story_ids

    # calculate divergence
    if mode == "chi2":
        _, p, _, _ = chi2_contingency(emotion_vectors)
        score = 1 - p
    elif mode == "custom":
        neg = sum(vector[0] for vector in emotion_vectors)
        pos = sum(vector[2] for vector in emotion_vectors)
        if neg >= 5 and pos >= 5:
            # smaller total over larger total: 1 means a perfectly even split
            score = pos / neg if neg >= pos else neg / pos
        else:
            # too few negative or positive annotations to call it controversial
            score = 0
    else:
        raise ValueError("unknown mode %r; expected 'chi2' or 'custom'" % (mode,))

    print("topic " + topic + ": controversy score: " + str(score))
    return score, story_ids


# evaluate for each topic
# Both lists are filled in the order of `topics`, so position i in either
# list belongs to topics[i].
controversy_scores, stories = [], []
for topic in topics:
    topic_score, topic_story_ids = controversy(topic)
    controversy_scores.append(topic_score)
    stories.append(topic_story_ids)


topic 0: controversy score: 0.919527896996
topic 1: controversy score: 0.606060606061
topic 2: controversy score: 0.797833935018
topic 3: controversy score: 0.70243902439
topic 4: controversy score: 0.950310559006
topic 5: controversy score: 0.527777777778
topic 6: controversy score: 0.647777777778
topic 7: controversy score: 0.55
topic 8: controversy score: 0.75
topic 9: controversy score: 0.653250773994
topic 10: controversy score: 0.560468140442
topic 11: controversy score: 0.511879049676
topic 12: controversy score: 0.613526570048
topic 13: controversy score: 0.841121495327
topic 14: controversy score: 0.628272251309
topic 15: controversy score: 0.513196480938
topic 16: controversy score: 0.627214170692
topic 17: controversy score: 0.692434324397
topic 18: controversy score: 0.811926605505
topic 19: controversy score: 0.382394366197
topic 20: controversy score: 0.501992031873
topic 21: controversy score: 0.86974789916
topic 22: controversy score: 0.907185628743
topic 23: controversy score: 0.451428571429
topic 24: controversy score: 0.792307692308
topic 25: controversy score: 0.83081570997
topic 26: controversy score: 0.689295039164
topic 27: controversy score: 0.785087719298
topic 28: controversy score: 0.494047619048
topic 29: controversy score: 0.709677419355
topic 30: controversy score: 0.624145785877
topic 31: controversy score: 0.881656804734
topic 32: controversy score: 0.56015037594
topic 33: controversy score: 0.659574468085
topic 34: controversy score: 0.711678832117
topic 35: controversy score: 0.621323529412
topic 36: controversy score: 0.629473684211
topic 37: controversy score: 0.633027522936
topic 38: controversy score: 0.639484978541
topic 39: controversy score: 0.773399014778
topic 40: controversy score: 0.708812260536
topic 41: controversy score: 0.677734375
topic 42: controversy score: 0.78853046595
topic 43: controversy score: 0.876160990712
topic 44: controversy score: 0.501945525292
topic 45: controversy score: 0.639871382637
topic 46: controversy score: 0.49043062201
topic 47: controversy score: 0.747616456703
topic 48: controversy score: 0.705442902882
topic 49: controversy score: 0.53642384106
topic 50: controversy score: 0.825503355705
topic 51: controversy score: 0.825581395349
topic 52: controversy score: 0.6
topic 53: controversy score: 0.538834951456
topic 54: controversy score: 0.805653710247
topic 55: controversy score: 0.674931129477
topic 56: controversy score: 0.497737556561
topic 57: controversy score: 0.681818181818
topic 58: controversy score: 0.904458598726
topic 59: controversy score: 0.694610778443
topic 60: controversy score: 0.759188846641
topic 61: controversy score: 0.671256958687
topic 62: controversy score: 0.418903150525
topic 63: controversy score: 0.534653465347
topic 64: controversy score: 0.77656945131
topic 65: controversy score: 0.905714285714
topic 66: controversy score: 0.642313546423
topic 67: controversy score: 0.401639344262
topic 68: controversy score: 0.768656716418
topic 69: controversy score: 0.548314606742
topic 70: controversy score: 0.702127659574
topic 71: controversy score: 0.674518201285
topic 72: controversy score: 0.778280542986
topic 73: controversy score: 0.718562874251
topic 74: controversy score: 0.571428571429
topic 75: controversy score: 0.885929357267
topic 76: controversy score: 0.793939393939
topic 77: controversy score: 0.809523809524
topic 78: controversy score: 0.652173913043
topic 79: controversy score: 0.731225296443
topic 80: controversy score: 0.84858044164
topic 81: controversy score: 0.995949594959
topic 82: controversy score: 0.886363636364
topic 83: controversy score: 0.683544303797
topic 84: controversy score: 0.883491599708
topic 85: controversy score: 0.662236905602
topic 86: controversy score: 0.675
topic 87: controversy score: 0.698689956332
topic 88: controversy score: 0.628712871287
topic 89: controversy score: 0.639423076923
topic 90: controversy score: 0.576
topic 91: controversy score: 0.381111111111
topic 92: controversy score: 0.75204359673
topic 93: controversy score: 0.693602693603
topic 94: controversy score: 0.393530997305
topic 95: controversy score: 0.9383640553
topic 96: controversy score: 0.72197309417
topic 97: controversy score: 0.840579710145
topic 98: controversy score: 0.708333333333
topic 99: controversy score: 0.549905838041

In [ ]:
# One row per topic: its controversy score, its lemmas and the ids of the
# stories selected for it, written out as CSV.
# `topics_lemmas` is a Series and is aligned on the topic labels; the two
# lists are positional and are expected to be in `topics` order.
df_topic_controversy = pd.DataFrame(
    {
        'controversy': controversy_scores,
        'lemmas': topics_lemmas,
        'story_ids': stories,
    },
    index=topics,
    columns=['controversy', 'lemmas', 'story_ids'],  # pin the column order
)
df_topic_controversy.to_csv("January_controversy_scores.csv")

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: