In [1]:
from pyspark import SparkContext, SparkConf
import sys
from utility_code.util import *
import glob
import json
from operator import add
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities
from gensim.matutils import corpus2csc
from collections import Counter
import utility_code.dependency_parse_object as dpo
import msgpack

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import cPickle as pickle

from urllib2 import Request, urlopen, URLError

from vaderSentiment.vaderSentiment import sentiment

sns.set()

all_files = glob.glob("../analysis/fin_conll_analysis/*")

In [48]:
def get_identities(line):
    # pull out the tokens labeled "Identity" and merge runs of consecutive
    # token ids into a single multi-word identity (e.g. police_officer)
    to_ret = []
    identities = [dpo.DependencyParseObject(o, do_lemmatize=False)
                      for o in line if o.split("\t")[-1] == "Identity"]
    i = 0
    while i < len(identities):
        text = get_cleaned_text(identities[i].singular_form.lower()).replace("#", "")
        while i < len(identities) - 1 and identities[i + 1].id == identities[i].id + 1:
            text += "_" + get_cleaned_text(identities[i + 1].singular_form.lower()).replace("#", "")
            i += 1
        to_ret.append(text)
        i += 1
    # return (identity, 1) pairs so the caller can reduceByKey(add) to get counts
    return [(x, 1) for x in to_ret]

def get_loc_data():

    full_uid_to_county = pickle.load(open("../analysis/country_state_to_uid.p", "rb"))
    uids_to_county = {}
    c = 0
    for a in all_files:
        uid = int(a.replace("../analysis/fin_conll_analysis/", "").replace(".txt.gz", ""))
        if uid in full_uid_to_county:
            c += 1
            uids_to_county[uid] = full_uid_to_county[uid]
    print 'n geotagged users: ', c

    # ACS 5-year estimate variables (renamed to readable column names below)
    fields_of_interest = ['B01003_001E','B01001_002E','B01001_026E','B02001_002E','B02001_003E','B01002_001E',
                          'B19001_001E','B19013_001E','B03003_002E','B03003_003E','B19083_001E',
                          'B19301_001E','B19301H_001E','B19301B_001E','B19301I_001E','B23025_002E','B23025_007E']

    q = "http://api.census.gov/data/2013/acs5?get=NAME,{params}&for=county:*&key={key}".format(
                params=",".join(fields_of_interest), key='a6e317918af5d4097d792cabd992f41f2691b75e')
    response = urlopen(q)
    # the API returns a JSON array of rows; the first row is the header
    rows = json.loads(response.read())
    headers = rows.pop(0)
    df = pd.DataFrame(rows, columns=headers)
    df[fields_of_interest] = df[fields_of_interest].astype('float')

    df.columns = ['county_name','total_pop','male_pop','female_pop','white_pop','black_pop','med_age',
                  'n_home','med_income','not_hisp_pop','hisp_pop','gini',
                  'per_cap_inc','per_cap_white','per_cap_black','per_cap_hisp','tot_in_lab','tot_not_in_lab','state','county']

    # keys are state abbreviations, values are full state names
    state_to_abbrev = {x.strip().split("\t")[1]: x.split("\t")[0] for x in open("../analysis/state_abbrev.tsv", "r")}
    state_to_abbrev['DC'] = 'District of Columbia'
    state_to_abbrev['PR'] = 'Puerto Rico'

    x = pd.DataFrame([(k, v[0] + ", " + state_to_abbrev[v[1]]) for k, v in uids_to_county.items()])
    x.columns = ['uid', 'vname']
    full_loc_dat = pd.merge(x, df, left_on='vname', right_on='county_name')
    full_loc_dat['b_percent'] = full_loc_dat.black_pop / full_loc_dat.total_pop
    full_loc_dat.sort(columns="b_percent", inplace=True, ascending=False)
    return full_loc_dat
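
The span-merging loop inside get_identities can be illustrated in isolation; a minimal sketch using plain (token_id, text) pairs as a simplified stand-in for DependencyParseObject (the real class carries many more fields):

In [ ]:
def merge_adjacent(tokens):
    # tokens: (token_id, text) pairs sorted by token_id; runs of consecutive
    # ids are joined into a single multi-word identity, as in get_identities
    merged, i = [], 0
    while i < len(tokens):
        text = tokens[i][1]
        while i < len(tokens) - 1 and tokens[i + 1][0] == tokens[i][0] + 1:
            text += "_" + tokens[i + 1][1]
            i += 1
        merged.append(text)
        i += 1
    return merged

print merge_adjacent([(3, "police"), (4, "officer"), (9, "lawyer")])
# -> ['police_officer', 'lawyer']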

In [40]:
conf = (SparkConf()
     .setMaster("local[*]")
     .setAppName("My app")
     .set("spark.driver.maxResultSize", "70g"))
sc = SparkContext(conf=conf)

In [44]:
only_identity_tweets = sc.parallelize(all_files, 64).flatMap(read_grouped_by_newline_file)

In [4]:
## get some stats on all tweets

n_tweets = only_identity_tweets.count()
total_terms = only_identity_tweets.map(lambda x: len(x)).reduce(add)
print 'N total tweets: ', n_tweets, ' Total terms: ', total_terms


N total tweets:  45448604  Total terms:  855462036

In [106]:
# collect identity counts
identities = only_identity_tweets.flatMap(get_identities).reduceByKey(add).collect()

In [108]:
# plot identity distribution, get most frequent identities
identities_counter = Counter()
total_identities = 0
for x in identities:
    identities_counter[x[0]] += x[1]
    total_identities += x[1]
total_identities = float(total_identities)

print 'total identities: ', total_identities#, ' proportion: ', total_identities/total_terms
for x in identities_counter.most_common(25):
    print x[0], float(x[1])/total_identities
print len(identities_counter)

pdf_data = Counter()
for x in identities_counter.items():
    pdf_data[x[1]] += 1
    
pdf_data = np.array(pdf_data.items())

plt.yscale("log")
plt.xscale('log')
plt.scatter(pdf_data[:,0],pdf_data[:,1])


 total identities:  54242490.0
girl 0.058047224602
guy 0.0578001489238
friend 0.0515115917429
kid 0.0322996234133
woman 0.0310493120799
nigga 0.0274180075435
mom 0.0248451352436
fan 0.0232287824545
dude 0.0161952742214
child 0.0156884390816
dad 0.0146144102161
lady 0.01366884153
brother 0.0135526595479
bitch 0.0131788566491
player 0.0124496312761
boy 0.0116356937154
sister 0.00972665524757
student 0.00957389677354
police 0.00903929742163
parent 0.00860784598937
cop 0.00843097357809
president 0.00804362041639
teacher 0.00756982210809
black 0.00728961741985
wife 0.00717651420501
143428
Out[108]:
<matplotlib.collections.PathCollection at 0x7f6afee58950>

In [10]:
pdf_dat_df = pd.DataFrame(pdf_data)
pdf_dat_df.to_csv("results/pdf_identities.csv")

In [13]:
# get per-user identities
def get_identities_with_uid_key(line):
    identities = [x[0] for x in get_identities(line)]
    if len(identities):
        # the user id is stored in the .dataset field of the parse objects
        uid = dpo.DependencyParseObject(line[0], do_lemmatize=False).dataset
        return (uid, identities)
    else:
        return (None, [])

per_uid_data = only_identity_tweets.map(get_identities_with_uid_key).reduceByKey(add).collect()
# dump results out to file
msgpack.dump(per_uid_data, open("../analysis/identity_per_user.mpack","wb"))
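
Each element of per_uid_data is a (user id, list of identities) pair; reduceByKey(add) concatenates the per-tweet identity lists for each user. A toy illustration with made-up ids:

In [ ]:
pairs = [("42", ["girl", "friend"]), ("42", ["cop"]), ("7", ["teacher"])]
print sc.parallelize(pairs).reduceByKey(add).collect()
# each user id now maps to the concatenation of its identity lists,
# e.g. ('42', ['girl', 'friend', 'cop']) and ('7', ['teacher'])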

In [15]:
# basic per-user stats: total and unique identity mentions per user
usage_counts = []
uid_dict = {}
for k, v in per_uid_data:
    uid_dict[k] = v
    usage_counts.append([len(v),len(set(v))])
    
df = pd.DataFrame(usage_counts)
df.columns = ['tot','uniq']
df.sort(columns='uniq',inplace=True,ascending=False)
print df.uniq.median()
df.head(10)

f, ax = plt.subplots(figsize=(7, 7))
#ax.set(xscale="log")
#plt.xlim([1,1000])
sns.distplot(df.uniq,ax=ax,kde=False)


68.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4d483a2550>

In [18]:
df.to_csv("../python/results/per_user_identity_counts.csv")

In [119]:
# set up the dictionary for the lda 

texts_for_lda = [x[1] for x in per_uid_data if len(set(x[1]))> 50]

print 'n texts: ', len(texts_for_lda)
# run bipartite clustering?
dictionary = corpora.Dictionary(texts_for_lda)
print dictionary
dictionary.filter_extremes(no_below=100)
print dictionary
corpus = [dictionary.doc2bow(text) for text in texts_for_lda]


n texts:  161436
Dictionary(137327 unique tokens: [u'', u'fawn', u'aunt_pol', u'female_child', u'liberal_a-hole']...)
Dictionary(4293 unique tokens: [u'gatekeeper', u'pedophile', u'islander_fan', u'saver', u'vermin']...)

In [ ]:
# run the LDA
lda = models.ldamodel.LdaModel(corpus, num_topics=30, alpha='auto', id2word=dictionary, iterations=1000)
lda.save("../analysis/lda_model.lda")

In [ ]:
# save this out for semantic coherence
top = lda.top_topics(corpus, 10)
msgpack.dump(top, open("../analysis/lda_mod.mpack","wb"))
for x in top:
    print x[1], x[0][0][1]

In [14]:
# load cached LDA, write out results

lda = models.ldamodel.LdaModel.load("../analysis/lda_model.lda")

lda_data = []
for i,t in enumerate(lda.show_topics(30, num_words=20, formatted=False)):
    for x in t:
        lda_data.append([i, x[0],x[1]])
        
pd.DataFrame(lda_data).to_csv("../python/results/lda_results.csv")

# Load cached user -> identity matrix
per_uid_data = msgpack.load( open("../analysis/identity_per_user.mpack","rb"))

In [ ]:
# get geotagged data
full_loc_dat = get_loc_data()
geo_tagged_uids = set(full_loc_dat.uid.tolist())

In [21]:
# map each geotagged user's identity usage into the LDA topic space
def get_doc_topic(corpus, model):
    # with eps=0 the model returns a probability for every topic,
    # so each row should contain all 30 entries in topic order
    doc_topic = list()
    for i, doc in enumerate(corpus):
        doc_topic.append(model.__getitem__(doc, eps=0))
    return doc_topic

def get_vector_for_topic(top_data, top_num):
    return [x[top_num][1] for x in top_data]

test_for_space = [dictionary.doc2bow(x[1]) for x in per_uid_data if int(x[0]) in geo_tagged_uids]
test_for_space_uids = [x[0] for x in per_uid_data if int(x[0]) in geo_tagged_uids]
top_data = get_doc_topic(test_for_space, lda)

In [61]:
# do racial comparisons: attach each topic's per-user weight to the location data
loc_with_top = full_loc_dat
for topic_num in range(30):
    v_for_n = get_vector_for_topic(top_data, topic_num)
    vn_space = pd.DataFrame(zip(test_for_space_uids, v_for_n))
    vn_space.columns = ['uid', 'top_' + str(topic_num) + '_val']
    vn_space['uid'] = vn_space.uid.astype(int)
    loc_with_top = pd.merge(loc_with_top, vn_space, left_on='uid', right_on='uid')

In [ ]:
#sns.distplot(loc_with_top.b_percent)
#sns.distplot(loc_with_top.nigga_use)

In [62]:
loc_with_top.to_csv("../python/results/spatial_lda_results.csv")

In [46]:
## now move to affect stuff

def get_tweet_info(line):
    o = dpo.DependencyParseObject(line[0])
    text = " ".join([l.split("\t")[1] for l in line])
    sent = sentiment(text)['compound']
    date = o.is_reply  # the tweet date is stored in the is_reply field
    identities = get_identities(line)
    return [o.tweet_id, sent, date, [i[0] for i in identities], o.dataset]

#for line in read_grouped_by_newline_file("../analysis/fin_conll_analysis/310325693.txt.gz"):
#    print get_tweet_info(line)
    
tweet_identity_data = only_identity_tweets.map(get_tweet_info)

#tweet_info = tweet_identity_data.collect()
#msgpack.dump(tweet_info,open("../analysis/tweet_info.mpack","wb"))

In [ ]:
# identity usage per 1/4 month
# get the things that modified police, race identities
# predict user location with per-user prediction

In [ ]:
# Affective

In [49]:
#tweet_info = msgpack.load(open("../analysis/tweet_info.mpack","rb"))
def get_identity_to_person_sent_mat(tweet_info_obj):
    # tweet_info_obj is [tweet_id, compound sentiment, date, identities, user id]
    tweet_id, sent, date, identities, userid = tweet_info_obj
    if sent == 0:
        return []
    else:
        return [((i, userid), sent) for i in set(identities)]


sentiment_mat = tweet_identity_data.flatMap(get_identity_to_person_sent_mat).collect()
msgpack.dump(sentiment_mat,open("../analysis/sentiment_mat.mpack","wb"))

In [ ]:
# get the sentiment matrix, reset 

## NOTE THIS SHOULD REALLY BE SENTIMENT_MAT but bug before when writing out
#sentiment_mat = msgpack.load(open("../analysis/tweet_info.mpack","rb"))
c = Counter()
for i, s in enumerate(sentiment_mat):
    c["_".join(s[0])] += s[1]
    
dat = [x[0].rsplit("_",1)+ [x[1]] for x in c.iteritems()]
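
The rsplit("_", 1) above matters because identities can themselves contain underscores, while the user id appended on the right never does; a single split from the right recovers both parts. A quick check:

In [ ]:
key = "_".join(("police_officer", "636262914"))
print key.rsplit("_", 1)
# -> ['police_officer', '636262914']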

In [53]:
# only include the stuff from the lda
lda = models.ldamodel.LdaModel.load("../analysis/lda_model.lda")
per_uid_data = msgpack.load( open("../analysis/identity_per_user.mpack","rb"))
users_for_lda = set([int(x[0]) for x in per_uid_data if len(set(x[1]))> 50])

texts_for_lda = [x[1] for x in per_uid_data if len(set(x[1]))> 50]
print 'n texts: ', len(texts_for_lda)
# run bipartite clustering?
dictionary = corpora.Dictionary(texts_for_lda)
print dictionary
dictionary.filter_extremes(no_below=100)
print dictionary


n texts:  161436
Dictionary(137327 unique tokens: [u'', u'fawn', u'aunt_pol', u'female_child', u'liberal_a-hole']...)
Dictionary(4293 unique tokens: [u'gatekeeper', u'pedophile', u'islander_fan', u'saver', u'vermin']...)

In [54]:
df = pd.DataFrame(dat)
df.columns = ['term','uid','sent']

terms_of_interest = set(dictionary.token2id.keys())
df = df[df.term.apply(lambda x: x in terms_of_interest)]
df['uid'] = df.uid.astype(int)
df = df[df.uid.apply(lambda x: x in users_for_lda)]
df.to_csv("../analysis/raw_df.csv")


---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-54-168f40b3c3de> in <module>()
      6 df['uid'] = df.uid.astype(int)
      7 df = df[df.uid.apply(lambda x: x in users_for_lda)]
----> 8 df.to_csv("../analysis/raw_df.csv")

/home/kjoseph/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal, **kwds)
   1187                                      escapechar=escapechar,
   1188                                      decimal=decimal)
-> 1189         formatter.save()
   1190 
   1191         if path_or_buf is None:

/home/kjoseph/anaconda/lib/python2.7/site-packages/pandas/core/format.pyc in save(self)
   1465 
   1466             else:
-> 1467                 self._save()
   1468 
   1469         finally:

/home/kjoseph/anaconda/lib/python2.7/site-packages/pandas/core/format.pyc in _save(self)
   1565                 break
   1566 
-> 1567             self._save_chunk(start_i, end_i)
   1568 
   1569     def _save_chunk(self, start_i, end_i):

/home/kjoseph/anaconda/lib/python2.7/site-packages/pandas/core/format.pyc in _save_chunk(self, start_i, end_i)
   1592                                         quoting=self.quoting)
   1593 
-> 1594         lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
   1595 
   1596 # from collections import namedtuple

pandas/lib.pyx in pandas.lib.write_csv_rows (pandas/lib.c:17612)()

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 5: ordinal not in range(128)

In [56]:
# re-save with explicit utf-8 encoding to avoid the UnicodeEncodeError above
df.to_csv("../analysis/raw_df.csv",encoding='utf8')

In [114]:
# check out highest and lowest sentiment words
f = df[['term','sent']].groupby("term").sum()
f.sort(columns='sent',inplace=True)
f = f.reset_index()

doc_freq = []
for k in dictionary.token2id.keys():
    doc_freq.append([k, identities_counter[k]])
doc_freq = pd.DataFrame(doc_freq)
doc_freq.columns = ['term','n_doc']
doc_freq.sort(columns='n_doc',ascending=False, inplace=True)
f = pd.merge(f, doc_freq, left_on='term',right_on='term')
f['sent_per_doc'] = f.sent/f.n_doc
f.sort(columns='sent_per_doc',inplace=True)

In [116]:
f.head(10)


Out[116]:
term sent n_doc sent_per_doc
386 bitch_bitch -100.9976 149 -0.677836
434 hater_nigga -81.2768 125 -0.650214
332 rape -132.3976 213 -0.621585
388 white_criminal -100.6194 176 -0.571701
394 black_criminal -98.5218 173 -0.569490
185 nigga_bitch -422.5157 744 -0.567897
199 cop_killer -375.0204 663 -0.565642
478 animal_abuser -70.6499 125 -0.565199
216 woman_bitch -323.1013 592 -0.545779
217 child_abuser -313.6885 582 -0.538984

In [120]:
# pivot to a term x user sentiment matrix, then standardize rows and columns
z = df.pivot("term", "uid", "sent").fillna(0)

from sklearn.preprocessing import scale
term_to_u_mat = z.values
term_to_u_mat = scale(term_to_u_mat, axis=1)
term_to_u_mat = scale(term_to_u_mat)

In [122]:
from sklearn.random_projection import SparseRandomProjection
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, FastICA
import sklearn.mixture as mixture

sp = SparseRandomProjection(n_components = 500)
X = sp.fit_transform(term_to_u_mat)
mixed = mixture.VBGMM(n_components=30, covariance_type='diag', n_iter=100)
fitted = mixed.fit(X)

In [123]:
fitted.n_components


Out[123]:
30

In [124]:
r = fitted.predict_proba(X)

In [125]:
aff_clust_dat = []
for i in range(r.shape[0]):
    aff_clust_dat.append([z.index[i], r[i].argmax(), r[i].max()])

aff_clust = pd.DataFrame(aff_clust_dat)

In [126]:
aff_clust.columns = ['term','cluster','prob']

In [127]:
aff_clust.sort('prob',inplace=True)

In [130]:
aff_clust.tail(20)


Out[130]:
term cluster prob
81 afrikaner 28 0.999997
2549 mistres 28 0.999997
489 bonafide 28 0.999997
3680 spotter 21 0.999997
3037 politico 28 0.999997
3987 underclassman 10 0.999997
2260 liberal_journalist 28 0.999997
1986 immigrant_kid 16 0.999997
2522 miner_daughter 28 0.999997
2519 minded 20 0.999997
2119 junior 10 0.999997
1406 fellow_veteran 28 0.999997
2514 millennial 5 0.999997
717 child_rapist 28 0.999997
1402 fellow_texan 28 0.999997
1793 haitian 29 0.999997
2499 migo 10 0.999997
1984 immigrant 28 0.999997
1430 female_customer 28 0.999997
3889 thottie 10 0.999997

In [131]:
aff_clust.to_csv("../python/results/affect_clustering.csv",encoding='utf8')

In [117]:
f.to_csv("../python/results/affect_scores.csv",encoding='utf8')

In [142]:
w2top = {x.strip().split()[0]:x.strip().split()[1] for x in open("results/word2topic.tsv")}

In [224]:
from math import floor
identities_of_interest = {'victim','officer','police_officer','protester',
                          'juror','teen','prosecutor',
                          'lawyer','attorney','reporter','journalist','murderer','judge',
                          'nigga','mayor','driver','nigger','negro','racist',"cop","pig",
                          "black","white"}
def get_word_to_date_count(line):
    # for each tweet mentioning an identity of interest, emit
    # (half-month date, identity, head word, compound sentiment, user id) tuples
    objs = [dpo.DependencyParseObject(l, do_lemmatize=False) for l in line]
    date = objs[0].is_reply   # the date (MM-DD-YY) is stored in the is_reply field
    uid = objs[0].dataset     # the user id is stored in the dataset field

    # keep only tweets from 2013 onward
    if int(date.split("-")[2]) < 13:
        return []

    identities = [o for o in objs if o.label == "Identity"]
    i = 0

    ids = []
    heads = []

    while i < len(identities):
        text = get_cleaned_text(identities[i].singular_form.lower()).replace("#","")
        head = ''
        if identities[i].head > 0:
            head = objs[identities[i].head-1].text + "_" + objs[identities[i].head-1].postag
        # merge two consecutive tokens only when the merged form is itself of interest
        if i < len(identities)-1 and identities[i+1].id == identities[i].id+1:
            tmp_txt = text + "_" + get_cleaned_text(identities[i+1].singular_form.lower()).replace("#","")
            if tmp_txt in identities_of_interest:
                text = tmp_txt
                i += 1
                if identities[i].head > 0:
                    head = objs[identities[i].head-1].text + "_" + objs[identities[i].head-1].postag
        ids.append(text)
        heads.append(head)

        i += 1

    if not len(set(ids) & identities_of_interest):
        return []

    # drop head words that are not pure ASCII
    head2 = []
    for h in heads:
        try:
            h.encode('ascii')
            head2.append(h)
        except UnicodeError:
            head2.append('')

    text = " ".join([l.split("\t")[1] for l in line])
    sent = sentiment(text)['compound']
    # bucket the day of month into half-month periods: day 1 or day 16
    d_spl = date.split("-")
    wk = int(floor(float(d_spl[1]) / 15))
    if wk > 1:
        wk = 1
    day = str((wk * 15) + 1)
    date_ret = "-".join([d_spl[0], day, d_spl[2]])
    return [(date_ret, x, head2[i], sent, uid) for i, x in enumerate(ids) if x in identities_of_interest]


# for x in read_grouped_by_newline_file(all_files[40]):
#     z =  get_word_to_date_count(x)
#     if len(z):
#         print '\t', z
date_info = only_identity_tweets.flatMap(get_word_to_date_count).collect()
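
The half-month date bucketing inside get_word_to_date_count can be read in isolation; a minimal sketch of the same arithmetic, assuming MM-DD-YY date strings:

In [ ]:
from math import floor

def half_month_bucket(date_str):
    # days 1-14 map to day "1", day 15 and later map to day "16"
    month, day, year = date_str.split("-")
    wk = min(int(floor(float(day) / 15)), 1)
    return "-".join([month, str(wk * 15 + 1), year])

print half_month_bucket("09-03-14"), half_month_bucket("09-16-14")
# -> 09-1-14 09-16-14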

In [178]:
full_loc_dat = get_loc_data()
geo_tagged_uids = set([int(x) for x in full_loc_dat.uid.tolist()])


n geotagged users:  73040

In [235]:
date_info[1]


Out[235]:
(u'09-16-14', u'victim', u'cry_V', -0.8176, u'636262914')

In [192]:
from collections import Counter, defaultdict
# date_info tuples are (date, identity, head word, sentiment, user id);
# sum sentiment and count tweets per (user, identity, half-month)
sent_sums = defaultdict(lambda: defaultdict(Counter))
sent_counts = defaultdict(lambda: defaultdict(Counter))
for x in date_info:
    date, word, sent, uid = x[0], x[1], x[3], x[4]
    sent_sums[uid][word][date] += sent
    sent_counts[uid][word][date] += 1

time_dat = []
for user, word_date_count in sent_sums.items():
    for word, date_count in word_date_count.items():
        for date, sent_sum in date_count.items():
            date_split = date.split("-")
            year = date_split[2]
            month = date_split[0]
            day = int(date_split[1])
            if day > 1:
                day = 1
            day = str(day * 15)
            time_dat.append([user, word, "-".join([month, day, year]), sent_sum, sent_counts[user][word][date]])

time_df = pd.DataFrame(time_dat)
time_df.to_csv("../python/results/time_sent_user_df.csv")

In [234]:
pdf_data[:,1].sum()


Out[234]:
143428

In [ ]:
# check how affective meanings change