In [19]:
### Make sure to run the relevant cells on the test data too

In [80]:
import math
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series
import random
#import prettyplotlib as ppl
#from sklearn import ensemble as ske
%matplotlib inline
#import seaborn as sns
import re
from django.utils.encoding import smart_str

df = pd.read_json('data/test_1.json')
df.columns.values.tolist()


Out[80]:
[u'giver_username_if_known',
 u'request_id',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_subreddits_at_request',
 u'requester_subreddits_at_request',
 u'requester_upvotes_minus_downvotes_at_request',
 u'requester_upvotes_plus_downvotes_at_request',
 u'requester_username',
 u'unix_timestamp_of_request',
 u'unix_timestamp_of_request_utc']

In [81]:
## Cleaning some weird non-ASCII characters
df['clean_request_text'] = df['request_text_edit_aware'].apply(smart_str)
#df = df.drop('request_text_edit_aware', 1)
df['clean_title_text'] = df['request_title'].apply(smart_str)
#df = df.drop('request_title', 1)

In [82]:
#FEATURE ADDED: length of request text, log-transformed
df['request_length'] = df['clean_request_text'].apply(len)
df['log_request_length'] = df['request_length'].apply(lambda x: math.log10(x + 1))

#Avoiding collinearity in predictors: keep only the log-transformed length
df = df.drop('request_length', 1)
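
In [ ]:
# Optional sanity check (a sketch, not part of the pipeline): the raw length
# and its log are monotone transforms of each other, so their Spearman rank
# correlation is 1.0; that is why only the log version is kept above.
df['clean_request_text'].apply(len).corr(df['log_request_length'], method='spearman')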

In [83]:
#FEATURE ADDED: length of request title, log-transformed
df['request_title_length'] = df['request_title'].apply(len)
df['log_title_length'] = df['request_title_length'].apply(lambda x: math.log10(x + 1))

df = df.drop('request_title_length', 1)

In [84]:
#FEATURE ADDED: presence of a link
def link_test(inputstring):
    # escape the dots so '.com' matches a literal dot rather than any character
    # (unescaped, the pattern '.com' would also hit e.g. 'income')
    a = re.search(r"\.com", inputstring)
    b = re.search("http", inputstring)
    c = re.search(r"\.net", inputstring)
    if a or b or c:
        return 1
    else:
        return 0

df['link_presence?'] = df['clean_request_text'].apply(link_test)

In [85]:
#FEATURE ADDED: presence of a semicolon, as a rough proxy for education level and how well-written the request is
def colon_test(inputstring):
    if re.search(";", inputstring):
        return 1
    else:
        return 0

df['semi_colon?'] = df['clean_request_text'].apply(colon_test)

In [86]:
#FEATURES ADDED: length of the requester's username (longer ones tend to be wittier), and of the giver's username if known

df['request_user_name_length'] = df['requester_username'].apply(len)

df['giver_user_name_length'] = df['giver_username_if_known'].apply(len)
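
In [ ]:
# Hedged tweak (sketch): 'giver_username_if_known' holds the literal string
# 'N/A' when no giver is recorded (visible in the head() output further down),
# so len() gives those rows a spurious length of 3; zeroing them may be cleaner.
df['giver_user_name_length'] = df['giver_username_if_known'].apply(
    lambda j: 0 if j == 'N/A' else len(j))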

In [87]:
#### Hacking the topical narratives from the paper
#(could supplement these...)

### Open question: the paper only used a binary count; maybe we should add that too. Also, maybe
### we should account for verbosity by dividing by the length of the text (see the sketch after the output below).

money1 = """ week ramen paycheck work couple rice check pizza grocery rent anyone favor someone bill money"""
money2 = """  food money house bill rent stamp month today parent help pizza someone anything mom anyone"""
job =  """job month rent year interview bill luck school pizza paycheck unemployment money ramen end check"""
friend = """ friend house night mine pizza birthday thing school site place family story way movie anything"""
student= """ student college final pizza loan summer university money class meal year semester story kid school"""
familytime = """ tonight night today tomorrow someone anyone friday dinner something account family bank anything home work"""
time = """ day couple anything today work pizza help pay anyone home meal food ramen someone favor"""
gratitude = """ thanks advance guy reading anyone pizza  anything story tonight help place everyone craving kind favor"""
pizza = """pizza craving hut story someone anyone domino money cheese thing request picture act title kind"""
general = """time pizza year people part work hour life thing lurker story anything someone month way"""

narratives = [money1,money2,job,friend,student,familytime,time,gratitude,pizza,general]
### TODO: create 10 columns per request, one per narrative, holding the number of keyword hits
narrative_names = '''money1 money2 job friend student familytime time gratitude pizza general'''

def narrative_scorer(text):
    """Count how many of each narrative's keywords appear in the text."""
    narrative_scores = []

    for i in range(len(narratives)):
        current_keywords = narratives[i].split()
        narrative_score = 0

        for word in current_keywords:
            # NB: plain substring search, so 'rent' also matches inside 'parent';
            # re.search(r"\b%s\b" % word, text) would restrict it to whole words
            if re.search(word, text):
                narrative_score += 1
        narrative_scores.append(narrative_score)

    return narrative_scores

df['clean_request_text'][0:5].apply(narrative_scorer)


Out[87]:
0    [1, 2, 3, 1, 2, 0, 2, 1, 1, 1]
1    [5, 3, 6, 4, 3, 3, 3, 3, 3, 3]
2    [4, 3, 2, 3, 0, 4, 6, 4, 1, 3]
3    [0, 1, 0, 0, 0, 1, 3, 0, 0, 0]
4    [2, 1, 0, 2, 0, 4, 3, 2, 1, 3]
Name: clean_request_text, dtype: object
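
In [ ]:
# Sketch for the open question above: a binary (paper-style) variant and a
# verbosity-normalized variant of the narrative scores. Both build on
# narrative_scorer as defined above; neither is wired into the pipeline yet.
def narrative_scorer_binary(text):
    # 1 if any keyword from the narrative appears, else 0
    return [1 if s > 0 else 0 for s in narrative_scorer(text)]

def narrative_scorer_normalized(text):
    # keyword hits per character of text, to control for verbosity
    n = float(len(text)) or 1.0
    return [s / n for s in narrative_scorer(text)]

df['clean_request_text'][0:5].apply(narrative_scorer_binary)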

In [88]:
# apply() returning a Series per row already yields a DataFrame
narrativedf = df['clean_request_text'].apply(lambda x: Series(narrative_scorer(x)))

#### TODO: join this dataframe with df (done in the merge in In [92] below)

In [89]:
narrativedf.head()


Out[89]:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 1 2 0 2 1 1 1
1 5 3 6 4 3 3 3 3 3 3
2 4 3 2 3 0 4 6 4 1 3
3 0 1 0 0 0 1 3 0 0 0
4 2 1 0 2 0 4 3 2 1 3

In [90]:
narrativedf['request_id'] = df['request_id']
#narrativedf['requester_received_pizza'] = df['requester_received_pizza']

In [91]:
narrativedf.columns = ['T:money1','T:money2','T:Job','T:Friend','T:Student','T:Familytime','T:Time','T:gratitude','T:Pizza','T:General','request_id']

In [92]:
Topical = df.merge(narrativedf, on='request_id')

In [6]:
#####################
#LDA on request and title text#

#Still need to run the model backwards on new doc2bow instances (see the sketch after the topics below)
import gensim
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities
import codecs

In [7]:
# Need to transform the text from the data frame into a list of strings
# (for both the request text and the request title; or should they be merged?
# keeping them separate for now)
request = df['request_text_edit_aware']
Reqtxt = []
for item in request:
    Reqtxt.append(smart_str(item))

title = df['request_title']
Titletxt = []
for item in title:
    Titletxt.append(smart_str(item))

In [144]:
## Next version: update the stoplist
## NB: documents are lowercased before filtering, so the capitalized entries
## ('I', '[Request]', '[REQUEST]') never actually match
stoplist = set("""for a of the and me my to in is i I it am be has [request] [Request] [REQUEST] request (request) pizza pizza! pizza, pizza.
               have would this i'm on when but as just that get with reddit subreddit if at so no are or
               we was out can not some someone""".split())
Rtexts = [[word for word in document.lower().split() if word not in stoplist]
          for document in Reqtxt]
Ttexts = [[word for word in document.lower().split() if word not in stoplist]
          for document in Titletxt]

### TODO: remove all words that aren't nouns (see the sketch after this cell)

### Option: remove all words that only appear once in the entire dataset
### (potentially train the LDA on both test and train?)

# remove words that appear only once
#Rall_tokens = sum(Rtexts, [])
#Rtokens_once = set(word for word in set(Rall_tokens) if Rall_tokens.count(word) == 1)
#Rtexts = [[word for word in text if word not in Rtokens_once]
#          for text in Rtexts]

#Tall_tokens = sum(Ttexts, [])
#Ttokens_once = set(word for word in set(Tall_tokens) if Tall_tokens.count(word) == 1)
#Ttexts = [[word for word in text if word not in Ttokens_once]
#          for text in Ttexts]
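
In [ ]:
# Sketch for the noun TODO above, assuming nltk and its part-of-speech tagger
# data are installed (an assumption; nltk is not used elsewhere in this notebook).
import nltk

def nouns_only(tokens):
    # pos_tag returns (word, tag) pairs; NN* tags mark nouns
    return [w for w, tag in nltk.pos_tag(tokens) if tag.startswith('NN')]

#Rtexts = [nouns_only(text) for text in Rtexts]

# And a faster take on the commented-out "appears only once" filter above,
# using a single Counter pass instead of repeated list.count() scans:
from collections import Counter

def drop_hapaxes(texts):
    counts = Counter(word for text in texts for word in text)
    return [[word for word in text if counts[word] > 1] for text in texts]

#Rtexts = drop_hapaxes(Rtexts)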

In [145]:
Rdict = corpora.Dictionary(Rtexts)
Tdict = corpora.Dictionary(Ttexts)

In [146]:
Rdict.save('request_dictionary.dict')
Tdict.save('title_dictionary.dict')

In [147]:
## Creating bag-of-words corpora and serializing them to disk
Rcorpus = [Rdict.doc2bow(text) for text in Rtexts]
Tcorpus = [Tdict.doc2bow(text) for text in Ttexts]
corpora.MmCorpus.serialize('Rcorpus.mm', Rcorpus)
corpora.MmCorpus.serialize('Tcorpus.mm', Tcorpus)

In [148]:
# 23-topic LDA over the request corpus (single pass, online updates)
Rlda_23 = gensim.models.ldamodel.LdaModel(corpus=Rcorpus, id2word=Rdict, num_topics=23, update_every=20, chunksize=100, passes=1)

In [99]:
Rlda_23.save('Rlda_23.model')
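
In [ ]:
# Sketch: reloading the saved artifacts later (e.g., in the test-data run this
# notebook mentions at the top), assuming the files written above are on disk.
Rdict = corpora.Dictionary.load('request_dictionary.dict')
Rcorpus = corpora.MmCorpus('Rcorpus.mm')
Rlda_23 = gensim.models.ldamodel.LdaModel.load('Rlda_23.model')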

In [149]:
Rlda_23.print_topics(23)


Out[149]:
[u'0.021*). + 0.010*been + 0.006*could + 0.006*help + 0.006*sense + 0.005*really + 0.004*you + 0.004*week + 0.004*our + 0.004*even',
 u'0.020*looking! + 0.015*been + 0.010*noodle + 0.010*honest, + 0.010*loan + 0.010*past, + 0.010*troubles + 0.010*jimmy + 0.010*eyes, + 0.009*:)',
 u"0.018*all + 0.012*food + 0.009*we're + 0.009*remembered + 0.009*arrived + 0.008*cheers! + 0.008*egift + 0.008*roomate + 0.008*cant + 0.008*florida",
 u"0.012*you + 0.009*until + 0.008*i've + 0.008*friday + 0.008*food + 0.007*don't + 0.007*day + 0.007*today + 0.007*help + 0.007*next",
 u"0.011*you + 0.009*until + 0.009*about + 0.009*been + 0.007*help + 0.007*will + 0.006*food + 0.006*also + 0.006*money + 0.006*i've",
 u'0.015*alive + 0.014*searched + 0.010*emotionally + 0.010*cabin, + 0.010*bed + 0.009*stressed + 0.009*ground. + 0.009*wonders + 0.006*pay + 0.006*tell',
 u'0.014*food + 0.011*our + 0.009*he + 0.009*his + 0.008*hunger + 0.008*tend + 0.008*do + 0.008*all + 0.007*been + 0.006*date',
 u"0.013** + 0.011*addition + 0.011*georgia. + 0.011*that'd + 0.011*15th. + 0.010*money + 0.010*hell. + 0.010*fair + 0.009*help + 0.008*day",
 u'0.020*forever. + 0.019*bugging + 0.019*sticky + 0.018*epic + 0.017*times. + 0.015*friend. + 0.012*<3 + 0.011*law + 0.011*couch + 0.009*will',
 u'0.019*kids + 0.018*her + 0.012*you + 0.011*she + 0.010*tried + 0.008*our + 0.007*thank + 0.007*pepperoni + 0.007*mother, + 0.007*attempt',
 u'0.013*him + 0.012*his + 0.011*he + 0.008*pay + 0.008*got + 0.007*been + 0.007*cake + 0.007*soo + 0.007*quickly + 0.007*cleaning',
 u"0.020*homework, + 0.020*broccoli + 0.015*woo! + 0.013*games + 0.011*:) + 0.008*there's + 0.008*pie + 0.007*edit: + 0.007*pizza](http://bigepizza.com)! + 0.007*hotspot",
 u'0.017*all + 0.014*you + 0.013*food + 0.011*now + 0.010*:) + 0.009*want + 0.009*return + 0.009*love + 0.008*up + 0.008*$15',
 u"0.016*been + 0.010*help + 0.008*had + 0.008*i've + 0.008*you + 0.007*able + 0.007*food + 0.007*there + 0.007*really + 0.007*after",
 u'0.020*this) + 0.014*proud + 0.011*belly + 0.011*<-- + 0.011*swell. + 0.011*mad + 0.011*two. + 0.011*p.s. + 0.011*pancreatic + 0.011*etc.,',
 u'0.040*foster + 0.020*voice + 0.020*(of + 0.019*halfway + 0.015*meat. + 0.009*live + 0.008*food + 0.006*sos + 0.006*creek + 0.006*coconut',
 u'0.019*atlanta, + 0.019*time! + 0.019*promotion + 0.019*deviantart + 0.019*ga. + 0.017*upcoming + 0.013*link + 0.012*commissions + 0.012*ao + 0.012*you',
 u"0.021*er + 0.012*favor, + 0.011*deciding + 0.011*virtually + 0.009*i'll + 0.009*dish + 0.008*sense + 0.008*laptop + 0.008*weird + 0.008*you",
 u"0.025*hook + 0.016*up + 0.014*month's + 0.014*shipping + 0.014*papa + 0.014*johns + 0.011*tomorrow. + 0.011*could + 0.011*gift + 0.010*here",
 u"0.020*little + 0.012*stories + 0.012*today! + 0.012*boyfriend's + 0.012*calls + 0.012*i'd + 0.012*trick. + 0.012*jack + 0.012*else's + 0.010*fulfill",
 u'0.017*morning, + 0.017*me; + 0.015*chest + 0.014*chances + 0.012*alcohol + 0.007*bills, + 0.006*been + 0.006*pay + 0.005*pizza?! + 0.005*she',
 u"0.012*grandmother + 0.011*after + 0.008*it's + 0.008*i've + 0.008*there + 0.008*about + 0.007*home + 0.007*possible. + 0.007*fix + 0.006*you",
 u"0.017*help + 0.008*i've + 0.007*you + 0.007*can't + 0.007*friend + 0.007*any + 0.006*thanks + 0.006*out. + 0.006*screen + 0.006*kind"]

In [78]:
Topical.columns.values.tolist()


Out[78]:
[u'giver_username_if_known',
 u'request_id',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_subreddits_at_request',
 u'requester_subreddits_at_request',
 u'requester_upvotes_minus_downvotes_at_request',
 u'requester_upvotes_plus_downvotes_at_request',
 u'requester_username',
 u'unix_timestamp_of_request',
 u'unix_timestamp_of_request_utc',
 'clean_request_text',
 'clean_title_text',
 'log_request_length',
 'log_title_length',
 'link_presence?',
 'semi_colon?',
 'request_user_name_length',
 'giver_user_name_length',
 'T:money1',
 'T:money2',
 'T:Job',
 'T:Friend',
 'T:Student',
 'T:Familytime',
 'T:Time',
 'T:gratitude',
 'T:Pizza',
 'T:General']

In [14]:
df.to_json('NonTopictest.json')

In [93]:
Topical.to_json('TopicTest1.json')

In [49]:
Topical.head()


Out[49]:
giver_username_if_known number_of_downvotes_of_request_at_retrieval number_of_upvotes_of_request_at_retrieval post_was_edited request_id request_number_of_comments_at_retrieval request_text request_text_edit_aware request_title requester_account_age_in_days_at_request ... 0 1 2 3 4 5 6 7 8 9
0 N/A 0 1 0 t3_l25d7 0 Hi I am in need of food for my 4 children we a... Hi I am in need of food for my 4 children we a... Request Colorado Springs Help Us Please 0.000000 ... 0 2 0 2 0 2 2 1 0 1
1 N/A 2 5 0 t3_rcb83 0 I spent the last money I had on gas today. Im ... I spent the last money I had on gas today. Im ... [Request] California, No cash and I could use ... 501.111100 ... 1 2 1 0 1 1 2 0 1 0
2 N/A 0 3 0 t3_lpu5j 0 My girlfriend decided it would be a good idea ... My girlfriend decided it would be a good idea ... [Request] Hungry couple in Dundee, Scotland wo... 0.000000 ... 2 2 3 2 2 0 1 2 3 1
3 N/A 0 1 1 t3_mxvj3 4 It's cold, I'n hungry, and to be completely ho... It's cold, I'n hungry, and to be completely ho... [Request] In Canada (Ontario), just got home f... 6.518438 ... 1 1 1 1 1 1 1 1 1 1
4 N/A 6 6 0 t3_1i6486 5 hey guys:\n I love this sub. I think it's grea... hey guys:\n I love this sub. I think it's grea... [Request] Old friend coming to visit. Would LO... 162.063252 ... 1 1 3 6 2 1 1 4 2 4

5 rows × 50 columns


In [ ]: