In [19]:
### Make sure to run the relevant cells on the test data too
In [80]:
import math
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series
import random
#import prettyplotlib as ppl
#from sklearn import ensemble as ske
%matplotlib inline
#import seaborn as sns
import re
from django.utils.encoding import smart_str
df = pd.read_json('data/test_1.json')
df.columns.values.tolist()
Out[80]:
In [81]:
## Cleaning out some stray non-ASCII characters
df['clean_request_text'] = df['request_text_edit_aware'].apply(lambda x: smart_str(x))
#df.drop('request_text_edit_aware',1)
df['clean_title_text'] = df['request_title'].apply(lambda x: smart_str(x))
#df.drop('request_title')
In [82]:
#FEATURE ADDED: length of the request text, log-transformed
df['request_length'] = df['clean_request_text'].apply(lambda x: len(x))
df['log_request_length'] = df['request_length'].apply(lambda x: math.log10(x + 1))
#Drop the raw length to avoid collinearity among predictors
df = df.drop('request_length', axis=1)
In [83]:
#FEATURE ADDED: length of the request title, log-transformed
df['request_title_length'] = df['request_title'].apply(lambda x: len(x))
df['log_title_length'] = df['request_title_length'].apply(lambda x: math.log10(x + 1))
df = df.drop('request_title_length', axis=1)
In [84]:
#FEATURE ADDED: presence of a link in the request text
def link_test(inputstring):
    # Escape the dots so ".com"/".net" match literally rather than any character
    a = re.search(r"\.com", inputstring)
    b = re.search("http", inputstring)
    c = re.search(r"\.net", inputstring)
    if a or b or c:
        return 1
    else:
        return 0
df['link_presence?'] = df['clean_request_text'].apply(lambda j: link_test(j))
In [85]:
#FEATURE ADDED: presence of a semicolon, as a rough proxy for education level and writing quality
def colon_test(inputstring):
    if re.search(";", inputstring):
        return 1
    else:
        return 0
df['semi_colon?'] = df['clean_request_text'].apply(lambda j: colon_test(j))
In [86]:
#FEATURES ADDED: length of the requester's username (longer ones tend to be wittier), and of the giver's username if known
df['request_user_name_length'] = df['requester_username'].apply(lambda j: len(j))
df['giver_user_name_length'] = df['giver_username_if_known'].apply(lambda j: len(j))
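In [ ]:
# Hedged refinement (assumption: unknown givers appear as the literal string
# 'N/A' in giver_username_if_known): count unknown givers as length 0 so the
# placeholder's three characters don't leak into the feature.
df['giver_user_name_length'] = df['giver_username_if_known'].apply(
    lambda j: 0 if j == 'N/A' else len(j))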
In [87]:
#### Topical narrative keyword lists, hand-copied from the paper
# (could supplement these with more keywords)
### Open question: the paper only used a binary indicator -- maybe add that too?
### We could also control for verbosity by dividing each score by the length of the text
### (see the sketch after this cell).
money1 = """ week ramen paycheck work couple rice check pizza grocery rent anyone favor someone bill money"""
money2 = """ food money house bill rent stamp month today parent help pizza someone anything mom anyone"""
job = """job month rent year interview bill luck school pizza paycheck unemployment money ramen end check"""
friend = """ friend house night mine pizza birthday thing school site place family story way movie anything"""
student = """ student college final pizza loan summer university money class meal year semester story kid school"""
familytime = """ tonight night today tomorrow someone anyone friday dinner something account family bank anything home work"""
time = """ day couple anything today work pizza help pay anyone home meal food ramen someone favor"""
gratitude = """ thanks advance guy reading anyone pizza anything story tonight help place everyone craving kind favor"""
pizza = """pizza craving hut story someone anyone domino money cheese thing request picture act title kind"""
general = """time pizza year people part work hour life thing lurker story anything someone month way"""
narratives = [money1, money2, job, friend, student, familytime, time, gratitude, pizza, general]
### Creates ten scores per request, one per narrative; narrative_names records the column ordering
narrative_names = '''money1 money2 job friend student familytime time gratitude pizza general'''
def narrative_scorer(text):
    # For each narrative, count how many of its keywords appear in the text
    narrative_scores = []
    for keywords in narratives:
        narrative_score = 0
        for word in keywords.split():
            if re.search(word, text):
                narrative_score += 1
        narrative_scores.append(narrative_score)
    return narrative_scores
df['clean_request_text'][0:5].apply(lambda x: narrative_scorer(x))
Out[87]:
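In [ ]:
# Sketch of the two variants raised above, assuming narrative_scorer as
# defined in the previous cell (the function names here are illustrative,
# not from the paper): a binary per-narrative indicator, and a count
# normalized by text length to control for verbosity.
def narrative_scorer_binary(text):
    # 1 if any keyword from a narrative appears, else 0
    return [int(score > 0) for score in narrative_scorer(text)]

def narrative_scorer_normalized(text):
    # keyword hits per character of text (guard against empty strings)
    length = max(len(text), 1)
    return [float(score) / length for score in narrative_scorer(text)]

df['clean_request_text'][0:5].apply(lambda x: narrative_scorer_binary(x))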
In [88]:
# .apply with a Series already yields a DataFrame of the ten narrative scores
narrativedf = df['clean_request_text'].apply(lambda x: Series(narrative_scorer(x)))
#### This frame gets joined back onto df (on request_id) a few cells below
In [89]:
narrativedf.head()
Out[89]:
In [90]:
narrativedf['request_id'] = df['request_id']
# (requester_received_pizza is the target label and is absent from the test data)
#narrativedf['requester_received_pizza'] = df['requester_received_pizza']
In [91]:
narrativedf.columns = ['T:money1', 'T:money2', 'T:Job', 'T:Friend', 'T:Student', 'T:Familytime', 'T:Time', 'T:gratitude', 'T:Pizza', 'T:General', 'request_id']
In [92]:
Topical = df.merge(narrativedf, on='request_id')
In [6]:
#####################
# LDA on the request titles and texts
# Still need to run the trained model on new 'docbow' instances
# (see the sketch after the topic printout below)
import gensim
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities
import codecs
In [7]:
# Need to transform the text columns from the data frame into lists of strings.
# (For both the request text and the request title -- or should they be merged?
# Keeping them separate for now.)
Reqtxt = [smart_str(item) for item in df['request_text_edit_aware']]
Titletxt = [smart_str(item) for item in df['request_title']]
In [144]:
## Next version: update the stoplist
stoplist = set("""for a of the and me my to in is i I it am be has [request] [Request] [REQUEST] request (request) pizza pizza! pizza, pizza.
have would this i'm on when but as just that get with reddit subreddit if at so no are or
we was out can not some someone""".split())
Rtexts = [[word for word in document.lower().split() if word not in stoplist]
          for document in Reqtxt]
Ttexts = [[word for word in document.lower().split() if word not in stoplist]
          for document in Titletxt]
### To do: remove all words that aren't nouns (see the sketch after this cell)
### Option: remove all words that appear only once in the entire dataset
### [potentially train the LDA on test and train together?]
#Rall_tokens = sum(Rtexts, [])
#Rtokens_once = set(word for word in set(Rall_tokens) if Rall_tokens.count(word) == 1)
#Rtexts = [[word for word in text if word not in Rtokens_once]
#          for text in Rtexts]
#Tall_tokens = sum(Ttexts, [])
#Ttokens_once = set(word for word in set(Tall_tokens) if Tall_tokens.count(word) == 1)
#Ttexts = [[word for word in text if word not in Ttokens_once]
#          for text in Ttexts]
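In [ ]:
# Sketch for the noun-only idea above. Assumes NLTK is installed and the
# 'averaged_perceptron_tagger' model has been downloaded via
# nltk.download('averaged_perceptron_tagger'); nouns_only is a hypothetical
# helper, not yet wired into the pipeline.
import nltk

def nouns_only(tokens):
    # Keep tokens whose Penn Treebank tag starts with 'NN' (NN, NNS, NNP, NNPS)
    return [word for word, tag in nltk.pos_tag(tokens) if tag.startswith('NN')]

# e.g. Rtexts = [nouns_only(text) for text in Rtexts]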
In [145]:
Rdict = corpora.Dictionary(Rtexts)
Tdict = corpora.Dictionary(Ttexts)
In [146]:
Rdict.save('request_dictionary.dict')
Tdict.save('title_dictionary.dict')
In [147]:
## Creating bag-of-words corpora for requests and titles, serialized to disk
Rcorpus = [Rdict.doc2bow(text) for text in Rtexts]
Tcorpus = [Tdict.doc2bow(text) for text in Ttexts]
corpora.MmCorpus.serialize('Rcorpus.mm', Rcorpus)
corpora.MmCorpus.serialize('Tcorpus.mm', Tcorpus)
In [148]:
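# Train a 23-topic LDA model on the request corpus; chunksize and update_every
# control the online-training batch size and update frequency.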
Rlda_23 = gensim.models.ldamodel.LdaModel(corpus=Rcorpus, id2word=Rdict, num_topics=23, update_every=20, chunksize=100, passes=1)
In [99]:
Rlda_23.save('Rlda_23.model')
In [149]:
Rlda_23.print_topics(23)
Out[149]:
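In [ ]:
# Sketch for the 'run the model on new docbow instances' to-do above:
# tokenize a new document with the same stoplist, map it through the saved
# dictionary, and read the topic distribution off as feature columns.
# lda_topic_features is a hypothetical helper name, not part of the pipeline yet.
def lda_topic_features(text, dictionary=Rdict, model=Rlda_23, num_topics=23):
    tokens = [w for w in smart_str(text).lower().split() if w not in stoplist]
    bow = dictionary.doc2bow(tokens)
    dense = [0.0] * num_topics
    for topic_id, prob in model[bow]:
        dense[topic_id] = prob
    return Series(dense)

# e.g. lda_features = df['clean_request_text'].apply(lda_topic_features)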
In [78]:
Topical.columns.values.tolist()
Out[78]:
In [14]:
df.to_json('NonTopictest.json')
In [93]:
Topical.to_json('TopicTest1.json')
In [49]:
Topical.head()
Out[49]:
In [ ]: