In [3]:
import os
import string
from datetime import datetime, date, timedelta
import unicodedata
import pymongo
from instagram.client import InstagramAPI
from instagram.bind import InstagramAPIError
from nltk.corpus import stopwords
from nltk.metrics import edit_distance
from nltk.corpus import wordnet as wn
from gensim import corpora, models, similarities
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mongo_helper import (
CLIENT,
DB,
TRANS_COLLECTION,
USER_PAIRS_COLLECTION,
VENMO_INSTAGRAM_MATCHES
)
from instagram_helper import (
InstagramAPICycler,
get_all_paginated_data,
instagram_media_to_dict
)
from insta_query import query
from secrets import TOKENS
# --- Analysis configuration ---
SLEEP_SECONDS = 60*60  # one hour, in seconds; usage not shown in this chunk
API_CYCLER = InstagramAPICycler(TOKENS)  # presumably cycles across API tokens -- see instagram_helper
HEAVY_USER_THRESHOLD = 30  # passed to query() when repopulating user matches
AFTER_CUTOFF_DATE = date(2015, 3, 1)  # activity before this date is ignored when grouping by day
HOURS_RADIUS = 24  # +/- window (hours) for pairing Instagram media with a Venmo transaction
VENMO_DATE_FORMAT_STR = '%Y-%m-%dT%H:%M:%SZ'  # Venmo API timestamp format
STOPLIST = frozenset(stopwords.words('english'))  # English stopwords dropped during tokenization
GRAPHS_PATH = os.path.join(os.getcwd(), 'graphs')
# Ensure the output directory for saved figures exists.
if not os.path.exists(GRAPHS_PATH):
    os.mkdir(GRAPHS_PATH)
In [4]:
# Repopulates the Venmo-Instagram user matches collection from scratch;
# use the commented variant below to reuse the existing collection.
query(TOKENS, HEAVY_USER_THRESHOLD, repopulate=True) # Run this to repopulate the user matches collection
# query(TOKENS, HEAVY_USER_THRESHOLD)
In [5]:
# Load every matched Venmo/Instagram user pair from Mongo into memory.
user_matches = [result for result in VENMO_INSTAGRAM_MATCHES.find()]
print 'Total Venmo-Instagram user matches: %d' % len(user_matches)
In [6]:
def venmo_user_trans(user_id):
    """Return all transactions, oldest first, in which the given Venmo
    user appears as either the actor or the target."""
    match_either_role = {"$or": [
        {"actor.id": user_id},
        {"transactions.target.id": user_id},
    ]}
    cursor = TRANS_COLLECTION.aggregate([
        {"$unwind": "$transactions"},
        {"$match": match_either_role},
        {"$sort": {"created_time": 1}},
    ])
    return list(cursor)
In [7]:
def parse_venmo_datetime(datetime_str):
    # Parse a Venmo API timestamp (e.g. '2015-03-01T12:00:00Z') into a
    # naive datetime using the module-level format string.
    return datetime.strptime(datetime_str, VENMO_DATE_FORMAT_STR)
In [8]:
def get_venmo_trans_datetimes(transactions):
    """Parse the 'created_time' field of each transaction dict into a datetime."""
    parsed = []
    for tran in transactions:
        parsed.append(parse_venmo_datetime(tran.get('created_time')))
    return parsed
def get_instagram_datetimes(media):
    """Collect the created_time attribute from each Instagram media object."""
    created_times = []
    for post in media:
        created_times.append(post.created_time)
    return created_times
In [9]:
def group_by_date(datetimes, min_date=None):
    """Count datetimes per calendar day.

    Days strictly before min_date (when given) are excluded entirely.
    Returns a dict mapping date -> activity count.
    """
    counts = {}
    for dt in datetimes:
        day = dt.date()
        if min_date is not None and day < min_date:
            continue
        counts[day] = counts.get(day, 0) + 1
    return counts
def normalize_date_data(data_dict, all_dates):
    """Return a new dict with an entry for every day in all_dates.

    Days missing from data_dict are zero-filled.  Uses .get() instead of
    .setdefault() -- the original silently inserted the zero entries into
    the caller's data_dict as a side effect.  The loop variable is also
    renamed so it no longer shadows the imported `date` class.
    """
    normalized = {}
    for day in all_dates:
        normalized[day] = data_dict.get(day, 0)
    return normalized
In [10]:
def get_instagram_api_data(instagram_user):
instagram_id = instagram_user.get('id')
media = get_all_paginated_data(API_CYCLER.api, 'user_recent_media', user_id=instagram_id, count=100)
print '%d Media fetched for Instagram user %s (%s)' % (len(media), instagram_user.get('username'), instagram_user.get('id'))
return media
def get_venmo_api_data(venmo_user):
venmo_id = venmo_user.get('id')
venmo_trans = venmo_user_trans(venmo_id)
print '%d Transactions fetched for Venmo user %s (%s)' % (len(venmo_trans), venmo_user.get('username'), venmo_user.get('id'))
return venmo_trans
def get_api_data(venmo_user, instagram_user):
    # Fetch both halves of a matched pair: Instagram media from the API,
    # Venmo transactions from the local Mongo collection.
    media = get_instagram_api_data(instagram_user)
    venmo_trans = get_venmo_api_data(venmo_user)
    return venmo_trans, media
def normalize_for_plot(trans, media):
    """Build aligned (x, venmo_y, instagram_y) series for a stacked bar plot.

    Groups both activity streams by calendar day (dropping days before
    AFTER_CUTOFF_DATE), zero-fills days present in only one stream, and
    returns the days sorted chronologically with both y-series indexed off
    that same axis.  The original pulled x and each y-series from three
    separate unsorted .keys() calls, so x/y alignment relied on incidental
    dict ordering and the x axis was unsorted.
    """
    venmo_trans_datetimes = get_venmo_trans_datetimes(trans)
    instagram_datetimes = get_instagram_datetimes(media)
    # Group media and transactions activity across individual days
    venmo_date_data = group_by_date(venmo_trans_datetimes, AFTER_CUTOFF_DATE)
    instagram_date_data = group_by_date(instagram_datetimes, AFTER_CUTOFF_DATE)
    full_date_set = set(instagram_date_data.keys()).union(venmo_date_data.keys())
    venmo_date_data_norm = normalize_date_data(venmo_date_data, full_date_set)
    instagram_date_data_norm = normalize_date_data(instagram_date_data, full_date_set)
    # One sorted day axis; both series indexed off it so they stay aligned.
    x = sorted(full_date_set)
    venmo_y = [venmo_date_data_norm[day] for day in x]
    instagram_y = [instagram_date_data_norm[day] for day in x]
    return x, venmo_y, instagram_y
In [11]:
# Plot styling: bar width plus x-axis tick locators/formatter shared by
# every figure produced by plot_user_data below.
width = 0.35
days = mdates.DayLocator()        # minor ticks: one per day
weeks = mdates.WeekdayLocator()   # major ticks: one per week
date_fmt = mdates.DateFormatter('%d %b %Y')
def plot_user_data(x, venmo_y, instagram_y, venmo_user, instagram_user, fig_num):
    """Render, save, and show a stacked per-day activity bar chart:
    Instagram counts on the bottom, Venmo counts stacked on top."""
    figure, ax = plt.subplots(figsize=(18, 4), num=fig_num)
    # Stacked bars: Venmo sits on top of the Instagram bars via bottom=.
    p1 = ax.bar(mdates.date2num(x), instagram_y, color='#ED913D', width=width, linewidth=0)
    p2 = ax.bar(mdates.date2num(x), venmo_y, color='#78b653', width=width, linewidth=0, bottom=instagram_y)
    # Formatting
    ax.xaxis.set_major_locator(weeks)
    ax.xaxis.set_minor_locator(days)
    ax.xaxis.set_major_formatter(date_fmt)
    ax.legend( (p1[0], p2[0]), ('Instagram', 'Venmo') )
    ax.set_ylabel('Activity')
    ax.set_xlabel('Dates')
    title_str = 'Venmo user %s (%s) | Instagram user %s (%s)' % (
        venmo_user.get('username'), venmo_user.get('id'),
        instagram_user.get('username'), instagram_user.get('id'),
    )
    ax.set_title(title_str)
    # Persist a copy to graphs/<venmo username>.png before showing inline.
    filename = '%s.png' % venmo_user.get('username')
    plt.savefig(os.path.join(GRAPHS_PATH, filename))
    plt.show()
In [12]:
def instagram_caption_words(instagram_media):
    """Tokenize the caption text of every captioned Instagram post."""
    raw_captions = []
    for post in instagram_media:
        if hasattr(post.caption, 'text'):
            raw_captions.append(post.caption.text)
    return get_word_dictionary(raw_captions)
def venmo_message_words(venmo_messages):
    """Tokenize the 'message' text of every transaction passed in.

    Fixes a bug where the argument was ignored and the global
    `sample_trans` (defined only in a commented-out scratch cell) was
    read instead, raising NameError for any real caller.
    """
    venmo_messages_raw = [tran.get('message') for tran in venmo_messages]
    return get_word_dictionary(venmo_messages_raw)
def filter_ascii_punctuation(word):
    """Strip every ASCII punctuation character from the given word."""
    kept = [char for char in word if char not in string.punctuation]
    return ''.join(kept)
def get_words(document):
    """Tokenize one document: lowercase, drop '#' characters, split on
    whitespace, discard stopwords, then strip ASCII punctuation from
    each surviving token."""
    tokens = document.lower().replace('#', '').split()
    return [
        filter_ascii_punctuation(token)
        for token in tokens
        if token not in STOPLIST
    ]
def get_word_dictionary(documents):
    """Tokenize each document, returning a list of word lists."""
    tokenized = []
    for document in documents:
        tokenized.append(get_words(document))
    return tokenized
In [13]:
# Similarity tuning: minimum tf-idf cosine score to count as a match, plus
# named indices into the (doc_index, score) similarity pairs used below.
tfidf_threshold = 0.75
sim_match_index = 0
sim_match_words = 1
def text_matches(venmo_trans, instagram_media):
    """Yield one pair per transaction: (transaction, [similar media]) when
    the Venmo message's tf-idf cosine similarity to an Instagram caption
    exceeds tfidf_threshold, else (None, None).

    Fixes a bug where a ValueError from build_tfidf_model (e.g. no usable
    captions) yielded once and then FELL THROUGH into the loop, crashing
    with a NameError on the never-assigned model variables; the generator
    now stops after reporting no match.
    """
    instagram_words = instagram_caption_words(instagram_media)
    try:
        i_word_dict, tfidf_model, tfidf_index = build_tfidf_model(instagram_words)
    except ValueError:
        # No model could be built -- report no matches and stop.
        yield None, None
        return
    for i, msg in enumerate([tran.get('message') for tran in venmo_trans]):
        vec_bow = i_word_dict.doc2bow(get_words(msg))
        vec_tfidf = tfidf_model[vec_bow]
        sims = tfidf_index[vec_tfidf]
        # Keep only (doc_index, score) pairs above the similarity threshold.
        tfidf_sims_list = [sim for sim in enumerate(sims) if sim[sim_match_words] > tfidf_threshold]
        if tfidf_sims_list:
            yield venmo_trans[i], [instagram_media[match[sim_match_index]] for match in tfidf_sims_list]
        else:
            yield None, None
def build_tfidf_model(instagram_words):
    """Build a tf-idf model and cosine-similarity index over tokenized captions.

    Returns (dictionary, tfidf_model, similarity_index).  May raise
    ValueError from gensim when the corpus is empty/unusable.

    Fixes a bug where the tf-idf transform was applied twice when building
    the index (MatrixSimilarity(tfidf_model[corpus_tfidf]) with corpus_tfidf
    already equal to tfidf_model[i_corpus]), skewing the indexed vectors.
    """
    i_word_dict = corpora.Dictionary(instagram_words)
    i_corpus = [i_word_dict.doc2bow(words) for words in instagram_words]
    tfidf_model = models.TfidfModel(i_corpus)
    corpus_tfidf = tfidf_model[i_corpus]
    # Index the already tf-idf-weighted corpus; num_features pins the
    # vector dimensionality so gensim does not have to guess it.
    tfidf_index = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(i_word_dict))
    return i_word_dict, tfidf_model, tfidf_index
def report_results(sims, media_captions):
for i, sim in enumerate(sims):
if sim[1] > 0:
print '%s -- %s' % (sim, media_captions[i])
In [14]:
# Offsets defining a +/- HOURS_RADIUS-hour window around a transaction time.
diff_after = timedelta(hours=-HOURS_RADIUS)
diff_before = timedelta(hours=HOURS_RADIUS)
def media_near_transaction(tran, media):
    """Return the media posted strictly within HOURS_RADIUS hours (either
    side) of the given transaction's timestamp."""
    tran_datetime = parse_venmo_datetime(tran.get('created_time'))
    window_start = tran_datetime + diff_after
    window_end = tran_datetime + diff_before
    nearby = []
    for post in media:
        if window_start < post.created_time < window_end:
            nearby.append(post)
    return nearby
In [15]:
# # print media_captions_raw[0]
# # print venmo_messages_raw[0]
# def report_levenshtein_dist(i_captions, v_messages):
# for ic in i_captions:
# ld = sorted([(vm, edit_distance(vm, ic)) for vm in v_messages], key=lambda r: r[1], reverse=True)
# print ic
# for i, r in enumerate(ld[0:5]):
# print '\t%d: %s' % (i+1, r)
# # report_levenshtein_dist(media_captions_raw, venmo_messages_raw)
# caption_words = [[w for w in c.split()] for c in media_captions_raw]
# venmo_words = [[w for w in c.split()] for c in venmo_messages_raw]
# def wn_similarity(word1, word2):
# return [(s1, s2, wn.path_similarity(s1, s2)) for s1 in wn.synsets(word1) for s2 in wn.synsets(word2)]
# # for doc in caption_words:
# # for word in doc:
# # for tran in venmo_words:
# # for
In [16]:
# # test = u'\U0001f60d\U0001f61c\U0001f632'
# # print unicodedata.name(u'\U0001f632')
# edit_distance("dinner", "dinners")
# # print wn.synsets('fish')[0]
# # [s.hyponyms() for s in wn.synsets('fish')]
# [(s1.hyponyms(), s2.hyponyms(), wn.path_similarity(s1, s2)) for s1 in wn.synsets('lunch') for s2 in wn.synsets('dinner')]
# # wn.path_similarity(wn.synsets('fish'), wn.synsets('sushi'))
# # [exp1 for x in xSet for y in ySet]
# # is equal to
# # result=[]
# # for x in xSet:
# # for y in ySet:
# # result.append(exp1)
In [17]:
# target_username = 'fwedeorange'
# sample = [m for m in user_matches if m.get('venmo').get('username') == target_username]
# sample_instagram = sample[-1].get('instagram')
# sample_venmo = sample[-1].get('venmo')
# sample_trans, sample_media = get_api_data(sample_venmo, sample_instagram)
In [18]:
# media1, media2, media3, media4 = sample_media[0], sample_media[1], sample_media[3], sample_media[0]
# m1 = [media1, media2]
# m3 = [media3, media4, media1]
# print set(m1).intersection(m3)
# print getattr(media1, 'id')
# # print sample_media[1] == sample_media[2]
In [19]:
def venmo_instagram_matches(venmo_trans, instagram_media):
    """Yield (transaction, media list) pairs that satisfy BOTH criteria:
    caption text similar to the Venmo message (text_matches) AND posted
    within the time window around the transaction (media_near_transaction)."""
    for venmo_tran, caption_matches in text_matches(venmo_trans, instagram_media):
        if not venmo_tran:
            continue
        nearby_media = media_near_transaction(venmo_tran, instagram_media)
        both_criteria = set(nearby_media).intersection(caption_matches)
        if both_criteria:
            yield venmo_tran, list(both_criteria)
In [20]:
update_matches = []
errors = []
for i, user_pair in enumerate(user_matches):
instagram_user = user_pair.get('instagram')
venmo_user = user_pair.get('venmo')
try:
venmo_trans, instagram_media = get_api_data(venmo_user, instagram_user)
print
print 'Checking for matching Venmo and Instragram updates for user %s/%s' % (venmo_user.get('username'), instagram_user.get('username'))
for va, ia in venmo_instagram_matches(venmo_trans, instagram_media):
print update_matches.append((venmo, instagram))
if update_matches:
print 'FOUND MATCHING UPDATES %s' % update_matches
else:
pass
except InstagramAPIError as e:
if e.status_code == 400:
error_str = "ERROR: Instagram user %s -- %s is set to private." % (instagram_user.get('username'), instagram_user.get('id'))
errors.append(error_str)
continue
# print venmo_trans
# print instagram_media
# x, venmo_y, instagram_y = normalize_for_plot(venmo_trans, instagram_media)
# plot_user_data(x, venmo_y, instagram_y, venmo_user, instagram_user, i)
# match_pair_updates()
In [ ]: