In [55]:
import tweepy
import pickle
import os
from geopy.geocoders import Nominatim
import countries
import datetime
import time
import re
import nltk
from nltk.corpus import stopwords
from scipy.sparse import lil_matrix
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.feature_selection import chi2
import numpy as np
import operator
In [2]:
#nltk.download()
cachedStopWords = stopwords.words("english")
In [2]:
TWEET_PATH = 'Tweets/'
#fill in your own Twitter API credentials here
ACCESS_TOKEN = '<YOUR_ACCESS_TOKEN>'
ACCESS_SECRET = '<YOUR_ACCESS_SECRET>'
CONSUMER_KEY = '<YOUR_CONSUMER_KEY>'
CONSUMER_SECRET = '<YOUR_CONSUMER_SECRET>'
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
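Before crawling, it helps to confirm the handler above actually authenticates; the cell below is a small added check using tweepy's verify_credentials call (not part of the original run).
In [ ]:
#Added sanity check: confirm authentication before any crawling starts.
me = api.verify_credentials()
print 'authenticated as:', me.screen_name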
In [4]:
def get_all_tweets(user_id, api):
    alltweets = []
    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(user_id=user_id, count=200)
    #save most recent tweets
    alltweets.extend(new_tweets)
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        #print "getting tweets before %s" % (oldest)
        #all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(user_id=user_id, count=200, max_id=oldest)
        #save most recent tweets
        alltweets.extend(new_tweets)
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        #print "...%s tweets downloaded so far" % (len(alltweets))
    return alltweets
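Note that the user_timeline endpoint only exposes roughly the 3,200 most recent tweets of an account, so the loop above terminates once Twitter returns an empty page; tweets older than that cap are unreachable through this call.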
In [14]:
#all_tweets = get_all_tweets(40428817, api)
#for tweet in all_tweets:
#    print tweet.coordinates
In [15]:
#f = open('all_tweets.txt', 'w')
#pickle.dump(all_tweets, f)
#f.close()
#f = open('all_tweets.txt', 'r')
#all_tweets2 = pickle.load(f)
#f.close()
In [5]:
def get_followers(filename, screen_name, api):
    uscis_followers = []
    for page in tweepy.Cursor(api.followers_ids, screen_name=screen_name).pages():
        uscis_followers.extend(page)
        time.sleep(60)
    #write into a file
    f = open(filename, 'w')
    for id in uscis_followers:
        f.write(str(id) + '\n')
    f.close()
    return uscis_followers
In [17]:
#uscis_followers = get_followers('uscis_followers','USCIS' ,api)
In [6]:
f = open('uscis_followers', 'r')
uscis_followers = []
for line in f:
    uscis_followers.append(line.strip())
f.close()
print len(uscis_followers)
In [7]:
def prune_followers(uscis_followers, min_status_count, api):
    pruned = []
    i = 0
    for id in uscis_followers:
        try:
            user = api.get_user(id)
            if not user.protected:
                if user.geo_enabled:
                    if user.statuses_count > min_status_count:
                        pruned.append(id)
                        print i, ' --- ', id
        except:
            print 'error caught!'
        i += 1
    return pruned
In [16]:
pruned_followers1 = prune_followers(uscis_followers[0:6000], 100, api)
In [19]:
f = open('pruned_followers1', 'w')
for follower in pruned_followers1:
    f.write(str(follower) + '\n')
f.close()
In [8]:
pruned_followers2 = prune_followers(uscis_followers[6000:12000], 100, api)
In [9]:
print len(pruned_followers2)
In [10]:
f = open('pruned_followers2', 'w')
for follower in pruned_followers2:
    f.write(str(follower) + '\n')
f.close()
In [7]:
pruned_followers3 = prune_followers(uscis_followers[12000:18000], 100, api)
In [8]:
f = open('pruned_followers3', 'w')
for follower in pruned_followers3:
    f.write(str(follower) + '\n')
f.close()
In [9]:
len(pruned_followers3)
Out[9]:
In [ ]:
#pruned_followers4 = prune_followers(uscis_followers[18000:24000], 100, api)
In [ ]:
#pruned_followers5 = prune_followers(uscis_followers[24000:30000], 100, api)
In [ ]:
#pruned_followers6 = prune_followers(uscis_followers[30000:36000], 100, api)
In [ ]:
#pruned_followers7 = prune_followers(uscis_followers[36000:42000], 100, api)
In [ ]:
#pruned_followers8 = prune_followers(uscis_followers[42000:48000], 100, api)
In [ ]:
#pruned_followers9 = prune_followers(uscis_followers[48000:54000], 100, api)
In [ ]:
#pruned_followers10 = prune_followers(uscis_followers[54000:60000], 100, api)
In [ ]:
#pruned_followers11 = prune_followers(uscis_followers[60000:], 100, api)
In [34]:
pruned_followers1 = []
f = open('pruned_followers1', 'r')
for line in f:
    if not line.isspace():
        pruned_followers1.append(line.strip())
f.close()
In [35]:
len(pruned_followers1)
Out[35]:
In [8]:
def get_and_save_tweets_one_user(user_id, api):
    all_tweets = get_all_tweets(user_id, api)
    f = open(TWEET_PATH + str(user_id), 'w')
    pickle.dump(all_tweets, f)
    f.close()
In [9]:
def get_and_save_tweets_all_users(followers_pruned_id, api):
    for id in followers_pruned_id:
        try:
            get_and_save_tweets_one_user(id, api)
            print 'tweets of', id, 'were saved!'
        except:
            print '*******************************'
            print '*******************************'
            print '          EXCEPTION!           '
            print 'in id: ', id
            print '*******************************'
            print '*******************************'
In [10]:
def get_user_ids_tweets_folder():
    all_followers = []
    for root, dirs, files in os.walk(TWEET_PATH):
        for file in files:
            if not file.startswith('.'):
                all_followers.append(file)
    return all_followers
In [ ]:
#get_and_save_tweets_all_users(pruned_followers1, api)
In [11]:
all_followers = get_user_ids_tweets_folder()
In [12]:
len(all_followers)
Out[12]:
In [14]:
class User_Tweet(object):
    def __init__(self, date, text, country):
        self.date = date
        self.text = text
        self.country = country
    def __cmp__(self, other):
        if self.date < other.date:
            return -1
        elif self.date == other.date:
            return 0
        else:
            return 1
In [19]:
def create_user_timelines_dic(followers_ids):
    users_timelines = {}
    for user_id in followers_ids:
        user_id = user_id.replace('\n', '')
        timeline = []
        try:
            file_name = TWEET_PATH + str(user_id)
            f = open(file_name, 'r')
            tweets = pickle.load(f)
            f.close()
            for tweet in tweets:
                if tweet.place is not None:
                    user_tweet = User_Tweet(tweet.created_at, tweet.text, tweet.place.country)
                    timeline.append(user_tweet)
            users_timelines[user_id] = timeline
        except:
            print 'Error caught and prevented interruption'
            continue
    return users_timelines
In [15]:
users_timelines = create_user_timelines_dic(all_followers)
In [27]:
class Immigrant(object):
    def __init__(self, user_id, immigration_date, tweets, source_country, destination_country, duration):
        self.user_id = user_id
        self.immigration_date = immigration_date
        self.tweets = tweets
        self.source_country = source_country
        self.destination_country = destination_country
        self.duration = duration
    def display(self):
        print '**********************************'
        print 'user id:', self.user_id
        print '# of tweets', len(self.tweets)
        print 'immigrated from: ', self.source_country
        print 'to: ', self.destination_country
        print 'at:', self.immigration_date
        print 'for', self.duration, 'days'
        print '**********************************'
In [28]:
def find_immigrants(users_timelines):
    immigrants = []
    for user_id in users_timelines:
        timeline = users_timelines[user_id]
        sorted_timeline = sorted(timeline)
        countries = []
        for user_tweet in sorted_timeline:
            if user_tweet.country not in countries:
                countries.append(user_tweet.country)
        if len(countries) > 1:
            if sorted_timeline[-1].country == countries[-1]:
                #find the time of the last tweet in the previous country
                i = len(sorted_timeline) - 1
                last_tweet_previous_country = sorted_timeline[i]
                while last_tweet_previous_country.country != countries[-2]:
                    i = i - 1
                    last_tweet_previous_country = sorted_timeline[i]
                duration = sorted_timeline[-1].date - last_tweet_previous_country.date
                if duration.days > 90:
                    #We found an immigrant!
                    #Reload ALL of his/her tweets:
                    file_name = TWEET_PATH + str(user_id)
                    f = open(file_name, 'r')
                    tweets = pickle.load(f)
                    f.close()
                    #Now build the Immigrant object:
                    immigrant = Immigrant(user_id,
                                          last_tweet_previous_country.date,
                                          tweets,
                                          last_tweet_previous_country.country,
                                          sorted_timeline[-1].country,
                                          duration.days)
                    #immigrant.display()
                    immigrants.append(immigrant)
    return immigrants
In [18]:
immigrants = find_immigrants(users_timelines)
In [19]:
print len(immigrants)
In [20]:
#write immigrant ids into a file
f = open('immigrant_ids11', 'w')
for immigrant in immigrants:
    f.write(immigrant.user_id + '\n')
f.close()
In [21]:
#dump entire immigrants objects into a file
f = open('immigrants_objects11', 'w')
pickle.dump(immigrants, f)
f.close()
In [22]:
#read all immigrants (full objects) from a file
f = open('immigrants_objects11', 'r')
all_immigrants = pickle.load(f)
f.close()
print len(all_immigrants)
In [23]:
def get_tweets_after_immigration(immigrant):
    tweets = []
    for tweet in immigrant.tweets:
        if tweet.created_at > immigrant.immigration_date:
            tweets.append(tweet)
    return tweets
In [24]:
def get_tweets_before_immigration(immigrant):
    tweets = []
    for tweet in immigrant.tweets:
        if tweet.created_at <= immigrant.immigration_date:
            tweets.append(tweet)
    return tweets
In [25]:
def tokenize(string, lowercase, keep_punctuation, prefix, collapse_urls, collapse_mentions):
    if not string:
        return []
    if lowercase:
        string = string.lower()
    tokens = []
    if collapse_urls:
        string = re.sub(r'http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub(r'@\S+', 'THIS_IS_A_MENTION', string)
    if keep_punctuation:
        tokens = string.split()
    else:
        tokens = re.sub(r'\W+', ' ', string).split()
    if prefix:
        tokens = ['%s%s' % (prefix, t) for t in tokens]
    return tokens
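The cell below is an added illustration of the tokenizer on a made-up tweet (the notebook itself never calls tokenize directly, so the argument values here are only illustrative): URLs and @-mentions collapse to placeholder tokens and punctuation is stripped.
In [ ]:
#Added example with a made-up tweet.
sample = 'Just landed in Toronto! @friend check http://t.co/abc123'
print tokenize(sample, lowercase=True, keep_punctuation=False, prefix=None,
               collapse_urls=True, collapse_mentions=True)
#-> ['just', 'landed', 'in', 'toronto', 'THIS_IS_A_MENTION', 'check', 'THIS_IS_A_URL']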
In [26]:
#Separate all tweets of all immigrants based on whether they were posted before or after the immigration date.
#Label each tweet (0 = before, 1 = after) and return the labels as the second return value.
def get_all_tweet_texts_of_all_immigrants(all_immigrants):
    all_tweet_texts = []
    labels = []
    for immigrant in all_immigrants:
        tweets_after_immigration = get_tweets_after_immigration(immigrant)
        tweets_before_immigration = get_tweets_before_immigration(immigrant)
        for tweet in tweets_before_immigration:
            #Remove stop words:
            text = ' '.join([word for word in tweet.text.split() if word not in cachedStopWords])
            all_tweet_texts.append(text)
            labels.append(0)
        for tweet in tweets_after_immigration:
            #Remove stop words:
            text = ' '.join([word for word in tweet.text.split() if word not in cachedStopWords])
            all_tweet_texts.append(text)
            labels.append(1)
    return all_tweet_texts, np.array(labels)
In [27]:
all_tweet_texts, labels = get_all_tweet_texts_of_all_immigrants(all_immigrants)
In [28]:
def do_vectorize(data, min_df=2, max_df=1., binary=False, ngram_range=(1, 2)):
    #defaults match the values actually used below: unigrams and bigrams, counts (not binary), min_df=2
    vectorizer = CountVectorizer(input='content', min_df=min_df, max_df=max_df, binary=binary, ngram_range=ngram_range)
    X = vectorizer.fit_transform(data)
    return X.tocsr(), vectorizer
In [29]:
X, vec = do_vectorize(all_tweet_texts)
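A quick added shape check on the resulting document-term matrix (the actual numbers depend on the collected tweets):
In [ ]:
#Added: rows are tweets, columns are the unigram/bigram vocabulary kept by min_df=2.
print X.shape[0], 'tweets x', X.shape[1], 'features'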
In [30]:
def select_features(X, y, vec, number_of_features=100, threshold_rate=40):
    chi, pvals = chi2(X, y)
    feats = vec.get_feature_names()
    features = {}
    rate = threshold_rate
    for i in np.argsort(chi)[::-1]:
        #Stop once features are independent of the label (chi-squared score of zero):
        if chi[i] == 0.00:
            break
        #Select features whose after/before frequency ratio exceeds the threshold rate:
        my_count = Counter(y[np.where(X[:, i].T.toarray()[0] >= 1)])
        if my_count[1] > my_count[0] and my_count[0] != 0 and (my_count[1] * 1.0) / my_count[0] > rate:
            feat_rate = (my_count[1] * 1.0) / my_count[0]
            features[feats[i]] = feat_rate
    #Sort the selected features by their ratio (ascending) and keep the top number_of_features:
    keywords = [feat for feat, feat_rate in sorted(features.items(), key=lambda x: x[1])]
    return keywords[-number_of_features:]
In [31]:
features = select_features(X,labels,vec, 50)
In [32]:
print features
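KFold, LogisticRegression, and accuracy_score are imported at the top but not used in the cells above; the cell below is a minimal added sketch, not part of the original run, of how X and the before/after labels could feed them.
In [ ]:
#Added sketch: 5-fold cross-validation of a before/after-immigration tweet classifier,
#using the old sklearn.cross_validation.KFold API imported at the top of the notebook.
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
accuracies = []
for train_idx, test_idx in kf:
    clf = LogisticRegression()
    clf.fit(X[train_idx], labels[train_idx])
    accuracies.append(accuracy_score(labels[test_idx], clf.predict(X[test_idx])))
print 'mean accuracy over 5 folds:', np.mean(accuracies)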
In [29]:
#input: immigrants_filename: a file with one immigrant Twitter ID per line
def get_immigrants_objects(immigrants_filename):
    immigrant_ids = []
    f = open(immigrants_filename, 'r')
    for line in f:
        immigrant_ids.append(line.strip())
    f.close()
    print 'found ', len(immigrant_ids), ' immigrant twitter accounts'
    print 'building array of immigrant objects'
    users_timelines = create_user_timelines_dic(immigrant_ids)
    immigrants = find_immigrants(users_timelines)
    print 'done!'
    return immigrants
In [30]:
immigrants = get_immigrants_objects('immigrant_ids11')
In [82]:
def get_country_freq(immigrants):
    source_countries = {}
    destination_countries = {}
    src_dests = {}
    for immigrant in immigrants:
        src = immigrant.source_country
        dest = immigrant.destination_country
        src_dest = src + ' -> ' + dest
        if src in source_countries:
            source_countries[src] += 1
        else:
            source_countries[src] = 1
        if dest in destination_countries:
            destination_countries[dest] += 1
        else:
            destination_countries[dest] = 1
        if src_dest in src_dests:
            src_dests[src_dest] += 1
        else:
            src_dests[src_dest] = 1
    source_countries = sorted(source_countries.items(), key=operator.itemgetter(1), reverse=True)
    destination_countries = sorted(destination_countries.items(), key=operator.itemgetter(1), reverse=True)
    src_dests = sorted(src_dests.items(), key=operator.itemgetter(1), reverse=True)
    print 'Top 5 Source Countries: '
    for (country, freq) in source_countries[:5]:
        print '    ', country, freq
    print
    print 'Top 5 Destination Countries: '
    for (country, freq) in destination_countries[:5]:
        print '    ', country, freq
    print
    print 'Top 5 source->destination'
    for (path, freq) in src_dests[:5]:
        print '    ', path, freq
In [83]:
get_country_freq(immigrants)
In [ ]: