In [1]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np
import datetime
from collections import Counter
# Estimators used in the cells below
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC
# Plotly (offline mode assumed) for the scatter/pie charts in get_stat
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import Scatter, Pie, Data
init_notebook_mode()
# Variables
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "keyword-filtered-users"
In [85]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]
In [86]:
X = []
Y = []
for tweet in collection.find():
    user = tweet['user']
    # Per-user account features
    followers_count = user['followers_count']
    statuses_count = user['statuses_count']
    friends_count = user['friends_count']
    favourites_count = user['favourites_count']
    listed_count = user['listed_count']
    verified = int(user['verified'])
    # Total retweets over all of the user's tweets stored in the document
    retweet_count = sum(tweet['retweet_count'][tweet_id] for tweet_id in tweet['retweet_count'])
    tweet_count = len(tweet['tweets'])
    # retweeters_in_db = tweet['retweeters_sum']
    # Parse the account creation date ("Wed Aug 27 13:08:45 +0000 2008") into a Unix epoch;
    # '%s' is a platform-specific strftime directive, and epoch is not added to the feature vector
    date = tweet['user']['created_at'].encode('ascii', 'ignore').split()
    date = date[1] + ' ' + date[2] + ' ' + date[5] + ' ' + date[3]
    epoch = int(datetime.datetime.strptime(date, '%b %d %Y %H:%M:%S').strftime('%s'))
    X.append(np.array([followers_count, statuses_count, friends_count, favourites_count, listed_count, verified]))
    # Target: average retweets per tweet (float division to avoid truncation under Python 2)
    Y.append(float(retweet_count) / tweet_count)
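A quick shape check before fitting (illustrative: scikit-learn accepts the lists built above directly, but converting to numpy arrays makes the expected (n_samples, n_features) layout explicit; the X_arr/Y_arr names are introduced here only for this check):
X_arr = np.asarray(X, dtype=float)   # (n_samples, 6): followers, statuses, friends, favourites, listed, verified
Y_arr = np.asarray(Y, dtype=float)   # (n_samples,): average retweets per tweet
print X_arr.shape, Y_arr.shape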
In [100]:
# Regressors
def random_forest_regression(X, Y, params):
    clf = RandomForestRegressor(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_regressor(X, Y, params):
    neigh = KNeighborsRegressor(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svr(X, Y, params):
    clf = SVR(C=params[0], epsilon=params[1])
    clf.fit(X, Y)
    return clf

# Classifiers
def random_forest_classifier(X, Y, params):
    clf = RandomForestClassifier(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_classifier(X, Y, params):
    neigh = KNeighborsClassifier(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svc(X, Y, params=1.0):
    # params is the penalty parameter C (the original default of None would fail at fit time)
    clf = SVC(C=params)
    clf.fit(X, Y)
    return clf

def print_stat(X):
    # Summary statistics for a vector of values
    print 'mean:', np.mean(X), '\tmedian:', np.median(X), 'std:', np.std(X), 'var:', np.var(X)
    print 'ptp:', np.ptp(X), '\tmin elem:', min(X), '\tmax elem:', max(X)
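A smoke test of the helpers above on synthetic data (the toy arrays here are made up for illustration, not drawn from the Twitter collection):
toy_X = np.random.rand(100, 6)                                    # 100 samples, 6 features
toy_Y = toy_X.dot(np.arange(1, 7)) + 0.1 * np.random.randn(100)   # noisy linear target
toy_clf = random_forest_regression(toy_X, toy_Y, 10)              # 10 trees
print_stat(toy_clf.predict(toy_X) - toy_Y)                        # training-set residuals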
In [105]:
def get_stat(method, X, Y, u, v, params):
    # u and v are (start, stop) index ranges for the training and test slices
    train_X = X[u[0]:u[1]]
    train_Y = Y[u[0]:u[1]]
    test_X = X[v[0]:v[1]]
    test_Y = Y[v[0]:v[1]]
    clf = method(train_X, train_Y, params)
    real_result = test_Y
    prediction = clf.predict(test_X)
    diff = abs(prediction - real_result)
    # Distribution of the real target values in the test slice
    real_counter = Counter()
    for retweet_count in real_result:
        real_counter[retweet_count] += 1
    print sum(real_counter.values()) / len(real_counter)
    print np.median(real_counter.values())
    print_stat(prediction)
    print_stat(real_result)
    print_stat(diff)
    real_data = sorted(zip(real_result, prediction))  # kept for inspection; not used below
    x = np.linspace(-2, 3)
    real = Scatter(
        y=real_result,
        x=real_result,
        mode='markers',
        name='real'
    )
    diff_trace = Scatter(
        y=diff,
        x=real_result,
        mode='markers',
        name='diff'
    )
    predicted = Scatter(
        y=prediction,
        x=real_result,
        mode='markers',
        name='predicted'
    )
    diversity = Pie(
        labels=real_counter.keys(),
        values=real_counter.values(),
        hoverinfo='label+percent+value', textinfo='label'
    )
    #diff_trace = Bar(
    #    y=diff,
    #    name='difference'
    #)
    # Real vs. predicted scatter plus a y = x**2 reference curve
    data = Data([real, predicted, {'x': x, 'y': x**2}])
    data2 = Data([diversity])
    iplot(data)
    iplot(data2)
    print '------------'
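get_stat splits by fixed index ranges, so the calls below train on the front of the dataset and test on the slice that follows. If a shuffled split is preferred, a sketch along these lines would work (assumes scikit-learn's train_test_split, which lives in sklearn.cross_validation on older releases and sklearn.model_selection on newer ones):
from sklearn.cross_validation import train_test_split   # or sklearn.model_selection
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.1, random_state=0)
clf = random_forest_regression(train_X, train_Y, 3)
print_stat(abs(clf.predict(test_X) - np.asarray(test_Y)))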
In [106]:
get_stat(random_forest_regression, X, Y, (0,100000), (100000,107000), 3)
In [16]:
get_stat(k_neighbors_regressor, X, Y, (0,100000), (100000,101000), 2)
In [14]:
get_stat(svr, X, Y, (0,10000), (1000,1500), (1,0.1))
In [95]:
a = np.array([[10, 4, 7], [3, 2, 1]])
np.median(a, axis=1)
Out[95]:
array([ 7.,  2.])