In [1]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np
import datetime
from collections import Counter
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVR, SVC
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
from plotly.graph_objs import *
init_notebook_mode(connected=True)
# Variables
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "keyword-filtered-users"
In [85]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]
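A quick sanity check on the connection can help before iterating over the whole collection. This is a minimal sketch, not part of the original notebook; count() is deprecated in newer pymongo releases in favour of count_documents({}), so adjust to the installed driver version.
# Hypothetical check (not in the original notebook): confirm the collection
# is reachable and non-empty before building the feature matrix.
print collection.count()
print collection.find_one().keys()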
In [86]:
X = []
Y = []
for tweet in collection.find():
    user = tweet['user']
    followers_count = user['followers_count']
    statuses_count = user['statuses_count']
    friends_count = user['friends_count']
    favourites_count = user['favourites_count']
    listed_count = user['listed_count']
    verified = int(user['verified'])
    # Total retweets across all of the user's tweets stored in this document
    retweet_count = sum(tweet['retweet_count'][tweet_id] for tweet_id in tweet['retweet_count'])
    tweet_count = len(tweet['tweets'])
    # retweeters_in_db = tweet['retweeters_sum']
    # Parse the Twitter 'created_at' string (e.g. 'Mon Sep 24 03:35:21 +0000 2012') into a Unix epoch
    date = tweet['user']['created_at'].encode('ascii', 'ignore').split()
    date = date[1] + ' ' + date[2] + ' ' + date[5] + ' ' + date[3]
    epoch = int(datetime.datetime.strptime(date, '%b %d %Y %H:%M:%S').strftime('%s'))
    X.append(np.array([followers_count, statuses_count, friends_count, favourites_count, listed_count, verified]))
    # Target: average retweets per tweet (cast to float to avoid integer division under Python 2)
    Y.append(float(retweet_count) / tweet_count)
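The distance-based models used below (k-nearest neighbours, SVR) are sensitive to feature scale, and followers_count spans a much larger range than the other columns. A minimal sketch of standardising the feature matrix first; this step is an assumption and is not part of the original pipeline.
# Assumed preprocessing step (not used by the original notebook):
# standardise features so large counts do not dominate the distance metric.
from sklearn.preprocessing import StandardScaler

X = np.array(X)
Y = np.array(Y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)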
In [100]:
# Regressors
def random_forest_regression(X, Y, params):
    clf = RandomForestRegressor(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_regressor(X, Y, params):
    neigh = KNeighborsRegressor(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svr(X, Y, params):
    clf = SVR(C=params[0], epsilon=params[1])
    clf.fit(X, Y)
    return clf

# Classifiers
def random_forest_classifier(X, Y, params):
    clf = RandomForestClassifier(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_classifier(X, Y, params):
    neigh = KNeighborsClassifier(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svc(X, Y, params=None):
    # Treat params as the penalty parameter C; fall back to the default SVC if none is given
    clf = SVC(C=params) if params is not None else SVC()
    clf.fit(X, Y)
    return clf

def print_stat(X):
    print 'mean:', np.mean(X), '\tmedian:', np.median(X), 'std:', np.std(X), 'var:', np.var(X)
    print 'ptp:', np.ptp(X), '\tmin elem:', min(X), '\tmax elem:', max(X)
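Each helper simply wraps fit() with one tunable parameter. As an alternative to the fixed train/test slices used by get_stat below, cross-validation gives a less slice-dependent estimate; this is a hedged sketch with default R^2 scoring, and the import path depends on the installed scikit-learn version.
# Optional check (not in the original notebook): 5-fold cross-validation
# for the random forest regressor; import path varies across sklearn versions.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

scores = cross_val_score(RandomForestRegressor(n_estimators=3), np.array(X), np.array(Y), cv=5)
print scores.mean()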
In [105]:
def get_stat(method, X, Y, u, v, params):
    # Train on the slice X[u[0]:u[1]] and evaluate on the slice X[v[0]:v[1]]
    train_X = X[u[0]:u[1]]
    train_Y = Y[u[0]:u[1]]
    test_X = X[v[0]:v[1]]
    test_Y = Y[v[0]:v[1]]
    clf = method(train_X, train_Y, params)
    real_result = test_Y
    prediction = clf.predict(test_X)
    diff = abs(prediction - real_result)
    # Histogram of the real target values in the test slice
    real_counter = Counter()
    for retweet_count in real_result:
        real_counter[retweet_count] += 1
    print sum(real_counter.values()) / len(real_counter)
    print np.median(real_counter.values())
    print_stat(prediction)
    print_stat(real_result)
    print_stat(diff)
    # (real, predicted) pairs sorted by real value, kept for inspection
    real_data = sorted(zip(real_result, prediction))
    x = np.linspace(-2, 3)
    real = Scatter(
        y=real_result,
        x=real_result,
        mode='markers',
        name='real'
    )
    diff_trace = Scatter(
        y=diff,
        x=real_result,
        mode='markers',
        name='diff'
    )
    predicted = Scatter(
        y=prediction,
        x=real_result,
        mode='markers',
        name='predicted'
    )
    diversity = Pie(
        labels=real_counter.keys(),
        values=real_counter.values(),
        hoverinfo='label+percent+value', textinfo='label'
    )
    #diff_bar = Bar(
    #    y=diff,
    #    name='difference'
    #)
    # Reference parabola y = x**2 plotted alongside the real/predicted scatters
    data = Data([real, predicted, {'x': x, 'y': x ** 2}])
    data2 = Data([diversity])
    iplot(data)
    iplot(data2)
    print '------------'
In [106]:
get_stat(random_forest_regression, X, Y, (0,100000), (100000,107000), 3)
In [16]:
get_stat(k_neighbors_regressor, X, Y, (0,100000), (100000,101000), 2)
In [14]:
get_stat(svr, X, Y, (0,10000), (1000,1500), (1,0.1))
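The print_stat summaries can be complemented with standard regression metrics. A minimal sketch, assuming the same manual split as the random forest call above; this cell is an addition and not part of the original notebook.
# Assumed follow-up (not in the original notebook): report MAE and R^2
# for the random forest regressor on the same manual train/test split.
from sklearn.metrics import mean_absolute_error, r2_score

clf = random_forest_regression(X[:100000], Y[:100000], 3)
pred = clf.predict(X[100000:107000])
print 'MAE:', mean_absolute_error(Y[100000:107000], pred)
print 'R^2:', r2_score(Y[100000:107000], pred)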
In [95]:
a = np.array([[10, 4, 7], [3, 2, 1]])
np.median(a, axis=1)
Out[95]:
array([ 7.,  2.])
In [ ]: