In [1]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np
import datetime
from collections import Counter

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVR, SVC

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
from plotly.graph_objs import Scatter, Pie, Data
init_notebook_mode(connected=True)

# Variables
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "keyword-filtered-users"



In [85]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]
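
A quick sanity check of the connection before building the feature matrix (a sketch, assuming the collection was already populated by the crawling step; count() is the pre-deprecation pymongo call):

print collection.count()            # number of stored user documents
print collection.find_one().keys()  # top-level fields available on each document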

In [86]:
X = []
Y = []
for tweet in collection.find():
    user = tweet['user']
    
    followers_count = user['followers_count']
    statuses_count = user['statuses_count']
    friends_count = user['friends_count']
    favourites_count = user['favourites_count']
    listed_count = user['listed_count']
    verified = int(user['verified'])
    
    retweet_count = sum(tweet['retweet_count'].values())  # total retweets over all of the user's tweets
    tweet_count = len(tweet['tweets'])
    
    # retweeters_in_db = tweet['retweeters_sum']
    
    # Parse created_at (e.g. "Wed Aug 27 13:08:45 +0000 2008") into a Unix timestamp.
    # Note: epoch is computed here but not added to the feature vector below.
    date = user['created_at'].encode('ascii', 'ignore').split()
    date = date[1] + ' ' + date[2] + ' ' + date[5] + ' ' + date[3]
    epoch = int(datetime.datetime.strptime(date, '%b %d %Y %H:%M:%S').strftime('%s'))
    
    X.append(np.array([followers_count, statuses_count, friends_count,
                       favourites_count, listed_count, verified]))
    Y.append(retweet_count/tweet_count)  # average retweets per tweet (integer division under Python 2)
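
The created_at timestamp (epoch) is parsed above but never used as a feature. A hypothetical variant, shown only as a sketch, would append account age as a seventh column by replacing the X.append(...) line inside the loop:

    # Hypothetical: account age in days at collection time (not part of the original feature set)
    account_age_days = (int(datetime.datetime.now().strftime('%s')) - epoch) / 86400
    X.append(np.array([followers_count, statuses_count, friends_count,
                       favourites_count, listed_count, verified, account_age_days]))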

In [100]:
# Regressors
def random_forest_regression(X,Y,params):
    clf = RandomForestRegressor(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_regressor(X,Y,params):
    neigh = KNeighborsRegressor(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svr(X,Y,params):
    clf = SVR(C=params[0], epsilon=params[1])
    clf.fit(X, Y)
    return clf

# Classifiers
def random_forest_classifier(X,Y,params):
    clf = RandomForestClassifier(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_classifier(X,Y,params):
    neigh = KNeighborsClassifier(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svc(X,Y,params=None):
    # params is used as the C penalty; fall back to the library default when omitted
    clf = SVC(C=params) if params is not None else SVC()
    clf.fit(X, Y)
    return clf

def print_stat(X):
    # Summary statistics for a 1-D sequence of values
    print 'mean:', np.mean(X), '\tmedian:', np.median(X), 'std:', np.std(X), 'var:', np.var(X)
    print 'ptp:', np.ptp(X), '\tmin elem:', min(X), '\tmax elem:', max(X)
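
Each helper above just fits a model and returns it, so it can also be used outside the evaluation routine defined next. A minimal standalone example (assuming X and Y from the cell above; the 10-tree forest size is arbitrary):

model = random_forest_regression(X[:1000], Y[:1000], 10)  # fit on the first 1000 users
preds = model.predict(X[1000:1100])                       # predict for the next 100
print_stat(preds)                                         # summarise the predictions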

In [105]:
def get_stat(method, X, Y, u, v, params):
    # Train `method` on rows u[0]:u[1], evaluate on rows v[0]:v[1], print stats and plot.
    train_X = X[u[0]:u[1]]
    train_Y = Y[u[0]:u[1]]
    test_X  = X[v[0]:v[1]]
    test_Y  = Y[v[0]:v[1]]
          
    clf = method(train_X, train_Y, params)
    
    real_result = test_Y
    prediction = clf.predict(test_X)
    diff = abs(prediction - real_result)
    
    # Distribution of the real values: how many test samples share each distinct retweet rate
    real_counter = Counter(real_result)

    print sum(real_counter.values())/len(real_counter)  # mean samples per distinct value
    print np.median(real_counter.values())              # median samples per distinct value

    
    print_stat(prediction)   # predicted retweet rates
    print_stat(real_result)  # actual retweet rates
    print_stat(diff)         # absolute prediction error
    
    real_data = sorted(zip(real_result, prediction))  # (real, predicted) pairs, ordered by real value
    
    
    real = Scatter(
        y=real_result,
        x=real_result,
        mode='markers',
        name='real'
    )
    diff_trace = Scatter(
        y=diff,
        x=real_result,
        mode='markers',
        name='diff'
    )
    predicted = Scatter(
        y=prediction,
        x=real_result,
        mode='markers',
        name='predicted'
    )
    diversity = Pie(
        labels=real_counter.keys(),
        values=real_counter.values(),
        hoverinfo='label+percent+value', textinfo='label'
    )

    data = Data([real, predicted, diff_trace])
    data2 = Data([diversity]) 
    iplot(data)
    iplot(data2)
    print '------------'
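
get_stat splits train and test sets by fixed index ranges, so both follow collection order. A shuffled split via scikit-learn would be an alternative; a sketch only, not used in this notebook (train_test_split lives in sklearn.cross_validation on older releases):

from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.07, random_state=0)
clf = random_forest_regression(train_X, train_Y, 3)
print_stat(abs(clf.predict(test_X) - np.array(test_Y)))  # absolute prediction error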

In [106]:
get_stat(random_forest_regression, X, Y, (0,100000), (100000,107000), 3)


29
1.0
mean: 105.037550794 	median: 2.33333333333 std: 1590.80640459 var: 2530665.0169
ptp: 68815.6666667 	min elem: 1.0 	max elem: 68816.6666667
mean: 40.845 	median: 1.0 std: 1008.48754797 var: 1017047.1344
ptp: 79717 	min elem: 1 	max elem: 79718
mean: 131.413788889 	median: 1.66666666667 std: 1871.73414282 var: 3503388.70139
ptp: 79716.6666667 	min elem: 0.0 	max elem: 79716.6666667
------------

In [16]:
get_stat(k_neighbors_regressor, X, Y, (0,100000), (100000,101000), 2)


result mean: 103.218 	min elem: 1.0 	max elem: 50988.0
original mean: 105.171 	min elem: 1 	max elem: 79718
abs diff mean: 202.035 	min elem: 0.0 	max elem: 79717.0
------------

In [14]:
get_stat(svr, X, Y, (0,10000), (1000,1500), (1,0.1))


result mean: 2.39034852986 	min elem: 1.09990009236 	max elem: 3.03974384236
original mean: 602.258 	min elem: 1 	max elem: 128298
abs diff mean: 599.930638368 	min elem: 0.0397438423646 	max elem: 128294.960256
------------

In [95]:
a = np.array([[10, 4, 7], [3, 2, 1]])
np.median(a, axis=1)


Out[95]:
array([ 7.,  2.])

In [ ]: