In [1]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np
import datetime
from collections import Counter

# Variables
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "keyword-filtered-users"



In [85]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]

In [86]:
X = []
Y = []
for tweet in collection.find():
    user = tweet['user']
    
    followers_count = user['followers_count']
    statuses_count = user['statuses_count']
    friends_count = user['friends_count']
    favourites_count = user['favourites_count']
    listed_count = user['listed_count']
    verified = int(user['verified'])
    
    retweet_count = sum([tweet['retweet_count'][tweet_id] for tweet_id in tweet['retweet_count']])
    tweet_count = len(tweet['tweets'])
    
    # retweeters_in_db = tweet['retweeters_sum']
    
    date = tweet['user']['created_at'].encode('ascii', 'ignore').split()
    date = date[1] + ' ' + date[2] + ' ' + date[5] + ' ' + date[3]
    epoch = int(datetime.datetime.strptime(date, '%b %d %Y %H:%M:%S').strftime('%s'))
    
    X.append(np.array([followers_count, statuses_count, friends_count, favourites_count, listed_count, verified]))
    Y.append(retweet_count/tweet_count)

In [100]:
# Regressors
def random_forest_regression(X,Y,params):
    clf = RandomForestRegressor(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_regressor(X,Y,params):
    neigh = KNeighborsRegressor(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svr(X,Y,params):
    clf = SVR(C=params[0], epsilon=params[1])
    clf.fit(X, Y)
    return clf

#Classifiers
def random_forest_classifier(X,Y,params):
    clf = RandomForestClassifier(n_estimators=params)
    clf.fit(X, Y)
    return clf

def k_neighbors_classifier(X,Y,params):
    neigh = KNeighborsClassifier(n_neighbors=params)
    neigh.fit(X, Y)
    return neigh

def svc(X,Y,params=None):
    clf = SVC(params)
    clf.fit(X, Y)
    return clf

def print_stat(X):
    print 'mean:', np.mean(X), '\tmedian:', np.median(X), 'std:', np.std(X), 'var:', np.var(X)
    print 'ptp:', np.ptp(X), '\tmin elem:', min(X), '\tmax elem:', max(X)

In [105]:
def get_stat(method, X, Y, u, v, params):
    train_X = X[u[0]:u[1]]
    train_Y = Y[u[0]:u[1]]
    test_X  = X[v[0]:v[1]]
    test_Y  = Y[v[0]:v[1]]
          
    clf = method(train_X, train_Y,params)
    cnt = Counter()
    
    real_result = test_Y
    prediction = clf.predict(test_X)
    diff = abs(prediction - real_result)
    
    real_counter = Counter()
    for retweet_count in real_result:
        real_counter[retweet_count] += 1
    
    print sum(real_counter.values())/len(real_counter)
    print np.median(real_counter.values())

    
    print_stat(prediction)
    print_stat(real_result)
    print_stat(diff)
    
    real_data = zip(real_result, prediction).sort()
    
    
    x = np.linspace(-2, 3)

    real = Scatter(
        y=real_result,
        x=real_result,
        mode='markers',
        name='real'
    )
    diff = Scatter(
        y=diff,
        x=real_result,
        mode='markers',
        name='diff'
    )
    predicted = Scatter(
        y=prediction,
        x=real_result,
        mode='markers',
        name='predicted'
    )
    diversity = Pie(
        labels=real_counter.keys(),
        values=real_counter.values(),
        hoverinfo='label+percent+value', textinfo='label'
    )
    #diff = Bar(
    #    y=diff,
    #    name='difference'
    #)
    
    data = Data([real,predicted, {'x': x, 'y': x**2}])
    data2 = Data([diversity]) 
    iplot(data)
    iplot(data2)
    print '------------'

In [106]:
get_stat(random_forest_regression, X, Y, (0,100000), (100000,107000), 3)


29
1.0
mean: 105.037550794 	median: 2.33333333333 std: 1590.80640459 var: 2530665.0169
ptp: 68815.6666667 	min elem: 1.0 	max elem: 68816.6666667
mean: 40.845 	median: 1.0 std: 1008.48754797 var: 1017047.1344
ptp: 79717 	min elem: 1 	max elem: 79718
mean: 131.413788889 	median: 1.66666666667 std: 1871.73414282 var: 3503388.70139
ptp: 79716.6666667 	min elem: 0.0 	max elem: 79716.6666667
------------

In [16]:
get_stat(k_neighbors_regressor, X, Y, (0,100000), (100000,101000), 2)


result mean: 103.218 	min elem: 1.0 	max elem: 50988.0
original mean: 105.171 	min elem: 1 	max elem: 79718
abs diff mean: 202.035 	min elem: 0.0 	max elem: 79717.0
------------

In [14]:
get_stat(svr, X, Y, (0,10000), (1000,1500), (1,0.1))


result mean: 2.39034852986 	min elem: 1.09990009236 	max elem: 3.03974384236
original mean: 602.258 	min elem: 1 	max elem: 128298
abs diff mean: 599.930638368 	min elem: 0.0397438423646 	max elem: 128294.960256
------------

In [95]:
a = np.array([[10, 4, 7], [3, 2, 1]])
np.median(a, axis=1)


Out[95]:
array([ 7.,  2.])

In [ ]: