In [1]:
import numpy as np
import os
import pandas as pd
pd.options.display.float_format = '{:20,.4f}'.format
from scipy import stats


import json
import re
import sqlite3

import igraph as ig

import itertools 

from datetime import datetime 
import pytz
import time

import gc

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
cons = {}
cons['evolBio'] = sqlite3.connect("data/BMCevolBioSample.db")
cons['bio'] = sqlite3.connect("data/BMCbioSample.db")
cons['bmc'] = sqlite3.connect("data/bmcTwitter.db")
cons['comm'] = sqlite3.connect("data/communications.db")

dataset = 'bmc'

In [5]:
# def load_user_details(con=None):
#     # Now lets make a full users_df
#     users_df = pd.read_sql("SELECT user_id, user_object FROM users", con, index_col = 'user_id')
#     users_df.index = users_df.index.astype(int)
#     users_df['user'] = users_df.user_object.map(json.loads)
#     for field in ['screen_name', 'name', 'followers_count', 'friends_count','statuses_count', 'description']:
#         users_df[field] = users_df.user.map(lambda x: x[field])

#     del users_df['user_object']
#     del users_df['user']   
#     return users_df
    
def load_tweet_details(con = None):
    df = pd.read_sql("SELECT doi, tweet_id, old_screen_name, tweet FROM sample WHERE tweet IS NOT NULL ", con, index_col='tweet_id')
    df = df[~df.tweet.isnull()]
    df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)
    
    df['created_at'] = df.tweet.apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(x['created_at'],'%a %b %d %H:%M:%S +0000 %Y')))
    df['created_at'] = pd.to_datetime(df.created_at)
    df['created_at_dayofweek'] = df.tweet.apply(lambda x: x['created_at'][0:3])
    df['user'] = df.tweet.apply(lambda x: x['user'])
    df['screen_name'] = df.tweet.apply(lambda x: x['user']['screen_name'])
#     df['user_id'] = df.tweet.apply(lambda x: int(x['user']['id_str']))
#     df['user_utc_offset'] = df.tweet.apply(lambda x: x['user']['utc_offset'])
#     df['user_name'] = df.tweet.apply(lambda x: x['user']['name'])    
#     df['user_followers_count'] = df.tweet.apply(lambda x: x['user']['followers_count'])
#     df['user_friends_count'] = df.tweet.apply(lambda x: x['user']['friends_count'])
#     df['user_description'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['user']['description']).strip())
#     df['user_statuses_count'] = df.tweet.apply(lambda x: x['user']['statuses_count'])
    df['is_retweet'] = df.tweet.apply(lambda x: 'retweeted_status' in x)
    df['is_retweet'] = df['is_retweet'].fillna(False)
    df['retweet_of_status_id_str'] = df.tweet.apply(lambda x: x['retweeted_status']['id_str'] if 'retweeted_status' in x else None)
    df['retweet_of_screen_name'] = df.tweet.apply(lambda x: x['retweeted_status']['user']['screen_name'] if 'retweeted_status' in x else None)
    df['is_reply'] = df.tweet.apply(lambda x: x['in_reply_to_status_id'] is not None)
    df['in_reply_to_status_id_str'] = df.tweet.apply(lambda x: x['in_reply_to_status_id_str'])
    df['in_reply_to_screen_name'] = df.tweet.apply(lambda x: x['in_reply_to_screen_name'])
    df['text'] = df.tweet.apply(lambda x: re.sub(r'\s+', ' ', x['text']).strip()) # collapse whitespace/newlines for CSV simplicity
    del df['tweet']
    tweetdetails = df.sort_index()
    del df

    df = pd.read_sql("SELECT doi, tweet_id, old_screen_name FROM sample WHERE error LIKE '%screen_name%'", con, index_col='old_screen_name')
    users_df = pd.read_sql("SELECT screen_name, user_object FROM users", con, index_col='screen_name')
    users_df['user'] = users_df.user_object.map(json.loads)
    del users_df['user_object']

    df = df.join(users_df, how="inner")
    df.index.name = 'screen_name'  
    df = df.reset_index().set_index('tweet_id')

    tweetdetails = tweetdetails.append(df).sort_index()
    del df
    
    for field in ['id', 'name', 'followers_count', 'friends_count','statuses_count', 'description']:
        tweetdetails['user_%s' % field] = tweetdetails.user.map(lambda x: x[field])
    del tweetdetails['user']
    
    
    try: 
        tweet_times = pd.read_csv('data/%s/tweet_times.csv' % dataset, index_col = 'tweet_id')
        tweet_times.columns = ['created_at', 'is_retweet']
        tweet_times['created_at'] = pd.to_datetime(tweet_times.created_at)
        tweet_times.index = tweet_times.index.astype(str)
        del tweetdetails['is_retweet']
        tweetdetails = tweetdetails.combine_first(tweet_times)
        del tweet_times
    except (OSError, KeyError):
        # tweet_times.csv may not exist for this dataset; fall back to the created_at values parsed above
        pass
    
    return tweetdetails
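
A quick check of the `created_at` parsing above (a minimal sketch; the timestamp string below is made up but follows Twitter's `%a %b %d %H:%M:%S +0000 %Y` format):

raw = 'Wed Aug 27 13:08:45 +0000 2008'  # hypothetical Twitter-style timestamp
parsed = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(raw, '%a %b %d %H:%M:%S +0000 %Y'))
print(parsed)  # '2008-08-27 13:08:45'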

In [6]:
def load_graphs(con, tweetdetails = None):
    if tweetdetails is None:  # truth-testing a DataFrame with "not" raises ValueError
        tweetdetails = load_tweet_details(con)
    
    dois = list(tweetdetails.doi.unique())
        
    friends = pd.read_sql_query("SELECT * FROM friends", con, index_col="user_id")
    friends.index = friends.index.astype(int)
    friends.friend_id = friends.friend_id.astype(int) 
    
    followers = pd.read_sql_query("SELECT * FROM followers", con, index_col="user_id")
    followers.index = followers.index.astype(int)
    followers.follower_id = followers.follower_id.astype(int)
    
    # join the list of users with the friends to construct a one-way edge list
    df = tweetdetails[['doi', 'user_id']].drop_duplicates().set_index('user_id').join(friends)[['friend_id', 'doi']]
    df = df[df.friend_id.notnull()]
    df.friend_id = df.friend_id.astype(int)
    df = df.reset_index()
    df.columns = ['in', 'out', 'doi']
    
    # do the same thing for the followers 
    df2 = tweetdetails[['doi', 'user_id']].drop_duplicates().set_index('user_id').join(followers)[['follower_id', 'doi']]
    df2 = df2[df2.follower_id.notnull()]
    df2.follower_id = df2.follower_id.astype(int)
    df2 = df2.reset_index()
    df2.columns = ['out', 'in', 'doi']
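    # orientation note (as read from the two joins above): in both frames 'out' is the followed
    # account (the friend, or the tweeting user when looking at followers) and 'in' is the
    # follower, so both edge lists share one direction convention before being concatenated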
    
    edgelist = df.append(df2).set_index('in').reset_index()
    edgelist = edgelist.drop_duplicates()

    graphs = {}
    all_graph_edgelist = None
    for doi in dois: 
        e = edgelist[edgelist.doi == doi].copy()  # copy so the column edits below don't trigger SettingWithCopyWarning
        if len(e) == 0: 
            continue
        del e['doi']
        
        filename = 'data/%s/%s-edgelist.csv' % (dataset, doi.replace('/','_'))
        e.columns = ['Source', 'Target']

        # accumulate a combined edge list across all DOIs
        if all_graph_edgelist is None:
            all_graph_edgelist = e
        else:
            all_graph_edgelist = all_graph_edgelist.append(e).drop_duplicates()
        
        e.to_csv(filename, index=False, sep="\t", header=False) # headerless TSV so igraph's Read_Ncol can parse it below
        
        graphs[doi] = ig.Graph.Read_Ncol(filename, names=True, directed=True)
        e.to_csv(filename, index=False)
    
    all_graph_edgelist.to_csv('data/%s/all_dois-edgelist.csv' % dataset, index=False)
    del all_graph_edgelist
    del edgelist
    del friends
    del followers
    del df
    gc.collect()
    
    return graphs, tweetdetails

print(dataset)
graphs, tweetdetails = load_graphs(cons[dataset])

tweetdetails.to_csv('data/%s/tweetDetailsAll.csv' % dataset, encoding='utf8')
print (len(graphs), len(tweetdetails), len(tweetdetails.user_id.unique()))


bmc
11 1544

In [7]:
dois = tweetdetails.doi.unique()

def timedelta_to_days(td):
    return td.days + td.seconds/3600.0/24

def median_timestamp(x):
    ts = list(map(lambda t: t.value/1000000000, x))
    return datetime.fromtimestamp(int(np.median(ts)), tz=pytz.utc).replace(tzinfo=None)

def lifespan(x):
    return timedelta_to_days(x.max()-x.min())

def halflife(x):
    return timedelta_to_days(median_timestamp(x)-x.min())



tweet_stats = tweetdetails.groupby('doi').agg({'created_at': [np.min, np.max, lifespan, median_timestamp, halflife], 
                               'is_retweet': [np.size, np.sum, lambda x: 100.0*x.sum()/len(x)]})
tweet_stats.columns = ['first_tweet', 'last_tweet', 'tweet_lifespan', 'median_tweettime', 'tweet_halflife', 'tweets', 'retweets', 'retweets_p']
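
To make the lifespan and half-life definitions concrete, a minimal sketch on three made-up timestamps (illustrative values only): the lifespan is the number of days between the first and last tweet, and the half-life is the number of days from the first tweet to the median tweet time.

ts = pd.Series(pd.to_datetime(['2014-01-01 00:00:00', '2014-01-02 00:00:00', '2014-01-11 00:00:00']))
print(lifespan(ts))   # 10.0 -- days from first to last tweet
print(halflife(ts))   # 1.0  -- days from the first tweet to the median tweet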

In [8]:
if dataset == 'bmc':
    names = pd.DataFrame.from_dict({'10.1186/s12915-014-0069-1': 'Biol5', '10.1186/s12915-014-0087-z': 'Biol7', '10.1186/1741-7007-12-36': 'Biol3', '10.1186/1741-7007-12-38': 'Biol4', '10.1186/s12862-014-0193-0': 'Evol3', '10.1186/1741-7007-12-29': 'Biol2', '10.1186/1471-2148-14-136': 'Evol2', '10.1186/s12915-014-0076-2': 'Biol6', '10.1186/1471-2148-14-70': 'Evol1', '10.1186/preaccept-2055025475136453': 'Evol4', '10.1186/1741-7007-12-8': 'Biol1'}, orient='index')
else:
    names = pd.DataFrame.from_dict({doi: "paper_%s" % i for (i, doi) in enumerate(dois)}, orient='index')

names.columns = ['name']    
names.join(tweet_stats).sort_values('retweets_p')


Out[8]:
doi name first_tweet last_tweet tweet_lifespan median_tweettime tweet_halflife tweets retweets retweets_p
10.1186/s12915-014-0076-2 Biol6 2014-10-28 01:53:46 2015-05-22 16:03:39 206.5902 2014-11-05 14:21:05 8.5190 196.0000 105.0000 53.5714
10.1186/1741-7007-12-36 Biol3 2014-05-19 12:45:54 2014-09-09 08:09:42 112.8082 2014-05-28 14:41:21 9.0802 54.0000 29.0000 53.7037
10.1186/1471-2148-14-70 Evol1 2014-04-02 06:28:52 2014-06-13 00:06:03 71.7342 2014-04-02 16:30:33 0.4178 61.0000 33.0000 54.0984
10.1186/1471-2148-14-136 Evol2 2014-07-16 04:17:29 2015-01-26 15:53:22 194.4833 2014-07-16 15:02:22 0.4478 53.0000 31.0000 58.4906
10.1186/1741-7007-12-29 Biol2 2014-04-30 17:07:53 2015-05-26 12:48:04 390.8196 2014-05-06 17:56:09 6.0335 195.0000 115.0000 58.9744
10.1186/s12915-014-0087-z Biol7 2014-11-12 03:00:53 2015-06-11 14:02:54 211.4597 2014-11-13 15:56:52 1.5389 234.0000 140.0000 59.8291
10.1186/1741-7007-12-38 Biol4 2014-06-26 01:33:19 2014-12-20 17:49:59 177.6782 2014-06-27 10:31:30 1.3737 55.0000 35.0000 63.6364
10.1186/s12915-014-0069-1 Biol5 2014-08-22 11:24:10 2015-03-04 17:12:04 194.2416 2014-08-24 16:26:23 2.2099 88.0000 57.0000 64.7727
10.1186/1741-7007-12-8 Biol1 2014-01-31 14:10:44 2015-02-21 04:06:13 385.5802 2014-03-28 21:54:00 56.3217 216.0000 141.0000 65.2778
10.1186/s12862-014-0193-0 Evol3 2014-09-23 04:41:05 2015-04-22 15:10:04 211.4368 2014-09-23 23:11:14 0.7709 207.0000 156.0000 75.3623
10.1186/preaccept-2055025475136453 Evol4 2014-11-26 08:59:59 2015-04-30 06:57:03 154.9146 2014-11-28 12:53:45 2.1623 185.0000 175.0000 94.5946

In [9]:
dois = graphs.keys()

infomaps = {}
subgraphs = {}
short_subgraphs = {}

calculate_shortest = True

graph_stats = {}
shortest_paths = {}

for i, doi in enumerate(dois):
    tweets = tweetdetails[tweetdetails.doi == doi].copy()  # copy so the added columns below don't trigger SettingWithCopyWarning
    tweets['event_number'] = tweets.index.map(lambda x: tweets.index.get_loc(x))

    # write out the nodelist
    filename = 'data/%s/%s-nodelist.csv' % (dataset, doi.replace('/','_'))
    users_df = tweets[['screen_name', 'user_followers_count', 'user_friends_count', 'user_description', 'event_number', 'text', 'user_id']].drop_duplicates(subset='user_id', keep='first').set_index('user_id')
    users_df.to_csv(filename)
    
    tweets['user_id_str'] = tweets.user_id.astype(str)
    tweeters = tweets[tweets.doi == doi].user_id_str.unique()
    
    assert(len(tweeters) == len(users_df))
#     del users_df
    del tweets['user_id'] # delete to avoid confusion: probably should just use numeric throughout
    
    # temporary for testing, make sure all tweeters are in the graph
    G = graphs[doi]
    for t in tweeters:
        if t not in [v['name'] for v in G.vs]:
            G.add_vertex(t)
    # end temporary
        
    G = graphs[doi].subgraph(tweeters)
    subgraphs[doi] = G
    print("%s\t%s\t%s" % (doi, G.vcount(), G.ecount()))
    
    graph_stats[doi] = {}
    graph_stats[doi]['density'] = G.density()
    graph_stats[doi]['num_nodes'] = G.vcount()
    graph_stats[doi]['num_edges'] = G.ecount()
    graph_stats[doi]['diameter'] = G.diameter(directed=True)
    
    graph_stats[doi]['in_degree_mean'] = np.mean(G.indegree())
    graph_stats[doi]['out_degree_mean'] = np.mean(G.outdegree())
    graph_stats[doi]['degree_mean'] = np.mean(G.degree())
    
    wccs = sorted(G.components(mode=ig.WEAK).subgraphs(), key=lambda g: g.vcount(), reverse=True)
    graph_stats[doi]['biggest_wcc_num_nodes'] = wccs[0].vcount()
    graph_stats[doi]['biggest_wcc_num_nodes_p'] = wccs[0].vcount()*100.0/G.vcount()
    graph_stats[doi]['biggest_wcc_density'] = wccs[0].density()
    graph_stats[doi]['biggest_wcc_infomap_modularity'] = wccs[0].community_infomap().modularity
  
    if G.ecount() == 0:
        continue

    paths = G.shortest_paths(mode=ig.ALL)
    graph_stats[doi]['shortest_paths_mean'] = np.mean([item if item != np.inf else 0 for sublist in paths for item in sublist ])
    graph_stats[doi]['shortest_paths_median'] = np.median([item if item != np.inf else 0 for sublist in paths for item in sublist ])
    infomaps[doi] = G.community_infomap()  # keep the clustering so later cells can reuse it
    graph_stats[doi]['infomap_modularity'] = infomaps[doi].modularity
    
    filename = 'data/%s/%s-subgraph-edgelist.csv' % (dataset, doi.replace('/','_'))
    G.write_ncol(filename)
    df = pd.read_csv(filename, sep=" ", header=None)
    df.columns = ['Source', 'Target']
    df.to_csv(filename, index=False)

    if calculate_shortest: 
        path_lengths = []   
    
        # double check that order is preserved with .unique
        diffusion_paths = []
        for t, f in itertools.combinations(tweets.user_id_str.unique(), 2):
            paths = G.get_shortest_paths(t, f, mode=ig.OUT)
            
            # handle case where more than one path is returned
            if len(paths) > 0 and len(paths[0]) > 0:
                diffusion_paths.append(paths[0])
                path_lengths.append(len(paths[0]))
    #         paths = G.get_shortest_paths(f, t, mode=ig.IN)
    #         path_lengths.append(len(paths[0]))

        shortest_paths[doi] = diffusion_paths
    
        graph_stats[doi]['shortest_diffusion_path_length_mean'] = np.mean(path_lengths)
        graph_stats[doi]['shortest_diffusion_path_length_median'] = np.median(path_lengths)
        
        subG = G
        tweeters = {}
        for v in subG.vs():
            tweeters[v.index] = {}
            tweeters[v.index]['name'] = v['name']
            tweeters[v.index]['event_number'] = tweets[tweets.user_id_str == v['name']].event_number.min()
        
        edges = set()
        for p in diffusion_paths:
            for v_index in range(len(p)-1):
                edges.add((p[v_index], p[v_index+1]))
        
        G = ig.Graph(directed=True)
        G.add_vertices([tweeters[v_index]['name'] for v_index in range(subG.vcount())])
        for v_index in range(subG.vcount()):
            G.vs[v_index]['event_number'] = tweeters[v_index]['event_number']
            
        for e in edges: 
            G.add_edge(e[0], e[1])

        short_subgraphs[doi] = G

graph_stats = pd.DataFrame.from_dict(graph_stats, orient='index')
graph_stats.index.name = 'doi'

# graph_stats.to_csv('data/%s/graph_stats.csv' % dataset)

all_stats = graph_stats.join(tweet_stats)
all_stats.to_csv('data/%s/all_stats.csv' % dataset)

names.join(all_stats)


10.1186/1741-7007-12-8	196	436
/usr/local/lib/python3.6/site-packages/ipykernel/__main__.py:64: RuntimeWarning: weights attribute does not exists at foreign.c:1530
/usr/local/lib/python3.6/site-packages/ipykernel/__main__.py:75: RuntimeWarning: Couldn't reach some vertices at structural_properties.c:740
10.1186/1471-2148-14-70	57	250
10.1186/1741-7007-12-29	153	947
10.1186/1741-7007-12-36	44	129
10.1186/1741-7007-12-38	50	238
10.1186/1471-2148-14-136	50	168
10.1186/s12915-014-0069-1	70	673
10.1186/s12862-014-0193-0	188	969
10.1186/s12915-014-0076-2	168	621
10.1186/s12915-014-0087-z	190	2520
10.1186/preaccept-2055025475136453	177	908
Out[9]:
doi name density num_nodes num_edges diameter in_degree_mean out_degree_mean degree_mean biggest_wcc_num_nodes biggest_wcc_num_nodes_p ... shortest_diffusion_path_length_mean shortest_diffusion_path_length_median first_tweet last_tweet tweet_lifespan median_tweettime tweet_halflife tweets retweets retweets_p
10.1186/s12915-014-0069-1 Biol5 0.1393 70 673 5 9.6143 9.6143 19.2286 68 97.1429 ... 3.1900 3.0000 2014-08-22 11:24:10 2015-03-04 17:12:04 194.2416 2014-08-24 16:26:23 2.2099 88.0000 57.0000 64.7727
10.1186/s12915-014-0087-z Biol7 0.0702 190 2520 6 13.2632 13.2632 26.5263 170 89.4737 ... 3.3314 3.0000 2014-11-12 03:00:53 2015-06-11 14:02:54 211.4597 2014-11-13 15:56:52 1.5389 234.0000 140.0000 59.8291
10.1186/1741-7007-12-36 Biol3 0.0682 44 129 6 2.9318 2.9318 5.8636 40 90.9091 ... 3.7733 4.0000 2014-05-19 12:45:54 2014-09-09 08:09:42 112.8082 2014-05-28 14:41:21 9.0802 54.0000 29.0000 53.7037
10.1186/1741-7007-12-38 Biol4 0.0971 50 238 6 4.7600 4.7600 9.5200 48 96.0000 ... 3.4331 3.0000 2014-06-26 01:33:19 2014-12-20 17:49:59 177.6782 2014-06-27 10:31:30 1.3737 55.0000 35.0000 63.6364
10.1186/s12862-014-0193-0 Evol3 0.0276 188 969 9 5.1543 5.1543 10.3085 159 84.5745 ... 4.0708 4.0000 2014-09-23 04:41:05 2015-04-22 15:10:04 211.4368 2014-09-23 23:11:14 0.7709 207.0000 156.0000 75.3623
10.1186/1741-7007-12-29 Biol2 0.0407 153 947 8 6.1895 6.1895 12.3791 139 90.8497 ... 3.9363 4.0000 2014-04-30 17:07:53 2015-05-26 12:48:04 390.8196 2014-05-06 17:56:09 6.0335 195.0000 115.0000 58.9744
10.1186/1471-2148-14-136 Evol2 0.0686 50 168 6 3.3600 3.3600 6.7200 42 84.0000 ... 3.5997 4.0000 2014-07-16 04:17:29 2015-01-26 15:53:22 194.4833 2014-07-16 15:02:22 0.4478 53.0000 31.0000 58.4906
10.1186/s12915-014-0076-2 Biol6 0.0221 168 621 7 3.6964 3.6964 7.3929 115 68.4524 ... 3.9361 4.0000 2014-10-28 01:53:46 2015-05-22 16:03:39 206.5902 2014-11-05 14:21:05 8.5190 196.0000 105.0000 53.5714
10.1186/1471-2148-14-70 Evol1 0.0783 57 250 7 4.3860 4.3860 8.7719 46 80.7018 ... 3.7535 4.0000 2014-04-02 06:28:52 2014-06-13 00:06:03 71.7342 2014-04-02 16:30:33 0.4178 61.0000 33.0000 54.0984
10.1186/preaccept-2055025475136453 Evol4 0.0291 177 908 10 5.1299 5.1299 10.2599 164 92.6554 ... 5.0085 5.0000 2014-11-26 08:59:59 2015-04-30 06:57:03 154.9146 2014-11-28 12:53:45 2.1623 185.0000 175.0000 94.5946
10.1186/1741-7007-12-8 Biol1 0.0114 196 436 11 2.2245 2.2245 4.4490 160 81.6327 ... 4.9299 4.0000 2014-01-31 14:10:44 2015-02-21 04:06:13 385.5802 2014-03-28 21:54:00 56.3217 216.0000 141.0000 65.2778

11 rows × 25 columns
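
The diffusion-path statistics above come from igraph's get_shortest_paths, which returns each path as a list of vertex indices, so len(paths[0]) counts vertices rather than edges: a direct follower link between two tweeters already contributes a path "length" of 2. A minimal sketch on a toy graph (hypothetical vertex names):

g = ig.Graph(directed=True)
g.add_vertices(['a', 'b', 'c'])        # hypothetical tweeters
g.add_edges([('a', 'b'), ('b', 'c')])  # a -> b -> c
print(g.get_shortest_paths('a', 'c', mode=ig.OUT))  # [[0, 1, 2]]: 3 vertices, i.e. 2 edges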


In [21]:
names.join(all_stats)[['name', 'shortest_diffusion_path_length_mean']].sort_values('shortest_diffusion_path_length_mean')


Out[21]:
doi name shortest_diffusion_path_length_mean
10.1186/s12915-014-0069-1 Biol5 3.1900
10.1186/s12915-014-0087-z Biol7 3.3314
10.1186/1741-7007-12-38 Biol4 3.4331
10.1186/1471-2148-14-136 Evol2 3.5997
10.1186/1471-2148-14-70 Evol1 3.7535
10.1186/1741-7007-12-36 Biol3 3.7733
10.1186/s12915-014-0076-2 Biol6 3.9361
10.1186/1741-7007-12-29 Biol2 3.9363
10.1186/s12862-014-0193-0 Evol3 4.0708
10.1186/1741-7007-12-8 Biol1 4.9299
10.1186/preaccept-2055025475136453 Evol4 5.0085

In [162]:
plt.rcParams['figure.figsize'] = (20.0, 12.0)
nrows = len(dois)//2 if len(dois) % 2 == 0 else len(dois)//2 + 1  # ceil(len(dois)/2); plt.subplots needs an int
ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
plot_map = [(i,j) for j in range(ncols) for i in range(nrows)]

for i, doi in enumerate(dois): 
    path_lengths = pd.Series(map(len, shortest_paths[doi]))
    ax = axes[plot_map[i][0],plot_map[i][1]]
    ax.set_title("%s: %s" % (names.loc[doi]['name'], doi))
    sns.distplot(path_lengths, kde=False, norm_hist=True, bins=range(0,10), ax=ax)
    ax.set_xlim([0,10])
    ax.axvline(x=path_lengths.mean())

if len(dois) % 2 == 1:
    fig.delaxes(axes[-1,-1])

plt.tight_layout()
plt.savefig('data/%s/shortest_exp_paths_dist.png' % dataset)



In [ ]:
def grouped_first(grouped):
    if len(grouped) > 0:
        return grouped.iloc[0]
    return np.nan

def group_concat(grouped):
    try: 
        return "%s" % '||'.join(grouped)
    except TypeError:
        return ''
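# A quick sketch of what these helpers return (illustrative values only):
#   grouped_first(pd.Series(['x', 'y'])) -> 'x'; an empty group -> NaN
#   group_concat(pd.Series(['tweet one', 'tweet two'])) -> 'tweet one||tweet two'
# group_concat falls back to '' when the group holds non-string values (TypeError).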

plt.rcParams['figure.figsize'] = (20.0, 12.0)

isolate_threshold = 4
nrows = len(dois)//2 if len(dois) % 2 == 0 else len(dois)//2 + 1  # ceil(len(dois)/2); plt.subplots needs an int
ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
plot_map = [(i,j) for j in range(ncols) for i in range(nrows)]

for i, doi in enumerate(dois): 
    G = subgraphs[doi]
    infomap = infomaps[doi]
    
    df = pd.DataFrame(infomap.membership, columns=['membership'])
    df['user_id_str'] = df.index.map(lambda x: G.vs[x]['name'])
    
    tweets = tweetdetails[tweetdetails.doi == doi].copy()  # copy to avoid SettingWithCopyWarning on the added columns
    tweets['event_number'] = tweets.index.map(lambda x: tweets.index.get_loc(x))
    tweets['user_id_str'] = tweets.user_id.astype(str)
    del tweets['user_id']
    tweets = tweets.merge(df, left_on='user_id_str', right_on='user_id_str')
    tweets['num_tweets'] = 1
    
    filename = 'data/%s/%s-tweets.csv' % (dataset, doi.replace('/','_'))
    df = tweets.groupby('user_id_str').agg({'doi': grouped_first, 
                                       'created_at': grouped_first, 
                                        'screen_name': grouped_first, 
                                        'old_screen_name': grouped_first,
                                        'user_utc_offset': grouped_first,
                                        'user_followers_count': grouped_first,
                                        'user_friends_count': grouped_first,
                                        'user_description': grouped_first,
                                        'is_retweet': grouped_first,
                                        'retweet_of_status_id_str': grouped_first,
                                        'retweet_of_screen_name': grouped_first,
                                        'is_reply': grouped_first,
                                        'in_reply_to_status_id_str': grouped_first,
                                        'in_reply_to_screen_name': grouped_first,
                                        'text': group_concat, 
                                        'num_tweets': lambda x: x.sum(),
                                        'event_number': grouped_first,
                                        'membership': grouped_first
                                       }).sort_values('created_at').reset_index().rename(columns={'user_id_str': 'ID', 'screen_name': 'Label'}).to_csv(filename, encoding='utf8', quotechar='"', index=False)
    
    # find the first community index whose size drops below isolate_threshold; that community
    # and all later ones get lumped into a single bucket for plotting
    max_community_meets_threshold = len(infomap)  # fallback when every community meets the threshold
    for c, s in enumerate([g.vcount() for g in infomap.subgraphs()]):
        if s < isolate_threshold:
            max_community_meets_threshold = c 
            break
            
    tweets['membership_i'] = tweets.membership.map(lambda x: x if x < max_community_meets_threshold else max_community_meets_threshold)
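    # next, walk the tweets in event order and count how many distinct (lumped) communities have
    # tweeted so far; this yields a cumulative community-activation curve for each paper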
    
    activated_communities = set()
    num_activated_communities = []
    for e, c in tweets.membership_i.iteritems():
        activated_communities.add(c)
        num_activated_communities.append(len(activated_communities))
    tweets['community_activations'] = num_activated_communities

    if ncols == 1:
        ax=axes[i]
    else:
        ax=axes[plot_map[i][0],plot_map[i][1]]    
    tweets.community_activations.plot(style="or", ylim=[0, tweets.community_activations.max()+5], ms=5, alpha=0, ax=ax)
    for x, y in zip(tweets.index, tweets.community_activations):
        ax.text(x, y, tweets.loc[x]['membership_i'], color="blue", fontsize=12)
    ax.set_title(doi)
    plt.margins(0.1)

In [62]:
sns.set(style="white")
sns.set(style="ticks", color_codes=True)
plt.rcParams["axes.labelsize"] = 15

labels = ['Num nodes', '% users\nin largest\ncomponent', 'Mean shortest\ndiffusion path', 'Density', '% Retweets', 'Tweet\nlifespan', 'Tweet\nhalf-life', 'Infomap\nmodularity']
vars_to_plot = ['num_nodes', 'biggest_wcc_num_nodes_p', 'shortest_diffusion_path_length_mean', 'density', 'retweets_p', 'tweet_lifespan', 'tweet_halflife', 'biggest_wcc_infomap_modularity']

# vars_to_plot = ['num_nodes', 'biggest_wcc_num_nodes_p', 'shortest_diffusion_path_length_mean']

g = sns.PairGrid(all_stats, diag_sharey=False, vars=vars_to_plot)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)

def sig_marker(p_value):
    if p_value < .01:
        sig = '***'
    elif p_value < .05:
        sig = '**'
    elif p_value < .1:
        sig = '*'
    else: 
        sig = ''     
    
    return sig
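# e.g. sig_marker(0.03) returns '**' under these cutoffs ('*' p < .1, '**' p < .05, '***' p < .01)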

def corrfunc(x, y, **kws):
#     _, _, r_value, p_value, _ = stats.linregress(x, y)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    ax = plt.gca()
    ax.plot(x, intercept + slope*x, 'r')
   
    ax.annotate("R-sq = {:.2f}".format(r_value**2) + sig_marker(p_value),
                xy=(.68, .1), xycoords=ax.transAxes)
    rho, p_value = stats.spearmanr(x, y)
    ax.annotate("rho = {:.2f}".format(rho**2) + sig_marker(p_value),
            xy=(.68, .2), xycoords=ax.transAxes)


# g.map_upper(corrfunc)

for i in range(len(vars_to_plot)):
    ax = g.axes[i][0]
    ax.set_ylabel(labels[i], fontsize=18)
    ax.tick_params(axis='y', labelsize=12)
    
    ax = g.axes[len(vars_to_plot)-1][i]
    ax.set_xlabel(labels[i], fontsize=18)
    ax.tick_params(axis='x', labelsize=12)

plt.tight_layout()
plt.savefig('data/%s/scatterplot_kde.png' % dataset)


/usr/local/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [ ]:
x = df[df['tweet_halflife']<50]['tweet_halflife']
y = df[df['tweet_halflife']<50]['shortest_diffusion_path_length_mean']
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(r_value**2, p_value)
print()
rho, p_value = stats.spearmanr(x, y)
print(rho, p_value)

In [ ]:
x = df['tweet_halflife']
y = df['shortest_diffusion_path_length_mean']
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(r_value**2, p_value)
print()
rho, p_value = stats.spearmanr(x, y)
print(rho, p_value)

In [ ]:
pd.options.display.float_format = '{:,.2f}'.format
names.join(df)

In [ ]:
stats_summary = df.describe().transpose()
stats_summary[['mean', 'std', 'min', 'max', '50%']].style.format("{:.2f}")

In [ ]:
names.join(graph_stats).sort_values('shortest_diffusion_path_length_mean')

In [ ]:


In [ ]:
# Weakly connected components
network_stats = {}
for doi in dois: 

    G = short_subgraphs[doi]
    print(doi)
    l = sorted([g.vcount() for g in G.components(mode=ig.WEAK).subgraphs()], reverse=True)
    print(l)
    print("%s, %.2f%%" % (l[0], l[0]*100.0/sum(l)))
    
    print()
# ig.plot(G.community_infomap(), vertex_label=[v['event_number'] for v in G.vs])

In [ ]:
for doi in dois: 
    G = subgraphs[doi]
    G.community_infomap().modularity

    wccs = sorted(G.components(mode=ig.WEAK).subgraphs(), key=lambda g: g.vcount(), reverse=True)
    print(wccs[0].community_infomap().modularity)

In [ ]:
sns.set(style="white")
sns.set(style="ticks", color_codes=True)

df = all_stats[['num_nodes', 'biggest_wcc_num_nodes_p', 'shortest_paths_mean', 'density', 'retweets_p', 'tweet_lifespan', 'tweet_halflife', 'infomap_modularity']]

g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)

from scipy import stats
def corrfunc(x, y, **kws):
#     _, _, r_value, p_value, _ = stats.linregress(x, y)
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    ax = plt.gca()
    ax.plot(x, intercept + slope*x, 'r')
    ax.annotate("R-sq = {:.2f}".format(r_value**2),
                xy=(.68, .1), xycoords=ax.transAxes)

g.map_upper(corrfunc)

plt.savefig('data/%s/scatterplot_kde.png' % dataset)

In [ ]:
doi = list(dois)[7]

G = short_subgraphs[doi]
ig.plot(G.community_infomap())

In [ ]:
doi = dois[5]
G = short_subgraphs[doi]
print(doi)
print(len([g.vcount() for g in G.components(mode=ig.WEAK).subgraphs()]))

In [ ]:
im = G.community_infomap()
[g.vcount() for g in im.subgraphs()]

In [ ]:
ig.plot(im.subgraphs()[2])

In [ ]:
ig.plot(infomap, vertex_label=[v['event_number'] for v in G.vs])

In [ ]:
G = graphs[doi]
tweets = tweetdetails[tweetdetails.doi == doi].copy()
tweets['event_number'] = tweets.index.map(lambda x: tweets.index.get_loc(x))
tweets['user_id_str'] = tweets.user_id_str.astype(str)

tweeters = tweetdetails[tweetdetails.doi == doi].user_id_str.unique().astype(str)
tweeters = {v['name']: v.index for v in G.vs if v['name'] in tweeters}

subG = G.subgraph(tweeters.keys())
tweeters = {v['name']: v.index for v in subG.vs}
tweeters

In [ ]:
doi = dois[0]
G = subgraphs[doi]
tweeters = tweetdetails[tweetdetails.doi == doi].user_id_str.unique().astype(str)
tweeters = {v.index: v['name'] for v in G.vs if v['name'] in tweeters}
# tweeters = {v['name']: v.index for v in G.vs if v['name'] in tweeters}

In [ ]:
activated_communities = set()
num_activated_communities = []
for e, c in tweets.membership_i.iteritems():
    activated_communities.add(c)
    num_activated_communities.append(len(activated_communities))
tweets['community_activations'] = num_activated_communities

plt.rcParams['figure.figsize'] = (12.0, 8.0)  # pylab is not imported; use the pyplot alias
tweets.community_activations.plot(style="or", ylim=[0, tweets.community_activations.max()+.5], ms=5, alpha=0)
for x, y in zip(tweets.index, tweets.community_activations):
#     print x,y
    plt.text(x, y, tweets.loc[x]['membership_i'], color="red", fontsize=12)
# plt.margins(0.1)

In [ ]:
tweets.membership.plot.hist(bins=range(tweets.membership.max()))

In [ ]:
for doi in dois:
    print(doi, [g.vcount() for g in wccs[doi].subgraphs()])

In [ ]:
doi = dois[1]
for doi in dois: 
    G = graphs[doi]
    tweeters = tweetdetails[tweetdetails.doi == doi].user_id_str.unique().astype(str)
    tweeters = {v['name']: v.index for v in G.vs if v['name'] in tweeters}
    
    wcc = G.subgraph(tweeters.keys()).components(mode=ig.WEAK)
    print([g.vcount() for g in wcc.subgraphs()])

In [ ]:
# infomap = infomaps.values()[0]
for doi, infomap in infomaps.items(): 
    print(doi, infomap.modularity, [g.vcount() for g in infomap.subgraphs()])

In [ ]:
usertypes = ['Scientist', 'Publisher', 'Public', 'Scientist (non-research)',
             'Aggregator', 'Blogger', 'Institution']  # manual user classification categories

def make_label(s): 
    if s.find('(') > 0:
        return 'Sn'
    elif s == 'Publisher':
        return 'Ps'
    else:
        return s[0]
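# a quick illustration: make_label('Scientist (non-research)') -> 'Sn',
# make_label('Publisher') -> 'Ps', make_label('Blogger') -> 'B'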

In [ ]:
doi = dois[0]
G = subgraphs[doi]
infomap = infomaps[doi]

In [ ]:
user_classifications = pd.read_csv('data/user_classifications.tsv', sep='\t', dtype={'user_id': str})
user_classifications = user_classifications.drop_duplicates()

df = pd.DataFrame(infomap.membership, columns=['membership'], )
df['user_id'] = df.index.map(lambda x: G.vs[x]['name'])

df = df.merge(user_classifications, left_on="user_id", right_on="user_id", how='inner')
print(len(df))

In [ ]:
print(G.shortest_paths(35, 34))
print(G.shortest_paths(34, 35))

In [ ]:
ig.plot(infomap, vertex_label=[df.loc[v.index]['user_id'] for v in G.vs])

In [ ]:
tweets = pd.read_csv('data/%s/1741-7007-12-36-tweets.csv' % dataset)
edgelist = pd.read_csv('data/%s/1741-7007-12-36-edgelist.tsv' % dataset, sep='\t', header=None, names=['in', 'out'])
edgelist[(edgelist['in'].isin(tweets.user_id_str.unique())) | (edgelist['out'].isin(tweets.user_id_str.unique()))].to_csv('data/tmp.tsv', index=False, sep='\t', header=False)
G = ig.Graph.Read_Ncol('data/tmp.tsv', names=True, directed=True)

In [ ]:
set([v['name'] for v in G.vs]).difference(tweets.user_id_str)

In [ ]:
ig.plot(G.community_infomap())

In [ ]:


In [ ]:
tweets.head()

In [ ]:
infomap.modularity

In [ ]:
df.groupby('membership').size().head()

In [ ]:
df.groupby(['membership', 'usertype']).size().multiply(100.0).divide(df.groupby('membership').size())

In [ ]:
len(df) #.groupby('usertype').size()

In [ ]:
gb['p'] = gb

In [ ]:
gb

In [ ]:
# {v: k for k, v in tweeters.iteritems() if v in [11607, 4916, 6622, 4868]}
for v in [11607, 4916, 6622, 4868]:
    for k, v2 in tweeters.items():
        if v == v2:
            print(k, v)

In [ ]:
edgelist = pd.read_csv('data/evolBioAllFollowers.csv', dtype={'in': str, 'out': str, 'doi': str}).drop_duplicates()
e = pd.read_csv('data/evolBioAllFriends.csv', dtype={'in': str, 'out': str, 'doi': str}).drop_duplicates()
edgelist = edgelist.append(e)
del e

edgelist = edgelist[edgelist.doi == doi]

In [ ]:
len(edgelist.drop_duplicates())

In [ ]:
for v in G.vs:
    if v.degree() > 40000:
        print(v['name'], v.degree(), v)
        biomed = v
        break

In [ ]:
df.degree.plot(style='o', logx=True, logy=True, alpha=.5)

In [ ]:
counts, bins = np.histogram(df.degree, density=True)

plt.bar(bins[:-1], counts/float(sum(counts)), width=bins[1]-bins[0])
plt.ylabel("fraction of nodes")

plt.bar(bins[:-1], counts/float(sum(counts)), width=bins[1]-bins[0], log=True)
# plt.hist(ks, bins=np.arange(min(ks), max(ks)), normed=True, log=True)
plt.xlabel("degree")
plt.ylabel("fraction of nodes")

In [ ]:
maxdegfound = int(np.ceil(max(bins)))
counts, bins = np.histogram(df.degree, bins=maxdegfound)


countsnozero = counts*1.
countsnozero[counts == 0] = -np.inf

plt.figure()
plt.scatter(bins[:-1], countsnozero/float(sum(counts)), s=60)
plt.yscale('log')
plt.xscale('log')
plt.ylim(0.00008, 1.1)
plt.xlim(0.8, 1100)
plt.xlabel('degree')
plt.ylabel("fraction of nodes")
plt.subplots_adjust(bottom=0.15)

In [ ]:
df.degree.value_counts().head()

In [ ]:
df.degree.value_counts(normalize=False)

In [ ]:
plt.loglog(df.degree.value_counts(normalize=True).sort_index(), marker='o')

In [ ]:
# rebuild the combined edge list by concatenating the per-DOI edge-list files
e = pd.DataFrame()

total_edges = 0

for doi in dois:
    filename = 'data/%s/%s-edgelist.csv' % (dataset, doi.replace('/','_'))
    df = pd.read_csv(filename)
    total_edges += len(df)
    e = e.append(df)

In [ ]:
e.to_csv('data/%s/all_dois-edgelist.csv' % dataset, index=False)

In [ ]:
e = e.drop_duplicates()
e.to_csv(filename, index=False)

In [ ]:
filename = 'tmp.edgelist'
e.to_csv('tmp.edgelist', index=False, sep="\t", header=None) # keep as TSV for iGraph
print('wrote')
allG = ig.Graph.Read_Ncol('tmp.edgelist', names=True, directed=True)

In [ ]:
tweetdetails.columns

In [ ]:
tweets = tweetdetails
tweets['event_number'] = tweets.index.map(lambda x: tweets.index.get_loc(x))

tweeters = tweetdetails.user_id.unique().astype(str)

In [ ]:
G = allG.subgraph(tweeters)

In [ ]:
filename = 'data/%s/all_dois-subgraph-edgelist.csv' % dataset
G.write_ncol(filename)
# df = pd.read_csv(filename, sep=" ", header=None)
# df.columns = ['Source', 'Target']
# df.to_csv(filename, index=False)
# del df

In [ ]:
infomap = G.community_infomap()

In [ ]:
from collections import Counter
cnt = Counter(infomap.membership)

In [ ]:
infomap.modularity

In [ ]:
user_id_screen_name_map = {str(k):v for (k,v) in zip(list(users_df.index), list(users_df.screen_name))}

In [ ]:


In [ ]:
memberships = {k:v for (k,v) in zip([user_id_screen_name_map[v['name']] for v in G.vs], infomap.membership)}

In [ ]:
s = pd.Series(infomap.membership, index=[user_id_screen_name_map[v['name']] for v in G.vs])
s.name='community_number'
s.to_excel('data/comm/whole_network_infomap_community_memberships.xlsx')

In [ ]:
cnt_reverse = {v:k for (k,v) in cnt.items()}

In [ ]:
sum([cnt_reverse[x+1] for x in range(10)])

In [ ]:
cnt_reverse

In [ ]:
sum([1520, 300, 273, 261, 211, 199, 193, 134, 134, 120])

In [ ]:
sorted(cnt.values(), reverse=True)[0:30]

In [ ]:
sns.distplot(list(cnt.values()))

In [ ]:
print("%s\t%s\t%s" % (doi, G.vcount(), G.ecount()))

graph_stats = {}
graph_stats['density'] = G.density()
graph_stats['num_nodes'] = G.vcount()
graph_stats['num_edges'] = G.ecount()
graph_stats['diameter'] = G.diameter()

graph_stats['in_degree_mean'] = np.mean(G.indegree())
graph_stats['out_degree_mean'] = np.mean(G.outdegree())
graph_stats['degree_mean'] = np.mean(G.degree())

wccs = sorted(G.components(mode=ig.WEAK).subgraphs(), key=lambda g: g.vcount(), reverse=True)
graph_stats['biggest_wcc_num_nodes'] = wccs[0].vcount()
graph_stats['biggest_wcc_num_nodes_p'] = wccs[0].vcount()*100.0/G.vcount()
graph_stats['biggest_wcc_density'] = wccs[0].density()
graph_stats['biggest_wcc_infomap_modularity'] = wccs[0].community_infomap().modularity

In [ ]:
graph_stats


In [ ]:
tweetdetails = pd.read_csv('data/BMCevolBioTweetDetails.txt', encoding='utf8', sep="\t", index_col='tweet_id')
tweetdetails.sort_index(inplace=True)

# dois = dois + list(tweetdetails.doi.unique())
dois = list(tweetdetails.doi.unique())
print(dois)

In [ ]:
tweetdetails = pd.read_csv('data/BMCBioTweetDetails.txt', encoding='utf8', sep="\t", index_col='tweet_id')
tweetdetails.sort_index(inplace=True)

dois = list(tweetdetails.doi.unique())
print(dois)

In [ ]:
tweetdetails = pd.read_csv('data/BMCBioTweetDetails.csv', encoding='utf8', index_col='tweet_id')
tweetdetails.sort_index(inplace=True)

dois = list(tweetdetails.doi.unique())

tweetdetails['created_at'] = pd.to_datetime(tweetdetails.created_at)
tweetdetails['user_id_str'] = tweetdetails.user_id_str.astype(int)

con = sqlite3.connect("data/BMCbioSample.db")
friends = pd.read_sql_query("SELECT * FROM friends", con, index_col="user_id")
friends.index = friends.index.astype(int)
followers = pd.read_sql_query("SELECT * FROM followers", con, index_col="user_id")
followers.index = followers.index.astype(int)

df = tweetdetails[['doi', 'user_id_str']].drop_duplicates().set_index('user_id_str').join(friends)[['friend_id', 'doi']]
# df.index.rename('user_id', inplace=True)

edgelist = df.reset_index()
edgelist.columns = ['in', 'out', 'doi']
edgelist = edgelist.drop_duplicates()

tweetdetails_BMC = tweetdetails
edgelist_BMC = edgelist

graphs = {}
for doi in dois: 
    e = edgelist[edgelist.doi == doi].copy()
    del e['doi']
    
    filename = 'data/%s/%s-edgelist.csv' % (dataset, doi.replace('/','_'))
    e.columns = ['Source', 'Target']
    e.to_csv(filename, index=False, sep="\t", header=None) # keep as TSV for iGraph
    graphs[doi] = ig.Graph.Read_Ncol(filename, names=True, directed=True)
    e.to_csv(filename, index=False) 

tweetdetails = pd.read_csv('data/BMCevolBioTweetDetails.csv', encoding='utf8', index_col='tweet_id')
tweetdetails.sort_index(inplace=True)

# dois = dois + list(tweetdetails.doi.unique())
dois = list(tweetdetails.doi.unique())

tweetdetails['created_at'] = pd.to_datetime(tweetdetails.created_at)
tweetdetails['user_id_str'] = tweetdetails.user_id_str.astype(int)

edgelist = pd.read_csv('data/evolBioAllFollowers.csv', dtype={'in': str, 'out': str, 'doi': str}).drop_duplicates()
e = pd.read_csv('data/evolBioAllFriends.csv', dtype={'in': str, 'out': str, 'doi': str}).drop_duplicates()
edgelist = edgelist.append(e).drop_duplicates()
del e

tweetdetails = tweetdetails.append(tweetdetails_BMC)
del tweetdetails_BMC
edgelist = edgelist.append(edgelist_BMC)
del edgelist_BMC

for doi in dois:  
    e = edgelist[edgelist.doi == doi].copy()
    del e['doi']

    filename = 'data/%s/%s-edgelist.csv' % (dataset, doi.replace('/','_'))
    e.columns = ['Source', 'Target']
    e.to_csv(filename, index=False, sep="\t", header=None)
    graphs[doi] = ig.Graph.Read_Ncol(filename, names=True, directed=True)
    e.to_csv(filename, index=False)
#     os.remove('data/edgelist.tsv')

dois = list(tweetdetails.doi.unique())