In [7]:
import pandas as pd
import re
import json
import sqlite3
from datetime import datetime 
import time

import goslate
gs = goslate.Goslate()



litecon = lite.connect('data/BMCbioSample.db')

In [8]:
evolBioCon = sqlite3.connect("data/BMCevolBioSample.db")
bioCon = sqlite3.connect("data/BMCbioSample.db")
BMCcon = sqlite3.connect("data/bmcTwitter.db")

In [9]:
with BMCcon:
    with evolBioCon:
        litecur = evolBioCon.cursor()
        litecur.execute("SELECT * FROM sample")
	# this will be big-ish in memory but shouldn't be a major issue, the whole SQLite db is about 60mb after one full run.
	journals = litecur.fetchall()

In [3]:
df = pd.read_sql("SELECT doi, tweet_id, old_screen_name, tweet FROM sample WHERE tweet IS NOT NULL ", BMCcon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)

df['created_at'] = df.tweet.apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(x['created_at'],'%a %b %d %H:%M:%S +0000 %Y')))
df['created_at_dayofweek'] = df.tweet.apply(lambda x: x['created_at'][0:3])
df['screen_name'] = df.tweet.apply(lambda x: x['user']['screen_name'])
df['user_id_str'] = df.tweet.apply(lambda x: x['user']['id_str'])
df['user_utc_offset'] = df.tweet.apply(lambda x: x['user']['utc_offset'])
df['user_followers_count'] = df.tweet.apply(lambda x: x['user']['followers_count'])
df['user_friends_count'] = df.tweet.apply(lambda x: x['user']['friends_count'])
df['user_description'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['user']['description']).strip())
df['is_retweet'] = df.tweet.apply(lambda x: 'retweeted_status' in x)
df['retweet_of_status_id_str'] = df.tweet.apply(lambda x: x['retweeted_status']['id_str'] if 'retweeted_status' in x else None)
df['retweet_of_screen_name'] = df.tweet.apply(lambda x: x['retweeted_status']['user']['screen_name'] if 'retweeted_status' in x else None)
df['is_reply'] = df.tweet.apply(lambda x: x['in_reply_to_status_id'] != None)
df['in_reply_to_status_id_str'] = df.tweet.apply(lambda x: x['in_reply_to_status_id_str'])
df['in_reply_to_screen_name'] = df.tweet.apply(lambda x: x['in_reply_to_screen_name'])
df['text'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['text']).strip()) # remove commas for CSV simplicity

In [4]:
df = pd.read_sql("SELECT doi, tweet_id, old_screen_name, tweet FROM sample WHERE tweet IS NOT NULL ", litecon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)


Out[4]:
tweet_id
461555014155399169    {u'contributors': None, u'truncated': False, u...
461693999657406465    {u'contributors': None, u'truncated': False, u...
461694813633400833    {u'contributors': None, u'truncated': False, u...
461717719851995136    {u'contributors': None, u'truncated': False, u...
461725753449529344    {u'contributors': None, u'truncated': False, u...
461959994586316800    {u'contributors': None, u'truncated': False, u...
461968973714960384    {u'contributors': None, u'truncated': False, u...
462128800068014080    {u'contributors': None, u'truncated': False, u...
462186377652994048    {u'contributors': None, u'truncated': False, u...
462240236706082817    {u'contributors': None, u'truncated': False, u...
462221070502596608    {u'contributors': None, u'truncated': False, u...
462222345873403904    {u'contributors': None, u'truncated': False, u...
462223447041794049    {u'contributors': None, u'truncated': False, u...
462264568140423168    {u'contributors': None, u'truncated': False, u...
462264079315644417    {u'contributors': None, u'truncated': False, u...
462293980290772992    {u'contributors': None, u'truncated': False, u...
462638169402646529    {u'contributors': None, u'truncated': False, u...
462673621799161858    {u'contributors': None, u'truncated': False, u...
462682526054907904    {u'contributors': None, u'truncated': False, u...
462853270558752768    {u'contributors': None, u'truncated': False, u...
462537114668630016    {u'contributors': None, u'truncated': False, u...
462886105856344064    {u'contributors': None, u'truncated': False, u...
462928780936228864    {u'contributors': None, u'truncated': False, u...
462952327293313024    {u'contributors': None, u'truncated': False, u...
463741815473332224    {u'contributors': None, u'truncated': False, u...
463702424935333888    {u'contributors': None, u'truncated': False, u...
463709417943207937    {u'contributors': None, u'truncated': False, u...
464321534749069312    {u'contributors': None, u'truncated': False, u...
464321955718774785    {u'contributors': None, u'truncated': False, u...
464320896225009664    {u'contributors': None, u'truncated': False, u...
                                            ...                        
564544318850215939    {u'contributors': None, u'truncated': False, u...
567317244599074816    {u'contributors': None, u'truncated': False, u...
573172838187855872    {u'contributors': None, u'truncated': False, u...
573157736214618112    {u'contributors': None, u'truncated': False, u...
532469123704299520    {u'contributors': None, u'truncated': False, u...
532628101466165248    {u'contributors': None, u'truncated': False, u...
532469884085886976    {u'contributors': None, u'truncated': False, u...
532865470270304258    {u'contributors': None, u'truncated': False, u...
552489093087903746    {u'contributors': None, u'truncated': False, u...
587996010074378240    {u'contributors': None, u'truncated': False, u...
587998690750681088    {u'contributors': None, u'truncated': False, u...
588019589600141312    {u'contributors': None, u'truncated': False, u...
588056493796286465    {u'contributors': None, u'truncated': False, u...
594084038282993664    {u'contributors': None, u'truncated': False, u...
599214196685983745    {u'contributors': None, u'truncated': False, u...
599222980489703425    {u'contributors': None, u'truncated': False, u...
599226621476888577    {u'contributors': None, u'truncated': False, u...
599214433626365952    {u'contributors': None, u'truncated': False, u...
599218617956311042    {u'contributors': None, u'truncated': False, u...
608614649290977280    {u'contributors': None, u'truncated': False, u...
608615253274963968    {u'contributors': None, u'truncated': False, u...
608616831331401728    {u'contributors': None, u'truncated': False, u...
608621871035232257    {u'contributors': None, u'truncated': False, u...
608623089060958208    {u'contributors': None, u'truncated': False, u...
608624409318969344    {u'contributors': None, u'truncated': False, u...
608631454206205952    {u'contributors': None, u'truncated': False, u...
608633081629675521    {u'contributors': None, u'truncated': False, u...
608686104196202496    {u'contributors': None, u'truncated': False, u...
608646298938441728    {u'contributors': None, u'truncated': False, u...
608997839306252288    {u'contributors': None, u'truncated': False, u...
Name: tweet, dtype: object

In [19]:
try:
    del df['tweet']
except:
    pass
df.to_csv('data/BMCBioTweetDetails.csv', encoding='utf8')

In [10]:
df = pd.read_sql("SELECT tweet_id, tweet FROM sample WHERE tweet IS NOT NULL ", litecon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)
df['screen_name'] = df.tweet.apply(lambda x: x['user']['screen_name'])
df['user_description'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['user']['description']).strip())
df['user_followers_count'] = df.tweet.apply(lambda x: x['user']['followers_count'])
df['user_friends_count'] = df.tweet.apply(lambda x: x['user']['friends_count'])
df.set_index('screen_name', inplace=True)
# df['user_description_english'] = df.user_description.map(lambda x: x if all(ord(c) < 128 for c in x) else gs.translate(x, 'en'))
try:
    del df['tweet']
    del df['tweet_id']
except:
    pass
df.drop_duplicates()
df.to_csv('data/BMCevolBioUserDetails.txt', encoding='utf8', sep='\t')

In [ ]:
df.ix['Mycorampage']['user_description']

In [3]:
df[df['screen_name'] == 'HomareZuki']


Out[3]:
doi tweet created_at created_at_dayofweek screen_name user_id_str user_utc_offset user_followers_count user_friends_count user_description is_retweet retweet_of_status_id_str retweet_of_screen_name is_reply in_reply_to_status_id_str in_reply_to_screen_name text
tweet_id
537918381341417473 10.1186/preaccept-2055025475136453 {u'contributors': None, u'truncated': False, u... 2014-11-27 10:38:30 Thu HomareZuki 267046963 -36000.0 170 139 False None None False None None 『葉っぱにそっくりな(擬態した)蝶の模様はどのように進化してきたのか?』を明らかにした論文が...

In [48]:
biodf = pd.read_csv('data/evolBioAllFollowers.csv', header=None )
biodf.columns = ['source', 'target', 'doi']
biodf['source'] = biodf.source.astype('str')
biodf['target'] = biodf.target.astype('str')

In [57]:
m = 0
print 'original: ', len(set(df[df.retweet_of_status_id_str == '537918381341417473'].user_id_str).intersection(biodf[biodf.source == '267046963'].target))
for orig_follower in biodf[biodf.source == '267046963'].target:
    n = len(set(df[df.retweet_of_status_id_str == '537918381341417473'].user_id_str).intersection(biodf[biodf.source == orig_follower].target))
    if n > 0:
        print orig_follower, n
    m += n
    
print m


original:  15
104751612 12
118986590 14
124202460 23
128126656 2
1307483972 11
142668234 16
1941370328 5
22768088 4
231054900 32
264231916 12
2798665298 1
280053568 17
290380223 13
384771903 16
42029826 8
45511718 14
55086704 6
578134773 3
584316680 13
90175777 3
225

In [ ]:


In [39]:
len(set(df[df.retweet_of_status_id_str == '537918381341417473'].user_id_str).intersection()


Out[39]:
15

In [42]:
biodf[['source', 'target']


Out[42]:
source target
822791 2866211152 1000088682
822792 2866211152 102389906
822793 2866211152 10342612
822794 2866211152 104266865
822795 2866211152 110890208
822796 2866211152 111247159
822797 2866211152 112407655
822798 2866211152 1134478200
822799 2866211152 1134777486
822800 2866211152 1140602832
822801 2866211152 1151200609
822802 2866211152 1154354713
822803 2866211152 11644532
822804 2866211152 116500726
822805 2866211152 1177423436
822806 2866211152 122774232
822807 2866211152 1248055861
822808 2866211152 124808725
822809 2866211152 126648937
822810 2866211152 1281425954
822811 2866211152 1307643499
822812 2866211152 1311693414
822813 2866211152 1315345560
822814 2866211152 1327865594
822815 2866211152 13487232
822816 2866211152 1352628000
822817 2866211152 1380279390
822818 2866211152 1382442535
822819 2866211152 1388592445
822820 2866211152 139704061
... ... ...
1009672 3179559956 1353901686
1009673 3179559956 14619266
1009674 3179559956 1521131330
1009675 3179559956 1553560477
1009676 3179559956 1930068564
1009677 3179559956 26250653
1009678 3179559956 2783018151
1009679 3179559956 2992411640
1009680 3179559956 2993349182
1009681 3179559956 2993372706
1009682 3179559956 2994543308
1009683 3179559956 3004892850
1009684 3179559956 3041785896
1009685 3179559956 3044895046
1009686 3179559956 3094993882
1009687 3179559956 3140773679
1009688 3179559956 3141958667
1009689 3179559956 3160352090
1009690 3179559956 3163077408
1009691 3179559956 3165609344
1009692 3179559956 3171625525
1009693 3179559956 3180849740
1009694 3179559956 3182367656
1009695 3179559956 3217859356
1009696 3179559956 3240486046
1009697 3179559956 3296752641
1009698 3179559956 39397234
1009699 3179559956 49245189
1009700 3179559956 552318037
1009701 3179559956 710225819331915776

186911 rows × 2 columns