notebook.community

Edit and run



In [7]:

    
import pandas as pd
import re
import json
import sqlite3
from datetime import datetime 
import time

import goslate
gs = goslate.Goslate()



litecon = lite.connect('data/BMCbioSample.db')



In [8]:

    
evolBioCon = sqlite3.connect("data/BMCevolBioSample.db")
bioCon = sqlite3.connect("data/BMCbioSample.db")
BMCcon = sqlite3.connect("data/bmcTwitter.db")



In [9]:

    
with BMCcon:
    with evolBioCon:
        litecur = evolBioCon.cursor()
        litecur.execute("SELECT * FROM sample")
	# this will be big-ish in memory but shouldn't be a major issue, the whole SQLite db is about 60mb after one full run.
	journals = litecur.fetchall()



In [3]:

    
df = pd.read_sql("SELECT doi, tweet_id, old_screen_name, tweet FROM sample WHERE tweet IS NOT NULL ", BMCcon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)

df['created_at'] = df.tweet.apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(x['created_at'],'%a %b %d %H:%M:%S +0000 %Y')))
df['created_at_dayofweek'] = df.tweet.apply(lambda x: x['created_at'][0:3])
df['screen_name'] = df.tweet.apply(lambda x: x['user']['screen_name'])
df['user_id_str'] = df.tweet.apply(lambda x: x['user']['id_str'])
df['user_utc_offset'] = df.tweet.apply(lambda x: x['user']['utc_offset'])
df['user_followers_count'] = df.tweet.apply(lambda x: x['user']['followers_count'])
df['user_friends_count'] = df.tweet.apply(lambda x: x['user']['friends_count'])
df['user_description'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['user']['description']).strip())
df['is_retweet'] = df.tweet.apply(lambda x: 'retweeted_status' in x)
df['retweet_of_status_id_str'] = df.tweet.apply(lambda x: x['retweeted_status']['id_str'] if 'retweeted_status' in x else None)
df['retweet_of_screen_name'] = df.tweet.apply(lambda x: x['retweeted_status']['user']['screen_name'] if 'retweeted_status' in x else None)
df['is_reply'] = df.tweet.apply(lambda x: x['in_reply_to_status_id'] != None)
df['in_reply_to_status_id_str'] = df.tweet.apply(lambda x: x['in_reply_to_status_id_str'])
df['in_reply_to_screen_name'] = df.tweet.apply(lambda x: x['in_reply_to_screen_name'])
df['text'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['text']).strip()) # remove commas for CSV simplicity



In [4]:

    
df = pd.read_sql("SELECT doi, tweet_id, old_screen_name, tweet FROM sample WHERE tweet IS NOT NULL ", litecon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)









    Out[4]:





tweet_id
461555014155399169    {u'contributors': None, u'truncated': False, u...
461693999657406465    {u'contributors': None, u'truncated': False, u...
461694813633400833    {u'contributors': None, u'truncated': False, u...
461717719851995136    {u'contributors': None, u'truncated': False, u...
461725753449529344    {u'contributors': None, u'truncated': False, u...
461959994586316800    {u'contributors': None, u'truncated': False, u...
461968973714960384    {u'contributors': None, u'truncated': False, u...
462128800068014080    {u'contributors': None, u'truncated': False, u...
462186377652994048    {u'contributors': None, u'truncated': False, u...
462240236706082817    {u'contributors': None, u'truncated': False, u...
462221070502596608    {u'contributors': None, u'truncated': False, u...
462222345873403904    {u'contributors': None, u'truncated': False, u...
462223447041794049    {u'contributors': None, u'truncated': False, u...
462264568140423168    {u'contributors': None, u'truncated': False, u...
462264079315644417    {u'contributors': None, u'truncated': False, u...
462293980290772992    {u'contributors': None, u'truncated': False, u...
462638169402646529    {u'contributors': None, u'truncated': False, u...
462673621799161858    {u'contributors': None, u'truncated': False, u...
462682526054907904    {u'contributors': None, u'truncated': False, u...
462853270558752768    {u'contributors': None, u'truncated': False, u...
462537114668630016    {u'contributors': None, u'truncated': False, u...
462886105856344064    {u'contributors': None, u'truncated': False, u...
462928780936228864    {u'contributors': None, u'truncated': False, u...
462952327293313024    {u'contributors': None, u'truncated': False, u...
463741815473332224    {u'contributors': None, u'truncated': False, u...
463702424935333888    {u'contributors': None, u'truncated': False, u...
463709417943207937    {u'contributors': None, u'truncated': False, u...
464321534749069312    {u'contributors': None, u'truncated': False, u...
464321955718774785    {u'contributors': None, u'truncated': False, u...
464320896225009664    {u'contributors': None, u'truncated': False, u...
                                            ...                        
564544318850215939    {u'contributors': None, u'truncated': False, u...
567317244599074816    {u'contributors': None, u'truncated': False, u...
573172838187855872    {u'contributors': None, u'truncated': False, u...
573157736214618112    {u'contributors': None, u'truncated': False, u...
532469123704299520    {u'contributors': None, u'truncated': False, u...
532628101466165248    {u'contributors': None, u'truncated': False, u...
532469884085886976    {u'contributors': None, u'truncated': False, u...
532865470270304258    {u'contributors': None, u'truncated': False, u...
552489093087903746    {u'contributors': None, u'truncated': False, u...
587996010074378240    {u'contributors': None, u'truncated': False, u...
587998690750681088    {u'contributors': None, u'truncated': False, u...
588019589600141312    {u'contributors': None, u'truncated': False, u...
588056493796286465    {u'contributors': None, u'truncated': False, u...
594084038282993664    {u'contributors': None, u'truncated': False, u...
599214196685983745    {u'contributors': None, u'truncated': False, u...
599222980489703425    {u'contributors': None, u'truncated': False, u...
599226621476888577    {u'contributors': None, u'truncated': False, u...
599214433626365952    {u'contributors': None, u'truncated': False, u...
599218617956311042    {u'contributors': None, u'truncated': False, u...
608614649290977280    {u'contributors': None, u'truncated': False, u...
608615253274963968    {u'contributors': None, u'truncated': False, u...
608616831331401728    {u'contributors': None, u'truncated': False, u...
608621871035232257    {u'contributors': None, u'truncated': False, u...
608623089060958208    {u'contributors': None, u'truncated': False, u...
608624409318969344    {u'contributors': None, u'truncated': False, u...
608631454206205952    {u'contributors': None, u'truncated': False, u...
608633081629675521    {u'contributors': None, u'truncated': False, u...
608686104196202496    {u'contributors': None, u'truncated': False, u...
608646298938441728    {u'contributors': None, u'truncated': False, u...
608997839306252288    {u'contributors': None, u'truncated': False, u...
Name: tweet, dtype: object



In [19]:

    
try:
    del df['tweet']
except:
    pass
df.to_csv('data/BMCBioTweetDetails.csv', encoding='utf8')



In [10]:

    
df = pd.read_sql("SELECT tweet_id, tweet FROM sample WHERE tweet IS NOT NULL ", litecon, index_col='tweet_id')
df['tweet'] = df.tweet.apply(lambda x: json.loads(x) if x is not None else None)
df['screen_name'] = df.tweet.apply(lambda x: x['user']['screen_name'])
df['user_description'] = df.tweet.apply(lambda x: re.sub( '\s+', ' ', x['user']['description']).strip())
df['user_followers_count'] = df.tweet.apply(lambda x: x['user']['followers_count'])
df['user_friends_count'] = df.tweet.apply(lambda x: x['user']['friends_count'])
df.set_index('screen_name', inplace=True)
# df['user_description_english'] = df.user_description.map(lambda x: x if all(ord(c) < 128 for c in x) else gs.translate(x, 'en'))
try:
    del df['tweet']
    del df['tweet_id']
except:
    pass
df.drop_duplicates()
df.to_csv('data/BMCevolBioUserDetails.txt', encoding='utf8', sep='\t')



In [ ]:

    
df.ix['Mycorampage']['user_description']



In [3]:

    
df[df['screen_name'] == 'HomareZuki']









    Out[3]:






  
    
      
      doi
      tweet
      created_at
      created_at_dayofweek
      screen_name
      user_id_str
      user_utc_offset
      user_followers_count
      user_friends_count
      user_description
      is_retweet
      retweet_of_status_id_str
      retweet_of_screen_name
      is_reply
      in_reply_to_status_id_str
      in_reply_to_screen_name
      text
    
    
      tweet_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      537918381341417473
      10.1186/preaccept-2055025475136453
      {u'contributors': None, u'truncated': False, u...
      2014-11-27 10:38:30
      Thu
      HomareZuki
      267046963
      -36000.0
      170
      139
      
      False
      None
      None
      False
      None
      None
      『葉っぱにそっくりな（擬態した）蝶の模様はどのように進化してきたのか？』を明らかにした論文が...



In [48]:

    
biodf = pd.read_csv('data/evolBioAllFollowers.csv', header=None )
biodf.columns = ['source', 'target', 'doi']
biodf['source'] = biodf.source.astype('str')
biodf['target'] = biodf.target.astype('str')



In [57]:

    
m = 0
print 'original: ', len(set(df[df.retweet_of_status_id_str == '537918381341417473'].user_id_str).intersection(biodf[biodf.source == '267046963'].target))
for orig_follower in biodf[biodf.source == '267046963'].target:
    n = len(set(df[df.retweet_of_status_id_str == '537918381341417473'].user_id_str).intersection(biodf[biodf.source == orig_follower].target))
    if n > 0:
        print orig_follower, n
    m += n
    
print m









    



original:  15
104751612 12
118986590 14
124202460 23
128126656 2
1307483972 11
142668234 16
1941370328 5
22768088 4
231054900 32
264231916 12
2798665298 1
280053568 17
290380223 13
384771903 16
42029826 8
45511718 14
55086704 6
578134773 3
584316680 13
90175777 3
225



In [ ]:



In [39]:

    
len(set(df[df.retweet_of_status_id_str == '537918381341417473'].user_id_str).intersection()









    Out[39]:





15



In [42]:

    
biodf[['source', 'target']









    Out[42]:






  
    
      
      source
      target
    
  
  
    
      822791
      2866211152
      1000088682
    
    
      822792
      2866211152
      102389906
    
    
      822793
      2866211152
      10342612
    
    
      822794
      2866211152
      104266865
    
    
      822795
      2866211152
      110890208
    
    
      822796
      2866211152
      111247159
    
    
      822797
      2866211152
      112407655
    
    
      822798
      2866211152
      1134478200
    
    
      822799
      2866211152
      1134777486
    
    
      822800
      2866211152
      1140602832
    
    
      822801
      2866211152
      1151200609
    
    
      822802
      2866211152
      1154354713
    
    
      822803
      2866211152
      11644532
    
    
      822804
      2866211152
      116500726
    
    
      822805
      2866211152
      1177423436
    
    
      822806
      2866211152
      122774232
    
    
      822807
      2866211152
      1248055861
    
    
      822808
      2866211152
      124808725
    
    
      822809
      2866211152
      126648937
    
    
      822810
      2866211152
      1281425954
    
    
      822811
      2866211152
      1307643499
    
    
      822812
      2866211152
      1311693414
    
    
      822813
      2866211152
      1315345560
    
    
      822814
      2866211152
      1327865594
    
    
      822815
      2866211152
      13487232
    
    
      822816
      2866211152
      1352628000
    
    
      822817
      2866211152
      1380279390
    
    
      822818
      2866211152
      1382442535
    
    
      822819
      2866211152
      1388592445
    
    
      822820
      2866211152
      139704061
    
    
      ...
      ...
      ...
    
    
      1009672
      3179559956
      1353901686
    
    
      1009673
      3179559956
      14619266
    
    
      1009674
      3179559956
      1521131330
    
    
      1009675
      3179559956
      1553560477
    
    
      1009676
      3179559956
      1930068564
    
    
      1009677
      3179559956
      26250653
    
    
      1009678
      3179559956
      2783018151
    
    
      1009679
      3179559956
      2992411640
    
    
      1009680
      3179559956
      2993349182
    
    
      1009681
      3179559956
      2993372706
    
    
      1009682
      3179559956
      2994543308
    
    
      1009683
      3179559956
      3004892850
    
    
      1009684
      3179559956
      3041785896
    
    
      1009685
      3179559956
      3044895046
    
    
      1009686
      3179559956
      3094993882
    
    
      1009687
      3179559956
      3140773679
    
    
      1009688
      3179559956
      3141958667
    
    
      1009689
      3179559956
      3160352090
    
    
      1009690
      3179559956
      3163077408
    
    
      1009691
      3179559956
      3165609344
    
    
      1009692
      3179559956
      3171625525
    
    
      1009693
      3179559956
      3180849740
    
    
      1009694
      3179559956
      3182367656
    
    
      1009695
      3179559956
      3217859356
    
    
      1009696
      3179559956
      3240486046
    
    
      1009697
      3179559956
      3296752641
    
    
      1009698
      3179559956
      39397234
    
    
      1009699
      3179559956
      49245189
    
    
      1009700
      3179559956
      552318037
    
    
      1009701
      3179559956
      710225819331915776
    
  

186911 rows × 2 columns

	doi	tweet	created_at	created_at_dayofweek	screen_name	user_id_str	user_utc_offset	user_followers_count	user_friends_count	user_description	is_retweet	retweet_of_status_id_str	retweet_of_screen_name	is_reply	in_reply_to_status_id_str	in_reply_to_screen_name	text
tweet_id
537918381341417473	10.1186/preaccept-2055025475136453	{u'contributors': None, u'truncated': False, u...	2014-11-27 10:38:30	Thu	HomareZuki	267046963	-36000.0	170	139		False	None	None	False	None	None	『葉っぱにそっくりな（擬態した）蝶の模様はどのように進化してきたのか？』を明らかにした論文が...

	source	target
822791	2866211152	1000088682
822792	2866211152	102389906
822793	2866211152	10342612
822794	2866211152	104266865
822795	2866211152	110890208
822796	2866211152	111247159
822797	2866211152	112407655
822798	2866211152	1134478200
822799	2866211152	1134777486
822800	2866211152	1140602832
822801	2866211152	1151200609
822802	2866211152	1154354713
822803	2866211152	11644532
822804	2866211152	116500726
822805	2866211152	1177423436
822806	2866211152	122774232
822807	2866211152	1248055861
822808	2866211152	124808725
822809	2866211152	126648937
822810	2866211152	1281425954
822811	2866211152	1307643499
822812	2866211152	1311693414
822813	2866211152	1315345560
822814	2866211152	1327865594
822815	2866211152	13487232
822816	2866211152	1352628000
822817	2866211152	1380279390
822818	2866211152	1382442535
822819	2866211152	1388592445
822820	2866211152	139704061
...	...	...
1009672	3179559956	1353901686
1009673	3179559956	14619266
1009674	3179559956	1521131330
1009675	3179559956	1553560477
1009676	3179559956	1930068564
1009677	3179559956	26250653
1009678	3179559956	2783018151
1009679	3179559956	2992411640
1009680	3179559956	2993349182
1009681	3179559956	2993372706
1009682	3179559956	2994543308
1009683	3179559956	3004892850
1009684	3179559956	3041785896
1009685	3179559956	3044895046
1009686	3179559956	3094993882
1009687	3179559956	3140773679
1009688	3179559956	3141958667
1009689	3179559956	3160352090
1009690	3179559956	3163077408
1009691	3179559956	3165609344
1009692	3179559956	3171625525
1009693	3179559956	3180849740
1009694	3179559956	3182367656
1009695	3179559956	3217859356
1009696	3179559956	3240486046
1009697	3179559956	3296752641
1009698	3179559956	39397234
1009699	3179559956	49245189
1009700	3179559956	552318037
1009701	3179559956	710225819331915776