In [2]:
from pprint import pprint
import re
import requests
from pymongo import MongoClient
from bson import ObjectId
import re
# Matches Twitter's t.co short links (http or https).
# The short code is limited to ASCII letters and digits. The range `A-z` is
# avoided on purpose: it also covers the punctuation that sits between 'Z'
# and 'a' in ASCII ([ \ ] ^ _ `), which would glue a trailing bracket or
# underscore onto the matched link. A raw string keeps `\.` a regex escape.
regex = re.compile(r'https?://t\.co/[A-Za-z0-9]*')
# Open a client against the MongoDB server and select its `twitter` database.
# NOTE(review): host and port are hard-coded here; consider moving them to
# configuration before sharing the notebook.
conn = MongoClient(host='laisky.com', port=27016)
db = conn['twitter']
In [ ]:
# Liberal URL matcher, compiled in verbose + case-insensitive mode.
# Every piece is a raw string: in a normal string literal "\b" is a backspace
# character (0x08) rather than the regex word-boundary assertion, which would
# stop the pattern from ever matching ordinary text.
# Only group 1 (the whole URL) captures; the helper groups are non-capturing so
# that findall()/finditer() yield plain URL strings instead of tuples.
url_regex = re.compile(
    r"(?xi)"
    r"\b"
    r"("                                          # Capture 1: entire matched URL
    r"(?:"
    r"https?://"                                  # http or https protocol
    r"|"                                          # or
    r"www\d{0,3}[.]"                              # "www.", "www1.", "www2." ... "www999."
    r"|"                                          # or
    r"[a-z0-9.\-]+[.][a-z]{2,4}/"                 # looks like domain name followed by a slash
    r")"
    r"(?:"                                        # One or more:
    r"[^\s()<>]+"                                 # run of non-space, non-()<>
    r"|"                                          # or
    r"\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)"     # balanced parens, up to 2 levels
    r")+"
    r"(?:"                                        # End with:
    r"\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)"     # balanced parens, up to 2 levels
    r"|"                                          # or
    r"[^\s`!()\[\]{};:'\".,<>?«»\“\”‘’]"          # not a space or one of these punct chars
    r")"
    r")"
)


def find_urls(text):
    """Return the URLs found in ``text``, in order of appearance.

    Args:
        text: the string to scan.

    Returns:
        A list of ``str``, one entry per match of ``url_regex`` (capture
        group 1, i.e. the whole URL). Empty when nothing matches.
    """
    return [match.group(1) for match in url_regex.finditer(text)]
def _pad_spans_with_spaces(text, spans):
    """Return ``text`` with a space inserted on each side of every span that lacks one.

    Args:
        text: the original string.
        spans: iterable of ``(start, end)`` offsets into ``text`` (end exclusive).

    Returns:
        The padded string. No space is added at the very start or end of the
        string, nor where a literal ' ' is already adjacent to the span.
    """
    # Go right-to-left so that inserting a character never shifts the offsets
    # of spans that are still waiting to be processed.
    for start, end in sorted(spans, reverse=True):
        if end < len(text) and text[end] != ' ':
            text = text[:end] + ' ' + text[end:]
        if start > 0 and text[start - 1] != ' ':
            text = text[:start] + ' ' + text[start:]
    return text


# Make sure every URL inside a tweet's text is separated from its neighbours
# by spaces, and store the padded text back on the document.
# NOTE(review): `timeout=False` and `Collection.update` look like the PyMongo 2.x
# spellings (newer drivers use `no_cursor_timeout=True` / `update_one`) - confirm
# against the installed driver before re-running.
count = 0
for docu in db.tweets.find(timeout=False):
    original_text = docu['text']
    if 'http' not in original_text:
        continue
    if count % 50 == 0:
        print('finished {}'.format(count))
    # Use match offsets instead of str.index(): index() always returns the first
    # occurrence, which pads the wrong place when a URL is repeated or is a
    # prefix of another URL in the same tweet.
    url_spans = [match.span(1) for match in url_regex.finditer(original_text)]
    padded_text = _pad_spans_with_spaces(original_text, url_spans)
    if padded_text != original_text:
        # Only `text` changed, so only `text` is written back.
        db.tweets.update({'_id': docu['_id']}, {'$set': {'text': padded_text}})
    count += 1
In [11]:
def _substitute_matches(text, matches, replacements):
    """Replace each regex match in ``text`` with the replacement at the same position.

    Args:
        text: the string the matches were found in.
        matches: match objects over ``text``, in left-to-right order.
        replacements: replacement strings, paired with ``matches`` by position.

    Returns:
        The rewritten string. Each replacement is separated from neighbouring
        text by a space; no space is added at the start or end of the string
        or where a literal ' ' is already adjacent.
    """
    # Right-to-left keeps the offsets of the remaining matches valid.
    for match, replacement in reversed(list(zip(matches, replacements))):
        start, end = match.span()
        head, tail = text[:start], text[end:]
        if head and not head.endswith(' '):
            head += ' '
        if tail and not tail.startswith(' '):
            tail = ' ' + tail
        text = head + replacement + tail
    return text


# Replace t.co short links in each tweet's text with the entries of the
# document's comma-separated `expanded_urls` field, pairing them by position.
# NOTE(review): splitting on ',' assumes no expanded URL contains a comma - confirm.
count = 0
for docu in db.tweets.find():
    text = docu['text']
    # `expanded_urls` is a field of the document, so test the document's keys
    # (a substring test on the tweet text would skip almost every tweet).
    if 'expanded_urls' not in docu or 't.co' not in text:
        continue
    if count % 50 == 0:
        print('finished {}'.format(count))
    if not docu['expanded_urls']:
        # Nothing to substitute; show the odd document and move on rather than
        # calling .split() on an empty/None value.
        pprint(docu)
        continue
    eurls = docu['expanded_urls'].split(',')
    short_matches = list(regex.finditer(text))
    if len(eurls) != len(short_matches):
        # Retry with duplicates removed (first occurrence wins, order kept).
        seen = set()
        eurls = [u for u in eurls if not (u in seen or seen.add(u))]
        if len(eurls) != len(short_matches):
            # Still cannot pair short and expanded URLs one-to-one: skip.
            continue
    # Position-based substitution: str.replace() would rewrite every occurrence
    # at once, breaking on repeated short links and on the bare
    # 'http://t.co/' match produced by truncated text.
    new_text = _substitute_matches(text, short_matches, eurls)
    db.tweets.update({'_id': docu['_id']}, {'$set': {'text': new_text}})
    count += 1
In [8]:
# For tweets that carry an `entities` field, replace each short link in the
# text with the URL recorded on the matching entity: `media_url` for media
# entries, `expanded_url` for url entries (media first).
count = 0
for docu in db.tweets.find({'entities': {'$exists': 1}}):
    text = docu['text']
    if 't.co' not in text:
        continue
    if count % 50 == 0:
        print('finished {}'.format(count))
    # `$exists` also matches documents whose `entities` is null.
    entities = docu['entities'] or {}
    for list_key, target_key in (('media', 'media_url'), ('urls', 'expanded_url')):
        for entity in entities.get(list_key) or []:
            text = text.replace(entity['url'], entity[target_key])
    if text != docu['text']:
        # Only `text` changed, so only `text` is written back.
        db.tweets.update({'_id': docu['_id']}, {'$set': {'text': text}})
    count += 1
finished 0
{'_id': ObjectId('5491176e98dc5b91bc0583a2'),
'contributors': None,
'coordinates': None,
'created_at': datetime.datetime(2014, 12, 17, 5, 40, 56),
'entities': {'hashtags': [],
'media': [{'display_url': 'pic.twitter.com/sbouqsUjcW',
'expanded_url': 'http://twitter.com/examisboring/status/545021141564133376/photo/1',
'id': 545021140242952192,
'id_str': '545021140242952192',
'indices': [60, 82],
'media_url': 'http://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'sizes': {'large': {'h': 750,
'resize': 'fit',
'w': 422},
'medium': {'h': 750,
'resize': 'fit',
'w': 422},
'small': {'h': 604,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'source_status_id': 545021141564133376,
'source_status_id_str': '545021141564133376',
'source_user_id': 15262465,
'source_user_id_str': '15262465',
'type': 'photo',
'url': 'http://t.co/sbouqsUjcW'}],
'symbols': [],
'urls': [],
'user_mentions': [{'id': 15262465,
'id_str': '15262465',
'indices': [3, 16],
'name': '鱼也可以喵 已瞎',
'screen_name': 'examisboring'}]},
'extended_entities': {'media': [{'display_url': 'pic.twitter.com/sbouqsUjcW',
'expanded_url': 'http://twitter.com/examisboring/status/545021141564133376/photo/1',
'id': 545021140242952192,
'id_str': '545021140242952192',
'indices': [60, 82],
'media_url': 'http://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'sizes': {'large': {'h': 750,
'resize': 'fit',
'w': 422},
'medium': {'h': 750,
'resize': 'fit',
'w': 422},
'small': {'h': 604,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'source_status_id': 545021141564133376,
'source_status_id_str': '545021141564133376',
'source_user_id': 15262465,
'source_user_id_str': '15262465',
'type': 'photo',
'url': 'http://t.co/sbouqsUjcW'}]},
'favorite_count': 0,
'favorited': False,
'geo': None,
'id': 545091250525700096,
'id_str': '545091250525700096',
'in_reply_to_screen_name': None,
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'lang': 'en',
'place': None,
'possibly_sensitive': False,
'retweet_count': 8,
'retweeted': True,
'retweeted_status': {'contributors': None,
'coordinates': None,
'created_at': 'Wed Dec 17 01:02:20 +0000 2014',
'entities': {'hashtags': [],
'media': [{'display_url': 'pic.twitter.com/sbouqsUjcW',
'expanded_url': 'http://twitter.com/examisboring/status/545021141564133376/photo/1',
'id': 545021140242952192,
'id_str': '545021140242952192',
'indices': [42, 64],
'media_url': 'http://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'sizes': {'large': {'h': 750,
'resize': 'fit',
'w': 422},
'medium': {'h': 750,
'resize': 'fit',
'w': 422},
'small': {'h': 604,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'type': 'photo',
'url': 'http://t.co/sbouqsUjcW'}],
'symbols': [],
'urls': [],
'user_mentions': []},
'extended_entities': {'media': [{'display_url': 'pic.twitter.com/sbouqsUjcW',
'expanded_url': 'http://twitter.com/examisboring/status/545021141564133376/photo/1',
'id': 545021140242952192,
'id_str': '545021140242952192',
'indices': [42, 64],
'media_url': 'http://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'sizes': {'large': {'h': 750,
'resize': 'fit',
'w': 422},
'medium': {'h': 750,
'resize': 'fit',
'w': 422},
'small': {'h': 604,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'type': 'photo',
'url': 'http://t.co/sbouqsUjcW'}]},
'favorite_count': 2,
'favorited': False,
'geo': None,
'id': 545021141564133376,
'id_str': '545021141564133376',
'in_reply_to_screen_name': None,
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'lang': 'en',
'place': None,
'possibly_sensitive': False,
'retweet_count': 8,
'retweeted': True,
'source': '<a href="http://tapbots.com/tweetbot" '
'rel="nofollow">Tweetbot for iΟS</a>',
'text': 'A wild exponential function had appeared! '
'http://t.co/sbouqsUjcW',
'truncated': False,
'user': {'contributors_enabled': False,
'created_at': 'Sat Jun 28 11:52:41 +0000 '
'2008',
'default_profile': False,
'default_profile_image': False,
'description': '宅鱼 双瞳异色大爱 Apple\uf8ff '
'可能一直宅下去 本体是鱼 没童年 ʘ‿ʘ No '
'Life \n'
'长着猫耳会走路的鱼 \n'
'头像即本人',
'entities': {'description': {'urls': []},
'url': {'urls': [{'display_url': 'about.me/examisboring',
'expanded_url': 'http://about.me/examisboring',
'indices': [0,
22],
'url': 'http://t.co/JMyHj3Vsew'}]}},
'favourites_count': 189,
'follow_request_sent': False,
'followers_count': 1142,
'following': False,
'friends_count': 535,
'geo_enabled': True,
'id': 15262465,
'id_str': '15262465',
'is_translation_enabled': False,
'is_translator': False,
'lang': 'en',
'listed_count': 29,
'location': 'La Jolla, CA',
'name': '鱼也可以喵 已瞎',
'notifications': False,
'profile_background_color': 'C0DEED',
'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/685969804/7f2e0dd206ddb76f586d90fb37a60ddd.jpeg',
'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/685969804/7f2e0dd206ddb76f586d90fb37a60ddd.jpeg',
'profile_background_tile': True,
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/15262465/1414549428',
'profile_image_url': 'http://pbs.twimg.com/profile_images/378800000272533011/37421e203a7831eb9dae005d98728ac1_normal.jpeg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/378800000272533011/37421e203a7831eb9dae005d98728ac1_normal.jpeg',
'profile_link_color': '0084B4',
'profile_location': None,
'profile_sidebar_border_color': 'C0DEED',
'profile_sidebar_fill_color': 'DDEEF6',
'profile_text_color': '333333',
'profile_use_background_image': True,
'protected': False,
'screen_name': 'examisboring',
'statuses_count': 25098,
'time_zone': 'Alaska',
'url': 'http://t.co/JMyHj3Vsew',
'utc_offset': -32400,
'verified': False}},
'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web '
'Client</a>',
'text': 'RT @examisboring: A wild exponential function had appeared! '
'http://pbs.twimg.com/media/B5BN1uNCcAAKRFi.jpg',
'topics': [],
'truncated': False,
'user': {'contributors_enabled': False,
'created_at': 'Sat Jan 16 03:02:39 +0000 2010',
'default_profile': False,
'default_profile_image': False,
'description': '热爱文学的理科生,现在是程序猿( @yanagi_76 © )',
'entities': {'description': {'urls': []},
'url': {'urls': [{'display_url': 'blog.laisky.us',
'expanded_url': 'http://blog.laisky.us/',
'indices': [0, 22],
'url': 'http://t.co/fqaY66B4cw'}]}},
'favourites_count': 49,
'follow_request_sent': False,
'followers_count': 1518,
'following': False,
'friends_count': 224,
'geo_enabled': True,
'id': 105351466,
'id_str': '105351466',
'is_translation_enabled': False,
'is_translator': False,
'lang': 'en',
'listed_count': 67,
'location': 'Chengdu ♥ Shanghai',
'name': 'Laisky',
'notifications': False,
'profile_background_color': '022330',
'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/440394293438472192/wpI6_rYC.jpeg',
'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/440394293438472192/wpI6_rYC.jpeg',
'profile_background_tile': False,
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/105351466/1353200881',
'profile_image_url': 'http://pbs.twimg.com/profile_images/378800000062981150/89794fcaabdd287af05b3b8ff6d02ad2_normal.jpeg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/378800000062981150/89794fcaabdd287af05b3b8ff6d02ad2_normal.jpeg',
'profile_link_color': '0084B4',
'profile_location': None,
'profile_sidebar_border_color': 'FFFFFF',
'profile_sidebar_fill_color': 'C0DFEC',
'profile_text_color': '333333',
'profile_use_background_image': True,
'protected': False,
'screen_name': 'ppcelery',
'statuses_count': 17276,
'time_zone': 'Quito',
'url': 'http://t.co/fqaY66B4cw',
'utc_offset': -18000,
'verified': False}}
In [5]:
# Fetch one stored tweet by its ObjectId and pretty-print the whole document.
docu = db['tweets'].find_one({'_id': ObjectId('54a74d222374fca232bfa818')})
text = docu['text']  # the tweet text of that document
pprint(docu)
{'_id': ObjectId('54a74d222374fca232bfa818'),
'contributors': None,
'coordinates': None,
'created_at': datetime.datetime(2015, 1, 3, 1, 23, 40),
'entities': {'hashtags': [],
'media': [{'display_url': 'pic.twitter.com/uTWrfuDEz7',
'expanded_url': 'http://twitter.com/NASA/status/551185158578397184/photo/1',
'id': 551185158108631040,
'id_str': '551185158108631040',
'indices': [139, 140],
'media_url': 'http://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'sizes': {'large': {'h': 576,
'resize': 'fit',
'w': 1024},
'medium': {'h': 337,
'resize': 'fit',
'w': 600},
'small': {'h': 191,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'source_status_id': 551185158578397184,
'source_status_id_str': '551185158578397184',
'source_user_id': 11348282,
'source_user_id_str': '11348282',
'type': 'photo',
'url': 'http://t.co/uTWrfuDEz7'}],
'symbols': [],
'urls': [{'display_url': 'on.wsj.com/1K805sA',
'expanded_url': 'http://on.wsj.com/1K805sA',
'indices': [104, 126],
'url': 'http://t.co/22ekGvZP3f'}],
'user_mentions': [{'id': 11348282,
'id_str': '11348282',
'indices': [3, 8],
'name': 'NASA',
'screen_name': 'NASA'}]},
'extended_entities': {'media': [{'display_url': 'pic.twitter.com/uTWrfuDEz7',
'expanded_url': 'http://twitter.com/NASA/status/551185158578397184/photo/1',
'id': 551185158108631040,
'id_str': '551185158108631040',
'indices': [139, 140],
'media_url': 'http://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'sizes': {'large': {'h': 576,
'resize': 'fit',
'w': 1024},
'medium': {'h': 337,
'resize': 'fit',
'w': 600},
'small': {'h': 191,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'source_status_id': 551185158578397184,
'source_status_id_str': '551185158578397184',
'source_user_id': 11348282,
'source_user_id_str': '11348282',
'type': 'photo',
'url': 'http://t.co/uTWrfuDEz7'}]},
'favorite_count': 0,
'favorited': False,
'geo': None,
'id': 551187103820685313,
'id_str': '551187103820685313',
'in_reply_to_screen_name': None,
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'lang': 'en',
'place': None,
'possibly_sensitive': False,
'retweet_count': 1006,
'retweeted': True,
'retweeted_status': {'contributors': None,
'coordinates': None,
'created_at': 'Sat Jan 03 01:15:56 +0000 2015',
'entities': {'hashtags': [],
'media': [{'display_url': 'pic.twitter.com/uTWrfuDEz7',
'expanded_url': 'http://twitter.com/NASA/status/551185158578397184/photo/1',
'id': 551185158108631040,
'id_str': '551185158108631040',
'indices': [117, 139],
'media_url': 'http://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'sizes': {'large': {'h': 576,
'resize': 'fit',
'w': 1024},
'medium': {'h': 337,
'resize': 'fit',
'w': 600},
'small': {'h': 191,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'type': 'photo',
'url': 'http://t.co/uTWrfuDEz7'}],
'symbols': [],
'urls': [{'display_url': 'on.wsj.com/1K805sA',
'expanded_url': 'http://on.wsj.com/1K805sA',
'indices': [94, 116],
'url': 'http://t.co/22ekGvZP3f'}],
'user_mentions': []},
'extended_entities': {'media': [{'display_url': 'pic.twitter.com/uTWrfuDEz7',
'expanded_url': 'http://twitter.com/NASA/status/551185158578397184/photo/1',
'id': 551185158108631040,
'id_str': '551185158108631040',
'indices': [117,
139],
'media_url': 'http://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'media_url_https': 'https://pbs.twimg.com/media/B6Yz-yKIUAABy7x.jpg',
'sizes': {'large': {'h': 576,
'resize': 'fit',
'w': 1024},
'medium': {'h': 337,
'resize': 'fit',
'w': 600},
'small': {'h': 191,
'resize': 'fit',
'w': 340},
'thumb': {'h': 150,
'resize': 'crop',
'w': 150}},
'type': 'photo',
'url': 'http://t.co/uTWrfuDEz7'}]},
'favorite_count': 1173,
'favorited': False,
'geo': None,
'id': 551185158578397184,
'id_str': '551185158578397184',
'in_reply_to_screen_name': None,
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'lang': 'en',
'place': None,
'possibly_sensitive': False,
'retweet_count': 1006,
'retweeted': True,
'source': '<a href="http://twitter.com" '
'rel="nofollow">Twitter Web Client</a>',
'text': '"America has begun an exciting new chapter '
'in human space exploration," Admin. Charles '
'Bolden http://t.co/22ekGvZP3f '
'http://t.co/uTWrfuDEz7',
'truncated': False,
'user': {'contributors_enabled': False,
'created_at': 'Wed Dec 19 20:20:32 +0000 '
'2007',
'default_profile': False,
'default_profile_image': False,
'description': 'Explore the universe and '
'discover our home planet '
'with @NASA. We usually post '
'in EST (UTC-5).',
'entities': {'description': {'urls': []},
'url': {'urls': [{'display_url': 'nasa.gov',
'expanded_url': 'http://www.nasa.gov',
'indices': [0,
22],
'url': 'http://t.co/TcEE6NS8nD'}]}},
'favourites_count': 408,
'follow_request_sent': False,
'followers_count': 8406914,
'following': True,
'friends_count': 230,
'geo_enabled': True,
'id': 11348282,
'id_str': '11348282',
'is_translation_enabled': False,
'is_translator': False,
'lang': 'en',
'listed_count': 71332,
'location': '',
'name': 'NASA',
'notifications': False,
'profile_background_color': 'FFFFFF',
'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/378800000070768280/1e9d3d155ba7cb623d541c764c5ef9c0.jpeg',
'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/378800000070768280/1e9d3d155ba7cb623d541c764c5ef9c0.jpeg',
'profile_background_tile': False,
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/11348282/1419279625',
'profile_image_url': 'http://pbs.twimg.com/profile_images/188302352/nasalogo_twitter_normal.jpg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/188302352/nasalogo_twitter_normal.jpg',
'profile_link_color': '205BA7',
'profile_location': None,
'profile_sidebar_border_color': '000000',
'profile_sidebar_fill_color': 'F3F2F2',
'profile_text_color': '000000',
'profile_use_background_image': True,
'protected': False,
'screen_name': 'NASA',
'statuses_count': 34639,
'time_zone': 'Eastern Time (US & Canada)',
'url': 'http://t.co/TcEE6NS8nD',
'utc_offset': -18000,
'verified': True}},
'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web '
'Client</a>',
'text': 'RT @NASA: "America has begun an exciting new chapter in human '
'space exploration," Admin. Charles Bolden http://t.co/22ekGvZP3f '
'http://t.co/…',
'topics': [],
'truncated': False,
'user': {'contributors_enabled': False,
'created_at': 'Sat Jan 16 03:02:39 +0000 2010',
'default_profile': False,
'default_profile_image': False,
'description': '热爱文学的理科生,现在是程序猿( @yanagi_76 © )',
'entities': {'description': {'urls': []},
'url': {'urls': [{'display_url': 'blog.laisky.us',
'expanded_url': 'http://blog.laisky.us/',
'indices': [0, 22],
'url': 'http://t.co/fqaY66B4cw'}]}},
'favourites_count': 49,
'follow_request_sent': False,
'followers_count': 1547,
'following': False,
'friends_count': 225,
'geo_enabled': True,
'id': 105351466,
'id_str': '105351466',
'is_translation_enabled': False,
'is_translator': False,
'lang': 'en',
'listed_count': 66,
'location': 'Chengdu ♥ Shanghai',
'name': 'Laisky',
'notifications': False,
'profile_background_color': '022330',
'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/440394293438472192/wpI6_rYC.jpeg',
'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/440394293438472192/wpI6_rYC.jpeg',
'profile_background_tile': False,
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/105351466/1353200881',
'profile_image_url': 'http://pbs.twimg.com/profile_images/378800000062981150/89794fcaabdd287af05b3b8ff6d02ad2_normal.jpeg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/378800000062981150/89794fcaabdd287af05b3b8ff6d02ad2_normal.jpeg',
'profile_link_color': '0084B4',
'profile_location': None,
'profile_sidebar_border_color': 'FFFFFF',
'profile_sidebar_fill_color': 'C0DFEC',
'profile_text_color': '333333',
'profile_use_background_image': True,
'protected': False,
'screen_name': 'ppcelery',
'statuses_count': 17323,
'time_zone': 'Quito',
'url': 'http://t.co/fqaY66B4cw',
'utc_offset': -18000,
'verified': False}}
Content source: Laisky/laishime2
Similar notebooks: