In [1]:
import pandas as pd
import simplejson as json

In [4]:
dados = []
with open('tweets.json') as arquivo:
    for line in arquivo:
        dados.append(json.loads(line))

In [5]:
len(dados)


Out[5]:
205

In [6]:
dados[0]


Out[6]:
{'contributors': None,
 'coordinates': None,
 'created_at': 'Mon Aug 01 23:41:03 +0000 2016',
 'entities': {'hashtags': [],
  'media': [{'display_url': 'pic.twitter.com/LC5FZyOweu',
    'expanded_url': 'http://twitter.com/folha/status/760248632385871873/photo/1',
    'id': 760248617693306880,
    'id_str': '760248617693306880',
    'indices': [125, 140],
    'media_url': 'http://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
    'sizes': {'large': {'h': 512, 'resize': 'fit', 'w': 768},
     'medium': {'h': 512, 'resize': 'fit', 'w': 768},
     'small': {'h': 453, 'resize': 'fit', 'w': 680},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'source_status_id': 760248632385871873,
    'source_status_id_str': '760248632385871873',
    'source_user_id': 14594813,
    'source_user_id_str': '14594813',
    'type': 'photo',
    'url': 'https://t.co/LC5FZyOweu'}],
  'symbols': [],
  'urls': [{'display_url': 'uol.com/bpjRNL',
    'expanded_url': 'http://uol.com/bpjRNL',
    'indices': [101, 124],
    'url': 'https://t.co/VHl4DGhwHd'}],
  'user_mentions': [{'id': 14594813,
    'id_str': '14594813',
    'indices': [3, 9],
    'name': 'Folha de S.Paulo',
    'screen_name': 'folha'}]},
 'extended_entities': {'media': [{'display_url': 'pic.twitter.com/LC5FZyOweu',
    'expanded_url': 'http://twitter.com/folha/status/760248632385871873/photo/1',
    'id': 760248617693306880,
    'id_str': '760248617693306880',
    'indices': [125, 140],
    'media_url': 'http://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
    'sizes': {'large': {'h': 512, 'resize': 'fit', 'w': 768},
     'medium': {'h': 512, 'resize': 'fit', 'w': 768},
     'small': {'h': 453, 'resize': 'fit', 'w': 680},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'source_status_id': 760248632385871873,
    'source_status_id_str': '760248632385871873',
    'source_user_id': 14594813,
    'source_user_id_str': '14594813',
    'type': 'photo',
    'url': 'https://t.co/LC5FZyOweu'}]},
 'favorite_count': 0,
 'favorited': False,
 'filter_level': 'low',
 'geo': None,
 'id': 760259076790087680,
 'id_str': '760259076790087680',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'pt',
 'place': None,
 'possibly_sensitive': False,
 'retweet_count': 0,
 'retweeted': False,
 'retweeted_status': {'contributors': None,
  'coordinates': None,
  'created_at': 'Mon Aug 01 22:59:33 +0000 2016',
  'entities': {'hashtags': [],
   'media': [{'display_url': 'pic.twitter.com/LC5FZyOweu',
     'expanded_url': 'http://twitter.com/folha/status/760248632385871873/photo/1',
     'id': 760248617693306880,
     'id_str': '760248617693306880',
     'indices': [114, 137],
     'media_url': 'http://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
     'media_url_https': 'https://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
     'sizes': {'large': {'h': 512, 'resize': 'fit', 'w': 768},
      'medium': {'h': 512, 'resize': 'fit', 'w': 768},
      'small': {'h': 453, 'resize': 'fit', 'w': 680},
      'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
     'type': 'photo',
     'url': 'https://t.co/LC5FZyOweu'}],
   'symbols': [],
   'urls': [{'display_url': 'uol.com/bpjRNL',
     'expanded_url': 'http://uol.com/bpjRNL',
     'indices': [90, 113],
     'url': 'https://t.co/VHl4DGhwHd'}],
   'user_mentions': []},
  'extended_entities': {'media': [{'display_url': 'pic.twitter.com/LC5FZyOweu',
     'expanded_url': 'http://twitter.com/folha/status/760248632385871873/photo/1',
     'id': 760248617693306880,
     'id_str': '760248617693306880',
     'indices': [114, 137],
     'media_url': 'http://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
     'media_url_https': 'https://pbs.twimg.com/media/CozyFv6XYAAK6xv.jpg',
     'sizes': {'large': {'h': 512, 'resize': 'fit', 'w': 768},
      'medium': {'h': 512, 'resize': 'fit', 'w': 768},
      'small': {'h': 453, 'resize': 'fit', 'w': 680},
      'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
     'type': 'photo',
     'url': 'https://t.co/LC5FZyOweu'}]},
  'favorite_count': 207,
  'favorited': False,
  'filter_level': 'low',
  'geo': None,
  'id': 760248632385871873,
  'id_str': '760248632385871873',
  'in_reply_to_screen_name': None,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'is_quote_status': False,
  'lang': 'pt',
  'place': None,
  'possibly_sensitive': False,
  'retweet_count': 125,
  'retweeted': False,
  'source': '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>',
  'text': "'Queria deixá-la ciente do quanto ela prejudicou minha carreira', diz Biel sobre repórter https://t.co/VHl4DGhwHd https://t.co/LC5FZyOweu",
  'truncated': False,
  'user': {'contributors_enabled': False,
   'created_at': 'Wed Apr 30 02:33:39 +0000 2008',
   'default_profile': False,
   'default_profile_image': False,
   'description': 'Perfil oficial do jornal Folha de S.Paulo.\r\nNo Facebook: http://on.fb.me/wFiUk9.\r\nAtendimento: @folha_atende',
   'favourites_count': 43,
   'follow_request_sent': None,
   'followers_count': 4923055,
   'following': None,
   'friends_count': 190365,
   'geo_enabled': True,
   'id': 14594813,
   'id_str': '14594813',
   'is_translator': False,
   'lang': 'en',
   'listed_count': 9072,
   'location': 'São Paulo, Brazil',
   'name': 'Folha de S.Paulo',
   'notifications': None,
   'profile_background_color': 'C0DEED',
   'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/341066973/fundo_folha.gif',
   'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/341066973/fundo_folha.gif',
   'profile_background_tile': False,
   'profile_banner_url': 'https://pbs.twimg.com/profile_banners/14594813/1452190707',
   'profile_image_url': 'http://pbs.twimg.com/profile_images/2611288520/gfiw6lklhvm09f530wp4_normal.jpeg',
   'profile_image_url_https': 'https://pbs.twimg.com/profile_images/2611288520/gfiw6lklhvm09f530wp4_normal.jpeg',
   'profile_link_color': '0084B4',
   'profile_sidebar_border_color': 'C0DEED',
   'profile_sidebar_fill_color': 'DDEEF6',
   'profile_text_color': '333333',
   'profile_use_background_image': True,
   'protected': False,
   'screen_name': 'folha',
   'statuses_count': 243496,
   'time_zone': 'Brasilia',
   'url': 'http://www.folha.com.br',
   'utc_offset': -10800,
   'verified': True}},
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'text': "RT @folha: 'Queria deixá-la ciente do quanto ela prejudicou minha carreira', diz Biel sobre repórter https://t.co/VHl4DGhwHd https://t.co/L…",
 'timestamp_ms': '1470094863494',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Wed Jan 06 23:16:44 +0000 2016',
  'default_profile': True,
  'default_profile_image': False,
  'description': 'the worst things in life come free to us',
  'favourites_count': 357,
  'follow_request_sent': None,
  'followers_count': 1355,
  'following': None,
  'friends_count': 887,
  'geo_enabled': False,
  'id': 4729069361,
  'id_str': '4729069361',
  'is_translator': False,
  'lang': 'en',
  'listed_count': 2,
  'location': 'ob oitnb htgawm mf',
  'name': 'mi',
  'notifications': None,
  'profile_background_color': 'F5F8FA',
  'profile_background_image_url': '',
  'profile_background_image_url_https': '',
  'profile_background_tile': False,
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4729069361/1470093412',
  'profile_image_url': 'http://pbs.twimg.com/profile_images/760253074204155904/srqHS1gh_normal.jpg',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/760253074204155904/srqHS1gh_normal.jpg',
  'profile_link_color': '2B7BB9',
  'profile_sidebar_border_color': 'C0DEED',
  'profile_sidebar_fill_color': 'DDEEF6',
  'profile_text_color': '333333',
  'profile_use_background_image': True,
  'protected': False,
  'screen_name': 'badvxuse',
  'statuses_count': 7917,
  'time_zone': 'Pacific Time (US & Canada)',
  'url': None,
  'utc_offset': -25200,
  'verified': False}}

In [7]:
type(dados[0])


Out[7]:
dict

In [8]:
df = pd.DataFrame(dados)

In [12]:
df.columns


Out[12]:
Index(['contributors', 'coordinates', 'created_at', 'entities',
       'extended_entities', 'favorite_count', 'favorited', 'filter_level',
       'geo', 'id', 'id_str', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
       'lang', 'place', 'possibly_sensitive', 'quoted_status',
       'quoted_status_id', 'quoted_status_id_str', 'retweet_count',
       'retweeted', 'retweeted_status', 'source', 'text', 'timestamp_ms',
       'truncated', 'user'],
      dtype='object')

In [14]:
df.head()


Out[14]:
contributors coordinates created_at entities extended_entities favorite_count favorited filter_level geo id ... quoted_status_id quoted_status_id_str retweet_count retweeted retweeted_status source text timestamp_ms truncated user
0 None None Mon Aug 01 23:41:03 +0000 2016 {'hashtags': [], 'urls': [{'url': 'https://t.c... {'media': [{'id': 760248617693306880, 'indices... 0 False low None 760259076790087680 ... NaN NaN 0 False {'in_reply_to_screen_name': None, 'created_at'... <a href="http://twitter.com/download/iphone" r... RT @folha: 'Queria deixá-la ciente do quanto e... 1470094863494 False {'profile_text_color': '333333', 'friends_coun...
1 None None Mon Aug 01 23:41:04 +0000 2016 {'hashtags': [], 'urls': [{'url': 'https://t.c... NaN 0 False low None 760259080451678208 ... 7.602486e+17 760248632385871873 0 False NaN <a href="http://twitter.com/download/iphone" r... gente??????????????????????????? https://t.co/... 1470094864367 False {'profile_text_color': '000000', 'friends_coun...
2 None None Mon Aug 01 23:41:06 +0000 2016 {'hashtags': [], 'urls': [{'url': 'https://t.c... {'media': [{'id': 760248617693306880, 'indices... 0 False low None 760259088081182720 ... NaN NaN 0 False {'in_reply_to_screen_name': None, 'created_at'... <a href="http://twitter.com/download/iphone" r... RT @folha: 'Queria deixá-la ciente do quanto e... 1470094866186 False {'profile_text_color': '333333', 'friends_coun...
3 None None Mon Aug 01 23:41:07 +0000 2016 {'hashtags': [], 'urls': [], 'symbols': [], 'u... NaN 0 False low None 760259093848289280 ... NaN NaN 0 False {'in_reply_to_screen_name': 'folha', 'created_... <a href="https://about.twitter.com/products/tw... RT @bk_contador: @folha e @Estadao se revezand... 1470094867561 False {'profile_text_color': '333333', 'friends_coun...
4 None None Mon Aug 01 23:41:07 +0000 2016 {'hashtags': [], 'urls': [{'url': 'https://t.c... NaN 0 False low None 760259095517626368 ... 7.602486e+17 760248632385871873 0 False NaN <a href="http://twitter.com" rel="nofollow">Tw... FALOU POUCO MAS FALOU MERDA HEIN https://t.co/... 1470094867959 False {'profile_text_color': '000000', 'friends_coun...

5 rows × 31 columns


In [19]:
df.user[0]


Out[19]:
{'contributors_enabled': False,
 'created_at': 'Wed Jan 06 23:16:44 +0000 2016',
 'default_profile': True,
 'default_profile_image': False,
 'description': 'the worst things in life come free to us',
 'favourites_count': 357,
 'follow_request_sent': None,
 'followers_count': 1355,
 'following': None,
 'friends_count': 887,
 'geo_enabled': False,
 'id': 4729069361,
 'id_str': '4729069361',
 'is_translator': False,
 'lang': 'en',
 'listed_count': 2,
 'location': 'ob oitnb htgawm mf',
 'name': 'mi',
 'notifications': None,
 'profile_background_color': 'F5F8FA',
 'profile_background_image_url': '',
 'profile_background_image_url_https': '',
 'profile_background_tile': False,
 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4729069361/1470093412',
 'profile_image_url': 'http://pbs.twimg.com/profile_images/760253074204155904/srqHS1gh_normal.jpg',
 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/760253074204155904/srqHS1gh_normal.jpg',
 'profile_link_color': '2B7BB9',
 'profile_sidebar_border_color': 'C0DEED',
 'profile_sidebar_fill_color': 'DDEEF6',
 'profile_text_color': '333333',
 'profile_use_background_image': True,
 'protected': False,
 'screen_name': 'badvxuse',
 'statuses_count': 7917,
 'time_zone': 'Pacific Time (US & Canada)',
 'url': None,
 'utc_offset': -25200,
 'verified': False}

In [ ]:


In [26]:
print(df.text[0])
print(df.created_at[0])
print(df.coordinates[0])
print(df.retweet_count[0])

print('\nUser\n')
print(df.user[0]['screen_name'])
print(df.user[0]['location'])
print(df.user[0]['lang'])
print(df.user[0]['followers_count'])


RT @folha: 'Queria deixá-la ciente do quanto ela prejudicou minha carreira', diz Biel sobre repórter https://t.co/VHl4DGhwHd https://t.co/L…
Mon Aug 01 23:41:03 +0000 2016
None
0

User

badvxuse
ob oitnb htgawm mf
en
1355

In [27]:
colunas = ['text','created_at','coordinates','retweet_count','screen_name','location', 'lang','followers_count']

In [28]:
dado2 =[
        df.text[0],
df.created_at[0],
df.coordinates[0],
df.retweet_count[0],
df.user[0]['screen_name'],
df.user[0]['location'],
df.user[0]['lang'],
df.user[0]['followers_count'],
]

In [29]:
dado2


Out[29]:
["RT @folha: 'Queria deixá-la ciente do quanto ela prejudicou minha carreira', diz Biel sobre repórter https://t.co/VHl4DGhwHd https://t.co/L…",
 'Mon Aug 01 23:41:03 +0000 2016',
 None,
 0,
 'badvxuse',
 'ob oitnb htgawm mf',
 'en',
 1355]

In [30]:
df_aux = pd.DataFrame()

In [31]:
series_aux = pd.Series(dado2,index=colunas)

In [32]:
df_aux.append(series_aux,ignore_index=True)


Out[32]:
coordinates created_at followers_count lang location retweet_count screen_name text
0 None Mon Aug 01 23:41:03 +0000 2016 1355.0 en ob oitnb htgawm mf 0.0 badvxuse RT @folha: 'Queria deixá-la ciente do quanto e...

In [33]:
df_aux


Out[33]:

In [34]:
from geopy.geocoders import Nominatim

In [35]:
geolocalizador = Nominatim()

In [38]:
def pegarLatLng (local):
    geolocalizador = Nominatim()
    try:
        localizacao = geolocalizador.geocode(local)
        return (localizacao.latitude,localizacao.longitude)
    except:
        return 0

In [41]:
pegarLatLng('sudao do sul')


Out[41]:
(7.8699431, 29.6667897)

In [42]:
palavras = df.text[0]

In [43]:
palavras


Out[43]:
"RT @folha: 'Queria deixá-la ciente do quanto ela prejudicou minha carreira', diz Biel sobre repórter https://t.co/VHl4DGhwHd https://t.co/L…"

In [44]:
for palavra in palavras.split():
    if palavra.startswith('#'):
        print(palavra)

In [47]:
def salvar_hashtags(texto):
    aux=[]
    for palavra in texto.split():
        if palavra.startswith('#'):
            aux.append(palavra)
    converter = ' '.join(aux)
    return converter

In [48]:
colunas = ['text','created_at','coordinates','retweet_count','screen_name','location', 'lang','followers_count','lat','lng','hashtags']

In [49]:
dado2 =[
    df.text[0],
    df.created_at[0],
    df.coordinates[0],
    df.retweet_count[0],
    df.user[0]['screen_name'],
    df.user[0]['location'],
    df.user[0]['lang'],
    df.user[0]['followers_count'],
        
]

In [50]:
df_final = pd.DataFrame(columns=colunas)

In [51]:
df_final


Out[51]:
text created_at coordinates retweet_count screen_name location lang followers_count lat lng hashtags

In [ ]:
for i in range(0, len(df)):
    if df.user[i]['location'] != None:
        latLong  = pegarLatLng(df.user[i]['location'])
        if latLong!= 0:
            dados = [
                df.text[i],
                df.created_at[i],
                df.coordinates[i],
                df.retweet_count[i],
                df.user[i]['screen_name'],
                df.user[i]['location'],
                df.user[i]['lang'],
                df.user[i]['followers_count'],
                latLong[0],
                latLong[1],
                salvar_hashtags(df.text[i])
            ]
            print(i, end=" ")
            series = pd.Series(dados,index=colunas)
            df_final = df_final.append(series,ignore_index=True)


1 3 4 5 6 7 8 9 10 12 15 

In [ ]:
df_final.to_csv('stream.csv',sep=';',index=False)