In [1]:
%load_ext cypher
%matplotlib inline
import pandas as pd

Number of Tweets


In [2]:
%cypher match (n:tweet) return count(n)


1 rows affected.
Out[2]:
count(n)
337174

Number of users


In [3]:
%cypher match (u:user) return count(u)


1 rows affected.
Out[3]:
count(u)
99482

Top Tweets


In [4]:
%%cypher
match (n:tweet)-[r]-()
with n, count(r) as deg
order by deg desc
limit 10
match (n)<-[:TWEETS]-(u:user)
return u.screen_name as user, n.tid as tid, substring(n.text, 0, 20) as tweet, deg


10 rows affected.
Out[4]:
user tid tweet deg
DELAESPRIELLAE 682222059888365570 Los mejores deseos p 3339
piedadcordoba 676816668736274437 Presidente Chávez: a 1976
chavezcandanga 243071268004438016 Acompañemos a la her 1697
AlvaroUribeVel 677100763924078592 1.Es esto paz? Narco 1609
AndresPastrana_ 704006738735984640 Lo tengo. La entiend 1522
dtavares 678921162513084416 'gente a miss colomb 1252
EPN 677020610057265152 Felicito al Presiden 986
AlvaroUribeVel 705387639155920897 Según ONU Colombia e 958
RosLehtinen 695398033484152832 ¿Deben los contribuy 807
AlvaroUribeVel 677631504340643841 La paz empieza con e 776

Top Users


In [5]:
%%cypher
match (n:user)-[r]-()
return n.screen_name as user, n.uid, count(r) as deg
order by deg desc
limit 10


10 rows affected.
Out[5]:
user n.uid deg
JuanManSantos 64839766 27370
AlvaroUribeVel 61097151 15092
omarbula 192538987 10183
elespectador 14834302 6780
ELTIEMPO 9633802 6020
AndresPastrana_ 1301761278 5192
zairsoli 2411508153 5164
DELAESPRIELLAE 548906668 5019
piedadcordoba 34798360 4909
elnuevoherald 34641036 4540

Top Tags


In [6]:
%%cypher
match (n:hashtag)-[r]-()
return n.hashtag as hashtags, count(r) as deg
order by deg desc
limit 10


10 rows affected.
Out[6]:
hashtags deg
colombia 45821
paz 7753
farc 5718
cuba 5062
venezuela 2464
lahoradelapaz 2450
noticias 2265
siguemeytesigo 2258
yaracuy 1956
farcsantismo 1637

Language data


In [7]:
langs = %cypher match (n:tweet) where n.lang is not null return distinct n.lang, count(*) as num_tweets order by num_tweets desc


27 rows affected.

In [8]:
lang_df = langs.get_dataframe()
lang_df.set_index("n.lang")[:10].plot(kind="bar")


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2cfeb0b128>

% of tweets with geotags


In [9]:
%cypher match (n:tweet) return count(n)


1 rows affected.
Out[9]:
count(n)
337174

In [10]:
%cypher match (n:tweet) where n.coordinates is not null return count(n)


1 rows affected.
Out[10]:
count(n)
5213

In [11]:
5213 / 337174.0  # 1.5%


Out[11]:
0.01546085997140942

Tweets by country


In [12]:
countries = %cypher match (n:tweet) where n.coordinates is not null return distinct n.country, count(*) as num_tweets order by num_tweets desc


37 rows affected.

In [13]:
countries_df = countries.get_dataframe()

In [14]:
countries_df.set_index("n.country")[:20].plot(kind="bar")


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2cfe786320>

Colombian tweets by city (Top 20)


In [15]:
colombia_cities = %cypher match (t:tweet) where t.country = "Colombia" return distinct t.full_name, count(*) as num_tweets order by num_tweets desc


271 rows affected.

In [16]:
colombia_cities.get_dataframe().set_index("t.full_name")[:20].plot(kind="bar")


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2cfe683e48>

In [ ]: