In [1]:
# Load the `cypher` IPython extension; it provides the `%cypher` line magic
# that the query cells in this notebook rely on.
%load_ext cypher
# Render matplotlib figures inline in the cell output.
%matplotlib inline
import pandas as pd


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")

In [5]:
# Top 10 tweet texts by number of relationships to other tweet nodes (either
# direction). Rows are grouped by `n.text`, so separate tweet nodes that share
# identical text are counted together.
# NOTE(review): the saved output of the display cell below has a column named
# `n` holding whole property maps, which suggests it was produced by an earlier
# `return n, ...` version of this query — re-run to refresh it.
top_tweets = %cypher match (n:tweet)-[r]-(m:tweet) return n.text, count(r) as deg order by deg desc limit 10


10 rows affected.

In [6]:
# Show the query result as a table (one row per returned record).
top_tweets.get_dataframe()


Out[6]:
n deg
0 {'text': 'Thoughts go out to everyone in Paris... 38199
1 {'text': 'thoughts and prayers for those peopl... 27827
2 {'text': 'Thinking of everyone in Paris.', 'ti... 22158
3 {'text': 'Had a great first show but just hear... 17657
4 {'text': 'all of my love and prayers go out to... 16860
5 {'text': 'To people blaming refugees for attac... 14070
6 {'text': 'Peace for Paris https://t.co/ryf6XB2... 13471
7 {'text': 'Paris massacre is an act of terroris... 12738
8 {'text': 'Please, pray for Paris.', 'tid': '66... 12262
9 {'text': 'Praying for Paris. Our hearts are br... 11636

In [35]:
# Top 10 hashtags by number of relationships to any other node (`m` carries no
# label restriction), grouped by the `hashtag` property.
top_tags = %cypher match (n:hashtag)-[r]-(m) return n.hashtag, count(r) as deg order by deg desc limit 10


10 rows affected.

In [36]:
# Show the hashtag ranking as a table.
top_tags.get_dataframe()


Out[36]:
n.hashtag deg
0 paris 719571
1 prayforparis 368931
2 parisattacks 206159
3 prayers4paris 52805
4 bataclan 51524
5 rechercheparis 46333
6 prayersforparis 31736
7 france 31605
8 porteouverte 30484
9 fusillade 27833

In [10]:
# Top 10 users by number of relationships to any other node, grouped by
# `screen_name`.
# NOTE(review): the saved output of the display cell below has a column named
# `n` holding whole property maps (screen_name + uid), which suggests it came
# from an earlier `return n, ...` version of this query — re-run to refresh it.
top_users =  %cypher match (n:user)-[r]-(m) return n.screen_name, count(r) as deg order by deg desc limit 10


10 rows affected.

In [11]:
# Show the user ranking as a table.
top_users.get_dataframe()


Out[11]:
n deg
0 {'screen_name': 'Louis_Tomlinson', 'uid': '842... 76472
1 {'screen_name': 'NiallOfficial', 'uid': '10511... 55876
2 {'screen_name': 'RecherchesP', 'uid': '4185722... 45967
3 {'screen_name': 'Harry_Styles', 'uid': '181561... 44814
4 {'screen_name': 'infos140', 'uid': '1356382759'} 41809
5 {'screen_name': 'justinbieber', 'uid': '272600... 38659
6 {'screen_name': 'nytimes', 'uid': '807095'} 34432
7 {'screen_name': 'Michael5SOS', 'uid': '4032468... 33745
8 {'screen_name': 'jean_jullien', 'uid': '185122... 30970
9 {'screen_name': 'AP', 'uid': '51241574'} 29869

In [3]:
# Fetch the id (`tid`) and language (`lang`) of every tweet node that has a
# `lang` property set. This is a large result (millions of rows per the saved
# output), so expect the cell to be slow and memory-hungry.
tweets = %cypher match (n:tweet) where n.lang is not null return n.tid, n.lang


4263562 rows affected.

In [6]:
# Convert the query result to a DataFrame and write it straight to CSV.
# The conversion is deliberately NOT assigned back to `tweets`: rebinding the
# result set to a DataFrame made this cell fail on a second run (a DataFrame
# has no `get_dataframe`), and the next cell deletes `tweets` right after the
# dump, so the frame never needs to outlive this statement.
# NOTE(review): `to_csv` writes the row index as an unnamed first column by
# default — confirm that downstream readers of this file expect it.
tweets.get_dataframe().to_csv("data/tweets_w_lang.csv")

In [7]:
# Drop the reference to the multi-million-row tweet data so its memory can be
# reclaimed before the next large query runs.
del tweets

In [9]:
# Fetch the ids of every pair of directly connected tweets where both tweets
# have a `lang` property set.
# NOTE(review): the pattern is undirected, so each relationship is presumably
# returned twice (once per orientation) — confirm that is what the edge list
# consumer expects.
tweet_edges = %cypher match (n:tweet)--(m:tweet) where n.lang is not null and m.lang is not null return n.tid, m.tid


6090786 rows affected.

In [10]:
# Convert the query result into a DataFrame of (n.tid, m.tid) pairs, keeping
# the name `tweet_edges` because a later cell writes it to CSV.
# The isinstance guard keeps the cell re-runnable: once `tweet_edges` has been
# rebound to a DataFrame, calling `get_dataframe()` on it again would raise
# AttributeError.
if not isinstance(tweet_edges, pd.DataFrame):
    tweet_edges = tweet_edges.get_dataframe()
# Preview the first rows as a sanity check.
tweet_edges.head()


Out[10]:
n.tid m.tid
0 665438816551964672 665322019291013120
1 665563618776326144 665563677429403650
2 665496407525208064 665496689357254656
3 665502309351321602 665485168120045568
4 665549978979442688 665549060665909248

In [11]:
# Persist the tweet-to-tweet edge list for later processing.
# NOTE(review): `to_csv` writes the row index as an unnamed first column by
# default — confirm that downstream readers of this file expect it.
tweet_edges.to_csv("data/tweets_w_lang_edge.csv")

In [ ]:


In [ ]:


In [13]:
# Load the cleaned, tab-separated tweet table into memory.
tweets_df = pd.read_csv(
    "data/clean_tweets.csv",
    sep="\t",
)

In [15]:
# Preview the first rows to check that the columns were parsed as expected.
tweets_df.head()


Out[15]:
tid:ID lang name text clean_text polarity:float subjectivity:float created_at full_name country country_code coordinates :LABEL

In [ ]:
# Group all tweets by their language code for per-language aggregation.
lang_groups = tweets_df.groupby(by="lang")

In [ ]:


In [ ]:
# Keep only the tweets that carry a value in the `coordinates` column.
has_coordinates = tweets_df["coordinates"].notna()
geotweets = tweets_df.loc[has_coordinates]

In [ ]:
# Group the geotagged tweets by the value of their `country` column.
country_groups = geotweets.groupby(by="country")

In [ ]:
# Group the geotagged tweets by the `name` column.
# NOTE(review): `name` presumably holds the place/city name — confirm against
# the data dictionary for clean_tweets.csv.
city_groups = geotweets.groupby(by="name")

In [ ]:
# Subset of tweets whose language code is exactly "fr".
is_french = tweets_df["lang"].eq("fr")
french_tweets_df = tweets_df.loc[is_french]

In [ ]:
# Subset of tweets whose language code is exactly "en".
is_english = tweets_df["lang"].eq("en")
english_tweets_df = tweets_df.loc[is_english]