In [1]:
%load_ext cypher
import operator
import numpy as np


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")

In [5]:
# Pull the language codes of every pair of connected tweets, keeping only pairs
# where both languages are present and neither one is "und".
# NOTE(review): the relationship pattern is undirected, so each connected pair
# presumably comes back once per orientation — confirm against Cypher MATCH semantics.
tweet_langs = %cypher match (n:tweet)--(m:tweet) where n.lang is not null and m.lang is not null and m.lang <> "und" and n.lang <> "und" return n.lang, m.lang


6113716 rows affected.

In [6]:
# Convert the query result into a DataFrame. The same name is reused for the
# frame, so the conversion is guarded: once the name already holds the
# DataFrame there is no `get_dataframe` attribute, and re-running this cell
# would otherwise raise AttributeError.
if hasattr(tweet_langs, "get_dataframe"):
    tweet_langs = tweet_langs.get_dataframe()
# Preview the first rows.
tweet_langs.head()


Out[6]:
n.lang m.lang
0 tr tr
1 en en
2 en en
3 en en
4 en en

In [7]:
# Bucket the rows by their (n.lang, m.lang) combination.
lang_groups = tweet_langs.groupby(by=["n.lang", "m.lang"])

In [8]:
# Number of rows in each (n.lang, m.lang) group, keyed by the language pair.
lang_freq = {pair: len(rows) for pair, rows in lang_groups.groups.items()}

In [9]:
# (pair, count) entries ordered from the most to the least frequent pair.
langs = sorted(lang_freq.items(), key=lambda item: item[1], reverse=True)

In [10]:
# First and last entry of the 25 most frequent pairs, as a 2-tuple.
operator.itemgetter(0, -1)(langs[:25])


Out[10]:
((('en', 'en'), 4264830), (('ro', 'ro'), 1112))

In [15]:
# Among the 50 most frequent pairs, keep those whose two languages differ.
[pair for pair, _count in langs[:50] if pair[0] != pair[1]]


Out[15]:
[('fr', 'en'),
 ('en', 'fr'),
 ('und', 'en'),
 ('en', 'und'),
 ('es', 'en'),
 ('en', 'es'),
 ('fr', 'und'),
 ('und', 'fr')]

In [33]:
# Keep only the 25 most frequent pairs.
# NOTE(review): this rebinds `lang_freq`, so cells executed afterwards see the
# truncated mapping rather than the full one.
lang_freq = dict(langs[:25])

In [37]:
# Peek at five (pair, count) entries of the mapping.
list(lang_freq.items())[:5]


Out[37]:
[(('hr', 'hr'), 106),
 (('es', 'en'), 76),
 (('tl', 'tl'), 9650),
 (('pt', 'pt'), 216686),
 (('ja', 'ja'), 10866)]

In [38]:
# Build a language-by-language matrix of pair counts from `lang_freq`.
keys = np.array(list(lang_freq.keys()))    # assumes every key is a (lang, lang) pair, giving one row per pair
vals = np.array(list(lang_freq.values()))  # counts, in the same order as `keys`

# Map every language code to its position in the sorted array of unique codes.
unq_keys, key_idx = np.unique(keys, return_inverse=True)
# One (row, column) index pair per original key, whatever shape the inverse came back in.
key_idx = key_idx.reshape(-1, 2)

n = len(unq_keys)
adj = np.zeros((n, n), dtype=vals.dtype)
adj[key_idx[:, 0], key_idx[:, 1]] = vals

# Add the transpose out of place. `adj += adj.T` writes into the array while
# reading from a transposed view of the same memory, which NumPy releases
# before 1.13 do not handle safely for larger arrays.
# NOTE(review): this also doubles the diagonal (same-language pairs), and
# doubles off-diagonal cells when both orientations of a pair are present —
# confirm that is the intended weighting.
adj = adj + adj.T

In [3]:
# Display the matrix bound to `adj`; the name must already be defined in the
# running kernel (the recorded output below is a NameError from a session
# where it was not).
adj


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-00da27f6971b> in <module>()
----> 1 adj

NameError: name 'adj' is not defined

In [24]:
# List every language pair currently held in the mapping.
list(lang_freq)


Out[24]:
[('fr', 'pt'),
 ('tr', 'lt'),
 ('ne', 'ne'),
 ('pt', 'fr'),
 ('es', 'es'),
 ('tl', 'es'),
 ('sk', 'en'),
 ('ja', 'ja'),
 ('tl', 'in'),
 ('it', 'pt'),
 ('en', 'hi'),
 ('it', 'fr'),
 ('es', 'en'),
 ('ro', 'fr'),
 ('fr', 'in'),
 ('es', 'et'),
 ('pt', 'und'),
 ('ru', 'ru'),
 ('es', 'ja'),
 ('ru', 'und'),
 ('de', 'und'),
 ('fr', 'es'),
 ('und', 'und'),
 ('pl', 'und'),
 ('et', 'et'),
 ('ja', 'es'),
 ('und', 'in'),
 ('sk', 'nl'),
 ('no', 'no'),
 ('in', 'und'),
 ('und', 'pt'),
 ('fr', 'et'),
 ('pt', 'tr'),
 ('no', 'tr'),
 ('in', 'hr'),
 ('en', 'en'),
 ('hu', 'hu'),
 ('de', 'nl'),
 ('tr', 'fr'),
 ('no', 'en'),
 ('ta', 'ta'),
 ('en', 'lv'),
 ('in', 'ar'),
 ('lv', 'en'),
 ('fr', 'tl'),
 ('es', 'pl'),
 ('tr', 'en'),
 ('es', 'tl'),
 ('ru', 'es'),
 ('en', 'sl'),
 ('de', 'tr'),
 ('ar', 'ar'),
 ('ka', 'ka'),
 ('und', 'de'),
 ('ru', 'fr'),
 ('ko', 'in'),
 ('en', 'ja'),
 ('fr', 'de'),
 ('th', 'en'),
 ('fr', 'ja'),
 ('tr', 'tr'),
 ('tl', 'fr'),
 ('en', 'sv'),
 ('nl', 'fr'),
 ('en', 'da'),
 ('tr', 'pt'),
 ('bs', 'bs'),
 ('et', 'und'),
 ('no', 'sv'),
 ('ur', 'ur'),
 ('el', 'fr'),
 ('de', 'en'),
 ('pt', 'it'),
 ('lt', 'en'),
 ('ru', 'en'),
 ('und', 'fr'),
 ('en', 'ro'),
 ('sv', 'fr'),
 ('de', 'fr'),
 ('en', 'et'),
 ('in', 'in'),
 ('lt', 'tr'),
 ('und', 'it'),
 ('te', 'te'),
 ('en', 'es'),
 ('en', 'ru'),
 ('en', 'in'),
 ('nl', 'de'),
 ('en', 'pl'),
 ('fr', 'ar'),
 ('iw', 'iw'),
 ('tr', 'no'),
 ('hi', 'tl'),
 ('hi', 'en'),
 ('en', 'ko'),
 ('ro', 'ro'),
 ('it', 'und'),
 ('es', 'ru'),
 ('pl', 'pl'),
 ('fa', 'fa'),
 ('et', 'es'),
 ('und', 'en'),
 ('sl', 'sl'),
 ('de', 'es'),
 ('en', 'lt'),
 ('fr', 'it'),
 ('uk', 'uk'),
 ('et', 'in'),
 ('pt', 'es'),
 ('fr', 'sv'),
 ('in', 'hi'),
 ('iw', 'en'),
 ('th', 'th'),
 ('pt', 'pt'),
 ('sv', 'no'),
 ('is', 'is'),
 ('sv', 'en'),
 ('fr', 'nl'),
 ('fr', 'tr'),
 ('it', 'it'),
 ('et', 'en'),
 ('lt', 'lt'),
 ('en', 'hr'),
 ('pt', 'tl'),
 ('nl', 'en'),
 ('hi', 'et'),
 ('zh', 'en'),
 ('en', 'fr'),
 ('da', 'da'),
 ('bn', 'bn'),
 ('und', 'tr'),
 ('nl', 'sk'),
 ('en', 'pt'),
 ('en', 'tl'),
 ('tr', 'und'),
 ('pt', 'en'),
 ('pa', 'pa'),
 ('en', 'it'),
 ('pl', 'es'),
 ('in', 'tl'),
 ('en', 'iw'),
 ('in', 'pl'),
 ('lv', 'lv'),
 ('en', 'sk'),
 ('mr', 'mr'),
 ('in', 'fr'),
 ('tl', 'en'),
 ('tl', 'tl'),
 ('ml', 'ml'),
 ('in', 'et'),
 ('et', 'hi'),
 ('sr', 'sr'),
 ('pl', 'fr'),
 ('ar', 'fr'),
 ('fr', 'ru'),
 ('hr', 'hr'),
 ('el', 'el'),
 ('ja', 'en'),
 ('et', 'fr'),
 ('en', 'de'),
 ('de', 'de'),
 ('bg', 'bg'),
 ('tl', 'pt'),
 ('tl', 'hi'),
 ('sk', 'sk'),
 ('pl', 'en'),
 ('nl', 'nl'),
 ('ko', 'en'),
 ('it', 'en'),
 ('fr', 'ro'),
 ('hr', 'in'),
 ('fr', 'fr'),
 ('es', 'fr'),
 ('ko', 'ko'),
 ('fr', 'pl'),
 ('es', 'und'),
 ('en', 'th'),
 ('fi', 'fi'),
 ('ro', 'en'),
 ('ja', 'fr'),
 ('sl', 'en'),
 ('hi', 'in'),
 ('si', 'si'),
 ('da', 'en'),
 ('tr', 'de'),
 ('und', 'et'),
 ('en', 'und'),
 ('vi', 'vi'),
 ('in', 'ko'),
 ('en', 'tr'),
 ('in', 'en'),
 ('ar', 'in'),
 ('und', 'ru'),
 ('fr', 'el'),
 ('und', 'pl'),
 ('da', 'tr'),
 ('es', 'de'),
 ('zh', 'zh'),
 ('sv', 'sv'),
 ('fr', 'und'),
 ('und', 'es'),
 ('pl', 'in'),
 ('en', 'no'),
 ('hi', 'hi'),
 ('hr', 'en'),
 ('tr', 'da'),
 ('en', 'nl'),
 ('en', 'fi'),
 ('fi', 'en'),
 ('es', 'pt'),
 ('en', 'zh'),
 ('fr', 'en')]

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [2]:
# Fetch pairs of tweets joined by a REPLIES_TO relationship, keeping only pairs
# where both tweets have a language set; the full tweet nodes are returned.
# NOTE(review): the relationship pattern has no direction, so t1 is not
# necessarily the replying tweet — confirm.
replies = %cypher match (t1:tweet)-[:REPLIES_TO]-(t2:tweet) where t1.lang is not null and t2.lang is not null return t1, t2


24432 rows affected.

In [4]:
# Materialise the reply pairs as a DataFrame and preview the first five rows.
reply_df = replies.get_dataframe()
reply_df.head(5)


Out[4]:
t1 t2
0 {'text': '@MaluVaccaro Praying for Paris... 🙏🏼... {'text': 'Pray for Paris! 🔵🔴⚪', 'subjectivity'...
1 {'text': 'And what steps will you be taking to... {'text': 'Sydney strongly supports the people ...
2 {'clean_text': 'concordo...mas n sei pq fizera... {'text': 'I just can't seem to fathom what hap...
3 {'text': 'Paris shows IS getting stronger: Abb... {'text': '@SBSNews Seriously, why would anyone...
4 {'text': 'Attentats de #Paris : plus de 200 bl... {'text': '@LP_LaPresse Tennez bon Paris !!! Le...

In [8]:
# Fetch pairs of tweets joined by a REPLIES_TO relationship where both tweets
# have a language set and the two languages differ.
# NOTE(review): this rebinds `replies`, replacing whatever result it held before.
replies = %cypher match (t1:tweet)-[:REPLIES_TO]-(t2:tweet) where t1.lang is not null and t2.lang is not null and t1.lang <> t2.lang return t1, t2


2390 rows affected.

In [9]:
# Materialise the current `replies` result as a DataFrame and preview the
# first five rows.
rt_df = replies.get_dataframe()
rt_df.head(5)


Out[9]:
t1 t2
0 {'clean_text': 'concordo...mas n sei pq fizera... {'text': 'I just can't seem to fathom what hap...
1 {'text': 'Vols de Dallas à #Paris sont à nouve... {'text': 'Flights from DFW to #Paris are openi...
2 {'text': '@welt @thedailybeast column: Welcom... {'clean_text': 'Mehr als 120 Tote: Was Sie übe...
3 {'country': 'United States', 'text': '@katemoe... {'text': 'Sending all my love to Paris. 🇫🇷🇫🇷🇫🇷...
4 {'text': 'Our city hall lit up in solidarity w... {'lang': 'und', 'text': '@TelAvivNonstop @Isra...

In [ ]: