In [2]:
%load_ext cypher
import json
import random


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")

In [24]:
geotweets = %cypher match (n:tweet) where n.coordinates is not null return n.tid, n.lang, n.country, n.name, n.coordinates, n.created_at


55881 rows affected.

In [25]:
geotweets = geotweets.get_dataframe()
geotweets.head()


Out[25]:
n.tid n.lang n.country n.name n.coordinates n.created_at
0 665470117124440064 es España Barcelona [[[2.052477, 41.319999], [2.052477, 41.468266]... Sat Nov 14 10:03:34 +0000 2015
1 665617082613178368 en United States Manhattan [[[-74.026675, 40.683935], [-74.026675, 40.877... Sat Nov 14 19:47:33 +0000 2015
2 665466187648778240 fr France Riom [[[3.070936, 45.864675], [3.070936, 45.922835]... Sat Nov 14 09:47:57 +0000 2015
3 665398437790269442 en United Kingdom Derby [[[-1.556856, 52.870587], [-1.556856, 52.96047... Sat Nov 14 05:18:44 +0000 2015
4 665647240824954880 en South Africa Durban [[[30.87891, -30.055938], [30.87891, -29.74533... Sat Nov 14 21:47:23 +0000 2015

In [26]:
json.loads(geotweets.ix[1]["n.coordinates"])[0][0]


Out[26]:
[-74.026675, 40.683935]

In [27]:
def get_random_coords(df):
    """Sample one uniformly random point inside each row's bounding box
    and attach the results to ``df`` as new "lat" and "lon" columns.

    Mutates ``df`` in place and also returns it.

    NOTE(review): the "n.coordinates" values are GeoJSON-style polygons,
    i.e. ``[longitude, latitude]`` pairs (see Out[26] above:
    ``[-74.026675, 40.683935]`` is longitude-first for Manhattan).  The
    local names below are therefore swapped — ``lat1``/``lat2`` actually
    hold longitude bounds and ``lon1``/``lon2`` latitude bounds — so the
    produced "lat" column contains longitudes and "lon" latitudes.  The
    positional column rename in In[29] compensates for this swap, so the
    final "Lon"/"Lat" columns come out correct; do not "fix" the names
    here without also changing that rename.
    """
    lats = []
    lons = []
    for row in df.iterrows():
        # iterrows() yields (index, Series); keep only the row data
        row = row[1]
        # first ring of the polygon: a list of [x, y] corner pairs
        coords = json.loads(row["n.coordinates"])[0]
        lat1 = coords[0][0]  # x of corner 0 — actually a longitude bound
        lat2 = coords[2][0]  # x of corner 2 — the opposite longitude bound
        lon1 = coords[0][1]  # y of corner 0 — actually a latitude bound
        lon2 = coords[1][1]  # y of corner 1 — the opposite latitude bound
        # uniform sample within the (mislabeled) box edges
        ran_lat = random.uniform(lat1, lat2)
        ran_lon = random.uniform(lon1, lon2)
        lats.append(ran_lat)
        lons.append(ran_lon)
    df["lat"] = lats   # holds longitudes (see note above)
    df["lon"] = lons   # holds latitudes (see note above)
    return df

In [28]:
df = get_random_coords(geotweets)

In [29]:
geotweets.columns = ["Id", "Lang", "Country", "City", "Coords", "Time", "Lon", "Lat"]

In [32]:
geotweets["Label"] = "tweet"

In [33]:
geotweets.head()


Out[33]:
Id Lang Country City Coords Time Lon Lat Label
0 665470117124440064 es España Barcelona [[[2.052477, 41.319999], [2.052477, 41.468266]... Sat Nov 14 10:03:34 +0000 2015 2.110062 41.432345 tweet
1 665617082613178368 en United States Manhattan [[[-74.026675, 40.683935], [-74.026675, 40.877... Sat Nov 14 19:47:33 +0000 2015 -73.969743 40.865664 tweet
2 665466187648778240 fr France Riom [[[3.070936, 45.864675], [3.070936, 45.922835]... Sat Nov 14 09:47:57 +0000 2015 3.124939 45.866749 tweet
3 665398437790269442 en United Kingdom Derby [[[-1.556856, 52.870587], [-1.556856, 52.96047... Sat Nov 14 05:18:44 +0000 2015 -1.469450 52.951176 tweet
4 665647240824954880 en South Africa Durban [[[30.87891, -30.055938], [30.87891, -29.74533... Sat Nov 14 21:47:23 +0000 2015 30.883767 -29.936401 tweet

In [34]:
geotweets.to_csv("data/geotweets.csv")

In [2]:
edges_query = """match (t:tweet)-[:USES]->(h:hashtag) where t.coordinates is not null with h.tagid as hashtag, t.tid as tweet return hashtag, tweet
"""

In [7]:
geotweet_edges = %cypher match (t:tweet)-[:USES]->(h:hashtag) where t.coordinates is not null with h.tagid as hashtag, t.tid as tweet return tweet, hashtag


41490 rows affected.

In [8]:
geotweet_edges = geotweet_edges.get_dataframe()

In [9]:
geotweet_edges.head()


Out[9]:
tweet hashtag
0 665470117124440064 h3
1 665470117124440064 h2
2 665617082613178368 h514
3 665393988195733504 h514
4 665393988195733504 h1375

In [11]:
geotweet_edges.columns = ["Source", "Target"]

In [22]:
geotweet_edges.to_csv("data/geoedges.csv")

In [39]:
geoedges_nohash = %cypher match (t:tweet)--(n:tweet) where t.coordinates is not null and n.coordinates is not null return t.tid as Source, n.tid as Target


136 rows affected.

In [40]:
geoedges_nohash = geoedges_nohash.get_dataframe()

In [41]:
len(geoedges_nohash)


Out[41]:
136

In [43]:
geoedges_nohash.to_csv("data/geoedges_nohash.csv")

In [ ]:


In [56]:
geohash = %cypher match (t:tweet)-[r:USES]->(h:hashtag) where t.coordinates is not null with distinct h.tagid as Id, h.hashtag as Label, count(r) as deg return Id, Label order by deg desc limit 10


10 rows affected.

In [57]:
geohash = geohash.get_dataframe()
geohash.head()


Out[57]:
Id Label
0 h3 paris
1 h2 prayforparis
2 h5 parisattacks
3 h19 prayers4paris
4 h43 france

In [58]:
labels = geohash["Label"].map(lambda x: "#" + x)

In [59]:
geohash["Label"] = labels

In [60]:
geohash.head()


Out[60]:
Id Label
0 h3 #paris
1 h2 #prayforparis
2 h5 #parisattacks
3 h19 #prayers4paris
4 h43 #france

In [61]:
geohash.to_csv("data/geotags.csv")

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [3]:
edges = %cypher match (t:tweet)-[:USES]-(h:hashtag {hashtag: "paris"}) where t.coordinates is not null return h.hashtag, collect(t.tid)


1 rows affected.

In [4]:
import itertools
import networkx as nx

In [5]:
edges = edges.get_dataframe()

In [6]:
# Expand each hashtag's tweet list into every unordered pair of tweets
# (the edge list for a tweet-to-tweet co-hashtag graph).
edges["collect(t.tid)"] = [
    list(itertools.combinations(tids, 2)) for tids in edges["collect(t.tid)"]
]

In [7]:
edges.head()


Out[7]:
h.hashtag collect(t.tid)
0 paris [(665717865102884866, 665717799705276416), (66...

In [8]:
el = list(itertools.chain.from_iterable(edges["collect(t.tid)"]))

In [9]:
len(el)


Out[9]:
50065021

In [8]:
el[1]


Out[8]:
('665351906051661824', '665698881120968709')

In [9]:
len(el)


Out[9]:
66416062

In [9]:
g = nx.Graph(el)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/convert.py in to_networkx_graph(data, create_using, multigraph_input)
    123         try:
--> 124             return from_edgelist(data,create_using=create_using)
    125         except:

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/convert.py in from_edgelist(edgelist, create_using)
    402     G=_prep_create_using(create_using)
--> 403     G.add_edges_from(edgelist)
    404     return G

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/classes/graph.py in add_edges_from(self, ebunch, attr_dict, **attr)
    870                 self.node[v] = {}
--> 871             datadict = self.adj[u].get(v, self.edge_attr_dict_factory())
    872             datadict.update(attr_dict)

KeyboardInterrupt: 

During handling of the above exception, another exception occurred:

NetworkXError                             Traceback (most recent call last)
<ipython-input-9-cbce78a3ad60> in <module>()
----> 1 g = nx.Graph(el)

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/classes/graph.py in __init__(self, data, **attr)
    298         # attempt to load graph with data
    299         if data is not None:
--> 300             convert.to_networkx_graph(data, create_using=self)
    301         # load graph attributes (must be after convert)
    302         self.graph.update(attr)

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/convert.py in to_networkx_graph(data, create_using, multigraph_input)
    124             return from_edgelist(data,create_using=create_using)
    125         except:
--> 126             raise nx.NetworkXError("Input is not a valid edge list")
    127 
    128     # Pandas DataFrame

NetworkXError: Input is not a valid edge list

In [34]:
len(geotweet_edges)


Out[34]:
6689

In [ ]: