In [2]:
%load_ext cypher
import json
import random


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")

In [24]:
geotweets = %cypher match (n:tweet) where n.coordinates is not null return n.tid, n.lang, n.country, n.name, n.coordinates, n.created_at


55881 rows affected.

In [25]:
geotweets = geotweets.get_dataframe()
geotweets.head()


Out[25]:
n.tid n.lang n.country n.name n.coordinates n.created_at
0 665470117124440064 es España Barcelona [[[2.052477, 41.319999], [2.052477, 41.468266]... Sat Nov 14 10:03:34 +0000 2015
1 665617082613178368 en United States Manhattan [[[-74.026675, 40.683935], [-74.026675, 40.877... Sat Nov 14 19:47:33 +0000 2015
2 665466187648778240 fr France Riom [[[3.070936, 45.864675], [3.070936, 45.922835]... Sat Nov 14 09:47:57 +0000 2015
3 665398437790269442 en United Kingdom Derby [[[-1.556856, 52.870587], [-1.556856, 52.96047... Sat Nov 14 05:18:44 +0000 2015
4 665647240824954880 en South Africa Durban [[[30.87891, -30.055938], [30.87891, -29.74533... Sat Nov 14 21:47:23 +0000 2015

In [26]:
json.loads(geotweets.ix[1]["n.coordinates"])[0][0]


Out[26]:
[-74.026675, 40.683935]

In [27]:
def get_random_coords(df):
    """Sample one uniformly random point inside each row's bounding box
    and attach the results to ``df`` as new "lat" and "lon" columns.

    Mutates ``df`` in place and also returns it.

    NOTE(review): the "n.coordinates" values are GeoJSON-style polygons,
    i.e. ``[longitude, latitude]`` pairs (see Out[26] above:
    ``[-74.026675, 40.683935]`` is longitude-first for Manhattan).  The
    local names below are therefore swapped — ``lat1``/``lat2`` actually
    hold longitude bounds and ``lon1``/``lon2`` latitude bounds — so the
    produced "lat" column contains longitudes and "lon" latitudes.  The
    positional column rename in In[29] compensates for this swap, so the
    final "Lon"/"Lat" columns come out correct; do not "fix" the names
    here without also changing that rename.
    """
    lats = []
    lons = []
    for row in df.iterrows():
        # iterrows() yields (index, Series); keep only the row data
        row = row[1]
        # first ring of the polygon: a list of [x, y] corner pairs
        coords = json.loads(row["n.coordinates"])[0]
        lat1 = coords[0][0]  # x of corner 0 — actually a longitude bound
        lat2 = coords[2][0]  # x of corner 2 — the opposite longitude bound
        lon1 = coords[0][1]  # y of corner 0 — actually a latitude bound
        lon2 = coords[1][1]  # y of corner 1 — the opposite latitude bound
        # uniform sample within the (mislabeled) box edges
        ran_lat = random.uniform(lat1, lat2)
        ran_lon = random.uniform(lon1, lon2)
        lats.append(ran_lat)
        lons.append(ran_lon)
    df["lat"] = lats   # holds longitudes (see note above)
    df["lon"] = lons   # holds latitudes (see note above)
    return df

In [28]:
df = get_random_coords(geotweets)

In [29]:
geotweets.columns = ["Id", "Lang", "Country", "City", "Coords", "Time", "Lon", "Lat"]

In [32]:
geotweets["Label"] = "tweet"

In [33]:
geotweets.head()


Out[33]:
Id Lang Country City Coords Time Lon Lat Label
0 665470117124440064 es España Barcelona [[[2.052477, 41.319999], [2.052477, 41.468266]... Sat Nov 14 10:03:34 +0000 2015 2.110062 41.432345 tweet
1 665617082613178368 en United States Manhattan [[[-74.026675, 40.683935], [-74.026675, 40.877... Sat Nov 14 19:47:33 +0000 2015 -73.969743 40.865664 tweet
2 665466187648778240 fr France Riom [[[3.070936, 45.864675], [3.070936, 45.922835]... Sat Nov 14 09:47:57 +0000 2015 3.124939 45.866749 tweet
3 665398437790269442 en United Kingdom Derby [[[-1.556856, 52.870587], [-1.556856, 52.96047... Sat Nov 14 05:18:44 +0000 2015 -1.469450 52.951176 tweet
4 665647240824954880 en South Africa Durban [[[30.87891, -30.055938], [30.87891, -29.74533... Sat Nov 14 21:47:23 +0000 2015 30.883767 -29.936401 tweet

In [34]:
geotweets.to_csv("data/geotweets.csv")

In [2]:
edges_query = """match (t:tweet)-[:USES]->(h:hashtag) where t.coordinates is not null with h.tagid as hashtag, t.tid as tweet return hashtag, tweet
"""

In [7]:
geotweet_edges = %cypher match (t:tweet)-[:USES]->(h:hashtag) where t.coordinates is not null with h.tagid as hashtag, t.tid as tweet return tweet, hashtag


41490 rows affected.

In [8]:
geotweet_edges = geotweet_edges.get_dataframe()

In [9]:
geotweet_edges.head()


Out[9]:
tweet hashtag
0 665470117124440064 h3
1 665470117124440064 h2
2 665617082613178368 h514
3 665393988195733504 h514
4 665393988195733504 h1375

In [11]:
geotweet_edges.columns = ["Source", "Target"]

In [22]:
geotweet_edges.to_csv("data/geoedges.csv")

In [39]:
geoedges_nohash = %cypher match (t:tweet)--(n:tweet) where t.coordinates is not null and n.coordinates is not null return t.tid as Source, n.tid as Target


136 rows affected.

In [40]:
geoedges_nohash = geoedges_nohash.get_dataframe()

In [41]:
len(geoedges_nohash)


Out[41]:
136

In [43]:
geoedges_nohash.to_csv("data/geoedges_nohash.csv")

In [ ]:


In [56]:
geohash = %cypher match (t:tweet)-[r:USES]->(h:hashtag) where t.coordinates is not null with distinct h.tagid as Id, h.hashtag as Label, count(r) as deg return Id, Label order by deg desc limit 10


10 rows affected.

In [57]:
geohash = geohash.get_dataframe()
geohash.head()


Out[57]:
Id Label
0 h3 paris
1 h2 prayforparis
2 h5 parisattacks
3 h19 prayers4paris
4 h43 france

In [58]:
labels = geohash["Label"].map(lambda x: "#" + x)

In [59]:
geohash["Label"] = labels

In [60]:
geohash.head()


Out[60]:
Id Label
0 h3 #paris
1 h2 #prayforparis
2 h5 #parisattacks
3 h19 #prayers4paris
4 h43 #france

In [61]:
geohash.to_csv("data/geotags.csv")

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [3]:
edges = %cypher match (t:tweet)-[:USES]-(h:hashtag {hashtag: "paris"}) where t.coordinates is not null return h.hashtag, collect(t.tid)


1 rows affected.

In [4]:
import itertools
import networkx as nx

In [5]:
edges = edges.get_dataframe()

In [6]:
# Expand each hashtag's tweet list into every unordered pair of tweets
# (the edge list for a tweet-to-tweet co-hashtag graph).
edges["collect(t.tid)"] = [
    list(itertools.combinations(tids, 2)) for tids in edges["collect(t.tid)"]
]

In [7]:
edges.head()


Out[7]:
h.hashtag collect(t.tid)
0 paris [(665717865102884866, 665717799705276416), (66...

In [8]:
el = list(itertools.chain.from_iterable(edges["collect(t.tid)"]))

In [9]:
len(el)


Out[9]:
50065021

In [8]:
el[1]


Out[8]:
('665351906051661824', '665698881120968709')

In [9]:
len(el)


Out[9]:
66416062

In [9]:
g = nx.Graph(el)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/convert.py in to_networkx_graph(data, create_using, multigraph_input)
    123         try:
--> 124             return from_edgelist(data,create_using=create_using)
    125         except:

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/convert.py in from_edgelist(edgelist, create_using)
    402     G=_prep_create_using(create_using)
--> 403     G.add_edges_from(edgelist)
    404     return G

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/classes/graph.py in add_edges_from(self, ebunch, attr_dict, **attr)
    870                 self.node[v] = {}
--> 871             datadict = self.adj[u].get(v, self.edge_attr_dict_factory())
    872             datadict.update(attr_dict)

KeyboardInterrupt: 

During handling of the above exception, another exception occurred:

NetworkXError                             Traceback (most recent call last)
<ipython-input-9-cbce78a3ad60> in <module>()
----> 1 g = nx.Graph(el)

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/classes/graph.py in __init__(self, data, **attr)
    298         # attempt to load graph with data
    299         if data is not None:
--> 300             convert.to_networkx_graph(data, create_using=self)
    301         # load graph attributes (must be after convert)
    302         self.graph.update(attr)

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/networkx/convert.py in to_networkx_graph(data, create_using, multigraph_input)
    124             return from_edgelist(data,create_using=create_using)
    125         except:
--> 126             raise nx.NetworkXError("Input is not a valid edge list")
    127 
    128     # Pandas DataFrame

NetworkXError: Input is not a valid edge list

In [34]:
len(geotweet_edges)


Out[34]:
6689

In [ ]: