In [20]:
import os

from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster

# Connection settings are read from the environment so that credentials and
# host addresses are not baked into the notebook. The fallbacks are the values
# this notebook originally used, so it behaves the same when nothing is set.
CASSANDRA_HOSTS = [
    _host.strip()
    for _host in os.environ.get('CASSANDRA_HOSTS', '52.91.166.84').split(',')
    if _host.strip()
]
CASSANDRA_KEYSPACE = os.environ.get('CASSANDRA_KEYSPACE', 'capstone')

auth_provider = PlainTextAuthProvider(
    username=os.environ.get('CASSANDRA_USERNAME', 'cassandra'),
    password=os.environ.get('CASSANDRA_PASSWORD', 'cassandra'),
)
cluster = Cluster(
    contact_points=CASSANDRA_HOSTS,
    auth_provider=auth_provider,
)

# NOTE(review): the captured output shows the driver also trying peers other
# than the contact point and timing out on them ("Failed to create connection
# pool for new host ..."). If those peers are not reachable from this machine,
# consider restricting the driver to the reachable hosts, e.g. with
# cassandra.policies.WhiteListRoundRobinPolicy -- confirm against the deployment.
session = cluster.connect(CASSANDRA_KEYSPACE)


[WARNING] Failed to create connection pool for new host 52.90.194.70:
Traceback (most recent call last):
  File "cassandra/cluster.py", line 1787, in cassandra.cluster.Session.add_or_renew_pool.run_add_or_renew_pool (cassandra/cluster.c:30488)
    new_pool = HostConnection(host, distance, self)
  File "cassandra/pool.py", line 295, in cassandra.pool.HostConnection.__init__ (cassandra/pool.c:6160)
    self._connection = session.cluster.connection_factory(host.address)
  File "cassandra/cluster.py", line 789, in cassandra.cluster.Cluster.connection_factory (cassandra/cluster.c:9245)
    return self.connection_class.factory(address, self.connect_timeout, *args, **kwargs)
  File "cassandra/connection.py", line 306, in cassandra.connection.Connection.factory (cassandra/connection.c:5139)
    conn = cls(host, *args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/cassandra/io/asyncorereactor.py", line 162, in __init__
    self._connect_socket()
  File "cassandra/connection.py", line 340, in cassandra.connection.Connection._connect_socket (cassandra/connection.c:6271)
    raise socket.error(sockerr.errno, "Tried connecting to %s. Last error: %s" % ([a[4] for a in addresses], sockerr.strerror or sockerr))
error: [Errno None] Tried connecting to [('52.90.194.70', 9042)]. Last error: timed out
[WARNING] Host 52.90.194.70 has been marked down
[WARNING] Failed to create connection pool for new host 52.90.76.138:
Traceback (most recent call last):
  File "cassandra/cluster.py", line 1787, in cassandra.cluster.Session.add_or_renew_pool.run_add_or_renew_pool (cassandra/cluster.c:30488)
    new_pool = HostConnection(host, distance, self)
  File "cassandra/pool.py", line 295, in cassandra.pool.HostConnection.__init__ (cassandra/pool.c:6160)
    self._connection = session.cluster.connection_factory(host.address)
  File "cassandra/cluster.py", line 789, in cassandra.cluster.Cluster.connection_factory (cassandra/cluster.c:9245)
    return self.connection_class.factory(address, self.connect_timeout, *args, **kwargs)
  File "cassandra/connection.py", line 306, in cassandra.connection.Connection.factory (cassandra/connection.c:5139)
    conn = cls(host, *args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/cassandra/io/asyncorereactor.py", line 162, in __init__
    self._connect_socket()
  File "cassandra/connection.py", line 340, in cassandra.connection.Connection._connect_socket (cassandra/connection.c:6271)
    raise socket.error(sockerr.errno, "Tried connecting to %s. Last error: %s" % ([a[4] for a in addresses], sockerr.strerror or sockerr))
error: [Errno None] Tried connecting to [('52.90.76.138', 9042)]. Last error: timed out
[WARNING] Host 52.90.76.138 has been marked down
[WARNING] Error attempting to reconnect to 52.90.194.70, scheduling retry in 2.0 seconds: [Errno None] Tried connecting to [('52.90.194.70', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.76.138, scheduling retry in 2.0 seconds: [Errno None] Tried connecting to [('52.90.76.138', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 600.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.194.70, scheduling retry in 4.0 seconds: [Errno None] Tried connecting to [('52.90.194.70', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.76.138, scheduling retry in 4.0 seconds: [Errno None] Tried connecting to [('52.90.76.138', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.194.70, scheduling retry in 8.0 seconds: [Errno None] Tried connecting to [('52.90.194.70', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.76.138, scheduling retry in 8.0 seconds: [Errno None] Tried connecting to [('52.90.76.138', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.194.70, scheduling retry in 16.0 seconds: [Errno None] Tried connecting to [('52.90.194.70', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.76.138, scheduling retry in 16.0 seconds: [Errno None] Tried connecting to [('52.90.76.138', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.194.70, scheduling retry in 600.0 seconds: [Errno None] Tried connecting to [('52.90.194.70', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 52.90.76.138, scheduling retry in 600.0 seconds: [Errno None] Tried connecting to [('52.90.76.138', 9042)]. Last error: timed out

In [7]:
import graphlab
from cassandra.query import tuple_factory

# Have the driver hand back each row as a plain (code, popularity) tuple.
session.row_factory = tuple_factory
rows = session.execute(
    "SELECT code, popularity FROM airportpopularity LIMIT 1000000"
)

# Order the airports from most to least popular (index 1 is popularity).
ranked_rows = sorted(rows, key=lambda row: row[1], reverse=True)

# One SFrame row per airport; the raw pair is kept in the 'touple' column.
sframe = graphlab.SFrame({'touple': ranked_rows})
sframe['popularity'] = sframe['touple'].apply(lambda pair: pair[1])

# add_row_number() contributes the 'id' column (starting at 0 in the output
# below); rank is that position shifted to start at 1.
sframe = sframe.add_row_number()
sframe['rank'] = sframe['id'].apply(lambda row_id: row_id + 1)

sframe


[INFO] 1454680666 : INFO:     (initialize_globals_from_environment:282): Setting configuration variable GRAPHLAB_FILEIO_ALTERNATIVE_SSL_CERT_FILE to /usr/local/lib/python2.7/dist-packages/certifi/cacert.pem
1454680666 : INFO:     (initialize_globals_from_environment:282): Setting configuration variable GRAPHLAB_FILEIO_ALTERNATIVE_SSL_CERT_DIR to 
This non-commercial license of GraphLab Create is assigned to kgrodzicki@gmail.com and will expire on October 14, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-51 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1454680666.log
[INFO] GraphLab Server Version: 1.7.1
Out[7]:
id touple popularity rank
0 [ATL, 58187766] 58187766 1
1 [DEN, 31219499] 31219499 2
2 [LAX, 25452018] 25452018 3
3 [CLT, 24276984] 24276984 4
4 [PHX, 21864094] 21864094 5
5 [SFO, 18537961] 18537961 6
6 [SLC, 16124346] 16124346 7
7 [SEA, 16069711] 16069711 8
8 [LAS, 15947830] 15947830 9
9 [BOS, 13285805] 13285805 10
[490 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 8.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 8.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 256.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 256.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 16.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 16.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 32.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 32.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 64.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 64.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 128.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 128.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 512.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 512.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 512.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 512.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 256.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 256.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 512.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 512.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Failed to create connection pool for new host 172.31.59.248:
Traceback (most recent call last):
  File "cassandra/cluster.py", line 1787, in cassandra.cluster.Session.add_or_renew_pool.run_add_or_renew_pool (cassandra/cluster.c:30488)
    new_pool = HostConnection(host, distance, self)
  File "cassandra/pool.py", line 295, in cassandra.pool.HostConnection.__init__ (cassandra/pool.c:6160)
    self._connection = session.cluster.connection_factory(host.address)
  File "cassandra/cluster.py", line 789, in cassandra.cluster.Cluster.connection_factory (cassandra/cluster.c:9245)
    return self.connection_class.factory(address, self.connect_timeout, *args, **kwargs)
  File "cassandra/connection.py", line 306, in cassandra.connection.Connection.factory (cassandra/connection.c:5139)
    conn = cls(host, *args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/cassandra/io/asyncorereactor.py", line 162, in __init__
    self._connect_socket()
  File "cassandra/connection.py", line 340, in cassandra.connection.Connection._connect_socket (cassandra/connection.c:6271)
    raise socket.error(sockerr.errno, "Tried connecting to %s. Last error: %s" % ([a[4] for a in addresses], sockerr.strerror or sockerr))
error: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 2.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 4.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 8.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 16.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 32.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 64.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 128.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 600.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.247, scheduling retry in 600.0 seconds: [Errno None] Tried connecting to [('172.31.59.247', 9042)]. Last error: timed out
[WARNING] Error attempting to reconnect to 172.31.59.248, scheduling retry in 256.0 seconds: [Errno None] Tried connecting to [('172.31.59.248', 9042)]. Last error: timed out

In [9]:
# pyplot is used by the plotting cell below for the rank/popularity figure.
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [11]:
# Rank vs. popularity with both axes on a log scale, one dot per airport.
plt.loglog(sframe['rank'], sframe['popularity'], '.')
plt.grid(True)
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('rank')
plt.ylabel('popularity')
# Title stays last so the cell's displayed value is still the title's Text object.
plt.title('loglog')


Out[11]:
<matplotlib.text.Text at 0x7fd5b42b8510>

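Power Law Distribution: airport popularity plotted against rank on log-log axes appears to follow a power law (a power law shows up as a roughly straight line on a log-log plot).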


In [ ]: