Collect transaction data from etherchain API


In [5]:
# import modules
from pymongo import MongoClient
import requests
import time

In [6]:
# connect to the hosted MongoDB instance
client = MongoClient()
db = client['etherchain']
collection = db['new_transactions']
# the transactions with be uniquely identfied by their unique hash, preventing duplicates
collection.create_index([('hash', 1)], unique=True)


Out[6]:
u'hash_1'

In [7]:
def collect_data(count, offset):
    """
    DESCRIPTION:
        Collects blockchain transaction data from etherchain.org API
        https://etherchain.org/documentation/api/
    
    INPUT:
        - offset: the number of txs to skip
        - count: the number of txs to return (max 100 blocks per request)
        - sleeptime: number of seconds to sleep between api requests
    OUTPUT:
        - stores each blockchain transaction as a document in a MongoDB collection
    """
    
    BASE_URL = 'https://etherchain.org/api/txs/{}/{}'.format(offset, count)
    r = requests.get(BASE_URL)

    if r.status_code != 200:
        print('status code = {}'.format(r.status_code))

    else:
    # store each transaction block in our mongo database
        r = convert_price(r.json())

        for row in range(count):
            try:

                collection.insert_one(r['data'][row])

            except Exception as e:
                #print e
                #print(r['data'][row])
                pass
        
def convert_price(json_dict):
    
    for tx in json_dict['data']:
        tx['amount'] = float(tx['amount'])
        tx['price'] = float(tx['price'])
    
    return json_dict

def call_api():
    
    offset = 2214000
    
    while True:
        collect_data(100, offset)
        offset += 200
        #print(offset)
        if offset % 1000 == 0:
            print('offset: {}'.format(offset))
            
        time.sleep(10)

In [13]:
call_api()


offset: 2215000
offset: 2216000
offset: 2217000
offset: 2218000
offset: 2219000
offset: 2220000
offset: 2221000
offset: 2222000
offset: 2223000
offset: 2224000
offset: 2225000
offset: 2226000
offset: 2227000
offset: 2228000
offset: 2229000
offset: 2230000
offset: 2231000
offset: 2232000
offset: 2233000
offset: 2234000
offset: 2235000
offset: 2236000
offset: 2237000
offset: 2238000
offset: 2239000
offset: 2240000
offset: 2241000
offset: 2242000
offset: 2243000
offset: 2244000
offset: 2245000
offset: 2246000
offset: 2247000
offset: 2248000
offset: 2249000
offset: 2250000
offset: 2251000
offset: 2252000
offset: 2253000
offset: 2254000
offset: 2255000
offset: 2256000
offset: 2257000
offset: 2258000
offset: 2259000
offset: 2260000
offset: 2261000
offset: 2262000
offset: 2263000
offset: 2264000
offset: 2265000
offset: 2266000
offset: 2267000
offset: 2268000
offset: 2269000
offset: 2270000
offset: 2271000
offset: 2272000
offset: 2273000
offset: 2274000
offset: 2275000
offset: 2276000
offset: 2277000
offset: 2278000
offset: 2279000
offset: 2280000
offset: 2281000
offset: 2282000
offset: 2283000
offset: 2284000
offset: 2285000
offset: 2286000
offset: 2287000
offset: 2288000
offset: 2289000
offset: 2290000
offset: 2291000
offset: 2292000
offset: 2293000
offset: 2294000
offset: 2295000
offset: 2296000
offset: 2297000
offset: 2298000
offset: 2299000
offset: 2300000
offset: 2301000
offset: 2302000
offset: 2303000
offset: 2304000
offset: 2305000
offset: 2306000
offset: 2307000
offset: 2308000
offset: 2309000
offset: 2310000
offset: 2311000
offset: 2312000
offset: 2313000
offset: 2314000
offset: 2315000
offset: 2316000
offset: 2317000
offset: 2318000
offset: 2319000
offset: 2320000
offset: 2321000
offset: 2322000
offset: 2323000
offset: 2324000
offset: 2325000
offset: 2326000
offset: 2327000
offset: 2328000
offset: 2329000
offset: 2330000
offset: 2331000
offset: 2332000
offset: 2333000
offset: 2334000
offset: 2335000
offset: 2336000
offset: 2337000
offset: 2338000
offset: 2339000
offset: 2340000
offset: 2341000
offset: 2342000
offset: 2343000
offset: 2344000
offset: 2345000
offset: 2346000
offset: 2347000
offset: 2348000
offset: 2349000
offset: 2350000
offset: 2351000
offset: 2352000
offset: 2353000
offset: 2354000
offset: 2355000
offset: 2356000
offset: 2357000
offset: 2358000
offset: 2359000
offset: 2360000
offset: 2361000
offset: 2362000
offset: 2363000
offset: 2364000
offset: 2365000
offset: 2366000
offset: 2367000
offset: 2368000
offset: 2369000
offset: 2370000
offset: 2371000
offset: 2372000
offset: 2373000
offset: 2374000
offset: 2375000
offset: 2376000
status code = 502
offset: 2377000
offset: 2378000
offset: 2379000
offset: 2380000
offset: 2381000
offset: 2382000
offset: 2383000
offset: 2384000
offset: 2385000
offset: 2386000
offset: 2387000
offset: 2388000
offset: 2389000
offset: 2390000
offset: 2391000
offset: 2392000
offset: 2393000
offset: 2394000
offset: 2395000
offset: 2396000
offset: 2397000
offset: 2398000
offset: 2399000
offset: 2400000
offset: 2401000
offset: 2402000
offset: 2403000
offset: 2404000
offset: 2405000
offset: 2406000
offset: 2407000
offset: 2408000
offset: 2409000
offset: 2410000
offset: 2411000
offset: 2412000
offset: 2413000
offset: 2414000
offset: 2415000
offset: 2416000
offset: 2417000
offset: 2418000
offset: 2419000
offset: 2420000
offset: 2421000
offset: 2422000
offset: 2423000
offset: 2424000
offset: 2425000
offset: 2426000
offset: 2427000
offset: 2428000
offset: 2429000
offset: 2430000
offset: 2431000
offset: 2432000
offset: 2433000
offset: 2434000
offset: 2435000
offset: 2436000
status code = 502
offset: 2437000
offset: 2438000
offset: 2439000
offset: 2440000
offset: 2441000
offset: 2442000
offset: 2443000
offset: 2444000
offset: 2445000
offset: 2446000
offset: 2447000
offset: 2448000
offset: 2449000
offset: 2450000
offset: 2451000
offset: 2452000
offset: 2453000
offset: 2454000
offset: 2455000
offset: 2456000
offset: 2457000
offset: 2458000
offset: 2459000
offset: 2460000
offset: 2461000
offset: 2462000
offset: 2463000
offset: 2464000
offset: 2465000
offset: 2466000
offset: 2467000
offset: 2468000
offset: 2469000
offset: 2470000
offset: 2471000
offset: 2472000
offset: 2473000
offset: 2474000
offset: 2475000
offset: 2476000
offset: 2477000
offset: 2478000
offset: 2479000
offset: 2480000
offset: 2481000
offset: 2482000
offset: 2483000
offset: 2484000
offset: 2485000
offset: 2486000
offset: 2487000
status code = 502
status code = 502
status code = 502
status code = 502
offset: 2488000
offset: 2489000
offset: 2490000
offset: 2491000
offset: 2492000
offset: 2493000
offset: 2494000
offset: 2495000
offset: 2496000
offset: 2497000
offset: 2498000
offset: 2499000
offset: 2500000
offset: 2501000
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-13-12d16b0bc370> in <module>()
----> 1 call_api()

<ipython-input-11-16f9863eb110> in call_api()
     46 
     47     while True:
---> 48         collect_data(100, offset)
     49         offset += 200
     50         #print(offset)

<ipython-input-11-16f9863eb110> in collect_data(count, offset)
     14 
     15     BASE_URL = 'https://etherchain.org/api/txs/{}/{}'.format(offset, count)
---> 16     r = requests.get(BASE_URL)
     17 
     18     if r.status_code != 200:

//anaconda/lib/python2.7/site-packages/requests/api.pyc in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

//anaconda/lib/python2.7/site-packages/requests/api.pyc in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

//anaconda/lib/python2.7/site-packages/requests/sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

//anaconda/lib/python2.7/site-packages/requests/sessions.pyc in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

//anaconda/lib/python2.7/site-packages/requests/adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
    438                     decode_content=False,
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )
    442 

//anaconda/lib/python2.7/site-packages/urllib3/connectionpool.pyc in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    599                                                   timeout=timeout_obj,
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 
    603             # If we're going to release the connection in ``finally:``, then

//anaconda/lib/python2.7/site-packages/urllib3/connectionpool.pyc in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    378         try:
    379             try:  # Python 2.7, use buffering of HTTP responses
--> 380                 httplib_response = conn.getresponse(buffering=True)
    381             except TypeError:  # Python 2.6 and older, Python 3
    382                 try:

//anaconda/lib/python2.7/httplib.pyc in getresponse(self, buffering)
   1119 
   1120         try:
-> 1121             response.begin()
   1122             assert response.will_close != _UNKNOWN
   1123             self.__state = _CS_IDLE

//anaconda/lib/python2.7/httplib.pyc in begin(self)
    436         # read until we get a non-100 response
    437         while True:
--> 438             version, status, reason = self._read_status()
    439             if status != CONTINUE:
    440                 break

//anaconda/lib/python2.7/httplib.pyc in _read_status(self)
    392     def _read_status(self):
    393         # Initialize with Simple-Response defaults
--> 394         line = self.fp.readline(_MAXLINE + 1)
    395         if len(line) > _MAXLINE:
    396             raise LineTooLong("header line")

//anaconda/lib/python2.7/socket.pyc in readline(self, size)
    478             while True:
    479                 try:
--> 480                     data = self._sock.recv(self._rbufsize)
    481                 except error, e:
    482                     if e.args[0] == EINTR:

//anaconda/lib/python2.7/site-packages/urllib3/contrib/pyopenssl.pyc in recv(self, *args, **kwargs)
    256     def recv(self, *args, **kwargs):
    257         try:
--> 258             data = self.connection.recv(*args, **kwargs)
    259         except OpenSSL.SSL.SysCallError as e:
    260             if self.suppress_ragged_eofs and e.args == (-1, 'Unexpected EOF'):

//anaconda/lib/python2.7/site-packages/OpenSSL/SSL.pyc in recv(self, bufsiz, flags)
   1301             result = _lib.SSL_peek(self._ssl, buf, bufsiz)
   1302         else:
-> 1303             result = _lib.SSL_read(self._ssl, buf, bufsiz)
   1304         self._raise_ssl_error(self._ssl, result)
   1305         return _ffi.buffer(buf, result)[:]

KeyboardInterrupt: 

In [ ]: