In [1]:
# following http://martindurant.github.io/blog/moving-to-google-compute-and-storage/

import s3fs
s3 = s3fs.S3FileSystem(anon=True)

In [2]:
print(sum(size for key, size in s3.du('dask-data/nyc-taxi/2015/').items() if 'green_' in key) / 2**30)
s3.du('dask-data/nyc-taxi/2015/')


0.0
Out[2]:
{'dask-data/nyc-taxi/2015/parquet': 0,
 'dask-data/nyc-taxi/2015/parquet.gz': 0,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv': 1985964692,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.parq': 0,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-02.csv': 1945357622,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-03.csv': 2087971794,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-04.csv': 2046225765,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-05.csv': 2061869121,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-06.csv': 1932049357,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-07.csv': 1812530041,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-08.csv': 1744852237,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-09.csv': 1760412710,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-10.csv': 1931460927,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-11.csv': 1773468989,
 'dask-data/nyc-taxi/2015/yellow_tripdata_2015-12.csv': 1796283025}

In [3]:
print(s3.head('dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv').decode())


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896484375,40.750110626220703,1,N,-73.974784851074219,40.750617980957031,1,12,1,0.5,3.25,0,0.3,17.05
1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.30,-74.00164794921875,40.7242431640625,1,N,-73.994415283203125,40.759109497070313,1,14.5,0.5,0.5,2,0,0.3,17.8
1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.80,-73.963340759277344,40.802787780761719,1,N,-73.951820373535156,40.824413299560547,2,9.5,0.5,0.5,0,0,0.3,10.8
1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,.50,-74.009086608886719,40.713817596435547,1,N,-74.004325866699219,40.719985961914063,2,3.5,0.5,0.5,0,0,0.3,4.8
1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.00,-73.971176147460938,40.762428283691406,1,N,-74.004180908203125,40.

In [4]:
# prescribe the dataframe with some typical CSV ingest options
import dask.dataframe as dd
dtype={'Store_and_fwd_flag': 'category', 'Passenger_count': 'uint8', 'Payment_type': 'category', 'Trip_type': 'float32', 'RateCodeID': 'category'}
dtype.update({f: 'float32' for f in ['Fare_aamount', 'Extra', 'MTA_tax', 'Tip_amount', 'Tolls_amount', 'Ehail_fee', 'Total_amount', 'Trip_distance', 'Distance_between_service']})
df = dd.read_csv('s3://dask-data/nyc-taxi/2015/yellow*csv', dtype=dtype,
                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
                 storage_options={'anon': True}, blocksize=1000000000)
# blocksize to control number of rows per partition

In [5]:
df.dtypes


Out[5]:
VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   int64
trip_distance                   float64
pickup_longitude                float64
pickup_latitude                 float64
RateCodeID                     category
store_and_fwd_flag               object
dropoff_longitude               float64
dropoff_latitude                float64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
dtype: object

In [6]:
import gcsfs
import json
token = json.load(open('application_default_credentials.json'))
gcs = gcsfs.GCSFileSystem(project='polar-project-784', token=token)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-6-dc924ffc9bb7> in <module>()
      1 import gcsfs
      2 import json
----> 3 token = json.load(open('application_default_credentials.json'))
      4 gcs = gcsfs.GCSFileSystem(project='polar-project-784', token=token)

FileNotFoundError: [Errno 2] No such file or directory: 'application_default_credentials.json'

In [ ]: