Exploring a dataset in the Notebook

Provenance of the data

Downloading and loading a dataset



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
# Change the kernel's working directory so the relative paths used below
# (e.g. 'data/...') resolve against this folder.
# NOTE(review): this path depends on the local home-directory layout — adjust as needed.
%cd ~/minibook/chapter2/



In [3]:

    
!wget https://raw.githubusercontent.com/ipython-books/minibook-2nd-data/master/nyc_taxi.zip
!unzip nyc_taxi.zip



In [4]:

    
# List the contents of the data/ directory (relative to the current working directory).
%ls data









    Out[4]:





nyc_data.csv  nyc_fare.csv  [...]



In [5]:

    
# Relative paths (resolved against the current working directory) of the two CSV files.
data_filename, fare_filename = 'data/nyc_data.csv', 'data/nyc_fare.csv'



In [6]:

    
# Load both CSV files into DataFrames, asking pandas to parse the listed
# timestamp columns as datetimes instead of leaving them as plain strings.
data = pd.read_csv(
    data_filename,
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
fare = pd.read_csv(
    fare_filename,
    parse_dates=['pickup_datetime'],
)



In [7]:

    
# Peek at the first three rows of the trip table.
data.head(3)

Making plots with matplotlib



In [8]:

    
# Show the column labels available in the trip table.
data.columns









    Out[8]:





Index(['medallion',
       ...
       'pickup_datetime',
       'dropoff_datetime',
       'passenger_count',
       'trip_time_in_secs',
       'trip_distance',
       'pickup_longitude',
       'pickup_latitude',
       'dropoff_longitude',
       'dropoff_latitude'], dtype='object')



In [9]:

    
# Pull the pickup / dropoff coordinate columns out as individual Series.
p_lng = data['pickup_longitude']
p_lat = data['pickup_latitude']
d_lng = data['dropoff_longitude']
d_lat = data['dropoff_latitude']



In [10]:

    
# Inspect the pickup-longitude Series.
p_lng









    Out[10]:





0        -73.955925
1        -74.005501
...
846943   -73.978477
846944   -73.987206
Name: pickup_longitude, Length: 846945, dtype: float64



In [11]:

    
def lat_lng_to_pixels(lat, lng):
    """Project latitude/longitude given in degrees onto plane coordinates.

    Longitude is mapped linearly so that -180 gives x = 0 and +180 gives
    x = 100. Latitude goes through a Mercator-style log-tangent transform,
    then is shifted and scaled so that the equator lands at y = -50 and y
    grows with latitude.

    Only arithmetic operators and NumPy ufuncs are used, so scalars, NumPy
    arrays and pandas Series are all handled element-wise.

    Parameters
    ----------
    lat : latitude(s) in degrees.
    lng : longitude(s) in degrees.

    Returns
    -------
    tuple
        ``(x, y)``, in that order, with the same shape as the inputs.
    """
    # Degrees -> radians.
    phi = lat * np.pi / 180.0
    # Mercator stretch: ln(tan(pi/4 + phi/2)).
    mercator = np.log(np.tan((phi + np.pi / 2.0) / 2.0))
    # Vertical coordinate: shift by pi and normalise by the 2*pi span.
    y = 100 * (mercator - np.pi) / (2.0 * np.pi)
    # Horizontal coordinate: linear in longitude.
    x = 100 * (lng + 180.0) / 360.0
    return (x, y)



In [12]:

    
# Project the pickup coordinates: px is derived from the longitudes, py from the latitudes.
px, py = lat_lng_to_pixels(p_lat, p_lng)



In [13]:

    
# Inspect the projected x coordinates.
px









    Out[13]:





0         29.456688
1         29.442916
...
846943    29.450423
846944    29.447998
Name: pickup_longitude, dtype: float64



In [14]:

    
# First attempt: scatter plot of every pickup position with matplotlib's default marker settings.
plt.scatter(px, py)



In [15]:

    
# Redraw the pickup scatter plot with settings suited to a very dense point cloud.
plt.figure(figsize=(8, 6))
# Very small (s=.1), almost transparent (alpha=.03) markers, so that overlapping
# points build up visible density instead of merging into a solid blob.
plt.scatter(px, py, s=.1, alpha=.03)
# Use the same scale on both axes.
plt.axis('equal')
# Restrict the view to a small window of the projected coordinates.
plt.xlim(29.40, 29.55)
plt.ylim(-37.63, -37.54)
# Hide the axis lines, ticks and labels.
plt.axis('off')

Descriptive statistics with pandas and seaborn



In [16]:

    
# Number of non-missing values and the extent (min, max) of the projected x coordinate.
px.count(), px.min(), px.max()









    Out[16]:





(846945, 29.417137499999995, 29.714313055555561)



In [17]:

    
# Central tendency (mean, median) and spread (standard deviation) of px.
px.mean(), px.median(), px.std()









    Out[17]:





(29.451345807768575, 29.449418333333337, 0.0097616942794720614)



In [18]:

    
# Install seaborn with conda; -q reduces the output and -y skips the confirmation prompt.
# NOTE(review): `!` runs the command in a shell, which may not target the same
# environment as the running kernel — confirm before relying on it.
!conda install seaborn -q -y



In [19]:

    
# Import seaborn and display the version that is installed.
import seaborn as sns
sns.__version__









    Out[19]:





'0.6.0'



In [20]:

    
# Histogram of trip distances restricted to the 0-10 range; the 100 evenly
# spaced edges produced by linspace define 99 bins.
data['trip_distance'].hist(bins=np.linspace(0.0, 10.0, num=100))