In [2]:
%pylab inline
import requests, zipfile
import StringIO
import datetime
import pandas as pd
import numpy as np
from workflow.data import *
import matplotlib.pyplot as plt
import seaborn as sns
In [7]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
plt.tight_layout
Out[7]:
In [3]:
username = 'psam071'
host = 'localhost'
dbname = 'citibike'
db = create_engine('postgres://%s@%s/%s' % (username,host,dbname))
con = None
con = psycopg2.connect(database = dbname, user = username, host = host)
In [1]:
query = """
SELECT * FROM stations
WHERE long > -74.025;
"""
df = pd.read_sql_query(query, con)
df.head()
In [5]:
neighborhoods = pd.read_csv('nyc_neighborhoods.csv')
hoods = neighborhoods
# find the centroids in hoods that are closest to the station points in df
X = df[['long', 'lat']]
def closest_node(node, nodes):
nodes = np.asarray(nodes)
deltas = nodes - node
dist_2 = np.einsum('ij,ij->i', deltas, deltas)
return np.argmin(dist_2)
a = hoods[['longitude', 'latitude']]
node_labels = []
# return a list of indices of hoods to pass on hoods
for i in xrange(X.shape[0]):
node_labels.append(closest_node([X.iloc[i,:]], a))
n_hoods = df[['id', 'neighborhood', 'borough']]
n_hoods = n_hoods.set_index('id')
n_hoods.head()
n_hoods.to_sql('neighborhoods', db)
Out[5]:
In [10]:
df['neighborhood'] = hoods.iloc[node_labels].reset_index().neighborhood
df['borough'] = hoods.iloc[node_labels].reset_index().Borough
In [18]:
fg = sns.FacetGrid(data = df, hue = 'neighborhood', aspect = 0.7, size = 12)
fg.map(scatter, 'long', 'lat').add_legend()
Out[18]:
In [ ]:
n_hoods = df[['id', 'neighborhood', 'borough']]
n_hoods = n_hoods.set_index('id')
n_hoods.head()
n_hoods.to_sql('neighborhoods', db)
In [ ]:
In [ ]:
In [ ]:
In [20]:
# trips_size = 0
# for year in xrange(2015,2018):
# for month in xrange(1,13):
# print year, month
# try:
# size = pd.read_csv('tripdata/{}{:02}-citibike-tripdata.csv'.format(year,month)).shape[0]
# trips_size += size
# except IOError:
# print '\t{}:{:02} does not exist'.format(year,month)
def calc_rows(path, year_range, delim):
# find the number of total rows of a dataset divided into csv files in a single folder
# the path name should have formatting for year/month
num_rows = 0
for year in xrange(year_range[0],year_range[1]+1):
for month in xrange(1,13):
print year, month
try:
size = pd.read_csv(path.format(year,month),
delimiter = delim,
error_bad_lines = False).shape[0]
num_rows += size
except IOError:
print '\t{}:{:02} does not exist'.format(year,month)
return num_rows
stations_path = 'stationdata/{}{:02}-bikeshare_nyc_raw.csv'
rowsize = calc_rows(stations_path, [2015,2017], '\t')
In [10]:
trips_size
Out[10]:
In [21]:
rowsize
Out[21]: