In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from datetime import timedelta
In [2]:
# Load the master trip table into `df`.
df = pd.read_csv(filepath_or_buffer="DATA/babs_master/trip_master.csv")
In [3]:
# Show rows 4000-4019 (by position) as a spot check of the raw trips.
df.iloc[4000:4020]
Out[3]:
In [4]:
# Move the combined timestamp strings to "... Date Time" names; the plain
# 'Start Date' / 'End Date' labels are re-created further down as date-only
# columns.
df = df.rename(
    columns={
        'Start Date': 'Start Date Time',
        'End Date': 'End Date Time',
    }
)
In [5]:
# Split each "<date> <time>" string into separate date and time columns.
# The vectorised .str accessor replaces the per-row apply/zip. n=1 splits on
# the first space only, so a value never yields more than two parts (the
# zip(*...) unpacking would fail on a value containing extra spaces).
start_parts = df['Start Date Time'].str.split(' ', n=1, expand=True)
df['Start Date'], df['Start Time'] = start_parts[0], start_parts[1]

end_parts = df['End Date Time'].str.split(' ', n=1, expand=True)
df['End Date'], df['End Time'] = end_parts[0], end_parts[1]
In [6]:
# First five rows, to check the new date/time columns.
df.head(n=5)
Out[6]:
In [7]:
# (rows, columns) of the trips frame at this point.
df.shape
Out[7]:
In [8]:
# Load the station table; it is joined to the trips twice below
# (once for the start station, once for the end station).
df_loc = pd.read_csv(filepath_or_buffer="DATA/babs_master/station_master.csv")
In [9]:
# First five station rows.
df_loc.head(n=5)
Out[9]:
In [10]:
# Attach station attributes for the trip's start station by matching
# 'Start Station' against the station table's 'name'.
# Inner join: trips whose start station has no match in df_loc are dropped.
df_merged_1 = df.merge(
    df_loc,
    how='inner',
    left_on='Start Station',
    right_on='name',
    sort=False,
)
In [11]:
# (rows, columns) after the start-station join; comparing the row count with
# df.shape shows whether the inner join dropped any trips.
df_merged_1.shape
Out[11]:
In [12]:
# First five rows of the start-station join result.
df_merged_1.head(n=5)
Out[12]:
In [13]:
# Give the station columns from the first join a "Start ..." prefix so they
# do not collide with the same columns when the station table is joined
# again for the end station.
# NOTE(review): 'Start Latitute' looks like a misspelling of "Latitude". It is
# kept unchanged because the name is written to the exported CSV - confirm
# with downstream users before renaming.
df_merged_1 = df_merged_1.rename(
    columns={
        'lat': 'Start Latitute',
        'long': 'Start Longitude',
        'dockcount': 'Start Station Dockcount',
        'landmark': 'Start Station Landmark',
        'installation': 'Start Station Installation',
    }
)
In [14]:
# First five rows, to check the renamed start-station columns.
df_merged_1.head(n=5)
Out[14]:
In [15]:
# 'name' was the right-hand join key and repeats 'Start Station', so drop it.
# Use the `columns=` keyword: passing the axis positionally
# (`drop('name', 1)`) raises TypeError on pandas >= 2.0.
df_merged_1 = df_merged_1.drop(columns='name')
In [16]:
# Second join against the station table, this time matching 'End Station'.
# Inner join: trips whose end station has no match in df_loc are dropped.
df_merged_1 = df_merged_1.merge(
    df_loc,
    how='inner',
    left_on='End Station',
    right_on='name',
    sort=False,
)
In [21]:
# First five rows of the merged frame.
df_merged_1.head(n=5)
Out[21]:
In [18]:
# Give the station columns from the second join an "End ..." prefix.
# NOTE(review): 'End Latitute' looks like a misspelling of "Latitude". It is
# kept unchanged because the name is written to the exported CSV - confirm
# with downstream users before renaming.
df_merged_1 = df_merged_1.rename(
    columns={
        'lat': 'End Latitute',
        'long': 'End Longitude',
        'dockcount': 'End Station Dockcount',
        'landmark': 'End Station Landmark',
        'installation': 'End Station Installation',
    }
)
In [19]:
# 'name' was the right-hand join key and repeats 'End Station', so drop it.
# Use the `columns=` keyword: passing the axis positionally
# (`drop('name', 1)`) raises TypeError on pandas >= 2.0.
df_merged_1 = df_merged_1.drop(columns='name')
In [22]:
# Load the weather table and join it to the trips on
# 'Start Date' == 'Date' (inner join: trips without a matching date are dropped).
df_weather = pd.read_csv(filepath_or_buffer="DATA/babs_master/weather_master.csv")
# NOTE(review): if the weather file has more than one row per 'Date'
# (for example one per zip code), this join repeats each trip once per
# matching weather row - confirm against the data.
df_merged_1 = df_merged_1.merge(
    df_weather,
    how='inner',
    left_on='Start Date',
    right_on='Date',
    sort=False,
)
In [25]:
# List the column labels of the merged frame.
df_merged_1.columns
Out[25]:
In [24]:
# Remove the 'zip' column from the merged frame.
# Use the `columns=` keyword: passing the axis positionally
# (`drop('zip', 1)`) raises TypeError on pandas >= 2.0.
df_merged_1 = df_merged_1.drop(columns='zip')
In [26]:
# 'Date' was the weather-side join key and repeats 'Start Date', so drop it.
# Use the `columns=` keyword: passing the axis positionally
# (`drop('Date', 1)`) raises TypeError on pandas >= 2.0.
df_merged_1 = df_merged_1.drop(columns='Date')
In [27]:
# Rebind `df` to the fully merged frame. This is an alias, not a copy:
# columns added to `df` below also appear on `df_merged_1`.
df = df_merged_1
In [28]:
# Derive calendar features from the trip start timestamp.
#
# The timestamps are parsed once for the whole column (vectorised) instead of
# calling datetime.strptime row by row in a Python loop. The format string is
# unchanged, so a value that does not match it still raises ValueError.
start_ts = pd.to_datetime(df['Start Date Time'], format="%m/%d/%Y %H:%M")

df['month'] = start_ts.dt.month
df['day_of_month'] = start_ts.dt.day
df['hour_of_day'] = start_ts.dt.hour
# ISO weekday numbering (Monday=1 ... Sunday=7), matching what
# datetime.isoweekday() returns. pandas' dayofweek counts from Monday=0,
# hence the +1.
df['day_of_week'] = start_ts.dt.dayofweek + 1
# 1 for Saturday/Sunday, 0 for Monday-Friday.
df['weekend'] = (df['day_of_week'] >= 6).astype(int)
# 'Duration' divided by 60 - assumes 'Duration' is in seconds; TODO confirm.
df['Duration_in_mins'] = df['Duration'] / 60
In [ ]:
# (rows, columns) of the enriched frame before it is written out.
df.shape
In [30]:
# Write the merged, feature-enriched trips to disk without the row index.
df.to_csv(path_or_buf="DATA/babs_master/merged_master.csv", index=False)
In [29]:
# First five rows of the final frame, including the derived columns.
df.head(n=5)
Out[29]:
In [ ]: