In [18]:
import numpy as np
import pandas as pd
# plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the trip table from the CSV that sits next to the notebook.
trip = pd.read_csv(filepath_or_buffer='trip.csv')
# Loading of the weather table is left disabled in this cell:
#wed = pd.read_csv('weather.csv')
In [ ]:
# Peek at two randomly chosen rows of the trip table.
# (trip.dtypes can be displayed instead to inspect the column types.)
trip.sample(n=2)
In [ ]:
# Sum of the `duration` column over every row of the trip table.
# The result is stored under its own name: assigning it back to `trip`
# replaced the DataFrame with a scalar and broke every later cell that
# still expects `trip` to be a table.
total_duration = trip['duration'].sum()
total_duration
In [ ]:
# Line plot of how many non-null `duration` entries each bike_id has.
# (Author's note: to be double-checked.)
trip.groupby('bike_id')['duration'].count().plot(figsize=(14, 4));
In [ ]:
# The five rows with the largest `duration`.
# (Author's note: still to decide how to plot this; see the previous plot.)
trip.sort_values(by='duration', ascending=False).iloc[:5]
In [ ]:
# Mean trip duration for each subscription type.
#
# Open questions left by the author: which bikes have the shortest durations,
# and are trips getting longer or shorter? This plot only compares
# subscription types; it has no time axis.
#
# `duration` is selected before aggregating so the mean is taken over that
# column only. Averaging the whole frame raises TypeError on non-numeric
# columns in pandas >= 2.0 and does needless work on older versions.
trip.groupby('subscription_type')['duration'].mean().plot(linewidth=2, figsize=(12, 8));
In [ ]:
# Ten longest trips whose duration is strictly greater than 722236.
# NOTE(review): the origin of the 722236 cut-off is not shown in the notebook — confirm.
(
    trip[trip['duration'] > 722236][['id', 'bike_id', 'duration']]
    .sort_values('duration', ascending=False)
    .iloc[:10]
)
In [ ]:
# Ten longest trips whose duration is strictly less than 722236.
# NOTE(review): the origin of the 722236 cut-off is not shown in the notebook — confirm.
(
    trip[trip['duration'] < 722236][['id', 'bike_id', 'duration']]
    .sort_values('duration', ascending=False)
    .iloc[:10]
)
In [ ]:
# Histogram of the `duration` column (default number of bins).
trip.loc[:, 'duration'].hist(figsize=(16, 8));
In [ ]:
# Histogram of how often each distinct duration value occurs.
# value_counts() gives, for every distinct duration, the number of rows with
# that value; the histogram then bins those counts into 20 buckets.
# The counts get their own name so the `trip` table is not overwritten.
duration_counts = trip['duration'].value_counts()
duration_counts.hist(figsize=(12, 8), bins=20);
In [ ]:
# Number of rows in `trip` per bike_id, keeping only bikes with more than 5000.
# Separate names are used so the `trip` table is not overwritten, and the
# filtered counts are displayed as the cell output.
bike_trip_counts = trip['bike_id'].value_counts()
busy_bikes = bike_trip_counts[bike_trip_counts > 5000]
# Optional bar chart (left disabled, as in the original cell):
#busy_bikes.plot(kind='bar',rot=90,figsize=(12,8));
busy_bikes
In [ ]:
# Read status.csv piece by piece instead of in a single read_csv call
# (the one-shot alternative, pd.read_csv('status.csv', low_memory=False), is left disabled).
# With chunksize=500 the reader yields DataFrames of at most 500 rows each.
stat = pd.read_csv('status.csv', chunksize=500, iterator=True)
# Materialise the chunks and stitch them into one DataFrame with a fresh 0..n-1 index.
df = pd.concat(list(stat), ignore_index=True)
df.sample(3)
In [ ]:
# Join the trip table to the weather table on the trip start date.
#
# weather.csv is read in this cell because `wed` is not assigned anywhere
# earlier in the visible notebook (its only load is commented out), so the
# merge raised NameError.
wed = pd.read_csv('weather.csv')

# Idea left disabled by the author: number of trips sharing the same start_date
#test = trip.groupby(['start_date'])['id'].count()
#test.head()

# Outer join: rows from either table without a match on the key are kept.
# NOTE(review): assumes trip['start_date'] and wed['date'] store values in the
# same format; if start_date also carries a time of day the keys will not
# match — confirm against the data.
tw = pd.merge(trip, wed, left_on='start_date', right_on='date', how='outer')
tw.head()
In [ ]:
# The five rows that come last when ordering by `start_date`.
# (Author's notes: still to decide how to plot this, and how to get the
# rows at the opposite end of the ordering.)
# NOTE(review): if start_date is stored as text this ordering is
# lexicographic, not chronological — TODO confirm the column dtype.
trip.sort_values(by='start_date', ascending=False).iloc[:5]
In [ ]:
# Trip table ordered from the longest to the shortest duration.
#
# Disabled ideas kept from the original cell (trips per start station):
#trip['start_station_name'].value_counts()[0:19].plot(kind='bar',figsize=(14,4));
#TopOstations = trip['start_station_name'].value_counts()[0:29].index.tolist()
#
# sort_values is the supported way to order rows by a column and is what the
# rest of this notebook uses; sort_index(by=...) is a deprecated spelling that
# newer pandas versions reject.
trip.sort_values(by='duration', ascending=False)
In [ ]:
In [ ]:
# Heatmap of group sizes: one row per 'cluster' value, one column per 'Operator' value.
# Pivot pattern used below: df.pivot(index=..., columns=..., values=...)
#
# NOTE(review): `TopOperators` and the 'Operator' / 'cluster' columns are not
# defined anywhere in the visible notebook — confirm they exist before running.
#
# The long-format table is now used under the same name it is assigned to;
# previously it was stored as `tp1` but read back as `ap1`, which raised NameError.
tp1 = (
    trip[trip['start_station_name'].isin(TopOperators)]
    .groupby(['Operator', 'cluster'])
    .size()
    .reset_index()
)
tp1.columns = ['Operator', 'cluster', 'size']
# Cluster/operator pairs with no rows become 0 instead of NaN.
ap1 = tp1.pivot(index='cluster', columns='Operator', values='size').fillna(0)
fig, ax = plt.subplots(figsize=(14, 16))  # figure size in inches
sns.heatmap(ap1, annot=False, linewidths=.5, ax=ax);
In [ ]:
# Plot the `duration` column grouped by bike_id.
# NOTE(review): this plots the grouped column directly rather than an
# aggregate per bike — confirm that is intended.
trip.groupby(by='bike_id')['duration'].plot(figsize=(14, 4));
In [19]:
# Visualise the number of trips (non-null `duration` rows) for each bike_id.
# The same counts could be kept in a variable instead of plotted directly:
#trip2 = trip.groupby('bike_id').count()['duration']
trip.groupby(by='bike_id')['duration'].count().plot(figsize=(14, 4));
In [ ]:
# Trips per bike: count of non-null `duration` rows for each bike_id.
# Computed in this cell because the only other assignment of `trip2` in the
# visible notebook is commented out, so displaying it raised NameError.
trip2 = trip.groupby('bike_id')['duration'].count()
trip2
In [ ]:
# All rows of the trip table that belong to bike_id 34.
trip3 = trip.loc[trip['bike_id'] == 34]
trip3
In [20]:
# Visualise the summed trip duration (in seconds) for each bike_id.
trip.groupby('bike_id')['duration'].sum().plot(figsize=(14, 4));
In [21]:
# Convert the duration from seconds to minutes.
# The column is overwritten in place, so running this cell a second time
# divides by 60 again.
trip['duration'] = trip['duration'] / 60
trip.head(n=10)
Out[21]:
In [23]:
# Visualise the summed trip duration for each bike_id.
# `duration` has been divided by 60 in an earlier cell, so the totals are in minutes here.
trip.groupby('bike_id')['duration'].sum().plot(figsize=(14, 4));
In [24]:
# Convert the duration to hours.
# `duration` was already divided by 60 in an earlier cell (seconds -> minutes),
# so it has to be divided by 60 once more (minutes -> hours). Dividing by 3600
# at this point, as the cell used to do, would shrink the values 60x too far.
# The column is overwritten in place, so re-running this cell divides again.
trip['duration'] = trip['duration'] / 60
# Visualise the summed trip duration (in hours) for each bike_id.
trip.groupby('bike_id')['duration'].sum().plot(figsize=(14, 4));
In [27]:
# Number of rows in `trip` for each start station name, most frequent first.
conteoDeViajesSegunStartStation = trip.loc[:, 'start_station_name'].value_counts()
conteoDeViajesSegunStartStation
Out[27]:
In [28]:
# Visualise the number of trips (non-null `duration` rows) per start_station_name as a line plot.
trip.groupby(by='start_station_name')['duration'].count().plot(figsize=(14, 4));
In [29]:
# Visualise the number of trips (non-null `duration` rows) per start_station_id as a line plot.
trip.groupby(by='start_station_id')['duration'].count().plot(figsize=(14, 4));
In [31]:
# Bar chart of the number of trips (non-null `duration` rows) per start_station_name.
# The plot kind is passed by keyword: Series.plot() rejects positional
# arguments with TypeError in pandas >= 1.0.
trip.groupby('start_station_name')['duration'].count().plot(kind='bar', figsize=(14, 4));
In [32]:
# Bar chart of the number of trips (non-null `duration` rows) per end_station_name.
# The plot kind is passed by keyword: Series.plot() rejects positional
# arguments with TypeError in pandas >= 1.0.
trip.groupby('end_station_name')['duration'].count().plot(kind='bar', figsize=(14, 4));
In [ ]: