The devastating decrease in size of Napoleon's army due to battles, geography, and climate.

Clarity and intuitiveness.



In [150]:

    
# Template for a coral-coloured point-and-line chart. `data`, `xvar`, `yvar`
# and the title/label strings are placeholders to be replaced with real values.
print(
    ggplot(data, aes(xvar, yvar))
    + geom_point(color = 'coral')
    + geom_line(color='coral')
    + ggtitle('title')
    + xlab('x-label')
    + ylab('y-label')
)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-150-78d2fe86014d> in <module>()
----> 1 print ggplot(data, aes(xvar, yvar)) + geom_point(color = 'coral') + geom_line(color='coral') +       ggtitle('title') + xlab('x-label') + ylab('y-label')

NameError: name 'xvar' is not defined



In [ ]:

    
import pandas as pd
from ggplot import *

import pandas

def lineplot(hr_year_csv):
    '''
    Plot home runs per year as red points connected by a red line.

    The CSV is loaded into a pandas DataFrame, which is then handed to
    ggplot with 'yearID' on the x-axis and 'HR' on the y-axis.

    Args:
        hr_year_csv: Path (or anything else pd.read_csv accepts) of a CSV
            file with two columns -- 'HR' (the number of homerun hits) and
            'yearID' (the year in which the homeruns were hit).

    Returns:
        The ggplot object describing the chart.

    You can check out the data in the csv file at the link below:
    https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    You can read more about ggplot at the following link:
    https://github.com/yhat/ggplot/
    '''
    data = pd.read_csv(hr_year_csv)

    # aes() maps aesthetics to column *names* of the DataFrame given to
    # ggplot(), so the names are passed as strings instead of Series objects.
    gg = ggplot(data, aes('yearID', 'HR')) + \
        geom_point(color='red') + geom_line(color='red') + \
        ggtitle('homeruns by year') + xlab('year') + ylab('homeruns')
    return gg



In [ ]:

    
from pandas import *
from ggplot import *

import pandas as pd

def lineplot_compare(hr_by_team_year_sf_la_csv):
    '''
    Compare the yearly home-run totals of two teams on a single chart.

    The CSV is loaded into a pandas DataFrame and plotted with 'yearID' on
    the x-axis, 'HR' on the y-axis and one color per 'teamID'.

    Args:
        hr_by_team_year_sf_la_csv: Path (or anything else pd.read_csv
            accepts) of a CSV file with three columns: yearID, HR, and
            teamID. The data gives the total number of home runs hit each
            year by the SF Giants (teamID == 'SFN') and the LA Dodgers
            (teamID == "LAN").

    Returns:
        The ggplot object describing the chart.

    You can see the data at the link below:
    https://www.dropbox.com/s/wn43cngo2wdle2b/hr_by_team_year_sf_la.csv
    '''
    data = pd.read_csv(hr_by_team_year_sf_la_csv)

    # To differentiate between multiple categories on the same plot, color is
    # passed to aes() with the other mappings rather than to the geometry
    # functions. Points, a title and axis labels are included so the chart
    # matches the single-team line plot and can be read on its own.
    gg = ggplot(data, aes('yearID', 'HR', color='teamID')) + \
        geom_point() + geom_line() + \
        ggtitle('homeruns by year and team') + xlab('year') + ylab('homeruns')
    return gg



In [182]:

    
# Read only the first 1000 rows of the turnstile/weather CSV, then show the
# summary statistics as this cell's output.
turnstile_weather = pd.read_csv(
    "turnstile_data_master_with_weather.csv",
    nrows=1000,
)
#%matplotlib inline
turnstile_weather.describe()









    Out[182]:






  
    
      
      Unnamed: 0
      Hour
      ENTRIESn_hourly
      EXITSn_hourly
      maxpressurei
      maxdewpti
      mindewpti
      minpressurei
      meandewpti
      meanpressurei
      fog
      rain
      meanwindspdi
      mintempi
      meantempi
      maxtempi
      precipi
      thunder
    
  
  
    
      count
       1000.000000
       1000.000000
        1000.000000
        1000.000000
       1000.00
       1000
       1000
       1000.000000
       1000
       1000.000000
       1000
       1000
       1000
       1000
       1000
       1000
       1000
       1000
    
    
      mean
        499.500000
         10.563000
        1223.888000
        1059.548000
         30.31
         42
         35
         30.230000
         39
         30.270000
          0
          0
          5
         50
         60
         69
          0
          0
    
    
      std
        288.819436
          6.901715
        1932.644908
        1778.694814
          0.00
          0
          0
          0.000001
          0
          0.000001
          0
          0
          0
          0
          0
          0
          0
          0
    
    
      min
          0.000000
          0.000000
           0.000000
           0.000000
         30.31
         42
         35
         30.230000
         39
         30.270000
          0
          0
          5
         50
         60
         69
          0
          0
    
    
      25%
        249.750000
          4.000000
          57.250000
          36.250000
         30.31
         42
         35
         30.230000
         39
         30.270000
          0
          0
          5
         50
         60
         69
          0
          0
    
    
      50%
        499.500000
         12.000000
         544.000000
         426.000000
         30.31
         42
         35
         30.230000
         39
         30.270000
          0
          0
          5
         50
         60
         69
          0
          0
    
    
      75%
        749.250000
         16.000000
        1605.000000
        1302.750000
         30.31
         42
         35
         30.230000
         39
         30.270000
          0
          0
          5
         50
         60
         69
          0
          0
    
    
      max
        999.000000
         23.000000
       20362.000000
       20650.000000
         30.31
         42
         35
         30.230000
         39
         30.270000
          0
          0
          5
         50
         60
         69
          0
          0



In [172]:

    
# Preview the first rows and the summary statistics of the loaded sample.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(turnstile_weather.head())
print(turnstile_weather.describe())

# Row counts per turnstile unit and per hour of day. These are computed from
# turnstile_weather (the frame this notebook loads) instead of a global named
# `data`, which no top-level cell defines and which therefore only existed as
# leftover kernel state.
unit_counts = turnstile_weather.groupby('UNIT').size()
hour_counts = turnstile_weather.groupby('Hour').size()
hour_counts









    



   Unnamed: 0  UNIT       DATEn     TIMEn  Hour    DESCn  ENTRIESn_hourly  \
0           0  R001  2011-05-01  01:00:00     1  REGULAR                0   
1           1  R001  2011-05-01  05:00:00     5  REGULAR              217   
2           2  R001  2011-05-01  09:00:00     9  REGULAR              890   
3           3  R001  2011-05-01  13:00:00    13  REGULAR             2451   
4           4  R001  2011-05-01  17:00:00    17  REGULAR             4400   

   EXITSn_hourly  maxpressurei  maxdewpti    ...      meandewpti  \
0              0         30.31         42    ...              39   
1            553         30.31         42    ...              39   
2           1262         30.31         42    ...              39   
3           3708         30.31         42    ...              39   
4           2501         30.31         42    ...              39   

   meanpressurei  fog  rain  meanwindspdi  mintempi  meantempi  maxtempi  \
0          30.27    0     0             5        50         60        69   
1          30.27    0     0             5        50         60        69   
2          30.27    0     0             5        50         60        69   
3          30.27    0     0             5        50         60        69   
4          30.27    0     0             5        50         60        69   

   precipi  thunder  
0        0        0  
1        0        0  
2        0        0  
3        0        0  
4        0        0  

[5 rows x 22 columns]
       Unnamed: 0       Hour  ENTRIESn_hourly  EXITSn_hourly  maxpressurei  \
count    50.00000  50.000000        50.000000      50.000000         50.00   
mean     24.50000  10.000000       293.200000     263.760000         30.31   
std      14.57738   7.131419       867.688132     714.077843          0.00   
min       0.00000   0.000000         0.000000       0.000000         30.31   
25%      12.25000   4.000000         0.000000       0.000000         30.31   
50%      24.50000  12.000000         0.000000       0.500000         30.31   
75%      36.75000  16.000000         1.750000      16.000000         30.31   
max      49.00000  21.000000      4400.000000    3708.000000         30.31   

       maxdewpti  mindewpti  minpressurei  meandewpti  meanpressurei  fog  \
count         50         50         50.00          50          50.00   50   
mean          42         35         30.23          39          30.27    0   
std            0          0          0.00           0           0.00    0   
min           42         35         30.23          39          30.27    0   
25%           42         35         30.23          39          30.27    0   
50%           42         35         30.23          39          30.27    0   
75%           42         35         30.23          39          30.27    0   
max           42         35         30.23          39          30.27    0   

       rain  meanwindspdi  mintempi  meantempi  maxtempi  precipi  thunder  
count    50            50        50         50        50       50       50  
mean      0             5        50         60        69        0        0  
std       0             0         0          0         0        0        0  
min       0             5        50         60        69        0        0  
25%       0             5        50         60        69        0        0  
50%       0             5        50         60        69        0        0  
75%       0             5        50         60        69        0        0  
max       0             5        50         60        69        0        0  






    Out[172]:





Hour
0       106
1        52
2         7
3         2
4       106
5        52
6         6
7         2
8        83
9        59
10        6
11        2
12      108
13       55
14       12
15        2
16      107
17       52
18        8
19        6
20      105
21       51
22        7
23        4
dtype: int64



In [170]:

    
import pandas as pd
from ggplot import *

def plot_weather_data(turnstile_weather):
    '''
    Build a bar chart of 'ENTRIESn_hourly' against 'Hour', with both the
    fill and the outline color mapped to 'UNIT'.

    turnstile_weather is a dataframe of the MTA and weather data used in
    assignment #3; this function reads its 'Hour', 'ENTRIESn_hourly' and
    'UNIT' columns. The exercise is open-ended (scatterplots, line plots,
    histograms or something more advanced), with suggested topics such as:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    Returns the ggplot object titled 'Subway Usage'.

    ggplot documentation:
    https://pypi.python.org/pypi/ggplot/

    All columns and data points of the full data set:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Note: due to the limitation of the course's Amazon EC2 server, only
    about 1/3 of the actual data is supplied in turnstile_weather there.
    '''

    # Author's note: try with and without stat="bar"
    mapping = aes('Hour', 'ENTRIESn_hourly', fill='UNIT', color='UNIT')
    chart = ggplot(turnstile_weather, mapping)
    chart = chart + geom_bar(alpha=0.8, stat="bar")
    chart = chart + ggtitle('Subway Usage')
    chart = chart + xlab('Hour')
    chart = chart + ylab('Number of Entries')
    return chart

# Build the chart from the notebook-level turnstile_weather frame; as the
# cell's last expression, the returned ggplot object becomes the cell output.
plot_weather_data(turnstile_weather)









    












    Out[170]:





<ggplot: (372706941)>



In [168]:

    
import pandas as pd
from ggplot import *

def plot_weather_data(turnstile_weather):
    '''
    Build a bar chart of 'ENTRIESn_hourly' against 'Hour'.

    turnstile_weather is a dataframe of the MTA and weather data used in
    assignment #3; this function reads its 'Hour' and 'ENTRIESn_hourly'
    columns. The exercise is open-ended (scatterplots, line plots,
    histograms or something more advanced), with suggested topics such as:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    Returns the ggplot object titled 'Subway Usage'.

    ggplot documentation:
    https://pypi.python.org/pypi/ggplot/

    All columns and data points of the full data set:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Note: due to the limitation of the course's Amazon EC2 server, only
    about 1/3 of the actual data is supplied in turnstile_weather there.
    '''

    return (
        ggplot(turnstile_weather, aes('Hour', 'ENTRIESn_hourly'))
        + geom_bar(alpha=0.8, stat="bar")
        + ggtitle('Subway Usage')
        + xlab('Hour')
        + ylab('Number of Entries')
    )

# Build the chart from the notebook-level turnstile_weather frame; as the
# cell's last expression, the returned ggplot object becomes the cell output.
plot_weather_data(turnstile_weather)









    












    Out[168]:





<ggplot: (348109373)>



In [ ]:

    
import pandas as pd
from ggplot import *

def plot_weather_data(turnstile_weather):
    '''
    Chart hourly subway entries as bars, one x position per hour of day.

    Input: turnstile_weather, a dataframe of the MTA and weather data used
    in assignment #3. Only the 'Hour' (x) and 'ENTRIESn_hourly' (y) columns
    are referenced here.

    Output: a ggplot object with the title 'Subway Usage' and the axis
    labels 'Hour' and 'Number of Entries'.

    Background from the exercise: any visualization discussed in class
    (e.g., scatterplots, line plots, or histograms) or something more
    advanced is acceptable. Suggested things to investigate:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    More about ggplot and its capabilities:
    https://pypi.python.org/pypi/ggplot/

    Full data set with all columns and data points:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    On the course's Amazon EC2 server only about 1/3 of the actual data is
    provided in turnstile_weather.
    '''

    hourly_entries = aes('Hour', 'ENTRIESn_hourly')
    bars = ggplot(turnstile_weather, hourly_entries) + geom_bar(alpha=0.8, stat="bar")
    titled = bars + ggtitle('Subway Usage')
    labelled = titled + xlab('Hour') + ylab('Number of Entries')
    return labelled



In [180]:

    
import pandas as pd
from ggplot import *

def plot_weather_data(turnstile_weather):
    '''
    Draw a geom_histogram layer with 'UNIT' mapped to x and
    'ENTRIESn_hourly' mapped to y.

    turnstile_weather is a dataframe of the MTA and weather data used in
    Project 3. This is the second visualization exercise: it asks for a
    different chart type than the previous exercise, a different use of the
    data (e.g., weather in a histogram if the first chart was a lineplot of
    ridership and time of day), or multiple encodings in one graph.
    Suggested things to investigate and illustrate:
     * Ridership by time-of-day or day-of-week
     * How ridership varies by subway station
     * Which stations have more exits or entries at different times of day

    Returns the ggplot object titled 'Entries Per Unit'.

    NOTE(review): how geom_histogram uses the y mapping is not visible in
    this file -- confirm the rendered chart matches its title.

    ggplot documentation:
    https://pypi.python.org/pypi/ggplot/

    All columns and data points of the full data set:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Note: due to the limitation of the course's Amazon EC2 server, only
    about 1/3 of the actual data is supplied in turnstile_weather there.
    '''

    mapping = aes('UNIT', 'ENTRIESn_hourly')
    chart = ggplot(turnstile_weather, mapping)
    chart = chart + geom_histogram(alpha=0.8)
    chart = chart + ggtitle('Entries Per Unit')
    chart = chart + xlab('UNIT')
    chart = chart + ylab('Entries Per Hour')
    return chart

# Build the chart from the notebook-level turnstile_weather frame; as the
# cell's last expression, the returned ggplot object becomes the cell output.
plot_weather_data(turnstile_weather)









    












    Out[180]:





<ggplot: (373848565)>



In [185]:

    
import pandas as pd
from ggplot import *

def plot_weather_data(turnstile_weather):
    '''
    Build a bar chart of 'ENTRIESn_hourly' per 'UNIT', with the bar fill
    also mapped to 'UNIT'.

    turnstile_weather is a dataframe of the MTA and weather data used in
    Project 3; this function reads its 'UNIT' and 'ENTRIESn_hourly'
    columns. This is the second visualization exercise: it asks for a
    different chart type than the previous exercise, a different use of the
    data, or multiple encodings in one graph. Suggested things to
    investigate and illustrate:
     * Ridership by time-of-day or day-of-week
     * How ridership varies by subway station
     * Which stations have more exits or entries at different times of day

    Returns the ggplot object titled 'Entries Per Unit'.

    ggplot documentation:
    https://pypi.python.org/pypi/ggplot/

    All columns and data points of the full data set:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Note: due to the limitation of the course's Amazon EC2 server, only
    about 1/3 of the actual data is supplied in turnstile_weather there.
    '''

    return (
        ggplot(turnstile_weather, aes('UNIT', 'ENTRIESn_hourly', fill='UNIT'))
        + geom_bar(alpha=0.8, stat="bar")
        + ggtitle('Entries Per Unit')
        + xlab('UNIT')
        + ylab('Entries Per Hour')
    )

# Build the chart from the notebook-level turnstile_weather frame; as the
# cell's last expression, the returned ggplot object becomes the cell output.
plot_weather_data(turnstile_weather)









    












    Out[185]:





<ggplot: (348624113)>



In [ ]:

    
import pandas as pd
from ggplot import *

def plot_weather_data(turnstile_weather):
    '''
    Build a scatter plot of 'ENTRIESn_hourly' against 'Hour', with the
    point color mapped to 'UNIT'.

    turnstile_weather is a dataframe of the MTA and weather data used in
    Project 3; this function reads its 'Hour', 'ENTRIESn_hourly' and 'UNIT'
    columns. This is the second visualization exercise: it asks for a
    different chart type than the previous exercise, a different use of the
    data, or multiple encodings in one graph. Suggested things to
    investigate and illustrate:
     * Ridership by time-of-day or day-of-week
     * How ridership varies by subway station
     * Which stations have more exits or entries at different times of day

    Returns the ggplot object titled 'Entries Per Unit'.

    ggplot documentation:
    https://pypi.python.org/pypi/ggplot/

    All columns and data points of the full data set:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Note: due to the limitation of the course's Amazon EC2 server, only
    about 1/3 of the actual data is supplied in turnstile_weather there.
    '''

    mapping = aes('Hour', 'ENTRIESn_hourly', color='UNIT')
    chart = ggplot(turnstile_weather, mapping)
    chart = chart + geom_point(alpha=0.8)
    chart = chart + ggtitle('Entries Per Unit')
    chart = chart + xlab('Hour and Unit')
    chart = chart + ylab('Entries Per Hour')
    return chart

#plot_weather_data(turnstile_weather)

	Unnamed: 0	Hour	ENTRIESn_hourly	EXITSn_hourly	maxpressurei	maxdewpti	mindewpti	minpressurei	meandewpti	meanpressurei	fog	rain	meanwindspdi	mintempi	meantempi	maxtempi	precipi	thunder
count	1000.000000	1000.000000	1000.000000	1000.000000	1000.00	1000	1000	1000.000000	1000	1000.000000	1000	1000	1000	1000	1000	1000	1000	1000
mean	499.500000	10.563000	1223.888000	1059.548000	30.31	42	35	30.230000	39	30.270000	0	0	5	50	60	69	0	0
std	288.819436	6.901715	1932.644908	1778.694814	0.00	0	0	0.000001	0	0.000001	0	0	0	0	0	0	0	0
min	0.000000	0.000000	0.000000	0.000000	30.31	42	35	30.230000	39	30.270000	0	0	5	50	60	69	0	0
25%	249.750000	4.000000	57.250000	36.250000	30.31	42	35	30.230000	39	30.270000	0	0	5	50	60	69	0	0
50%	499.500000	12.000000	544.000000	426.000000	30.31	42	35	30.230000	39	30.270000	0	0	5	50	60	69	0	0
75%	749.250000	16.000000	1605.000000	1302.750000	30.31	42	35	30.230000	39	30.270000	0	0	5	50	60	69	0	0
max	999.000000	23.000000	20362.000000	20650.000000	30.31	42	35	30.230000	39	30.270000	0	0	5	50	60	69	0	0