In [62]:

    
import pandas as pd
import numpy as np

%matplotlib inline

Pandas

DataFrame
Series



In [63]:

    
sq = pd.Series({'row1': 'row1 col a', 'row 2': 'row2 col a'})
sq









    Out[63]:





row 2    row2 col a
row1     row1 col a
dtype: object



In [64]:

    
sq.index









    Out[64]:





Index(['row 2', 'row1'], dtype='object')



In [72]:

    
df = pd.DataFrame(
    {
     'column_a': {'row1': 'row1 col a', 'row 2': 'row2 col a'}, 
     'column_b': {'row1': 'row1 col b', 'row 2': 'row2 col b'}, 
    })
df









    Out[72]:







  
    
      
      column_a
      column_b
    
  
  
    
      row 2
      row2 col a
      row2 col b
    
    
      row1
      row1 col a
      row1 col b



In [73]:

    
df.index









    Out[73]:





Index(['row 2', 'row1'], dtype='object')



In [74]:

    
df.columns









    Out[74]:





Index(['column_a', 'column_b'], dtype='object')



In [75]:

    
df.columns = ['new_column_a', 'new_column_b']
df









    Out[75]:







  
    
      
      new_column_a
      new_column_b
    
  
  
    
      row 2
      row2 col a
      row2 col b
    
    
      row1
      row1 col a
      row1 col b



In [76]:

    
print(type(df.new_column_a))
df.new_column_a









    



<class 'pandas.core.series.Series'>






    Out[76]:





row 2    row2 col a
row1     row1 col a
Name: new_column_a, dtype: object



In [9]:

    
type(df.new_column_a)









    Out[9]:





pandas.core.series.Series



In [77]:

    
print(type(df.new_column_a.values))
df.new_column_a.values









    



<class 'numpy.ndarray'>






    Out[77]:





array(['row2 col a', 'row1 col a'], dtype=object)

Read data

Let's read data from train.csv file into memory (assign to df variable in our case)



In [78]:

    
df = pd.read_csv('train.csv')

Questions:

How many objects (rows) and features (columns) are there?
What the name and type of features (columns)?
Are there missing values?
How many memory is use for keep this data in RAM?

Let's use .info() to find the answer



In [79]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.6+ KB

RangeIndex: 10886 entries, 0 to 10885 => there're 10886 rows (objects)
Data columns (total 12 columns): => there're 12 columns (features)
dtypes: float64(3), int64(8), object(1) => three types (float, int, object)
memory usage: 1020.6+ KB => use about 1MB
There are not missing data (because each column has the same number non-missing values)



In [80]:

    
print("count samples & features: ", df.shape)
print("Are there missing values: ", df.isnull().any().any())









    



count samples & features:  (10886, 12)
Are there missing values:  False

Questions

How data looks like?
Are there categorical variables?
The categorical variables have high or low cardinality (how many unique values they have)?
Can we optimize memory usage?



In [87]:

    
df.head(10)









    Out[87]:







  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.84
      14.395
      81
      0.0000
      3
      13
      16
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0000
      8
      32
      40
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0000
      5
      27
      32
    
    
      3
      2011-01-01 03:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0000
      3
      10
      13
    
    
      4
      2011-01-01 04:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0000
      0
      1
      1
    
    
      5
      2011-01-01 05:00:00
      1
      0
      0
      2
      9.84
      12.880
      75
      6.0032
      0
      1
      1
    
    
      6
      2011-01-01 06:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0000
      2
      0
      2
    
    
      7
      2011-01-01 07:00:00
      1
      0
      0
      1
      8.20
      12.880
      86
      0.0000
      1
      2
      3
    
    
      8
      2011-01-01 08:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0000
      1
      7
      8
    
    
      9
      2011-01-01 09:00:00
      1
      0
      0
      1
      13.12
      17.425
      76
      0.0000
      8
      6
      14



In [88]:

    
df.season.unique()
#df[''].unique()









    Out[88]:





array([1, 2, 3, 4])



In [92]:

    
df.season.nunique()









    Out[92]:





4



In [93]:

    
df.columns









    Out[93]:





Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')



In [95]:

    
for column in df.columns:
    print(column, df[column].nunique())









    



datetime 10886
season 4
holiday 2
workingday 2
weather 4
temp 49
atemp 60
humidity 89
windspeed 28
casual 309
registered 731
count 822

Categorical variables:

season: 4 unique values
holiday: 2 (binary)
workingday: 2 (binary)
weather: 4



In [97]:

    
df.holiday.unique()









    Out[97]:





array([0, 1])



In [96]:

    
df[ ['holiday'] ].info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 1 columns):
holiday    10886 non-null int64
dtypes: int64(1)
memory usage: 85.1 KB



In [98]:

    
df['holiday'] = df['holiday'].astype('int8')
df[ ['holiday'] ].info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 1 columns):
holiday    10886 non-null int8
dtypes: int8(1)
memory usage: 10.7 KB



In [102]:

    
def optimize_memory(df):
    for cat_var in ['holiday', 'weather', 'season', 'workingday']:
        df[cat_var] = df[cat_var].astype('int8')

    for float_var in ['temp', 'atemp', 'windspeed']:
        df[float_var] = df[float_var].astype('float16')


    for int_var in ['casual', 'registered', 'count']:
        df[int_var] = df[int_var].astype('int16')
        
    return df

df = optimize_memory(df)
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int8
holiday       10886 non-null int8
workingday    10886 non-null int8
weather       10886 non-null int8
temp          10886 non-null float16
atemp         10886 non-null float16
humidity      10886 non-null int64
windspeed     10886 non-null float16
casual        10886 non-null int16
registered    10886 non-null int16
count         10886 non-null int16
dtypes: float16(3), int16(3), int64(1), int8(4), object(1)
memory usage: 340.3+ KB

Working with date



In [103]:

    
df['datetime'] = pd.to_datetime(df['datetime'])
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null datetime64[ns]
season        10886 non-null int8
holiday       10886 non-null int8
workingday    10886 non-null int8
weather       10886 non-null int8
temp          10886 non-null float16
atemp         10886 non-null float16
humidity      10886 non-null int64
windspeed     10886 non-null float16
casual        10886 non-null int16
registered    10886 non-null int16
count         10886 non-null int16
dtypes: datetime64[ns](1), float16(3), int16(3), int64(1), int8(4)
memory usage: 340.3 KB



In [105]:

    
df = pd.read_csv('train.csv', parse_dates=['datetime'])

df = optimize_memory(df)
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null datetime64[ns]
season        10886 non-null int8
holiday       10886 non-null int8
workingday    10886 non-null int8
weather       10886 non-null int8
temp          10886 non-null float16
atemp         10886 non-null float16
humidity      10886 non-null int64
windspeed     10886 non-null float16
casual        10886 non-null int16
registered    10886 non-null int16
count         10886 non-null int16
dtypes: datetime64[ns](1), float16(3), int16(3), int64(1), int8(4)
memory usage: 340.3 KB

Understand Data

https://www.kaggle.com/c/bike-sharing-demand/data

datetime - hourly date + timestamp
season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
holiday - whether the day is considered a holiday
workingday - whether the day is neither a weekend nor holiday
weather -
1: Clear, Few clouds, Partly cloudy, Partly cloudy
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals



In [107]:

    
df.head()









    Out[107]:







  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.843750
      14.398438
      81
      0.0
      3
      13
      16
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.023438
      13.632812
      80
      0.0
      8
      32
      40
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.023438
      13.632812
      80
      0.0
      5
      27
      32
    
    
      3
      2011-01-01 03:00:00
      1
      0
      0
      1
      9.843750
      14.398438
      75
      0.0
      3
      10
      13
    
    
      4
      2011-01-01 04:00:00
      1
      0
      0
      1
      9.843750
      14.398438
      75
      0.0
      0
      1
      1

Questions:

What is target variable (should be predicted)?
What the difference between count, registered and casual?



In [110]:

    
df['count'].plot(figsize=(20, 10));



In [111]:

    
df['casual'].plot()









    Out[111]:





<matplotlib.axes._subplots.AxesSubplot at 0x1174e8860>



In [113]:

    
df['registered'].plot()









    Out[113]:





<matplotlib.axes._subplots.AxesSubplot at 0x118183320>



In [118]:

    
(df['count'] == df['casual'] + df['registered']).all()









    Out[118]:





True

Extract day, month, year... from datetime



In [119]:

    
df.datetime.map(lambda x: x.day)









    Out[119]:





0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        2
25        2
26        2
27        2
28        2
29        2
         ..
10856    18
10857    18
10858    18
10859    18
10860    18
10861    18
10862    19
10863    19
10864    19
10865    19
10866    19
10867    19
10868    19
10869    19
10870    19
10871    19
10872    19
10873    19
10874    19
10875    19
10876    19
10877    19
10878    19
10879    19
10880    19
10881    19
10882    19
10883    19
10884    19
10885    19
Name: datetime, Length: 10886, dtype: int64



In [120]:

    
df.datetime.dt.hour









    Out[120]:





0         0
1         1
2         2
3         3
4         4
5         5
6         6
7         7
8         8
9         9
10       10
11       11
12       12
13       13
14       14
15       15
16       16
17       17
18       18
19       19
20       20
21       21
22       22
23       23
24        0
25        1
26        2
27        3
28        4
29        6
         ..
10856    18
10857    19
10858    20
10859    21
10860    22
10861    23
10862     0
10863     1
10864     2
10865     3
10866     4
10867     5
10868     6
10869     7
10870     8
10871     9
10872    10
10873    11
10874    12
10875    13
10876    14
10877    15
10878    16
10879    17
10880    18
10881    19
10882    20
10883    21
10884    22
10885    23
Name: datetime, Length: 10886, dtype: int64



In [122]:

    
def plot_by_hour(data, year=None, agg='sum'):
    data['hour'] = data.datetime.dt.hour
    dd = data[ data.datetime.dt.year == year ] if year else data
    
    by_hour = dd.groupby(['hour', 'workingday'])['count'].agg(agg).unstack()
    return by_hour.plot(kind='bar', ylim=(0, 80000), figsize=(15,5), width=0.9, title="Year = {0}".format(year))


plot_by_hour(df, year=2011)
plot_by_hour(df, year=2012);



In [123]:

    
def plot_by_year(data, agg_attr, title):
    data['year'] = data.datetime.dt.year
    data['month'] = data.datetime.dt.month
    data['hour'] = data.datetime.dt.hour
    
    by_year = data.groupby([agg_attr, 'year'])['count'].agg('sum').unstack()
    return by_year.plot(kind='bar', figsize=(15,5), width=0.9, title=title)


plot_by_year(df, 'month', "Rent bikes per month in 2011 and 2012")
plot_by_year(df, 'hour', "Rent bikes per hour in 2011 and 2012");



In [124]:

    
df[ ['count', 'year'] ].boxplot(by="year", figsize=(15, 6));



In [125]:

    
for year in [2011, 2012]:
    for workingday in [0, 1]:
        dd = df[ (df.datetime.dt.year == year) | (df.workingday == workingday) ]
        dd[ ['count', 'month'] ].boxplot(by="month", figsize=(15, 6));



In [ ]:

Mapping

Weather

Clear, Few clouds, Partly cloudy, Partly cloudy
Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog



In [126]:

    
weather = {1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Heavy Rain'}
df['weather_label'] = df.weather.map(lambda x: weather[x])

df['weather_label'].unique()









    Out[126]:





array(['Clear', 'Mist', 'Light Snow', 'Heavy Rain'], dtype=object)

Apply



In [37]:

    
df[ ['weather', 'season'] ].apply(lambda x: 'weather-{0}, season-{1}'.format(x['weather'], x['season']), axis=1).head()









    Out[37]:





0    weather-1, season-1
1    weather-1, season-1
2    weather-1, season-1
3    weather-1, season-1
4    weather-1, season-1
dtype: object

Value counts



In [127]:

    
df.year = df.datetime.dt.year

df['year'].value_counts()









    Out[127]:





2012    5464
2011    5422
Name: year, dtype: int64



In [128]:

    
df['month'].value_counts()









    Out[128]:





7     912
6     912
5     912
12    912
8     912
11    911
10    911
4     909
9     909
3     901
2     901
1     884
Name: month, dtype: int64

Group



In [129]:

    
df.groupby('year')['month'].value_counts()









    Out[129]:





year  month
2011  5        456
      6        456
      7        456
      8        456
      11       456
      12       456
      4        455
      10       455
      9        453
      2        446
      3        446
      1        431
2012  5        456
      6        456
      7        456
      8        456
      9        456
      10       456
      12       456
      2        455
      3        455
      11       455
      4        454
      1        453
Name: month, dtype: int64

Aggregation



In [131]:

    
df.groupby('year')['count'].min()









    Out[131]:





year
2011    1
2012    1
Name: count, dtype: int16



In [132]:

    
df.groupby('year')['count'].max()









    Out[132]:





year
2011    638
2012    977
Name: count, dtype: int16



In [61]:

    
df.groupby('year')['count'].agg(np.max)









    Out[61]:





year
2011    638
2012    977
Name: count, dtype: int16



In [134]:

    
for agg_func in [np.mean, np.median, np.min, np.max]:
    print(agg_func.__name__, df.groupby(['year', 'month'])['count'].agg(agg_func))









    



mean year  month
2011  1         54.645012
      2         73.641256
      3         86.849776
      4        111.026374
      5        174.809211
      6        196.877193
      7        203.614035
      8        182.666667
      9        174.622517
      10       174.773626
      11       155.458333
      12       134.173246
2012  1        124.353201
      2        145.646154
      3        208.276923
      4        257.455947
      5        264.109649
      6        287.186404
      7        267.037281
      8        285.570175
      9        292.598684
      10       280.508772
      11       231.980220
      12       217.054825
Name: count, dtype: float64
median year  month
2011  1         47.0
      2         59.0
      3         72.0
      4         84.0
      5        154.0
      6        172.0
      7        173.0
      8        159.0
      9        147.0
      10       140.0
      11       133.5
      12       110.5
2012  1         93.0
      2        118.0
      3        171.0
      4        233.5
      5        234.5
      6        254.5
      7        244.0
      8        260.0
      9        245.5
      10       239.5
      11       202.0
      12       192.0
Name: count, dtype: float64
amin year  month
2011  1        1
      2        1
      3        1
      4        1
      5        1
      6        1
      7        1
      8        1
      9        1
      10       1
      11       1
      12       1
2012  1        1
      2        1
      3        1
      4        1
      5        1
      6        4
      7        2
      8        3
      9        3
      10       2
      11       1
      12       1
Name: count, dtype: int16
amax year  month
2011  1        219
      2        327
      3        332
      4        452
      5        611
      6        638
      7        596
      8        600
      9        628
      10       625
      11       553
      12       477
2012  1        512
      2        539
      3        801
      4        822
      5        873
      6        869
      7        872
      8        897
      9        977
      10       948
      11       724
      12       759
Name: count, dtype: int16

Sort



In [136]:

    
df.sort_values(by=['year', 'month'], ascending=False).head()









    Out[136]:







  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
      hour
      year
      month
      weather_label
    
  
  
    
      10430
      2012-12-01 00:00:00
      4
      0
      0
      1
      10.65625
      15.148438
      81
      0.000000
      9
      99
      108
      0
      2012
      12
      Clear
    
    
      10431
      2012-12-01 01:00:00
      4
      0
      0
      1
      10.65625
      15.148438
      81
      0.000000
      5
      64
      69
      1
      2012
      12
      Clear
    
    
      10432
      2012-12-01 02:00:00
      4
      0
      0
      2
      10.65625
      15.148438
      81
      0.000000
      3
      47
      50
      2
      2012
      12
      Mist
    
    
      10433
      2012-12-01 03:00:00
      4
      0
      0
      2
      10.65625
      13.632812
      81
      9.000000
      1
      14
      15
      3
      2012
      12
      Mist
    
    
      10434
      2012-12-01 04:00:00
      4
      0
      0
      1
      10.65625
      14.398438
      81
      6.003906
      0
      5
      5
      4
      2012
      12
      Clear

Save



In [139]:

    
df.to_csv('df.csv', index=False)



In [140]:

    
!head df.csv









    



datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,year,month,weather_label
2011-01-01 00:00:00,1,0,0,1,9.84375,14.3984375,81,0.0,3,13,16,0,2011,1,Clear
2011-01-01 01:00:00,1,0,0,1,9.0234375,13.6328125,80,0.0,8,32,40,1,2011,1,Clear
2011-01-01 02:00:00,1,0,0,1,9.0234375,13.6328125,80,0.0,5,27,32,2,2011,1,Clear
2011-01-01 03:00:00,1,0,0,1,9.84375,14.3984375,75,0.0,3,10,13,3,2011,1,Clear
2011-01-01 04:00:00,1,0,0,1,9.84375,14.3984375,75,0.0,0,1,1,4,2011,1,Clear
2011-01-01 05:00:00,1,0,0,2,9.84375,12.8828125,75,6.00390625,0,1,1,5,2011,1,Mist
2011-01-01 06:00:00,1,0,0,1,9.0234375,13.6328125,80,0.0,2,0,2,6,2011,1,Clear
2011-01-01 07:00:00,1,0,0,1,8.203125,12.8828125,86,0.0,1,2,3,7,2011,1,Clear
2011-01-01 08:00:00,1,0,0,1,9.84375,14.3984375,75,0.0,1,7,8,8,2011,1,Clear



In [141]:

    
df.to_hdf('df.h5', 'df')



In [ ]:

	datetime	season	weather	temp	atemp	humidity	windspeed	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	0.0000	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	0.0000	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	0.0000	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	0.0000	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0.0000	0	1	1
5	2011-01-01 05:00:00	1	2	9.84	12.880	75	6.0032	0	1	1
6	2011-01-01 06:00:00	1	1	9.02	13.635	80	0.0000	2	0	2
7	2011-01-01 07:00:00	1	1	8.20	12.880	86	0.0000	1	2	3
8	2011-01-01 08:00:00	1	1	9.84	14.395	75	0.0000	1	7	8
9	2011-01-01 09:00:00	1	1	13.12	17.425	76	0.0000	8	6	14

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.843750	14.398438	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.023438	13.632812	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.023438	13.632812	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.843750	14.398438	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.843750	14.398438	75	0	1	1

	datetime	season	weather	temp	atemp	humidity	windspeed	casual	registered	count	hour	year	month	weather_label
10430	2012-12-01 00:00:00	4	1	10.65625	15.148438	81	0.000000	9	99	108	0	2012	12	Clear
10431	2012-12-01 01:00:00	4	1	10.65625	15.148438	81	0.000000	5	64	69	1	2012	12	Clear
10432	2012-12-01 02:00:00	4	2	10.65625	15.148438	81	0.000000	3	47	50	2	2012	12	Mist
10433	2012-12-01 03:00:00	4	2	10.65625	13.632812	81	9.000000	1	14	15	3	2012	12	Mist
10434	2012-12-01 04:00:00	4	1	10.65625	14.398438	81	6.003906	0	5	5	4	2012	12	Clear