In [ ]:
'''
LESSION 3
 - Creating functions 
 - Reading from EXCEL 
 - Exporting to EXCEL 
 - Outliers 
 - Lambda functions 
 - Slice and dice data
'''

In [4]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys
import matplotlib

%matplotlib inline

In [6]:
print('Python version ' + sys.version)
print('Pandas version: ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)


Python version 2.7.13 |Anaconda 4.3.0 (64-bit)| (default, Dec 19 2016, 13:29:36) [MSC v.1500 64 bit (AMD64)]
Pandas version: 0.19.2
Matplotlib version 2.0.0

In [8]:
# set seed
np.seed(111)

# Function to generate test data
def CreateDataSet(Number=1):
    
    Output = []
    
    for i in range(Number):
        
        # Create a weekly (mondays) date range
        rng = pd.date_range(start='1/1/2009', end='12/31/2012', freq='W-MON')
        
        # Create random data
        data = np.randint(low=25,high=1000,size=len(rng))
        
        # Status pool
        status = [1,2,3]
        
        # Make a random list of statuses
        random_status = [status[np.randint(low=0,high=len(status))] for i in range(len(rng))]
        
        # State pool
        states = ['GA','FL','fl','NY','NJ','TX']
        
        # Make a random list of states 
        random_states = [states[np.randint(low=0,high=len(states))] for i in range(len(rng))]
    
        Output.extend(zip(random_states, random_status, data, rng))
        
    return Output

In [19]:
dataset = CreateDataSet(4)
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info


Out[19]:
<bound method DataFrame.info of     State  Status  CustomerCount StatusDate
0      FL       2            454 2009-01-05
1      FL       1            319 2009-01-12
2      TX       2             78 2009-01-19
3      NJ       3            691 2009-01-26
4      NJ       3            634 2009-02-02
5      TX       3            932 2009-02-09
6      fl       3            376 2009-02-16
7      NJ       2             92 2009-02-23
8      NY       2            672 2009-03-02
9      NJ       3            153 2009-03-09
10     TX       3            167 2009-03-16
11     GA       2            938 2009-03-23
12     GA       1             63 2009-03-30
13     TX       2            615 2009-04-06
14     NJ       3            813 2009-04-13
15     fl       1            788 2009-04-20
16     GA       2            601 2009-04-27
17     fl       3            371 2009-05-04
18     NJ       1            199 2009-05-11
19     fl       2            405 2009-05-18
20     NJ       3            397 2009-05-25
21     FL       3            136 2009-06-01
22     NY       3            813 2009-06-08
23     GA       3            406 2009-06-15
24     fl       1            997 2009-06-22
25     TX       2            399 2009-06-29
26     FL       1            689 2009-07-06
27     NY       3            585 2009-07-13
28     NY       3            772 2009-07-20
29     GA       1            582 2009-07-27
..    ...     ...            ...        ...
806    NJ       2             80 2012-06-11
807    GA       1            649 2012-06-18
808    NJ       3            949 2012-06-25
809    GA       2            754 2012-07-02
810    GA       2            160 2012-07-09
811    GA       2            990 2012-07-16
812    FL       2            385 2012-07-23
813    NY       1            449 2012-07-30
814    NJ       2            551 2012-08-06
815    fl       1            217 2012-08-13
816    fl       1             50 2012-08-20
817    FL       2            417 2012-08-27
818    TX       1            790 2012-09-03
819    NJ       1            647 2012-09-10
820    fl       3             50 2012-09-17
821    fl       1            125 2012-09-24
822    NY       3            765 2012-10-01
823    FL       3            471 2012-10-08
824    NY       2            841 2012-10-15
825    NY       3            486 2012-10-22
826    NJ       2            140 2012-10-29
827    fl       2            362 2012-11-05
828    TX       3            102 2012-11-12
829    TX       2            276 2012-11-19
830    GA       3            367 2012-11-26
831    FL       2            942 2012-12-03
832    fl       1            760 2012-12-10
833    NJ       2             90 2012-12-17
834    FL       2            661 2012-12-24
835    FL       2            291 2012-12-31

[836 rows x 4 columns]>

In [10]:
df.head()


Out[10]:
State Status CustomerCount StatusDate
0 GA 1 877 2009-01-05
1 FL 1 901 2009-01-12
2 fl 3 749 2009-01-19
3 FL 3 111 2009-01-26
4 GA 1 300 2009-02-02

In [20]:
# Save results to excel
df.to_excel('Lesson3.xlsx', index=False)
print('Done')


Done

In [ ]:
'''
Grab Data from Excel
'''

In [12]:
pd.read_excel?

In [21]:
# Location of file
Location = r'C:\Users\cr\Documents\UCM 4\MD\teamMin\tutorial_pandas\Lesson3.xlsx'

# Parse a specific sheet
df = pd.read_excel(Location, 0, index_col='StatusDate')
df.dtypes


Out[21]:
State            object
Status            int64
CustomerCount     int64
dtype: object

In [22]:
df.index


Out[22]:
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26',
               '2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23',
               '2009-03-02', '2009-03-09',
               ...
               '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
               '2012-11-26', '2012-12-03', '2012-12-10', '2012-12-17',
               '2012-12-24', '2012-12-31'],
              dtype='datetime64[ns]', name=u'StatusDate', length=836, freq=None)

In [23]:
df.head()


Out[23]:
State Status CustomerCount
StatusDate
2009-01-05 FL 2 454
2009-01-12 FL 1 319
2009-01-19 TX 2 78
2009-01-26 NJ 3 691
2009-02-02 NJ 3 634

In [ ]:
'''
Prepare Data
This section attempts to clean up the data for analysis.

   1 Make sure the state column is all in upper case
   2 Only select records where the account status is equal to "1"
   3 Merge (NJ and NY) to NY in the state column
   4 Remove any outliers (any odd results in the data set)
Lets take a quick look on how some of the State values are upper case and some are lower case
'''

In [27]:
df['State'].unique()


Out[27]:
array([u'FL', u'TX', u'NJ', u'NY', u'GA'], dtype=object)

In [25]:
# Clean State Column, convert to upper case
df['State'] = df.State.apply(lambda x: x.upper())

In [28]:
df['State'].unique()


Out[28]:
array([u'FL', u'TX', u'NJ', u'NY', u'GA'], dtype=object)

In [32]:
df['CustomerCount'].plot(figsize=(15,5));



In [34]:
sortdf = df[df['State']=='NY'].sort_index(axis=0)
sortdf.head(10)


Out[34]:
State Status CustomerCount
StatusDate
2009-01-05 NY 1 681
2009-01-19 NY 3 591
2009-01-19 NY 3 110
2009-02-02 NY 3 900
2009-03-02 NY 2 672
2009-03-09 NY 2 352
2009-03-09 NY 2 236
2009-03-16 NY 1 140
2009-03-23 NY 2 538
2009-04-06 NY 3 359

In [35]:
# Group by State and StatusDate
Daily = df.reset_index().groupby(['State','StatusDate']).sum()
Daily.head()


Out[35]:
Status CustomerCount
State StatusDate
FL 2009-01-05 5 1253
2009-01-12 7 1270
2009-01-26 1 127
2009-02-02 1 261
2009-02-16 6 555

In [36]:
del Daily['Status']
Daily.head()


Out[36]:
CustomerCount
State StatusDate
FL 2009-01-05 1253
2009-01-12 1270
2009-01-26 127
2009-02-02 261
2009-02-16 555

In [37]:
# What is the index of the dataframe
Daily.index


Out[37]:
MultiIndex(levels=[[u'FL', u'GA', u'NJ', u'NY', u'TX'], [2009-01-05 00:00:00, 2009-01-12 00:00:00, 2009-01-19 00:00:00, 2009-01-26 00:00:00, 2009-02-02 00:00:00, 2009-02-09 00:00:00, 2009-02-16 00:00:00, 2009-02-23 00:00:00, 2009-03-02 00:00:00, 2009-03-09 00:00:00, 2009-03-16 00:00:00, 2009-03-23 00:00:00, 2009-03-30 00:00:00, 2009-04-06 00:00:00, 2009-04-13 00:00:00, 2009-04-20 00:00:00, 2009-04-27 00:00:00, 2009-05-04 00:00:00, 2009-05-11 00:00:00, 2009-05-18 00:00:00, 2009-05-25 00:00:00, 2009-06-01 00:00:00, 2009-06-08 00:00:00, 2009-06-15 00:00:00, 2009-06-22 00:00:00, 2009-06-29 00:00:00, 2009-07-06 00:00:00, 2009-07-13 00:00:00, 2009-07-20 00:00:00, 2009-07-27 00:00:00, 2009-08-03 00:00:00, 2009-08-10 00:00:00, 2009-08-17 00:00:00, 2009-08-24 00:00:00, 2009-08-31 00:00:00, 2009-09-07 00:00:00, 2009-09-14 00:00:00, 2009-09-21 00:00:00, 2009-09-28 00:00:00, 2009-10-05 00:00:00, 2009-10-12 00:00:00, 2009-10-19 00:00:00, 2009-10-26 00:00:00, 2009-11-02 00:00:00, 2009-11-09 00:00:00, 2009-11-16 00:00:00, 2009-11-23 00:00:00, 2009-11-30 00:00:00, 2009-12-07 00:00:00, 2009-12-14 00:00:00, 2009-12-21 00:00:00, 2009-12-28 00:00:00, 2010-01-04 00:00:00, 2010-01-11 00:00:00, 2010-01-18 00:00:00, 2010-01-25 00:00:00, 2010-02-01 00:00:00, 2010-02-08 00:00:00, 2010-02-15 00:00:00, 2010-02-22 00:00:00, 2010-03-01 00:00:00, 2010-03-08 00:00:00, 2010-03-15 00:00:00, 2010-03-22 00:00:00, 2010-03-29 00:00:00, 2010-04-05 00:00:00, 2010-04-12 00:00:00, 2010-04-19 00:00:00, 2010-04-26 00:00:00, 2010-05-03 00:00:00, 2010-05-10 00:00:00, 2010-05-17 00:00:00, 2010-05-24 00:00:00, 2010-05-31 00:00:00, 2010-06-07 00:00:00, 2010-06-14 00:00:00, 2010-06-21 00:00:00, 2010-06-28 00:00:00, 2010-07-05 00:00:00, 2010-07-12 00:00:00, 2010-07-19 00:00:00, 2010-07-26 00:00:00, 2010-08-02 00:00:00, 2010-08-09 00:00:00, 2010-08-16 00:00:00, 2010-08-23 00:00:00, 2010-08-30 00:00:00, 2010-09-06 00:00:00, 2010-09-13 00:00:00, 2010-09-20 00:00:00, 2010-09-27 00:00:00, 2010-10-04 00:00:00, 2010-10-11 00:00:00, 2010-10-18 00:00:00, 2010-10-25 00:00:00, 2010-11-01 00:00:00, 2010-11-08 00:00:00, 2010-11-15 00:00:00, 2010-11-22 00:00:00, 2010-11-29 00:00:00, 2010-12-06 00:00:00, 2010-12-13 00:00:00, 2010-12-20 00:00:00, 2010-12-27 00:00:00, 2011-01-03 00:00:00, 2011-01-10 00:00:00, 2011-01-17 00:00:00, 2011-01-24 00:00:00, 2011-01-31 00:00:00, 2011-02-07 00:00:00, 2011-02-14 00:00:00, 2011-02-21 00:00:00, 2011-02-28 00:00:00, 2011-03-07 00:00:00, 2011-03-14 00:00:00, 2011-03-21 00:00:00, 2011-03-28 00:00:00, 2011-04-04 00:00:00, 2011-04-11 00:00:00, 2011-04-18 00:00:00, 2011-04-25 00:00:00, 2011-05-02 00:00:00, 2011-05-09 00:00:00, 2011-05-16 00:00:00, 2011-05-23 00:00:00, 2011-05-30 00:00:00, 2011-06-06 00:00:00, 2011-06-13 00:00:00, 2011-06-20 00:00:00, 2011-06-27 00:00:00, 2011-07-04 00:00:00, 2011-07-11 00:00:00, 2011-07-18 00:00:00, 2011-07-25 00:00:00, 2011-08-01 00:00:00, 2011-08-08 00:00:00, 2011-08-15 00:00:00, 2011-08-22 00:00:00, 2011-08-29 00:00:00, 2011-09-05 00:00:00, 2011-09-12 00:00:00, 2011-09-19 00:00:00, 2011-09-26 00:00:00, 2011-10-03 00:00:00, 2011-10-10 00:00:00, 2011-10-17 00:00:00, 2011-10-24 00:00:00, 2011-10-31 00:00:00, 2011-11-07 00:00:00, 2011-11-14 00:00:00, 2011-11-21 00:00:00, 2011-11-28 00:00:00, 2011-12-05 00:00:00, 2011-12-12 00:00:00, 2011-12-19 00:00:00, 2011-12-26 00:00:00, 2012-01-02 00:00:00, 2012-01-09 00:00:00, 2012-01-16 00:00:00, 2012-01-23 00:00:00, 2012-01-30 00:00:00, 2012-02-06 00:00:00, 2012-02-13 00:00:00, 2012-02-20 00:00:00, 2012-02-27 00:00:00, 2012-03-05 00:00:00, 2012-03-12 00:00:00, 2012-03-19 00:00:00, 2012-03-26 00:00:00, 2012-04-02 00:00:00, 2012-04-09 00:00:00, 2012-04-16 00:00:00, 2012-04-23 00:00:00, 2012-04-30 00:00:00, 2012-05-07 00:00:00, 2012-05-14 00:00:00, 2012-05-21 00:00:00, 2012-05-28 00:00:00, 2012-06-04 00:00:00, 2012-06-11 00:00:00, 2012-06-18 00:00:00, 2012-06-25 00:00:00, 2012-07-02 00:00:00, 2012-07-09 00:00:00, 2012-07-16 00:00:00, 2012-07-23 00:00:00, 2012-07-30 00:00:00, 2012-08-06 00:00:00, 2012-08-13 00:00:00, 2012-08-20 00:00:00, 2012-08-27 00:00:00, 2012-09-03 00:00:00, 2012-09-10 00:00:00, 2012-09-17 00:00:00, 2012-09-24 00:00:00, 2012-10-01 00:00:00, 2012-10-08 00:00:00, 2012-10-15 00:00:00, 2012-10-22 00:00:00, 2012-10-29 00:00:00, 2012-11-05 00:00:00, 2012-11-12 00:00:00, 2012-11-19 00:00:00, 2012-11-26 00:00:00, 2012-12-03 00:00:00, 2012-12-10 00:00:00, 2012-12-17 00:00:00, 2012-12-24 00:00:00, 2012-12-31 00:00:00]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], [0, 1, 3, 4, 6, 10, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23, 24, 26, 27, 28, 30, 33, 34, 36, 39, 42, 44, 46, 47, 48, 50, 51, 52, 54, 55, 57, 58, 59, 60, 62, 63, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 83, 84, 86, 87, 88, 89, 93, 94, 95, 96, 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, 108, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 125, 127, 128, 129, 130, 132, 133, 134, 135, 136, 137, 138, 140, 142, 144, 147, 149, 150, 151, 152, 154, 156, 159, 162, 163, 164, 165, 166, 168, 169, 170, 171, 173, 174, 175, 177, 178, 182, 183, 184, 185, 186, 187, 188, 189, 190, 193, 194, 195, 196, 198, 199, 200, 202, 203, 204, 205, 206, 207, 208, 2, 3, 5, 8, 9, 10, 11, 12, 13, 16, 17, 18, 22, 23, 29, 31, 32, 37, 39, 40, 43, 46, 48, 49, 50, 51, 52, 56, 60, 61, 62, 64, 66, 69, 72, 74, 75, 76, 77, 78, 80, 81, 82, 85, 86, 87, 89, 92, 93, 95, 96, 99, 100, 101, 102, 103, 104, 106, 109, 110, 115, 116, 118, 122, 126, 129, 130, 132, 134, 135, 137, 139, 141, 143, 144, 147, 148, 150, 153, 154, 155, 156, 157, 158, 160, 163, 165, 166, 167, 168, 171, 173, 176, 177, 179, 180, 182, 183, 184, 185, 186, 189, 190, 194, 195, 198, 199, 201, 203, 204, 205, 208, 3, 4, 5, 6, 7, 8, 9, 11, 14, 15, 16, 18, 20, 25, 26, 28, 29, 32, 34, 35, 36, 38, 39, 40, 41, 43, 44, 45, 47, 48, 50, 53, 55, 56, 57, 61, 62, 64, 65, 66, 70, 71, 75, 77, 80, 85, 87, 88, 90, 91, 92, 95, 101, 102, 104, 105, 107, 111, 115, 117, 118, 122, 123, 124, 125, 128, 129, 130, 133, 134, 139, 140, 143, 145, 148, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 161, 171, 174, 179, 180, 181, 182, 183, 186, 187, 188, 189, 190, 192, 198, 199, 200, 201, 204, 206, 0, 2, 4, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19, 21, 22, 24, 25, 27, 28, 31, 33, 34, 35, 37, 40, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 54, 58, 63, 64, 67, 68, 72, 74, 76, 78, 79, 81, 82, 83, 84, 86, 88, 90, 91, 93, 97, 98, 100, 103, 108, 109, 110, 112, 118, 120, 123, 124, 126, 127, 128, 129, 130, 131, 132, 133, 136, 138, 139, 140, 141, 144, 145, 146, 149, 151, 153, 156, 157, 158, 159, 160, 161, 162, 163, 166, 167, 168, 169, 172, 173, 174, 175, 176, 177, 178, 179, 181, 186, 187, 190, 191, 192, 193, 195, 197, 198, 200, 202, 206, 208, 0, 1, 2, 3, 5, 6, 8, 10, 12, 13, 19, 21, 22, 25, 27, 29, 30, 33, 35, 37, 38, 39, 41, 45, 47, 49, 56, 58, 60, 62, 63, 64, 66, 69, 70, 72, 73, 74, 76, 78, 79, 80, 82, 83, 84, 86, 90, 91, 93, 94, 96, 98, 99, 100, 102, 103, 107, 108, 109, 112, 113, 114, 117, 119, 120, 123, 124, 125, 126, 127, 131, 133, 134, 136, 137, 138, 142, 145, 147, 149, 154, 155, 157, 158, 160, 161, 162, 165, 166, 168, 170, 174, 175, 176, 178, 180, 182, 183, 184, 188, 191, 192, 193, 196, 197, 201, 202, 204, 205]],
           names=[u'State', u'StatusDate'])

In [39]:
# Select the State index
Daily.index.levels[0]


Out[39]:
Index([u'FL', u'GA', u'NJ', u'NY', u'TX'], dtype='object', name=u'State')

In [42]:
# Select the StatusDate index
Daily.index.levels[1]


Out[42]:
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26',
               '2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23',
               '2009-03-02', '2009-03-09',
               ...
               '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
               '2012-11-26', '2012-12-03', '2012-12-10', '2012-12-17',
               '2012-12-24', '2012-12-31'],
              dtype='datetime64[ns]', name=u'StatusDate', length=209, freq=None)

In [44]:
Daily.loc['FL'].plot()
Daily.loc['GA'].plot()
Daily.loc['NY'].plot()
Daily.loc['TX'].plot();



In [45]:
Daily.loc['FL']['2012':].plot()
Daily.loc['GA']['2012':].plot()
Daily.loc['NY']['2012':].plot()
Daily.loc['TX']['2012':].plot();



In [46]:
# Calculate Outliers
StateYearMonth = Daily.groupby([Daily.index.get_level_values(0), Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month])
Daily['Lower'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Upper'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (Daily['CustomerCount'] > Daily['Upper']) 

# Remove Outliers
Daily = Daily[Daily['Outlier'] == False]

In [47]:
Daily.head()


Out[47]:
CustomerCount Lower Upper Outlier
State StatusDate
FL 2009-01-05 1253 -512.25 2463.75 False
2009-01-12 1270 -512.25 2463.75 False
2009-01-26 127 -512.25 2463.75 False
2009-02-02 261 -53.25 869.25 False
2009-02-16 555 -53.25 869.25 False

In [48]:
# Combine all markets

# Get the max customer count by Date
ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount'] # rename column

# Group by Year and Month
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])

# What is the max customer count per Year and Month
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()


Out[48]:
CustomerCount Max
StatusDate
2009-01-05 2532 2532
2009-01-12 1417 2532
2009-01-19 960 2532
2009-01-26 1478 2532
2009-02-02 2119 2119

In [49]:
pd.date_range?

In [50]:
# Create the BHAG dataframe
data = [1000,2000,3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG


Out[50]:
BHAG
2011-12-31 1000
2012-12-31 2000
2013-12-31 3000

In [51]:
# Combine the BHAG and the ALL data set 
combined = pd.concat([ALL,BHAG], axis=0)
combined = combined.sort_index(axis=0)
combined.tail()


Out[51]:
BHAG CustomerCount Max
2012-12-10 NaN 2761.0 2761.0
2012-12-17 NaN 1932.0 2761.0
2012-12-31 NaN 2024.0 2761.0
2012-12-31 2000.0 NaN NaN
2013-12-31 3000.0 NaN NaN

In [52]:
fig, axes = plt.subplots(figsize=(12, 7))

combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best');



In [53]:
# Group by Year and then get the max value per year
Year = combined.groupby(lambda x: x.year).max()
Year


Out[53]:
BHAG CustomerCount Max
2009 NaN 2956.0 2956.0
2010 NaN 3257.0 3257.0
2011 1000.0 3221.0 3221.0
2012 2000.0 3649.0 3649.0
2013 3000.0 NaN NaN

In [54]:
# Add a column representing the percent change per year
Year['YR_PCT_Change'] = Year['Max'].pct_change(periods=1)
Year


Out[54]:
BHAG CustomerCount Max YR_PCT_Change
2009 NaN 2956.0 2956.0 NaN
2010 NaN 3257.0 3257.0 0.101827
2011 1000.0 3221.0 3221.0 -0.011053
2012 2000.0 3649.0 3649.0 0.132878
2013 3000.0 NaN NaN NaN

In [55]:
'''
Present Data
'''
# First Graph
ALL['Max'].plot(figsize=(10, 5));plt.title('ALL Markets')

# Last four Graphs
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0) ## Create space between plots

Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1]) 
Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0]) 
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1]) 

# Add titles
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East');