notebook.community

Edit and run



In [1]:

    
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys

%matplotlib inline



In [2]:

    
# set seed
np.seed(111)

# Function to generate test data
def CreateDataSet(Number=1):
    
    Output = []
    
    for i in range(Number):
        
        # Create a weekly (mondays) date range
        rng = pd.date_range(start='1/1/2009', end='12/31/2012', freq='W-MON')
        
        # Create random data
        data = np.randint(low=25,high=1000,size=len(rng))
        
        # Status pool
        status = [1,2,3]
        
        # Make a random list of statuses
        random_status = [status[np.randint(low=0,high=len(status))] for i in range(len(rng))]
        
        # State pool
        states = ['GA','FL','fl','NY','NJ','TX']
        
        # Make a random list of states 
        random_states = [states[np.randint(low=0,high=len(states))] for i in range(len(rng))]
    
        Output.extend(zip(random_states, random_status, data, rng))
        
    return Output



In [3]:

    
dataset = CreateDataSet(4)
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 836 entries, 0 to 835
Data columns (total 4 columns):
State            836 non-null object
Status           836 non-null int64
CustomerCount    836 non-null int64
StatusDate       836 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 32.7+ KB



In [4]:

    
df.head()









    Out[4]:






  
    
      
      State
      Status
      CustomerCount
      StatusDate
    
  
  
    
      0
      GA
      1
      877
      2009-01-05
    
    
      1
      FL
      1
      901
      2009-01-12
    
    
      2
      fl
      3
      749
      2009-01-19
    
    
      3
      FL
      3
      111
      2009-01-26
    
    
      4
      GA
      1
      300
      2009-02-02



In [8]:

    
df.to_csv('Lesson3.csv', index=False)
print('done')









    



done



In [9]:

    
df['State'].unique()









    Out[9]:





array(['GA', 'FL', 'fl', 'TX', 'NY', 'NJ'], dtype=object)



In [10]:

    
# Clean State Column, convert to upper case
df['State'] = df.State.apply(lambda x: x.upper())



In [11]:

    
df['State'].unique()









    Out[11]:





array(['GA', 'FL', 'TX', 'NY', 'NJ'], dtype=object)



In [12]:

    
# Only grab where Status == 1
mask = df['Status'] == 1
df = df[mask]



In [13]:

    
# Convert NJ to NY
mask = df.State == 'NJ'
df['State'][mask] = 'NY'









    



/home/nickedes/nickedesEnv/local/lib/python3.4/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/nickedes/nickedesEnv/local/lib/python3.4/site-packages/pandas/core/generic.py:3636: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/home/nickedes/nickedesEnv/local/lib/python3.4/site-packages/IPython/core/interactiveshell.py:3066: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)



In [15]:

    
df['State'].unique()









    Out[15]:





array(['GA', 'FL', 'NY', 'TX'], dtype=object)



In [16]:

    
df['CustomerCount'].plot(figsize=(15,5));



In [17]:

    
sortdf = df[df['State']=='NY'].sort(axis=0)
sortdf.head(10)









    Out[17]:






  
    
      
      State
      Status
      CustomerCount
      StatusDate
    
  
  
    
      9
      NY
      1
      992
      2009-03-09
    
    
      16
      NY
      1
      753
      2009-04-27
    
    
      29
      NY
      1
      990
      2009-07-27
    
    
      36
      NY
      1
      772
      2009-09-14
    
    
      38
      NY
      1
      349
      2009-09-28
    
    
      40
      NY
      1
      694
      2009-10-12
    
    
      55
      NY
      1
      640
      2010-01-25
    
    
      66
      NY
      1
      375
      2010-04-12
    
    
      72
      NY
      1
      538
      2010-05-24
    
    
      81
      NY
      1
      314
      2010-07-26



In [18]:

    
# Group by State and StatusDate
Daily = df.reset_index().groupby(['State','StatusDate']).sum()
Daily.head()









    Out[18]:






  
    
      
      
      index
      Status
      CustomerCount
    
    
      State
      StatusDate
      
      
      
    
  
  
    
      FL
      2009-01-12
      1
      1
      901
    
    
      2009-02-02
      422
      1
      653
    
    
      2009-03-23
      429
      1
      752
    
    
      2009-04-06
      653
      2
      1086
    
    
      2009-06-08
      440
      1
      649



In [19]:

    
del Daily['Status']
Daily.head()









    Out[19]:






  
    
      
      
      index
      CustomerCount
    
    
      State
      StatusDate
      
      
    
  
  
    
      FL
      2009-01-12
      1
      901
    
    
      2009-02-02
      422
      653
    
    
      2009-03-23
      429
      752
    
    
      2009-04-06
      653
      1086
    
    
      2009-06-08
      440
      649



In [20]:

    
# Select the State index
Daily.index.levels[0]









    Out[20]:





Index(['FL', 'GA', 'NY', 'TX'], dtype='object', name='State')



In [21]:

    
# Select the StatusDate index
Daily.index.levels[1]









    Out[21]:





DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-02-02',
               '2009-02-23', '2009-03-09', '2009-03-16', '2009-03-23',
               '2009-03-30', '2009-04-06', 
               ...
               '2012-09-24', '2012-10-01', '2012-10-08', '2012-10-22',
               '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
               '2012-11-26', '2012-12-10'],
              dtype='datetime64[ns]', name='StatusDate', length=161, freq=None, tz=None)



In [22]:

    
Daily.loc['FL'].plot()
Daily.loc['GA'].plot()
Daily.loc['NY'].plot()
Daily.loc['TX'].plot();



In [23]:

    
Daily.loc['FL']['2012':].plot()
Daily.loc['GA']['2012':].plot()
Daily.loc['NY']['2012':].plot()
Daily.loc['TX']['2012':].plot();



In [24]:

    
# Calculate Outliers
StateYearMonth = Daily.groupby([Daily.index.get_level_values(0), Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month])
Daily['Lower'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Upper'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (Daily['CustomerCount'] > Daily['Upper']) 

# Remove Outliers
Daily = Daily[Daily['Outlier'] == False]



In [25]:

    
Daily.head()









    Out[25]:






  
    
      
      
      index
      CustomerCount
      Lower
      Upper
      Outlier
    
    
      State
      StatusDate
      
      
      
      
      
    
  
  
    
      FL
      2009-01-12
      1
      901
      450.5
      1351.5
      False
    
    
      2009-02-02
      422
      653
      326.5
      979.5
      False
    
    
      2009-03-23
      429
      752
      376.0
      1128.0
      False
    
    
      2009-04-06
      653
      1086
      543.0
      1629.0
      False
    
    
      2009-06-08
      440
      649
      324.5
      973.5
      False



In [26]:

    
# Combine all markets

# Get the max customer count by Date
ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount'] # rename column

# Group by Year and Month
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])

# What is the max customer count per Year and Month
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()









    Out[26]:






  
    
      
      CustomerCount
      Max
    
    
      StatusDate
      
      
    
  
  
    
      2009-01-05
      877
      901
    
    
      2009-01-12
      901
      901
    
    
      2009-01-19
      522
      901
    
    
      2009-02-02
      953
      953
    
    
      2009-02-23
      710
      953



In [27]:

    
# Create the BHAG dataframe
data = [1000,2000,3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG



In [28]:

    
# Combine the BHAG and the ALL data set 
combined = pd.concat([ALL,BHAG], axis=0)
combined = combined.sort(axis=0)
combined.tail()









    Out[28]:






  
    
      
      BHAG
      CustomerCount
      Max
    
  
  
    
      2012-11-19
      NaN
      136
      1115
    
    
      2012-11-26
      NaN
      1115
      1115
    
    
      2012-12-10
      NaN
      1269
      1269
    
    
      2012-12-31
      2000
      NaN
      NaN
    
    
      2013-12-31
      3000
      NaN
      NaN



In [29]:

    
fig, axes = plt.subplots(figsize=(12, 7))

combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best');



In [30]:

    
# Group by Year and then get the max value per year
Year = combined.groupby(lambda x: x.year).max()
Year









    Out[30]:






  
    
      
      BHAG
      CustomerCount
      Max
    
  
  
    
      2009
      NaN
      2452
      2452
    
    
      2010
      NaN
      2065
      2065
    
    
      2011
      1000
      2711
      2711
    
    
      2012
      2000
      2061
      2061
    
    
      2013
      3000
      NaN
      NaN



In [31]:

    
# Add a column representing the percent change per year
Year['YR_PCT_Change'] = Year['Max'].pct_change(periods=1)
Year









    Out[31]:






  
    
      
      BHAG
      CustomerCount
      Max
      YR_PCT_Change
    
  
  
    
      2009
      NaN
      2452
      2452
      NaN
    
    
      2010
      NaN
      2065
      2065
      -0.157830
    
    
      2011
      1000
      2711
      2711
      0.312833
    
    
      2012
      2000
      2061
      2061
      -0.239764
    
    
      2013
      3000
      NaN
      NaN
      NaN



In [32]:

    
# Present data
# First Graph
ALL['Max'].plot(figsize=(10, 5));plt.title('ALL Markets')

# Last four Graphs
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0) ## Create space between plots

Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1]) 
Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0]) 
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1]) 

# Add titles
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East');



In [ ]:

	State	Status	CustomerCount	StatusDate
0	GA	1	877	2009-01-05
1	FL	1	901	2009-01-12
2	fl	3	749	2009-01-19
3	FL	3	111	2009-01-26
4	GA	1	300	2009-02-02

	State	Status	CustomerCount	StatusDate
9	NY	1	992	2009-03-09
16	NY	1	753	2009-04-27
29	NY	1	990	2009-07-27
36	NY	1	772	2009-09-14
38	NY	1	349	2009-09-28
40	NY	1	694	2009-10-12
55	NY	1	640	2010-01-25
66	NY	1	375	2010-04-12
72	NY	1	538	2010-05-24
81	NY	1	314	2010-07-26

		index	Status	CustomerCount
State	StatusDate
FL	2009-01-12	1	1	901
	2009-02-02	422	1	653
	2009-03-23	429	1	752
	2009-04-06	653	2	1086
	2009-06-08	440	1	649

	BHAG	CustomerCount	Max
2012-11-19	NaN	136	1115
2012-11-26	NaN	1115	1115
2012-12-10	NaN	1269	1269
2012-12-31	2000	NaN	NaN
2013-12-31	3000	NaN	NaN

	BHAG	CustomerCount	Max
2009	NaN	2452	2452
2010	NaN	2065	2065
2011	1000	2711	2711
2012	2000	2061	2061
2013	3000	NaN	NaN