In [63]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

More females than males visit Necter


In [64]:
import pandas as pd

dfs = pd.read_excel('/home/saket/BISC104-Session01-Thursday-A.xlsx', sheet_name=[1, 2, 3, 4, 5, 6,
                                                                                 7, 8, 9, 10, 11, 12])

sheet_number_to_name = {1: 'Mingyu Park',
                        2: 'William Garrick',
                        3: 'Mariah & Ahmed',
                        4: 'Reem AlFaisal',
                        5: 'Sophia and Catherine',
                        6: 'Jesus Juarez',
                        7: 'Reagor Wagnon',
                        8: 'Yuyuan Chen',
                        9: 'Gus and Vedika',
                       10: 'Zak Cook',
                       11: 'Sam Gholami',
                       12: 'Hannah Sobelman and Erin Philen'}

master_dfs = []
for key in dfs.keys():
    df = dfs[key]
    df.columns = list(map(lambda x: x.replace(' ',''), df.columns))
    df['sheet_name'] = sheet_number_to_name[key]
    try:
        df['Time'] = df['Time'].str.replace(' ','') 
    except:
        print(df.columns)
    df['start'] , df['end'] = df['Time'].str.split('-', 1).str
    master_dfs.append(df)

In [65]:
master_df = pd.concat(master_dfs)
master_df.Day = master_df.Day.str.replace(' ', '')
master_df = master_df.drop(columns=['Time'])

In [66]:
master_df = master_df.sort_values(by=['start', 'Day'])

In [67]:
print(master_df.reset_index(drop=True).to_latex(index=False))


\begin{tabular}{lrrlll}
\toprule
       Day &  \#Skateboarders &  \#Bikeriders &                       sheet\_name &  start &    end \\
\midrule
 Wednesday &              17 &           13 &                     Jesus Juarez &  10:00 &  10:05 \\
   Tuesday &              29 &           23 &             Sophia and Catherine &  10:35 &  10:50 \\
    Friday &              37 &           43 &                         Zak Cook &  10:50 &  11:00 \\
 Wednesday &              15 &           14 &                  William Garrick &  11:00 &  11:05 \\
 Wednesday &              21 &           42 &                    Reem AlFaisal &  11:00 &  11:10 \\
 Wednesday &               9 &           37 &                      Sam Gholami &  11:02 &  11:07 \\
    Friday &              32 &           47 &             Sophia and Catherine &  11:15 &  11:25 \\
   Tuesday &              23 &           79 &                      Sam Gholami &  11:30 &  11:40 \\
   Tuesday &              34 &           52 &                         Zak Cook &  11:50 &  12:05 \\
   Tuesday &              52 &          107 &  Hannah Sobelman and Erin Philen &  12:00 &  12:15 \\
    Friday &              17 &           16 &                   Gus and Vedika &  13:00 &  13:05 \\
   Tuesday &              12 &           14 &                     Jesus Juarez &  13:00 &  13:05 \\
   Tuesday &              12 &           14 &                   Gus and Vedika &  13:00 &  13:05 \\
 Wednesday &              13 &           13 &                   Gus and Vedika &  13:00 &  13:05 \\
 Wednesday &              44 &          147 &  Hannah Sobelman and Erin Philen &  13:00 &  13:15 \\
   Tuesday &              21 &           83 &                      Sam Gholami &  13:09 &  13:19 \\
    Sunday &              13 &           14 &                      Yuyuan Chen &  13:26 &  13:33 \\
    Friday &              30 &           40 &                      Yuyuan Chen &  13:59 &  14:06 \\
    Monday &               4 &            3 &                    Reagor Wagnon &  14:00 &  14:05 \\
    Sunday &              22 &           13 &                     Jesus Juarez &  14:00 &  14:05 \\
   Tuesday &              20 &           18 &                  William Garrick &  14:00 &  14:10 \\
   Tuesday &               9 &           14 &                    Reagor Wagnon &  14:00 &  14:05 \\
 Wednesday &              17 &           19 &                    Reagor Wagnon &  14:00 &  14:05 \\
 Wednesday &              12 &           21 &                      Mingyu Park &  14:10 &  14:20 \\
 Wednesday &              10 &           28 &             Sophia and Catherine &  14:20 &  14:30 \\
   Tuesday &              44 &           39 &                   Mariah \& Ahmed &  15:00 &  15:25 \\
  Thursday &              91 &          133 &  Hannah Sobelman and Erin Philen &  15:15 &  15:45 \\
 Wednesday &              30 &           88 &                      Mingyu Park &  15:15 &  15:25 \\
    Friday &             121 &          135 &                    Reem AlFaisal &  15:32 &  16:00 \\
    Monday &              39 &           47 &                   Mariah \& Ahmed &  15:45 &  17:30 \\
 Wednesday &              19 &           41 &                      Mingyu Park &  16:02 &  16:07 \\
    Monday &              35 &           28 &                  William Garrick &  17:15 &  17:30 \\
 Wednesday &              57 &           55 &                   Mariah \& Ahmed &  17:15 &  18:05 \\
   Tuesday &             120 &          181 &                    Reem AlFaisal &  17:23 &  18:00 \\
  Thursday &              21 &           27 &                      Yuyuan Chen &  19:39 &  19:46 \\
\bottomrule
\end{tabular}


In [68]:
master_df = master_df.rename(columns={'#Skateboarders': 'Skateboarders', '#Bikeriders': 'Bikeriders'})

In [69]:
master_df.head()


Out[69]:
Day Skateboarders Bikeriders sheet_name start end
2 Wednesday 17 13 Jesus Juarez 10:00 10:05
1 Tuesday 29 23 Sophia and Catherine 10:35 10:50
0 Friday 37 43 Zak Cook 10:50 11:00
2 Wednesday 15 14 William Garrick 11:00 11:05
2 Wednesday 21 42 Reem AlFaisal 11:00 11:10

In [70]:
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('talk', font_scale=2)

In [71]:
df_molten = pd.melt(master_df, id_vars=['Day', 'sheet_name', 'start', 'end'], 
                    value_vars=['Skateboarders', 'Bikeriders'])

In [72]:
pd.to_datetime('06:00')


Out[72]:
Timestamp('2018-09-06 06:00:00')

In [73]:
df_molten['start_time'] = pd.to_datetime(df_molten['start'])
df_molten['end_time'] = pd.to_datetime(df_molten['end'])

In [74]:
df_molten['delta'] = df_molten['end_time'] - df_molten['start_time']

In [75]:
fig, ax = plt.subplots(figsize=(12,12))
sns.barplot(data=df_molten, hue='variable', x='sheet_name', y='value', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

fig.tight_layout()
fig.savefig('thursday_a_sheet_wise_plots.pdf')



In [76]:
fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(data=df_molten, hue='variable', x='Day', y='value', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

fig.tight_layout()
fig.savefig('thursday_a_day_wise_plots.pdf')



In [ ]:


In [77]:
df_molten['delta_min'] = df_molten['delta'].dt.components.minutes
df_molten['normalized_value'] = df_molten['value']/df_molten['delta_min']

In [78]:
fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(data=df_molten, hue='variable', x='Day', y='normalized_value', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

fig.tight_layout()
fig.savefig('thursday_a_day_wise_plots_normalized.pdf')



In [33]:
fig, ax = plt.subplots(figsize=(12,12))
sns.barplot(data=df_molten, hue='variable', x='sheet_name', y='normalized_value', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

fig.tight_layout()
fig.savefig('thursday_a_sheet_wise_plots_normalized.pdf')



In [51]:
df_molten['start_hours'] = df_molten['start_time'].dt.hour

In [53]:
df_molten.head()


Out[53]:
Day sheet_name start end variable value start_time end_time delta delta_min normalized_value start_hours
0 Wednesday Jesus Juarez 10:00 10:05 Skateboarders 17 2018-09-06 10:00:00 2018-09-06 10:05:00 00:05:00 5 3.400000 10
1 Tuesday Sophia and Catherine 10:35 10:50 Skateboarders 29 2018-09-06 10:35:00 2018-09-06 10:50:00 00:15:00 15 1.933333 10
2 Friday Zak Cook 10:50 11:00 Skateboarders 37 2018-09-06 10:50:00 2018-09-06 11:00:00 00:10:00 10 3.700000 10
3 Wednesday William Garrick 11:00 11:05 Skateboarders 15 2018-09-06 11:00:00 2018-09-06 11:05:00 00:05:00 5 3.000000 11
4 Wednesday Reem AlFaisal 11:00 11:10 Skateboarders 21 2018-09-06 11:00:00 2018-09-06 11:10:00 00:10:00 10 2.100000 11

In [62]:
df_molten['subject'] = 0
df_molten.head()


Out[62]:
Day sheet_name start end variable value start_time end_time delta delta_min normalized_value start_hours subject
0 Wednesday Jesus Juarez 10:00 10:05 Skateboarders 17 2018-09-06 10:00:00 2018-09-06 10:05:00 00:05:00 5 3.400000 10 0
1 Tuesday Sophia and Catherine 10:35 10:50 Skateboarders 29 2018-09-06 10:35:00 2018-09-06 10:50:00 00:15:00 15 1.933333 10 0
2 Friday Zak Cook 10:50 11:00 Skateboarders 37 2018-09-06 10:50:00 2018-09-06 11:00:00 00:10:00 10 3.700000 10 0
3 Wednesday William Garrick 11:00 11:05 Skateboarders 15 2018-09-06 11:00:00 2018-09-06 11:05:00 00:05:00 5 3.000000 11 0
4 Wednesday Reem AlFaisal 11:00 11:10 Skateboarders 21 2018-09-06 11:00:00 2018-09-06 11:10:00 00:10:00 10 2.100000 11 0

In [61]:
sns.tsplot(data=df_molten, time="start_hours", unit='subject', value="normalized_value")


/home/saket/anaconda3/lib/python3.6/site-packages/seaborn/timeseries.py:183: UserWarning: The tsplot function is deprecated and will be removed or replaced (in a substantially altered version) in a future release.
  warnings.warn(msg, UserWarning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-61-f4c2dbb9c6a9> in <module>()
----> 1 sns.tsplot(data=df_molten, time="start_hours", unit='subject', value="normalized_value")

~/anaconda3/lib/python3.6/site-packages/seaborn/timeseries.py in tsplot(data, time, unit, condition, value, err_style, ci, interpolate, color, estimator, n_boot, err_palette, err_kws, legend, ax, **kwargs)
    294     for c, (cond, df_c) in enumerate(data.groupby(condition, sort=False)):
    295 
--> 296         df_c = df_c.pivot(unit, time, value)
    297         x = df_c.columns.values.astype(np.float)
    298 

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in pivot(self, index, columns, values)
   5189         """
   5190         from pandas.core.reshape.reshape import pivot
-> 5191         return pivot(self, index=index, columns=columns, values=values)
   5192 
   5193     _shared_docs['pivot_table'] = """

~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in pivot(self, index, columns, values)
    407             indexed = self._constructor_sliced(self[values].values,
    408                                                index=index)
--> 409     return indexed.unstack(columns)
    410 
    411 

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in unstack(self, level, fill_value)
   2895         """
   2896         from pandas.core.reshape.reshape import unstack
-> 2897         return unstack(self, level, fill_value)
   2898 
   2899     # ----------------------------------------------------------------------

~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in unstack(obj, level, fill_value)
    493         unstacker = _Unstacker(obj.values, obj.index, level=level,
    494                                fill_value=fill_value,
--> 495                                constructor=obj._constructor_expanddim)
    496         return unstacker.get_result()
    497 

~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
    129 
    130         self._make_sorted_values_labels()
--> 131         self._make_selectors()
    132 
    133     def _make_sorted_values_labels(self):

~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in _make_selectors(self)
    167 
    168         if mask.sum() < len(self.index):
--> 169             raise ValueError('Index contains duplicate entries, '
    170                              'cannot reshape')
    171 

ValueError: Index contains duplicate entries, cannot reshape

In [34]:
sns.lmplot(x="delta_min", y="value", hue="variable", data=df_molten, size=15)
plt.savefig('thursday_a_time_wise_plots.pdf')



In [ ]: