In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [60]:
df = pd.read_csv('dietbet.csv')
df.dtypes
def fix_dates(x):
    return x.split(' ')[0]
df['Posted'] = df.Posted.apply(fix_dates)

In [61]:
df.drop(['Post ID','Permalink'],inplace=True,axis=1)

In [62]:
df.drop(['Countries','Languages'],inplace=True,axis=1)

In [63]:
engagement = pd.Series(df['Lifetime Engaged Users']/df['Lifetime Post Total Impressions'])
df.insert(5,'Engagement Rate',engagement)

In [64]:
def multiply_ten(x):
    return x*100
df['Engagement Rate'] = df['Engagement Rate'].apply(multiply_ten)

In [65]:
df.drop(['Lifetime Average time video viewed':],axis=1,inplace=True)


  File "<ipython-input-65-a08d4e9d7b63>", line 1
    df.drop(['Lifetime Average time video viewed':],axis=1,inplace=True)
                                                 ^
SyntaxError: invalid syntax

In [66]:
df.drop(df.columns[21:],inplace=True,axis=1)

In [68]:
length_msg = pd.Series([len(x) for x in df['Post Message']])
df.insert(0,'Message Length',length_msg)

In [70]:
df.to_csv('dietbet2.csv')
df2 = pd.read_csv('dietbet2.csv',parse_dates=True)

In [ ]:
df3 = pd.read_csv('dietbet2.csv',parse_dates=True)

In [113]:
ax = df3.boxplot(column='Lifetime Post Total Reach',by='Type')



In [74]:
df2['Posted'] = pd.to_datetime(df['Posted'])

In [76]:
df.dtypes


Out[76]:
Unnamed: 0                                                                      int64
Message Length                                                                  int64
Post Message                                                                   object
Type                                                                           object
Posted                                                                 datetime64[ns]
Audience Targeting                                                             object
Lifetime Post Total Reach                                                       int64
Engagement Rate                                                               float64
Lifetime Post organic reach                                                     int64
Lifetime Post Paid Reach                                                        int64
Lifetime Post Total Impressions                                                 int64
Lifetime Post Organic Impressions                                               int64
Lifetime Post Paid Impressions                                                  int64
Lifetime Engaged Users                                                          int64
Lifetime Post Consumers                                                         int64
Lifetime Post Consumptions                                                      int64
Lifetime Negative feedback                                                      int64
Lifetime Negative Feedback from Users                                           int64
Lifetime Post Impressions by people who have liked your Page                    int64
Lifetime Post reach by people who like your Page                                int64
Lifetime Post Paid Impressions by people who have liked your Page               int64
Lifetime Paid reach of a post by people who like your Page                      int64
Lifetime People who have liked your Page and engaged with your post             int64
dtype: object

In [82]:
df_freq = df2[['Posted','Lifetime Post Total Reach','Engagement Rate','Lifetime Post Consumptions']]

In [83]:
df_freq['count'] = 1


/Users/Mike/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [84]:
df_freq.drop('Engagement Rate',inplace=True,axis=1)


/Users/Mike/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [86]:
df_freq.set_index('Posted',inplace=True)
df_resample = df_freq.resample('D',how='sum')


/Users/Mike/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: how in .resample() is deprecated
the new syntax is .resample(...).sum()
  from ipykernel import kernelapp as app

In [88]:
df_resample.dropna(inplace=True)

In [90]:
df_resample['Mean Reach'] = df_resample['Lifetime Post Total Reach'] / df_resample['count']
df_resample['Mean Consumption'] = df_resample['Lifetime Post Consumptions'] / df_resample['count']

In [94]:
df_resample
df_resample['Consumption Rate'] = df_resample['Mean Consumption'] / df_resample['Mean Reach']

In [97]:
df_resample.corr()


Out[97]:
Lifetime Post Total Reach Lifetime Post Consumptions count Mean Reach Mean Consumption Consumption Rate
Lifetime Post Total Reach 1.000000 0.822745 0.242287 0.863281 0.753697 0.333010
Lifetime Post Consumptions 0.822745 1.000000 0.010129 0.851056 0.975008 0.663392
count 0.242287 0.010129 1.000000 -0.211647 -0.161383 -0.017229
Mean Reach 0.863281 0.851056 -0.211647 1.000000 0.884081 0.387112
Mean Consumption 0.753697 0.975008 -0.161383 0.884081 1.000000 0.618451
Consumption Rate 0.333010 0.663392 -0.017229 0.387112 0.618451 1.000000

In [98]:
df_resample.describe()


Out[98]:
Lifetime Post Total Reach Lifetime Post Consumptions count Mean Reach Mean Consumption Consumption Rate
count 75.000000 75.000000 75.000000 75.000000 75.000000 75.000000
mean 3343.013333 117.706667 1.426667 2611.907556 97.564667 0.029437
std 2632.497712 180.568230 0.756474 2426.448961 178.371113 0.022312
min 86.000000 0.000000 1.000000 86.000000 0.000000 0.000000
25% 1646.500000 30.000000 1.000000 1317.500000 23.500000 0.014901
50% 2593.000000 59.000000 1.000000 1801.000000 39.000000 0.024898
75% 4431.000000 129.500000 2.000000 2651.000000 95.250000 0.034212
max 14526.000000 1049.000000 5.000000 14526.000000 1049.000000 0.135101

In [100]:
df_resample[(df_resample['count'] > 4)]


Out[100]:
Lifetime Post Total Reach Lifetime Post Consumptions count Mean Reach Mean Consumption Consumption Rate
Posted
2016-01-06 3812.0 123.0 5.0 762.4 24.6 0.032267

In [107]:
_ = df_resample.plot(kind='scatter',x='count',y='Lifetime Post Consumptions')



In [ ]: