cd tutorials/paa_2017_social_media to change to the paa_2017_social_media folder within the tutorials directory
jupyter notebook to launch the notebook server
conda create -n py27 python=2.7 in the terminal to create a Python 2.7 environment named py27
source activate py27 to activate the py27 environment
conda install notebook ipykernel matplotlib
ipython kernel install --user
pip install -r requirements.txt
python setup.py install
import loads a Python module, e.g. from collections import OrderedDict
import pandas as pd imports the pandas module but assigns this module the name "pd" in the namespace; instead of typing pandas.read_csv() to read a .csv file, you can now type pd.read_csv()
import can also bring in custom code/functions from a Python .py file in the current directory, e.g. from utils import *
In [1]:
# uncomment the line below to view the functions in utils.py
#% cat utils.py
In [2]:
import os
import re
import sys
import csv
import json
import glob
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from collections import OrderedDict
from pysocialwatcher import watcherAPI
from utils import *
%matplotlib inline
os.getcwd() returns the current working directory, similar to getwd() in R
In [ ]:
os.getcwd()
Click "Extend Access Token" button (at the bottom) (the button may not show up right away, just refresh the page)
Finally, you want to get your Ads Manager Account ID.
watcherAPI() is a class within the pySocialWatcher module that makes requests to the Facebook Marketing API; watcherAPI() has multiple attributes, including credentials
load_credentials_file is a function within the watcher class that allows users to load a .csv of Facebook Marketing API credentials
check_tokens_account_valid is a function that sends a generic example request to the Facebook Marketing API to assess the validity of the credentials loaded by the load_credentials_file function
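The load_credentials_file call below expects a local .csv holding your Marketing API credentials. As a minimal sketch (the one-token-and-account-per-row layout is an assumption here; check the pySocialWatcher README for the exact expected format), such a file could be created like this:
# Hypothetical sketch of a credentials file for load_credentials_file.
# The row layout (access token, Ads account ID) is an assumption; consult the
# pySocialWatcher documentation for the authoritative format.
with open("../credentials/facebook_credentials.csv", "w") as cred_file:
    cred_file.write("YOUR_ACCESS_TOKEN,YOUR_ADS_ACCOUNT_ID\n")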
In [3]:
watcher = watcherAPI()
watcher.load_credentials_file("../credentials/facebook_credentials.csv")
watcher.check_tokens_account_valid()
print_geo_locations_given_query_and_location_type prints the geographic locations that the Marketing API returns for a given query string and list of location types
In [ ]:
watcherAPI.print_geo_locations_given_query_and_location_type("new", ["city"])
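The same helper can be used with other location types; as an illustrative sketch (the query string and "region" location type below are assumptions, not part of the original tutorial), you could search for regions instead of cities:
# Illustrative sketch: search for regions whose names match "Illinois".
watcherAPI.print_geo_locations_given_query_and_location_type("Illinois", ["region"])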
In [4]:
watcher.print_bad_joke()
In [ ]:
watcher.print_behaviors_list()
Use the pandas read_table function to read the .csv of US states (state names, abbreviations, and Facebook region keys) into a dataframe.
In [5]:
US_states = pd.read_table("../data/US_states.csv", sep = ",", header=0)
US_states.head()
Out[5]:
In [6]:
US_states.iloc[12]
Out[6]:
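Positional indexing with iloc depends on the row order of the file. As an alternative sketch, the same Illinois row could be selected by its abbreviation, mirroring the loc/isin selection used further below:
# Select Illinois by label instead of by position (illustrative alternative).
US_states.loc[US_states["Abbreviation"] == "IL"]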
The run_data_collection function takes a .json file as input and creates an API request; the output of run_data_collection is a pandas dataframe and .csv files. The cell below builds an example request dictionary for Illinois and saves it as a .json file to pass to the run_data_collection function.
In [7]:
illinois_dic={"name": "IL_example", "geo_locations": [{"name": "regions",
"values": [{"key":str(US_states["FB_key"][12])}],
"location_types": ["home"]}],
"genders": [0,1,2],
"ages_ranges": [{"min":13, "max":65}]}
illinois_dic_ordered = OrderedDict(sorted(illinois_dic.items(),key=lambda t: len(t[0])))
illinois_dic_json=json.dumps(illinois_dic_ordered, indent = 4)
print(illinois_dic_json)
file_name = "IL_example.json"
with open('../data/%s' % file_name, 'w') as outfile:
outfile.write(json.dumps(illinois_dic_ordered, indent = 4))
make a request to the Marketing API with the example dictionary created above
In [ ]:
watcher.run_data_collection("../data/%s" % file_name)
From the behaviors list above, the "New mover" Life Event segment has the following ID and description:
6029662272682 | This Life Event segment contains consumers who are likely to be new movers in the last 6 months. | New mover
In [ ]:
illinois_dic_new_movers={"name": "IL_example", "geo_locations": [{"name": "regions",
"values": [{"key":str(US_states["FB_key"][12])}],
"location_types": ["home"]}],
"genders": [0,1,2],
"ages_ranges": [{"min":13, "max":65}],
"behavior":[{"or": [6029662272682],"name": "new_mover"}]}
illinois_dic_new_movers_ordered = OrderedDict(sorted(illinois_dic_new_movers.items(),key=lambda t: len(t[0])))
illinois_dic_new_movers_json=json.dumps(illinois_dic_new_movers_ordered, indent = 4)
print(illinois_dic_new_movers_json)
file_name = "IL_example_new_movers.json"
with open('../data/%s' % file_name, 'w') as outfile:
outfile.write(json.dumps(illinois_dic_new_movers_ordered, indent = 4))
make a request to the Marketing API with the new movers dictionary created above
In [ ]:
watcher.run_data_collection("../data/%s" % file_name)
In [8]:
state_subset = US_states.loc[US_states['Abbreviation'].isin(["CA","NY", "TX", "IL"])]
# reindex state_subset
state_subset.index = [0,1,2,3]
state_subset
Out[8]:
In [ ]:
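# Create one output folder per selected state (skipped if the folder already exists).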
for state in range(0, len(state_subset["State Name"])):
if not os.path.exists('../data/new_movers_by_state/%s' % state_subset["State Name"][state]):
os.makedirs('../data/new_movers_by_state/%s' % state_subset["State Name"][state])
In [ ]:
UN_age_min = [13,15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
UN_age_max = [65,19, 24, 29, 34, 39, 44, 49, 54, 59, 65]
UN_age_table = {'age_min': UN_age_min, 'age_max': UN_age_max}
UN_age_table_df = pd.DataFrame(data=UN_age_table, index=None)
UN_age_table_df = UN_age_table_df[["age_min", "age_max"]]
UN_age_table_df.to_csv("../data/UN_age_table.csv", index=None)
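The UN age table is only written to disk here. As a hedged sketch of one possible use (the variable names below are illustrative and not part of the original notebook), the brackets could be converted into the ages_ranges format used in the request dictionaries above:
# Sketch: turn the saved UN age brackets into "ages_ranges" entries for per-age requests.
un_ages = pd.read_csv("../data/UN_age_table.csv")
ages_ranges = [{"min": int(row["age_min"]), "max": int(row["age_max"])}
               for _, row in un_ages.iterrows()]
print(ages_ranges[:3])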
In [ ]:
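# For each state in state_subset: build a new-movers request dictionary, write it to a
# per-state api_requests_json folder, run the data collection, tag the returned
# dataframe with the state name, and save it as a .csv in api_requests_csv.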
ctr = 0
for state in range(0, len(state_subset["State Name"])):
state_dic={"name": str(state_subset["State Name"][state])+"_new_movers",
"geo_locations": [{"name": "regions", "values": [{"key":str(US_states["FB_key"][state])}],
"location_types": ["home"]}],
"genders": [0,1,2],
"ages_ranges": [{"min":13, "max":65}],
"behavior":[{"or": [6029662272682],"name": "new_movers"}]}
state_dic["geo_locations"][0]["values"][0]['key'] = str(state_subset["FB_key"][state])
state_dic_ordered = OrderedDict(sorted(state_dic.items(),key=lambda t: len(t[0])))
state_dic_json=json.dumps(state_dic_ordered, indent = 4)
gender_dict = {'0':'female_male_total_pop'}
file_name = str(state_subset["State Name"][state])+"_new_movers"+".json"
state_folder = state_subset["State Name"][state]
if not os.path.exists('../data/new_movers_by_state/%s/api_requests_json' % state_folder):
os.makedirs('../data/new_movers_by_state/%s/api_requests_json' % state_folder)
if not os.path.exists('../data/new_movers_by_state/%s/api_requests_csv' % state_folder):
os.makedirs('../data/new_movers_by_state/%s/api_requests_csv' % state_folder)
with open('../data/new_movers_by_state/%s/api_requests_json/%s' % (state_folder,file_name), 'w') as outfile:
outfile.write(json.dumps(state_dic_ordered, indent = 4))
state_api_request = watcher.run_data_collection('../data/new_movers_by_state/%s/api_requests_json/%s' % (state_folder,file_name))
state_api_request
state_api_request.insert(0, "state",state_subset["State Name"][state])
csv_filename = file_name.split(".")[0]+".csv"
state_api_request.to_csv("../data/new_movers_by_state/%s/api_requests_csv/%s" % (state_folder,csv_filename),
index=False)
ctr = ctr +1
total =len(state_subset["State Name"])
print("file %.f of %.f " % (ctr,total))
print(file_name)
In [ ]:
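# Clean up the intermediate files that the data collection step leaves in the
# working directory (collect_finished_*, dataframe_collecting_*, dataframe_skeleton_*).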
%rm collect_finished_* dataframe_collecting_* dataframe_skeleton_*
use the merge_subdirectories function from utils to merge all of the per-state folders in the new_movers_by_state directory into one .csv file
In [ ]:
merge_subdirectories('../data/new_movers_by_state/',
'raw_new_movers_data.csv')
read in merged .csv file of new movers by state
In [9]:
raw_new_movers_data = pd.read_csv("../data/raw_new_movers_data.csv")
raw_new_movers_data.head()
Out[9]:
In [10]:
print("the data has %s rows and %s columns" % (raw_new_movers_data.shape[0], raw_new_movers_data.shape[1]))
print(raw_new_movers_data.dtypes)
use the process_facebook_data function from utils to munge the merged dataframe of new movers data
In [11]:
merged_new_movers_data = process_facebook_data(raw_new_movers_data)
merged_new_movers_data.head()
Out[11]:
use the pandas groupby function to group the data by a column
In [12]:
merged_new_movers_data_genders = merged_new_movers_data.groupby("genders")
merged_new_movers_data_genders["audience"].sum()
Out[12]:
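For a quick cross-tabulation of the same numbers, a minimal pandas pivot_table sketch (not part of the original notebook) could break the audience down by state and gender:
# Sketch: audience totals by state (rows) and gender (columns).
audience_by_state_gender = merged_new_movers_data.pivot_table(
    index="state", columns="genders", values="audience", aggfunc="sum")
print(audience_by_state_gender)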
In [13]:
new_movers_total_pop = merged_new_movers_data.loc[merged_new_movers_data["genders"]=="total population"][["state", "audience"]]
new_movers_total_pop
Out[13]:
In [14]:
new_movers_audience = new_movers_total_pop["audience"]
new_movers_audience
Out[14]:
In [15]:
labels = [i for i in state_subset["Abbreviation"]]
labels
Out[15]:
In [16]:
new_movers_audience.mean()
Out[16]:
In [17]:
plt.subplots(1, figsize=(12, 8))
plt.subplots_adjust(bottom = 0.1)
plt.scatter(np.arange(len(new_movers_audience)), new_movers_audience)
for label, x, y in zip(labels,np.arange(len(new_movers_audience)), new_movers_audience):
plt.annotate(
label,
xy=(x, y), xytext=(-20, 20),
textcoords='offset points', ha='right', va='bottom',
bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))
plt.xticks([])
plt.axhline(new_movers_audience.mean(), linestyle='--', color='red', alpha=0.3)
plt.title("Total Population of New Movers by State")
plt.xlabel("State")
plt.ylabel("Total New Movers")
plt.show()
In [18]:
import matplotlib
print("System and module version information: \n")
print('Python version: %s' % str(sys.version_info))
print('numpy version: %s' % np.__version__)
print('pandas version: %s' % pd.__version__)
print('matplotlib version: %s' % matplotlib.__version__)