In [1]:
%matplotlib inline
import os
import datetime
from io import StringIO
import pandas as pd
import python_pachyderm
First, we'll create a couple of repos and populate them:
In [3]:
# Connect to the Pachyderm cluster: PFS for versioned data, PPS for pipelines.
pfs_client = python_pachyderm.PfsClient()
pps_client = python_pachyderm.PpsClient()

# Create the two input repos.
for repo_name in ("trips", "weather"):
    pfs_client.create_repo(repo_name)

# The "jupyter" pipeline crosses both input repos and merges them via merge.py.
pipeline_input = python_pachyderm.Input(cross=[
    python_pachyderm.Input(pfs=python_pachyderm.PFSInput(glob="/", repo="weather")),
    python_pachyderm.Input(pfs=python_pachyderm.PFSInput(glob="/", repo="trips")),
])
pps_client.create_pipeline(
    "jupyter",
    transform=python_pachyderm.Transform(
        image="pachyderm/pachyderm_jupyter:2019",
        cmd=["python3", "merge.py"],
    ),
    input=pipeline_input,
)
In [4]:
# Populate the input repos
def insert_data(name):
    """Commit every file under ``<name>_data/`` into the repo ``name``.

    Opens one commit on ``master``, streams each local file into it, and
    returns the commit so callers can wait on it with ``flush_commit``.
    """
    print("Inserting {} data...".format(name))
    with pfs_client.commit(name, "master") as commit:
        directory = "{}_data".format(name)
        for filename in os.listdir(directory):
            path = os.path.join(directory, filename)
            with open(path, "rb") as src:
                pfs_client.put_file_bytes(commit, filename, src)
    return commit

trips_commit = insert_data("trips")
weather_commit = insert_data("weather")

# Block until both commits (and the downstream pipeline job) have finished.
print("Waiting for commits to finish...")
for commit in pfs_client.flush_commit([trips_commit, weather_commit]):
    print(commit)
In [5]:
# Read the merged CSV produced by the "jupyter" pipeline.
# get_file yields raw byte chunks split at arbitrary boundaries, so they must
# be concatenated as-is: joining with "\n" (the previous code) injected a
# spurious newline at every chunk boundary, corrupting any row that a chunk
# split in the middle.
chunks = pfs_client.get_file(("jupyter", "master"), "data.csv")
contents = b"".join(chunks).decode("utf8")

df = pd.read_csv(
    StringIO(contents),
    names=["Date", "Precipitation", "Trips", "Sales"],
    index_col="Date",
)
df.index = pd.to_datetime(df.index)
df = df.sort_index()

# Restrict to July 2016 (label-based slicing is inclusive of both endpoints).
df = df[datetime.datetime(year=2016, month=7, day=1):datetime.datetime(year=2016, month=7, day=31)]
print(df)
In [6]:
# Plot trips and sales on the left axis, precipitation on a secondary axis,
# so the differently-scaled series can share one figure.
ax = df.plot(secondary_y=["Precipitation"], figsize=(10, 8))
ax.set_ylabel("Sales ($), # Trips")
ax.legend(loc="upper left")
ax.right_ax.set_ylabel("Precipitation probability")
ax.right_ax.legend(loc="best")
Out[6]:
We can see that there was a probability of precipitation in NYC above 70% on both of the days in question. This is likely the explanation for the poor sales. Of course, we can attach our Jupyter notebook to other parts of the data to explore other unexpected behavior, develop further analyses, etc.