In [4]:
import pandas as pd
from pylab import *
import matplotlib.pyplot as plt

In [7]:
data = pd.read_csv("../../data/cleaned/UCB_dept_merge.csv")

In [ ]:


In [9]:
#Changing data type to date
data.creation_date = pd.to_datetime(data.creation_date)
data.po_closed_date = pd.to_datetime(data.po_closed_date)

In [10]:
#Creating Time to completion column
data["days_to_close"] = data.po_closed_date - data.creation_date
data['days_to_close'] = data['days_to_close'].astype('timedelta64[D]')

In [13]:
#Shows that data.po_id is not unique
len(data.po_id) == len(data.po_id.unique())


Out[13]:
False

In [14]:
#total number of po_id
len(data.po_id)


Out[14]:
611110

In [20]:
#i need to pull out those data duplicate
data[data.po_id.duplicated()]['po_id'].unique().shape[0]


Out[20]:
103668

In [26]:
po_counts = data.groupby('po_id').count().iloc[:, 0].sort(inplace=False)
po_counts.index = po_counts.index.astype('str')

In [30]:
%matplotlib inline

In [38]:
po_counts[po_counts > 50].plot(figsize=(15, 5))


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x11470e690>

Total 103668 Blanket_POs


In [2]:
100000/600000


Out[2]:
0

In [ ]:
h

In [ ]:


In [ ]:


In [ ]:


In [ ]: