In [4]:
import pandas as pd
from pylab import *
import matplotlib.pyplot as plt
In [7]:
# Load UCB_dept_merge.csv from the cleaned-data directory
# (the path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="../../data/cleaned/UCB_dept_merge.csv")
In [ ]:
In [9]:
# Parse the two date columns into pandas datetime values.
# Bracket indexing is used so the assignment always targets a column.
data["creation_date"] = pd.to_datetime(data["creation_date"])
data["po_closed_date"] = pd.to_datetime(data["po_closed_date"])
In [10]:
# Time-to-completion column: whole days between PO creation and PO close.
# `.dt.days` replaces the original `.astype('timedelta64[D]')`, which pandas >= 2.0
# rejects (only s/ms/us/ns timedelta resolutions are supported there).
# Rows where either date is missing come out as NaN.
data["days_to_close"] = (data["po_closed_date"] - data["creation_date"]).dt.days
In [13]:
# Uniqueness check for po_id: True only when the number of distinct values
# equals the number of rows (expected to be False here, since po_id repeats).
data["po_id"].unique().size == data["po_id"].size
Out[13]:
In [14]:
# Total number of po_id entries (one per row, duplicates included)
data["po_id"].shape[0]
Out[14]:
In [20]:
# Number of distinct po_id values that occur more than once:
# keep only the repeat occurrences, then count the distinct ids among them.
len(data.loc[data["po_id"].duplicated(), "po_id"].unique())
Out[20]:
In [26]:
# Rows per po_id, ordered from least to most frequent.
# `.count()` counts non-null values per column and `.iloc[:, 0]` keeps the first
# remaining column, so rows that are null in that column are not counted.
# `Series.sort(inplace=False)` was removed from pandas (0.20); `sort_values()` is the
# supported equivalent and likewise returns a new, ascending-sorted Series.
po_counts = data.groupby('po_id').count().iloc[:, 0].sort_values()
# Convert the po_id index to strings (presumably so plots treat the ids as labels
# rather than as numbers -- confirm).
po_counts.index = po_counts.index.astype('str')
In [30]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
In [38]:
# Line plot of the per-PO row counts, restricted to POs with more than 50 rows
po_counts.loc[po_counts > 50].plot(kind="line", figsize=(15, 5))
Out[38]:
Total 103668 Blanket_POs
In [2]:
# Rough share of blanket POs: about 100,000 out of about 600,000.
# NOTE(review): both figures are hand-rounded; 600,000 is presumably the total
# number of PO rows -- confirm against the row count computed earlier.
# The float literal forces true division, so the result is not truncated to 0
# when the notebook runs under Python 2.
blanket_po_share = 100000 / 600000.0
blanket_po_share
Out[2]:
In [ ]:
# (stray `h` removed: the name is not defined in the visible notebook, so running
# this cell would raise NameError and break Restart & Run All)
In [ ]:
In [ ]:
In [ ]:
In [ ]: