notebook.community

Edit and run



In [4]:

    
import pandas as pd
from pylab import *
import matplotlib.pyplot as plt



In [7]:

    
data = pd.read_csv("../../data/cleaned/UCB_dept_merge.csv")



In [ ]:



In [9]:

    
#Changing data type to date
data.creation_date = pd.to_datetime(data.creation_date)
data.po_closed_date = pd.to_datetime(data.po_closed_date)



In [10]:

    
#Creating Time to completion column
data["days_to_close"] = data.po_closed_date - data.creation_date
data['days_to_close'] = data['days_to_close'].astype('timedelta64[D]')



In [13]:

    
#Shows that data.po_id is not unique
len(data.po_id) == len(data.po_id.unique())









    Out[13]:





False



In [14]:

    
#total number of po_id
len(data.po_id)









    Out[14]:





611110



In [20]:

    
#i need to pull out those data duplicate
data[data.po_id.duplicated()]['po_id'].unique().shape[0]









    Out[20]:





103668



In [26]:

    
po_counts = data.groupby('po_id').count().iloc[:, 0].sort(inplace=False)
po_counts.index = po_counts.index.astype('str')



In [30]:

    
%matplotlib inline



In [38]:

    
po_counts[po_counts > 50].plot(figsize=(15, 5))









    Out[38]:





<matplotlib.axes._subplots.AxesSubplot at 0x11470e690>

Total 103668 Blanket_POs



In [2]:

    
100000/600000









    Out[2]:





0



In [ ]:

    
h



In [ ]:



In [ ]:



In [ ]:



In [ ]: