notebook.community

Edit and run



In [ ]:



In [ ]:

    
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
print(pd.__version__)
import random
plt.style.use('ggplot')

from collections import defaultdict



In [ ]:

    
from itertools import product

Problem:

Want to do a skew calculation.
Have amps where rows exist.
Do not have amps where rows do not exist.

Goal:

Using a product of all possible rows, left join with amps where rows exist to fill in known information.

Generate all amp table with row count = 0



In [ ]:

    
ColumnCount = int(input("Column Count: "))
AMPCount = int(input("AMP Count: "))



In [ ]:

    
print("Total Rows of Product: {}".format(ColumnCount*AMPCount))



In [ ]:

    
##want a cross product.
ColumnNames = ["column.{}".format(i) for i in np.arange(ColumnCount)]
AMPIds = ["AMP{}".format(i) for i in np.arange(AMPCount)]

#for each column name and ampid
##don't need this anymore but it's really useful...
#foo = [z for z in product(ColumnNames,AMPIds)]



In [ ]:

    
#Multiindex information
##https://pandas.pydata.org/pandas-docs/stable/advanced.html

##hmm, since there is only one value is this a series instead of a data frame?  Does it matter?



In [ ]:

    
allAmpS = pd.DataFrame({'RowCount':0}, index=pd.MultiIndex.from_product([AMPIds, ColumnNames],
                                                        names=['AMPId', 'ColumnName']))



In [ ]:

    
allAmpS



In [ ]:

    
rowRandomCount = int(input('number of random rows:'))



In [ ]:

    
usedAmpS = pd.DataFrame({'RowCount':0},
                     index=pd.MultiIndex.from_tuples(random.sample([z for z in product(AMPIds,ColumnNames)],
                                                                   rowRandomCount),
                                                     names=['AMPId', 'ColumnName']),
                     )



In [ ]:

    
usedAmpS['RowCount'] = usedAmpS.apply(axis=1, func=lambda row: random.randint(100,1000))



In [ ]:

    
(a,df) = allAmpS.align(usedAmpS, join='left', fill_value = 0)



In [ ]:

    
df



In [ ]:

    
#teradata skew calculation
#1-(min(rowcount)/max(rowcount))



In [ ]:

    
dfgb = df.groupby(['ColumnName'])



In [ ]:

    
def teradataAmpSkew(gv):
    gbmin = gv.RowCount.min()
    #print("min:{}".format(gbmin))
    gbmax = gv.RowCount.max()
    #print("max:{}".format(gbmax))
    
    skew = 1
    if(gbmax != 0):
        skew = 1 - (float(gbmin)/gbmax)
    #print('skew:{}'.format(skew))
    return(skew)



In [ ]:

    
dfgb.aggregate(teradataAmpSkew)



In [ ]:

    
plt.figure()
dfgb.aggregate(teradataAmpSkew).sort_values(by=['RowCount']).plot.bar()
plt.show()



In [ ]:

    
##look at a particular column 'column.3'
##so I want to slice the data by Column.3 and show the AMPid and row count.



In [ ]:

    
df.index



In [ ]:

    
df.index.get_level_values(level='ColumnName')



In [ ]:

    
df.query("ColumnName == 'column.15'")



In [ ]: