In [ ]:


In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
print(pd.__version__)
import random
plt.style.use('ggplot')

from collections import defaultdict

In [ ]:
from itertools import product

Problem:

  • Want to do a skew calculation.
  • Have amps where rows exist.
  • Do not have amps where rows do not exist.

Goal:

Using a product of all possible rows, left join with amps where rows exist to fill in known information.

Generate all amp table with row count = 0


In [ ]:
ColumnCount = int(input("Column Count: "))
AMPCount = int(input("AMP Count: "))

In [ ]:
print("Total Rows of Product: {}".format(ColumnCount*AMPCount))

In [ ]:
##want a cross product.
ColumnNames = ["column.{}".format(i) for i in np.arange(ColumnCount)]
AMPIds = ["AMP{}".format(i) for i in np.arange(AMPCount)]

#for each column name and ampid
##don't need this anymore but it's really useful...
#foo = [z for z in product(ColumnNames,AMPIds)]

In [ ]:
#Multiindex information
##https://pandas.pydata.org/pandas-docs/stable/advanced.html

##hmm, since there is only one value is this a series instead of a data frame?  Does it matter?

In [ ]:
allAmpS = pd.DataFrame({'RowCount':0}, index=pd.MultiIndex.from_product([AMPIds, ColumnNames],
                                                        names=['AMPId', 'ColumnName']))

In [ ]:
allAmpS

In [ ]:
rowRandomCount = int(input('number of random rows:'))

In [ ]:
usedAmpS = pd.DataFrame({'RowCount':0},
                     index=pd.MultiIndex.from_tuples(random.sample([z for z in product(AMPIds,ColumnNames)],
                                                                   rowRandomCount),
                                                     names=['AMPId', 'ColumnName']),
                     )

In [ ]:
usedAmpS['RowCount'] = usedAmpS.apply(axis=1, func=lambda row: random.randint(100,1000))

In [ ]:
(a,df) = allAmpS.align(usedAmpS, join='left', fill_value = 0)

In [ ]:
df

In [ ]:
#teradata skew calculation
#1-(min(rowcount)/max(rowcount))

In [ ]:
dfgb = df.groupby(['ColumnName'])

In [ ]:
def teradataAmpSkew(gv):
    gbmin = gv.RowCount.min()
    #print("min:{}".format(gbmin))
    gbmax = gv.RowCount.max()
    #print("max:{}".format(gbmax))
    
    skew = 1
    if(gbmax != 0):
        skew = 1 - (float(gbmin)/gbmax)
    #print('skew:{}'.format(skew))
    return(skew)

In [ ]:
dfgb.aggregate(teradataAmpSkew)

In [ ]:
plt.figure()
dfgb.aggregate(teradataAmpSkew).sort_values(by=['RowCount']).plot.bar()
plt.show()

In [ ]:
##look at a particular column 'column.3'
##so I want to slice the data by Column.3 and show the AMPid and row count.

In [ ]:
df.index

In [ ]:
df.index.get_level_values(level='ColumnName')

In [ ]:
df.query("ColumnName == 'column.15'")

In [ ]: