In [ ]:
In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
print(pd.__version__)
import random
plt.style.use('ggplot')
from collections import defaultdict
In [ ]:
from itertools import product
Problem:
Goal:
Using a product of all possible rows, left join with amps where rows exist to fill in known information.
Generate all amp table with row count = 0
In [ ]:
ColumnCount = int(input("Column Count: "))
AMPCount = int(input("AMP Count: "))
In [ ]:
print("Total Rows of Product: {}".format(ColumnCount*AMPCount))
In [ ]:
##want a cross product.
ColumnNames = ["column.{}".format(i) for i in np.arange(ColumnCount)]
AMPIds = ["AMP{}".format(i) for i in np.arange(AMPCount)]
#for each column name and ampid
##don't need this anymore but it's really useful...
#foo = [z for z in product(ColumnNames,AMPIds)]
In [ ]:
#Multiindex information
##https://pandas.pydata.org/pandas-docs/stable/advanced.html
##hmm, since there is only one value is this a series instead of a data frame? Does it matter?
In [ ]:
allAmpS = pd.DataFrame({'RowCount':0}, index=pd.MultiIndex.from_product([AMPIds, ColumnNames],
names=['AMPId', 'ColumnName']))
In [ ]:
allAmpS
In [ ]:
rowRandomCount = int(input('number of random rows:'))
In [ ]:
usedAmpS = pd.DataFrame({'RowCount':0},
index=pd.MultiIndex.from_tuples(random.sample([z for z in product(AMPIds,ColumnNames)],
rowRandomCount),
names=['AMPId', 'ColumnName']),
)
In [ ]:
usedAmpS['RowCount'] = usedAmpS.apply(axis=1, func=lambda row: random.randint(100,1000))
In [ ]:
(a,df) = allAmpS.align(usedAmpS, join='left', fill_value = 0)
In [ ]:
df
In [ ]:
#teradata skew calculation
#1-(min(rowcount)/max(rowcount))
In [ ]:
dfgb = df.groupby(['ColumnName'])
In [ ]:
def teradataAmpSkew(gv):
gbmin = gv.RowCount.min()
#print("min:{}".format(gbmin))
gbmax = gv.RowCount.max()
#print("max:{}".format(gbmax))
skew = 1
if(gbmax != 0):
skew = 1 - (float(gbmin)/gbmax)
#print('skew:{}'.format(skew))
return(skew)
In [ ]:
dfgb.aggregate(teradataAmpSkew)
In [ ]:
plt.figure()
dfgb.aggregate(teradataAmpSkew).sort_values(by=['RowCount']).plot.bar()
plt.show()
In [ ]:
##look at a particular column 'column.3'
##so I want to slice the data by Column.3 and show the AMPid and row count.
In [ ]:
df.index
In [ ]:
df.index.get_level_values(level='ColumnName')
In [ ]:
df.query("ColumnName == 'column.15'")
In [ ]: