In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from cifar_utils import count_files
In [3]:
PATH = Path('data/cifar10')
In [4]:
# Read the class names, one per line.
# `splitlines()` does not depend on the file ending with a newline (the former
# `split('\n')[:-1]` silently dropped the last class when it did not), and
# stripping each line discards stray '\r' characters and blank lines.
with open(PATH/'labels.txt') as f:
    text = f.read()
cats = [line.strip() for line in text.splitlines() if line.strip()]
cats
Out[4]:
In [5]:
cat_pops = {cat:count_files(PATH/'train'/cat) for cat in cats}
cat_pops
Out[5]:
In [6]:
cols = dict()
cols['cat'],cols['pop'] = zip(*cat_pops.items())
In [7]:
df = pd.DataFrame(cols)
df.head()
Out[7]:
In [8]:
# Bar chart of the per-category populations with a uniform error bar.
# Columns are addressed by label: `x=` expects a column label (not the column's
# values), and the statistics are taken from the population column explicitly
# rather than relying on `df.mean()` skipping the string column.
cat_col, pop_col = df.columns[0], df.columns[1]
pop_mean, pop_std = df[pop_col].mean(), df[pop_col].std()
df.plot.bar(x=cat_col, ylim=(0, 1.2*df[pop_col].values.max()),
            yerr=max(pop_mean*0.005, pop_std), alpha=.8)
pop_mean, pop_std
Out[8]:
In [9]:
def plot_pops(df, print_ms=True):
    """Bar-plots category populations with a uniform error bar.

    `df` must hold the category names in its first column and the populations
    in its second column (the layout `pops_from_df` returns by default).
    If `print_ms` is true, the mean and standard deviation of the populations
    are printed before plotting.
    """
    cat_col, pop_col = df.columns[0], df.columns[1]
    pops = df[pop_col]
    pop_mean, pop_std = pops.mean(), pops.std()
    if print_ms: print(f"{pop_mean:.2f} {pop_std:.2f}")
    # Error bar = standard deviation of the populations, floored at 0.5% of the
    # mean (presumably so the bar stays visible on a balanced dataset).
    # `x` is given as a column label, which is what DataFrame.plot expects.
    df.plot.bar(x=cat_col, ylim=(0, 1.2*pops.values.max()),
                yerr=max(pop_mean*0.005, pop_std), alpha=.8)
In [10]:
df_backup = df.copy()
In [11]:
# df.at[2,'cat'] # equals 'bird'
df.at[2, 'pop'] /= 2 # 4250 -> 2125
df.at[2,'pop'] # for editing df values see: https://stackoverflow.com/a/13842286
Out[11]:
In [12]:
plot_pops(df)
In [13]:
df.std()[0]
Out[13]:
In [14]:
# generate a csv from a dataset
def generate_csv(path, labelmap=None, folder='train'):
    """Infers a csv-style table from directory structure.

    Every sub-directory of `path/folder` is treated as one class; every entry
    inside it becomes one row.
    `labelmap` is a dictionary mapping class folders to class names.
    Class names are taken from class folder names if no mapping is provided.
    For Single-Category Classification.

    Returns a DataFrame with columns 'file' ('<class folder>/<entry name>')
    and 'class'. Raises KeyError if `labelmap` lacks a class folder.
    """
    root = path/folder
    # os.listdir returns entries in arbitrary order: sort both levels so the
    # resulting table is identical from run to run and machine to machine.
    catfolders = sorted(f for f in os.listdir(root) if (root/f).is_dir())
    if labelmap is None: labelmap = {cf:cf for cf in catfolders}
    rows = []
    for cat in catfolders:
        label = labelmap[cat]
        rows.extend((cat+'/'+fname, label) for fname in sorted(os.listdir(root/cat)))
    return pd.DataFrame(rows, columns=['file','class'])
In [15]:
%%time
csv = generate_csv(PATH)
In [16]:
csv.sample(n=10)
Out[16]:
In [17]:
def pops_from_df(df, catdx=1, colnames=['cat','pop']):
    """Extracts category populations from a DataFrame.

    `catdx` is the position of the category column. A category's population is
    the number of non-null entries in the first column among the rows of that
    category. Categories are listed in order of first appearance.

    If `colnames=None`: returns a dictionary {category: population}, otherwise
    a dataframe with `colnames` columns.
    """
    labels = df[df.columns[catdx]]
    first_col = df.iloc[:, 0]
    cats = labels.unique()
    # Positional boolean masks (`.values`) avoid index alignment, so this also
    # works on frames whose index contains duplicate labels.
    pops = [int(first_col[(labels == cat).values].count()) for cat in cats]
    pairs = list(zip(cats, pops))
    if colnames:
        return pd.DataFrame(pairs, columns=colnames)
    return dict(pairs)
In [18]:
pops_from_df(csv, colnames=None)
Out[18]:
In [19]:
plot_pops(pops_from_df(csv))
In [20]:
def csv_subset(df, catdx=1, p=0.5):
    """Returns a percentage of the original dataset, sampled uniformly by category.

    `catdx` is the position of the category column.
    `p` is either a fraction of each category to keep (float) or an absolute
    number of rows (int), which is converted to the equivalent fraction of
    `df`. Every category keeps at least one row.

    Rows are drawn without replacement using NumPy's global random state and
    are returned grouped by category, in order of first appearance.
    """
    if len(df) == 0: return df.iloc[[]]
    # An int `p` is a row count: turn it into a fraction of the number of rows.
    # (`df.count()` is a per-column Series and cannot be used as a scalar here.)
    if type(p) is int: p /= len(df)
    labels = df[df.columns[catdx]]
    picks = []
    for cat in labels.unique():
        cat_idxs = np.where((labels == cat).values)[0]
        n = max(1, int(len(cat_idxs)*p))
        picks.append(np.random.choice(cat_idxs, size=n, replace=False))
    return df.iloc[np.concatenate(picks)]
In [410]:
full_df = generate_csv(PATH)
cat_pops = pops_from_df(full_df)
full_df.count()[0]
Out[410]:
In [411]:
new_csv = csv_subset(full_df)
new_csv.count()[0]
Out[411]:
In [412]:
pops_from_df(new_csv)
Out[412]:
In [413]:
new_csv.head()
Out[413]:
In [421]:
np.unique(new_csv[new_csv['class']=='cat'].as_matrix()[:,0]).shape # check: no repeats
Out[421]:
In [950]:
def smooth_csv_dataset(df, eps=0.1, full_df=None, catdx=1):
    """'Smooths' out a dataset by adding copied samples.
    For use with single-label classification (2-column) CSVs.

    Every category is topped up so that none ends up (roughly) more than two
    'desired standard deviations' below the largest category, where the
    desired standard deviation is `eps` times the mean category size.
    Rows are only ever added, never removed.

    `df`      : DataFrame to smooth; it is not modified.
    `eps`     : desired standard deviation as a fraction of the mean category size.
    `full_df` : DataFrame the extra rows are sampled from (defaults to `df`).
    `catdx`   : position of the category column.

    Returns a new DataFrame: `df` followed by the copied rows. Copied rows
    keep their `full_df` index labels, so the index may contain duplicates.
    Sampling uses NumPy's global random state.
    Raises ValueError if a category that needs rows has none in `full_df`.
    """
    full_df = df if full_df is None else full_df
    # get category column name
    catcol = df.columns[catdx]
    # get category populations & calculate desired range
    cat_pops = pops_from_df(df, catdx=catdx)
    cats = cat_pops['cat'].values
    c_totals = cat_pops['pop'].values
    sd = eps * c_totals.mean()
    # Normalize category sizes: map them linearly onto [max - 2*sd, max]
    c_max = c_totals.max()
    c_norm = c_totals/c_max
    new_mean = c_max - sd
    new_totals = (2*sd * c_norm + (new_mean - sd)).astype(int)
    # Increase category sizes by differences
    diffs = new_totals - c_totals
    copy_idxs = []
    for cat, diff in zip(cats, diffs):
        diff = int(diff)
        # Float truncation (or a large eps) can put a target just below the
        # current size; categories are never shrunk, so skip those.
        if diff <= 0: continue
        cat_idxs = np.where((full_df[catcol] == cat).values)[0]
        full_cat_pop = len(cat_idxs)
        if full_cat_pop == 0:
            raise ValueError(f"no rows of category {cat!r} in full_df to copy from")
        # copy the whole category as many times as it fits into the difference,
        # then sample the remainder without replacement
        n_copy, remainder = divmod(diff, full_cat_pop)
        for _ in range(n_copy): copy_idxs.extend(cat_idxs)
        copy_idxs.extend(np.random.choice(cat_idxs, size=remainder, replace=False))
    # pd.concat instead of DataFrame.append, which newer pandas no longer has
    return pd.concat([df, full_df.iloc[copy_idxs]])
In [959]:
def basic_plot2(c_totals=c_totals, sd=sd, oldmean=None, catlist=None):
    """Bar-plots category totals with a centre line and a +/- `sd` band.

    The red line is drawn at `oldmean` when it is given, otherwise at the mean
    of `c_totals`; black lines are drawn `sd` above and below it. `catlist`,
    if given, supplies the x tick labels (rotated 90 degrees). The centre,
    `sd`, the actual standard deviation and the totals are printed.

    Note: the `c_totals` and `sd` defaults are whatever those globals held
    when this cell was executed.
    """
    actual_std = c_totals.std()
    if oldmean is None:
        centre = c_totals.mean()
    else:
        centre = oldmean
    positions = range(len(c_totals))
    plt.bar(x=positions, height=c_totals, alpha=0.4, color='k')
    for level, colour in ((centre, 'r'), (centre+sd, 'k'), (centre-sd, 'k')):
        plt.axhline(y=level, c=colour)
    if catlist is not None:
        plt.xticks(positions, catlist, rotation=90)
    print(centre, (sd, actual_std))
    print(c_totals)
Makes this:
In [963]:
# Build a deliberately skewed sample: 5000 random rows with half of the
# 'airplane' rows removed, then plot its category populations.
df = generate_csv(PATH)
skewed_df = df.sample(n=5000)
drop_idxs = np.where(skewed_df['class']=='airplane')[0]
drop_idxs = np.random.choice(drop_idxs, size=len(drop_idxs)//2, replace=False)
skewed_df = skewed_df.drop(skewed_df.index[drop_idxs])
cat_pops = pops_from_df(skewed_df)
# `.values` stands in for DataFrame.as_matrix, which newer pandas no longer
# has; the double brackets keep the (n_categories, 1) shape indexed below.
c_totals = cat_pops[['pop']].values
catlist = cat_pops[['cat']].values
eps = 0.1; sd = c_totals.mean()*eps
basic_plot2(c_totals=c_totals[:,0],sd=sd,catlist=catlist[:,0])
Becomes this:
In [975]:
# Smooth the skewed sample, drawing the extra rows from the full dataset,
# and plot the resulting category populations.
smoothed_df = smooth_csv_dataset(skewed_df, eps=eps, full_df=df)
cat_pops = pops_from_df(smoothed_df)
# `.values` stands in for DataFrame.as_matrix, which newer pandas no longer
# has; the double brackets keep the (n_categories, 1) shape indexed below.
c_totals = cat_pops[['pop']].values
catlist = cat_pops[['cat']].values
basic_plot2(c_totals=c_totals[:,0], sd=sd, catlist=catlist[:,0])
Looking at it another way:
In [976]:
basic_plot2(c_totals=c_totals[:,0], sd=sd, catlist=catlist[:,0], oldmean=c_totals.max()-sd)
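New algorithm ($\sigma_\delta$ is the desired standard deviation):
1. Set the maximum to the size of the largest category.
2. Calculate the new mean as 1 $\sigma_\delta$ below the maximum.
3. Normalize the dataset to within 1 $\sigma_\delta$ of the new mean.

Normalization: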
In [462]:
def basic_plot():
    """Bar-plots the global `c_totals` with its mean and a +/- `sd` band.

    Reads the globals `c_totals` and `sd`: the red line marks the mean of
    `c_totals`, the black lines lie `sd` above and below it. The mean, `sd`,
    the actual standard deviation and the totals are printed.
    """
    centre = c_totals.mean()
    actual_std = c_totals.std()
    plt.bar(x=range(len(c_totals)), height=c_totals, alpha=0.4, color='k')
    for level, colour in ((centre, 'r'), (centre+sd, 'k'), (centre-sd, 'k')):
        plt.axhline(y=level, c=colour)
    print(centre, (sd, actual_std))
    print(c_totals)
In [500]:
df = generate_csv(PATH)
Getting a 'skewed' dataset from the original DataFrame
In [556]:
skewed_df = df.sample(n=5000)
counts = pops_from_df(skewed_df); counts.mean()[0], counts.std()[0]
Out[556]:
In [557]:
drop_idxs = np.where(skewed_df['class']=='dog')[0]
drop_idxs.shape
Out[557]:
In [558]:
drop_idxs = np.random.choice(drop_idxs, size=len(drop_idxs)//2, replace=False)
drop_idxs.shape
Out[558]:
In [584]:
skewed_df.drop(skewed_df.index[drop_idxs], inplace=True)
skewed_df.count()[0], np.where(skewed_df['class']=='dog')[0].shape[0]
Out[584]:
In [608]:
plot_pops(pops_from_df(skewed_df))
In this case with an ε of 10%, we'd want a standard deviation of about 47. It's currently at 75.
Modifying the algorithm to work on dataframes instead of integer arrays:
In [610]:
eps = 0.1
cat_pops = pops_from_df(skewed_df)
c_totals = cat_pops.as_matrix(columns=['pop'])
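NumPy and Pandas disagree on the standard deviation because their defaults differ: NumPy's `std` computes the population standard deviation (`ddof=0`, dividing by $N$), while Pandas' `std` computes the sample standard deviation (`ddof=1`, dividing by $N-1$).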
In [611]:
c_totals.mean(), c_totals.std()
Out[611]:
In [631]:
sd = eps * c_totals.mean() # desired standard deviation
Normalize category sizes:
In [728]:
# normalize category sizes
c_normd = c_totals/c_totals.max()# - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
new_totals = (2*sd * c_normd + (mean - sd)).astype(int)
#display
print(c_totals.mean(),mean);print(c_totals.std(),new_totals.std()); list(zip(c_totals, new_totals))
Out[728]:
In [729]:
c_normd
Out[729]:
In [730]:
c_totals.max()
Out[730]:
In [731]:
c_totals[:,0]
Out[731]:
Looks like things are working to spec:
In [741]:
def basic_plot2(c_totals=c_totals, sd=sd, oldmean=None):
    """Bar-plots category totals with a centre line and a +/- `sd` band.

    The red line is drawn at `oldmean` when it is given, otherwise at the mean
    of `c_totals`; black lines are drawn `sd` above and below it. The centre,
    `sd`, the actual standard deviation and the totals are printed.

    Note: the `c_totals` and `sd` defaults are whatever those globals held
    when this cell was executed.
    """
    actual_std = c_totals.std()
    if oldmean is None:
        centre = c_totals.mean()
    else:
        centre = oldmean
    plt.bar(x=range(len(c_totals)), height=c_totals, alpha=0.4, color='k')
    for level, colour in ((centre, 'r'), (centre+sd, 'k'), (centre-sd, 'k')):
        plt.axhline(y=level, c=colour)
    print(centre, (sd, actual_std))
    print(c_totals)
Showing the new distribution with original mean flanked by desired standard deviation:
In [739]:
basic_plot2(c_totals=new_totals[:,0], sd=sd, oldmean=mean)
Showing the new mean flanked by desired standard deviation:
In [740]:
basic_plot2(c_totals=new_totals[:,0], sd=sd)
Looks like the algorithm mechanics are working as they should. Now to do the dataframe resample bit.
In [745]:
diffs = new_totals - c_totals
diffs[:,0]
Out[745]:
Adding to a single category:
In [749]:
cat_pops['cat'][0]
Out[749]:
In [750]:
cat_pops
Out[750]:
In [759]:
# cats = cat_pops['cat'].values
idx = 0; cat = cat_pops['cat'].values[idx]
cat
Out[759]:
New Totals says the 'deer' category should have 15 elements added to it, bringing a total of 506 to 521 (I'm a bit paranoid about numerical stability):
In [764]:
cat_pops.iloc[idx]['pop'], new_totals[idx][0]
Out[764]:
Grab a random sample of 15 elements from the original dataset for 'deer':
In [782]:
copy_idxs = np.random.choice(np.where(df['class'] == cat)[0], size=diffs[idx], replace=False)
copy_idxs
Out[782]:
In [783]:
copyrows = df.iloc[copy_idxs]
In [784]:
copyrows
Out[784]:
Holy shit it works.
In [787]:
tmpdf = skewed_df.copy()
tmpdf.describe()
Out[787]:
In [788]:
tmpdf.append(copyrows).describe()
Out[788]:
In [791]:
tmpdf = tmpdf.append(copyrows)
tmpdf.iloc[-16:]
Out[791]:
Looks good. Now for all categories.
In [810]:
cats = cat_pops['cat'].values
copy_idxs = []
for i,cat in enumerate(cats):
diff = diffs[i]
copy_idxs.extend(np.random.choice(np.where(df['class'] == cat)[0], size=diff, replace=False))
len(copy_idxs)
Out[810]:
In [811]:
new_totals.sum() - c_totals.sum()
Out[811]:
In [814]:
copyrows = df.iloc[copy_idxs]
copyrows.count()
Out[814]:
In [818]:
copyrows.sample(n=5)
Out[818]:
Moment of truth: adding it back together and counting the result:
In [842]:
tmpdf = skewed_df.copy()
tmpdf.count()
Out[842]:
In [843]:
print(skewed_df.count()[0])
skewed_df = skewed_df.append(copyrows)
print(skewed_df.count()[0])
In [824]:
tmpdf.count()
Out[824]:
In [844]:
plot_pops(pops_from_df(skewed_df))
What the dataset looks like now:
In [845]:
basic_plot2(c_totals=pops_from_df(skewed_df).as_matrix(columns=['pop'])[:,0], sd=sd)
And what it looks like with the original mean superimposed:
In [846]:
basic_plot2(c_totals=pops_from_df(skewed_df).as_matrix(columns=['pop'])[:,0], sd=sd, oldmean=c_totals.mean())
The dataset's class distribution has been smoothed out to within 1 desired standard deviation of the mean.
Automated dataset smoothing:
In [922]:
def smooth_csv_dataset(df, eps=0.1, full_df=None, catdx=1):
    """'Smooths' out a dataset by adding copied samples.
    For use with single-label classification (2-column) CSVs.

    Every category is topped up so that none ends up (roughly) more than two
    'desired standard deviations' below the largest category, where the
    desired standard deviation is `eps` times the mean category size.
    Rows are only ever added, never removed.

    `df`      : DataFrame to smooth; it is not modified.
    `eps`     : desired standard deviation as a fraction of the mean category size.
    `full_df` : DataFrame the extra rows are sampled from (defaults to `df`).
    `catdx`   : position of the category column.

    Returns a new DataFrame: `df` followed by the copied rows. Copied rows
    keep their `full_df` index labels, so the index may contain duplicates.
    Sampling uses NumPy's global random state.
    Raises ValueError if a category that needs rows has none in `full_df`.
    """
    full_df = df if full_df is None else full_df
    # get category column name
    catcol = df.columns[catdx]
    # get category populations & calculate desired range
    cat_pops = pops_from_df(df, catdx=catdx)
    cats = cat_pops['cat'].values
    c_totals = cat_pops['pop'].values
    sd = eps * c_totals.mean()
    # Normalize category sizes: map them linearly onto [max - 2*sd, max]
    c_max = c_totals.max()
    c_norm = c_totals/c_max
    new_mean = c_max - sd
    new_totals = (2*sd * c_norm + (new_mean - sd)).astype(int)
    # Increase category sizes by differences
    diffs = new_totals - c_totals
    copy_idxs = []
    for cat, diff in zip(cats, diffs):
        diff = int(diff)
        # Float truncation (or a large eps) can put a target just below the
        # current size; categories are never shrunk, so skip those.
        if diff <= 0: continue
        cat_idxs = np.where((full_df[catcol] == cat).values)[0]
        full_cat_pop = len(cat_idxs)
        if full_cat_pop == 0:
            raise ValueError(f"no rows of category {cat!r} in full_df to copy from")
        # copy the whole category as many times as it fits into the difference,
        # then sample the remainder without replacement
        n_copy, remainder = divmod(diff, full_cat_pop)
        for _ in range(n_copy): copy_idxs.extend(cat_idxs)
        copy_idxs.extend(np.random.choice(cat_idxs, size=remainder, replace=False))
    # pd.concat instead of DataFrame.append, which newer pandas no longer has
    return pd.concat([df, full_df.iloc[copy_idxs]])
In [876]:
int(np.array([1]))
Out[876]:
In [867]:
tmp = []
arr = [1,2,3]
ndarr = np.array(arr)
# tmp.extend(arr)
# tmp.extend(ndarr)
tmp.extend(np.array([1,2,3]))
tmp
Out[867]:
Testing:
In [500]:
df = generate_csv(PATH)
In [848]:
skewed_df = df.sample(n=5000)
counts = pops_from_df(skewed_df); counts.mean()[0], counts.std()[0]
drop_idxs = np.where(skewed_df['class']=='dog')[0]
drop_idxs = np.random.choice(drop_idxs, size=len(drop_idxs)//2, replace=False)
skewed_df.drop(skewed_df.index[drop_idxs], inplace=True)
skewed_df.count()[0], np.where(skewed_df['class']=='dog')[0].shape[0]
Out[848]:
In [913]:
cat_pops = pops_from_df(skewed_df); c_totals = cat_pops.as_matrix(columns=['pop']); sd=c_totals.mean()*eps
basic_plot2(c_totals=c_totals[:,0], sd=sd)
Without a 'full' base dataset:
In [918]:
eps * c_totals.mean()
Out[918]:
In [919]:
c_totals.max()
Out[919]:
In [932]:
smooth_df = smooth_csv_dataset(skewed_df)
In [933]:
smooth_df.count()[0]
Out[933]:
In [934]:
oldmean
Out[934]:
In [935]:
c_totals.mean()
Out[935]:
I got a little confused looking at these plots, and realized I wasn't actually displaying or calculating the 'old' mean: more of a 'pseudomean'. The 'oldmean' parameter to the plot ('new_mean' inside the smoother function) is the pseudomean.
Although it's what I meant, I'm not actually asking for the dataset to be within 1 desired standard deviation of the mean. I'm asking that no category be more than 2 desired standard deviations below the maximum.
That also means nothing is further than ±1 standard deviation from the new actual mean. So I'm in effect imposing 2 constraints:
In [936]:
# Smoothed dataset is now
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd, oldmean=c_totals.max()-sd)
In [923]:
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd)
With a 'full' base dataset:
In [940]:
%time smooth_df = smooth_csv_dataset(skewed_df, full_df=df)
In [942]:
# entire dataset is within 2 stdevs of the maximum
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd, oldmean=c_totals.max()-sd)
In [941]:
# entire dataset is within 1 stdev of the mean
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd)
In [945]:
smooth_df.describe()
Out[945]:
In [949]:
smooth_df.count()[0], skewed_df.count()[0]
Out[949]:
In [ ]:
In [764]:
cat_pops.iloc[idx]['pop'], new_totals[idx][0]
Out[764]:
Grab a random sample of 15 elements from the original dataset for 'deer':
In [782]:
copy_idxs = np.random.choice(np.where(df['class'] == cat)[0], size=diffs[idx], replace=False)
copy_idxs
Out[782]:
In [783]:
copyrows = df.iloc[copy_idxs]
In [ ]:
In [ ]:
# randomly sample from dataset to fill the difference
adds = []
diffs = new_totals - c_totals
for i,diff in enumerate(diffs):
add = np.random.choice(df[catcol]==#TODO)
In [ ]:
In [704]:
c_totals[np.argmin(c_totals)], c_normd[np.argmin(c_totals)]
Out[704]:
In [ ]:
# normalize wrt max
c_normd = c_totals/c_totals.max()
# set floor as 2sd below max
In [ ]:
In [702]:
cmin = c_totals[np.argmin(c_totals)]
cnormin = c_normd[np.argmin(c_totals)]
2*sd * cnormin
Out[702]:
Too many values are being reduced. My algorithm isn't perfect for the job. But: what if I just take the max of the result? What does the mean & stdv look like then?
In [693]:
maxs = np.max((c_totals, new_totals), axis=0)
print(maxs.mean(), maxs.std())
list(zip(*maxs))[0]
Out[693]:
No values have increased beyond the original max, and all
In [695]:
maxs.min() - maxs.mean(), maxs.max() - maxs.mean()
Out[695]:
In [ ]:
In [646]:
c_totals.mean(), mean
Out[646]:
In [ ]:
In [638]:
new_totals
Out[638]:
In [ ]:
In [490]:
## prototyping: dummy dataset variation
c_totals = np.random.randint(1000, 3000, size=10)
# calculate mean, actual & desired standard deviation
# mean = c_totals.mean()
# sa = c_totals.std()
sd = eps * mean
# normalize category sizes
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
new_totals = (2*sd * c_normd + (mean - sd)).astype(int)
# randomly sample from dataset to fill the difference
adds = []
diffs = new_totals - c_totals
for i,diff in enumerate(diffs):
add = np.random.choice(df[catcol]==#TODO)
Out[490]:
In [ ]:
In [612]:
cat_pops.std()
Out[612]:
In [613]:
c_totals
Out[613]:
In [614]:
cat_pops
Out[614]:
In [615]:
def plot_pops(df, print_ms=True):
    """Bar-plots category populations with a uniform error bar.

    `df` must hold the category names in its first column and the populations
    in its second column (the layout `pops_from_df` returns by default).
    If `print_ms` is true, the mean and standard deviation of the populations
    are printed before plotting.
    """
    cat_col, pop_col = df.columns[0], df.columns[1]
    pops = df[pop_col]
    pop_mean, pop_std = pops.mean(), pops.std()
    if print_ms: print(f"{pop_mean:.2f} {pop_std:.2f}")
    # Error bar = standard deviation of the populations, floored at 0.5% of the
    # mean (presumably so the bar stays visible on a balanced dataset).
    # `x` is given as a column label, which is what DataFrame.plot expects.
    df.plot.bar(x=cat_col, ylim=(0, 1.2*pops.values.max()),
                yerr=max(pop_mean*0.005, pop_std), alpha=.8)
In [616]:
cat_pops.mean()
Out[616]:
In [617]:
cat_pops.std()
Out[617]:
In [618]:
cat_pops.as_matrix(columns=['pop']).std()
Out[618]:
In [619]:
cat_pops.as_matrix(columns=['pop'])
Out[619]:
In [629]:
cat_pops.as_matrix(columns=['pop']).mean(), cat_pops.as_matrix(columns=['pop']).std()
Out[629]:
In [626]:
cat_pops['pop'].sum()
Out[626]:
In [628]:
cat_pops['pop'].mean(), cat_pops['pop'].std()
Out[628]:
In [ ]:
In [599]:
c_totals.sum()
Out[599]:
In [490]:
## prototyping: dummy dataset variation
c_totals = np.random.randint(1000, 3000, size=10)
# calculate mean, actual & desired standard deviation
# mean = c_totals.mean()
# sa = c_totals.std()
sd = eps * mean
# normalize category sizes
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
new_totals = (2*sd * c_normd + (mean - sd)).astype(int)
# randomly sample from dataset to fill the difference
adds = []
diffs = new_totals - c_totals
for i,diff in enumerate(diffs):
add = np.random.choice(df[catcol]==)
Out[490]:
In [ ]:
In [496]:
tmp = []
tmp.append(df.iloc[5:10])
tmp
Out[496]:
In [497]:
pd.DataFrame(tmp)
Out[497]:
In [ ]:
In [463]:
eps = 0.1
c_totals = np.random.randint(1000, 3000, size=10)
mean = c_totals.mean()
sa = c_totals.std()
sd = eps * mean
basic_plot()
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
c_totals = (2*sd * c_normd + (mean - sd))
basic_plot()
In [ ]:
First time I got the normalization algorithm working:
In [386]:
eps = 0.1
c_totals = np.random.randint(1000, 3000, size=10)
mean = c_totals.mean()
sa = c_totals.std()
sd = eps * mean
basic_plot()
In [412]:
c_totals / c_totals.mean() - (c_totals.min()/c_totals.max())
Out[412]:
In [413]:
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
c_normd
Out[413]:
In [414]:
new_mean = c_totals.max() - sd
new_mean
Out[414]:
In [420]:
new_totals = (2*sd * c_normd + (new_mean - sd))
new_totals
Out[420]:
In [427]:
c_totals = new_totals
mean = new_mean
basic_plot()
In [ ]:
In [390]:
c_norm = c_totals / np.linalg.norm(c_totals, ord=np.inf)
c_range= c_norm * sd*2
c_range
Out[390]:
In [392]:
c_range.max()
Out[392]:
In [ ]:
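New algorithm ($\sigma_\delta$ is the desired standard deviation):
1. Set the maximum to the size of the largest category.
2. Calculate the new mean as 1 $\sigma_\delta$ below the maximum.
3. Normalize the dataset to within 1 $\sigma_\delta$ of the new mean.

Normalization:
- % distance to new mean = % distance to original mean, with the maximum distance set by $\sigma_\delta$.
- This means that categories which are further from the original mean than the maximum is will be clamped to (new mean − $\sigma_\delta$).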
In [385]:
c_max = c_totals.max()
new_mean = c_max - sd
c_norm = c_totals/np.linalg.norm(c_totals, ord=np.inf)
c_total = c_norm * sd + new_mean
# for i,c_tot in enumerate(c_totals):
# c_totals[i] = max(new_mean-sd, min())
# if c_tot < mean:
# c_totals[i] = max(new_mean-sd, c_tot*(c_tot/mean)/(mean/(new_mean+sd)))
# elif c_tot > mean:
# c_totals[i] = max(new_mean-sd, c_tot*(c_tot/c_max)/(c_max/(new_mean+sd)))
mean = new_mean
sa = c_totals.std()
sd = eps*mean
basic_plot()
In [ ]:
In [313]:
c_max = c_totals.max()
new_mean = c_max - sd
for i,c_tot in enumerate(c_totals):
# c_totals[i] = max(new_mean-sd, min())
if c_tot < mean:
c_totals[i] = max(new_mean-sd, c_tot*(c_tot/mean)/(mean/(new_mean+sd)))
elif c_tot > mean:
c_totals[i] = max(new_mean-sd, c_tot*(c_tot/c_max)/(c_max/(new_mean+sd)))
mean = new_mean
sa = c_totals.std()
sd = eps*mean
basic_plot()
In [ ]:
In [491]:
diffs
Out[491]:
In [479]:
# cat_pops = pops_from_df(csv, colnames=None)
cat_pops = pops_from_df(csv)
In [480]:
cat_pops.as_matrix(columns=['pop'])
Out[480]:
In [481]:
totals = cat_pops.as_matrix(columns=['pop'])
In [483]:
totals.std()
Out[483]:
In [484]:
cat_pops
Out[484]:
In [485]:
for cat in cat_pops.as_matrix(columns=['cat']):
print(cat)
In [489]:
df['class']==cat[0]
Out[489]:
In [ ]:
In [306]:
c_max = 150
c_tot = c_max
sd = 20
new_mean = c_max - sd
c_tot*(c_tot/c_max)/(c_max/(new_mean+sd))
Out[306]:
In [309]:
c_max
Out[309]:
In [311]:
new_mean + sd
Out[311]:
In [ ]:
In [298]:
c_totals.mean()
Out[298]:
In [299]:
mean
Out[299]:
In [297]:
basic_plot()
In [287]:
(70/100)/(100/150)
Out[287]:
In [294]:
(150/100)/(100/150)
Out[294]:
In [ ]:
Standard Deviation formula:

$$\sigma = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(x_i - \mu\right)^2}$$

So, I want $\sigma$ to be a fraction `eps` of the mean category size (`sd = eps * mean` in the code below). The main factor in this equation is the difference from the mean, and it's squared, so there's a high penalty on being off-mean. What makes this not simple is that $\sigma$ and $\mu$ affect each other, so there's a differential equation somewhere that has to be solved.
1st algorithm idea:

$\sigma$: standard deviation; $\alpha$: actual; $\delta$: desired

- $k = \sigma_\delta / 2$
- While $\sigma_\alpha$ is greater than $\sigma_\delta$:
    - For each category:
        - if that category is below the mean by more than 1 $\sigma_\delta$:
            - add $k$ random copies to that category
    - recalculate the mean
    - recalculate $\sigma_\delta$
    - If no categories are below the mean by more than 1 $\sigma_\delta$:
        - $k = k/2$

I'm concerned about updates. The 'perfect' numerical solution is to recalculate the mean and standard deviation every time $k$ is added, and to keep $k$ at 1. This is also a worst-case scenario for time. Setting $k = \sigma_\delta / 2$ is my attempt at striking a balance between 'resolution' and speed.
In [338]:
eps = 0.1
c_totals = np.random.randint(1000, 3000, size=10)
mean = c_totals.mean()
sa = c_totals.std()
sd = eps * mean
k = sd/2
iters = 0
basic_plot()
This initial version of the algorithm stops progressing when the minimum total is within 1 desired standard deviation of the mean. However, if any totals are more than a desired standard deviation from the mean, the actual standard deviation will be greater than the desired one. The algorithm will fail to converge whenever this condition exists.
In [349]:
# while sa > sd:
k_OK = False
for i,c_tot in enumerate(c_totals):
if mean - c_tot > sd:
c_totals[i] += k
k_OK = True
# update
mean = c_totals.mean()
sa = c_totals.std()
sd = eps * mean
# check resolution change
if not k_OK: k /= 2
iters += 1; print(f'iterations: {iters}')
basic_plot()
In [350]:
k
Out[350]:
In [146]:
k
Out[146]:
In [ ]:
In [29]:
cat_pops
Out[29]:
In [ ]:
In [21]:
data = np.array([1000, 1500, 700, 1200, 1100, 900])
data.mean(), data.std()
Out[21]:
In [26]:
ε = 0.1
σ = data.mean() * ε; σ
Out[26]:
In [ ]:
In [277]:
pops_from_df(df.sample(n=1000))
Out[277]:
In [375]:
n = 100
catdx=1
catcol = df.columns[catdx]
cats = df[catcol].unique()
df_slice = df[catcol]
keep_idxs = np.array([], dtype=np.int64) ## np.dtype(int) --> dtype('int64')
for cat in cats:
keep_idxs = np.concatenate((keep_idxs, np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False)))
In [385]:
np.where(df_slice==cat)[0]
Out[385]:
In [384]:
len(np.where(df_slice==cat)[0])
Out[384]:
In [390]:
np.array([], dtype='int64'), np.array([], dtype=np.int64)
Out[390]:
In [377]:
keep_idxs.shape
Out[377]:
In [380]:
df.iloc[keep_idxs].head(n=10)
Out[380]:
In [381]:
df.iloc[keep_idxs].sample(n=10)
Out[381]:
In [ ]:
In [ ]:
In [374]:
np.dtype(type(3))
Out[374]:
In [376]:
keep_idxs[:100]
Out[376]:
In [351]:
np.concatenate(keep_idxs, np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False))
In [355]:
np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False)
Out[355]:
In [ ]:
In [337]:
keep_idxs = np.array([])
In [339]:
x = np.array([1,2,3,])
In [340]:
x.shape
Out[340]:
In [342]:
np.concatenate((keep_idxs, x))
Out[342]:
In [ ]:
In [298]:
catdx=1
catcol = df.columns[catdx]
cats = df[catcol].unique()
for cat in cats:
# pops = [df[df[y]==cat].count()[0] for cat in cats]
keepidx = np.where(df[df[catcol]==cat])
break
In [ ]:
In [334]:
df_slice = df[catcol]
keepidxs = np.random.choice(np.where(df_slice==cat)[0], size = 20, replace=False)
In [332]:
np.where(df_slice==cat)[0]
Out[332]:
In [327]:
df_slice
Out[327]:
In [ ]:
In [308]:
keepidx = np.where(df[df[catcol]==cat])
keepidx[0].shape
Out[308]:
In [317]:
keepidx = df[df[catcol]==cat].as_matrix()[:,0]
keepidx.shape
Out[317]:
In [318]:
keepidx[:n]
Out[318]:
In [319]:
keepidx = np.where(df[df[catcol]==cat])
keepidx[0][:10]
Out[319]:
In [323]:
df.iloc[keepidx].head()
Out[323]:
In [ ]:
In [261]:
list(zip(cats,pops))
Out[261]:
In [210]:
list(zip(cats,pops))
Out[210]:
In [248]:
rows=[]
for cat in catfolders:
catpath = path/folder/cat
fpaths = list(map(lambda x: cat+'/'+x, os.listdir(catpath)))
rows.extend(list(zip(fpaths,[labelmap[cat] for i in range(len(fpaths))])))
df = pd.DataFrame(rows, columns=['file','class'])
In [251]:
rows[0]
Out[251]:
In [250]:
cols['cat']
Out[250]:
In [252]:
list(zip(cols['cat'],cols['pop']))
Out[252]:
In [240]:
dictionary=cat_pops
cols = dict()
cols['cat'],cols['pop'] = zip(*dictionary.items())
# pd.DataFrame([cols['cat'],cols['pop']],columns=['cat','pop'])
In [236]:
cols
Out[236]:
In [238]:
pd.DataFrame(cols)
Out[238]:
In [239]:
cols['cat']
Out[239]:
In [224]:
cols = dict()
cols['cat'],cols['pop'] = zip(*cat_pops.items())
In [225]:
cols
Out[225]:
In [ ]:
In [207]:
df=csv;xdx=0;ydx=1
x,y = df.columns[xdx],df.columns[ydx]
cats = df[y].unique()
print(cats)
pops = [df[df[y]==cat].count()[0] for cat in cats]
In [177]:
csv[csv['class']=='cat'].count()
Out[177]:
In [179]:
csv.columns[1]
Out[179]:
In [ ]:
In [149]:
%%time
labelmap = None
folder='train'
path = PATH
# find categories
catfolders = [f for f in os.listdir(path/folder) if (path/folder/f).is_dir()]
if labelmap is None: labelmap = {cf:cf for cf in catfolders}
df = pd.DataFrame(columns=['file_path','class'])
rows = []
for cat in catfolders:
catpath = path/folder/cat
fpaths = list(map(lambda x: cat+'/'+x , os.listdir(catpath)))
# fpaths = list(map(str, catpath.iterdir()))
rows.extend(list(zip(fpaths,[cat for i in range(len(fpaths))])))
break
In [150]:
# rows
In [135]:
len(rows)
Out[135]:
In [ ]:
In [136]:
df = pd.DataFrame(rows, columns=['file_path','class'])
df.head()
Out[136]:
In [117]:
n=10
# fpaths[:n], [cat for i in range(n)]
list(zip(fpaths[:n],[cat for i in range(n)]))
Out[117]:
In [127]:
n = 2000
In [128]:
%%time
rows = []
rows1 = list(zip(fpaths[:n], [cat for i in range(n)]));
rows2 = list(zip(fpaths[n:2*n], [cat for i in range(n)]));
rows.append(rows1)
rows.append(rows2)
rows
In [129]:
%%time
rows = []
rows1 = list(zip(fpaths[:n], [cat for i in range(n)]));
rows2 = list(zip(fpaths[n:2*n], [cat for i in range(n)]));
rows.extend(rows1)
rows.extend(rows2)
rows
In [122]:
rows
Out[122]:
In [ ]:
In [118]:
df = pd.DataFrame(list(zip(fpaths[:n],[cat for i in range(n)])), columns=['file_path','class'])
In [119]:
df
Out[119]:
In [109]:
df = pd.DataFrame(rows[0],rows[1], columns=['file_path','class'])
In [106]:
df
Out[106]:
In [ ]:
In [75]:
files = generate_csv(PATH)
cat = 'cat'
path = PATH
folder = 'train'
In [76]:
files_np = np.array(files)
In [83]:
catpath = path/folder/cat;
catpath
Out[83]:
In [166]:
list(map(str, catpath.iterdir()))[:20]
Out[166]:
In [ ]:
In [46]:
cat_pops_np = np.array([[k,v] for k,v in cat_pops.items()])
In [47]:
cat_pops_np[:,1]
Out[47]:
In [49]:
from collections import defaultdict
# default_factory is called with no arguments, so `lambda x: []` would raise
# TypeError on the first missing-key lookup; `list` is the zero-argument form.
cat_pops_dict = defaultdict(list)
for k,v in cat_pops.items():
    cat_pops_dict[k] = v
In [50]:
cat_pops_dict
Out[50]:
In [48]:
cat_pops_dict = {'cat':k,'pop':v for k,v in cat_pops.items()}
In [29]:
df = pd.DataFrame({'cat':[cat for cat in cat_pops.keys()],'pop':[pop for pop in cat_pops.values()]})
In [30]:
df.plot.bar()
Out[30]:
In [ ]:
df = pd.DataFrame({'cat':})
In [35]:
cat_pops_np[:,0]
Out[35]:
In [37]:
df = pd.DataFrame({'cat':cat_pops_np[:,0],'pop':cat_pops_np[:,1]})
In [44]:
df.
Out[44]:
In [43]:
df.plot.bar()
In [ ]:
def csv_smooth_data(path, threshold):