AutoCSV Dev

WNixalo –– 2018/6/2-3



In [1]:

    
%matplotlib inline
%reload_ext autoreload
%autoreload 2



In [2]:

    
from pathlib import Path
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from cifar_utils import count_files



In [3]:

    
PATH = Path('data/cifar10')

Dataset Population



In [4]:

    
with open(PATH/'labels.txt') as f:
    text = f.read()
# print(text)
cats = text.split('\n')[:-1]
cats









    Out[4]:





['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']



In [5]:

    
cat_pops = {cat:count_files(PATH/'train'/cat) for cat in cats}
cat_pops









    Out[5]:





{'airplane': 4250,
 'automobile': 4250,
 'bird': 4250,
 'cat': 4250,
 'deer': 4250,
 'dog': 4250,
 'frog': 4250,
 'horse': 4250,
 'ship': 4250,
 'truck': 4250}



In [6]:

    
cols = dict()
cols['cat'],cols['pop'] = zip(*cat_pops.items())



In [7]:

    
df = pd.DataFrame(cols)
df.head()



In [8]:

    
df.plot.bar(x=df[df.columns[0]], ylim=(0,1.2*df[df.columns[1]].values.max()), 
            yerr=max(df.mean()[0]*0.005, df.std()[0]), alpha=.8)
df.mean()[0], df.std()[0]









    Out[8]:





(4250.0, 0.0)

Uneven data set:



In [9]:

    
def plot_pops(df, print_ms=True):
    if print_ms: print(f"{df.mean()[0]:.2f} {df.std()[0]:.2f}")
    df.plot.bar(x=df[df.columns[0]], ylim=(0,1.2*df[df.columns[1]].values.max()), 
            yerr=max(df.mean()[0]*0.005, df.std()[0]), alpha=.8)
    df.mean()[0], df.std()[0]



In [10]:

    
df_backup = df.copy()



In [11]:

    
# df.at[2,'cat'] # equals 'bird'
df.at[2, 'pop'] /= 2 # 4250 -> 2125
df.at[2,'pop'] # for editing df values see: https://stackoverflow.com/a/13842286









    Out[11]:





2125



In [12]:

    
plot_pops(df)









    



4037.50 671.98



In [13]:

    
df.std()[0]









    Out[13]:





671.9840027857806

AutoCSV



In [14]:

    
# generate a csv from a dataset
def generate_csv(path, labelmap=None, folder='train'):
    """Infers a csv from directory structure. 
       `labelmap` is a dictionary mapping class folders to class names.
       Class names are taken from class folder names if no mapping is provided.
       For Single-Category Classification.
    """
    # find categories
    catfolders = [f for f in os.listdir(path/folder) if (path/folder/f).is_dir()]
    if labelmap is None: labelmap = {cf:cf for cf in catfolders}

    rows = []

    for cat in catfolders:
        catpath = path/folder/cat
        fpaths  = list(map(lambda x: cat+'/'+x, os.listdir(catpath)))
        rows.extend(list(zip(fpaths,[labelmap[cat] for i in range(len(fpaths))])))

    df = pd.DataFrame(rows, columns=['file','class'])
    return df



In [15]:

    
%%time
csv = generate_csv(PATH)









    



CPU times: user 68.8 ms, sys: 52.1 ms, total: 121 ms
Wall time: 122 ms



In [16]:

    
csv.sample(n=10)









    Out[16]:







  
    
      
      file
      class
    
  
  
    
      6483
      dog/20688_dog.png
      dog
    
    
      12388
      truck/1602_truck.png
      truck
    
    
      16061
      bird/14294_bird.png
      bird
    
    
      18073
      airplane/18106_airplane.png
      airplane
    
    
      42118
      automobile/46162_automobile.png
      automobile
    
    
      39891
      automobile/33188_automobile.png
      automobile
    
    
      17797
      airplane/38187_airplane.png
      airplane
    
    
      1668
      cat/45885_cat.png
      cat
    
    
      22111
      ship/24451_ship.png
      ship
    
    
      13452
      bird/37374_bird.png
      bird

CSV-based Pop Counting



In [17]:

    
def pops_from_df(df, catdx=1, colnames=['cat','pop']):
    """Extracts category populations from a DataFrame.
       If `colnames=None`: returns a dictionary, otherwise
       a dataframe with `colnames` columns.
    """
    catcol = df.columns[catdx] # prevsly: y = df.columns[ydx]
    cats = df[catcol].unique()
    pops = [df[df[catcol]==cat].count()[0] for cat in cats] # prevsly: y -> catcol
    cat_pops = {cat:pop for cat,pop in zip(cats,pops)}
    
    if colnames:
        cat_pops = list(zip(cats,pops))
        cat_pops = pd.DataFrame(cat_pops, columns=colnames)
    else:
        cat_pops = {cat:pop for cat,pop in zip(cats,pops)}
    
    return cat_pops



In [18]:

    
pops_from_df(csv, colnames=None)









    Out[18]:





{'cat': 4250,
 'dog': 4250,
 'truck': 4250,
 'bird': 4250,
 'airplane': 4250,
 'ship': 4250,
 'frog': 4250,
 'horse': 4250,
 'deer': 4250,
 'automobile': 4250}



In [19]:

    
plot_pops(pops_from_df(csv))









    



4250.00 0.00

CSV sampling

Retreive a portion of the csv



In [20]:

    
def csv_subset(df, catdx=1, p=0.5):
    """Returns a percetnage of the original dataset, sampled uniformly by category."""
    if type(p)==int: p /= df.count()
    
    catcol = df.columns[catdx]
    cats   = df[catcol].unique()
    df_slice  = df[catcol]
    keep_idxs = np.array([], dtype=np.int64)
    
    for cat in cats:
        cat_idxs = np.where(df_slice==cat)[0]
        n = max(1, int(len(cat_idxs)*p))
        keep_idxs = np.concatenate((keep_idxs, np.random.choice(cat_idxs, size=n, replace=False)))
    
    return df.iloc[keep_idxs]



In [410]:

    
full_df  = generate_csv(PATH)
cat_pops = pops_from_df(full_df)

full_df.count()[0]









    Out[410]:





42500



In [411]:

    
new_csv = csv_subset(full_df)
new_csv.count()[0]









    Out[411]:





21250



In [412]:

    
pops_from_df(new_csv)



In [413]:

    
new_csv.head()









    Out[413]:







  
    
      
      file
      class
    
  
  
    
      2311
      cat/14032_cat.png
      cat
    
    
      3438
      cat/6348_cat.png
      cat
    
    
      2914
      cat/38894_cat.png
      cat
    
    
      3382
      cat/18582_cat.png
      cat
    
    
      80
      cat/11476_cat.png
      cat



In [421]:

    
np.unique(new_csv[new_csv['class']=='cat'].as_matrix()[:,0]).shape # check: no repeats









    Out[421]:





(2125,)

Dataset Smoothing

"Smooth" out category populations to within a specified standard deviation.



In [950]:

    
def smooth_csv_dataset(df, eps=0.1, full_df=None, catdx=1):
    """'Smooths' out a dataset by adding copied samples.
    
        For use with single-label classification (2-column) CSVs."""
    
    # result DF and sampling DF
    new_df  = df.copy()
    full_df = df if full_df is None else full_df
    # get category column name
    catcol = df.columns[catdx]
    
    # get category populations & calculate desired range
    cat_pops = pops_from_df(df)
    c_totals = cat_pops.as_matrix(columns=['pop'])
    sd       = eps * c_totals.mean()
    # Normalize category sizes
    c_norm     = c_totals/c_totals.max()
    new_mean   = c_totals.max() - sd
    new_totals = (2*sd * c_norm + (new_mean - sd)).astype(int)
    
    # Increase category sizes by differences
    diffs     = new_totals - c_totals
    cats      = cat_pops['cat'].values
    copy_idxs = []
    
    for i,cat in enumerate(cats):
        diff         = diffs[i]
        cat_idxs     = np.where(full_df[catcol]==cat)[0]
        full_cat_pop = len(cat_idxs)
        
        # if the difference is more than Nx greater, copy the whole category N times
        if diff > full_cat_pop:
            n_copy = int(diff) // full_cat_pop
            diff  -= n_copy * full_cat_pop
            for i in range(n_copy): copy_idxs.extend(cat_idxs)
        copy_idxs.extend(np.random.choice(cat_idxs, size=diff, replace=False))
    
    copy_rows = full_df.iloc[copy_idxs]
    new_df    = new_df.append(copy_rows)
    
    return new_df



In [959]:

    
def basic_plot2(c_totals=c_totals, sd=sd, oldmean=None, catlist=None):
    sa,mean = c_totals.std(),c_totals.mean() if oldmean is None else oldmean
    plt.bar(x=range(len(c_totals)),height=c_totals, alpha=0.4, color='k');
    plt.axhline(y=mean,c='r'); plt.axhline(y=mean+sd,c='k'); plt.axhline(y=mean-sd,c='k');
    if catlist is not None: plt.xticks(range(len(c_totals)), catlist, rotation=90)
    print(mean, (sd, sa))
    print(c_totals)

Makes this:



In [963]:

    
df = generate_csv(PATH)

skewed_df = df.sample(n=5000)
drop_idxs = np.where(skewed_df['class']=='airplane')[0]
drop_idxs = np.random.choice(drop_idxs, size=len(drop_idxs)//2, replace=False)
skewed_df.drop(skewed_df.index[drop_idxs], inplace=True)

cat_pops = pops_from_df(skewed_df)
c_totals = cat_pops.as_matrix(columns=['pop'])
catlist  = cat_pops.as_matrix(columns=['cat'])

eps = 0.1; sd = c_totals.mean()*eps

basic_plot2(c_totals=c_totals[:,0],sd=sd,catlist=catlist[:,0])









    



476.5 (47.650000000000006, 81.13599694340361)
[517 513 497 477 510 500 514 236 487 514]

Become this:



In [975]:

    
smoothed_df = smooth_csv_dataset(skewed_df, eps=eps, full_df=df)

cat_pops = pops_from_df(smoothed_df)
c_totals = cat_pops.as_matrix(columns=['pop'])
catlist  = cat_pops.as_matrix(columns=['cat'])

basic_plot2(c_totals=c_totals[:,0], sd=sd, catlist=catlist[:,0])









    



509.1 (47.650000000000006, 14.895972610071489)
[517 516 513 509 515 513 516 465 511 516]

Looking at it another way:



In [976]:

    
basic_plot2(c_totals=c_totals[:,0], sd=sd, catlist=catlist[:,0], oldmean=c_totals.max()-sd)









    



469.35 (47.650000000000006, 14.895972610071489)
[517 516 513 509 515 513 516 465 511 516]

Manual prototyping:

New algorithm:

set max to max category.
calculate new mean as 1 σδ below max.
normalize dataset to 1 σδ of new mean.

normalization:

todo



In [462]:

    
def basic_plot():
    mean,sa = c_totals.mean(),c_totals.std()
    plt.bar(x=range(len(c_totals)),height=c_totals, alpha=0.4, color='k');
    plt.axhline(y=mean,c='r'); plt.axhline(y=mean+sd,c='k'); plt.axhline(y=mean-sd,c='k');
    print(mean, (sd, sa))
    print(c_totals)



In [500]:

    
df = generate_csv(PATH)

Getting a 'skewed' dataset from the original DataFrame



In [556]:

    
skewed_df = df.sample(n=5000)
counts = pops_from_df(skewed_df); counts.mean()[0], counts.std()[0]









    Out[556]:





(500.0, 22.181073012818835)



In [557]:

    
drop_idxs = np.where(skewed_df['class']=='dog')[0]
drop_idxs.shape









    Out[557]:





(532,)



In [558]:

    
drop_idxs = np.random.choice(drop_idxs, size=len(drop_idxs)//2, replace=False)
drop_idxs.shape









    Out[558]:





(266,)



In [584]:

    
skewed_df.drop(skewed_df.index[drop_idxs], inplace=True)
skewed_df.count()[0], np.where(skewed_df['class']=='dog')[0].shape[0]









    Out[584]:





(4734, 266)



In [608]:

    
plot_pops(pops_from_df(skewed_df))









    



473.40 75.34

In this case with an ε of 10%, we'd want a standard deviation of about 47. It's currently at 75.

Modifying the algorithm to work on dataframes instead of integer arrays:



In [610]:

    
eps = 0.1

cat_pops = pops_from_df(skewed_df)
c_totals = cat_pops.as_matrix(columns=['pop'])

NumPy and Pandas seem to disagree on the standard deviation. Looks like by default they're calculated subtly differently.



In [611]:

    
c_totals.mean(), c_totals.std()









    Out[611]:





(473.4, 71.47335167739092)



In [631]:

    
sd = eps * c_totals.mean() # desired standard deviation

Normalize category sizes:



In [728]:

    
# normalize category sizes
c_normd = c_totals/c_totals.max()# - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
new_totals = (2*sd * c_normd + (mean - sd)).astype(int)

#display
print(c_totals.mean(),mean);print(c_totals.std(),new_totals.std()); list(zip(c_totals, new_totals))









    



473.4 477.65999999999997
71.47335167739092 12.867012085173464






    Out[728]:





[(array([506]), array([521])),
 (array([508]), array([521])),
 (array([525]), array([525])),
 (array([509]), array([522])),
 (array([486]), array([517])),
 (array([511]), array([522])),
 (array([471]), array([515])),
 (array([266]), array([478])),
 (array([488]), array([518])),
 (array([464]), array([513]))]



In [729]:

    
c_normd









    Out[729]:





array([[0.96380952],
       [0.96761905],
       [1.        ],
       [0.96952381],
       [0.92571429],
       [0.97333333],
       [0.89714286],
       [0.50666667],
       [0.92952381],
       [0.88380952]])



In [730]:

    
c_totals.max()









    Out[730]:





525



In [731]:

    
c_totals[:,0]









    Out[731]:





array([506, 508, 525, 509, 486, 511, 471, 266, 488, 464])

Looks like things are working to spec:



In [741]:

    
def basic_plot2(c_totals=c_totals, sd=sd, oldmean=None):
    sa,mean = c_totals.std(),c_totals.mean() if oldmean is None else oldmean
    plt.bar(x=range(len(c_totals)),height=c_totals, alpha=0.4, color='k');
    plt.axhline(y=mean,c='r'); plt.axhline(y=mean+sd,c='k'); plt.axhline(y=mean-sd,c='k');
    print(mean, (sd, sa))
    print(c_totals)

Showing the new distribution with original mean flanked by desired standard deviation:



In [739]:

    
basic_plot2(c_totals=new_totals[:,0], sd=sd, oldmean=mean)









    



477.65999999999997 (47.34, 12.867012085173464)
[521 521 525 522 517 522 515 478 518 513]

Showing the new mean flanked by desired standard deviation:



In [740]:

    
basic_plot2(c_totals=new_totals[:,0], sd=sd)









    



515.2 (47.34, 12.867012085173464)
[521 521 525 522 517 522 515 478 518 513]

Looks like the algorithm mechanics are working as they should. Now to do the dataframe resample bit.



In [745]:

    
diffs = new_totals - c_totals
diffs[:,0]









    Out[745]:





array([ 15,  13,   0,  13,  31,  11,  44, 212,  30,  49])

Adding to a single category:



In [749]:

    
cat_pops['cat'][0]









    Out[749]:





'deer'



In [750]:

    
cat_pops



In [759]:

    
# cats = cat_pops['cat'].values
idx = 0; cat = cat_pops['cat'].values[idx]
cat









    Out[759]:





'deer'

New Totals says the 'deer' category should have 15 elements added to it, bringing a total of 506 to 521 (I'm a bit paranoid about numerical stability):



In [764]:

    
cat_pops.iloc[idx]['pop'], new_totals[idx][0]









    Out[764]:





(506, 521)

Grab a random sample of 15 elements from the original dataset for 'deer':



In [782]:

    
copy_idxs = np.random.choice(np.where(df['class'] == cat)[0], size=diffs[idx], replace=False)
copy_idxs









    Out[782]:





array([34346, 35384, 37827, 36420, 37771, 37047, 34198, 34865, 36252,
       37255, 37010, 34754, 34176, 35629, 38172])



In [783]:

    
copyrows = df.iloc[copy_idxs]



In [784]:

    
copyrows









    Out[784]:







  
    
      
      file
      class
    
  
  
    
      34346
      deer/44641_deer.png
      deer
    
    
      35384
      deer/3868_deer.png
      deer
    
    
      37827
      deer/19897_deer.png
      deer
    
    
      36420
      deer/25216_deer.png
      deer
    
    
      37771
      deer/42982_deer.png
      deer
    
    
      37047
      deer/35743_deer.png
      deer
    
    
      34198
      deer/36008_deer.png
      deer
    
    
      34865
      deer/27258_deer.png
      deer
    
    
      36252
      deer/24967_deer.png
      deer
    
    
      37255
      deer/13428_deer.png
      deer
    
    
      37010
      deer/17150_deer.png
      deer
    
    
      34754
      deer/32969_deer.png
      deer
    
    
      34176
      deer/39202_deer.png
      deer
    
    
      35629
      deer/14786_deer.png
      deer
    
    
      38172
      deer/12975_deer.png
      deer

Holy shit it works.



In [787]:

    
tmpdf = skewed_df.copy()
tmpdf.describe()









    Out[787]:







  
    
      
      file
      class
    
  
  
    
      count
      4734
      4734
    
    
      unique
      4734
      10
    
    
      top
      horse/22789_horse.png
      frog
    
    
      freq
      1
      525



In [788]:

    
tmpdf.append(copyrows).describe()









    Out[788]:







  
    
      
      file
      class
    
  
  
    
      count
      4749
      4749
    
    
      unique
      4748
      10
    
    
      top
      deer/35743_deer.png
      frog
    
    
      freq
      2
      525



In [791]:

    
tmpdf = tmpdf.append(copyrows)
tmpdf.iloc[-16:]









    Out[791]:







  
    
      
      file
      class
    
  
  
    
      34133
      deer/36514_deer.png
      deer
    
    
      34346
      deer/44641_deer.png
      deer
    
    
      35384
      deer/3868_deer.png
      deer
    
    
      37827
      deer/19897_deer.png
      deer
    
    
      36420
      deer/25216_deer.png
      deer
    
    
      37771
      deer/42982_deer.png
      deer
    
    
      37047
      deer/35743_deer.png
      deer
    
    
      34198
      deer/36008_deer.png
      deer
    
    
      34865
      deer/27258_deer.png
      deer
    
    
      36252
      deer/24967_deer.png
      deer
    
    
      37255
      deer/13428_deer.png
      deer
    
    
      37010
      deer/17150_deer.png
      deer
    
    
      34754
      deer/32969_deer.png
      deer
    
    
      34176
      deer/39202_deer.png
      deer
    
    
      35629
      deer/14786_deer.png
      deer
    
    
      38172
      deer/12975_deer.png
      deer

Looks good. Now for all categories.



In [810]:

    
cats = cat_pops['cat'].values
copy_idxs = []
for i,cat in enumerate(cats):
    diff = diffs[i]
    copy_idxs.extend(np.random.choice(np.where(df['class'] == cat)[0], size=diff, replace=False))
len(copy_idxs)









    Out[810]:





418



In [811]:

    
new_totals.sum() - c_totals.sum()









    Out[811]:





418



In [814]:

    
copyrows = df.iloc[copy_idxs]
copyrows.count()









    Out[814]:





file     418
class    418
dtype: int64



In [818]:

    
copyrows.sample(n=5)









    Out[818]:







  
    
      
      file
      class
    
  
  
    
      6861
      dog/42691_dog.png
      dog
    
    
      18405
      airplane/25212_airplane.png
      airplane
    
    
      6379
      dog/23967_dog.png
      dog
    
    
      4885
      dog/37623_dog.png
      dog
    
    
      11506
      truck/29574_truck.png
      truck

Moment of truth: adding it back together and counting the result:



In [842]:

    
tmpdf = skewed_df.copy()
tmpdf.count()









    Out[842]:





file     4734
class    4734
dtype: int64



In [843]:

    
print(skewed_df.count()[0])
skewed_df = skewed_df.append(copyrows)
print(skewed_df.count()[0])



In [824]:

    
tmpdf.count()









    Out[824]:





file     5152
class    5152
dtype: int64



In [844]:

    
plot_pops(pops_from_df(skewed_df))









    



515.20 13.56

What the dataset looks like now:



In [845]:

    
basic_plot2(c_totals=pops_from_df(skewed_df).as_matrix(columns=['pop'])[:,0], sd=sd)









    



515.2 (47.34, 12.867012085173464)
[521 521 525 522 517 522 515 478 518 513]

And what it looks like with the original mean superimposed:



In [846]:

    
basic_plot2(c_totals=pops_from_df(skewed_df).as_matrix(columns=['pop'])[:,0], sd=sd, oldmean=c_totals.mean())









    



473.4 (47.34, 12.867012085173464)
[521 521 525 522 517 522 515 478 518 513]

The dataset's class distribution has been smoothed out to within 1 desired standard deviation of the mean.

Automated dataset smoothing:



In [922]:

    
def smooth_csv_dataset(df, eps=0.1, full_df=None, catdx=1):
    """'Smooths' out a dataset by adding copied samples.
    
        For use with single-label classification (2-column) CSVs."""
    
    # result DF and sampling DF
    new_df  = df.copy()
    full_df = df if full_df is None else full_df
    # get category column name
    catcol = df.columns[catdx]
    
    # get category populations & calculate desired range
    cat_pops = pops_from_df(df)
    c_totals = cat_pops.as_matrix(columns=['pop'])
    sd       = eps * c_totals.mean()
    # Normalize category sizes
    c_norm     = c_totals/c_totals.max()
    new_mean   = c_totals.max() - sd
    new_totals = (2*sd * c_norm + (new_mean - sd)).astype(int)
    
    # Increase category sizes by differences
    diffs     = new_totals - c_totals
    cats      = cat_pops['cat'].values
    copy_idxs = []
    
    for i,cat in enumerate(cats):
        diff         = diffs[i]
        cat_idxs     = np.where(full_df[catcol]==cat)[0]
        full_cat_pop = len(cat_idxs)
        
        # if the difference is more than Nx greater, copy the whole category N times
        if diff > full_cat_pop:
            n_copy = int(diff) // full_cat_pop
            diff  -= n_copy * full_cat_pop
#             import pdb;pdb.set_trace()
            for i in range(n_copy): copy_idxs.extend(cat_idxs)
        copy_idxs.extend(np.random.choice(cat_idxs, size=diff, replace=False))
    
    copy_rows = full_df.iloc[copy_idxs]
    new_df    = new_df.append(copy_rows)
    
    return new_df



In [876]:

    
int(np.array([1]))









    Out[876]:





1



In [867]:

    
tmp = []
arr = [1,2,3]
ndarr = np.array(arr)
# tmp.extend(arr)
# tmp.extend(ndarr)
tmp.extend(np.array([1,2,3]))
tmp









    Out[867]:





[1, 2, 3]

Testing:



In [500]:

    
df = generate_csv(PATH)



In [848]:

    
skewed_df = df.sample(n=5000)
counts = pops_from_df(skewed_df); counts.mean()[0], counts.std()[0]

drop_idxs = np.where(skewed_df['class']=='dog')[0]
drop_idxs = np.random.choice(drop_idxs, size=len(drop_idxs)//2, replace=False)

skewed_df.drop(skewed_df.index[drop_idxs], inplace=True)
skewed_df.count()[0], np.where(skewed_df['class']=='dog')[0].shape[0]









    Out[848]:





(4755, 245)



In [913]:

    
cat_pops = pops_from_df(skewed_df); c_totals = cat_pops.as_matrix(columns=['pop']); sd=c_totals.mean()*eps
basic_plot2(c_totals=c_totals[:,0], sd=sd)









    



475.5 (47.550000000000004, 80.82109873046765)
[515 245 452 494 485 495 505 547 532 485]

Without a 'full' base dataset:



In [918]:

    
eps * c_totals.mean()









    Out[918]:





47.550000000000004



In [919]:

    
c_totals.max()









    Out[919]:





547



In [932]:

    
smooth_df = smooth_csv_dataset(skewed_df)



In [933]:

    
smooth_df.count()[0]









    Out[933]:





5341



In [934]:

    
oldmean









    Out[934]:





475.5



In [935]:

    
c_totals.mean()









    Out[935]:





534.1

I got a little confused looking at these plots, and realized I wasn't actually displaying or calculating the 'old' mean: more of a 'pseudomean'. The 'oldmean' parameter to the plot ('new_mean' inside the smoother function) is the pseudomean.

Although it's what I meant, I'm not actually asking for the dataset to be within 1 desired standard deviation from the mean. I'm asking that no element of the dataset be less than 2 desired standard deviations from the maximum.

That also means nothing is further than ±1 standard deviation from the new actual mean. So I'm in effect imposing 2 constraints:

Nothing is further than 1 stdev from the (actual) mean,
and the minumum is within 2 stdevs of the maximum.



In [936]:

    
# Smoothed dataset is now
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd, oldmean=c_totals.max()-sd)









    



499.45 (47.550000000000004, 14.088647912415158)
[541 494 530 537 536 537 539 547 544 536]



In [923]:

    
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd)









    



534.1 (47.550000000000004, 14.088647912415158)
[541 494 530 537 536 537 539 547 544 536]

With a 'full' base dataset:



In [940]:

    
%time smooth_df = smooth_csv_dataset(skewed_df, full_df=df)









    



CPU times: user 74.1 ms, sys: 3.08 ms, total: 77.2 ms
Wall time: 76.5 ms



In [942]:

    
# entire dataset is within 2 stdevs of the maximum
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd, oldmean=c_totals.max()-sd)









    



499.45 (47.550000000000004, 14.088647912415158)
[541 494 530 537 536 537 539 547 544 536]



In [941]:

    
# entire dataset is within 1 stdev of the mean
cat_pops = pops_from_df(smooth_df); c_totals = cat_pops.as_matrix(columns=['pop'])
basic_plot2(c_totals=c_totals[:,0], sd=sd)









    



534.1 (47.550000000000004, 14.088647912415158)
[541 494 530 537 536 537 539 547 544 536]



In [945]:

    
smooth_df.describe()









    Out[945]:







  
    
      
      file
      class
    
  
  
    
      count
      5341
      5341
    
    
      unique
      5287
      10
    
    
      top
      dog/14963_dog.png
      frog
    
    
      freq
      2
      547



In [949]:

    
smooth_df.count()[0], skewed_df.count()[0]









    Out[949]:





(5341, 4755)



In [ ]:

Scrap



In [764]:

    
cat_pops.iloc[idx]['pop'], new_totals[idx][0]









    Out[764]:





(506, 521)

Grab a random sample of 15 elements from the original dataset for 'deer':



In [782]:

    
copy_idxs = np.random.choice(np.where(df['class'] == cat)[0], size=diffs[idx], replace=False)
copy_idxs









    Out[782]:





array([34346, 35384, 37827, 36420, 37771, 37047, 34198, 34865, 36252,
       37255, 37010, 34754, 34176, 35629, 38172])



In [783]:

    
copyrows = df.iloc[copy_idxs]



In [ ]:



In [ ]:

    
# randomly sample from dataset to fill the difference
adds = []
diffs = new_totals - c_totals
for i,diff in enumerate(diffs):
    add = np.random.choice(df[catcol]==#TODO)



In [ ]:



In [704]:

    
c_totals[np.argmin(c_totals)], c_normd[np.argmin(c_totals)]









    Out[704]:





(array([266]), array([0.56189269]))



In [ ]:

    
# normalize wrt max
c_normd = c_totals/c_totals.max()

# set floor as 2sd below max



In [ ]:



In [702]:

    
cmin = c_totals[np.argmin(c_totals)]
cnormin = c_normd[np.argmin(c_totals)]

2*sd * cnormin









    Out[702]:





array([5.2288])

Too many values are being reduced. My algorithm isn't perfect for the job. But: what if I just take the max of the result? What does the mean & stdv look like then?



In [693]:

    
maxs = np.max((c_totals, new_totals), axis=0)
print(maxs.mean(), maxs.std())
list(zip(*maxs))[0]









    



491.9 24.570103784884587






    Out[693]:





(506, 508, 525, 509, 486, 511, 476, 435, 488, 475)

No values have increased beyond the original max, and all



In [695]:

    
maxs.min() - maxs.mean(), maxs.max() - maxs.mean()









    Out[695]:





(-56.89999999999998, 33.10000000000002)



In [ ]:



In [646]:

    
c_totals.mean(), mean









    Out[646]:





(473.4, 477.65999999999997)



In [ ]:



In [638]:

    
new_totals









    Out[638]:





array([[483],
       [483],
       [487],
       [484],
       [479],
       [484],
       [476],
       [435],
       [479],
       [475]])



In [ ]:



In [490]:

    
## prototyping: dummy dataset variation
c_totals = np.random.randint(1000, 3000, size=10)
# calculate mean, actual & desired standard deviation
# mean = c_totals.mean()
# sa   = c_totals.std()
sd   = eps * mean

# normalize category sizes
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
new_totals = (2*sd * c_normd + (mean - sd)).astype(int)

# randomly sample from dataset to fill the difference
adds = []
diffs = new_totals - c_totals
for i,diff in enumerate(diffs):
    add = np.random.choice(df[catcol]==#TODO)









    Out[490]:





array([1330, 1122,  235, 1282, 1262, 1055,  166,  838, 1381,  975])



In [ ]:



In [612]:

    
cat_pops.std()









    Out[612]:





pop    75.339528
dtype: float64



In [613]:

    
c_totals









    Out[613]:





array([[506],
       [508],
       [525],
       [509],
       [486],
       [511],
       [471],
       [266],
       [488],
       [464]])



In [614]:

    
cat_pops



In [615]:

    
def plot_pops(df, print_ms=True):
    if print_ms: print(f"{df.mean()[0]:.2f} {df.std()[0]:.2f}")
    df.plot.bar(x=df[df.columns[0]], ylim=(0,1.2*df[df.columns[1]].values.max()), 
            yerr=max(df.mean()[0]*0.005, df.std()[0]), alpha=.8)
    df.mean()[0], df.std()[0]



In [616]:

    
cat_pops.mean()









    Out[616]:





pop    473.4
dtype: float64



In [617]:

    
cat_pops.std()









    Out[617]:





pop    75.339528
dtype: float64



In [618]:

    
cat_pops.as_matrix(columns=['pop']).std()









    Out[618]:





71.47335167739092



In [619]:

    
cat_pops.as_matrix(columns=['pop'])









    Out[619]:





array([[506],
       [508],
       [525],
       [509],
       [486],
       [511],
       [471],
       [266],
       [488],
       [464]])



In [629]:

    
cat_pops.as_matrix(columns=['pop']).mean(), cat_pops.as_matrix(columns=['pop']).std()









    Out[629]:





(473.4, 71.47335167739092)



In [626]:

    
cat_pops['pop'].sum()









    Out[626]:





4734



In [628]:

    
cat_pops['pop'].mean(), cat_pops['pop'].std()









    Out[628]:





(473.4, 75.33952776892383)



In [ ]:



In [599]:

    
c_totals.sum()









    Out[599]:





4734



In [490]:

    
## prototyping: dummy dataset variation
c_totals = np.random.randint(1000, 3000, size=10)
# calculate mean, actual & desired standard deviation
# mean = c_totals.mean()
# sa   = c_totals.std()
sd   = eps * mean

# normalize category sizes
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
new_totals = (2*sd * c_normd + (mean - sd)).astype(int)

# randomly sample from dataset to fill the difference
adds = []
diffs = new_totals - c_totals
for i,diff in enumerate(diffs):
    add = np.random.choice(df[catcol]==)









    Out[490]:





array([1330, 1122,  235, 1282, 1262, 1055,  166,  838, 1381,  975])



In [ ]:



In [496]:

    
tmp = []
tmp.append(df.iloc[5:10])
tmp









    Out[496]:





[                file class
 5   cat/3863_cat.png   cat
 6  cat/45897_cat.png   cat
 7  cat/49197_cat.png   cat
 8   cat/1846_cat.png   cat
 9   cat/8760_cat.png   cat]



In [497]:

    
pd.DataFrame(tmp)









    Out[497]:







  
    
      
      0
    
  
  
    
      0
      file class
5   cat/3863_cat.pn...



In [ ]:



In [463]:

    
eps = 0.1

c_totals = np.random.randint(1000, 3000, size=10)
mean = c_totals.mean()
sa   = c_totals.std()
sd   = eps * mean

basic_plot()

c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
mean = c_totals.max() - sd
c_totals = (2*sd * c_normd + (mean - sd))

basic_plot()









    



1936.1 (193.61, 608.770145457216)
[2648 1644 1261 1311 1215 2895 2227 2026 1475 2659]
2732.4879792746115 (193.61, 121.75402909144326)
[2874.86797927 2674.06797927 2597.46797927 2607.46797927 2588.26797927
 2924.26797927 2790.66797927 2750.46797927 2640.26797927 2877.06797927]



In [ ]:

First time I got the normalization algorithm working:



In [386]:

    
eps = 0.1

c_totals = np.random.randint(1000, 3000, size=10)
mean = c_totals.mean()
sa   = c_totals.std()
sd   = eps * mean

basic_plot()









    



2159.1 (215.91, 556.2047195053275)
[2974 2803 2686 2567 2208 1818 1286 1631 1521 2097]



In [412]:

    
c_totals / c_totals.mean() - (c_totals.min()/c_totals.max())









    Out[412]:





array([0.94501152, 0.86581186, 0.81162261, 0.75650705, 0.59023407,
       0.40960325, 0.16320429, 0.32299309, 0.27204593, 0.53882376])



In [413]:

    
c_normd = c_totals/c_totals.mean() - c_totals.min()/c_totals.max()
c_normd









    Out[413]:





array([0.94501152, 0.86581186, 0.81162261, 0.75650705, 0.59023407,
       0.40960325, 0.16320429, 0.32299309, 0.27204593, 0.53882376])



In [414]:

    
new_mean = c_totals.max() - sd
new_mean









    Out[414]:





2758.09



In [420]:

    
new_totals = (2*sd * c_normd + (new_mean - sd))
new_totals









    Out[420]:





array([2950.25487559, 2916.05487559, 2892.65487559, 2868.85487559,
       2797.05487559, 2719.05487559, 2612.65487559, 2681.65487559,
       2659.65487559, 2774.85487559])



In [427]:

    
c_totals = new_totals
mean = new_mean

basic_plot()









    



2787.274875588433 (215.91, 111.2409439010655)
[2950.25487559 2916.05487559 2892.65487559 2868.85487559 2797.05487559
 2719.05487559 2612.65487559 2681.65487559 2659.65487559 2774.85487559]



In [ ]:



In [390]:

    
c_norm = c_totals / np.linalg.norm(c_totals, ord=np.inf)
c_range= c_norm * sd*2
c_range









    Out[390]:





array([431.82      , 406.99107599, 390.00286483, 372.72425689,
       320.59803631, 263.97066577, 186.72512441, 236.81856759,
       220.84674512, 304.48101547])



In [392]:

    
c_range.max()









    Out[392]:





True



In [ ]:

New algorithm:

set max to max category.
calculate new mean as 1 σδ below max.
normalize dataset to 1 σδ of new mean.

normalization:

% distance to new mean = % distance to original mean, with max distance set by σδ.
This means that categories further from the original mean than the max is from it will be clamped to new mean - σδ.



In [385]:

    
c_max = c_totals.max()
new_mean = c_max - sd

c_norm = c_totals/np.linalg.norm(c_totals, ord=np.inf)
c_total = c_norm * sd + new_mean

# for i,c_tot in enumerate(c_totals):
#     c_totals[i] = max(new_mean-sd, min())
#     if c_tot < mean:
#         c_totals[i] = max(new_mean-sd, c_tot*(c_tot/mean)/(mean/(new_mean+sd)))
#     elif c_tot > mean:
#         c_totals[i] = max(new_mean-sd, c_tot*(c_tot/c_max)/(c_max/(new_mean+sd)))

mean = new_mean
sa   = c_totals.std()
sd   = eps*mean

basic_plot()









    



2721.04 (272.104, 455.21449889035824)
[2257 1746 2911 1583 1453 2064 1231 1741 1809 2201]



In [ ]:



In [313]:

    
c_max = c_totals.max()
new_mean = c_max - sd
for i,c_tot in enumerate(c_totals):
#     c_totals[i] = max(new_mean-sd, min())
    if c_tot < mean:
        c_totals[i] = max(new_mean-sd, c_tot*(c_tot/mean)/(mean/(new_mean+sd)))
    elif c_tot > mean:
        c_totals[i] = max(new_mean-sd, c_tot*(c_tot/c_max)/(c_max/(new_mean+sd)))

mean = new_mean
sa   = c_totals.std()
sd   = eps*mean

basic_plot()









    



2673.48 (267.348, 133.56885864601824)
[2449 2449 2449 2449 2497 2449 2449 2449 2449 2897]



In [ ]:



In [491]:

    
diffs









    Out[491]:





array([1330, 1122,  235, 1282, 1262, 1055,  166,  838, 1381,  975])



In [479]:

    
# cat_pops = pops_from_df(csv, colnames=None)
cat_pops = pops_from_df(csv)



In [480]:

    
cat_pops.as_matrix(columns=['pop'])









    Out[480]:





array([[4250],
       [4250],
       [4250],
       [4250],
       [4250],
       [4250],
       [4250],
       [4250],
       [4250],
       [4250]])



In [481]:

    
totals = cat_pops.as_matrix(columns=['pop'])



In [483]:

    
totals.std()









    Out[483]:





0.0



In [484]:

    
cat_pops



In [485]:

    
for cat in cat_pops.as_matrix(columns=['cat']):
    print(cat)









    



['cat']
['dog']
['truck']
['bird']
['airplane']
['ship']
['frog']
['horse']
['deer']
['automobile']



In [489]:

    
df['class']==cat[0]









    Out[489]:





0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
42470     True
42471     True
42472     True
42473     True
42474     True
42475     True
42476     True
42477     True
42478     True
42479     True
42480     True
42481     True
42482     True
42483     True
42484     True
42485     True
42486     True
42487     True
42488     True
42489     True
42490     True
42491     True
42492     True
42493     True
42494     True
42495     True
42496     True
42497     True
42498     True
42499     True
Name: class, Length: 42500, dtype: bool



In [ ]:



In [306]:

    
c_max = 150
c_tot = c_max
sd = 20
new_mean = c_max - sd

c_tot*(c_tot/c_max)/(c_max/(new_mean+sd))









    Out[306]:





150.0



In [309]:

    
c_max









    Out[309]:





2704



In [311]:

    
new_mean + sd









    Out[311]:





2775.487



In [ ]:



In [298]:

    
c_totals.mean()









    Out[298]:





2575.0



In [299]:

    
mean









    Out[299]:





2787.3



In [297]:

    
basic_plot()









    



2787.3 (278.73, 0.0)
[2575 2575 2575 2575 2575 2575 2575 2575 2575 2575]



In [287]:

    
(70/100)/(100/150)









    Out[287]:





1.05



In [294]:

    
(150/100)/(100/150)









    Out[294]:





2.25



In [ ]:

Standard Deviation formula:

So, I want σ to be eps percent of the total dataset. The main factor in this eq is the difference from the mean. And it's squared, so there's a high penalty on being off-mean. What makes this not-simple is σ and μ affect each other, so there's a differential equation somewhere that has to be solved.

1st algorithm idea:

σ: standard deviation; α: actual; δ: desired

k = σδ/2

While the σα is greater than σδ:

    For each category:
        if that category is below the mean by more than 1 σδ:
            add k σδ of random copies to that category

    recalculate mean
    recalculate σδ

    If no categories are below the mean by more than 1 σδ:
        k = k/2

I'm concerned about updates. The 'perfect' numerical solution is to recalculate the mean/stdv every time k is added, and to keep k at 1. This is also a worst-case scenario for time. Setting k=σδ/2 is my attempt at striking a balance between 'resolution' and speed.



In [338]:

    
eps = 0.1

c_totals = np.random.randint(1000, 3000, size=10)
mean = c_totals.mean()
sa   = c_totals.std()
sd   = eps * mean

k = sd/2

iters = 0

basic_plot()









    



2191.6 (219.16, 501.9958565566055)
[2187 1255 1965 2302 2380 2358 2208 2954 2836 1471]

this initial version of the algorithm stops progressing when the minimum total is within 1 desired standard deviation of the mean. However, if any totals are more than a desired stdv from the mean, the actual stdv will be greater than that desired. This algorithm will fail to converge whenever this condition exists.



In [349]:

    
# while sa > sd:
k_OK = False
for i,c_tot in enumerate(c_totals):
    if mean - c_tot > sd:
        c_totals[i] += k
        k_OK = True
        
# update
mean = c_totals.mean()
sa   = c_totals.std()
sd   = eps * mean

# check resolution change
if not k_OK: k /= 2

iters += 1; print(f'iterations: {iters}')
basic_plot()









    



iterations: 11
2387.8 (238.78000000000003, 262.88126597382325)
[2187 2236 2183 2302 2380 2358 2208 2954 2836 2234]



In [350]:

    
k









    Out[350]:





27.395



In [146]:

    
k









    Out[146]:





0.363515625



In [ ]:



In [29]:

    
cat_pops



In [ ]:



In [21]:

    
data = np.array([1000, 1500, 700, 1200, 1100, 900])
data.mean(), data.std()









    Out[21]:





(1066.6666666666667, 249.44382578492943)



In [26]:

    
ε = 0.1
σ = data.mean() * ε; σ









    Out[26]:





106.66666666666669



In [ ]:



In [277]:

    
pops_from_df(df.sample(n=1000))



In [375]:

    
n = 100

catdx=1
catcol = df.columns[catdx]
cats = df[catcol].unique()

df_slice = df[catcol]

keep_idxs = np.array([], dtype=np.int64) ## np.dtype(int) --> dtype('int64')
for cat in cats:
    keep_idxs = np.concatenate((keep_idxs, np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False)))



In [385]:

    
np.where(df_slice==cat)[0]









    Out[385]:





array([38250, 38251, 38252, ..., 42497, 42498, 42499])



In [384]:

    
len(np.where(df_slice==cat)[0])









    Out[384]:





4250



In [390]:

    
np.array([], dtype='int64'), np.array([], dtype=np.int64)









    Out[390]:





(array([], dtype=int64), array([], dtype=int64))



In [377]:

    
keep_idxs.shape









    Out[377]:





(1000,)



In [380]:

    
df.iloc[keep_idxs].head(n=10)









    Out[380]:







  
    
      
      file
      class
    
  
  
    
      1212
      cat/32702_cat.png
      cat
    
    
      3603
      cat/39013_cat.png
      cat
    
    
      2162
      cat/5065_cat.png
      cat
    
    
      2046
      cat/47554_cat.png
      cat
    
    
      2525
      cat/5502_cat.png
      cat
    
    
      93
      cat/32423_cat.png
      cat
    
    
      1000
      cat/18970_cat.png
      cat
    
    
      2507
      cat/2083_cat.png
      cat
    
    
      771
      cat/19922_cat.png
      cat
    
    
      3884
      cat/45850_cat.png
      cat



In [381]:

    
df.iloc[keep_idxs].sample(n=10)









    Out[381]:







  
    
      
      file
      class
    
  
  
    
      7354
      dog/28740_dog.png
      dog
    
    
      38808
      automobile/42438_automobile.png
      automobile
    
    
      1072
      cat/41673_cat.png
      cat
    
    
      13373
      bird/8321_bird.png
      bird
    
    
      30367
      horse/19089_horse.png
      horse
    
    
      28762
      frog/20297_frog.png
      frog
    
    
      20112
      airplane/41345_airplane.png
      airplane
    
    
      5926
      dog/7472_dog.png
      dog
    
    
      34833
      deer/18875_deer.png
      deer
    
    
      15485
      bird/14946_bird.png
      bird



In [ ]:



In [ ]:



In [374]:

    
np.dtype(type(3))









    Out[374]:





dtype('int64')



In [376]:

    
keep_idxs[:100]









    Out[376]:





array([1212, 3603, 2162, 2046, 2525,   93, 1000, 2507,  771, 3884, 1585,
       2912, 3028,  982, 3309, 1072, 4057,  825, 2564, 3107,   77,  767,
        997, 3694, 2481, 2382, 3043, 3716, 3621, 2356, 2052, 1666, 2901,
       3226, 1771, 3892,  893,  984, 3541, 2205, 2203, 4204,  484, 1039,
       1787, 1927, 3474,  578, 1190, 2221, 3273, 2151,   35,  677, 2006,
       1774,  583, 3697,  403, 3899, 2748, 3850, 1392,   84,  819,  338,
        939, 2811, 1995, 3407, 2153, 3354, 3842, 1176, 3324, 2386,  197,
       2875, 2634, 2375, 3219, 2606,  657, 3614, 1268, 1887, 3122, 2855,
         95, 2668, 1413, 1763, 2014, 1710,  700,  401, 2624, 2309,  650,
       3701])



In [351]:

    
np.concatenate(keep_idxs, np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False))









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-351-ab1012697a98> in <module>()
      1 np.concatenate(keep_idxs, 
----> 2                np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False))

TypeError: only integer scalar arrays can be converted to a scalar index



In [355]:

    
np.random.choice(np.where(df_slice==cat)[0], size=n, replace=False)









    Out[355]:





array([ 649,  909, 3390,  377, 1873, 4088, 3097, 2128, 2550, 2647, 3446,
        920, 2876, 3022, 2431,   16,  866, 3886, 1753,  339, 2684, 3358,
       3514, 3127, 1155,  837, 2827,  724,  487, 1114, 2674, 2455,  391,
       1143, 2353,  279, 1566,  838, 3808,  974,   35,  456,  829, 1755,
       3984,   41,  794, 2937,  307, 2745,   40, 2961, 1691, 1818, 3538,
       3718,  536, 3196, 3880, 1179, 3325,  672, 1607,  861, 1905, 1749,
        732, 2645, 4022,  968, 3445, 3611, 1892,  972,  439, 3835, 1444,
       3375, 1005, 1389, 2117, 1413, 1266, 2432, 2787, 2492,  912, 4086,
       1612, 3692, 3106,  538, 2894,  289, 1040,  870, 1866, 2189, 3222,
       2082])



In [ ]:



In [337]:

    
keep_idxs = np.array([])



In [339]:

    
x = np.array([1,2,3,])



In [340]:

    
x.shape









    Out[340]:





(3,)



In [342]:

    
np.concatenate((keep_idxs, x))









    Out[342]:





array([1., 2., 3.])



In [ ]:



In [298]:

    
catdx=1
catcol = df.columns[catdx]
cats = df[catcol].unique()
for cat in cats:
#     pops = [df[df[y]==cat].count()[0] for cat in cats]
    keepidx = np.where(df[df[catcol]==cat])
    break



In [ ]:



In [334]:

    
df_slice = df[catcol]
keepidxs = np.random.choice(np.where(df_slice==cat)[0], size = 20, replace=False)



In [332]:

    
np.where(df_slice==cat)[0]









    Out[332]:





array([   0,    1,    2, ..., 4247, 4248, 4249])



In [327]:

    
df_slice









    Out[327]:





0               cat
1               cat
2               cat
3               cat
4               cat
5               cat
6               cat
7               cat
8               cat
9               cat
10              cat
11              cat
12              cat
13              cat
14              cat
15              cat
16              cat
17              cat
18              cat
19              cat
20              cat
21              cat
22              cat
23              cat
24              cat
25              cat
26              cat
27              cat
28              cat
29              cat
            ...    
42470    automobile
42471    automobile
42472    automobile
42473    automobile
42474    automobile
42475    automobile
42476    automobile
42477    automobile
42478    automobile
42479    automobile
42480    automobile
42481    automobile
42482    automobile
42483    automobile
42484    automobile
42485    automobile
42486    automobile
42487    automobile
42488    automobile
42489    automobile
42490    automobile
42491    automobile
42492    automobile
42493    automobile
42494    automobile
42495    automobile
42496    automobile
42497    automobile
42498    automobile
42499    automobile
Name: class, Length: 42500, dtype: object



In [ ]:



In [308]:

    
keepidx = np.where(df[df[catcol]==cat])
keepidx[0].shape









    Out[308]:





(8500,)



In [317]:

    
keepidx = df[df[catcol]==cat].as_matrix()[:,0]
keepidx.shape









    Out[317]:





(4250,)



In [318]:

    
keepidx[:n]









    Out[318]:





array(['cat/21851_cat.png', 'cat/45066_cat.png', 'cat/4835_cat.png', ...,
       'cat/46013_cat.png', 'cat/45379_cat.png', 'cat/49298_cat.png'],
      dtype=object)



In [319]:

    
keepidx = np.where(df[df[catcol]==cat])
keepidx[0][:10]









    Out[319]:





array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])



In [323]:

    
df.iloc[keepidx].head()









    Out[323]:







  
    
      
      file
      class
      file
      class
      file
      class
      file
      class
      file
      class
      ...
      file
      class
      file
      class
      file
      class
      file
      class
      file
      class
    
  
  
    
      0
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      ...
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
    
    
      0
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      ...
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
      cat/21851_cat.png
      cat
    
    
      1
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      ...
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
    
    
      1
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      ...
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
      cat/45066_cat.png
      cat
    
    
      2
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      ...
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
      cat/4835_cat.png
      cat
    
  

5 rows × 8500 columns



In [ ]:



In [261]:

    
list(zip(cats,pops))









    Out[261]:





[('cat', 4250),
 ('dog', 4250),
 ('truck', 4250),
 ('bird', 4250),
 ('airplane', 4250),
 ('ship', 4250),
 ('frog', 4250),
 ('horse', 4250),
 ('deer', 4250),
 ('automobile', 4250)]



In [210]:

    
list(zip(cats,pops))









    Out[210]:





[('cat', 4250),
 ('dog', 4250),
 ('truck', 4250),
 ('bird', 4250),
 ('airplane', 4250),
 ('ship', 4250),
 ('frog', 4250),
 ('horse', 4250),
 ('deer', 4250),
 ('automobile', 4250)]



In [248]:

    
rows=[]
for cat in catfolders:
    catpath = path/folder/cat
    fpaths  = list(map(lambda x: cat+'/'+x, os.listdir(catpath)))
    rows.extend(list(zip(fpaths,[labelmap[cat] for i in range(len(fpaths))])))

df = pd.DataFrame(rows, columns=['file','class'])



In [251]:

    
rows[0]









    Out[251]:





('cat/21851_cat.png', 'cat')



In [250]:

    
cols['cat']









    Out[250]:





('cat',
 'dog',
 'truck',
 'bird',
 'airplane',
 'ship',
 'frog',
 'horse',
 'deer',
 'automobile')



In [252]:

    
list(zip(cols['cat'],cols['pop']))









    Out[252]:





[('cat', 4250),
 ('dog', 4250),
 ('truck', 4250),
 ('bird', 4250),
 ('airplane', 4250),
 ('ship', 4250),
 ('frog', 4250),
 ('horse', 4250),
 ('deer', 4250),
 ('automobile', 4250)]



In [240]:

    
dictionary=cat_pops

cols = dict()
cols['cat'],cols['pop'] = zip(*dictionary.items())
# pd.DataFrame([cols['cat'],cols['pop']],columns=['cat','pop'])









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-240-b03925bf8c0e> in <module>()
      3 cols = dict()
      4 cols['cat'],cols['pop'] = zip(*dictionary.items())
----> 5 pd.DataFrame([cols['cat'],cols['pop']],columns=['cat','pop'])

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    367                     if is_named_tuple(data[0]) and columns is None:
    368                         columns = data[0]._fields
--> 369                     arrays, columns = _to_arrays(data, columns, dtype=dtype)
    370                     columns = _ensure_index(columns)
    371 

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
   6282     if isinstance(data[0], (list, tuple)):
   6283         return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 6284                                dtype=dtype)
   6285     elif isinstance(data[0], collections.Mapping):
   6286         return _list_of_dict_to_arrays(data, columns,

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
   6361         content = list(lib.to_object_array(data).T)
   6362     return _convert_object_array(content, columns, dtype=dtype,
-> 6363                                  coerce_float=coerce_float)
   6364 
   6365 

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
   6418             # caller's responsibility to check for this...
   6419             raise AssertionError('%d columns passed, passed data had %s '
-> 6420                                  'columns' % (len(columns), len(content)))
   6421 
   6422     # provide soft conversion of object dtypes

AssertionError: 2 columns passed, passed data had 10 columns



In [236]:

    
cols









    Out[236]:





{'cat': ('cat',
  'dog',
  'truck',
  'bird',
  'airplane',
  'ship',
  'frog',
  'horse',
  'deer',
  'automobile'),
 'pop': (4250, 4250, 4250, 4250, 4250, 4250, 4250, 4250, 4250, 4250)}



In [238]:

    
pd.DataFrame(cols)



In [239]:

    
cols['cat']









    Out[239]:





('cat',
 'dog',
 'truck',
 'bird',
 'airplane',
 'ship',
 'frog',
 'horse',
 'deer',
 'automobile')



In [224]:

    
cols = dict()
cols['cat'],cols['pop'] = zip(*cat_pops.items())



In [225]:

    
cols









    Out[225]:





{'cat': ('airplane',
  'automobile',
  'bird',
  'cat',
  'deer',
  'dog',
  'frog',
  'horse',
  'ship',
  'truck'),
 'pop': (4250, 4250, 4250, 4250, 4250, 4250, 4250, 4250, 4250, 4250)}



In [ ]:



In [207]:

    
df=csv;xdx=0;ydx=1

x,y = df.columns[xdx],df.columns[ydx]
cats = df[y].unique()
print(cats)
pops = [df[df[y]==cat].count()[0] for cat in cats]









    



['cat' 'dog' 'truck' 'bird' 'airplane' 'ship' 'frog' 'horse' 'deer'
 'automobile']



In [177]:

    
csv[csv['class']=='cat'].count()









    Out[177]:





file     4250
class    4250
dtype: int64



In [179]:

    
csv.columns[1]









    Out[179]:





'class'



In [ ]:



In [149]:

    
%%time
labelmap = None
folder='train'
path = PATH
# find categories
catfolders = [f for f in os.listdir(path/folder) if (path/folder/f).is_dir()]
if labelmap is None: labelmap = {cf:cf for cf in catfolders}

df = pd.DataFrame(columns=['file_path','class'])
rows = []

for cat in catfolders:
    catpath = path/folder/cat
    fpaths = list(map(lambda x: cat+'/'+x , os.listdir(catpath)))
#     fpaths  = list(map(str, catpath.iterdir()))
    rows.extend(list(zip(fpaths,[cat for i in range(len(fpaths))])))
    break









    



CPU times: user 5.45 ms, sys: 6.93 ms, total: 12.4 ms
Wall time: 12 ms



In [150]:

    
# rows



In [135]:

    
len(rows)









    Out[135]:





4250



In [ ]:



In [136]:

    
df = pd.DataFrame(rows, columns=['file_path','class'])
df.head()









    Out[136]:







  
    
      
      file_path
      class
    
  
  
    
      0
      data/cifar10/train/cat/21851_cat.png
      cat
    
    
      1
      data/cifar10/train/cat/45066_cat.png
      cat
    
    
      2
      data/cifar10/train/cat/4835_cat.png
      cat
    
    
      3
      data/cifar10/train/cat/35804_cat.png
      cat
    
    
      4
      data/cifar10/train/cat/24696_cat.png
      cat



In [117]:

    
n=10
# fpaths[:n], [cat for i in range(n)]

list(zip(fpaths[:n],[cat for i in range(n)]))









    Out[117]:





[('data/cifar10/train/cat/21851_cat.png', 'cat'),
 ('data/cifar10/train/cat/45066_cat.png', 'cat'),
 ('data/cifar10/train/cat/4835_cat.png', 'cat'),
 ('data/cifar10/train/cat/35804_cat.png', 'cat'),
 ('data/cifar10/train/cat/24696_cat.png', 'cat'),
 ('data/cifar10/train/cat/3863_cat.png', 'cat'),
 ('data/cifar10/train/cat/45897_cat.png', 'cat'),
 ('data/cifar10/train/cat/49197_cat.png', 'cat'),
 ('data/cifar10/train/cat/1846_cat.png', 'cat'),
 ('data/cifar10/train/cat/8760_cat.png', 'cat')]



In [127]:

    
n = 2000



In [128]:

    
%%time
rows = []
rows1 = list(zip(fpaths[:n], [cat for i in range(n)]));
rows2 = list(zip(fpaths[n:2*n], [cat for i in range(n)]));
rows.append(rows1)
rows.append(rows2)
rows









    



CPU times: user 2.14 ms, sys: 34 µs, total: 2.17 ms
Wall time: 2.33 ms



In [129]:

    
%%time
rows = []
rows1 = list(zip(fpaths[:n], [cat for i in range(n)]));
rows2 = list(zip(fpaths[n:2*n], [cat for i in range(n)]));
rows.extend(rows1)
rows.extend(rows2)
rows









    



CPU times: user 644 µs, sys: 0 ns, total: 644 µs
Wall time: 655 µs



In [122]:

    
rows









    Out[122]:





[('data/cifar10/train/cat/21851_cat.png', 'cat'),
 ('data/cifar10/train/cat/45066_cat.png', 'cat'),
 ('data/cifar10/train/cat/4835_cat.png', 'cat'),
 ('data/cifar10/train/cat/35804_cat.png', 'cat'),
 ('data/cifar10/train/cat/24696_cat.png', 'cat'),
 ('data/cifar10/train/cat/3863_cat.png', 'cat'),
 ('data/cifar10/train/cat/45897_cat.png', 'cat'),
 ('data/cifar10/train/cat/49197_cat.png', 'cat'),
 ('data/cifar10/train/cat/1846_cat.png', 'cat'),
 ('data/cifar10/train/cat/8760_cat.png', 'cat'),
 ('data/cifar10/train/cat/21851_cat.png', 'cat'),
 ('data/cifar10/train/cat/45066_cat.png', 'cat'),
 ('data/cifar10/train/cat/4835_cat.png', 'cat'),
 ('data/cifar10/train/cat/35804_cat.png', 'cat'),
 ('data/cifar10/train/cat/24696_cat.png', 'cat'),
 ('data/cifar10/train/cat/3863_cat.png', 'cat'),
 ('data/cifar10/train/cat/45897_cat.png', 'cat'),
 ('data/cifar10/train/cat/49197_cat.png', 'cat'),
 ('data/cifar10/train/cat/1846_cat.png', 'cat'),
 ('data/cifar10/train/cat/8760_cat.png', 'cat')]



In [ ]:



In [118]:

    
df = pd.DataFrame(list(zip(fpaths[:n],[cat for i in range(n)])), columns=['file_path','class'])



In [119]:

    
df









    Out[119]:







  
    
      
      file_path
      class
    
  
  
    
      0
      data/cifar10/train/cat/21851_cat.png
      cat
    
    
      1
      data/cifar10/train/cat/45066_cat.png
      cat
    
    
      2
      data/cifar10/train/cat/4835_cat.png
      cat
    
    
      3
      data/cifar10/train/cat/35804_cat.png
      cat
    
    
      4
      data/cifar10/train/cat/24696_cat.png
      cat
    
    
      5
      data/cifar10/train/cat/3863_cat.png
      cat
    
    
      6
      data/cifar10/train/cat/45897_cat.png
      cat
    
    
      7
      data/cifar10/train/cat/49197_cat.png
      cat
    
    
      8
      data/cifar10/train/cat/1846_cat.png
      cat
    
    
      9
      data/cifar10/train/cat/8760_cat.png
      cat



In [109]:

    
df = pd.DataFrame(rows[0],rows[1], columns=['file_path','class'])









    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-109-0388191ef300> in <module>()
----> 1 df = pd.DataFrame([rows[0],rows[1]], columns=['file_path','class'])

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    367                     if is_named_tuple(data[0]) and columns is None:
    368                         columns = data[0]._fields
--> 369                     arrays, columns = _to_arrays(data, columns, dtype=dtype)
    370                     columns = _ensure_index(columns)
    371 

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _to_arrays(data, columns, coerce_float, dtype)
   6282     if isinstance(data[0], (list, tuple)):
   6283         return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 6284                                dtype=dtype)
   6285     elif isinstance(data[0], collections.Mapping):
   6286         return _list_of_dict_to_arrays(data, columns,

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _list_to_arrays(data, columns, coerce_float, dtype)
   6361         content = list(lib.to_object_array(data).T)
   6362     return _convert_object_array(content, columns, dtype=dtype,
-> 6363                                  coerce_float=coerce_float)
   6364 
   6365 

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/frame.py in _convert_object_array(content, columns, coerce_float, dtype)
   6418             # caller's responsibility to check for this...
   6419             raise AssertionError('%d columns passed, passed data had %s '
-> 6420                                  'columns' % (len(columns), len(content)))
   6421 
   6422     # provide soft conversion of object dtypes

AssertionError: 2 columns passed, passed data had 4250 columns



In [106]:

    
df









    Out[106]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      4240
      4241
      4242
      4243
      4244
      4245
      4246
      4247
      4248
      4249
    
  
  
    
      0
      data/cifar10/train/cat/21851_cat.png
      data/cifar10/train/cat/45066_cat.png
      data/cifar10/train/cat/4835_cat.png
      data/cifar10/train/cat/35804_cat.png
      data/cifar10/train/cat/24696_cat.png
      data/cifar10/train/cat/3863_cat.png
      data/cifar10/train/cat/45897_cat.png
      data/cifar10/train/cat/49197_cat.png
      data/cifar10/train/cat/1846_cat.png
      data/cifar10/train/cat/8760_cat.png
      ...
      data/cifar10/train/cat/2699_cat.png
      data/cifar10/train/cat/42321_cat.png
      data/cifar10/train/cat/24597_cat.png
      data/cifar10/train/cat/29004_cat.png
      data/cifar10/train/cat/40304_cat.png
      data/cifar10/train/cat/12904_cat.png
      data/cifar10/train/cat/48928_cat.png
      data/cifar10/train/cat/40314_cat.png
      data/cifar10/train/cat/20196_cat.png
      data/cifar10/train/cat/33964_cat.png
    
    
      1
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      ...
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      cat
      cat
    
  

2 rows × 4250 columns



In [ ]:



In [75]:

    
files = generate_csv(PATH)
cat = 'cat'
path = PATH
folder = 'train'



In [76]:

    
files_np = np.array(files)



In [83]:

    
catpath = path/folder/cat;
catpath









    Out[83]:





PosixPath('data/cifar10/train/cat')



In [166]:

    
list(map(str, catpath.iterdir()))[:20]









    Out[166]:





['data/cifar10/train/cat/21851_cat.png',
 'data/cifar10/train/cat/45066_cat.png',
 'data/cifar10/train/cat/4835_cat.png',
 'data/cifar10/train/cat/35804_cat.png',
 'data/cifar10/train/cat/24696_cat.png',
 'data/cifar10/train/cat/3863_cat.png',
 'data/cifar10/train/cat/45897_cat.png',
 'data/cifar10/train/cat/49197_cat.png',
 'data/cifar10/train/cat/1846_cat.png',
 'data/cifar10/train/cat/8760_cat.png',
 'data/cifar10/train/cat/7493_cat.png',
 'data/cifar10/train/cat/39751_cat.png',
 'data/cifar10/train/cat/16851_cat.png',
 'data/cifar10/train/cat/1856_cat.png',
 'data/cifar10/train/cat/7483_cat.png',
 'data/cifar10/train/cat/33903_cat.png',
 'data/cifar10/train/cat/44127_cat.png',
 'data/cifar10/train/cat/46757_cat.png',
 'data/cifar10/train/cat/36970_cat.png',
 'data/cifar10/train/cat/34689_cat.png']



In [ ]:



In [46]:

    
cat_pops_np = np.array([[k,v] for k,v in cat_pops.items()])



In [47]:

    
cat_pops_np[:,1]









    Out[47]:





array(['4250', '4250', '4250', '4250', '4250', '4250', '4250', '4250',
       '4250', '4250'], dtype='<U10')



In [49]:

    
from collections import defaultdict
cat_pops_dict = defaultdict(lambda x: [])
for k,v in cat_pops.items():
    cat_pops_dict[k] = v



In [50]:

    
cat_pops_dict









    Out[50]:





defaultdict(<function __main__.<lambda>(x)>,
            {'airplane': 4250,
             'automobile': 4250,
             'bird': 4250,
             'cat': 4250,
             'deer': 4250,
             'dog': 4250,
             'frog': 4250,
             'horse': 4250,
             'ship': 4250,
             'truck': 4250})



In [48]:

    
cat_pops_dict = {'cat':k,'pop':v for k,v in cat_pops.items()}









    



  File "<ipython-input-48-11346347af81>", line 1
    cat_pops_dict = {'cat':k,'pop':v for k,v in cat_pops.items()}
                                       ^
SyntaxError: invalid syntax



In [29]:

    
df = pd.DataFrame({'cat':[cat for cat in cat_pops.keys()],'pop':[pop for pop in cat_pops.values()]})



In [30]:

    
df.plot.bar()









    Out[30]:





<matplotlib.axes._subplots.AxesSubplot at 0x115aa2278>



In [ ]:

    
df = pd.DataFrame({'cat':})



In [35]:

    
cat_pops_np[:,0]









    Out[35]:





array(['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
       'horse', 'ship', 'truck'], dtype='<U10')



In [37]:

    
df = pd.DataFrame({'cat':cat_pops_np[:,0],'pop':cat_pops_np[:,1]})



In [44]:

    
df.









    Out[44]:





0    4250
1    4250
2    4250
3    4250
4    4250
5    4250
6    4250
7    4250
8    4250
9    4250
Name: pop, dtype: object



In [43]:

    
df.plot.bar()









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-43-68c901b9fdd6> in <module>()
----> 1 df.plot.bar()

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/plotting/_core.py in bar(self, x, y, **kwds)
   2714         axes : matplotlib.AxesSubplot or np.array of them
   2715         """
-> 2716         return self(kind='bar', x=x, y=y, **kwds)
   2717 
   2718     def barh(self, x=None, y=None, **kwds):

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/plotting/_core.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2675                           fontsize=fontsize, colormap=colormap, table=table,
   2676                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 2677                           sort_columns=sort_columns, **kwds)
   2678     __call__.__doc__ = plot_frame.__doc__
   2679 

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/plotting/_core.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   1900                  yerr=yerr, xerr=xerr,
   1901                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 1902                  **kwds)
   1903 
   1904 

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/plotting/_core.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   1727         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   1728 
-> 1729     plot_obj.generate()
   1730     plot_obj.draw()
   1731     return plot_obj.result

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/plotting/_core.py in generate(self)
    248     def generate(self):
    249         self._args_adjust()
--> 250         self._compute_plot_data()
    251         self._setup_subplots()
    252         self._make_plot()

~/Miniconda3/envs/fastai/lib/python3.6/site-packages/pandas/plotting/_core.py in _compute_plot_data(self)
    363         if is_empty:
    364             raise TypeError('Empty {0!r}: no numeric data to '
--> 365                             'plot'.format(numeric_data.__class__.__name__))
    366 
    367         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot



In [ ]:

    
def csv_smooth_data(path, threshold):

	file	class
6483	dog/20688_dog.png	dog
12388	truck/1602_truck.png	truck
16061	bird/14294_bird.png	bird
18073	airplane/18106_airplane.png	airplane
42118	automobile/46162_automobile.png	automobile
39891	automobile/33188_automobile.png	automobile
17797	airplane/38187_airplane.png	airplane
1668	cat/45885_cat.png	cat
22111	ship/24451_ship.png	ship
13452	bird/37374_bird.png	bird

	cat	pop
0	cat	2125
1	dog	2125
2	truck	2125
3	bird	2125
4	airplane	2125
5	ship	2125
6	frog	2125
7	horse	2125
8	deer	2125
9	automobile	2125

	file	class
2311	cat/14032_cat.png	cat
3438	cat/6348_cat.png	cat
2914	cat/38894_cat.png	cat
3382	cat/18582_cat.png	cat
80	cat/11476_cat.png	cat

	file	class
34346	deer/44641_deer.png	deer
35384	deer/3868_deer.png	deer
37827	deer/19897_deer.png	deer
36420	deer/25216_deer.png	deer
37771	deer/42982_deer.png	deer
37047	deer/35743_deer.png	deer
34198	deer/36008_deer.png	deer
34865	deer/27258_deer.png	deer
36252	deer/24967_deer.png	deer
37255	deer/13428_deer.png	deer
37010	deer/17150_deer.png	deer
34754	deer/32969_deer.png	deer
34176	deer/39202_deer.png	deer
35629	deer/14786_deer.png	deer
38172	deer/12975_deer.png	deer

	file	class
count	4734	4734
unique	4734	10
top	horse/22789_horse.png	frog
freq	1	525

	file	class
34133	deer/36514_deer.png	deer
34346	deer/44641_deer.png	deer
35384	deer/3868_deer.png	deer
37827	deer/19897_deer.png	deer
36420	deer/25216_deer.png	deer
37771	deer/42982_deer.png	deer
37047	deer/35743_deer.png	deer
34198	deer/36008_deer.png	deer
34865	deer/27258_deer.png	deer
36252	deer/24967_deer.png	deer
37255	deer/13428_deer.png	deer
37010	deer/17150_deer.png	deer
34754	deer/32969_deer.png	deer
34176	deer/39202_deer.png	deer
35629	deer/14786_deer.png	deer
38172	deer/12975_deer.png	deer

	file	class
6861	dog/42691_dog.png	dog
18405	airplane/25212_airplane.png	airplane
6379	dog/23967_dog.png	dog
4885	dog/37623_dog.png	dog
11506	truck/29574_truck.png	truck

	cat	pop
0	cat	4250
1	dog	4250
2	truck	4250
3	bird	4250
4	airplane	4250
5	ship	4250
6	frog	4250
7	horse	4250
8	deer	4250
9	automobile	4250

	file	class
1212	cat/32702_cat.png	cat
3603	cat/39013_cat.png	cat
2162	cat/5065_cat.png	cat
2046	cat/47554_cat.png	cat
2525	cat/5502_cat.png	cat
93	cat/32423_cat.png	cat
1000	cat/18970_cat.png	cat
2507	cat/2083_cat.png	cat
771	cat/19922_cat.png	cat
3884	cat/45850_cat.png	cat

	file	class
7354	dog/28740_dog.png	dog
38808	automobile/42438_automobile.png	automobile
1072	cat/41673_cat.png	cat
13373	bird/8321_bird.png	bird
30367	horse/19089_horse.png	horse
28762	frog/20297_frog.png	frog
20112	airplane/41345_airplane.png	airplane
5926	dog/7472_dog.png	dog
34833	deer/18875_deer.png	deer
15485	bird/14946_bird.png	bird

	file	class	file	class	file	class	file	class	file	class	...	file	class	file	class	file	class	file	class	file	class
0	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	...	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat
0	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	...	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat	cat/21851_cat.png	cat
1	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	...	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat
1	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	...	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat	cat/45066_cat.png	cat
2	cat/4835_cat.png	cat	cat/4835_cat.png	cat	cat/4835_cat.png	cat	cat/4835_cat.png	cat	cat/4835_cat.png	cat	...	cat/4835_cat.png	cat	cat/4835_cat.png	cat	cat/4835_cat.png	cat	cat/4835_cat.png	cat	cat/4835_cat.png	cat

	file_path	class
0	data/cifar10/train/cat/21851_cat.png	cat
1	data/cifar10/train/cat/45066_cat.png	cat
2	data/cifar10/train/cat/4835_cat.png	cat
3	data/cifar10/train/cat/35804_cat.png	cat
4	data/cifar10/train/cat/24696_cat.png	cat