In [34]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image

import LoadDataset
%matplotlib inline

In [35]:
# Folder that is scanned for the training PNGs and handed to LoadDataset.
# Defaults to the original author's local checkout; set the DATA_SET_FOLDER
# environment variable to point the notebook at another copy of the data
# without editing this cell.
DATA_SET_FOLDER = os.environ.get(
    'DATA_SET_FOLDER',
    '/home/ankdesh/explore/DeepLearning-UdacityCapston/data/train')

In [36]:
# Record the pixel height and width of every PNG directly inside DATA_SET_FOLDER.
heights = []
widths = []
for fileName in glob.glob(DATA_SET_FOLDER + '/*.png'):
    # Image.open() leaves the underlying file open; the with-block closes it
    # as soon as the dimensions have been read, so handles do not pile up
    # while iterating over the whole data set.
    with Image.open(fileName) as img:
        heights.append(img.height)
        widths.append(img.width)

# Number of digits in the number shown in each image, i.e. the length of the
# label sequence that LoadDataset yields alongside the image.
# NOTE(review): assumes getNextImage yields (image, labels) pairs with one
# label per digit -- confirm against LoadDataset.
numLen = [len(labels) for (imgs, labels) in LoadDataset.getNextImage(DATA_SET_FOLDER)]

In [37]:
# Distribution of digit counts per image, with kernel-density estimation
# switched off (kde=False).
numLenSeries = pd.Series(numLen, name="Histogram - Len of number")
sns.distplot(numLenSeries, kde=False, label="Histogram")


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fed1ea23c50>

In [38]:
# Distribution of image widths (in pixels), with kernel-density estimation
# switched off (kde=False).
widthSeries = pd.Series(widths, name="Histogram - Width of images")
sns.distplot(widthSeries, kde=False, label="Histogram")


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fed1e86d210>

In [39]:
# Distribution of image heights (in pixels), with kernel-density estimation
# switched off (kde=False).
heightSeries = pd.Series(heights, name="Histogram - Height of images")
sns.distplot(heightSeries, kde=False, label="Histogram")


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fed1ef38dd0>

In [41]:
# Arithmetic means over the whole data set: digits per number, image width
# and image height (in that order).
lenMean, widthMean, heightMean = [
    np.mean(values) for values in (numLen, widths, heights)
]

In [43]:
# Show the mean image width and mean image height.
# NOTE(review): the recorded output is a tuple repr, which suggests this cell
# ran under Python 2, where this statement prints the tuple itself; under
# Python 3 the same line prints the two values separated by a space.
print (widthMean, heightMean)


(128.28498293515358, 57.213011196934318)

In [54]:
# Cumulative distribution of digit counts: entry k is the percentage of
# images whose number has at most k digits, for k = 0 .. 9.
totalImages = len(numLen)
cummHistNumLen = [
    float(sum(1 for digitCount in numLen if digitCount <= maxDigits)) / totalImages * 100
    for maxDigits in range(10)
]

In [55]:
# Print the cumulative percentages; the entry at index i is the share (in %)
# of images whose number has at most i digits.
print (cummHistNumLen)


[0.0, 15.379318603676426, 69.6575055385905, 95.6769055745165, 99.97006167295372, 99.99700616729538, 100.0, 100.0, 100.0, 100.0]

In [ ]: