Calculations with NumPy - Statistics


In [1]:
import numpy as np
import pandas as pd

In [2]:
test_scores = [70,65,95,88]
type(test_scores)


Out[2]:
list

In [3]:
scores = np.array(test_scores)
type(scores)


Out[3]:
numpy.ndarray

In [4]:
scores.mean()


Out[4]:
79.5

In [5]:
print(scores.max(), scores.min())


95 65

In [6]:
#More Calculations
income = np.array([75000, 55000, 88000, 125000, 64000, 97000])

In [7]:
print(income.mean())
print(income.max())
print(income.min())


84000.0
125000
55000

In [8]:
income = np.append(income, 12000000)
income.mean()


Out[8]:
1786285.7142857143

In [9]:
np.median(income)


Out[9]:
88000.0

In [10]:
# Standard Deviation
income.std()


Out[10]:
4169786.007331644

In [11]:
scores.std()


Out[11]:
12.379418403139947

In [12]:
scores.sum()


Out[12]:
318

In [13]:
# MATRICES
np.random.seed(seed=60)
# create a 5 by 5 Matrix
random_square = np.random.rand(5,5)
random_square


Out[13]:
array([[0.30087333, 0.18694582, 0.32318268, 0.66574957, 0.5669708 ],
       [0.39825396, 0.37941492, 0.01058154, 0.1703656 , 0.12339337],
       [0.69240128, 0.87444156, 0.3373969 , 0.99245923, 0.13154007],
       [0.50032984, 0.28662051, 0.22058485, 0.50208555, 0.63606254],
       [0.63567694, 0.08043309, 0.58143375, 0.83919086, 0.29301825]])

In [14]:
random_square[0]


Out[14]:
array([0.30087333, 0.18694582, 0.32318268, 0.66574957, 0.5669708 ])

In [15]:
# Get column specified e.g. 0
random_square[:,0]


Out[15]:
array([0.30087333, 0.39825396, 0.69240128, 0.50032984, 0.63567694])

In [16]:
random_square[0,0] == random_square[0][0]


Out[16]:
True

In [17]:
random_square.mean()


Out[17]:
0.42917627159618377

In [18]:
# Mean of specified Row
random_square[0].mean()


Out[18]:
0.4087444389228477

In [19]:
# Mean of specified column
random_square[:,0].mean()


Out[19]:
0.5055070681931644

In [20]:
# Mean of last column
random_square[:,-1].mean()


Out[20]:
0.35019700684996913

In [21]:
import time

Computation Time Calculations


In [22]:
%%time
# How long it takes to generate a 100000 by 100 matrix
np.random.seed(seed=60)
big_matrix = np.random.rand(100000, 100)


CPU times: user 216 ms, sys: 43.6 ms, total: 260 ms
Wall time: 547 ms

In [23]:
%%time
# duration to calculate mean
big_matrix = np.random.rand(100000, 100)
big_matrix.mean()


CPU times: user 178 ms, sys: 36.7 ms, total: 215 ms
Wall time: 235 ms

Dealing with NumPy Arrays


In [24]:
np.arange(1,101)


Out[24]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [25]:
# Reshape to 20 by 5
np.arange(1, 101).reshape(20,5)


Out[25]:
array([[  1,   2,   3,   4,   5],
       [  6,   7,   8,   9,  10],
       [ 11,  12,  13,  14,  15],
       [ 16,  17,  18,  19,  20],
       [ 21,  22,  23,  24,  25],
       [ 26,  27,  28,  29,  30],
       [ 31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40],
       [ 41,  42,  43,  44,  45],
       [ 46,  47,  48,  49,  50],
       [ 51,  52,  53,  54,  55],
       [ 56,  57,  58,  59,  60],
       [ 61,  62,  63,  64,  65],
       [ 66,  67,  68,  69,  70],
       [ 71,  72,  73,  74,  75],
       [ 76,  77,  78,  79,  80],
       [ 81,  82,  83,  84,  85],
       [ 86,  87,  88,  89,  90],
       [ 91,  92,  93,  94,  95],
       [ 96,  97,  98,  99, 100]])

In [26]:
mat1 = np.arange(1, 101).reshape(20,5)
mat1 - 50


Out[26]:
array([[-49, -48, -47, -46, -45],
       [-44, -43, -42, -41, -40],
       [-39, -38, -37, -36, -35],
       [-34, -33, -32, -31, -30],
       [-29, -28, -27, -26, -25],
       [-24, -23, -22, -21, -20],
       [-19, -18, -17, -16, -15],
       [-14, -13, -12, -11, -10],
       [ -9,  -8,  -7,  -6,  -5],
       [ -4,  -3,  -2,  -1,   0],
       [  1,   2,   3,   4,   5],
       [  6,   7,   8,   9,  10],
       [ 11,  12,  13,  14,  15],
       [ 16,  17,  18,  19,  20],
       [ 21,  22,  23,  24,  25],
       [ 26,  27,  28,  29,  30],
       [ 31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40],
       [ 41,  42,  43,  44,  45],
       [ 46,  47,  48,  49,  50]])

In [27]:
mat1 * 10


Out[27]:
array([[  10,   20,   30,   40,   50],
       [  60,   70,   80,   90,  100],
       [ 110,  120,  130,  140,  150],
       [ 160,  170,  180,  190,  200],
       [ 210,  220,  230,  240,  250],
       [ 260,  270,  280,  290,  300],
       [ 310,  320,  330,  340,  350],
       [ 360,  370,  380,  390,  400],
       [ 410,  420,  430,  440,  450],
       [ 460,  470,  480,  490,  500],
       [ 510,  520,  530,  540,  550],
       [ 560,  570,  580,  590,  600],
       [ 610,  620,  630,  640,  650],
       [ 660,  670,  680,  690,  700],
       [ 710,  720,  730,  740,  750],
       [ 760,  770,  780,  790,  800],
       [ 810,  820,  830,  840,  850],
       [ 860,  870,  880,  890,  900],
       [ 910,  920,  930,  940,  950],
       [ 960,  970,  980,  990, 1000]])

In [28]:
mat1 + mat1


Out[28]:
array([[  2,   4,   6,   8,  10],
       [ 12,  14,  16,  18,  20],
       [ 22,  24,  26,  28,  30],
       [ 32,  34,  36,  38,  40],
       [ 42,  44,  46,  48,  50],
       [ 52,  54,  56,  58,  60],
       [ 62,  64,  66,  68,  70],
       [ 72,  74,  76,  78,  80],
       [ 82,  84,  86,  88,  90],
       [ 92,  94,  96,  98, 100],
       [102, 104, 106, 108, 110],
       [112, 114, 116, 118, 120],
       [122, 124, 126, 128, 130],
       [132, 134, 136, 138, 140],
       [142, 144, 146, 148, 150],
       [152, 154, 156, 158, 160],
       [162, 164, 166, 168, 170],
       [172, 174, 176, 178, 180],
       [182, 184, 186, 188, 190],
       [192, 194, 196, 198, 200]])

In [29]:
mat1 * mat1


Out[29]:
array([[    1,     4,     9,    16,    25],
       [   36,    49,    64,    81,   100],
       [  121,   144,   169,   196,   225],
       [  256,   289,   324,   361,   400],
       [  441,   484,   529,   576,   625],
       [  676,   729,   784,   841,   900],
       [  961,  1024,  1089,  1156,  1225],
       [ 1296,  1369,  1444,  1521,  1600],
       [ 1681,  1764,  1849,  1936,  2025],
       [ 2116,  2209,  2304,  2401,  2500],
       [ 2601,  2704,  2809,  2916,  3025],
       [ 3136,  3249,  3364,  3481,  3600],
       [ 3721,  3844,  3969,  4096,  4225],
       [ 4356,  4489,  4624,  4761,  4900],
       [ 5041,  5184,  5329,  5476,  5625],
       [ 5776,  5929,  6084,  6241,  6400],
       [ 6561,  6724,  6889,  7056,  7225],
       [ 7396,  7569,  7744,  7921,  8100],
       [ 8281,  8464,  8649,  8836,  9025],
       [ 9216,  9409,  9604,  9801, 10000]])

In [30]:
#take the dot product of mat1 and mat1.T
np.dot(mat1, mat1.T)


Out[30]:
array([[   55,   130,   205,   280,   355,   430,   505,   580,   655,
          730,   805,   880,   955,  1030,  1105,  1180,  1255,  1330,
         1405,  1480],
       [  130,   330,   530,   730,   930,  1130,  1330,  1530,  1730,
         1930,  2130,  2330,  2530,  2730,  2930,  3130,  3330,  3530,
         3730,  3930],
       [  205,   530,   855,  1180,  1505,  1830,  2155,  2480,  2805,
         3130,  3455,  3780,  4105,  4430,  4755,  5080,  5405,  5730,
         6055,  6380],
       [  280,   730,  1180,  1630,  2080,  2530,  2980,  3430,  3880,
         4330,  4780,  5230,  5680,  6130,  6580,  7030,  7480,  7930,
         8380,  8830],
       [  355,   930,  1505,  2080,  2655,  3230,  3805,  4380,  4955,
         5530,  6105,  6680,  7255,  7830,  8405,  8980,  9555, 10130,
        10705, 11280],
       [  430,  1130,  1830,  2530,  3230,  3930,  4630,  5330,  6030,
         6730,  7430,  8130,  8830,  9530, 10230, 10930, 11630, 12330,
        13030, 13730],
       [  505,  1330,  2155,  2980,  3805,  4630,  5455,  6280,  7105,
         7930,  8755,  9580, 10405, 11230, 12055, 12880, 13705, 14530,
        15355, 16180],
       [  580,  1530,  2480,  3430,  4380,  5330,  6280,  7230,  8180,
         9130, 10080, 11030, 11980, 12930, 13880, 14830, 15780, 16730,
        17680, 18630],
       [  655,  1730,  2805,  3880,  4955,  6030,  7105,  8180,  9255,
        10330, 11405, 12480, 13555, 14630, 15705, 16780, 17855, 18930,
        20005, 21080],
       [  730,  1930,  3130,  4330,  5530,  6730,  7930,  9130, 10330,
        11530, 12730, 13930, 15130, 16330, 17530, 18730, 19930, 21130,
        22330, 23530],
       [  805,  2130,  3455,  4780,  6105,  7430,  8755, 10080, 11405,
        12730, 14055, 15380, 16705, 18030, 19355, 20680, 22005, 23330,
        24655, 25980],
       [  880,  2330,  3780,  5230,  6680,  8130,  9580, 11030, 12480,
        13930, 15380, 16830, 18280, 19730, 21180, 22630, 24080, 25530,
        26980, 28430],
       [  955,  2530,  4105,  5680,  7255,  8830, 10405, 11980, 13555,
        15130, 16705, 18280, 19855, 21430, 23005, 24580, 26155, 27730,
        29305, 30880],
       [ 1030,  2730,  4430,  6130,  7830,  9530, 11230, 12930, 14630,
        16330, 18030, 19730, 21430, 23130, 24830, 26530, 28230, 29930,
        31630, 33330],
       [ 1105,  2930,  4755,  6580,  8405, 10230, 12055, 13880, 15705,
        17530, 19355, 21180, 23005, 24830, 26655, 28480, 30305, 32130,
        33955, 35780],
       [ 1180,  3130,  5080,  7030,  8980, 10930, 12880, 14830, 16780,
        18730, 20680, 22630, 24580, 26530, 28480, 30430, 32380, 34330,
        36280, 38230],
       [ 1255,  3330,  5405,  7480,  9555, 11630, 13705, 15780, 17855,
        19930, 22005, 24080, 26155, 28230, 30305, 32380, 34455, 36530,
        38605, 40680],
       [ 1330,  3530,  5730,  7930, 10130, 12330, 14530, 16730, 18930,
        21130, 23330, 25530, 27730, 29930, 32130, 34330, 36530, 38730,
        40930, 43130],
       [ 1405,  3730,  6055,  8380, 10705, 13030, 15355, 17680, 20005,
        22330, 24655, 26980, 29305, 31630, 33955, 36280, 38605, 40930,
        43255, 45580],
       [ 1480,  3930,  6380,  8830, 11280, 13730, 16180, 18630, 21080,
        23530, 25980, 28430, 30880, 33330, 35780, 38230, 40680, 43130,
        45580, 48030]])

Pandas Section


In [31]:
# Create dictionary of test scores
test_dict = {'Corey':[63,75,88], 'Kevin':[48,98,92], 'Akshay': [87, 86, 85]}

In [32]:
# Create DataFrame
df = pd.DataFrame(test_dict)

In [33]:
df


Out[33]:
Corey Kevin Akshay
0 63 48 87
1 75 98 86
2 88 92 85

In [34]:
df.describe()


Out[34]:
Corey Kevin Akshay
count 3.000000 3.000000 3.0
mean 75.333333 79.333333 86.0
std 12.503333 27.300794 1.0
min 63.000000 48.000000 85.0
25% 69.000000 70.000000 85.5
50% 75.000000 92.000000 86.0
75% 81.500000 95.000000 86.5
max 88.000000 98.000000 87.0

In [35]:
# Transpose the DataFrame
df = df.T
df


Out[35]:
0 1 2
Corey 63 75 88
Kevin 48 98 92
Akshay 87 86 85

In [36]:
# Rename the columns
df.columns = ['Quiz_1', 'Quiz_2', 'Quiz_3']
df


Out[36]:
Quiz_1 Quiz_2 Quiz_3
Corey 63 75 88
Kevin 48 98 92
Akshay 87 86 85

In [37]:
# Access first row by index number
df.iloc[0]


Out[37]:
Quiz_1    63
Quiz_2    75
Quiz_3    88
Name: Corey, dtype: int64

In [38]:
df['Quiz_1']


Out[38]:
Corey     63
Kevin     48
Akshay    87
Name: Quiz_1, dtype: int64

In [39]:
# Access first column using dot notation
df.Quiz_1


Out[39]:
Corey     63
Kevin     48
Akshay    87
Name: Quiz_1, dtype: int64

In [40]:
# Limit DataFrame to first 2 rows
df[0:2]


Out[40]:
Quiz_1 Quiz_2 Quiz_3
Corey 63 75 88
Kevin 48 98 92

In [41]:
# Defining a new DataFrame from first 2 rows and last 2 columns 
rows = ['Corey', 'Kevin']
cols = ['Quiz_2', 'Quiz_3']
df_spring = df.loc[rows, cols]
df_spring


Out[41]:
Quiz_2 Quiz_3
Corey 75 88
Kevin 98 92

In [42]:
# Select first 2 rows and last 2 columns using index numbers 
df.iloc[[0,1], [1,2]]


Out[42]:
Quiz_2 Quiz_3
Corey 75 88
Kevin 98 92

In [43]:
# Define new column as mean of other columns
df['Quiz_Avg'] = df.mean(axis=1)
df


Out[43]:
Quiz_1 Quiz_2 Quiz_3 Quiz_Avg
Corey 63 75 88 75.333333
Kevin 48 98 92 79.333333
Akshay 87 86 85 86.000000

In [44]:
# Create a new column
df['Quiz_4'] = [92, 95, 88]
df


Out[44]:
Quiz_1 Quiz_2 Quiz_3 Quiz_Avg Quiz_4
Corey 63 75 88 75.333333 92
Kevin 48 98 92 79.333333 95
Akshay 87 86 85 86.000000 88

In [45]:
# Delete Column
del df['Quiz_Avg']
df


Out[45]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4
Corey 63 75 88 92
Kevin 48 98 92 95
Akshay 87 86 85 88

In [46]:
# Create new DataFrame of one row
df_new = pd.DataFrame({'Quiz_1':[np.NaN], 'Quiz_2':[np.NaN], 'Quiz_3': [np.NaN], 'Quiz_4':[71]}, index=['Adrian'])

In [47]:
df_new


Out[47]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4
Adrian NaN NaN NaN 71

In [48]:
# Concatenate DataFrames
df = pd.concat([df, df_new])
# Display new DataFrame
df


Out[48]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4
Corey 63.0 75.0 88.0 92
Kevin 48.0 98.0 92.0 95
Akshay 87.0 86.0 85.0 88
Adrian NaN NaN NaN 71

In [49]:
df['Quiz_Avg'] = df.mean(axis=1, skipna=True)
df


Out[49]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4 Quiz_Avg
Corey 63.0 75.0 88.0 92 79.50
Kevin 48.0 98.0 92.0 95 83.25
Akshay 87.0 86.0 85.0 88 86.50
Adrian NaN NaN NaN 71 71.00

In [50]:
df.Quiz_4.astype(float)


Out[50]:
Corey     92.0
Kevin     95.0
Akshay    88.0
Adrian    71.0
Name: Quiz_4, dtype: float64

In [51]:
df


Out[51]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4 Quiz_Avg
Corey 63.0 75.0 88.0 92 79.50
Kevin 48.0 98.0 92.0 95 83.25
Akshay 87.0 86.0 85.0 88 86.50
Adrian NaN NaN NaN 71 71.00

In [52]:
housing_df = pd.read_csv('data/HousingData.csv')

In [53]:
housing_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       486 non-null float64
ZN         486 non-null float64
INDUS      486 non-null float64
CHAS       486 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        486 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      486 non-null float64
MEDV       506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB

In [54]:
housing_df.describe()


Out[54]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
count 486.000000 486.000000 486.000000 486.000000 506.000000 506.000000 486.000000 506.000000 506.000000 506.000000 506.000000 506.000000 486.000000 506.000000
mean 3.611874 11.211934 11.083992 0.069959 0.554695 6.284634 68.518519 3.795043 9.549407 408.237154 18.455534 356.674032 12.715432 22.532806
std 8.720192 23.388876 6.835896 0.255340 0.115878 0.702617 27.999513 2.105710 8.707259 168.537116 2.164946 91.294864 7.155871 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.081900 0.000000 5.190000 0.000000 0.449000 5.885500 45.175000 2.100175 4.000000 279.000000 17.400000 375.377500 7.125000 17.025000
50% 0.253715 0.000000 9.690000 0.000000 0.538000 6.208500 76.800000 3.207450 5.000000 330.000000 19.050000 391.440000 11.430000 21.200000
75% 3.560262 12.500000 18.100000 0.000000 0.624000 6.623500 93.975000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

In [55]:
housing_df.head()


Out[55]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 NaN 36.2

In [56]:
housing_df.shape


Out[56]:
(506, 14)

In [57]:
housing_df.isnull().any()


Out[57]:
CRIM        True
ZN          True
INDUS       True
CHAS        True
NOX        False
RM         False
AGE         True
DIS        False
RAD        False
TAX        False
PTRATIO    False
B          False
LSTAT       True
MEDV       False
dtype: bool

In [58]:
# show some records with NULL values
housing_df.loc[:5, housing_df.isnull().any()]


Out[58]:
CRIM ZN INDUS CHAS AGE LSTAT
0 0.00632 18.0 2.31 0.0 65.2 4.98
1 0.02731 0.0 7.07 0.0 78.9 9.14
2 0.02729 0.0 7.07 0.0 61.1 4.03
3 0.03237 0.0 2.18 0.0 45.8 2.94
4 0.06905 0.0 2.18 0.0 54.2 NaN
5 0.02985 0.0 2.18 0.0 58.7 5.21

In [59]:
# Data Description of columns with any Null values
housing_df.loc[:, housing_df.isnull().any()].describe()


Out[59]:
CRIM ZN INDUS CHAS AGE LSTAT
count 486.000000 486.000000 486.000000 486.000000 486.000000 486.000000
mean 3.611874 11.211934 11.083992 0.069959 68.518519 12.715432
std 8.720192 23.388876 6.835896 0.255340 27.999513 7.155871
min 0.006320 0.000000 0.460000 0.000000 2.900000 1.730000
25% 0.081900 0.000000 5.190000 0.000000 45.175000 7.125000
50% 0.253715 0.000000 9.690000 0.000000 76.800000 11.430000
75% 3.560262 12.500000 18.100000 0.000000 93.975000 16.955000
max 88.976200 100.000000 27.740000 1.000000 100.000000 37.970000
The choice depends on what you ultimately want to do with the data. If the goal is a straightforward data analysis, eliminating the rows with null values is worth consideration. However, if the goal is to use machine learning to predict data, then perhaps more is to be gained by changing the null values to a suitable replacement. It's impossible to know in advance.

In [61]:
# Replace Null values in the data
housing_df['AGE'] = housing_df['AGE'].fillna(housing_df.mean())

In [62]:
housing_df['CHAS'] = housing_df['CHAS'].fillna(0)

In [63]:
housing_df = housing_df.fillna(housing_df.median())

In [64]:
housing_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB

VISUALIZATION


In [65]:
import matplotlib.pyplot as plt
%matplotlib inline

In [66]:
# !pip uninstall seaborn -y
import sys
sys.version


Out[66]:
'3.7.6 (default, Dec 30 2019, 19:38:26) \n[Clang 11.0.0 (clang-1100.0.33.16)]'

In [67]:
# !pip3 install seaborn --upgrade
import seaborn as sns
sns.set()

In [68]:
plt.hist(housing_df['MEDV'])
plt.show()



In [69]:
plt.hist(housing_df['MEDV'])
plt.title('Median Boston Housing Prices')
plt.xlabel('1980 Median Value in Thousands')
plt.ylabel('Count')
plt.show()



In [70]:
housing_df.columns


Out[70]:
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [71]:
# Tweak more
title = 'Median Boston Housing Prices'
plt.figure(figsize=(10,6))
plt.hist(housing_df['MEDV'])
plt.title(title, fontsize=15)
plt.xlabel('1980 Median Value in Thousands')
plt.ylabel('Count')
plt.savefig(title, dpi=300)
plt.show()



In [72]:
def my_hist(column, title, xlab, ylab):
    title = title
    plt.figure(figsize=(10,6))
    plt.hist(column)
    plt.title(title, fontsize=15)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.savefig(title, dpi=300)
    plt.show()

In [73]:
my_hist(housing_df['RM'], 'Average Number of Rooms in Boston Households', 'Average Number of Rooms', 'Count')



In [74]:
# Optimizing solution
def my_hist(column, title, xlab, ylab, bins=10, alpha=0.7, color='c'):
    title = title
    plt.figure(figsize=(10,6))
    plt.hist(column, bins=bins, range=(3,9), alpha=alpha, color=color)
    plt.title(title, fontsize=15)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.savefig(title, dpi=300)
    plt.show()

In [75]:
my_hist(housing_df['RM'], 'Average Number of Rooms in Boston Households', 'Average Number of Rooms', 'Count')



In [76]:
# SCATTER PLOTS
x = housing_df['RM']
y = housing_df['MEDV']
plt.scatter(x, y) 
plt.show()



In [77]:
# CORRELATION - Data points Correlation
housing_df.corr()


Out[77]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
CRIM 1.000000 -0.185359 0.392632 -0.055585 0.410971 -0.220045 0.343427 -0.366025 0.601224 0.560469 0.277964 -0.365336 0.437417 -0.383895
ZN -0.185359 1.000000 -0.507304 -0.032992 -0.498619 0.312295 -0.535341 0.632428 -0.300061 -0.304385 -0.394622 0.170125 -0.398838 0.362292
INDUS 0.392632 -0.507304 1.000000 0.054693 0.738387 -0.377978 0.614248 -0.698621 0.592735 0.716267 0.385366 -0.354840 0.564508 -0.476394
CHAS -0.055585 -0.032992 0.054693 1.000000 0.070867 0.106797 0.074984 -0.092318 -0.003339 -0.035822 -0.109451 0.050608 -0.047279 0.183844
NOX 0.410971 -0.498619 0.738387 0.070867 1.000000 -0.302188 0.711864 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.573040 -0.427321
RM -0.220045 0.312295 -0.377978 0.106797 -0.302188 1.000000 -0.239518 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.604323 0.695360
AGE 0.343427 -0.535341 0.614248 0.074984 0.711864 -0.239518 1.000000 -0.724354 0.447088 0.498408 0.261826 -0.268029 0.575022 -0.377572
DIS -0.366025 0.632428 -0.698621 -0.092318 -0.769230 0.205246 -0.724354 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.483244 0.249929
RAD 0.601224 -0.300061 0.592735 -0.003339 0.611441 -0.209847 0.447088 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.467765 -0.381626
TAX 0.560469 -0.304385 0.716267 -0.035822 0.668023 -0.292048 0.498408 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.524156 -0.468536
PTRATIO 0.277964 -0.394622 0.385366 -0.109451 0.188933 -0.355501 0.261826 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.370727 -0.507787
B -0.365336 0.170125 -0.354840 0.050608 -0.380051 0.128069 -0.268029 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.370993 0.333461
LSTAT 0.437417 -0.398838 0.564508 -0.047279 0.573040 -0.604323 0.575022 -0.483244 0.467765 0.524156 0.370727 -0.370993 1.000000 -0.723093
MEDV -0.383895 0.362292 -0.476394 0.183844 -0.427321 0.695360 -0.377572 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.723093 1.000000

In [78]:
# Using Seaborn to visualize the Correlation
corr = housing_df.corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr, xticklabels=corr.columns.values, 
yticklabels=corr.columns.values, cmap="Blues", linewidths=1.25, alpha=0.8)
plt.show()



In [79]:
# REGRESSION
plt.figure(figsize=(13, 8))
sns.regplot(x,y)
plt.show()


/usr/local/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

In [85]:
# More details on Regression line
import statsmodels.api as sm
X = sm.add_constant(x)
model = sm.OLS(y, X)
est = model.fit()
print(est.summary())
# The strangest part of the code is adding the constant. This is basically the y-intercept. 
# When the constant is not added, the y-intercept is 0. In our case, it makes sense that 
# the y-intercept would be 0: if there are 0 rooms, the house should have no value. In 
# general, however, it's a good idea to keep a y-intercept, and it's the default choice of 
# the preceding Seaborn graph.


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.484
Model:                            OLS   Adj. R-squared:                  0.483
Method:                 Least Squares   F-statistic:                     471.8
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           2.49e-74
Time:                        13:12:23   Log-Likelihood:                -1673.1
No. Observations:                 506   AIC:                             3350.
Df Residuals:                     504   BIC:                             3359.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -34.6706      2.650    -13.084      0.000     -39.877     -29.465
RM             9.1021      0.419     21.722      0.000       8.279       9.925
==============================================================================
Omnibus:                      102.585   Durbin-Watson:                   0.684
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              612.449
Skew:                           0.726   Prob(JB):                    1.02e-133
Kurtosis:                       8.190   Cond. No.                         58.4
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

There's a lot of important information in this table. The first is the value of R^2. This suggests that 48% of the data can be explained by the regression line. The second is the coefficient constant of -34.6706. This is the y-intercept. The third is the RM coefficient of 9.1021. This suggests that for every one-bedroom increase, the value of the house increased by 9,102. (Keep in mind that this is from 1980.)

The standard error suggests how far off the actual values are from the line on average, and the numbers underneath the [0.025 0.975] column give the 95% Confidence Interval of the value, meaning statsmodel is 95% confident that the true increase in the value of the average house for every one-bedroom increase is between 8,279 and 9,925.


In [ ]:
# !pip3 uninstall statsmodels #--upgrade
sys.version

In [89]:
# BOX PLOTS
plt.figure(figsize=(12, 8))
x = housing_df['RM']
y = housing_df['MEDV']
plt.boxplot(x)
plt.show()



In [90]:
# VIOLIN PLOTS
plt.figure(figsize=(10,8))
plt.violinplot(x)
plt.show()



In [91]:
# TASK:  Salary Visualization
uk_salary_df = pd.read_csv('data/UKStatistics.csv')

In [92]:
uk_salary_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 19 columns):
Post Unique Reference              51 non-null object
Name                               51 non-null object
Grade (or equivalent)              51 non-null object
Job Title                          51 non-null object
Job/Team Function                  49 non-null object
Parent Department                  51 non-null object
Organisation                       51 non-null object
Unit                               49 non-null object
Contact Phone                      46 non-null object
Contact E-mail                     49 non-null object
Reports to Senior Post             51 non-null object
Salary Cost of Reports (£)         51 non-null int64
FTE                                51 non-null float64
Actual Pay Floor (£)               51 non-null int64
Actual Pay Ceiling (£)             51 non-null int64
Unnamed: 15                        0 non-null float64
Professional/Occupational Group    51 non-null object
Notes                              27 non-null object
Valid?                             51 non-null int64
dtypes: float64(2), int64(4), object(13)
memory usage: 7.6+ KB

In [94]:
# GATHER NULL INFO IN DATASET
# housing_df.loc[:, housing_df.isnull().any()].describe()
uk_salary_df.loc[:, uk_salary_df.isnull().any()].describe()


Out[94]:
Unnamed: 15
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN

In [100]:
# uk_salary_df.head()
uk_salary_df['Unnamed: 15'].dropna()
# uk_salary_df.columns


Out[100]:
Series([], Name: Unnamed: 15, dtype: float64)

In [102]:
del uk_salary_df['Unnamed: 15']

In [103]:
uk_salary_df.columns


Out[103]:
Index(['Post Unique Reference', 'Name', 'Grade (or equivalent)', 'Job Title',
       'Job/Team Function', 'Parent Department', 'Organisation', 'Unit',
       'Contact Phone', 'Contact E-mail', 'Reports to Senior Post',
       'Salary Cost of Reports (£)', 'FTE', 'Actual Pay Floor (£)',
       'Actual Pay Ceiling (£)', 'Professional/Occupational Group', 'Notes',
       'Valid?'],
      dtype='object')

In [107]:
# Histogram: Actual Pay Floor (£) [def my_hist(column, title, xlab, ylab, bins=10, alpha=0.7, color='c'):]
my_hist("Actual Pay Floor (£)", "UK Actual Pay", "pay", "level", bins=7, alpha=0.8, color='b')



In [106]:
title = 'UK Salary'
plt.figure(figsize=(10,6))
plt.hist(uk_salary_df['Actual Pay Floor (£)'])
plt.title(title, fontsize=15)
plt.xlabel('Salary in Thousands')
plt.ylabel('Level')
plt.savefig(title, dpi=300)
plt.show()



In [112]:
# scatter plot using the values for x as Salary Cost of Reports (£) and y as Actual Pay Floor (£).
plt.figure(figsize=(14,8))
x = uk_salary_df['Salary Cost of Reports (£)'].map(lambda x: x * 1000)
y = uk_salary_df['Actual Pay Floor (£)']
plt.scatter(x, y) 
plt.show()



In [113]:
# BOX PLOT
plt.figure(figsize=(14,8))
plt.boxplot(x)
plt.show()



In [114]:
plt.figure(figsize=(14,8))
plt.boxplot(y)
plt.show()



In [115]:
uk_salary_df.describe()


Out[115]:
Salary Cost of Reports (£) FTE Actual Pay Floor (£) Actual Pay Ceiling (£) Valid?
count 5.100000e+01 51.000000 51.000000 51.000000 51.0
mean 1.651590e+06 0.982157 22745.098039 23823.313725 1.0
std 2.656579e+06 0.101771 45114.451404 47133.322796 0.0
min 0.000000e+00 0.300000 0.000000 0.000000 1.0
25% 0.000000e+00 1.000000 0.000000 0.000000 1.0
50% 1.105736e+06 1.000000 0.000000 0.000000 1.0
75% 2.175629e+06 1.000000 0.000000 0.000000 1.0
max 1.672047e+07 1.000000 150000.000000 154999.000000 1.0

In [116]:
uk_salary_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 18 columns):
Post Unique Reference              51 non-null object
Name                               51 non-null object
Grade (or equivalent)              51 non-null object
Job Title                          51 non-null object
Job/Team Function                  49 non-null object
Parent Department                  51 non-null object
Organisation                       51 non-null object
Unit                               49 non-null object
Contact Phone                      46 non-null object
Contact E-mail                     49 non-null object
Reports to Senior Post             51 non-null object
Salary Cost of Reports (£)         51 non-null int64
FTE                                51 non-null float64
Actual Pay Floor (£)               51 non-null int64
Actual Pay Ceiling (£)             51 non-null int64
Professional/Occupational Group    51 non-null object
Notes                              27 non-null object
Valid?                             51 non-null int64
dtypes: float64(1), int64(4), object(13)
memory usage: 7.2+ KB

In [ ]: