Content and Objective

  • Show behavior, correlation, and pdfs (probability density functions) of an imported data set.
  • Pandas is being used since this significantly simplifies importing, extracting, and dealing with data frames.
  • Even if you are not interested in pandas itself, it should still be possible to understand the operations.

Import


In [1]:
# importing
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import matplotlib

# showing figures inline
%matplotlib inline

In [2]:
# global plotting defaults

# font size used for all text in the figures
font = dict( size=20 )
plt.rc( 'font', **font )

# let LaTeX typeset the text in the figures
plt.rc( 'text', usetex=True )

# default figure size (width, height)
plt.rc( 'figure', figsize=(18, 6) )

Importing Data

NOTE: file "Umfragedaten_v1_an.xlsx" required; can be found at http://wikis.fu-berlin.de/pages/viewpage.action?pageId=696156185

NOTE 2: Importing and dealing with data may be relevant if you are interested in this kind of work. Otherwise you may skip those lines.

NOTE 3: The following lines use "pandas", a Python module for dealing with data frames.


In [3]:
# read the survey spreadsheet into a pandas data frame
# (the relative file name is resolved against the current working directory)
data_file = 'Umfragedaten_v1_an.xlsx'
df = pd.read_excel( data_file )

In [4]:
# keep only the columns used below (gender, size, weight) and
# discard every row in which at least one of these entries is missing
size_weight = df[ ['GESCHL', 'GRO', 'GEW'] ].dropna()

# size and weight as plain numpy arrays for the numerical parts below
size = size_weight.GRO.values
weight = size_weight.GEW.values

In [5]:
# NOTE: Finding least squares solution for linear regression,
#       which is not discussed in this lecture

# model matrix with rows (s_i, 1), so that S.dot( (a, b) ) approximates the weights
S = np.column_stack( ( size, np.ones( len( size ) ) ) )

# least squares parameters (a, b), obtained by applying the pseudo-inverse of S
params = np.linalg.pinv( S ).dot( weight )

# values of the regression line a * s + b at the observed sizes
regression = params[0] * size + params[1]

Plotting Data


In [6]:
# plotting: point cloud with regression line and histograms along both axes

# axis ranges; the histogram bars start at the lower end of each range
s_limits = (120, 220)
w_limits = (0, 220)

# point cloud of (size,weight) pairs
plt.plot( size, weight, '.', alpha=.3, ms=24, mew=2.0 )

# linear regression
plt.plot( size, regression, linewidth=2.0, color='r' )

# histograms on x- and y-axis
bins = 20

# weight histogram as horizontal bars attached to the left border;
# the largest bar is scaled to a length of 20 units of the size axis.
# align='edge' is required since the left bin edges are passed: with the
# default (centered bars) the histogram would be shifted by half a bin.
w_density, w_edges = np.histogram( weight, bins=bins, density=True )
plt.barh( w_edges[:-1], w_density / np.max( w_density ) * 20, np.diff( w_edges ),
          left=s_limits[0], align='edge', color='#ff7f0e' )

# size histogram as vertical bars attached to the bottom border;
# the largest bar is scaled to a height of 30 units of the weight axis
s_density, s_edges = np.histogram( size, bins=bins, density=True )
plt.bar( s_edges[:-1], s_density / np.max( s_density ) * 30, np.diff( s_edges ),
         align='edge', color='#ff7f0e' )

# axes and stuff
# (raw strings, since '\m' is not a valid escape sequence in a normal string)
plt.grid( True )
plt.xlabel( r'$s/\mathrm{cm}$' )
plt.ylabel( r'$w/\mathrm{kg}$' )

plt.xlim( s_limits )
plt.ylim( w_limits )


Out[6]:
(0, 220)

In [7]:
# split the cleaned data into men and women by means of the gender column
is_male = size_weight[ 'GESCHL' ] == 'MAENNLICH'
is_female = size_weight[ 'GESCHL' ] == 'WEIBLICH'

size_weight_m = size_weight[ is_male ]
size_weight_w = size_weight[ is_female ]

# size and weight of each group as numpy arrays
size_w, weight_w = size_weight_w.GRO.values, size_weight_w.GEW.values
size_m, weight_m = size_weight_m.GRO.values, size_weight_m.GEW.values

In [8]:
# plotting: point clouds with histograms along both axes for women and men

def plot_cloud_with_histograms( sizes, weights, title, bins=20 ):
    """
    Draw the (size, weight) point cloud of one group into the current axes
    and attach the histograms of both quantities to the axes.

    The weight histogram is drawn as horizontal bars starting at s = 120,
    the size histogram as vertical bars starting at w = 0. Both histograms
    are scaled such that their largest bar has a length of 20 and 30 axis
    units, respectively. Title, grid and axis labels are set as well;
    the axis limits are left to the caller.

    IN: sizes, weights (arrays of equal length), title of the plot,
        number of histogram bins
    """

    # point cloud of (size,weight) pairs
    plt.plot( sizes, weights, '.', alpha=.3, ms=24, mew=2.0 )

    # histogram of the weights on the y-axis
    # NOTE: align='edge' is required since the left bin edges are passed;
    #       centered bars (the default) would be shifted by half a bin
    w_density, w_edges = np.histogram( weights, bins=bins, density=True )
    plt.barh( w_edges[:-1], w_density / np.max( w_density ) * 20, np.diff( w_edges ),
              left=120, align='edge', color='#ff7f0e' )

    # histogram of the sizes on the x-axis
    s_density, s_edges = np.histogram( sizes, bins=bins, density=True )
    plt.bar( s_edges[:-1], s_density / np.max( s_density ) * 30, np.diff( s_edges ),
             align='edge', color='#ff7f0e' )

    # axes and stuff
    # (raw strings, since '\m' is not a valid escape sequence in a normal string)
    plt.title( title )
    plt.grid( True )
    plt.xlabel( r'$s/\mathrm{cm}$' )
    plt.ylabel( r'$w/\mathrm{kg}$' )


# number of bins of the histograms
bins = 20

# women on the left
plt.subplot(121)
plot_cloud_with_histograms( size_w, weight_w, 'Women', bins )
plt.xlim( (120, 220 ) )
plt.ylim( (0, 200 ) )

# now men
plt.subplot(122)
plot_cloud_with_histograms( size_m, weight_m, 'Men', bins )
plt.xlim( (120, 220 ) )
plt.ylim( (0, 200 ) )


Out[8]:
(0, 200)

Get Marginal and Conditional PDFs and Plot


In [9]:
# group the weights into three classes according to the corresponding size
# (boolean masks select the entries; the results are kept as lists)
weight_160 = list( weight[ size <= 160 ] )
weight_160_180 = list( weight[ ( size > 160 ) & ( size <= 180 ) ] )
weight_180_ = list( weight[ size > 180 ] )

In [10]:
# plotting: histogram of all weights and histograms of the weights
# restricted to the three size classes

def plot_weight_histogram( position, weights, w_range, bins ):
    """
    Draw the normalized histogram of the given weights into one subplot.

    All panels use the same x-range and y-range, so that they can be
    compared directly. The title is left to the caller.

    IN: position of the subplot (e.g., 141), weights to be shown,
        (min, max) of the x-axis, number of histogram bins
    """
    plt.subplot( position )
    plt.hist( weights, bins=bins, color='#ff7f0e', density=True )

    plt.grid( True )
    plt.xlim( w_range )
    plt.ylim( (0, .1 ) )
    # (raw string, since '\m' is not a valid escape sequence in a normal string)
    plt.xlabel( r'$w/\mathrm{kg}$' )


# common x-range: smallest and largest weight of the complete data set
w_range = ( np.min( weight ), np.max( weight ) )

# the index of H is the number of data sets contributing to the histogram
plot_weight_histogram( 141, weight, w_range, bins )
plt.title( r'$H_{{{}}}(w)$'.format( len( size ) ) )

# NOTE: titles state the intervals exactly as selected above,
#       i.e., s <= 160 and 160 < s <= 180
plot_weight_histogram( 142, weight_160, w_range, bins )
plt.title( r'$H_{{{}}}(w|s\leq 160)$'.format( len( weight_160 ) ) )

plot_weight_histogram( 143, weight_160_180, w_range, bins )
plt.title( r'$H_{{{}}}(w|s\in(160,180])$'.format( len( weight_160_180 ) ) )

plot_weight_histogram( 144, weight_180_, w_range, bins )
plt.title( r'$H_{{{}}}(w| s>180 )$'.format( len( weight_180_ ) ) )


Out[10]:
Text(0.5, 1.0, '$H_{653}(w| s>180 )$')

Printing Some Numbers


In [11]:
# output various numbers

# sizes of the complete data set and of the three size classes
n_all = len( weight )
n_160 = len( weight_160 )
n_160_180 = len( weight_160_180 )
n_180_ = len( weight_180_ )

print('Number of data sets: \t\t\t\t{}'.format( n_all ) )
print('Number of data sets with s <= 160: \t\t{}'.format( n_160 ) )
print('Number of data sets with 160 < s <= 180: \t{}'.format( n_160_180 ) )
print('Number of data sets with s > 180: \t\t{}\n'.format( n_180_ ) )

print('----------')

print('Notation: S = Size; W = Weight\n')

# mean (E) and standard deviation (D) of size and weight
mean_s, std_s = np.mean( size ), np.std( size )
mean_w, std_w = np.mean( weight ), np.std( weight )

print('E( S ) = {:2.2f} cm'.format( mean_s ) )
print('D( S ) = {:2.2f} cm\n'.format( std_s ) )

print('E( W ) = {:2.2f} kg'.format( mean_w ) )
print('D( W ) = {:2.2f} kg\n'.format( std_w ) )

# mean weight within each of the three size classes
mean_w_160 = np.mean( weight_160 )
mean_w_160_180 = np.mean( weight_160_180 )
mean_w_180_ = np.mean( weight_180_ )

print('E( W | S <= 160 ) = \t\t{:2.2f} kg'.format( mean_w_160 ) )
print('E( W | 160 < S <= 180 ) = \t{:2.2f} kg'.format( mean_w_160_180 ) )
print('E( W | S > 180 ) = \t\t{:2.2f} kg\n'.format( mean_w_180_ ) )


print('----------')

# print the least squares parameters of the linear regression determined above
a, b = params[0], params[1]
print('Parameter estimation in linear model w = a s + b:   a = {:2.2f} kg/cm, b = {:2.2f} kg'.format( a, b ) )


Number of data sets: 				3424
Number of data sets with s <= 160: 		418
Number of data sets with 160 < s <= 180: 	2353
Number of data sets with s > 180: 		653

----------
Notation: S = Size; W = Weight

E( S ) = 172.66 cm
D( S ) = 9.34 cm

E( W ) = 78.36 kg
D( W ) = 16.62 kg

E( W | S <= 160 ) = 		67.54 kg
E( W | 160 < S <= 180 ) = 	76.43 kg
E( W | S > 180 ) = 		92.25 kg

----------
Parameter estimation in linear model w = a s + b:   a = 0.93 kg/cm, b = -82.57 kg

Find and Print Correlation Coefficient


In [12]:
# correlation matrices of (size, weight) for all data, for women and for men;
# the off-diagonal entry [0,1] is the correlation coefficient of the pair
rho = np.corrcoef( size, weight )
rho_w = np.corrcoef( size_w, weight_w )
rho_m = np.corrcoef( size_m, weight_m )

print('Correlation coefficient: {:2.4f}\n'.format( rho[0,1] ) )
print('Correlation coefficient women: {:2.4f}\n'.format( rho_w[0,1] ) )
print('Correlation coefficient men: {:2.4f}\n'.format( rho_m[0,1] ) )


Correlation coefficient: 0.5239

Correlation coefficient women: 0.2472

Correlation coefficient men: 0.4074

Playground for Experiments

You may, if you like, try to plot/identify dependencies in other pairs of data


In [ ]: