In [1]:
# Importing the required libraries
# Note %matplotlib inline works only for ipython notebook. It will not work for PyCharm. It is used to show the plot distributions
# Make sure to put %matplotlib inline as the first line of code when visualising plots. Also in pyCharm IDE use plt.show() to see the plot
%matplotlib inline 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api
import statsmodels.formula.api as smf
sns.set(color_codes=True)

# Loading the data
gapURL='https://raw.githubusercontent.com/duttashi/Data-Analysis-Visualization/master/gapminder.csv'
data=pd.read_csv(gapURL)


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [3]:
# setting variables that you will be working with to numeric
data['breastcancerper100th']= data['breastcancerper100th'].convert_objects(convert_numeric=True)
data['femaleemployrate']= data['femaleemployrate'].convert_objects(convert_numeric=True)
data['alcconsumption']= data['alcconsumption'].convert_objects(convert_numeric=True)
# shows the number of rows and columns
print (len(data)) 
print (len(data.columns))
print (len(data.index))


213
16
213

In [4]:
# Print the column headers/headings
names=data.columns.values
print names


['country' 'incomeperperson' 'alcconsumption' 'armedforcesrate'
 'breastcancerper100th' 'co2emissions' 'femaleemployrate' 'hivrate'
 'internetuserate' 'lifeexpectancy' 'oilperperson' 'polityscore'
 'relectricperperson' 'suicideper100th' 'employrate' 'urbanrate']

In [5]:
# using the describe function to get the standard deviation and other descriptive statistics of our variables
desc1=data['breastcancerper100th'].describe()
desc2=data['femaleemployrate'].describe()
desc3=data['alcconsumption'].describe()
print "\nBreast Cancer per 100th person\n", desc1
print "\nfemale employ rate\n", desc2
print "\nAlcohol consumption in litres\n", desc3


Breast Cancer per 100th person
count    173.000000
mean      37.402890
std       22.697901
min        3.900000
25%       20.600000
50%       30.000000
75%       50.300000
max      101.100000
Name: breastcancerper100th, dtype: float64

female employ rate
count    178.000000
mean      47.549438
std       14.625743
min       11.300000
25%       38.725000
50%       47.549999
75%       55.875000
max       83.300003
Name: femaleemployrate, dtype: float64

Alcohol consumption in litres
count    187.000000
mean       6.689412
std        4.899617
min        0.030000
25%        2.625000
50%        5.920000
75%        9.925000
max       23.010000
Name: alcconsumption, dtype: float64

In [6]:
data.describe()
# Show the frequency distribution
print "\nAlcohol Consumption\nFrequency Distribution (in %)"
c1=data['alcconsumption'].value_counts(sort=False,dropna=False)
print c1
print "\nBreast Cancer per 100th"
c2=data['breastcancerper100th'].value_counts(sort=False)
print c2
print "\nFemale Employee Rate"
c3=data['femaleemployrate'].value_counts(sort=False)
print c3


Alcohol Consumption
Frequency Distribution (in %)
NaN       26
 10.20     1
 5.25      1
 9.75      1
 0.50      1
 9.50      1
 3.90      1
 5.05      1
 0.96      1
 3.61      1
 7.29      1
 7.38      1
 7.60      1
 15.00     1
 1.24      1
...
10.08    1
6.66     1
4.81     1
14.43    1
0.65     1
0.92     1
1.92     1
4.71     1
9.46     1
10.41    1
12.09    1
9.48     1
1.03     1
3.64     1
3.11     1
Length: 181, dtype: int64

Breast Cancer per 100th
23.5    2
70.5    1
31.5    1
62.5    1
19.5    6
21.5    1
16.5    3
29.5    1
43.5    1
52.5    1
38.5    1
82.5    1
10.5    1
22.5    2
55.5    1
...
17.3    2
10.3    1
18.8    1
76.1    1
91.9    2
49.6    1
88.7    1
31.2    3
18.7    1
74.8    1
10.9    1
16.2    1
86.7    1
20.4    2
24.1    1
Length: 136, dtype: int64

Female Employee Rate
42.099998    4
55.500000    1
35.500000    1
40.500000    1
45.500000    1
48.500000    1
47.500000    1
57.500000    1
48.599998    1
66.500000    1
36.500000    1
39.900002    1
80.500000    1
13.000000    1
50.500000    1
...
32.299999    1
39.599998    3
56.200001    1
22.600000    1
64.099998    1
54.299999    1
79.199997    1
54.900002    1
37.900002    1
69.400002    1
45.799999    1
35.799999    1
53.099998    1
59.299999    1
58.900002    1
Length: 153, dtype: int64

In [8]:
# Show the frequency distribution of the quantitative variable using the groupby function
ac1=data.groupby('alcconsumption').size()
print "ac1\n",ac1
# Creating a subset of the data
sub1=data[(data['femaleemployrate']>40) & (data['alcconsumption']>=20)& (data['breastcancerper100th']<50)]
# creating a copy of the subset. This copy will be used for subsequent analysis
sub2=sub1.copy()
print "\nContries where Female Employee Rate is greater than 40 &" \
      " Alcohol Consumption is greater than 20L & new breast cancer cases reported are less than 50\n"
print sub2
print "\nContries where Female Employee Rate is greater than 50 &" \
      " Alcohol Consumption is greater than 10L & new breast cancer cases reported are greater than 70\n"
sub3=data[(data['alcconsumption']>10)&(data['breastcancerper100th']>70)&(data['femaleemployrate']>50)]
print sub3


ac1
alcconsumption
0.03              1
0.05              1
0.10              2
0.11              1
0.17              1
0.20              1
0.28              1
0.32              1
0.34              2
0.47              1
0.50              1
0.51              1
0.52              1
0.54              1
0.56              1
...
14.43             1
14.92             1
14.94             1
15.00             1
16.12             1
16.15             1
16.23             1
16.30             1
16.40             1
16.47             1
17.24             1
17.47             1
18.85             1
19.15             1
23.01             1
Length: 180, dtype: int64

Contries where Female Employee Rate is greater than 40 & Alcohol Consumption is greater than 20L & new breast cancer cases reported are less than 50

     country   incomeperperson  alcconsumption armedforcesrate  \
126  Moldova  595.874534521728           23.01        .5415062   

     breastcancerper100th      co2emissions  femaleemployrate hivrate  \
126                  49.6  149904333.333333         43.599998      .4   

     internetuserate lifeexpectancy oilperperson polityscore  \
126  40.122234699607         69.317                        8   

    relectricperperson suicideper100th        employrate urbanrate  
126   304.940114846777        15.53849  44.2999992370606     41.76  

Contries where Female Employee Rate is greater than 50 & Alcohol Consumption is greater than 10L & new breast cancer cases reported are greater than 70

            country   incomeperperson  alcconsumption armedforcesrate  \
9         Australia    25249.98606148           10.21        .4862799   
32           Canada  25575.3526227298           10.20        .3429763   
50          Denmark  30532.2770436986           12.02        1.012373   
63          Finland   27110.731590755           13.10       1.1774157   
90          Ireland  27595.0913472761           14.92        .4500244   
185     Switzerland   37662.751249706           11.41          .52422   
202  United Kingdom  28033.4892827622           13.24        .5080181   

     breastcancerper100th      co2emissions  femaleemployrate hivrate  \
9                    83.2  12970092666.6667         54.599998      .1   
32                   84.3  24979045666.6667         58.900002      .2   
50                   88.7  3503877666.66667         58.099998      .2   
63                   84.7  2420300666.66667         53.400002      .1   
90                   74.9  1633778666.66667         51.000000      .2   
185                  81.7  2406741333.33333         57.000000      .4   
202                  87.2  72524250333.3333         53.099998      .2   

      internetuserate lifeexpectancy      oilperperson polityscore  \
9    75.8956537961344         81.907  1.91302610912404          10   
32   81.3383926859286         81.012  3.00735585130469          10   
50   88.7702538741662         78.826  1.56752746145954          10   
63     86.89884450783         79.977  1.93865426822699          10   
90   69.7703944134078         80.557  1.70026175082217          10   
185   82.166659877332         82.338  1.48741218722918          10   
202  84.7317047499679          80.17  1.18802809420466          10   

    relectricperperson  suicideper100th        employrate urbanrate  
9     2825.39109539914  8.4700301251191              61.5     88.74  
32    4772.37064777813         10.10099              63.5      80.4  
50     1884.2993420087         8.973104  63.0999984741211     86.68  
63    4036.95399341322         16.23437  57.2000007629394      63.3  
90    2051.80233770977         10.36507  59.9000015258789     61.34  
185   2361.03333632306         13.23981  64.3000030517578     73.48  
202   1933.94561510918         6.014659  59.2999992370606     89.94  

In [9]:
# Checking for missing values in the data row-wise 

print "Missing data rows count: ",sum([True for idx,row in data.iterrows() if any(row.isnull())])
# Checking for missing values in the data column-wise

print "Showing missing data coulmn-wise"
print data.isnull().sum()
# Create a copy of the original dataset as sub4 by using the copy() method
sub4=data.copy()
# Now showing the count of null values in the variables
print sub4.isnull().sum()
# Since the data is all continuous variables therefore the use the mean() for missing value imputation 
# if dealing with categorical data, than use the mode() for missing value imputation

sub4.fillna(sub4['breastcancerper100th'].mean(), inplace=True)
sub4.fillna(sub4['femaleemployrate'].mean(), inplace=True)
sub4.fillna(sub4['alcconsumption'].mean(), inplace=True)
# Showing the count of null values after imputation
print sub4.isnull().sum()
# categorize quantitative variable based on customized splits using the cut function
sub4['alco']=pd.qcut(sub4.alcconsumption,6,labels=["0","1-4","5-9","10-14","15-19","20-24"])
sub4['brst']=pd.qcut(sub4.breastcancerper100th,5,labels=["1-20","21-40","41-60","61-80","81-90"])
sub4['emply']=pd.qcut(sub4.femaleemployrate,4,labels=["30-39","40-59","60-79","80-90"])


Missing data rows count:  48
Showing missing data coulmn-wise
country                  0
incomeperperson          0
alcconsumption          26
armedforcesrate          0
breastcancerper100th    40
co2emissions             0
femaleemployrate        35
hivrate                  0
internetuserate          0
lifeexpectancy           0
oilperperson             0
polityscore              0
relectricperperson       0
suicideper100th          0
employrate               0
urbanrate                0
dtype: int64
country                  0
incomeperperson          0
alcconsumption          26
armedforcesrate          0
breastcancerper100th    40
co2emissions             0
femaleemployrate        35
hivrate                  0
internetuserate          0
lifeexpectancy           0
oilperperson             0
polityscore              0
relectricperperson       0
suicideper100th          0
employrate               0
urbanrate                0
dtype: int64
country                 0
incomeperperson         0
alcconsumption          0
armedforcesrate         0
breastcancerper100th    0
co2emissions            0
femaleemployrate        0
hivrate                 0
internetuserate         0
lifeexpectancy          0
oilperperson            0
polityscore             0
relectricperperson      0
suicideper100th         0
employrate              0
urbanrate               0
dtype: int64

In [10]:
# Showing the frequency distribution of the categorised quantitative variables
print "\n\nFrequency distribution of the categorized quantitative variables\n"
fd1=sub4['alco'].value_counts(sort=False,dropna=False)
fd2=sub4['brst'].value_counts(sort=False,dropna=False)
fd3=sub4['emply'].value_counts(sort=False,dropna=False)
print "Alcohol Consumption\n",fd1
print "\n------------------------\n"
print "Breast Cancer per 100th\n",fd2
print "\n------------------------\n"
print "Female Employee Rate\n",fd3
print "\n------------------------\n"



Frequency distribution of the categorized quantitative variables

Alcohol Consumption
0        36
1-4      35
5-9      36
10-14    35
15-19    35
20-24    36
dtype: int64

------------------------

Breast Cancer per 100th
1-20     43
21-40    43
41-60    65
61-80    19
81-90    43
dtype: int64

------------------------

Female Employee Rate
30-39    73
40-59    34
60-79    53
80-90    53
dtype: int64

------------------------


In [12]:
# Now plotting the univariate quantitative variables using the distribution plot
sub5=sub4.copy()
sns.distplot(sub5['alcconsumption'].dropna(),kde=True)

plt.xlabel('Alcohol consumption in litres')
plt.title('Breast cancer in working class women')
plt.show() # Note: Although there is no need to use the show() method for ipython notebook as %matplotlib inline does the trick but 
           #I am adding it here because matplotlib inline does not work for an IDE like Pycharm and for that i need to use plt.show



In [13]:
sns.distplot(sub5['breastcancerper100th'].dropna(),kde=True)
plt.xlabel('Breast cancer per 100th women')
plt.title('Breast cancer in working class women')
plt.show()



In [14]:
sns.distplot(sub5['femaleemployrate'].dropna(),kde=True)
plt.xlabel('Female employee rate')
plt.title('Breast cancer in working class women')
plt.show()



In [15]:
# using scatter plot the visulaize quantitative variable. 
# if categorical variable then use histogram
scat1= sns.regplot(x='alcconsumption', y='breastcancerper100th', data=data)
plt.xlabel('Alcohol consumption in liters')
plt.ylabel('Breast cancer per 100th person')
plt.title('Scatterplot for the Association between Alcohol Consumption and Breast Cancer 100th person')


Out[15]:
<matplotlib.text.Text at 0x1ab2d278>

In [16]:
scat2= sns.regplot(x='femaleemployrate', y='breastcancerper100th', data=data)
plt.xlabel('Female Employ Rate')
plt.ylabel('Breast cancer per 100th person')
plt.title('Scatterplot for the Association between Female Employ Rate and Breast Cancer per 100th Rate')


Out[16]:
<matplotlib.text.Text at 0x19aa0860>

In [19]:
sub6=sub4.copy()
model1=smf.ols(formula='breastcancerper100th~C(alco)',data=sub6)
results1=model1.fit()

print(results1.summary())

m1=sub5.groupby('alcconsumption').mean()
sd1=sub5.groupby('alcconsumption').std()
'''
print m1
print "\n"
print sd1
'''
# Conducting a post hoc comparison test to check for type 1 error
mc1=multi.MultiComparison(sub6['breastcancerper100th'],sub6['alco'])
res1=mc1.tukeyhsd()
print res1.summary()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-19-a0b968234595> in <module>()
      1 sub6=sub4.copy()
----> 2 model1=smf.ols(formula='breastcancerper100th~C(alco)',data=sub6)
      3 results1=model1.fit()
      4 
      5 print(results1.summary())

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\statsmodels\base\model.pyc in from_formula(cls, formula, data, subset, *args, **kwargs)
    145         (endog, exog), missing_idx = handle_formula_data(data, None, formula,
    146                                                          depth=eval_env,
--> 147                                                          missing=missing)
    148         kwargs.update({'missing_idx': missing_idx,
    149                        'missing': missing})

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\statsmodels\formula\formulatools.pyc in handle_formula_data(Y, X, formula, depth, missing)
     63         if data_util._is_using_pandas(Y, None):
     64             result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65                                NA_action=na_action)
     66         else:
     67             result = dmatrices(formula, Y, depth, return_type='dataframe',

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\patsy\highlevel.pyc in dmatrices(formula_like, data, eval_env, NA_action, return_type)
    295     eval_env = EvalEnvironment.capture(eval_env, reference=1)
    296     (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 297                                       NA_action, return_type)
    298     if lhs.shape[1] == 0:
    299         raise PatsyError("model is missing required outcome variables")

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\patsy\highlevel.pyc in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
    150         return iter([data])
    151     builders = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 152                                   NA_action)
    153     if builders is not None:
    154         return build_design_matrices(builders, data,

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\patsy\highlevel.pyc in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
     55                                        formula_like.rhs_termlist],
     56                                       data_iter_maker,
---> 57                                       NA_action)
     58     else:
     59         return None

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\patsy\build.pyc in design_matrix_builders(termlists, data_iter_maker, NA_action)
    658                                                    factor_states,
    659                                                    data_iter_maker,
--> 660                                                    NA_action)
    661     # Now we need the factor evaluators, which encapsulate the knowledge of
    662     # how to turn any given factor into a chunk of data:

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\patsy\build.pyc in _examine_factor_types(factors, factor_states, data_iter_maker, NA_action)
    427                     cat_sniffers[factor] = CategoricalSniffer(NA_action,
    428                                                               factor.origin)
--> 429                 done = cat_sniffers[factor].sniff(value)
    430                 if done:
    431                     examine_needed.remove(factor)

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\patsy\categorical.pyc in sniff(self, data)
    169         # fastpath to avoid doing an item-by-item iteration over boolean
    170         # arrays, as requested by #44
--> 171         if hasattr(data, "dtype") and np.issubdtype(data.dtype, np.bool_):
    172             self._level_set = set([True, False])
    173             return True

C:\Users\ashish dutt\Anaconda\envs\dato-env\lib\site-packages\numpy\core\numerictypes.pyc in issubdtype(arg1, arg2)
    761     """
    762     if issubclass_(arg2, generic):
--> 763         return issubclass(dtype(arg1).type, arg2)
    764     mro = dtype(arg2).type.mro()
    765     if len(mro) > 1:

TypeError: data type not understood

In [ ]: