Analysis ZR-Data

Load the libraries


In [1]:
import pandas as pd
import numpy as np
import pylab as plt
import statsmodels.formula.api as sm
%matplotlib inline
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 999)

Load the data


In [2]:
ZR=pd.ExcelFile('ZR_Daten_DDR_1976_1990.xlsx')
df=ZR.parse("Tabelle1")

Describe the data


In [3]:
df=df[(df['ERTRAG (dt/ha)']<600) & (df['ERTRAG (dt/ha)']>100)]

In [4]:
print df.describe()


       ERNTEJAHR  Mittlere Ackerzahl  TAG_AUSSAAT  TAG_AUFGANG  \
count  4474.0000           4474.0000    4474.0000    4474.0000   
mean   1982.4779             50.1616     111.0096     128.0346   
std       3.9931             14.5134      11.1164       9.7309   
min    1976.0000             24.0000      75.0000      94.0000   
25%    1979.0000             36.0000     104.0000     122.0000   
50%    1982.0000             48.0000     110.0000     128.0000   
75%    1986.0000             56.0000     118.0000     134.0000   
max    1989.0000             80.0000     159.0000     177.0000   

       Tage Aussaat bis Aufgang  N_DUENGUNG  TAG_ERNTEABSCHLUSS  \
count                  4474.000   4474.0000           4474.0000   
mean                     17.025    175.1495            303.9104   
std                       6.008     39.9842             16.1333   
min                       1.000     33.0000            254.0000   
25%                      13.000    152.0000            293.0000   
50%                      16.000    176.0000            303.0000   
75%                      21.000    200.0000            314.0000   
max                      47.000    437.0000            357.0000   

       ERTRAG (dt/ha)       LT_1       LT_2       LT_3       LT_4       LT_5  \
count       4474.0000  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000   
mean         300.7501    -0.8972    -0.7380     3.4538     7.0675    12.5807   
std           79.5878     3.4533     2.8268     2.2390     1.1700     1.5621   
min          101.0000    -8.1000    -8.7000    -2.5000     4.0000     8.6000   
25%          248.0000    -3.9000    -2.1000     2.2000     6.3000    11.5000   
50%          304.9500    -0.2500    -0.6000     3.2000     7.0000    12.7000   
75%          352.0000     1.5000     1.1000     5.2000     7.8000    13.8000   
max          599.0000     5.0000     4.8000     7.7000    10.0000    15.7000   

            LT_6       LT_7       LT_8       LT_9      LT_10      LT_11  \
count  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000   
mean     15.5563    17.2089    16.8956    13.5632     9.4887     4.2856   
std       1.2385     1.6445     1.0667     1.4016     1.0064     1.5429   
min      11.7000    13.7000    13.4000    10.4000     6.4000    -0.8000   
25%      14.8000    15.9000    16.2000    12.7000     8.9000     3.3000   
50%      15.7000    17.3000    16.8000    13.7000     9.5000     4.3000   
75%      16.4000    18.3000    17.7000    14.3000    10.1000     5.4000   
max      19.3000    21.2000    19.4000    17.3000    11.7000     7.4000   

           LT_12    LT_4-10       NI_1       NI_2       NI_3       NI_4  \
count  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000   
mean      1.3852    13.1944    44.0878    27.4916    41.9594    42.3425   
std       1.9276     0.8669    25.3962    16.1304    23.3331    26.4749   
min      -3.9000    10.5429     4.8000     2.4000     5.3000     1.0000   
25%       0.1000    12.5857    26.3000    14.8000    24.3000    20.3000   
50%       1.8000    13.1143    38.4000    25.8000    39.2000    39.9000   
75%       2.8000    13.7571    56.5000    35.1000    54.4000    61.7000   
max       4.6000    15.0429   179.5000    92.3000   117.1000   157.0000   

            NI_5       NI_6       NI_7       NI_8       NI_9      NI_10  \
count  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000  4474.0000   
mean     49.7888    62.0105    58.6842    55.5303    45.8966    35.7546   
std      30.0849    31.9561    31.7607    31.3821    25.0877    21.8849   
min       3.2000     8.6000     1.0000     5.5000     4.9000     3.2000   
25%      28.9000    40.6000    36.0000    32.6000    28.5000    16.7000   
50%      43.8000    57.2000    53.1000    49.9000    42.6000    34.1000   
75%      66.3000    79.0000    77.6000    69.6000    60.0000    47.6000   
max     171.7000   248.0000   168.5000   198.6000   127.4000   122.5000   

           NI_11      NI_12    NI_4-10  
count  4474.0000  4474.0000  4474.0000  
mean     42.9523    50.0898    50.0011  
std      20.4999    23.4667    13.9143  
min       4.9000     7.7000    19.0000  
25%      29.9750    33.4000    39.3714  
50%      42.2000    44.0000    49.9857  
75%      53.7000    64.2000    59.0857  
max     152.7000   150.4000    93.8714  

Corrcoef between yield and the rest


In [5]:
corr=df.corr()
print corr['ERTRAG (dt/ha)']


ERNTEJAHR                   0.1439
Mittlere Ackerzahl          0.1354
TAG_AUSSAAT                -0.1403
TAG_AUFGANG                -0.1639
Tage Aussaat bis Aufgang   -0.0060
N_DUENGUNG                  0.0217
TAG_ERNTEABSCHLUSS          0.0960
ERTRAG (dt/ha)              1.0000
LT_1                       -0.1632
LT_2                       -0.1056
LT_3                        0.0505
LT_4                        0.0578
LT_5                       -0.0226
LT_6                       -0.1851
LT_7                       -0.1356
LT_8                       -0.1366
LT_9                       -0.1098
LT_10                       0.1055
LT_11                       0.0081
LT_12                       0.0430
LT_4-10                    -0.1011
NI_1                       -0.1370
NI_2                        0.0378
NI_3                        0.0031
NI_4                        0.0784
NI_5                        0.0900
NI_6                        0.1065
NI_7                        0.1198
NI_8                        0.2420
NI_9                        0.1891
NI_10                       0.0118
NI_11                       0.0324
NI_12                       0.0699
NI_4-10                     0.2525
Name: ERTRAG (dt/ha), dtype: float64

Regression for Erntejahr


In [6]:
data=pd.DataFrame({'Y':df['ERTRAG (dt/ha)'],'X':df['ERNTEJAHR']})

In [7]:
model= sm.ols(formula="Y ~ X", data=data).fit()

In [8]:
print model.params


Intercept   -5383.5639
X               2.8673
dtype: float64

In [9]:
print model.summary()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     94.50
Date:                Mon, 27 Mar 2017   Prob (F-statistic):           4.05e-22
Time:                        10:33:32   Log-Likelihood:                -25883.
No. Observations:                4474   AIC:                         5.177e+04
Df Residuals:                    4472   BIC:                         5.178e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept  -5383.5639    584.731     -9.207      0.000     -6529.927 -4237.201
X              2.8673      0.295      9.721      0.000         2.289     3.446
==============================================================================
Omnibus:                       10.652   Durbin-Watson:                   1.232
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               12.055
Skew:                           0.060   Prob(JB):                      0.00241
Kurtosis:                       3.224   Cond. No.                     9.84e+05
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.84e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

In [10]:
df['ERTRAG (dt/ha)']-=(2.867*(df['ERNTEJAHR']-1976))

In [11]:
df['ERTRAG (dt/ha)'].describe()


Out[11]:
count    4474.0000
mean      282.1781
std        78.7599
min        66.5960
25%       230.3217
50%       287.0640
75%       334.2660
max       596.1330
Name: ERTRAG (dt/ha), dtype: float64

In [12]:
data1=pd.DataFrame({'Y':df['ERTRAG (dt/ha)'],'X':df['TAG_AUSSAAT']})

In [13]:
model1= sm.ols(formula="Y ~ X", data=data1).fit()
print model1.params


Intercept    365.7124
X             -0.7525
dtype: float64

In [14]:
print model1.summary()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      Y   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     51.02
Date:                Mon, 27 Mar 2017   Prob (F-statistic):           1.06e-12
Time:                        10:33:32   Log-Likelihood:                -25858.
No. Observations:                4474   AIC:                         5.172e+04
Df Residuals:                    4472   BIC:                         5.173e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept    365.7124     11.753     31.116      0.000       342.671   388.754
X             -0.7525      0.105     -7.143      0.000        -0.959    -0.546
==============================================================================
Omnibus:                       17.663   Durbin-Watson:                   1.224
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               22.513
Skew:                           0.049   Prob(JB):                     1.29e-05
Kurtosis:                       3.333   Cond. No.                     1.12e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.12e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

In [15]:
df['ERTRAG (dt/ha)']-=(0.752*(df['TAG_AUSSAAT']-75)) # wrong due to errors in sowing date

In [16]:
df['ERTRAG (dt/ha)'].describe()


Out[16]:
count    4474.0000
mean      255.0988
std        80.0804
min        25.1630
25%       201.6195
50%       259.0740
75%       307.8680
max       558.5330
Name: ERTRAG (dt/ha), dtype: float64

Histograms to support model building


In [17]:
plt.hist(df['ERTRAG (dt/ha)'].dropna(),bins=50)
plt.xlabel('ERTRAG (dt/ha)')
plt.grid(True)
plt.show()



In [18]:
plt.hist(df['Mittlere Ackerzahl'].dropna(),bins=50)
plt.xlabel('Mittlere Ackerzahl')
plt.grid(True)
plt.show()



In [19]:
plt.hist(df['LT_6']+df['LT_7']+df['LT_8']+df['LT_9'],bins=50)
plt.xlabel('LT6..LT9')
plt.grid(True)
plt.show()



In [20]:
plt.hist(df['NI_6']+df['NI_7']+df['NI_8']+df['NI_9'],bins=50)
plt.xlabel('NI_6..NI_9')
plt.grid(True)
plt.show()



In [ ]:


In [ ]: