In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [4]:
# read files
soy = pd.read_csv('dataset/yield.csv')
geo = pd.read_csv('dataset/geo.csv')
gene = pd.read_csv('dataset/gene.csv')
In [23]:
soy
Out[23]:
EXPERIMENT
YEAR
LOCATION
VARIETY
FAMILY
CHECK
RM
REPNO
YIELD
CLASS_OF
0
09YT000052
2009
3310
V000016
FAM05619
True
3.9
1
62.548835
0
1
09YT000052
2009
3310
V000016
FAM05619
True
3.9
2
60.315357
0
2
09YT000052
2009
3310
V000018
FAM05619
True
3.7
1
65.900242
0
3
09YT000052
2009
3310
V000018
FAM05619
True
3.7
2
61.433286
0
4
09YT000052
2009
3310
V000039
FAM05625
True
3.4
1
68.133720
0
5
09YT000052
2009
3310
V000039
FAM05625
True
3.4
2
60.315357
0
6
09YT000052
2009
3310
V000082
FAM13251
True
3.1
1
60.315357
0
7
09YT000052
2009
3310
V000082
FAM13251
True
3.1
2
65.900242
0
8
09YT000052
2009
3310
V030090
FAM13986
False
3.2
1
54.730473
0
9
09YT000052
2009
3310
V030090
FAM13986
False
3.2
2
61.433286
0
10
09YT000052
2009
3310
V030093
FAM13986
False
3.5
1
53.612544
0
11
09YT000052
2009
3310
V030093
FAM13986
False
3.5
2
53.612544
0
12
09YT000052
2009
3310
V030101
FAM13986
False
3.5
1
58.081879
0
13
09YT000052
2009
3310
V030101
FAM13986
False
3.5
2
62.548835
0
14
09YT000052
2009
3310
V030102
FAM13986
False
3.4
1
62.548835
0
15
09YT000052
2009
3310
V030102
FAM13986
False
3.4
2
61.433286
0
16
09YT000052
2009
3310
V030105
FAM13986
False
3.3
1
68.133720
0
17
09YT000052
2009
3310
V030105
FAM13986
False
3.3
2
61.433286
0
18
09YT000052
2009
3310
V030107
FAM13991
False
3.5
1
59.197429
0
19
09YT000052
2009
3310
V030107
FAM13991
False
3.5
2
59.197429
0
20
09YT000052
2009
3310
V030110
FAM13991
False
3.4
1
67.018171
0
21
09YT000052
2009
3310
V030110
FAM13991
False
3.4
2
63.666764
0
22
09YT000052
2009
3310
V030116
FAM13991
False
3.7
1
59.197429
0
23
09YT000052
2009
3310
V030116
FAM13991
False
3.7
2
62.548835
0
24
09YT000052
2009
3310
V030134
FAM13991
False
3.3
1
63.666764
0
25
09YT000052
2009
3310
V030134
FAM13991
False
3.3
2
63.666764
0
26
09YT000052
2009
3310
V030137
FAM13991
False
3.5
1
70.367050
0
27
09YT000052
2009
3310
V030137
FAM13991
False
3.5
2
63.666764
0
28
09YT000052
2009
3310
V030139
FAM13991
False
3.5
1
69.251649
0
29
09YT000052
2009
3310
V030139
FAM13991
False
3.5
2
62.548835
0
...
...
...
...
...
...
...
...
...
...
...
172027
14YT005707
2014
3340
V152053
FAM14333
True
3.0
1
59.016460
2013
172028
14YT005707
2014
3340
V152053
FAM14333
True
3.0
2
61.033130
2013
172029
14YT005707
2014
3340
V152079
FAM11179
True
2.7
1
52.145320
2013
172030
14YT005707
2014
3340
V152079
FAM11179
True
2.7
2
62.416490
2013
172031
14YT005707
2014
3340
V155853
FAM13521
True
2.7
1
60.408890
2011
172032
14YT005707
2014
3340
V155853
FAM13521
True
2.7
2
71.876060
2011
172033
14YT005707
2014
3340
V156314
FAM12531
True
2.8
1
47.151370
2012
172034
14YT005707
2014
3340
V156314
FAM12531
True
2.8
2
51.038540
2012
172035
14YT005707
2014
3340
V156553
FAM14238
True
2.9
1
51.976100
2013
172036
14YT005707
2014
3340
V156553
FAM14238
True
2.9
2
65.448490
2013
172037
14YT005707
2014
3370
V152053
FAM14333
True
3.0
1
54.665040
2013
172038
14YT005707
2014
3370
V152053
FAM14333
True
3.0
2
53.002720
2013
172039
14YT005707
2014
3370
V152079
FAM11179
True
2.7
1
57.845590
2013
172040
14YT005707
2014
3370
V152079
FAM11179
True
2.7
2
58.819730
2013
172041
14YT005707
2014
3370
V155853
FAM13521
True
2.7
1
52.049260
2011
172042
14YT005707
2014
3370
V155853
FAM13521
True
2.7
2
54.424890
2011
172043
14YT005707
2014
3370
V156314
FAM12531
True
2.8
1
51.607920
2012
172044
14YT005707
2014
3370
V156314
FAM12531
True
2.8
2
57.927970
2012
172045
14YT005707
2014
3370
V156553
FAM14238
True
2.9
1
65.409680
2013
172046
14YT005707
2014
3370
V156553
FAM14238
True
2.9
2
55.609440
2013
172047
14YT005707
2014
3490
V152053
FAM14333
True
3.0
1
62.554490
2013
172048
14YT005707
2014
3490
V152053
FAM14333
True
3.0
2
59.494380
2013
172049
14YT005707
2014
3490
V152079
FAM11179
True
2.7
1
50.660990
2013
172050
14YT005707
2014
3490
V152079
FAM11179
True
2.7
2
64.556290
2013
172051
14YT005707
2014
3490
V155853
FAM13521
True
2.7
1
57.359940
2011
172052
14YT005707
2014
3490
V155853
FAM13521
True
2.7
2
57.215110
2011
172053
14YT005707
2014
3490
V156314
FAM12531
True
2.8
1
69.472020
2012
172054
14YT005707
2014
3490
V156314
FAM12531
True
2.8
2
64.089670
2012
172055
14YT005707
2014
3490
V156553
FAM14238
True
2.9
1
72.767820
2013
172056
14YT005707
2014
3490
V156553
FAM14238
True
2.9
2
60.186730
2013
172057 rows × 10 columns
In [24]:
geo
Out[24]:
LOCATION
LATITUDE
LONGITUDE
FIPS
AREA
IRRIGATION
TEMP_01
TEMP_02
TEMP_03
TEMP_04
...
RAD_14
RAD_13.1
RAD_14.1
CEC
PH
ORGANIC_MATTER
CLAY
SILT_TOP
SAND_TOP
AWC_100CM
0
10
46.501212
-96.338590
27167
7992.5
831.652312
2929.7
2822.0
2832.7
2612.9
...
1100823
1015998
992691
26.655599
7.222128
6.803356
22.303241
35.982133
41.781395
21.478885
1
20
48.481819
-97.354542
38099
2102.8
56.178384
2757.8
2567.5
2739.9
2332.2
...
1056317
1023342
974648
29.820095
7.589446
7.175127
31.485822
46.015191
22.490958
25.397483
2
30
47.383756
-96.362880
27107
6420.4
20.234374
2919.2
2775.3
2862.9
2528.7
...
1080511
1011937
974301
26.386140
7.328356
6.924282
19.525246
25.714048
54.747396
18.263845
3
101
47.908480
-97.626370
38035
5189.8
454.833566
2829.6
2652.6
2784.9
2422.6
...
1072692
1029851
977706
27.709274
7.314308
6.861001
23.775101
33.212891
42.986473
21.693859
4
103
46.977000
-96.884000
38017
12774.4
201.405003
2977.4
2852.1
2901.7
2625.5
...
1099641
1018317
988927
32.209780
7.344828
7.484837
41.345197
42.814670
15.911892
27.676606
5
107
46.908000
-98.709000
38093
4048.6
115.384619
2914.4
2789.7
2860.3
2579.4
...
1103974
1046380
1003437
25.642289
7.104803
5.968004
23.659071
34.745153
41.614005
21.378516
6
110
47.114430
-97.131311
38017
13618.6
256.113597
2977.4
2852.1
2901.7
2625.5
...
1099641
1018317
988927
28.207248
7.323727
7.122696
30.040148
35.409505
34.561415
24.018750
7
119
45.543370
-96.435570
27011
9093.1
188.308826
3073.8
3007.8
2983.2
2826.0
...
1108340
1031682
1015075
29.832679
7.188441
7.167354
30.999855
39.053037
30.019108
24.644398
8
120
46.300390
-96.481110
27167
9245.7
190.189566
3034.4
2939.4
2935.2
2736.5
...
1106461
1014054
1002638
29.574653
7.383941
6.698668
32.118056
34.002604
33.870587
24.463845
9
130
48.137077
-97.573581
38035
4860.3
427.861246
2829.6
2652.6
2784.9
2422.6
...
1072692
1029851
977706
28.903212
7.419958
6.853100
25.350694
33.128906
41.495226
22.431814
10
140
45.314360
-98.298900
46013
8651.3
234.629694
3178.2
3153.7
3126.2
2958.0
...
1142812
1077983
1048878
27.307198
6.951561
5.989278
25.472148
49.133363
25.414725
24.106608
11
150
46.202694
-98.239100
38021
7690.1
596.776817
3027.1
2954.3
2981.4
2746.5
...
1121718
1048643
1024703
26.957535
7.003484
6.298501
24.167102
36.957535
38.861135
22.186890
12
160
47.351845
-99.475466
38103
9482.9
33.397585
2856.8
2697.5
2810.7
2494.9
...
1081757
1050316
1011466
25.457031
7.031474
6.220953
25.044560
36.189453
38.683521
22.354102
13
1102
46.499750
-97.578490
38073
9268.8
1275.500954
3038.7
2943.5
2965.2
2724.5
...
1112523
1029232
1012789
25.136548
7.078729
6.359579
23.057703
34.114755
42.801622
20.991182
14
1110
44.309980
-96.705308
46011
5092.6
789.362212
3208.2
3180.3
3126.5
3026.4
...
1149779
1062451
1036223
27.102865
6.796730
6.349816
24.457972
41.334997
34.166522
22.973416
15
1130
44.935970
-98.393310
46115
7413.3
664.385747
3267.1
3262.5
3218.2
3069.3
...
1149209
1085817
1058598
27.291794
6.901997
5.611397
25.629412
49.651634
24.664706
24.378083
16
1140
44.348412
-96.634137
46011
6319.8
948.247934
3157.6
3137.9
3082.1
2987.2
...
1148611
1058003
1034445
27.426505
6.764019
6.074994
24.948061
44.301143
30.841869
23.508854
17
1210
44.717000
-94.086000
27143
7201.4
33.303463
3124.1
3096.0
3089.4
2980.3
...
1119820
1020404
1000506
28.921152
6.617180
7.626366
26.607350
37.865741
35.574797
23.595226
18
1220
44.483000
-93.035000
27049
4406.2
142.675785
3172.3
3138.2
3126.1
3026.2
...
1115279
1000423
995052
26.025365
6.178823
6.803279
20.850073
38.459942
40.704167
21.853048
19
1250
44.089000
-93.303000
27147
5480.1
62.634715
3120.3
3126.4
3087.1
3022.5
...
1127106
1022768
998548
30.351997
6.215126
7.539843
25.572700
43.866753
30.560547
24.642687
20
1260
43.688810
-96.286160
27133
7653.3
108.258660
3189.0
3201.1
3116.2
3047.3
...
1160481
1054856
1031275
27.297671
6.476338
5.662898
26.234158
48.901910
24.833333
24.465885
21
1270
44.643000
-95.156000
27127
6997.8
27.144595
3140.0
3125.3
3099.3
2979.9
...
1126041
1035417
1002471
26.269748
6.827640
6.528933
25.591073
36.268012
38.143953
22.656272
22
1280
44.923630
-95.983470
27073
8155.1
267.142745
3120.0
3071.6
3042.7
2905.6
...
1115487
1031477
1010586
28.927517
7.071832
6.562415
28.440611
38.973235
32.669126
24.088759
23
1290
43.757000
-92.872000
27099
6472.2
210.585953
3128.7
3139.9
3083.9
3034.6
...
1130376
1022627
999616
28.694517
6.090791
7.233593
23.257957
47.151982
29.602865
23.748958
24
1301
43.307700
-89.323430
55021
2856.1
209.094754
3203.5
3200.3
3023.0
3036.3
...
1128769
1012620
1031580
23.702722
6.126796
8.143973
21.061396
52.044237
26.935491
23.242420
25
1320
42.979716
-89.449691
55025
2156.2
225.682054
3279.5
3275.4
3106.3
3124.7
...
1132342
1014960
1032072
23.763310
6.200955
7.813844
20.289352
51.361834
28.372830
23.030469
26
1402
43.507340
-83.523320
26157
7189.9
250.272799
2972.0
2990.2
2714.7
2799.4
...
1110759
1017373
1041977
23.011212
6.628805
7.814006
22.881583
37.460142
39.638817
21.558485
27
1410
43.173979
-84.787589
26057
5669.8
467.131812
3088.4
3133.2
2920.8
3000.6
...
1108274
1020250
1027014
22.447917
6.367093
9.740458
21.438006
35.346427
43.180990
20.807444
28
2110
43.319600
-96.568820
46083
6760.4
120.261918
3319.5
3340.1
3220.8
3171.9
...
1179091
1054586
1040005
27.936053
6.615104
4.423613
26.624349
44.908709
28.442419
24.199002
29
2111
43.319600
-96.568820
46083
6760.4
120.261918
3319.5
3340.1
3220.8
3171.9
...
1179091
1054586
1040005
27.936053
6.615104
4.423613
26.624349
44.908709
28.442419
24.199002
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
89
3420
39.323490
-85.233553
18137
3497.9
17.129785
3676.2
3819.9
3497.3
3644.8
...
1132126
1063793
1077948
18.956670
5.843229
4.390081
19.402995
61.038050
19.528935
23.085026
90
3490
40.899287
-86.814469
18181
6426.6
192.953296
3453.0
3482.7
3268.5
3348.1
...
1149816
1060328
1056727
19.566045
6.176917
5.671409
15.652633
27.064525
57.282335
17.002778
91
3491
40.899287
-86.814469
18181
6426.6
192.953296
3453.0
3482.7
3268.5
3348.1
...
1149816
1060328
1056727
19.566045
6.176917
5.671409
15.652633
27.064525
57.282335
17.002778
92
3501
40.125880
-76.534230
42071
1073.8
596.548113
3723.3
3861.6
3558.5
3757.9
...
1058000
1044000
1072968
19.599710
5.963137
5.472535
20.728395
54.385403
24.856064
22.873704
93
3503
40.125870
-76.534340
42071
1073.8
596.548113
3723.3
3861.6
3558.5
3757.9
...
1058000
1044000
1072968
19.599710
5.963137
5.472535
20.728395
54.385403
24.856064
22.873704
94
4104
39.010983
-95.210466
20045
4722.5
190.911028
4049.4
4029.1
3891.9
3815.0
...
1179171
1080636
1116892
22.894242
6.499313
3.997221
23.591942
54.151910
22.282190
23.752799
95
4108
39.805992
-95.261603
20043
9546.9
177.615088
3854.1
3871.6
3728.6
3665.6
...
1184089
1079015
1112949
26.337167
6.267969
3.531338
26.081742
61.863498
12.046224
25.126845
96
4110
39.784210
-97.866910
20157
5759.3
4111.376335
3876.7
3978.3
3751.6
3770.7
...
1217500
1117055
1147378
23.202257
6.453711
3.052826
21.578704
50.681279
27.755136
23.025694
97
4120
39.747108
-95.470291
20013
8446.3
110.484664
3848.3
3869.8
3725.1
3662.6
...
1187499
1082148
1116161
27.320964
6.285648
3.535033
28.220631
58.249060
13.506149
25.616840
98
4215
40.569134
-95.694995
19071
7635.0
681.312339
3779.8
3827.9
3665.8
3632.0
...
1196469
1074075
1104333
27.944251
6.697676
3.853998
26.140023
55.525919
18.320374
24.817159
99
4220
38.815030
-90.592700
29183
2505.9
123.239327
3988.3
4078.8
3837.3
3800.5
...
1164382
1070697
1093418
21.866319
6.501440
4.424011
22.712095
58.599248
18.633825
23.809570
100
4260
40.160670
-92.057200
29103
5382.6
31.564411
3760.1
3759.3
3637.1
3603.5
...
1157662
1073014
1056834
25.205078
5.976924
4.041788
26.602937
58.694806
14.610026
25.771376
101
4270
39.185239
-93.324883
29195
9507.9
182.632845
3948.1
3944.1
3808.3
3756.7
...
1166912
1081778
1085627
24.522642
6.443902
3.781648
23.424117
62.754630
13.753255
24.496202
102
4310
38.736200
-89.596980
17119
4763.7
104.707088
3999.2
4130.6
3851.8
3840.5
...
1161294
1062226
1108868
20.155888
6.258608
4.017265
19.458984
72.323061
7.991175
23.981228
103
4312
38.830010
-90.037900
17119
2803.3
70.842181
3965.0
4075.0
3819.1
3788.5
...
1164547
1064169
1104253
19.728082
6.281380
3.789960
20.059968
65.455584
14.367549
23.461914
104
4320
39.140200
-87.860000
17033
7761.1
623.312683
3885.1
4038.2
3720.4
3774.2
...
1159041
1059175
1086454
20.061343
6.059295
3.831641
19.669777
56.416088
23.963614
23.069878
105
4330
38.495000
-87.727400
17185
6350.3
1469.001391
4009.7
4174.2
3845.0
3899.2
...
1160654
1058236
1098336
19.990656
6.098254
4.293061
20.680936
57.564537
21.725699
23.091504
106
4360
39.202040
-89.827130
17117
5678.6
35.699188
3852.5
3960.1
3704.3
3694.0
...
1165577
1068931
1091194
22.157769
6.085475
3.762400
20.287977
69.740885
9.805266
24.536806
107
4370
38.484790
-89.742030
17027
7208.1
130.756720
3999.5
4125.1
3851.6
3835.9
...
1160997
1062053
1111071
20.444083
6.312001
3.991933
20.109230
69.956814
9.771340
24.010981
108
4501
39.610970
-75.391980
34033
4802.5
2352.601418
3813.4
3936.3
3642.8
3827.1
...
1070288
1071220
1091680
16.448016
5.244490
7.353585
13.377498
47.019476
39.525702
20.002976
109
4503
38.992264
-76.143491
24035
2329.9
872.796547
3785.1
3961.6
3653.2
3842.6
...
1106452
1083053
1117816
14.713976
5.174819
7.175737
11.376374
45.319878
43.055411
18.938498
110
4504
39.362128
-75.836019
24015
2336.7
900.479957
3839.5
3961.8
3665.6
3847.9
...
1069374
1067801
1094285
17.459655
5.267529
7.178402
13.020716
48.955889
38.091192
20.103433
111
5210
37.889510
-89.830430
17157
3739.8
73.342896
4038.2
4131.5
3892.6
3864.8
...
1155549
1061590
1115782
19.181517
5.803161
4.650474
20.106455
62.624761
17.167328
23.519938
112
5220
37.147997
-89.780363
29207
9736.4
10090.705750
4211.9
4191.0
4057.5
4049.1
...
1161432
1070630
1102711
18.224392
5.964569
4.880537
20.614945
55.379413
24.001085
22.750217
113
5240
35.789506
-90.535658
5031
7138.2
15068.233260
4342.0
4288.1
4182.8
4161.2
...
1163163
1088825
1112485
15.023582
6.103371
4.162950
16.251374
42.369792
41.500289
20.284266
114
5340
37.741470
-88.219900
17059
3597.7
1769.231583
4096.0
4191.5
3930.5
3972.0
...
1157357
1056117
1109931
16.931450
5.923573
4.914770
19.293912
61.731056
18.932691
22.722580
115
5410
37.732700
-87.184860
21059
6110.4
341.723997
4057.8
4162.6
3891.8
3949.1
...
1149309
1048094
1102341
16.712167
5.837854
4.958798
17.089193
57.426577
25.463903
21.843880
116
6510
35.652100
-76.553500
37095
4878.8
603.696746
4066.9
4214.5
4058.5
4191.6
...
1048570
1032294
1053288
27.879051
5.277074
12.225963
19.396447
40.531891
40.059025
23.972948
117
6530
35.262400
-77.817400
37107
2868.7
489.568647
4147.1
4288.3
4144.9
4277.3
...
1040903
1034525
1077398
14.394748
5.245334
5.587760
15.159722
40.641710
44.129557
20.141168
118
6550
36.264200
-76.251700
37139
6018.4
300.925987
3951.9
4102.8
3875.7
4053.2
...
1069212
1055486
1066473
20.756655
5.254557
8.446850
16.555845
41.410590
41.942057
21.564280
119 rows × 57 columns
In [25]:
gene
Out[25]:
VARIETY
FAMILY
RM
CLASS_OF
X2
X7
X9
X10
X11
X23
...
X6045
X6046
X6049
X6054
X6056
X6058
X6059
X6060
X6061
X6062
0
V000016
FAM05619
3.9
NaN
GG
AA
AA
AA
AA
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1
V000017
FAM05619
3.7
NaN
GG
AA
AA
NN
AA
GG
...
GG
TT
AA
AA
NN
CC
AA
GG
AG
GG
2
V000018
FAM05619
3.7
NaN
GG
AA
AA
NN
AA
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
3
V000023
FAM13245
2.8
NaN
GG
AA
AA
GG
AC
AG
...
GG
TT
AA
AA
NN
NN
AA
GG
AA
GG
4
V000024
FAM13246
2.4
NaN
GG
AA
AA
GG
AC
GG
...
GG
TT
AA
AA
NN
NN
AA
GG
AA
GG
5
V000025
FAM13244
2.7
NaN
GG
TT
GG
AA
AA
AA
...
AA
AA
AA
AA
AA
GG
AA
AA
GG
AA
6
V000032
FAM02899
0.6
NaN
GG
AA
AA
NN
AA
AA
...
NN
AA
GG
AA
AA
GG
AA
AA
AA
GG
7
V000034
FAM05620
4.4
NaN
GG
AA
AA
GG
NN
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
8
V000036
FAM05621
4.1
NaN
NN
NN
AG
NN
AA
GG
...
NN
TT
NN
NN
NN
NN
AA
NN
AA
GG
9
V000039
FAM05625
3.4
NaN
NN
NN
AG
NN
AA
AA
...
NN
TT
NN
NN
NN
NN
AA
NN
AA
GG
10
V000051
FAM13243
3.0
NaN
NN
NN
AA
NN
AC
AG
...
NN
TT
NN
NN
NN
NN
AA
NN
AA
GG
11
V000062
FAM05624
3.5
NaN
GG
TT
NN
AA
AA
AA
...
GG
TT
AA
AA
NN
NN
AA
GG
AA
GG
12
V000067
FAM09634
1.7
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
NN
NN
AA
GG
AA
GG
13
V000070
FAM09629
1.2
NaN
GG
AA
GG
NN
AA
AA
...
GG
TT
AA
AT
GG
CG
AA
GG
AA
GG
14
V000071
FAM09640
0.8
NaN
GG
AA
AA
GG
NN
AA
...
GG
TT
AA
AA
AA
GG
CC
GG
AA
GG
15
V000075
FAM09630
2.6
NaN
GG
TT
GG
AA
AA
GG
...
AA
AA
GG
AA
GG
GG
CC
GG
AA
GG
16
V000077
FAM09631
2.2
NaN
GG
TT
GG
NN
AA
GG
...
GG
TT
AA
AA
AG
GG
CC
GG
AA
GG
17
V000078
FAM00498
1.3
NaN
GG
NN
AA
NN
AC
NN
...
GG
TT
AA
AA
AA
GG
CC
GG
AA
GG
18
V000079
FAM13255
2.5
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
19
V000080
FAM13255
2.5
NaN
NN
NN
AA
NN
AA
AA
...
NN
TT
NN
NN
NN
NN
AA
NN
AA
GG
20
V000081
FAM13252
2.9
NaN
GG
AA
AA
NN
CC
AA
...
NN
TT
AA
NN
AA
NN
AA
NN
GG
AA
21
V000082
FAM13251
3.1
NaN
NN
NN
AA
NN
AC
AA
...
NN
AA
NN
NN
NN
NN
AA
NN
AA
GG
22
V000092
FAM13250
3.3
NaN
GG
AA
AA
AA
AA
GG
...
AA
AA
AA
AA
AA
GG
AA
AA
GG
AA
23
V000096
FAM05627
4.8
NaN
GG
AA
AA
NN
CC
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
24
V000098
FAM05628
3.8
NaN
GG
AA
AA
GG
NN
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
25
V000110
FAM09641
0.9
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
26
V000115
FAM00499
1.3
NaN
GG
AA
AA
AA
AA
AA
...
AG
AT
AG
AA
AG
CG
AA
AG
AG
AG
27
V000122
FAM13255
2.1
NaN
NN
NN
AA
NN
AA
AA
...
NN
TT
NN
NN
NN
NN
AA
NN
AA
GG
28
V000123
FAM13249
1.9
NaN
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
29
V000124
FAM13254
2.5
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
GG
AA
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1497
V156772
FAM06485
3.0
2013.0
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1498
V156773
FAM11197
3.1
NaN
GG
TT
GG
AA
AA
GG
...
AA
AA
GG
AA
AA
GG
CC
AA
NN
AA
1499
V156774
FAM14774
3.5
2013.0
GG
AA
AA
AA
AA
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1500
V156776
FAM06480
2.6
NaN
AG
AT
NN
AG
NN
AA
...
AA
AA
GG
AA
GG
GG
CC
GG
GG
GG
1501
V156777
FAM06486
3.2
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1502
V156779
FAM06485
3.1
NaN
GG
AA
AA
AA
NN
AA
...
GG
TT
AA
AA
NN
NN
AA
GG
AA
GG
1503
V156782
FAM06489
3.4
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1504
V156783
FAM11183
2.4
2013.0
GG
TT
GG
AA
AA
GG
...
GG
TT
AA
AA
AA
GG
AA
GG
AA
GG
1505
V156784
FAM07969
2.4
NaN
GG
AT
NN
AG
AC
AG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1506
V156786
FAM11169
2.1
2013.0
GG
NN
AA
AA
AA
AG
...
GG
TT
AA
AA
AG
CG
AC
GG
AA
GG
1507
V156789
FAM12500
2.8
NaN
GG
AA
AA
GG
NN
AA
...
GG
TT
AA
AA
NN
NN
CC
GG
AA
GG
1508
V156790
FAM07967
1.9
NaN
GG
AA
AA
AA
NN
AA
...
GG
TT
AA
AA
NN
NN
AA
GG
AG
AG
1509
V156791
FAM11185
2.2
NaN
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1510
V156792
FAM11185
2.0
NaN
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1511
V156793
FAM11185
2.8
NaN
GG
AA
AA
GG
NN
AA
...
GG
TT
AA
AA
NN
NN
AA
GG
AA
GG
1512
V156795
FAM11188
2.0
NaN
GG
AA
AA
GG
CC
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1513
V156797
FAM06502
3.2
2013.0
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1514
V156798
FAM11170
3.1
NaN
GG
AA
AA
GG
CC
AA
...
GG
AT
NN
AA
AG
CG
AC
GG
AA
GG
1515
V156801
FAM06483
3.1
NaN
GG
AA
AA
AA
AA
AA
...
AG
AT
AA
AA
GG
CC
AA
GG
GG
GG
1516
V156802
FAM11170
3.1
NaN
GG
TT
GG
AA
AA
GG
...
GG
TT
AA
AA
NN
NN
AC
GG
AA
GG
1517
V156803
FAM06465
3.6
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1518
V156804
FAM11189
2.8
NaN
GG
AA
AA
GG
CC
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1519
V156806
FAM11189
2.6
2013.0
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1520
V156807
FAM11189
2.7
2013.0
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1521
V156822
FAM13790
3.0
NaN
GG
AA
AA
AA
AA
GG
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1522
V156824
FAM06553
2.7
NaN
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1523
V156826
FAM06553
2.9
NaN
GG
AA
AA
GG
CC
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1524
V156827
FAM06553
3.1
NaN
GG
AA
AA
AA
AA
AA
...
GG
TT
AA
AA
GG
CC
AA
GG
AA
GG
1525
V156829
FAM13825
3.1
NaN
GG
TT
GG
AA
AA
GG
...
AG
AT
AG
AA
AA
GG
CC
GG
AA
GG
1526
V156831
FAM16847
1.4
NaN
NN
NN
AA
NN
AC
NN
...
NN
AA
NN
NN
NN
NN
CC
NN
AG
GG
1527 rows × 2167 columns
In [26]:
# statistic by LOCATION
loc_yield = soy[['LOCATION', 'YIELD']]
loc_group = loc_yield.groupby('LOCATION')
loc_group['YIELD'].agg([np.mean, np.sum, np.std]).sort_values('mean', ascending=False)
Out[26]:
mean
sum
std
LOCATION
2201
78.301739
3993.388710
5.145056
4504
75.448345
4677.797360
3.731417
5220
75.292989
5571.681210
8.385741
4503
72.160118
4473.927310
9.821378
3271
72.096298
10381.866970
6.366609
3501
70.014830
40188.512300
10.616583
2212
69.998242
12669.681880
5.389066
2320
69.697813
270427.514240
9.831696
2200
67.798544
234650.761439
9.016736
2460
67.640409
150973.392250
9.611597
3410
67.570324
209400.432890
11.996355
3135
67.313502
1615.524040
3.505275
2360
66.911135
10638.870430
9.357403
3315
66.788559
14292.751686
4.206028
6530
66.506254
7182.675420
6.355477
3170
66.135319
2182.465540
4.632271
3140
66.015363
193491.027653
10.179465
3491
65.941106
13715.750090
6.056859
4370
65.772947
38016.763550
10.065248
3270
65.630892
220191.642277
9.217957
2170
65.322040
235877.888230
8.817091
3340
64.935833
333120.825640
8.803889
2450
64.353384
176843.099240
8.267672
3330
64.083623
158606.967660
7.170925
2403
63.992507
24253.160090
7.112705
2230
63.602873
241690.918941
7.632674
1301
63.596777
13736.903780
5.032112
3130
62.641049
203896.613420
8.222703
4360
62.054615
63481.870760
9.888912
3120
62.010620
472210.872451
12.323610
...
...
...
...
3503
49.270781
1182.498750
3.477881
2309
49.101961
33143.823500
13.774628
2291
48.955655
10574.421546
5.879348
5340
48.659194
17225.354630
6.974296
3490
48.566143
472888.536170
9.077163
2308
48.453637
42590.746550
6.953115
130
48.452517
5523.586950
12.503065
2111
48.009828
1728.353796
6.291308
2440
47.960011
272604.705233
10.298750
1210
47.523247
15730.194650
10.804707
3311
47.051486
10163.121057
6.025748
4310
46.650821
117979.926637
9.976808
2401
46.069286
17229.912800
5.454570
4320
46.025227
66092.225633
17.233349
1270
45.272706
40609.617150
9.770910
1290
44.733531
1610.407121
3.448338
20
44.573981
1738.385270
6.328853
1102
44.198336
3138.081870
6.106025
2290
44.088851
456143.250798
8.863546
4312
43.875213
4738.522980
6.599347
107
42.676117
2816.623740
5.296383
119
42.380966
3051.429570
7.094656
110
42.016535
6092.397590
7.861024
140
41.096731
4438.447000
5.214731
2110
39.762752
63580.640033
11.413527
10
37.837604
9989.127390
7.823764
30
36.172968
2676.799650
3.980278
4260
35.422900
56853.755080
13.789012
101
34.584116
2490.056360
4.616979
150
27.593854
6953.651096
6.310835
119 rows × 3 columns
In [27]:
# statistic by VARIETY
var_yield = soy[['VARIETY', 'YIELD']]
var_group = var_yield.groupby('VARIETY')
var_group['YIELD'].agg([np.mean, np.size, np.std]).sort_values('mean', ascending=False)
Out[27]:
mean
size
std
VARIETY
V013529
83.877361
2.0
2.447611
V155828
83.299809
2.0
12.797438
V013501
81.870800
2.0
0.814889
V013012
81.176295
2.0
7.725253
V001109
79.982753
2.0
4.506178
V030243
79.862305
2.0
0.790495
V013995
78.749804
2.0
6.603335
V013528
78.664004
2.0
4.974819
V155893
78.582516
2.0
4.522897
V000135
78.514449
4.0
3.766403
V013063
78.049798
2.0
4.655697
V013064
77.859015
2.0
3.339468
V013525
77.822509
2.0
1.949320
V000928
77.663103
2.0
1.432207
V013065
77.643623
2.0
3.168183
V155913
77.069937
2.0
12.637719
V013019
76.730603
2.0
2.769045
V155924
76.393277
2.0
3.369750
V000838
76.091564
2.0
3.902635
V013405
76.076619
2.0
2.917092
V030250
75.951934
2.0
3.158615
V025778
75.889577
3.0
5.764761
V013378
75.834684
2.0
4.162243
V013018
75.763903
2.0
4.748542
V155944
75.618994
2.0
0.944640
V013016
75.589180
2.0
0.386625
V013215
75.434904
2.0
2.209138
V030233
75.394234
2.0
5.528312
V014270
75.304642
2.0
5.977395
V001010
75.214084
2.0
0.332054
...
...
...
...
V064022
36.837006
9.0
17.839807
V054905
36.747257
3.0
10.687844
V055313
36.719525
4.0
9.705256
V030565
36.629226
4.0
2.832648
V083010
36.614779
10.0
14.456018
V055314
36.363835
4.0
6.092611
V045448
36.315656
10.0
10.974832
V000964
36.301602
2.0
2.932970
V030535
36.189482
2.0
1.579307
V062293
35.733125
2.0
3.332106
V062295
35.707320
2.0
4.689872
V062304
35.554455
2.0
13.319070
V069310
35.551013
3.0
20.760367
V062268
35.284715
2.0
8.545605
V068794
35.067658
10.0
6.093481
V062286
34.985010
2.0
18.830777
V083330
34.676037
6.0
8.489889
V069282
34.548412
10.0
16.209401
V062297
34.545750
2.0
1.626204
V054959
34.439923
3.0
16.850776
V069318
34.155809
9.0
11.366702
V064000
34.004927
9.0
13.311731
V061199
33.843841
7.0
10.553614
V062305
32.688335
2.0
0.260138
V020399
31.833382
2.0
5.528312
V045678
31.775185
8.0
7.445084
V055380
31.708212
4.0
12.221417
V062274
31.423265
2.0
14.741076
V026494
29.611060
2.0
8.433521
V055335
28.593227
3.0
23.054366
13962 rows × 3 columns
In [28]:
# statistic by FAMILY
fam_yield = soy[['FAMILY', 'YIELD']]
fam_group = fam_yield.groupby('FAMILY')
fam_group['YIELD'].agg([np.mean, np.size, np.std]).sort_values('mean', ascending=False)
Out[28]:
mean
size
std
FAMILY
FAM13433
76.549003
4.0
1.991346
FAM13511
76.393277
2.0
3.369750
FAM13542
75.304642
2.0
5.977395
FAM06633
75.211288
5.0
10.771322
FAM13533
75.140068
8.0
5.599808
FAM13363
74.214966
2.0
12.169710
FAM13447
73.350795
4.0
1.607920
FAM13260
73.272318
8.0
7.129478
FAM16817
72.562832
2.0
7.735137
FAM13604
72.138541
6.0
7.706760
FAM13565
71.415981
2.0
3.891910
FAM06564
70.763013
6.0
11.983282
FAM13378
70.227494
2.0
2.351191
FAM00604
69.924784
52.0
11.486684
FAM15620
69.788977
2.0
0.253720
FAM06591
69.651164
21.0
8.818803
FAM13519
69.594127
14.0
10.797108
FAM13531
69.404067
4.0
7.245544
FAM13383
69.261537
2.0
7.517062
FAM15662
69.250385
2.0
1.579202
FAM06587
69.148943
13.0
5.800448
FAM13516
69.011748
22.0
4.438088
FAM15408
68.843690
2.0
3.759109
FAM01224
68.698299
8.0
14.634540
FAM15579
68.626127
52.0
12.381804
FAM13517
68.508296
16.0
5.847764
FAM15625
68.257848
4.0
2.957638
FAM05491
68.179520
52.0
10.289500
FAM13518
67.937324
8.0
2.982530
FAM01258
67.935132
10.0
13.403011
...
...
...
...
FAM14541
42.466886
7.0
8.680450
FAM14682
42.329873
4.0
1.860457
FAM11160
42.310940
45.0
9.254989
FAM03574
42.297385
4.0
9.420989
FAM01424
42.108676
23.0
14.814461
FAM07915
42.081901
20.0
9.525611
FAM05298
42.025284
10.0
14.590236
FAM14724
41.997286
9.0
11.520502
FAM15671
41.886338
2.0
0.790495
FAM12520
41.720351
8.0
2.676230
FAM09640
41.685828
38.0
9.383129
FAM06448
41.663294
21.0
13.839497
FAM14735
41.568481
16.0
8.449816
FAM09955
41.432761
32.0
9.751457
FAM12774
41.366368
9.0
5.580799
FAM14740
41.077079
14.0
14.250639
FAM07971
40.844196
15.0
8.667220
FAM10405
40.654121
7.0
9.790113
FAM03584
40.370693
21.0
10.519378
FAM06631
40.242110
9.0
21.813354
FAM10402
39.989984
7.0
9.075769
FAM14730
39.936545
4.0
6.864171
FAM13861
39.728362
8.0
12.150156
FAM03772
39.191015
8.0
8.476899
FAM14746
38.958430
2.0
4.614268
FAM08150
38.310615
6.0
10.873464
FAM03587
37.306220
4.0
18.989511
FAM14395
34.004927
9.0
13.311731
FAM01421
33.843841
7.0
10.553614
FAM12785
28.593227
3.0
23.054366
1691 rows × 3 columns
In [29]:
# statistic by both FAMILY and VARIETY
fam_var_yield = soy[['FAMILY', 'VARIETY', 'YIELD']]
fam_var_group = fam_var_yield.groupby(['FAMILY', 'VARIETY'])
fam_var_group.describe().unstack().to_csv('fam_var.csv')
In [31]:
# general geo statistic helper methods
def geo_statis(keys, year=2014):
geo_key = geo[['LOCATION', 'AREA'] + keys]
year_yield = soy[soy['YEAR'] == year][['LOCATION', 'YIELD']]
year_yield = year_yield.groupby('LOCATION', as_index=False).agg(np.sum)
key_yield = pd.merge(geo_key, year_yield, on='LOCATION')
# get YIELD per AREA
key_yield['AVE_YIELD'] = key_yield.apply(lambda row: (row['YIELD'] / row['AREA']), axis=1)
return key_yield
In [39]:
# plot corresponding feature with annual average yield in each geo location
geo_yield = geo_statis(['PH', 'CEC', 'ORGANIC_MATTER', 'TEMP_14', 'PREC_14', 'RAD_14'])
geo_yield.plot(x='PH', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='CEC', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='ORGANIC_MATTER', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='TEMP_14', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='PREC_14', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='RAD_14', y='AVE_YIELD', kind='scatter')
plt.show()
In [31]:
# Lower bound for Training MSE
feature_group = ['LOCATION', 'YEAR', 'VARIETY']
group_mean = soy.groupby(feature_group)['YIELD'].mean()
test_feature = feature_group + ['YIELD']
test_x = soy[test_feature].as_matrix()
def square_error(item):
actual = item[-1]
pred = group_mean.get(tuple(item[:-1]))
return (actual - pred) * (actual - pred)
se = [square_error(row) for row in test_x]
se
np.mean(se)
Out[31]:
16.073080825619961
In [32]:
group_mean
Out[32]:
LOCATION YEAR VARIETY
10 2011 V000070 28.990160
V000071 20.546365
V000110 26.259500
V055643 41.916520
V055644 41.333920
V055645 40.463280
V055646 39.108020
V055647 36.687930
V055648 36.301600
V055649 41.238010
V055650 41.432210
V055651 42.883370
V055652 42.302850
V055653 39.398290
V055655 33.493100
V055657 39.496430
V055659 40.851680
V055660 43.077580
V055661 33.397040
V055662 47.627510
V055663 42.012580
V055664 39.108020
V055665 39.688550
V055666 38.237380
V055667 44.626880
V055668 37.366590
V055669 50.047600
V055671 30.590770
V055672 45.013210
V055674 39.398290
...
6550 2011 V103161 50.469220
V103313 54.008087
V103318 58.705130
V112159 63.751423
V112167 59.405957
V120047 61.403300
V120205 62.280283
V120304 53.272467
V120395 65.854640
V120396 63.997127
V136828 59.125460
V136833 51.239583
V136874 57.162913
V137061 63.296303
V137068 66.906247
V137079 55.761213
V137097 57.092527
V137105 61.543920
V137136 61.368503
V137154 53.308057
V137222 58.810213
V137226 66.730783
V151692 64.171750
V151854 54.463950
V151855 63.822503
V151858 56.987497
V151867 58.950043
V155250 61.719287
V155251 57.933973
V155918 60.317637
Name: YIELD, dtype: float64
In [ ]:
Content source: danielwpz/soybean
Similar notebooks: