In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# read files
soy = pd.read_csv('dataset/yield.csv')
geo = pd.read_csv('dataset/geo.csv')
gene = pd.read_csv('dataset/gene.csv')

In [23]:
soy


Out[23]:
EXPERIMENT YEAR LOCATION VARIETY FAMILY CHECK RM REPNO YIELD CLASS_OF
0 09YT000052 2009 3310 V000016 FAM05619 True 3.9 1 62.548835 0
1 09YT000052 2009 3310 V000016 FAM05619 True 3.9 2 60.315357 0
2 09YT000052 2009 3310 V000018 FAM05619 True 3.7 1 65.900242 0
3 09YT000052 2009 3310 V000018 FAM05619 True 3.7 2 61.433286 0
4 09YT000052 2009 3310 V000039 FAM05625 True 3.4 1 68.133720 0
5 09YT000052 2009 3310 V000039 FAM05625 True 3.4 2 60.315357 0
6 09YT000052 2009 3310 V000082 FAM13251 True 3.1 1 60.315357 0
7 09YT000052 2009 3310 V000082 FAM13251 True 3.1 2 65.900242 0
8 09YT000052 2009 3310 V030090 FAM13986 False 3.2 1 54.730473 0
9 09YT000052 2009 3310 V030090 FAM13986 False 3.2 2 61.433286 0
10 09YT000052 2009 3310 V030093 FAM13986 False 3.5 1 53.612544 0
11 09YT000052 2009 3310 V030093 FAM13986 False 3.5 2 53.612544 0
12 09YT000052 2009 3310 V030101 FAM13986 False 3.5 1 58.081879 0
13 09YT000052 2009 3310 V030101 FAM13986 False 3.5 2 62.548835 0
14 09YT000052 2009 3310 V030102 FAM13986 False 3.4 1 62.548835 0
15 09YT000052 2009 3310 V030102 FAM13986 False 3.4 2 61.433286 0
16 09YT000052 2009 3310 V030105 FAM13986 False 3.3 1 68.133720 0
17 09YT000052 2009 3310 V030105 FAM13986 False 3.3 2 61.433286 0
18 09YT000052 2009 3310 V030107 FAM13991 False 3.5 1 59.197429 0
19 09YT000052 2009 3310 V030107 FAM13991 False 3.5 2 59.197429 0
20 09YT000052 2009 3310 V030110 FAM13991 False 3.4 1 67.018171 0
21 09YT000052 2009 3310 V030110 FAM13991 False 3.4 2 63.666764 0
22 09YT000052 2009 3310 V030116 FAM13991 False 3.7 1 59.197429 0
23 09YT000052 2009 3310 V030116 FAM13991 False 3.7 2 62.548835 0
24 09YT000052 2009 3310 V030134 FAM13991 False 3.3 1 63.666764 0
25 09YT000052 2009 3310 V030134 FAM13991 False 3.3 2 63.666764 0
26 09YT000052 2009 3310 V030137 FAM13991 False 3.5 1 70.367050 0
27 09YT000052 2009 3310 V030137 FAM13991 False 3.5 2 63.666764 0
28 09YT000052 2009 3310 V030139 FAM13991 False 3.5 1 69.251649 0
29 09YT000052 2009 3310 V030139 FAM13991 False 3.5 2 62.548835 0
... ... ... ... ... ... ... ... ... ... ...
172027 14YT005707 2014 3340 V152053 FAM14333 True 3.0 1 59.016460 2013
172028 14YT005707 2014 3340 V152053 FAM14333 True 3.0 2 61.033130 2013
172029 14YT005707 2014 3340 V152079 FAM11179 True 2.7 1 52.145320 2013
172030 14YT005707 2014 3340 V152079 FAM11179 True 2.7 2 62.416490 2013
172031 14YT005707 2014 3340 V155853 FAM13521 True 2.7 1 60.408890 2011
172032 14YT005707 2014 3340 V155853 FAM13521 True 2.7 2 71.876060 2011
172033 14YT005707 2014 3340 V156314 FAM12531 True 2.8 1 47.151370 2012
172034 14YT005707 2014 3340 V156314 FAM12531 True 2.8 2 51.038540 2012
172035 14YT005707 2014 3340 V156553 FAM14238 True 2.9 1 51.976100 2013
172036 14YT005707 2014 3340 V156553 FAM14238 True 2.9 2 65.448490 2013
172037 14YT005707 2014 3370 V152053 FAM14333 True 3.0 1 54.665040 2013
172038 14YT005707 2014 3370 V152053 FAM14333 True 3.0 2 53.002720 2013
172039 14YT005707 2014 3370 V152079 FAM11179 True 2.7 1 57.845590 2013
172040 14YT005707 2014 3370 V152079 FAM11179 True 2.7 2 58.819730 2013
172041 14YT005707 2014 3370 V155853 FAM13521 True 2.7 1 52.049260 2011
172042 14YT005707 2014 3370 V155853 FAM13521 True 2.7 2 54.424890 2011
172043 14YT005707 2014 3370 V156314 FAM12531 True 2.8 1 51.607920 2012
172044 14YT005707 2014 3370 V156314 FAM12531 True 2.8 2 57.927970 2012
172045 14YT005707 2014 3370 V156553 FAM14238 True 2.9 1 65.409680 2013
172046 14YT005707 2014 3370 V156553 FAM14238 True 2.9 2 55.609440 2013
172047 14YT005707 2014 3490 V152053 FAM14333 True 3.0 1 62.554490 2013
172048 14YT005707 2014 3490 V152053 FAM14333 True 3.0 2 59.494380 2013
172049 14YT005707 2014 3490 V152079 FAM11179 True 2.7 1 50.660990 2013
172050 14YT005707 2014 3490 V152079 FAM11179 True 2.7 2 64.556290 2013
172051 14YT005707 2014 3490 V155853 FAM13521 True 2.7 1 57.359940 2011
172052 14YT005707 2014 3490 V155853 FAM13521 True 2.7 2 57.215110 2011
172053 14YT005707 2014 3490 V156314 FAM12531 True 2.8 1 69.472020 2012
172054 14YT005707 2014 3490 V156314 FAM12531 True 2.8 2 64.089670 2012
172055 14YT005707 2014 3490 V156553 FAM14238 True 2.9 1 72.767820 2013
172056 14YT005707 2014 3490 V156553 FAM14238 True 2.9 2 60.186730 2013

172057 rows × 10 columns


In [24]:
geo


Out[24]:
LOCATION LATITUDE LONGITUDE FIPS AREA IRRIGATION TEMP_01 TEMP_02 TEMP_03 TEMP_04 ... RAD_14 RAD_13.1 RAD_14.1 CEC PH ORGANIC_MATTER CLAY SILT_TOP SAND_TOP AWC_100CM
0 10 46.501212 -96.338590 27167 7992.5 831.652312 2929.7 2822.0 2832.7 2612.9 ... 1100823 1015998 992691 26.655599 7.222128 6.803356 22.303241 35.982133 41.781395 21.478885
1 20 48.481819 -97.354542 38099 2102.8 56.178384 2757.8 2567.5 2739.9 2332.2 ... 1056317 1023342 974648 29.820095 7.589446 7.175127 31.485822 46.015191 22.490958 25.397483
2 30 47.383756 -96.362880 27107 6420.4 20.234374 2919.2 2775.3 2862.9 2528.7 ... 1080511 1011937 974301 26.386140 7.328356 6.924282 19.525246 25.714048 54.747396 18.263845
3 101 47.908480 -97.626370 38035 5189.8 454.833566 2829.6 2652.6 2784.9 2422.6 ... 1072692 1029851 977706 27.709274 7.314308 6.861001 23.775101 33.212891 42.986473 21.693859
4 103 46.977000 -96.884000 38017 12774.4 201.405003 2977.4 2852.1 2901.7 2625.5 ... 1099641 1018317 988927 32.209780 7.344828 7.484837 41.345197 42.814670 15.911892 27.676606
5 107 46.908000 -98.709000 38093 4048.6 115.384619 2914.4 2789.7 2860.3 2579.4 ... 1103974 1046380 1003437 25.642289 7.104803 5.968004 23.659071 34.745153 41.614005 21.378516
6 110 47.114430 -97.131311 38017 13618.6 256.113597 2977.4 2852.1 2901.7 2625.5 ... 1099641 1018317 988927 28.207248 7.323727 7.122696 30.040148 35.409505 34.561415 24.018750
7 119 45.543370 -96.435570 27011 9093.1 188.308826 3073.8 3007.8 2983.2 2826.0 ... 1108340 1031682 1015075 29.832679 7.188441 7.167354 30.999855 39.053037 30.019108 24.644398
8 120 46.300390 -96.481110 27167 9245.7 190.189566 3034.4 2939.4 2935.2 2736.5 ... 1106461 1014054 1002638 29.574653 7.383941 6.698668 32.118056 34.002604 33.870587 24.463845
9 130 48.137077 -97.573581 38035 4860.3 427.861246 2829.6 2652.6 2784.9 2422.6 ... 1072692 1029851 977706 28.903212 7.419958 6.853100 25.350694 33.128906 41.495226 22.431814
10 140 45.314360 -98.298900 46013 8651.3 234.629694 3178.2 3153.7 3126.2 2958.0 ... 1142812 1077983 1048878 27.307198 6.951561 5.989278 25.472148 49.133363 25.414725 24.106608
11 150 46.202694 -98.239100 38021 7690.1 596.776817 3027.1 2954.3 2981.4 2746.5 ... 1121718 1048643 1024703 26.957535 7.003484 6.298501 24.167102 36.957535 38.861135 22.186890
12 160 47.351845 -99.475466 38103 9482.9 33.397585 2856.8 2697.5 2810.7 2494.9 ... 1081757 1050316 1011466 25.457031 7.031474 6.220953 25.044560 36.189453 38.683521 22.354102
13 1102 46.499750 -97.578490 38073 9268.8 1275.500954 3038.7 2943.5 2965.2 2724.5 ... 1112523 1029232 1012789 25.136548 7.078729 6.359579 23.057703 34.114755 42.801622 20.991182
14 1110 44.309980 -96.705308 46011 5092.6 789.362212 3208.2 3180.3 3126.5 3026.4 ... 1149779 1062451 1036223 27.102865 6.796730 6.349816 24.457972 41.334997 34.166522 22.973416
15 1130 44.935970 -98.393310 46115 7413.3 664.385747 3267.1 3262.5 3218.2 3069.3 ... 1149209 1085817 1058598 27.291794 6.901997 5.611397 25.629412 49.651634 24.664706 24.378083
16 1140 44.348412 -96.634137 46011 6319.8 948.247934 3157.6 3137.9 3082.1 2987.2 ... 1148611 1058003 1034445 27.426505 6.764019 6.074994 24.948061 44.301143 30.841869 23.508854
17 1210 44.717000 -94.086000 27143 7201.4 33.303463 3124.1 3096.0 3089.4 2980.3 ... 1119820 1020404 1000506 28.921152 6.617180 7.626366 26.607350 37.865741 35.574797 23.595226
18 1220 44.483000 -93.035000 27049 4406.2 142.675785 3172.3 3138.2 3126.1 3026.2 ... 1115279 1000423 995052 26.025365 6.178823 6.803279 20.850073 38.459942 40.704167 21.853048
19 1250 44.089000 -93.303000 27147 5480.1 62.634715 3120.3 3126.4 3087.1 3022.5 ... 1127106 1022768 998548 30.351997 6.215126 7.539843 25.572700 43.866753 30.560547 24.642687
20 1260 43.688810 -96.286160 27133 7653.3 108.258660 3189.0 3201.1 3116.2 3047.3 ... 1160481 1054856 1031275 27.297671 6.476338 5.662898 26.234158 48.901910 24.833333 24.465885
21 1270 44.643000 -95.156000 27127 6997.8 27.144595 3140.0 3125.3 3099.3 2979.9 ... 1126041 1035417 1002471 26.269748 6.827640 6.528933 25.591073 36.268012 38.143953 22.656272
22 1280 44.923630 -95.983470 27073 8155.1 267.142745 3120.0 3071.6 3042.7 2905.6 ... 1115487 1031477 1010586 28.927517 7.071832 6.562415 28.440611 38.973235 32.669126 24.088759
23 1290 43.757000 -92.872000 27099 6472.2 210.585953 3128.7 3139.9 3083.9 3034.6 ... 1130376 1022627 999616 28.694517 6.090791 7.233593 23.257957 47.151982 29.602865 23.748958
24 1301 43.307700 -89.323430 55021 2856.1 209.094754 3203.5 3200.3 3023.0 3036.3 ... 1128769 1012620 1031580 23.702722 6.126796 8.143973 21.061396 52.044237 26.935491 23.242420
25 1320 42.979716 -89.449691 55025 2156.2 225.682054 3279.5 3275.4 3106.3 3124.7 ... 1132342 1014960 1032072 23.763310 6.200955 7.813844 20.289352 51.361834 28.372830 23.030469
26 1402 43.507340 -83.523320 26157 7189.9 250.272799 2972.0 2990.2 2714.7 2799.4 ... 1110759 1017373 1041977 23.011212 6.628805 7.814006 22.881583 37.460142 39.638817 21.558485
27 1410 43.173979 -84.787589 26057 5669.8 467.131812 3088.4 3133.2 2920.8 3000.6 ... 1108274 1020250 1027014 22.447917 6.367093 9.740458 21.438006 35.346427 43.180990 20.807444
28 2110 43.319600 -96.568820 46083 6760.4 120.261918 3319.5 3340.1 3220.8 3171.9 ... 1179091 1054586 1040005 27.936053 6.615104 4.423613 26.624349 44.908709 28.442419 24.199002
29 2111 43.319600 -96.568820 46083 6760.4 120.261918 3319.5 3340.1 3220.8 3171.9 ... 1179091 1054586 1040005 27.936053 6.615104 4.423613 26.624349 44.908709 28.442419 24.199002
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
89 3420 39.323490 -85.233553 18137 3497.9 17.129785 3676.2 3819.9 3497.3 3644.8 ... 1132126 1063793 1077948 18.956670 5.843229 4.390081 19.402995 61.038050 19.528935 23.085026
90 3490 40.899287 -86.814469 18181 6426.6 192.953296 3453.0 3482.7 3268.5 3348.1 ... 1149816 1060328 1056727 19.566045 6.176917 5.671409 15.652633 27.064525 57.282335 17.002778
91 3491 40.899287 -86.814469 18181 6426.6 192.953296 3453.0 3482.7 3268.5 3348.1 ... 1149816 1060328 1056727 19.566045 6.176917 5.671409 15.652633 27.064525 57.282335 17.002778
92 3501 40.125880 -76.534230 42071 1073.8 596.548113 3723.3 3861.6 3558.5 3757.9 ... 1058000 1044000 1072968 19.599710 5.963137 5.472535 20.728395 54.385403 24.856064 22.873704
93 3503 40.125870 -76.534340 42071 1073.8 596.548113 3723.3 3861.6 3558.5 3757.9 ... 1058000 1044000 1072968 19.599710 5.963137 5.472535 20.728395 54.385403 24.856064 22.873704
94 4104 39.010983 -95.210466 20045 4722.5 190.911028 4049.4 4029.1 3891.9 3815.0 ... 1179171 1080636 1116892 22.894242 6.499313 3.997221 23.591942 54.151910 22.282190 23.752799
95 4108 39.805992 -95.261603 20043 9546.9 177.615088 3854.1 3871.6 3728.6 3665.6 ... 1184089 1079015 1112949 26.337167 6.267969 3.531338 26.081742 61.863498 12.046224 25.126845
96 4110 39.784210 -97.866910 20157 5759.3 4111.376335 3876.7 3978.3 3751.6 3770.7 ... 1217500 1117055 1147378 23.202257 6.453711 3.052826 21.578704 50.681279 27.755136 23.025694
97 4120 39.747108 -95.470291 20013 8446.3 110.484664 3848.3 3869.8 3725.1 3662.6 ... 1187499 1082148 1116161 27.320964 6.285648 3.535033 28.220631 58.249060 13.506149 25.616840
98 4215 40.569134 -95.694995 19071 7635.0 681.312339 3779.8 3827.9 3665.8 3632.0 ... 1196469 1074075 1104333 27.944251 6.697676 3.853998 26.140023 55.525919 18.320374 24.817159
99 4220 38.815030 -90.592700 29183 2505.9 123.239327 3988.3 4078.8 3837.3 3800.5 ... 1164382 1070697 1093418 21.866319 6.501440 4.424011 22.712095 58.599248 18.633825 23.809570
100 4260 40.160670 -92.057200 29103 5382.6 31.564411 3760.1 3759.3 3637.1 3603.5 ... 1157662 1073014 1056834 25.205078 5.976924 4.041788 26.602937 58.694806 14.610026 25.771376
101 4270 39.185239 -93.324883 29195 9507.9 182.632845 3948.1 3944.1 3808.3 3756.7 ... 1166912 1081778 1085627 24.522642 6.443902 3.781648 23.424117 62.754630 13.753255 24.496202
102 4310 38.736200 -89.596980 17119 4763.7 104.707088 3999.2 4130.6 3851.8 3840.5 ... 1161294 1062226 1108868 20.155888 6.258608 4.017265 19.458984 72.323061 7.991175 23.981228
103 4312 38.830010 -90.037900 17119 2803.3 70.842181 3965.0 4075.0 3819.1 3788.5 ... 1164547 1064169 1104253 19.728082 6.281380 3.789960 20.059968 65.455584 14.367549 23.461914
104 4320 39.140200 -87.860000 17033 7761.1 623.312683 3885.1 4038.2 3720.4 3774.2 ... 1159041 1059175 1086454 20.061343 6.059295 3.831641 19.669777 56.416088 23.963614 23.069878
105 4330 38.495000 -87.727400 17185 6350.3 1469.001391 4009.7 4174.2 3845.0 3899.2 ... 1160654 1058236 1098336 19.990656 6.098254 4.293061 20.680936 57.564537 21.725699 23.091504
106 4360 39.202040 -89.827130 17117 5678.6 35.699188 3852.5 3960.1 3704.3 3694.0 ... 1165577 1068931 1091194 22.157769 6.085475 3.762400 20.287977 69.740885 9.805266 24.536806
107 4370 38.484790 -89.742030 17027 7208.1 130.756720 3999.5 4125.1 3851.6 3835.9 ... 1160997 1062053 1111071 20.444083 6.312001 3.991933 20.109230 69.956814 9.771340 24.010981
108 4501 39.610970 -75.391980 34033 4802.5 2352.601418 3813.4 3936.3 3642.8 3827.1 ... 1070288 1071220 1091680 16.448016 5.244490 7.353585 13.377498 47.019476 39.525702 20.002976
109 4503 38.992264 -76.143491 24035 2329.9 872.796547 3785.1 3961.6 3653.2 3842.6 ... 1106452 1083053 1117816 14.713976 5.174819 7.175737 11.376374 45.319878 43.055411 18.938498
110 4504 39.362128 -75.836019 24015 2336.7 900.479957 3839.5 3961.8 3665.6 3847.9 ... 1069374 1067801 1094285 17.459655 5.267529 7.178402 13.020716 48.955889 38.091192 20.103433
111 5210 37.889510 -89.830430 17157 3739.8 73.342896 4038.2 4131.5 3892.6 3864.8 ... 1155549 1061590 1115782 19.181517 5.803161 4.650474 20.106455 62.624761 17.167328 23.519938
112 5220 37.147997 -89.780363 29207 9736.4 10090.705750 4211.9 4191.0 4057.5 4049.1 ... 1161432 1070630 1102711 18.224392 5.964569 4.880537 20.614945 55.379413 24.001085 22.750217
113 5240 35.789506 -90.535658 5031 7138.2 15068.233260 4342.0 4288.1 4182.8 4161.2 ... 1163163 1088825 1112485 15.023582 6.103371 4.162950 16.251374 42.369792 41.500289 20.284266
114 5340 37.741470 -88.219900 17059 3597.7 1769.231583 4096.0 4191.5 3930.5 3972.0 ... 1157357 1056117 1109931 16.931450 5.923573 4.914770 19.293912 61.731056 18.932691 22.722580
115 5410 37.732700 -87.184860 21059 6110.4 341.723997 4057.8 4162.6 3891.8 3949.1 ... 1149309 1048094 1102341 16.712167 5.837854 4.958798 17.089193 57.426577 25.463903 21.843880
116 6510 35.652100 -76.553500 37095 4878.8 603.696746 4066.9 4214.5 4058.5 4191.6 ... 1048570 1032294 1053288 27.879051 5.277074 12.225963 19.396447 40.531891 40.059025 23.972948
117 6530 35.262400 -77.817400 37107 2868.7 489.568647 4147.1 4288.3 4144.9 4277.3 ... 1040903 1034525 1077398 14.394748 5.245334 5.587760 15.159722 40.641710 44.129557 20.141168
118 6550 36.264200 -76.251700 37139 6018.4 300.925987 3951.9 4102.8 3875.7 4053.2 ... 1069212 1055486 1066473 20.756655 5.254557 8.446850 16.555845 41.410590 41.942057 21.564280

119 rows × 57 columns


In [25]:
gene


Out[25]:
VARIETY FAMILY RM CLASS_OF X2 X7 X9 X10 X11 X23 ... X6045 X6046 X6049 X6054 X6056 X6058 X6059 X6060 X6061 X6062
0 V000016 FAM05619 3.9 NaN GG AA AA AA AA GG ... GG TT AA AA GG CC AA GG AA GG
1 V000017 FAM05619 3.7 NaN GG AA AA NN AA GG ... GG TT AA AA NN CC AA GG AG GG
2 V000018 FAM05619 3.7 NaN GG AA AA NN AA GG ... GG TT AA AA GG CC AA GG AA GG
3 V000023 FAM13245 2.8 NaN GG AA AA GG AC AG ... GG TT AA AA NN NN AA GG AA GG
4 V000024 FAM13246 2.4 NaN GG AA AA GG AC GG ... GG TT AA AA NN NN AA GG AA GG
5 V000025 FAM13244 2.7 NaN GG TT GG AA AA AA ... AA AA AA AA AA GG AA AA GG AA
6 V000032 FAM02899 0.6 NaN GG AA AA NN AA AA ... NN AA GG AA AA GG AA AA AA GG
7 V000034 FAM05620 4.4 NaN GG AA AA GG NN AA ... GG TT AA AA GG CC AA GG AA GG
8 V000036 FAM05621 4.1 NaN NN NN AG NN AA GG ... NN TT NN NN NN NN AA NN AA GG
9 V000039 FAM05625 3.4 NaN NN NN AG NN AA AA ... NN TT NN NN NN NN AA NN AA GG
10 V000051 FAM13243 3.0 NaN NN NN AA NN AC AG ... NN TT NN NN NN NN AA NN AA GG
11 V000062 FAM05624 3.5 NaN GG TT NN AA AA AA ... GG TT AA AA NN NN AA GG AA GG
12 V000067 FAM09634 1.7 NaN GG AA AA AA AA AA ... GG TT AA AA NN NN AA GG AA GG
13 V000070 FAM09629 1.2 NaN GG AA GG NN AA AA ... GG TT AA AT GG CG AA GG AA GG
14 V000071 FAM09640 0.8 NaN GG AA AA GG NN AA ... GG TT AA AA AA GG CC GG AA GG
15 V000075 FAM09630 2.6 NaN GG TT GG AA AA GG ... AA AA GG AA GG GG CC GG AA GG
16 V000077 FAM09631 2.2 NaN GG TT GG NN AA GG ... GG TT AA AA AG GG CC GG AA GG
17 V000078 FAM00498 1.3 NaN GG NN AA NN AC NN ... GG TT AA AA AA GG CC GG AA GG
18 V000079 FAM13255 2.5 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
19 V000080 FAM13255 2.5 NaN NN NN AA NN AA AA ... NN TT NN NN NN NN AA NN AA GG
20 V000081 FAM13252 2.9 NaN GG AA AA NN CC AA ... NN TT AA NN AA NN AA NN GG AA
21 V000082 FAM13251 3.1 NaN NN NN AA NN AC AA ... NN AA NN NN NN NN AA NN AA GG
22 V000092 FAM13250 3.3 NaN GG AA AA AA AA GG ... AA AA AA AA AA GG AA AA GG AA
23 V000096 FAM05627 4.8 NaN GG AA AA NN CC GG ... GG TT AA AA GG CC AA GG AA GG
24 V000098 FAM05628 3.8 NaN GG AA AA GG NN AA ... GG TT AA AA GG CC AA GG AA GG
25 V000110 FAM09641 0.9 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
26 V000115 FAM00499 1.3 NaN GG AA AA AA AA AA ... AG AT AG AA AG CG AA AG AG AG
27 V000122 FAM13255 2.1 NaN NN NN AA NN AA AA ... NN TT NN NN NN NN AA NN AA GG
28 V000123 FAM13249 1.9 NaN GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
29 V000124 FAM13254 2.5 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG GG AA
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1497 V156772 FAM06485 3.0 2013.0 GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
1498 V156773 FAM11197 3.1 NaN GG TT GG AA AA GG ... AA AA GG AA AA GG CC AA NN AA
1499 V156774 FAM14774 3.5 2013.0 GG AA AA AA AA GG ... GG TT AA AA GG CC AA GG AA GG
1500 V156776 FAM06480 2.6 NaN AG AT NN AG NN AA ... AA AA GG AA GG GG CC GG GG GG
1501 V156777 FAM06486 3.2 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
1502 V156779 FAM06485 3.1 NaN GG AA AA AA NN AA ... GG TT AA AA NN NN AA GG AA GG
1503 V156782 FAM06489 3.4 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
1504 V156783 FAM11183 2.4 2013.0 GG TT GG AA AA GG ... GG TT AA AA AA GG AA GG AA GG
1505 V156784 FAM07969 2.4 NaN GG AT NN AG AC AG ... GG TT AA AA GG CC AA GG AA GG
1506 V156786 FAM11169 2.1 2013.0 GG NN AA AA AA AG ... GG TT AA AA AG CG AC GG AA GG
1507 V156789 FAM12500 2.8 NaN GG AA AA GG NN AA ... GG TT AA AA NN NN CC GG AA GG
1508 V156790 FAM07967 1.9 NaN GG AA AA AA NN AA ... GG TT AA AA NN NN AA GG AG AG
1509 V156791 FAM11185 2.2 NaN GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1510 V156792 FAM11185 2.0 NaN GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1511 V156793 FAM11185 2.8 NaN GG AA AA GG NN AA ... GG TT AA AA NN NN AA GG AA GG
1512 V156795 FAM11188 2.0 NaN GG AA AA GG CC GG ... GG TT AA AA GG CC AA GG AA GG
1513 V156797 FAM06502 3.2 2013.0 GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1514 V156798 FAM11170 3.1 NaN GG AA AA GG CC AA ... GG AT NN AA AG CG AC GG AA GG
1515 V156801 FAM06483 3.1 NaN GG AA AA AA AA AA ... AG AT AA AA GG CC AA GG GG GG
1516 V156802 FAM11170 3.1 NaN GG TT GG AA AA GG ... GG TT AA AA NN NN AC GG AA GG
1517 V156803 FAM06465 3.6 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
1518 V156804 FAM11189 2.8 NaN GG AA AA GG CC GG ... GG TT AA AA GG CC AA GG AA GG
1519 V156806 FAM11189 2.6 2013.0 GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1520 V156807 FAM11189 2.7 2013.0 GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1521 V156822 FAM13790 3.0 NaN GG AA AA AA AA GG ... GG TT AA AA GG CC AA GG AA GG
1522 V156824 FAM06553 2.7 NaN GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1523 V156826 FAM06553 2.9 NaN GG AA AA GG CC AA ... GG TT AA AA GG CC AA GG AA GG
1524 V156827 FAM06553 3.1 NaN GG AA AA AA AA AA ... GG TT AA AA GG CC AA GG AA GG
1525 V156829 FAM13825 3.1 NaN GG TT GG AA AA GG ... AG AT AG AA AA GG CC GG AA GG
1526 V156831 FAM16847 1.4 NaN NN NN AA NN AC NN ... NN AA NN NN NN NN CC NN AG GG

1527 rows × 2167 columns


In [26]:
# statistic by LOCATION
loc_yield = soy[['LOCATION', 'YIELD']]
loc_group = loc_yield.groupby('LOCATION')
loc_group['YIELD'].agg([np.mean, np.sum, np.std]).sort_values('mean', ascending=False)


Out[26]:
mean sum std
LOCATION
2201 78.301739 3993.388710 5.145056
4504 75.448345 4677.797360 3.731417
5220 75.292989 5571.681210 8.385741
4503 72.160118 4473.927310 9.821378
3271 72.096298 10381.866970 6.366609
3501 70.014830 40188.512300 10.616583
2212 69.998242 12669.681880 5.389066
2320 69.697813 270427.514240 9.831696
2200 67.798544 234650.761439 9.016736
2460 67.640409 150973.392250 9.611597
3410 67.570324 209400.432890 11.996355
3135 67.313502 1615.524040 3.505275
2360 66.911135 10638.870430 9.357403
3315 66.788559 14292.751686 4.206028
6530 66.506254 7182.675420 6.355477
3170 66.135319 2182.465540 4.632271
3140 66.015363 193491.027653 10.179465
3491 65.941106 13715.750090 6.056859
4370 65.772947 38016.763550 10.065248
3270 65.630892 220191.642277 9.217957
2170 65.322040 235877.888230 8.817091
3340 64.935833 333120.825640 8.803889
2450 64.353384 176843.099240 8.267672
3330 64.083623 158606.967660 7.170925
2403 63.992507 24253.160090 7.112705
2230 63.602873 241690.918941 7.632674
1301 63.596777 13736.903780 5.032112
3130 62.641049 203896.613420 8.222703
4360 62.054615 63481.870760 9.888912
3120 62.010620 472210.872451 12.323610
... ... ... ...
3503 49.270781 1182.498750 3.477881
2309 49.101961 33143.823500 13.774628
2291 48.955655 10574.421546 5.879348
5340 48.659194 17225.354630 6.974296
3490 48.566143 472888.536170 9.077163
2308 48.453637 42590.746550 6.953115
130 48.452517 5523.586950 12.503065
2111 48.009828 1728.353796 6.291308
2440 47.960011 272604.705233 10.298750
1210 47.523247 15730.194650 10.804707
3311 47.051486 10163.121057 6.025748
4310 46.650821 117979.926637 9.976808
2401 46.069286 17229.912800 5.454570
4320 46.025227 66092.225633 17.233349
1270 45.272706 40609.617150 9.770910
1290 44.733531 1610.407121 3.448338
20 44.573981 1738.385270 6.328853
1102 44.198336 3138.081870 6.106025
2290 44.088851 456143.250798 8.863546
4312 43.875213 4738.522980 6.599347
107 42.676117 2816.623740 5.296383
119 42.380966 3051.429570 7.094656
110 42.016535 6092.397590 7.861024
140 41.096731 4438.447000 5.214731
2110 39.762752 63580.640033 11.413527
10 37.837604 9989.127390 7.823764
30 36.172968 2676.799650 3.980278
4260 35.422900 56853.755080 13.789012
101 34.584116 2490.056360 4.616979
150 27.593854 6953.651096 6.310835

119 rows × 3 columns


In [27]:
# statistic by VARIETY
var_yield = soy[['VARIETY', 'YIELD']]
var_group = var_yield.groupby('VARIETY')
var_group['YIELD'].agg([np.mean, np.size, np.std]).sort_values('mean', ascending=False)


Out[27]:
mean size std
VARIETY
V013529 83.877361 2.0 2.447611
V155828 83.299809 2.0 12.797438
V013501 81.870800 2.0 0.814889
V013012 81.176295 2.0 7.725253
V001109 79.982753 2.0 4.506178
V030243 79.862305 2.0 0.790495
V013995 78.749804 2.0 6.603335
V013528 78.664004 2.0 4.974819
V155893 78.582516 2.0 4.522897
V000135 78.514449 4.0 3.766403
V013063 78.049798 2.0 4.655697
V013064 77.859015 2.0 3.339468
V013525 77.822509 2.0 1.949320
V000928 77.663103 2.0 1.432207
V013065 77.643623 2.0 3.168183
V155913 77.069937 2.0 12.637719
V013019 76.730603 2.0 2.769045
V155924 76.393277 2.0 3.369750
V000838 76.091564 2.0 3.902635
V013405 76.076619 2.0 2.917092
V030250 75.951934 2.0 3.158615
V025778 75.889577 3.0 5.764761
V013378 75.834684 2.0 4.162243
V013018 75.763903 2.0 4.748542
V155944 75.618994 2.0 0.944640
V013016 75.589180 2.0 0.386625
V013215 75.434904 2.0 2.209138
V030233 75.394234 2.0 5.528312
V014270 75.304642 2.0 5.977395
V001010 75.214084 2.0 0.332054
... ... ... ...
V064022 36.837006 9.0 17.839807
V054905 36.747257 3.0 10.687844
V055313 36.719525 4.0 9.705256
V030565 36.629226 4.0 2.832648
V083010 36.614779 10.0 14.456018
V055314 36.363835 4.0 6.092611
V045448 36.315656 10.0 10.974832
V000964 36.301602 2.0 2.932970
V030535 36.189482 2.0 1.579307
V062293 35.733125 2.0 3.332106
V062295 35.707320 2.0 4.689872
V062304 35.554455 2.0 13.319070
V069310 35.551013 3.0 20.760367
V062268 35.284715 2.0 8.545605
V068794 35.067658 10.0 6.093481
V062286 34.985010 2.0 18.830777
V083330 34.676037 6.0 8.489889
V069282 34.548412 10.0 16.209401
V062297 34.545750 2.0 1.626204
V054959 34.439923 3.0 16.850776
V069318 34.155809 9.0 11.366702
V064000 34.004927 9.0 13.311731
V061199 33.843841 7.0 10.553614
V062305 32.688335 2.0 0.260138
V020399 31.833382 2.0 5.528312
V045678 31.775185 8.0 7.445084
V055380 31.708212 4.0 12.221417
V062274 31.423265 2.0 14.741076
V026494 29.611060 2.0 8.433521
V055335 28.593227 3.0 23.054366

13962 rows × 3 columns


In [28]:
# statistic by FAMILY
fam_yield = soy[['FAMILY', 'YIELD']]
fam_group = fam_yield.groupby('FAMILY')
fam_group['YIELD'].agg([np.mean, np.size, np.std]).sort_values('mean', ascending=False)


Out[28]:
mean size std
FAMILY
FAM13433 76.549003 4.0 1.991346
FAM13511 76.393277 2.0 3.369750
FAM13542 75.304642 2.0 5.977395
FAM06633 75.211288 5.0 10.771322
FAM13533 75.140068 8.0 5.599808
FAM13363 74.214966 2.0 12.169710
FAM13447 73.350795 4.0 1.607920
FAM13260 73.272318 8.0 7.129478
FAM16817 72.562832 2.0 7.735137
FAM13604 72.138541 6.0 7.706760
FAM13565 71.415981 2.0 3.891910
FAM06564 70.763013 6.0 11.983282
FAM13378 70.227494 2.0 2.351191
FAM00604 69.924784 52.0 11.486684
FAM15620 69.788977 2.0 0.253720
FAM06591 69.651164 21.0 8.818803
FAM13519 69.594127 14.0 10.797108
FAM13531 69.404067 4.0 7.245544
FAM13383 69.261537 2.0 7.517062
FAM15662 69.250385 2.0 1.579202
FAM06587 69.148943 13.0 5.800448
FAM13516 69.011748 22.0 4.438088
FAM15408 68.843690 2.0 3.759109
FAM01224 68.698299 8.0 14.634540
FAM15579 68.626127 52.0 12.381804
FAM13517 68.508296 16.0 5.847764
FAM15625 68.257848 4.0 2.957638
FAM05491 68.179520 52.0 10.289500
FAM13518 67.937324 8.0 2.982530
FAM01258 67.935132 10.0 13.403011
... ... ... ...
FAM14541 42.466886 7.0 8.680450
FAM14682 42.329873 4.0 1.860457
FAM11160 42.310940 45.0 9.254989
FAM03574 42.297385 4.0 9.420989
FAM01424 42.108676 23.0 14.814461
FAM07915 42.081901 20.0 9.525611
FAM05298 42.025284 10.0 14.590236
FAM14724 41.997286 9.0 11.520502
FAM15671 41.886338 2.0 0.790495
FAM12520 41.720351 8.0 2.676230
FAM09640 41.685828 38.0 9.383129
FAM06448 41.663294 21.0 13.839497
FAM14735 41.568481 16.0 8.449816
FAM09955 41.432761 32.0 9.751457
FAM12774 41.366368 9.0 5.580799
FAM14740 41.077079 14.0 14.250639
FAM07971 40.844196 15.0 8.667220
FAM10405 40.654121 7.0 9.790113
FAM03584 40.370693 21.0 10.519378
FAM06631 40.242110 9.0 21.813354
FAM10402 39.989984 7.0 9.075769
FAM14730 39.936545 4.0 6.864171
FAM13861 39.728362 8.0 12.150156
FAM03772 39.191015 8.0 8.476899
FAM14746 38.958430 2.0 4.614268
FAM08150 38.310615 6.0 10.873464
FAM03587 37.306220 4.0 18.989511
FAM14395 34.004927 9.0 13.311731
FAM01421 33.843841 7.0 10.553614
FAM12785 28.593227 3.0 23.054366

1691 rows × 3 columns


In [29]:
# statistic by both FAMILY and VARIETY
fam_var_yield = soy[['FAMILY', 'VARIETY', 'YIELD']]
fam_var_group = fam_var_yield.groupby(['FAMILY', 'VARIETY'])
fam_var_group.describe().unstack().to_csv('fam_var.csv')

In [31]:
# general geo statistic helper methods
def geo_statis(keys, year=2014):
    geo_key = geo[['LOCATION', 'AREA'] + keys]
    year_yield = soy[soy['YEAR'] == year][['LOCATION', 'YIELD']]
    year_yield = year_yield.groupby('LOCATION', as_index=False).agg(np.sum)
    key_yield = pd.merge(geo_key, year_yield, on='LOCATION')
    # get YIELD per AREA
    key_yield['AVE_YIELD'] = key_yield.apply(lambda row: (row['YIELD'] / row['AREA']), axis=1)
    return key_yield

In [39]:
# plot corresponding feature with annual average yield in each geo location
geo_yield = geo_statis(['PH', 'CEC', 'ORGANIC_MATTER', 'TEMP_14', 'PREC_14', 'RAD_14'])
geo_yield.plot(x='PH', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='CEC', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='ORGANIC_MATTER', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='TEMP_14', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='PREC_14', y='AVE_YIELD', kind='scatter')
geo_yield.plot(x='RAD_14', y='AVE_YIELD', kind='scatter')
plt.show()



In [31]:
# Lower bound for Training MSE
feature_group = ['LOCATION', 'YEAR', 'VARIETY']
group_mean = soy.groupby(feature_group)['YIELD'].mean()

test_feature = feature_group + ['YIELD']
test_x = soy[test_feature].as_matrix()

def square_error(item):
    actual = item[-1]
    pred = group_mean.get(tuple(item[:-1]))
    return (actual - pred) * (actual - pred)

se = [square_error(row) for row in test_x]
se
np.mean(se)


Out[31]:
16.073080825619961

In [32]:
group_mean


Out[32]:
LOCATION  YEAR  VARIETY
10        2011  V000070    28.990160
                V000071    20.546365
                V000110    26.259500
                V055643    41.916520
                V055644    41.333920
                V055645    40.463280
                V055646    39.108020
                V055647    36.687930
                V055648    36.301600
                V055649    41.238010
                V055650    41.432210
                V055651    42.883370
                V055652    42.302850
                V055653    39.398290
                V055655    33.493100
                V055657    39.496430
                V055659    40.851680
                V055660    43.077580
                V055661    33.397040
                V055662    47.627510
                V055663    42.012580
                V055664    39.108020
                V055665    39.688550
                V055666    38.237380
                V055667    44.626880
                V055668    37.366590
                V055669    50.047600
                V055671    30.590770
                V055672    45.013210
                V055674    39.398290
                             ...    
6550      2011  V103161    50.469220
                V103313    54.008087
                V103318    58.705130
                V112159    63.751423
                V112167    59.405957
                V120047    61.403300
                V120205    62.280283
                V120304    53.272467
                V120395    65.854640
                V120396    63.997127
                V136828    59.125460
                V136833    51.239583
                V136874    57.162913
                V137061    63.296303
                V137068    66.906247
                V137079    55.761213
                V137097    57.092527
                V137105    61.543920
                V137136    61.368503
                V137154    53.308057
                V137222    58.810213
                V137226    66.730783
                V151692    64.171750
                V151854    54.463950
                V151855    63.822503
                V151858    56.987497
                V151867    58.950043
                V155250    61.719287
                V155251    57.933973
                V155918    60.317637
Name: YIELD, dtype: float64

In [ ]: