notebook.community

Edit and run



In [7]:

    
from pyspark.sql.functions import year, udf
import matplotlib.pyplot as plt
%matplotlib inline



In [1]:

    
dir()









    Out[1]:





['In',
 'Out',
 'SQLContext',
 'SparkContext',
 'SparkSession',
 'StorageLevel',
 '_',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 '_sh',
 'atexit',
 'exit',
 'get_ipython',
 'os',
 'platform',
 'py4j',
 'pyspark',
 'quit']

http://www.gbif.org/analytics/global



In [1]:

    
df = sqlContext.read.load("/guoda/data/idigbio-20190612T171757.parquet")



In [3]:

    
df.count()









    Out[3]:





104661524



In [4]:

    
# Number of records per collection year, sorted by year and cached because
# several of the following cells read it.
year_summary = (
    df
    .groupBy(year("datecollected").cast("integer").alias("yearcollected"))
    .count()
    .orderBy("yearcollected")
    .persist()
)



In [5]:

    
year_summary.count()









    Out[5]:





1390



In [6]:

    
year_summary.printSchema()









    



root
 |-- yearcollected: integer (nullable = true)
 |-- count: long (nullable = false)



In [7]:

    
year_summary.describe().show()









    



+-------+------------------+-----------------+
|summary|     yearcollected|            count|
+-------+------------------+-----------------+
|  count|              1389|             1390|
|   mean|  2316.51403887689|75296.06043165468|
| stddev|2122.4294635611996| 752125.436430329|
|    min|               100|                1|
|    max|              9999|         27146522|
+-------+------------------+-----------------+



In [8]:

    
year_summary.head(10)









    Out[8]:





[Row(yearcollected=None, count=27146522),
 Row(yearcollected=100, count=3),
 Row(yearcollected=102, count=1),
 Row(yearcollected=103, count=2),
 Row(yearcollected=104, count=2),
 Row(yearcollected=105, count=5),
 Row(yearcollected=106, count=9),
 Row(yearcollected=107, count=3),
 Row(yearcollected=108, count=3),
 Row(yearcollected=109, count=4)]



In [9]:

    
year_summary.orderBy("yearcollected", ascending=False).head(10)









    Out[9]:





[Row(yearcollected=9999, count=9187),
 Row(yearcollected=9998, count=1),
 Row(yearcollected=9983, count=1),
 Row(yearcollected=9972, count=4),
 Row(yearcollected=9960, count=1),
 Row(yearcollected=9948, count=2),
 Row(yearcollected=9863, count=1),
 Row(yearcollected=9855, count=1),
 Row(yearcollected=9840, count=2),
 Row(yearcollected=9832, count=3)]



In [10]:

    
# Keep only collection years 1817-2017 (both ends inclusive) and bring the
# small per-year summary to the driver as a pandas DataFrame.
pandas_year_summary = (
    year_summary
    .filter((year_summary.yearcollected >= 1817) & (year_summary.yearcollected <= 2017))
    .orderBy("yearcollected")
    .toPandas()
)



In [11]:

    
pandas_year_summary.head()









    Out[11]:






  
    
      
      yearcollected
      count
    
  
  
    
      0
      1817
      3447
    
    
      1
      1818
      11556
    
    
      2
      1819
      3748
    
    
      3
      1820
      4009
    
    
      4
      1821
      4754



In [12]:

    
plt.bar(pandas_year_summary["yearcollected"], pandas_year_summary["count"])









    Out[12]:





<Container object of 201 artists>



In [13]:

    
# Record counts per (collection year, raw continent value) pair, ordered by
# year. The continent column is used exactly as stored, with no clean-up.
# The result is persisted so later cells can reuse it.
yc_sum = (df
          .groupBy(year("datecollected").cast("integer").alias("yearcollected"),
                   "continent")
          .count()
          .orderBy("yearcollected")
          .persist()
          )



In [14]:

    
yc_sum.head(10)









    Out[14]:





[Row(yearcollected=None, continent='américa do sul', count=96917),
 Row(yearcollected=None, continent='leptocarpa', count=2),
 Row(yearcollected=None, continent='sowev', count=1),
 Row(yearcollected=None, continent='denticulata', count=1),
 Row(yearcollected=None, continent='mediterranean sea', count=110),
 Row(yearcollected=None, continent='australasia?', count=29),
 Row(yearcollected=None, continent='north atlantic, atlantic ocean', count=6),
 Row(yearcollected=None, continent='north america, central america', count=1),
 Row(yearcollected=None, continent='copy of ds-soolsource_workbook_2012-2-7_15:52', count=3),
 Row(yearcollected=None, continent='antarctic region', count=1)]

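We're going to have to fix some data! The `continent` column is free text: the rows above include names in other languages, what look like taxon names, sea names and even a spreadsheet file name. It therefore has to be normalised to a small set of continent names before grouping.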



In [11]:

    
# Continent names that are kept as-is; every other value is bucketed as "other".
continents = {"africa", "asia", "australia", "europe", "north america", "south america"}


def fix_continent(c):
    """Return ``c`` unchanged when it is exactly one of ``continents``, else "other".

    The comparison is an exact, case-sensitive set membership test, so
    ``None`` and any differently spelled or capitalised value map to "other".
    """
    return c if c in continents else "other"


print(fix_continent("europe"))
print(fix_continent("oceana"))









    



europe
other



In [13]:

    
fix_continent_udf = udf(fix_continent)



In [10]:

    
# Year/continent counts again, this time grouping on the continent value
# normalised by fix_continent_udf (added as the "fixed" column).
# fix_continent_udf must already be defined when this cell runs; the error
# saved under this cell comes from a run where it was not.
# NOTE(review): this rebinds yc_sum, and the frame previously bound to that
# name was persisted and is never unpersisted. Presumably acceptable for an
# interactive session; confirm.
yc_sum = (df
          .withColumn("fixed", fix_continent_udf(df.continent))
          .groupBy(year("datecollected").cast("integer").alias("yearcollected"),
                   "fixed")
          .count()
          .orderBy("yearcollected")
          .persist()
          )









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-5c765bf16c2c> in <module>()
----> 1 fix_continent_udf = udf(fix_continent)
      2 yc_sum = (df
      3           .withColumn("fixed", fix_continent_udf(df.continent))
      4           .groupBy(year("datecollected").cast("integer").alias("yearcollected"),
      5                    "fixed")

NameError: name 'fix_continent' is not defined



In [17]:

    
yc_sum.head(10)









    Out[17]:





[Row(yearcollected=None, fixed='north america', count=6661147),
 Row(yearcollected=None, fixed='europe', count=2312443),
 Row(yearcollected=None, fixed='asia', count=1451924),
 Row(yearcollected=None, fixed='australia', count=1386899),
 Row(yearcollected=None, fixed='south america', count=1401163),
 Row(yearcollected=None, fixed='other', count=13130500),
 Row(yearcollected=None, fixed='africa', count=802446),
 Row(yearcollected=100, fixed='other', count=1),
 Row(yearcollected=100, fixed='south america', count=1),
 Row(yearcollected=100, fixed='north america', count=1)]



In [18]:

    
yc_sum.groupBy(yc_sum.fixed).count().show()









    



+-------------+-----+
|        fixed|count|
+-------------+-----+
|    australia|  522|
|north america|  994|
|       europe|  546|
|south america|  513|
|        other|  779|
|       africa|  412|
|         asia|  413|
+-------------+-----+



In [19]:

    
# Pivot to one row per collection year with one count column per normalised
# continent. The key column comes back named "yearcollected_fixed", and the
# preview below shows its values as strings (e.g. '892').
yc_cross = (df
            .select(year("datecollected").cast("integer").alias("yearcollected"),
                    df.continent)
            .withColumn("fixed", fix_continent_udf(df.continent))
            .crosstab("yearcollected", "fixed")
            )



In [20]:

    
yc_cross.head(5)









    Out[20]:





[Row(yearcollected_fixed='892', africa=0, asia=0, australia=0, europe=0, north america=2, other=0, south america=0),
 Row(yearcollected_fixed='2199', africa=0, asia=0, australia=0, europe=0, north america=1, other=1, south america=0),
 Row(yearcollected_fixed='1665', africa=2, asia=0, australia=0, europe=1, north america=2, other=2, south america=0),
 Row(yearcollected_fixed='1036', africa=0, asia=2, australia=0, europe=0, north america=15, other=0, south america=0),
 Row(yearcollected_fixed='9131', africa=0, asia=0, australia=0, europe=0, north america=1, other=0, south america=0)]



In [21]:

    
# crosstab() hands the year key back as a string column (the preview of
# yc_cross shows yearcollected_fixed='892'). Cast it to an integer first so
# that:
#   * the 1817-2017 range test and the sort are numeric by construction
#     instead of relying on implicit string/number coercion and lexical order,
#   * keys that are not numbers become NULL and are dropped by the range test,
#   * pandas receives a numeric year column that plots on a continuous axis.
pandas_yc_cross = (yc_cross
                   .withColumn("yearcollected_fixed",
                               yc_cross.yearcollected_fixed.cast("integer"))
                   .filter("yearcollected_fixed BETWEEN 1817 AND 2017")
                   .orderBy("yearcollected_fixed")
                   .toPandas()
                   )



In [22]:

    
pandas_yc_cross.head(200)









    Out[22]:






  
    
      
      yearcollected_fixed
      africa
      asia
      australia
      europe
      north america
      other
      south america
    
  
  
    
      0
      1817
      76
      139
      385
      534
      471
      390
      1452
    
    
      1
      1818
      43
      246
      477
      805
      5365
      4259
      361
    
    
      2
      1819
      76
      332
      487
      1066
      560
      914
      313
    
    
      3
      1820
      50
      155
      217
      1741
      641
      997
      208
    
    
      4
      1821
      79
      715
      115
      1853
      464
      876
      652
    
    
      5
      1822
      343
      451
      547
      1241
      3362
      1196
      325
    
    
      6
      1823
      66
      82
      1441
      866
      371
      1032
      247
    
    
      7
      1824
      319
      141
      124
      1553
      516
      1181
      818
    
    
      8
      1825
      338
      178
      311
      1246
      952
      1628
      300
    
    
      9
      1826
      339
      120
      87
      1584
      889
      2143
      1250
    
    
      10
      1827
      293
      187
      166
      2196
      752
      3063
      1249
    
    
      11
      1828
      258
      236
      173
      1950
      2040
      3001
      954
    
    
      12
      1829
      555
      238
      227
      1807
      1044
      3407
      718
    
    
      13
      1830
      220
      225
      76
      1712
      523
      3964
      763
    
    
      14
      1831
      49
      218
      102
      1615
      945
      2642
      1317
    
    
      15
      1832
      209
      224
      196
      1723
      1500
      4771
      886
    
    
      16
      1833
      139
      261
      225
      2579
      2045
      4977
      954
    
    
      17
      1834
      374
      294
      241
      1824
      1694
      2821
      881
    
    
      18
      1835
      677
      570
      267
      1744
      1405
      3377
      973
    
    
      19
      1836
      178
      901
      497
      3318
      881
      3752
      2488
    
    
      20
      1837
      2048
      576
      595
      3238
      1464
      5717
      1138
    
    
      21
      1838
      1943
      671
      2013
      3125
      2981
      9108
      4119
    
    
      22
      1839
      1828
      238
      3187
      3149
      2278
      6150
      1841
    
    
      23
      1840
      1760
      388
      2724
      5650
      2979
      7144
      1894
    
    
      24
      1841
      1315
      1655
      906
      4406
      3521
      5255
      1931
    
    
      25
      1842
      538
      2882
      1174
      4458
      3814
      7077
      2995
    
    
      26
      1843
      288
      1361
      2793
      3816
      2640
      7017
      1386
    
    
      27
      1844
      320
      876
      1373
      4322
      2419
      6085
      1315
    
    
      28
      1845
      237
      1037
      789
      4818
      3275
      4734
      2008
    
    
      29
      1846
      417
      1005
      882
      5464
      4362
      3782
      1781
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      170
      1987
      34874
      37605
      130764
      46302
      397502
      118522
      129859
    
    
      171
      1988
      44456
      31930
      141211
      48189
      383202
      104349
      123776
    
    
      172
      1989
      38428
      36151
      134280
      45063
      381650
      99985
      127818
    
    
      173
      1990
      42041
      36005
      136161
      56755
      391754
      99632
      109858
    
    
      174
      1991
      29538
      39925
      131091
      62951
      405979
      109148
      113523
    
    
      175
      1992
      30309
      37354
      151296
      67376
      408740
      113462
      114611
    
    
      176
      1993
      37388
      40643
      159553
      77994
      421136
      132286
      106249
    
    
      177
      1994
      39959
      32979
      145041
      69078
      451914
      128029
      115625
    
    
      178
      1995
      52436
      49346
      172370
      71207
      511523
      138048
      112657
    
    
      179
      1996
      38526
      41148
      137164
      68625
      444017
      127164
      113196
    
    
      180
      1997
      28611
      40305
      138453
      76381
      433645
      149017
      124524
    
    
      181
      1998
      33057
      46807
      133546
      71646
      439901
      123515
      104031
    
    
      182
      1999
      38141
      42899
      140347
      63971
      407015
      135641
      107499
    
    
      183
      2000
      51413
      40252
      114264
      72520
      426556
      164377
      118871
    
    
      184
      2001
      132746
      31912
      109268
      60116
      410070
      145436
      107078
    
    
      185
      2002
      85929
      30626
      96699
      65991
      440842
      152949
      74342
    
    
      186
      2003
      68127
      30017
      101329
      92905
      440627
      173595
      108202
    
    
      187
      2004
      45539
      53708
      119930
      100434
      452017
      148997
      81527
    
    
      188
      2005
      54908
      39664
      107732
      101349
      422926
      199265
      71840
    
    
      189
      2006
      37666
      45890
      83399
      63043
      411666
      210184
      57729
    
    
      190
      2007
      51341
      35545
      83050
      59972
      378394
      188704
      68636
    
    
      191
      2008
      41523
      35243
      84874
      59476
      410031
      184670
      48399
    
    
      192
      2009
      41027
      21076
      73359
      48657
      404088
      189340
      60834
    
    
      193
      2010
      33274
      21877
      69704
      48589
      454417
      181268
      54929
    
    
      194
      2011
      33872
      21994
      52976
      43900
      368527
      157945
      48408
    
    
      195
      2012
      52683
      22216
      43513
      34557
      537893
      183608
      46927
    
    
      196
      2013
      26231
      22368
      39975
      39156
      512247
      119463
      33712
    
    
      197
      2014
      16198
      19373
      28999
      24021
      403086
      106155
      28562
    
    
      198
      2015
      12080
      15112
      21054
      14454
      135191
      69078
      7593
    
    
      199
      2016
      3571
      10894
      6803
      9361
      88688
      36387
      5159
    
  

200 rows × 8 columns



In [23]:

    
# Quick single-series check before building the stacked chart.
# The crosstab key column holds the years as strings, so convert to integers
# to place the bars on a numeric year axis rather than a categorical one.
p1 = plt.bar(pandas_yc_cross['yearcollected_fixed'].astype(int),
             pandas_yc_cross['asia'], color='#d62728', edgecolor='none')



In [27]:

    
import pandas as pd
import numpy as np

# Stacked bar chart: iDigBio specimen records per collection year, one
# stacked series per normalised continent.

# crosstab() returns the year key as strings; convert to integers so the bars
# share a numeric axis with the axvline markers and x-limits set below.
years = pandas_yc_cross['yearcollected_fixed'].astype(int)

# Stack "other" first, then the remaining series alphabetically. The series
# names are read from the crosstab itself rather than from the global
# `continents` set: that name is rebound with different members by a later
# cell, and a data subset may lack a continent entirely, either of which used
# to break this loop with a KeyError.
continents_list = sorted(
    (name for name in pandas_yc_cross.columns if name != 'yearcollected_fixed'),
    key=lambda name: (name != "other", name),
)

# blue -> orange, one colour per stacked series
colors = ["#a75902", "#00e5c8", "#1bcda7", "#37b686", "#539f65", "#51827c", "#bf6603"]

fig, axes = plt.subplots()

plots = []
bottoms = np.zeros(len(years))  # running top of the stack for every year
for position, c in enumerate(continents_list):
    heights = pandas_yc_cross[c].values
    plots.append(
        axes.bar(years, heights,
                 # wrap around instead of raising if there are ever more series than colours
                 color=colors[position % len(colors)], edgecolor='none',
                 width=1.0, bottom=bottoms)
    )
    # build a new array each pass so the baseline handed to bars that are
    # already drawn is never modified afterwards
    bottoms = bottoms + heights

# Start of WWI
axes.axvline(x=1914)

# Start of WWII
axes.axvline(x=1939)

# "1988 - October 31: President Reagan signs the NSF Authorization Act of 1988, thereby authorizing the doubling of the NSF budget over the next five years."
axes.axvline(x=1988)

axes.legend(plots, continents_list, loc=2)
axes.set_title("Specimens in iDigBio by Collection Year and Continent")
axes.set_ylabel("Number of Specimen Records")
axes.set_xlabel("Year")
axes.set_xlim([1815, 2020])
axes.set_ylim([0, 1200000])
fig.set_size_inches(12, 4)

GBIF! Now repeat the same year-by-continent analysis on the GBIF snapshot.



In [1]:

    
df = sqlContext.read.parquet("/guoda/data/gbif-idigbio.parquet/source=gbif/date=20160825")



In [17]:

    
# Register the GBIF frame as the temp view "df" so it can be queried with SQL.
# The source columns are named by full term URIs, so they are backtick-quoted
# and aliased to short names. Only eventDate and continent are kept, and only
# for rows whose basisOfRecord contains "SPECIMEN". The result is persisted
# because several later cells read it.
df.createOrReplaceTempView("df")
renamed_cols = sqlContext.sql("""
SELECT `http://rs.tdwg.org/dwc/terms/eventDate` as eventDate,
       `http://rs.tdwg.org/dwc/terms/continent` as continent
FROM df
WHERE `http://rs.tdwg.org/dwc/terms/basisOfRecord` LIKE "%SPECIMEN%"
""").persist()



In [18]:

    
renamed_cols.count()









    Out[18]:





63184115



In [37]:

    
renamed_cols.groupBy("continent").count().orderBy("count", ascending=False).head(100)









    Out[37]:





[Row(continent='', count=39879474),
 Row(continent='NORTH_AMERICA', count=10329814),
 Row(continent='SOUTH_AMERICA', count=4873126),
 Row(continent='EUROPE', count=4107357),
 Row(continent='AFRICA', count=2147077),
 Row(continent='ASIA', count=1088336),
 Row(continent='OCEANIA', count=578096),
 Row(continent=None, count=129861),
 Row(continent='ANTARCTICA', count=26681),
 Row(continent='EASTERN NORTH PACIFIC | USA | ALASKA |  |  |  |  |', count=1475),
 Row(continent='NULL', count=1410),
 Row(continent='Eastern Pacific | USA | California | - |  |  |  |', count=719),
 Row(continent='North America, United States, Alaska', count=676),
 Row(continent='North Atlantic Ocean, United States', count=540),
 Row(continent='North Atlantic Ocean, United States, Massachusetts', count=495),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States, Florida', count=449),
 Row(continent='|  |  |  |  | Gulf of Mexico |  |', count=272),
 Row(continent='Antarctic Ocean, Antarctica', count=195),
 Row(continent='North Atlantic Ocean', count=190),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States, Louisiana', count=183),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States, Texas', count=183),
 Row(continent='Atlantic, Bermuda, Bermuda', count=177),
 Row(continent='North America | Canada | Alberta', count=131),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States', count=129),
 Row(continent='North Atlantic Ocean, Caribbean Sea, Panama, Colon', count=127),
 Row(continent='Eastern Pacific | Mexico | Baja California | - |  |  |  |', count=120),
 Row(continent='Oceania, Palau, ,', count=104),
 Row(continent='South Pacific Ocean', count=99),
 Row(continent='North Atlantic Ocean, Caribbean Sea, Belize', count=93),
 Row(continent='EASTERN NORTH PACIFIC | USA | OREGON |  |  |  |  |', count=93),
 Row(continent='Северный Ледовитый океан', count=87),
 Row(continent='North Atlantic Ocean, United States, South Carolina', count=84),
 Row(continent='Eastern Pacific | USA | California | Los Angeles |  |  |  |', count=84),
 Row(continent='| Venezuela | Apure |  |  | Cinaruco River |  |', count=81),
 Row(continent='North Atlantic Ocean, United States, Florida', count=77),
 Row(continent='South Atlantic Ocean, Scotia Sea', count=76),
 Row(continent='Eastern Pacific | Mexico | - | - |  |  |  |', count=75),
 Row(continent='North Pacific Ocean, United States, Hawaii', count=74),
 Row(continent='North Pacific Ocean, Bering Sea, United States, Alaska', count=74),
 Row(continent='North America; United States; California', count=66),
 Row(continent='Canada; Ontario; Kenora District', count=66),
 Row(continent='North Atlantic Ocean, United States, North Carolina', count=66),
 Row(continent='South Atlantic Ocean', count=62),
 Row(continent='NW Atlantic; Bermuda', count=59),
 Row(continent='North America, Bermuda, ,', count=59),
 Row(continent='New Caledonia', count=58),
 Row(continent='North Pacific Ocean, United States, California', count=57),
 Row(continent='Antarctic | - | - | - |  |  |  |', count=56),
 Row(continent='Oceania, Micronesia, , Pohnpei', count=56),
 Row(continent='Indo-West Pacific; Micronesia; Pohnpei; Kapingamarangi Atoll', count=56),
 Row(continent='Antarctic Ocean, Ross Sea, Antarctica', count=56),
 Row(continent='EASTERN NORTH PACIFIC | USA | WASHINGTON |  |  |  |  |', count=52),
 Row(continent='Indo-West Pacific; Palau', count=50),
 Row(continent='Fiji; Great Astrolabe Reef', count=50),
 Row(continent='EASTERN NORTH PACIFIC | USA | CALIFORNIA |  |  |  |  |', count=49),
 Row(continent='North America, USA, , California State', count=48),
 Row(continent='Canada; Ontario; Thunder Bay District', count=48),
 Row(continent='North Atlantic Ocean, United States, Georgia', count=48),
 Row(continent='CALIFORNIA: San Francisco Estuary: San Francisco Bay: San Mateo County', count=46),
 Row(continent='South Atlantic Ocean, Argentina, Tierra del Fuego', count=45),
 Row(continent='Canada; Manitoba', count=45),
 Row(continent='Canada; Ontario; Nipissing District', count=45),
 Row(continent='Oceania, Micronesia, ,', count=44),
 Row(continent='NE Pacific; United States; California', count=44),
 Row(continent='Eastern Pacific | Costa Rica | - | - |  |  |  |', count=43),
 Row(continent='Chagos Archipelago;British Indian Ocean Territory; Peros Banhos Atoll', count=43),
 Row(continent='North America; Canada; Quebec; Anticosti Island', count=43),
 Row(continent='South Pacific; Fiji; Viti Levu Group; Viti Levu I.', count=42),
 Row(continent='North Atlantic Ocean, Bahamas', count=41),
 Row(continent='Antarctic Ocean', count=41),
 Row(continent='Canada; Newfoundland', count=40),
 Row(continent='Palau; Hatohobei State', count=38),
 Row(continent='Indo-West Pacific; Thailand', count=38),
 Row(continent='Palau; Koror', count=38),
 Row(continent='Indo-West Pacific; Micronesia; Ifalik Atoll', count=38),
 Row(continent='| USA |  |  |  | Gulf of Mexico |  |', count=37),
 Row(continent='North America, Mexico, ,', count=37),
 Row(continent='NE Pacific', count=36),
 Row(continent='Тихий океан', count=35),
 Row(continent='Eastern Pacific | USA | California | San Luis Obispo |  |  |  |', count=35),
 Row(continent='Central Pacific | USA | Hawaii | - |  |  |  |', count=35),
 Row(continent='Oceania, Fr Polynesia, ,', count=34),
 Row(continent='Guyana; Essequibo', count=34),
 Row(continent='South Pacific Ocean, New Zealand', count=34),
 Row(continent='South Pacific; French Polynesia; Tuamotu Archipelago; Raroia Atoll', count=33),
 Row(continent='Atlantic', count=33),
 Row(continent='Canada; British Columbia', count=33),
 Row(continent='Indo-West Pacific; Micronesia; Yap; Yap Is.; Yap I.', count=33),
 Row(continent='North Pacific Ocean', count=33),
 Row(continent='North America, United States, Pennsylvania', count=32),
 Row(continent='Vietnam; Khanh Hoa', count=32),
 Row(continent='Guyana; Region 6 (Kurupukari)', count=32),
 Row(continent='Canada; Northwest Territories', count=31),
 Row(continent='NE Pacific; Mexico; Sonora', count=31),
 Row(continent='North America; USA; Connecticut; Tolland County; Union', count=30),
 Row(continent='Pacific Ocean;North Subtropical Oceania;Hawaiian Islands;', count=30),
 Row(continent='Eastern Pacific | Costa Rica | Puntarenas | - |  |  |  |', count=30),
 Row(continent='CALIFORNIA: San Francisco Estuary: San Francisco Bay: Napa County: Napa River: Mare Island Strait', count=30),
 Row(continent='Canada; Ontario; Algoma District', count=30),
 Row(continent='Cuenca del Magdalena', count=30)]



In [52]:

    
# Number of GBIF specimen records per event year, most recent year first.
date_group = (
    renamed_cols
    .groupBy(year("eventDate").cast("integer").alias("year"))
    .count()
    .orderBy("year", ascending=False)
)



In [53]:

    
date_group.head(10)









    Out[53]:





[Row(year=9960, count=1),
 Row(year=9959, count=1),
 Row(year=9953, count=2),
 Row(year=9906, count=1),
 Row(year=9891, count=1),
 Row(year=9889, count=2),
 Row(year=9870, count=1),
 Row(year=9866, count=1),
 Row(year=9865, count=1),
 Row(year=9860, count=1)]



In [55]:

    
date_group.describe().show()









    



+-------+-----------------+-----------------+
|summary|             year|            count|
+-------+-----------------+-----------------+
|  count|              854|              855|
|   mean|2498.307962529274|73899.54970760235|
| stddev|2254.309979612469| 478474.536895311|
|    min|                0|                1|
|    max|             9960|         12972411|
+-------+-----------------+-----------------+



In [43]:

    
import re

# need to do some harder-core data cleaning
continents = set(["africa", "asia", "oceania", "europe", "north america", "south america"])

# One alternation over every continent name, anchored on word boundaries.
# A plain substring test would classify "australasia" or "eurasia" as "asia".
# The names are sorted so the compiled pattern does not depend on set
# iteration order.
_continent_pattern = re.compile(
    r"\b(" + "|".join(re.escape(name) for name in sorted(continents)) + r")\b"
)


def fix_continent_gbif(c):
    """Map a free-text GBIF continent value onto one of ``continents`` or "other".

    The value is lower-cased and underscores are treated as spaces, so
    "NORTH_AMERICA" and "North America, Canada" both give "north america".
    A continent only counts when it appears as a whole word or phrase.

    Parameters
    ----------
    c : str or None
        Raw continent value; ``None`` and the empty string give "other".

    Returns
    -------
    str
        The first continent named in ``c`` reading left to right, or "other"
        when none is present. Choosing the leftmost mention makes the result
        deterministic for values that name more than one continent.
    """
    if not c:
        return "other"
    # normalise once instead of once per candidate continent
    found = _continent_pattern.search(c.lower().replace("_", " "))
    return found.group(1) if found else "other"


print(fix_continent_gbif(None))
print(fix_continent_gbif("europe"))
print(fix_continent_gbif("oceania"))
print(fix_continent_gbif("NORTH_AMERICA"))
print(fix_continent_gbif("North America, Canada, Manitoba, Churchill"))
print(fix_continent_gbif("East Indies, Indonesia: Pulo Pandjang, off Sumatra"))
print(fix_continent_gbif("Asia; Thailand; Pathum Thani"))









    



other
europe
oceania
north america
north america
other
asia



In [44]:

    
from pyspark.sql.functions import udf
fix_continent_gbif_udf = udf(fix_continent_gbif)



In [45]:

    
# Pivot the GBIF specimen rows to one row per event year with one count
# column per continent as normalised by fix_continent_gbif_udf, and persist
# the result. The key column is referenced below as "yearcollected_fixed".
yc_cross_gbif = (renamed_cols
            .select(year("eventDate").cast("integer").alias("yearcollected"),
                    renamed_cols.continent)
            .withColumn("fixed", fix_continent_gbif_udf(renamed_cols.continent))
            .crosstab("yearcollected", "fixed")
            .persist()
            )



In [46]:

    
# crosstab() hands the year key back as a string column (the iDigBio crosstab
# preview earlier in this notebook shows yearcollected_fixed='892'). Cast it
# to an integer first so that:
#   * the 1817-2017 range test and the sort are numeric by construction
#     instead of relying on implicit string/number coercion and lexical order,
#   * keys that are not numbers become NULL and are dropped by the range test,
#   * pandas receives a numeric year column that plots on a continuous axis.
pandas_yc_cross_gbif = (yc_cross_gbif
                   .withColumn("yearcollected_fixed",
                               yc_cross_gbif.yearcollected_fixed.cast("integer"))
                   .filter("yearcollected_fixed BETWEEN 1817 AND 2017")
                   .orderBy("yearcollected_fixed")
                   .toPandas()
                   )



In [47]:

    
import pandas as pd
import numpy as np

# Stacked bar chart: GBIF specimen records per event year, one stacked series
# per normalised continent.

# crosstab() returns the year key as strings; convert to integers so the bars
# share a numeric axis with the axvline markers and x-limits set below.
years = pandas_yc_cross_gbif['yearcollected_fixed'].astype(int)

# Stack "other" first, then the remaining series alphabetically. The series
# names are read from the crosstab itself rather than from the global
# `continents` set: that name is assigned different members in different
# cells of this notebook, and a data subset may lack a continent entirely,
# either of which used to break this loop with a KeyError.
continents_list = sorted(
    (name for name in pandas_yc_cross_gbif.columns if name != 'yearcollected_fixed'),
    key=lambda name: (name != "other", name),
)

# blue -> orange, one colour per stacked series
colors = ["#a75902", "#00e5c8", "#1bcda7", "#37b686", "#539f65", "#51827c", "#bf6603"]

fig, axes = plt.subplots()

plots = []
bottoms = np.zeros(len(years))  # running top of the stack for every year
for position, c in enumerate(continents_list):
    heights = pandas_yc_cross_gbif[c].values
    plots.append(
        axes.bar(years, heights,
                 # wrap around instead of raising if there are ever more series than colours
                 color=colors[position % len(colors)], edgecolor='none',
                 width=1.0, bottom=bottoms)
    )
    # build a new array each pass so the baseline handed to bars that are
    # already drawn is never modified afterwards
    bottoms = bottoms + heights

# Start of WWI
axes.axvline(x=1914)

# Start of WWII
axes.axvline(x=1939)

# "1988 - October 31: President Reagan signs the NSF Authorization Act of 1988, thereby authorizing the doubling of the NSF budget over the next five years."
axes.axvline(x=1988)

axes.legend(plots, continents_list, loc=2)
axes.set_title("Specimens in GBIF by Collection Year and Continent")
axes.set_ylabel("Number of Specimen Records")
axes.set_xlabel("Year")
axes.set_xlim([1815, 2020])
axes.set_ylim([0, 1200000])
fig.set_size_inches(12, 4)

	yearcollected_fixed	africa	asia	australia	europe	north america	other	south america
0	1817	76	139	385	534	471	390	1452
1	1818	43	246	477	805	5365	4259	361
2	1819	76	332	487	1066	560	914	313
3	1820	50	155	217	1741	641	997	208
4	1821	79	715	115	1853	464	876	652
5	1822	343	451	547	1241	3362	1196	325
6	1823	66	82	1441	866	371	1032	247
7	1824	319	141	124	1553	516	1181	818
8	1825	338	178	311	1246	952	1628	300
9	1826	339	120	87	1584	889	2143	1250
10	1827	293	187	166	2196	752	3063	1249
11	1828	258	236	173	1950	2040	3001	954
12	1829	555	238	227	1807	1044	3407	718
13	1830	220	225	76	1712	523	3964	763
14	1831	49	218	102	1615	945	2642	1317
15	1832	209	224	196	1723	1500	4771	886
16	1833	139	261	225	2579	2045	4977	954
17	1834	374	294	241	1824	1694	2821	881
18	1835	677	570	267	1744	1405	3377	973
19	1836	178	901	497	3318	881	3752	2488
20	1837	2048	576	595	3238	1464	5717	1138
21	1838	1943	671	2013	3125	2981	9108	4119
22	1839	1828	238	3187	3149	2278	6150	1841
23	1840	1760	388	2724	5650	2979	7144	1894
24	1841	1315	1655	906	4406	3521	5255	1931
25	1842	538	2882	1174	4458	3814	7077	2995
26	1843	288	1361	2793	3816	2640	7017	1386
27	1844	320	876	1373	4322	2419	6085	1315
28	1845	237	1037	789	4818	3275	4734	2008
29	1846	417	1005	882	5464	4362	3782	1781
...	...	...	...	...	...	...	...	...
170	1987	34874	37605	130764	46302	397502	118522	129859
171	1988	44456	31930	141211	48189	383202	104349	123776
172	1989	38428	36151	134280	45063	381650	99985	127818
173	1990	42041	36005	136161	56755	391754	99632	109858
174	1991	29538	39925	131091	62951	405979	109148	113523
175	1992	30309	37354	151296	67376	408740	113462	114611
176	1993	37388	40643	159553	77994	421136	132286	106249
177	1994	39959	32979	145041	69078	451914	128029	115625
178	1995	52436	49346	172370	71207	511523	138048	112657
179	1996	38526	41148	137164	68625	444017	127164	113196
180	1997	28611	40305	138453	76381	433645	149017	124524
181	1998	33057	46807	133546	71646	439901	123515	104031
182	1999	38141	42899	140347	63971	407015	135641	107499
183	2000	51413	40252	114264	72520	426556	164377	118871
184	2001	132746	31912	109268	60116	410070	145436	107078
185	2002	85929	30626	96699	65991	440842	152949	74342
186	2003	68127	30017	101329	92905	440627	173595	108202
187	2004	45539	53708	119930	100434	452017	148997	81527
188	2005	54908	39664	107732	101349	422926	199265	71840
189	2006	37666	45890	83399	63043	411666	210184	57729
190	2007	51341	35545	83050	59972	378394	188704	68636
191	2008	41523	35243	84874	59476	410031	184670	48399
192	2009	41027	21076	73359	48657	404088	189340	60834
193	2010	33274	21877	69704	48589	454417	181268	54929
194	2011	33872	21994	52976	43900	368527	157945	48408
195	2012	52683	22216	43513	34557	537893	183608	46927
196	2013	26231	22368	39975	39156	512247	119463	33712
197	2014	16198	19373	28999	24021	403086	106155	28562
198	2015	12080	15112	21054	14454	135191	69078	7593
199	2016	3571	10894	6803	9361	88688	36387	5159