In [7]:
from pyspark.sql.functions import year, udf
import matplotlib.pyplot as plt
%matplotlib inline

In [1]:
dir()


Out[1]:
['In',
 'Out',
 'SQLContext',
 'SparkContext',
 'SparkSession',
 'StorageLevel',
 '_',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_i',
 '_i1',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 '_sh',
 'atexit',
 'exit',
 'get_ipython',
 'os',
 'platform',
 'py4j',
 'pyspark',
 'quit']

In [1]:
# Load the iDigBio snapshot. The Parquet format is named explicitly (as the GBIF
# load later in this notebook does) instead of relying on the session's
# default data source setting.
df = sqlContext.read.parquet("/guoda/data/idigbio-20190612T171757.parquet")

In [3]:
df.count()


Out[3]:
104661524

In [4]:
# Record count per collection year (year extracted from `datecollected`),
# sorted by year and marked for persistence because several cells reuse it.
year_summary = (
    df
    .groupBy(year(df["datecollected"]).cast("integer").alias("yearcollected"))
    .count()
    .sort("yearcollected")
    .persist()
)

In [5]:
year_summary.count()


Out[5]:
1390

In [6]:
year_summary.printSchema()


root
 |-- yearcollected: integer (nullable = true)
 |-- count: long (nullable = false)


In [7]:
year_summary.describe().show()


+-------+------------------+-----------------+
|summary|     yearcollected|            count|
+-------+------------------+-----------------+
|  count|              1389|             1390|
|   mean|  2316.51403887689|75296.06043165468|
| stddev|2122.4294635611996| 752125.436430329|
|    min|               100|                1|
|    max|              9999|         27146522|
+-------+------------------+-----------------+


In [8]:
year_summary.head(10)


Out[8]:
[Row(yearcollected=None, count=27146522),
 Row(yearcollected=100, count=3),
 Row(yearcollected=102, count=1),
 Row(yearcollected=103, count=2),
 Row(yearcollected=104, count=2),
 Row(yearcollected=105, count=5),
 Row(yearcollected=106, count=9),
 Row(yearcollected=107, count=3),
 Row(yearcollected=108, count=3),
 Row(yearcollected=109, count=4)]

In [9]:
year_summary.orderBy("yearcollected", ascending=False).head(10)


Out[9]:
[Row(yearcollected=9999, count=9187),
 Row(yearcollected=9998, count=1),
 Row(yearcollected=9983, count=1),
 Row(yearcollected=9972, count=4),
 Row(yearcollected=9960, count=1),
 Row(yearcollected=9948, count=2),
 Row(yearcollected=9863, count=1),
 Row(yearcollected=9855, count=1),
 Row(yearcollected=9840, count=2),
 Row(yearcollected=9832, count=3)]

In [10]:
# Keep only the years 1817-2017 (inclusive), sort by year, and collect the
# result to the driver as a pandas DataFrame for plotting.
pandas_year_summary = (
    year_summary
    .where((year_summary["yearcollected"] >= 1817) & (year_summary["yearcollected"] <= 2017))
    .sort("yearcollected")
    .toPandas()
)

In [11]:
pandas_year_summary.head()


Out[11]:
yearcollected count
0 1817 3447
1 1818 11556
2 1819 3748
3 1820 4009
4 1821 4754

In [12]:
plt.bar(pandas_year_summary["yearcollected"], pandas_year_summary["count"])


Out[12]:
<Container object of 201 artists>

In [13]:
# Record count per (collection year, raw continent value), sorted by year.
yc_sum = (
    df
    .groupBy(year(df["datecollected"]).cast("integer").alias("yearcollected"),
             df["continent"])
    .count()
    .sort("yearcollected")
    .persist()
)

In [14]:
yc_sum.head(10)


Out[14]:
[Row(yearcollected=None, continent='américa do sul', count=96917),
 Row(yearcollected=None, continent='leptocarpa', count=2),
 Row(yearcollected=None, continent='sowev', count=1),
 Row(yearcollected=None, continent='denticulata', count=1),
 Row(yearcollected=None, continent='mediterranean sea', count=110),
 Row(yearcollected=None, continent='australasia?', count=29),
 Row(yearcollected=None, continent='north atlantic, atlantic ocean', count=6),
 Row(yearcollected=None, continent='north america, central america', count=1),
 Row(yearcollected=None, continent='copy of ds-soolsource_workbook_2012-2-7_15:52', count=3),
 Row(yearcollected=None, continent='antarctic region', count=1)]

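We're going to have to fix some data! The `continent` values shown above are free text rather than a fixed list of continents, so they need to be normalized before we can group by continent.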


In [11]:
# Continent labels that are kept as-is; every other value is lumped together.
continents = {"africa", "asia", "australia", "europe", "north america", "south america"}

def fix_continent(c):
    """Return ``c`` unchanged if it is a member of ``continents``, else "other".

    The comparison is an exact set-membership test, so differently cased or
    misspelled names (and ``None``) all map to "other".
    """
    return c if c in continents else "other"

# Quick sanity check: a known continent and an unrecognized spelling.
print(fix_continent("europe"))
print(fix_continent("oceana"))


europe
other

In [13]:
fix_continent_udf = udf(fix_continent)

In [10]:
# Per-year record counts again, this time grouped on the cleaned continent
# label produced by fix_continent_udf (which must already be defined).
yc_sum = (
    df
    .withColumn("fixed", fix_continent_udf(df["continent"]))
    .groupBy(year("datecollected").cast("integer").alias("yearcollected"), "fixed")
    .count()
    .sort("yearcollected")
    .persist()
)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-5c765bf16c2c> in <module>()
----> 1 fix_continent_udf = udf(fix_continent)
      2 yc_sum = (df
      3           .withColumn("fixed", fix_continent_udf(df.continent))
      4           .groupBy(year("datecollected").cast("integer").alias("yearcollected"),
      5                    "fixed")

NameError: name 'fix_continent' is not defined

In [17]:
yc_sum.head(10)


Out[17]:
[Row(yearcollected=None, fixed='north america', count=6661147),
 Row(yearcollected=None, fixed='europe', count=2312443),
 Row(yearcollected=None, fixed='asia', count=1451924),
 Row(yearcollected=None, fixed='australia', count=1386899),
 Row(yearcollected=None, fixed='south america', count=1401163),
 Row(yearcollected=None, fixed='other', count=13130500),
 Row(yearcollected=None, fixed='africa', count=802446),
 Row(yearcollected=100, fixed='other', count=1),
 Row(yearcollected=100, fixed='south america', count=1),
 Row(yearcollected=100, fixed='north america', count=1)]

In [18]:
yc_sum.groupBy(yc_sum.fixed).count().show()


+-------------+-----+
|        fixed|count|
+-------------+-----+
|    australia|  522|
|north america|  994|
|       europe|  546|
|south america|  513|
|        other|  779|
|       africa|  412|
|         asia|  413|
+-------------+-----+


In [19]:
# Year x cleaned-continent contingency table. crosstab() names its first
# column "yearcollected_fixed" and emits one count column per "fixed" label.
yc_cross = (
    df
    .select(year(df["datecollected"]).cast("integer").alias("yearcollected"),
            df["continent"])
    .withColumn("fixed", fix_continent_udf(df["continent"]))
    .crosstab("yearcollected", "fixed")
)

In [20]:
yc_cross.head(5)


Out[20]:
[Row(yearcollected_fixed='892', africa=0, asia=0, australia=0, europe=0, north america=2, other=0, south america=0),
 Row(yearcollected_fixed='2199', africa=0, asia=0, australia=0, europe=0, north america=1, other=1, south america=0),
 Row(yearcollected_fixed='1665', africa=2, asia=0, australia=0, europe=1, north america=2, other=2, south america=0),
 Row(yearcollected_fixed='1036', africa=0, asia=2, australia=0, europe=0, north america=15, other=0, south america=0),
 Row(yearcollected_fixed='9131', africa=0, asia=0, australia=0, europe=0, north america=1, other=0, south america=0)]

In [21]:
# crosstab() returns the year labels as strings (e.g. '892'), so cast them to
# integers first: the range filter and the sort are then numeric, and the
# pandas frame hands real numbers (not text labels) to the plotting cells.
# Labels that are not valid integers become null and are dropped by the filter.
pandas_yc_cross = (yc_cross
                   .withColumn("yearcollected_fixed",
                               yc_cross.yearcollected_fixed.cast("integer"))
                   .filter("yearcollected_fixed BETWEEN 1817 AND 2017")
                   .orderBy("yearcollected_fixed")
                   .toPandas()
                   )

In [22]:
pandas_yc_cross.head(200)


Out[22]:
yearcollected_fixed africa asia australia europe north america other south america
0 1817 76 139 385 534 471 390 1452
1 1818 43 246 477 805 5365 4259 361
2 1819 76 332 487 1066 560 914 313
3 1820 50 155 217 1741 641 997 208
4 1821 79 715 115 1853 464 876 652
5 1822 343 451 547 1241 3362 1196 325
6 1823 66 82 1441 866 371 1032 247
7 1824 319 141 124 1553 516 1181 818
8 1825 338 178 311 1246 952 1628 300
9 1826 339 120 87 1584 889 2143 1250
10 1827 293 187 166 2196 752 3063 1249
11 1828 258 236 173 1950 2040 3001 954
12 1829 555 238 227 1807 1044 3407 718
13 1830 220 225 76 1712 523 3964 763
14 1831 49 218 102 1615 945 2642 1317
15 1832 209 224 196 1723 1500 4771 886
16 1833 139 261 225 2579 2045 4977 954
17 1834 374 294 241 1824 1694 2821 881
18 1835 677 570 267 1744 1405 3377 973
19 1836 178 901 497 3318 881 3752 2488
20 1837 2048 576 595 3238 1464 5717 1138
21 1838 1943 671 2013 3125 2981 9108 4119
22 1839 1828 238 3187 3149 2278 6150 1841
23 1840 1760 388 2724 5650 2979 7144 1894
24 1841 1315 1655 906 4406 3521 5255 1931
25 1842 538 2882 1174 4458 3814 7077 2995
26 1843 288 1361 2793 3816 2640 7017 1386
27 1844 320 876 1373 4322 2419 6085 1315
28 1845 237 1037 789 4818 3275 4734 2008
29 1846 417 1005 882 5464 4362 3782 1781
... ... ... ... ... ... ... ... ...
170 1987 34874 37605 130764 46302 397502 118522 129859
171 1988 44456 31930 141211 48189 383202 104349 123776
172 1989 38428 36151 134280 45063 381650 99985 127818
173 1990 42041 36005 136161 56755 391754 99632 109858
174 1991 29538 39925 131091 62951 405979 109148 113523
175 1992 30309 37354 151296 67376 408740 113462 114611
176 1993 37388 40643 159553 77994 421136 132286 106249
177 1994 39959 32979 145041 69078 451914 128029 115625
178 1995 52436 49346 172370 71207 511523 138048 112657
179 1996 38526 41148 137164 68625 444017 127164 113196
180 1997 28611 40305 138453 76381 433645 149017 124524
181 1998 33057 46807 133546 71646 439901 123515 104031
182 1999 38141 42899 140347 63971 407015 135641 107499
183 2000 51413 40252 114264 72520 426556 164377 118871
184 2001 132746 31912 109268 60116 410070 145436 107078
185 2002 85929 30626 96699 65991 440842 152949 74342
186 2003 68127 30017 101329 92905 440627 173595 108202
187 2004 45539 53708 119930 100434 452017 148997 81527
188 2005 54908 39664 107732 101349 422926 199265 71840
189 2006 37666 45890 83399 63043 411666 210184 57729
190 2007 51341 35545 83050 59972 378394 188704 68636
191 2008 41523 35243 84874 59476 410031 184670 48399
192 2009 41027 21076 73359 48657 404088 189340 60834
193 2010 33274 21877 69704 48589 454417 181268 54929
194 2011 33872 21994 52976 43900 368527 157945 48408
195 2012 52683 22216 43513 34557 537893 183608 46927
196 2013 26231 22368 39975 39156 512247 119463 33712
197 2014 16198 19373 28999 24021 403086 106155 28562
198 2015 12080 15112 21054 14454 135191 69078 7593
199 2016 3571 10894 6803 9361 88688 36387 5159

200 rows × 8 columns


In [23]:
# Quick look at a single series. The crosstab year labels may be strings, so
# convert them to integers to get a numeric (not categorical) x-axis.
p1 = plt.bar(pandas_yc_cross['yearcollected_fixed'].astype(int),
             pandas_yc_cross['asia'], color='#d62728', edgecolor='none')
#p2 = plt.bar(pandas_yc_cross['yearcollected_fixed'],
#                 pandas_yc_cross['africa'], color='#05ff05', edgecolor='none')



In [27]:
import pandas as pd
import numpy as np

# Stacked bar chart: iDigBio specimen records per collection year, by continent.

# The series to stack are read from the crosstab's own columns instead of the
# global `continents` set: that name is rebound later in the notebook, and a
# label missing from the data (e.g. in a small subset) would otherwise raise
# KeyError inside the loop. "other" sits at the bottom of the stack, followed
# by the remaining labels in alphabetical order.
continents_list = sorted(set(pandas_yc_cross.columns) - {"yearcollected_fixed", "other"})
if "other" in pandas_yc_cross.columns:
    continents_list.insert(0, "other")

# blue -> red
#colors = ["#6d6263", "#00e5c8", "#1bc4ae", "#36a395", "#51827c", "#88414a", "#a32031", "#bf0018"]

# blue -> orange (one colour per stacked series, in stacking order)
colors = ["#a75902", "#00e5c8", "#1bcda7", "#37b686", "#539f65", "#51827c", "#bf6603"]

# crosstab() labels the year column with strings; make it numeric so the bars,
# the vertical marker lines and the x-limits all share one numeric axis.
years = pd.to_numeric(pandas_yc_cross['yearcollected_fixed'])

fig, axes = plt.subplots(figsize=(12, 4))

plots = []
bottoms = np.zeros(len(years))  # running top of the stack for each year
for position, c in enumerate(continents_list):
    heights = pandas_yc_cross[c].values
    plots.append(
        axes.bar(years, heights,
                 color=colors[position], edgecolor='none',
                 width=1.0, bottom=bottoms)
    )
    # Build a new array rather than adding in place, so the baseline already
    # handed to the previous bar() call is never modified afterwards.
    bottoms = bottoms + heights

# Start of WWI
axes.axvline(x=1914)

# Start of WWII
axes.axvline(x=1939)

# "1988 - October 31: President Reagan signs the NSF Authorization Act of 1988, thereby authorizing the doubling of the NSF budget over the next five years."
axes.axvline(x=1988)

axes.legend(plots, continents_list, loc=2)
axes.set_title("Specimens in iDigBio by Collection Year and Continent")
axes.set_ylabel("Number of Specimen Records")
axes.set_xlabel("Year")
axes.set_xlim([1815, 2020])
axes.set_ylim([0, 1200000])


GBIF! Now repeat the same year-by-continent analysis on GBIF specimen records.


In [1]:
df = sqlContext.read.parquet("/guoda/data/gbif-idigbio.parquet/source=gbif/date=20160825")

In [17]:
# Register the GBIF frame as the temporary SQL view "df" so the query below can
# refer to it by name.
df.createOrReplaceTempView("df")
# The source columns are named by full term URIs, so they are back-quoted in
# the SQL and given the short aliases `eventDate` and `continent`. Only rows
# whose basisOfRecord contains "SPECIMEN" are kept; the result is persisted
# because the following cells query it several times.
renamed_cols = sqlContext.sql("""
SELECT `http://rs.tdwg.org/dwc/terms/eventDate` as eventDate,
       `http://rs.tdwg.org/dwc/terms/continent` as continent
FROM df
WHERE `http://rs.tdwg.org/dwc/terms/basisOfRecord` LIKE "%SPECIMEN%"
""").persist()

In [18]:
renamed_cols.count()


Out[18]:
63184115

In [37]:
renamed_cols.groupBy("continent").count().orderBy("count", ascending=False).head(100)


Out[37]:
[Row(continent='', count=39879474),
 Row(continent='NORTH_AMERICA', count=10329814),
 Row(continent='SOUTH_AMERICA', count=4873126),
 Row(continent='EUROPE', count=4107357),
 Row(continent='AFRICA', count=2147077),
 Row(continent='ASIA', count=1088336),
 Row(continent='OCEANIA', count=578096),
 Row(continent=None, count=129861),
 Row(continent='ANTARCTICA', count=26681),
 Row(continent='EASTERN NORTH PACIFIC | USA | ALASKA |  |  |  |  |', count=1475),
 Row(continent='NULL', count=1410),
 Row(continent='Eastern Pacific | USA | California | - |  |  |  |', count=719),
 Row(continent='North America, United States, Alaska', count=676),
 Row(continent='North Atlantic Ocean, United States', count=540),
 Row(continent='North Atlantic Ocean, United States, Massachusetts', count=495),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States, Florida', count=449),
 Row(continent='|  |  |  |  | Gulf of Mexico |  |', count=272),
 Row(continent='Antarctic Ocean, Antarctica', count=195),
 Row(continent='North Atlantic Ocean', count=190),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States, Louisiana', count=183),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States, Texas', count=183),
 Row(continent='Atlantic, Bermuda, Bermuda', count=177),
 Row(continent='North America | Canada | Alberta', count=131),
 Row(continent='North Atlantic Ocean, Gulf of Mexico, United States', count=129),
 Row(continent='North Atlantic Ocean, Caribbean Sea, Panama, Colon', count=127),
 Row(continent='Eastern Pacific | Mexico | Baja California | - |  |  |  |', count=120),
 Row(continent='Oceania, Palau, ,', count=104),
 Row(continent='South Pacific Ocean', count=99),
 Row(continent='North Atlantic Ocean, Caribbean Sea, Belize', count=93),
 Row(continent='EASTERN NORTH PACIFIC | USA | OREGON |  |  |  |  |', count=93),
 Row(continent='Северный Ледовитый океан', count=87),
 Row(continent='North Atlantic Ocean, United States, South Carolina', count=84),
 Row(continent='Eastern Pacific | USA | California | Los Angeles |  |  |  |', count=84),
 Row(continent='| Venezuela | Apure |  |  | Cinaruco River |  |', count=81),
 Row(continent='North Atlantic Ocean, United States, Florida', count=77),
 Row(continent='South Atlantic Ocean, Scotia Sea', count=76),
 Row(continent='Eastern Pacific | Mexico | - | - |  |  |  |', count=75),
 Row(continent='North Pacific Ocean, United States, Hawaii', count=74),
 Row(continent='North Pacific Ocean, Bering Sea, United States, Alaska', count=74),
 Row(continent='North America; United States; California', count=66),
 Row(continent='Canada; Ontario; Kenora District', count=66),
 Row(continent='North Atlantic Ocean, United States, North Carolina', count=66),
 Row(continent='South Atlantic Ocean', count=62),
 Row(continent='NW Atlantic; Bermuda', count=59),
 Row(continent='North America, Bermuda, ,', count=59),
 Row(continent='New Caledonia', count=58),
 Row(continent='North Pacific Ocean, United States, California', count=57),
 Row(continent='Antarctic | - | - | - |  |  |  |', count=56),
 Row(continent='Oceania, Micronesia, , Pohnpei', count=56),
 Row(continent='Indo-West Pacific; Micronesia; Pohnpei; Kapingamarangi Atoll', count=56),
 Row(continent='Antarctic Ocean, Ross Sea, Antarctica', count=56),
 Row(continent='EASTERN NORTH PACIFIC | USA | WASHINGTON |  |  |  |  |', count=52),
 Row(continent='Indo-West Pacific; Palau', count=50),
 Row(continent='Fiji; Great Astrolabe Reef', count=50),
 Row(continent='EASTERN NORTH PACIFIC | USA | CALIFORNIA |  |  |  |  |', count=49),
 Row(continent='North America, USA, , California State', count=48),
 Row(continent='Canada; Ontario; Thunder Bay District', count=48),
 Row(continent='North Atlantic Ocean, United States, Georgia', count=48),
 Row(continent='CALIFORNIA: San Francisco Estuary: San Francisco Bay: San Mateo County', count=46),
 Row(continent='South Atlantic Ocean, Argentina, Tierra del Fuego', count=45),
 Row(continent='Canada; Manitoba', count=45),
 Row(continent='Canada; Ontario; Nipissing District', count=45),
 Row(continent='Oceania, Micronesia, ,', count=44),
 Row(continent='NE Pacific; United States; California', count=44),
 Row(continent='Eastern Pacific | Costa Rica | - | - |  |  |  |', count=43),
 Row(continent='Chagos Archipelago;British Indian Ocean Territory; Peros Banhos Atoll', count=43),
 Row(continent='North America; Canada; Quebec; Anticosti Island', count=43),
 Row(continent='South Pacific; Fiji; Viti Levu Group; Viti Levu I.', count=42),
 Row(continent='North Atlantic Ocean, Bahamas', count=41),
 Row(continent='Antarctic Ocean', count=41),
 Row(continent='Canada; Newfoundland', count=40),
 Row(continent='Palau; Hatohobei State', count=38),
 Row(continent='Indo-West Pacific; Thailand', count=38),
 Row(continent='Palau; Koror', count=38),
 Row(continent='Indo-West Pacific; Micronesia; Ifalik Atoll', count=38),
 Row(continent='| USA |  |  |  | Gulf of Mexico |  |', count=37),
 Row(continent='North America, Mexico, ,', count=37),
 Row(continent='NE Pacific', count=36),
 Row(continent='Тихий океан', count=35),
 Row(continent='Eastern Pacific | USA | California | San Luis Obispo |  |  |  |', count=35),
 Row(continent='Central Pacific | USA | Hawaii | - |  |  |  |', count=35),
 Row(continent='Oceania, Fr Polynesia, ,', count=34),
 Row(continent='Guyana; Essequibo', count=34),
 Row(continent='South Pacific Ocean, New Zealand', count=34),
 Row(continent='South Pacific; French Polynesia; Tuamotu Archipelago; Raroia Atoll', count=33),
 Row(continent='Atlantic', count=33),
 Row(continent='Canada; British Columbia', count=33),
 Row(continent='Indo-West Pacific; Micronesia; Yap; Yap Is.; Yap I.', count=33),
 Row(continent='North Pacific Ocean', count=33),
 Row(continent='North America, United States, Pennsylvania', count=32),
 Row(continent='Vietnam; Khanh Hoa', count=32),
 Row(continent='Guyana; Region 6 (Kurupukari)', count=32),
 Row(continent='Canada; Northwest Territories', count=31),
 Row(continent='NE Pacific; Mexico; Sonora', count=31),
 Row(continent='North America; USA; Connecticut; Tolland County; Union', count=30),
 Row(continent='Pacific Ocean;North Subtropical Oceania;Hawaiian Islands;', count=30),
 Row(continent='Eastern Pacific | Costa Rica | Puntarenas | - |  |  |  |', count=30),
 Row(continent='CALIFORNIA: San Francisco Estuary: San Francisco Bay: Napa County: Napa River: Mare Island Strait', count=30),
 Row(continent='Canada; Ontario; Algoma District', count=30),
 Row(continent='Cuenca del Magdalena', count=30)]

In [52]:
# GBIF record count per event year, latest year first.
date_group = (
    renamed_cols
    .groupBy(year(renamed_cols["eventDate"]).cast("integer").alias("year"))
    .count()
    .sort("year", ascending=False)
)

In [53]:
date_group.head(10)


Out[53]:
[Row(year=9960, count=1),
 Row(year=9959, count=1),
 Row(year=9953, count=2),
 Row(year=9906, count=1),
 Row(year=9891, count=1),
 Row(year=9889, count=2),
 Row(year=9870, count=1),
 Row(year=9866, count=1),
 Row(year=9865, count=1),
 Row(year=9860, count=1)]

In [55]:
date_group.describe().show()


+-------+-----------------+-----------------+
|summary|             year|            count|
+-------+-----------------+-----------------+
|  count|              854|              855|
|   mean|2498.307962529274|73899.54970760235|
| stddev|2254.309979612469| 478474.536895311|
|    min|                0|                1|
|    max|             9960|         12972411|
+-------+-----------------+-----------------+


In [43]:
import re

# need to do some harder-core data cleaning
# NOTE: this rebinds the `continents` name that fix_continent() reads earlier
# in the notebook ("oceania" replaces "australia").
continents = set(["africa", "asia", "oceania", "europe", "north america", "south america"])

# One alternation over all continent names, anchored on word boundaries so a
# name only matches as a whole word ("australasia" must not count as "asia").
# Names are sorted so the compiled pattern does not depend on set ordering.
_continent_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(name) for name in sorted(continents)) + r")\b"
)

def fix_continent_gbif(c):
    """Map a free-text GBIF continent value onto a known continent name.

    The value is lower-cased and underscores are turned into spaces
    ("NORTH_AMERICA" -> "north america"). The continent name that occurs
    first in the text, as a whole word, is returned; when the value is
    empty/None or names no known continent, "other" is returned.

    Picking the leftmost whole-word match makes the result deterministic for
    values that mention more than one continent, instead of depending on the
    iteration order of the ``continents`` set.
    """
    if not c:
        return "other"
    found = _continent_pattern.search(c.lower().replace("_", " "))
    return found.group(0) if found else "other"
print(fix_continent_gbif(None))
print(fix_continent_gbif("europe"))
print(fix_continent_gbif("oceania"))
print(fix_continent_gbif("NORTH_AMERICA"))
print(fix_continent_gbif("North America, Canada, Manitoba, Churchill"))
print(fix_continent_gbif("East Indies, Indonesia: Pulo Pandjang, off Sumatra"))
print(fix_continent_gbif("Asia; Thailand; Pathum Thani"))


other
europe
oceania
north america
north america
other
asia

In [44]:
from pyspark.sql.functions import udf
fix_continent_gbif_udf = udf(fix_continent_gbif)

In [45]:
# Year x cleaned-continent contingency table for GBIF. crosstab() names its
# first column "yearcollected_fixed"; the result is persisted for reuse.
yc_cross_gbif = (
    renamed_cols
    .select(year(renamed_cols["eventDate"]).cast("integer").alias("yearcollected"),
            renamed_cols["continent"])
    .withColumn("fixed", fix_continent_gbif_udf(renamed_cols["continent"]))
    .crosstab("yearcollected", "fixed")
    .persist()
)

In [46]:
# crosstab() returns the year labels as strings, so cast them to integers
# first: the range filter and the sort are then numeric, and the pandas frame
# hands real numbers (not text labels) to the plotting cell. Labels that are
# not valid integers become null and are dropped by the filter.
pandas_yc_cross_gbif = (yc_cross_gbif
                   .withColumn("yearcollected_fixed",
                               yc_cross_gbif.yearcollected_fixed.cast("integer"))
                   .filter("yearcollected_fixed BETWEEN 1817 AND 2017")
                   .orderBy("yearcollected_fixed")
                   .toPandas()
                   )

In [47]:
import pandas as pd
import numpy as np

# Stacked bar chart: GBIF specimen records per collection year, by continent.

# The series to stack are read from the crosstab's own columns instead of the
# global `continents` set, so the chart always matches the labels actually
# present in the data and a missing label cannot raise KeyError in the loop.
# "other" sits at the bottom of the stack, followed by the remaining labels
# in alphabetical order.
continents_list = sorted(set(pandas_yc_cross_gbif.columns) - {"yearcollected_fixed", "other"})
if "other" in pandas_yc_cross_gbif.columns:
    continents_list.insert(0, "other")

# blue -> red
#colors = ["#6d6263", "#00e5c8", "#1bc4ae", "#36a395", "#51827c", "#88414a", "#a32031", "#bf0018"]

# blue -> orange (one colour per stacked series, in stacking order)
colors = ["#a75902", "#00e5c8", "#1bcda7", "#37b686", "#539f65", "#51827c", "#bf6603"]

# crosstab() labels the year column with strings; make it numeric so the bars,
# the vertical marker lines and the x-limits all share one numeric axis.
years = pd.to_numeric(pandas_yc_cross_gbif['yearcollected_fixed'])

fig, axes = plt.subplots(figsize=(12, 4))

plots = []
bottoms = np.zeros(len(years))  # running top of the stack for each year
for position, c in enumerate(continents_list):
    heights = pandas_yc_cross_gbif[c].values
    plots.append(
        axes.bar(years, heights,
                 color=colors[position], edgecolor='none',
                 width=1.0, bottom=bottoms)
    )
    # Build a new array rather than adding in place, so the baseline already
    # handed to the previous bar() call is never modified afterwards.
    bottoms = bottoms + heights

# Start of WWI
axes.axvline(x=1914)

# Start of WWII
axes.axvline(x=1939)

# "1988 - October 31: President Reagan signs the NSF Authorization Act of 1988, thereby authorizing the doubling of the NSF budget over the next five years."
axes.axvline(x=1988)

axes.legend(plots, continents_list, loc=2)
axes.set_title("Specimens in GBIF by Collection Year and Continent")
axes.set_ylabel("Number of Specimen Records")
axes.set_xlabel("Year")
axes.set_xlim([1815, 2020])
axes.set_ylim([0, 1200000])