In [6]:
#!sudo apt-get install python-dev python-pip python-setuptools build-essential
#!pip install --upgrade setuptools
#!pip install csvkit
#!pip install --upgrade csvkit

In [3]:
!ls -lrth


total 2.2G
-rw-r--r-- 1 ross ross 1.7G May  1 19:34 occurrence.csv
-rw-r--r-- 1 ross ross 163M May  1 19:34 multimedia.csv
-rw-r--r-- 1 ross ross 6.1K May  1 19:34 meta.xml
-rw-rw-r-- 1 ross ross 409M May  1 19:51 5d27e8acaac843c4a804e881fb4c6934-99-1430505285.zip
-rw------- 1 ross ross  181 May  1 20:51 NHM-collections.ipynb

In [9]:
import sys
print (sys.version)


2.7.9 (default, Apr  2 2015, 15:33:21) 
[GCC 4.9.2]

In [10]:
%env


Out[10]:
{'CLICOLOR': '1',
 'CLUTTER_IM_MODULE': 'xim',
 'COMP_WORDBREAKS': ' \t\n"\'><;|&(:',
 'DBUS_SESSION_BUS_ADDRESS': 'unix:abstract=/tmp/dbus-ffQSThD6QG',
 'DEFAULTS_PATH': '/usr/share/gconf/Lubuntu.default.path',
 'DESKTOP_SESSION': 'Lubuntu',
 'DISPLAY': ':0',
 'GDMSESSION': 'Lubuntu',
 'GDM_LANG': 'en_GB',
 'GIT_PAGER': 'cat',
 'GNOME_KEYRING_CONTROL': '',
 'GNOME_KEYRING_PID': '',
 'GPG_AGENT_INFO': '/run/user/1000/keyring/gpg:0:1',
 'GTK_IM_MODULE': 'xim',
 'HOME': '/home/ross',
 'IM_CONFIG_PHASE': '1',
 'INSTANCE': '',
 'JOB': 'dbus',
 'LANG': 'en_GB.UTF-8',
 'LANGUAGE': 'en_GB:en',
 'LESSCLOSE': '/usr/bin/lesspipe %s %s',
 'LESSOPEN': '| /usr/bin/lesspipe %s',
 'LOGNAME': 'ross',
 'LS_COLORS': 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.axa=00;36:*.oga=00;36:*.spx=00;36:*.xspf=00;36:',
 'MANDATORY_PATH': '/usr/share/gconf/Lubuntu.mandatory.path',
 'NODE_PATH': '/usr/lib/nodejs:/usr/lib/node_modules:/usr/share/javascript',
 'PAGER': 'cat',
 'PATH': '/usr/games:/home/ross/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/home/ross/.rvm/bin:/home/ross/.rvm/bin',
 'PVM_ROOT': '/usr/lib/pvm3',
 'PWD': '/media/ross/ad06f767-5435-4c2d-ba8c-af2af3dea67d/corpuses/NHM/collections-export-2015-05-01',
 'QT4_IM_MODULE': 'ibus',
 'QT_ACCESSIBILITY': '1',
 'QT_IM_MODULE': 'ibus',
 'QT_PLATFORM_PLUGIN': 'lxqt',
 'QT_QPA_PLATFORMTHEME': 'lxqt',
 'SAL_USE_VCLPLUGIN': 'gtk',
 'SESSION': 'Lubuntu',
 'SESSIONTYPE': 'lxsession',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'SSH_AUTH_SOCK': '/run/user/1000/keyring/ssh',
 'TERM': 'xterm-color',
 'UPSTART_EVENTS': 'started xsession',
 'UPSTART_INSTANCE': '',
 'UPSTART_JOB': 'lxsession',
 'UPSTART_SESSION': 'unix:abstract=/com/ubuntu/upstart-session/1000/1513',
 'USER': 'ross',
 'XAUTHORITY': '/home/ross/.Xauthority',
 'XDG_CONFIG_DIRS': '/etc/xdg/lubuntu:/etc/xdg/xdg-Lubuntu:/usr/share/upstart/xdg:/etc/xdg',
 'XDG_CONFIG_HOME': '/home/ross/.config',
 'XDG_CURRENT_DESKTOP': 'LXDE',
 'XDG_DATA_DIRS': '/etc/xdg/lubuntu:/usr/local/share:/usr/share:/usr/share/gdm:/var/lib/menu-xdg:/usr/share/Lubuntu:/usr/local/share/:/usr/share/',
 'XDG_GREETER_DATA_DIR': '/var/lib/lightdm-data/ross',
 'XDG_MENU_PREFIX': 'lxde-',
 'XDG_RUNTIME_DIR': '/run/user/1000',
 'XDG_SEAT': 'seat0',
 'XDG_SEAT_PATH': '/org/freedesktop/DisplayManager/Seat0',
 'XDG_SESSION_DESKTOP': 'Lubuntu',
 'XDG_SESSION_ID': 'c1',
 'XDG_SESSION_PATH': '/org/freedesktop/DisplayManager/Session0',
 'XDG_SESSION_TYPE': 'x11',
 'XDG_VTNR': '7',
 'XMODIFIERS': '@im=ibus',
 '_': '/usr/bin/ipython',
 '_LXSESSION_PID': '1980'}

In [11]:
!lsb_release -a


LSB Version:	core-2.0-amd64:core-2.0-noarch:core-3.0-amd64:core-3.0-noarch:core-3.1-amd64:core-3.1-noarch:core-3.2-amd64:core-3.2-noarch:core-4.0-amd64:core-4.0-noarch:core-4.1-amd64:core-4.1-noarch:cxx-3.0-amd64:cxx-3.0-noarch:cxx-3.1-amd64:cxx-3.1-noarch:cxx-3.2-amd64:cxx-3.2-noarch:cxx-4.0-amd64:cxx-4.0-noarch:cxx-4.1-amd64:cxx-4.1-noarch:desktop-3.1-amd64:desktop-3.1-noarch:desktop-3.2-amd64:desktop-3.2-noarch:desktop-4.0-amd64:desktop-4.0-noarch:desktop-4.1-amd64:desktop-4.1-noarch:graphics-2.0-amd64:graphics-2.0-noarch:graphics-3.0-amd64:graphics-3.0-noarch:graphics-3.1-amd64:graphics-3.1-noarch:graphics-3.2-amd64:graphics-3.2-noarch:graphics-4.0-amd64:graphics-4.0-noarch:graphics-4.1-amd64:graphics-4.1-noarch:languages-3.2-amd64:languages-3.2-noarch:languages-4.0-amd64:languages-4.0-noarch:languages-4.1-amd64:languages-4.1-noarch:multimedia-3.2-amd64:multimedia-3.2-noarch:multimedia-4.0-amd64:multimedia-4.0-noarch:multimedia-4.1-amd64:multimedia-4.1-noarch:printing-3.2-amd64:printing-3.2-noarch:printing-4.0-amd64:printing-4.0-noarch:printing-4.1-amd64:printing-4.1-noarch:qt4-3.1-amd64:qt4-3.1-noarch:security-4.0-amd64:security-4.0-noarch:security-4.1-amd64:security-4.1-noarch
Distributor ID:	Ubuntu
Description:	Ubuntu 15.04
Release:	15.04
Codename:	vivid

In [14]:
!lscpu


Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                2
On-line CPU(s) list:   0,1
Thread(s) per core:    1
Core(s) per socket:    2
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 23
Model name:            Intel(R) Core(TM)2 Duo CPU     E7500  @ 2.93GHz
Stepping:              10
CPU MHz:               2128.000
CPU max MHz:           2926.0000
CPU min MHz:           1596.0000
BogoMIPS:              5860.89
Virtualisation:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              3072K
NUMA node0 CPU(s):     0,1

In [15]:
!free -m


             total       used       free     shared    buffers     cached
Mem:          3942       3582        360        174         52        555
-/+ buffers/cache:       2974        968
Swap:         7870        386       7483

In [17]:
!head -5 occurrence.csv > testit.csv

In [21]:
!csvcut -c 5,12,4,71,1 testit.csv | csvlook


|--------------------+----------------+--------------------------+--------------------------+----------|
|  scientificName    | collectionCode | catalogNumber            | otherCatalogNumbers      | _id      |
|--------------------+----------------+--------------------------+--------------------------+----------|
|  Myotis hasseltii  | ZOO            | 1878.2.8.2               | NHMUK:ecatalogue:3838232 | 1516684  |
|                    | ZOO            | 1964.61.147              | NHMUK:ecatalogue:4194826 | 1516690  |
|  Rectuvigerina sp. | PAL            | PM BP 7597               | NHMUK:ecatalogue:2057860 | 1516693  |
|  Blechnum L.       | BOT            | HUMID CHACO BM ID - 2243 | NHMUK:ecatalogue:4582482 | 982216   |
|--------------------+----------------+--------------------------+--------------------------+----------|

In [65]:
!wc occurrence.csv


   2741852   95473453 1762863462 occurrence.csv

In [137]:
!sort -u occurrence.csv | wc


1796924 62418842 1153369550

In [66]:
!csvcut -c 4 occurrence.csv > catalognumbers.txt

In [67]:
!wc catalognumbers.txt


 2741875  3777129 35208350 catalognumbers.txt

Thanks Christian Pietsch @ChPietsch for reminding me about csvclean https://twitter.com/ChPietsch/status/594265427427106817


In [68]:
!csvclean occurrence.csv

In [69]:
!csvcut -c 4 occurrence_out.csv > catalognumbers.txt

In [70]:
!wc catalognumbers.txt


 2741875  3777129 35208350 catalognumbers.txt

In [71]:
!ls -lh catalognumbers.txt


-rw-rw-r-- 1 ross ross 34M May  1 23:46 catalognumbers.txt

In [72]:
!head catalognumbers.txt


catalogNumber
1878.2.8.2
1964.61.147
PM BP 7597
HUMID CHACO BM ID - 2243
BMNH(E)1027709
BM000756825
BM000595187
E/1941.9.4.668
1956.7.2.61-69

In [73]:
!sed -i '1d' catalognumbers.txt

In [74]:
!head catalognumbers.txt


1878.2.8.2
1964.61.147
PM BP 7597
HUMID CHACO BM ID - 2243
BMNH(E)1027709
BM000756825
BM000595187
E/1941.9.4.668
1956.7.2.61-69
1898.5.20.10865

In [82]:
!sort catalognumbers.txt | uniq -c > sortedcatalog.txt

In [83]:
!wc sortedcatalog.txt


 1658195  3963987 35110407 sortedcatalog.txt

In [84]:
!head sortedcatalog.txt


      7 -
      2 --
      3 ---
      3 ----
     10 ?
  57810 ""
      1 00000000
      1 000000000
      2 00000000a
      1 000009394

In [85]:
!tail sortedcatalog.txt


      1 Zthellung29jul1915
      2 Zthomas8106
      1 Zthompson8131
      2 Ztrinkler1927-28
      1 Ztsiang6482
      1 Zvetter3aug1881
      2 Zwehrli18nov1905
      1 ZZ.1978.3170
      1 Zzogg2193
      1 Zzollinger3577

In [86]:
!sed -i '1,9d' sortedcatalog.txt

In [81]:
!cat sortedcatalog.txt | awk '{ print length, $0 }' | sort -n | awk '{$1=""; print $0}' > sortedbylinelength.txt


000009394
000009395
000016280
000016282
000056192
000057469
000065785
000074094
000074926
000076703
000080968
000087186
000095812
000097635
000097654
000097681
000097693
000097708
000097722
01.01.02.55-57
01.01.02.58-59
01.01.02.60-61
01.01.02.62-64
01.01.03.01-02
01.01.03.03-04
01.01.03.08
01.01.03.09-10
01.01.03.11-12
01.01.03.13-15
01.01.03.16-17
01.01.03.18-20
01.01.03.21-22
01.01.03.24-26
01.01.03.27-29
01.01.03.30
01.01.03.31-33
01.01.03.34-35
01.01.03.36-38
01.01.03.39-41
0.10.4.3
0].11.19.4
01.5.11.2
0.2.5.38-41
0.2.5.38-419.1.4.49
0.2.5.42-44
0.5.7.1
0.6.29.10
0.6.29.11
0.6.29.12
0.6.29.13

In [87]:
!head -50 sortedbylinelength.txt


 10 8
 11 2
 11 4
 11 9
 12 3
 12 6
 13 1
 1 D
 1 h
 1 p
 1 q
 1 s
 20 a
 2 g
 2 j
 2 l
 2 n
 2 r
 3 c
 3 e
 6 b
 7 5
 8 7
 10 14
 10 48
 105 ZD
 10 96
 11 16
 12 15
 13373 E/
 14 12
 1 49
 1 63
 1 68
 17 13
 1 77
 1 98
 1 BM
 1 nd
 1 S/
 2 26
 2 28
 2 43
 2 53
 2 62
 2 78
 2 79
 2 85
 2 88
 2 9[

In [89]:
!grep '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | wc


 621947 1308084 10343759

In [90]:
!grep 'ZD' sortedbylinelength.txt | wc


  63499  189231 1067940

In [92]:
!grep 'ARC' sortedbylinelength.txt #50 matches


 1 ARC 1970.3015
 1 ARC 1970.3019
 1 ARC 1970.3063
 1 ARC 1976.5109
 1 ARC 1977.5002
 1 ARC 1979.5357
 1 ARC 1979.5360
 1 ARC 1979.5361
 1 ARC 1979.5362
 1 ARC 1979.5363
 1 ARC 1979.5364
 1 ARC 1979.5373
 1 ARC 1979.5374
 1 ARC 1979.5377
 1 ARC 1979.5381
 1 ARC 1979.5382
 1 ARC 1979.5387
 1 ARC 1979.5390
 1 ARC 1979.5393
 1 ARC 1979.5397
 1 ARC 1979.5408
 1 ARC 1984.5021
 1 ARC 1985.5134
 2 ARC 1979.5370
 2 ARC 1979.5376
 2 ARC 1979.5379
 2 ARC 1979.5407
 2 ARC 1981.5581
 2 ARC 1983.5004
 3 ARC 1979.5017
 3 ARC 1979.5367
 3 ARC 1979.5368
 3 ARC 1979.5389
 4 ARC 1979.5378
 1 ARC 1975.5075.a
 1 ARC 1969.396.1-20
 2 ARC 1975.5043.G22
 2 ARC 1975.5022.N9.1
 1 ARC 1975.5042.JpB4 46
 1 ARC 1975.5065.F105.10
 1 ARC 1975.5342.F300-17
 1 ARC 1975.5387.D405-12
 2 ARC 1975.5043.Jp E3-9
 1 ARC 1975.5045.0 201-46
 3 ARC 1975.5044.JpM 7-8B
 1 ARC 1975.5043.JpF 101-7
 1 ARC 1975.5045.F100/15AA
 2 ARC 1975.5045.F100/11CC
 2 ARC 1977.5003.F7GG74165
 2 ARC 1977.5003.F7GG74168

In [93]:
!grep 'BK' sortedbylinelength.txt #5 matches


 2 BK 465. No museum number
 1 BK 1398. No museum number
 1 BK 1400. No museum number
 3 BK 1401. No museum number
 3 BK 1496. No museum number

In [94]:
!grep 'E/' sortedbylinelength.txt | wc


  11179   22381  201356

In [95]:
!grep -v '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | grep -v 'ZD' | grep -v 'E/' > oddones.txt

In [97]:
!wc oddones.txt


 1036178  2655738 16474400 oddones.txt

In [98]:
!grep 'BRITFERN' oddones.txt | wc


     78     544    3623

In [99]:
!grep 'Malesiana' oddones.txt | wc


    164     842    6581

In [100]:
!grep 'Azores' oddones.txt | wc


    773    5411   33239

In [101]:
!grep 'JMC' oddones.txt | wc


   3001   13477   88038

In [104]:
!grep 'New Caledonia Brownlie' oddones.txt | wc


     57     342    1938

In [107]:
!grep 'Paradox loan' oddones.txt | wc


    644    4508   23752

In [108]:
!grep 'BM-BRIT-EURO' oddones.txt | wc


   7905   23717  168951

In [110]:
!grep 'PI D ' oddones.txt | wc


   3218   13090   45403

In [112]:
!grep 'PI BZ ' oddones.txt | wc


   4932   20959   72657

In [111]:
!grep 'SEM' oddones.txt | wc


    100     716    3066

In [117]:
!grep -i 'Gilbert' oddones.txt | wc #"Gilbert Qinghai Colls" on website gives 3136 records BOT  DUPLICATE CAT NUMBERS!!!


   2402   14412   75312

In [118]:
!grep 'Gilbert Qinghai Colls' occurrence_out.csv | wc


   3098  256271 2929973

In [119]:
!grep 'Yasuni BM specimens' oddones.txt | wc #should be 27 NOT duplicate CAT NUMBERS


     20     120     620

In [132]:
!grep 'Yasuni BM specimens' occurrence_out.csv | grep -o 48[0-9][0-9][0-9] | sort #48152 is missing


48153
48214
48214
48217
48221
48222
48237
48238
48282
48282
48283
48286
48286
48286
48288
48289
48290
48294
48297
48304
48305
48306
48310
48310
48310
48315

In [135]:
!grep 'NHMUK:ecatalogue:4614078' occurrence.csv #alternate ID not found either

In [136]:
!grep -o 'Yasuni BM specimens .......' occurrence.csv | sort


Yasuni BM specimens - 48153
Yasuni BM specimens - 48214
Yasuni BM specimens - 48214
Yasuni BM specimens - 48217
Yasuni BM specimens - 48221
Yasuni BM specimens - 48222
Yasuni BM specimens - 48237
Yasuni BM specimens - 48238
Yasuni BM specimens - 48282
Yasuni BM specimens - 48282
Yasuni BM specimens - 48283
Yasuni BM specimens - 48286
Yasuni BM specimens - 48286
Yasuni BM specimens - 48286
Yasuni BM specimens - 48288
Yasuni BM specimens - 48289
Yasuni BM specimens - 48290
Yasuni BM specimens - 48294
Yasuni BM specimens - 48297
Yasuni BM specimens - 48304
Yasuni BM specimens - 48305
Yasuni BM specimens - 48306
Yasuni BM specimens - 48310
Yasuni BM specimens - 48310
Yasuni BM specimens - 48310
Yasuni BM specimens - 48315

In [133]:
!grep -o 'Yasuni BM specimens - 48152' occurrence.csv #nothing

In [121]:
!grep 'Gilbert Hainan Colls' oddones.txt | wc #should be 631


    399    2394   11896

In [122]:
!grep 'BelizeColl2007' oddones.txt | wc #should be 212


    131     524    3799

Conclusions:

  • 2741874 records from initial export
  • perhaps only 2741852 were real records (export problem?)
  • 1658195 unique catalog numbers entries (inc. all 'dirty' entries)
  • at least 897 unique catalog numbers are 3 digits or less
  • 621947 match '[0-9][0-9][0-9][0-9].'
  • 63499 match 'ZD'

In [ ]:
!grep '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | wc