notebook.community

Edit and run



In [6]:

    
#!sudo apt-get install python-dev python-pip python-setuptools build-essential
#!pip install --upgrade setuptools
#!pip install csvkit
#!pip install --upgrade csvkit



In [3]:

    
!ls -lrth









    



total 2.2G
-rw-r--r-- 1 ross ross 1.7G May  1 19:34 occurrence.csv
-rw-r--r-- 1 ross ross 163M May  1 19:34 multimedia.csv
-rw-r--r-- 1 ross ross 6.1K May  1 19:34 meta.xml
-rw-rw-r-- 1 ross ross 409M May  1 19:51 5d27e8acaac843c4a804e881fb4c6934-99-1430505285.zip
-rw------- 1 ross ross  181 May  1 20:51 NHM-collections.ipynb



In [9]:

    
import sys
print (sys.version)









    



2.7.9 (default, Apr  2 2015, 15:33:21) 
[GCC 4.9.2]



In [10]:

    
%env









    Out[10]:





{'CLICOLOR': '1',
 'CLUTTER_IM_MODULE': 'xim',
 'COMP_WORDBREAKS': ' \t\n"\'><;|&(:',
 'DBUS_SESSION_BUS_ADDRESS': 'unix:abstract=/tmp/dbus-ffQSThD6QG',
 'DEFAULTS_PATH': '/usr/share/gconf/Lubuntu.default.path',
 'DESKTOP_SESSION': 'Lubuntu',
 'DISPLAY': ':0',
 'GDMSESSION': 'Lubuntu',
 'GDM_LANG': 'en_GB',
 'GIT_PAGER': 'cat',
 'GNOME_KEYRING_CONTROL': '',
 'GNOME_KEYRING_PID': '',
 'GPG_AGENT_INFO': '/run/user/1000/keyring/gpg:0:1',
 'GTK_IM_MODULE': 'xim',
 'HOME': '/home/ross',
 'IM_CONFIG_PHASE': '1',
 'INSTANCE': '',
 'JOB': 'dbus',
 'LANG': 'en_GB.UTF-8',
 'LANGUAGE': 'en_GB:en',
 'LESSCLOSE': '/usr/bin/lesspipe %s %s',
 'LESSOPEN': '| /usr/bin/lesspipe %s',
 'LOGNAME': 'ross',
 'LS_COLORS': 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.axa=00;36:*.oga=00;36:*.spx=00;36:*.xspf=00;36:',
 'MANDATORY_PATH': '/usr/share/gconf/Lubuntu.mandatory.path',
 'NODE_PATH': '/usr/lib/nodejs:/usr/lib/node_modules:/usr/share/javascript',
 'PAGER': 'cat',
 'PATH': '/usr/games:/home/ross/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/home/ross/.rvm/bin:/home/ross/.rvm/bin',
 'PVM_ROOT': '/usr/lib/pvm3',
 'PWD': '/media/ross/ad06f767-5435-4c2d-ba8c-af2af3dea67d/corpuses/NHM/collections-export-2015-05-01',
 'QT4_IM_MODULE': 'ibus',
 'QT_ACCESSIBILITY': '1',
 'QT_IM_MODULE': 'ibus',
 'QT_PLATFORM_PLUGIN': 'lxqt',
 'QT_QPA_PLATFORMTHEME': 'lxqt',
 'SAL_USE_VCLPLUGIN': 'gtk',
 'SESSION': 'Lubuntu',
 'SESSIONTYPE': 'lxsession',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'SSH_AUTH_SOCK': '/run/user/1000/keyring/ssh',
 'TERM': 'xterm-color',
 'UPSTART_EVENTS': 'started xsession',
 'UPSTART_INSTANCE': '',
 'UPSTART_JOB': 'lxsession',
 'UPSTART_SESSION': 'unix:abstract=/com/ubuntu/upstart-session/1000/1513',
 'USER': 'ross',
 'XAUTHORITY': '/home/ross/.Xauthority',
 'XDG_CONFIG_DIRS': '/etc/xdg/lubuntu:/etc/xdg/xdg-Lubuntu:/usr/share/upstart/xdg:/etc/xdg',
 'XDG_CONFIG_HOME': '/home/ross/.config',
 'XDG_CURRENT_DESKTOP': 'LXDE',
 'XDG_DATA_DIRS': '/etc/xdg/lubuntu:/usr/local/share:/usr/share:/usr/share/gdm:/var/lib/menu-xdg:/usr/share/Lubuntu:/usr/local/share/:/usr/share/',
 'XDG_GREETER_DATA_DIR': '/var/lib/lightdm-data/ross',
 'XDG_MENU_PREFIX': 'lxde-',
 'XDG_RUNTIME_DIR': '/run/user/1000',
 'XDG_SEAT': 'seat0',
 'XDG_SEAT_PATH': '/org/freedesktop/DisplayManager/Seat0',
 'XDG_SESSION_DESKTOP': 'Lubuntu',
 'XDG_SESSION_ID': 'c1',
 'XDG_SESSION_PATH': '/org/freedesktop/DisplayManager/Session0',
 'XDG_SESSION_TYPE': 'x11',
 'XDG_VTNR': '7',
 'XMODIFIERS': '@im=ibus',
 '_': '/usr/bin/ipython',
 '_LXSESSION_PID': '1980'}



In [11]:

    
!lsb_release -a









    



LSB Version:	core-2.0-amd64:core-2.0-noarch:core-3.0-amd64:core-3.0-noarch:core-3.1-amd64:core-3.1-noarch:core-3.2-amd64:core-3.2-noarch:core-4.0-amd64:core-4.0-noarch:core-4.1-amd64:core-4.1-noarch:cxx-3.0-amd64:cxx-3.0-noarch:cxx-3.1-amd64:cxx-3.1-noarch:cxx-3.2-amd64:cxx-3.2-noarch:cxx-4.0-amd64:cxx-4.0-noarch:cxx-4.1-amd64:cxx-4.1-noarch:desktop-3.1-amd64:desktop-3.1-noarch:desktop-3.2-amd64:desktop-3.2-noarch:desktop-4.0-amd64:desktop-4.0-noarch:desktop-4.1-amd64:desktop-4.1-noarch:graphics-2.0-amd64:graphics-2.0-noarch:graphics-3.0-amd64:graphics-3.0-noarch:graphics-3.1-amd64:graphics-3.1-noarch:graphics-3.2-amd64:graphics-3.2-noarch:graphics-4.0-amd64:graphics-4.0-noarch:graphics-4.1-amd64:graphics-4.1-noarch:languages-3.2-amd64:languages-3.2-noarch:languages-4.0-amd64:languages-4.0-noarch:languages-4.1-amd64:languages-4.1-noarch:multimedia-3.2-amd64:multimedia-3.2-noarch:multimedia-4.0-amd64:multimedia-4.0-noarch:multimedia-4.1-amd64:multimedia-4.1-noarch:printing-3.2-amd64:printing-3.2-noarch:printing-4.0-amd64:printing-4.0-noarch:printing-4.1-amd64:printing-4.1-noarch:qt4-3.1-amd64:qt4-3.1-noarch:security-4.0-amd64:security-4.0-noarch:security-4.1-amd64:security-4.1-noarch
Distributor ID:	Ubuntu
Description:	Ubuntu 15.04
Release:	15.04
Codename:	vivid



In [14]:

    
!lscpu









    



Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                2
On-line CPU(s) list:   0,1
Thread(s) per core:    1
Core(s) per socket:    2
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 23
Model name:            Intel(R) Core(TM)2 Duo CPU     E7500  @ 2.93GHz
Stepping:              10
CPU MHz:               2128.000
CPU max MHz:           2926.0000
CPU min MHz:           1596.0000
BogoMIPS:              5860.89
Virtualisation:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              3072K
NUMA node0 CPU(s):     0,1



In [15]:

    
!free -m









    



             total       used       free     shared    buffers     cached
Mem:          3942       3582        360        174         52        555
-/+ buffers/cache:       2974        968
Swap:         7870        386       7483



In [17]:

    
!head -5 occurrence.csv > testit.csv



In [21]:

    
!csvcut -c 5,12,4,71,1 testit.csv | csvlook









    



|--------------------+----------------+--------------------------+--------------------------+----------|
|  scientificName    | collectionCode | catalogNumber            | otherCatalogNumbers      | _id      |
|--------------------+----------------+--------------------------+--------------------------+----------|
|  Myotis hasseltii  | ZOO            | 1878.2.8.2               | NHMUK:ecatalogue:3838232 | 1516684  |
|                    | ZOO            | 1964.61.147              | NHMUK:ecatalogue:4194826 | 1516690  |
|  Rectuvigerina sp. | PAL            | PM BP 7597               | NHMUK:ecatalogue:2057860 | 1516693  |
|  Blechnum L.       | BOT            | HUMID CHACO BM ID - 2243 | NHMUK:ecatalogue:4582482 | 982216   |
|--------------------+----------------+--------------------------+--------------------------+----------|



In [65]:

    
!wc occurrence.csv









    



   2741852   95473453 1762863462 occurrence.csv



In [137]:

    
!sort -u occurrence.csv | wc









    



1796924 62418842 1153369550



In [66]:

    
!csvcut -c 4 occurrence.csv > catalognumbers.txt



In [67]:

    
!wc catalognumbers.txt









    



 2741875  3777129 35208350 catalognumbers.txt

Thanks Christian Pietsch @ChPietsch for reminding me about csvclean https://twitter.com/ChPietsch/status/594265427427106817



In [68]:

    
!csvclean occurrence.csv



In [69]:

    
!csvcut -c 4 occurrence_out.csv > catalognumbers.txt



In [70]:

    
!wc catalognumbers.txt









    



 2741875  3777129 35208350 catalognumbers.txt



In [71]:

    
!ls -lh catalognumbers.txt









    



-rw-rw-r-- 1 ross ross 34M May  1 23:46 catalognumbers.txt



In [72]:

    
!head catalognumbers.txt









    



catalogNumber
1878.2.8.2
1964.61.147
PM BP 7597
HUMID CHACO BM ID - 2243
BMNH(E)1027709
BM000756825
BM000595187
E/1941.9.4.668
1956.7.2.61-69



In [73]:

    
!sed -i '1d' catalognumbers.txt



In [74]:

    
!head catalognumbers.txt









    



1878.2.8.2
1964.61.147
PM BP 7597
HUMID CHACO BM ID - 2243
BMNH(E)1027709
BM000756825
BM000595187
E/1941.9.4.668
1956.7.2.61-69
1898.5.20.10865



In [82]:

    
!sort catalognumbers.txt | uniq -c > sortedcatalog.txt



In [83]:

    
!wc sortedcatalog.txt









    



 1658195  3963987 35110407 sortedcatalog.txt



In [84]:

    
!head sortedcatalog.txt









    



      7 -
      2 --
      3 ---
      3 ----
     10 ?
  57810 ""
      1 00000000
      1 000000000
      2 00000000a
      1 000009394



In [85]:

    
!tail sortedcatalog.txt









    



      1 Zthellung29jul1915
      2 Zthomas8106
      1 Zthompson8131
      2 Ztrinkler1927-28
      1 Ztsiang6482
      1 Zvetter3aug1881
      2 Zwehrli18nov1905
      1 ZZ.1978.3170
      1 Zzogg2193
      1 Zzollinger3577



In [86]:

    
!sed -i '1,9d' sortedcatalog.txt



In [81]:

    
!cat sortedcatalog.txt | awk '{ print length, $0 }' | sort -n | awk '{$1=""; print $0}' > sortedbylinelength.txt









    



000009394
000009395
000016280
000016282
000056192
000057469
000065785
000074094
000074926
000076703
000080968
000087186
000095812
000097635
000097654
000097681
000097693
000097708
000097722
01.01.02.55-57
01.01.02.58-59
01.01.02.60-61
01.01.02.62-64
01.01.03.01-02
01.01.03.03-04
01.01.03.08
01.01.03.09-10
01.01.03.11-12
01.01.03.13-15
01.01.03.16-17
01.01.03.18-20
01.01.03.21-22
01.01.03.24-26
01.01.03.27-29
01.01.03.30
01.01.03.31-33
01.01.03.34-35
01.01.03.36-38
01.01.03.39-41
0.10.4.3
0].11.19.4
01.5.11.2
0.2.5.38-41
0.2.5.38-419.1.4.49
0.2.5.42-44
0.5.7.1
0.6.29.10
0.6.29.11
0.6.29.12
0.6.29.13



In [87]:

    
!head -50 sortedbylinelength.txt



In [89]:

    
!grep '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | wc









    



 621947 1308084 10343759



In [90]:

    
!grep 'ZD' sortedbylinelength.txt | wc









    



  63499  189231 1067940



In [92]:

    
!grep 'ARC' sortedbylinelength.txt #50 matches









    



 1 ARC 1970.3015
 1 ARC 1970.3019
 1 ARC 1970.3063
 1 ARC 1976.5109
 1 ARC 1977.5002
 1 ARC 1979.5357
 1 ARC 1979.5360
 1 ARC 1979.5361
 1 ARC 1979.5362
 1 ARC 1979.5363
 1 ARC 1979.5364
 1 ARC 1979.5373
 1 ARC 1979.5374
 1 ARC 1979.5377
 1 ARC 1979.5381
 1 ARC 1979.5382
 1 ARC 1979.5387
 1 ARC 1979.5390
 1 ARC 1979.5393
 1 ARC 1979.5397
 1 ARC 1979.5408
 1 ARC 1984.5021
 1 ARC 1985.5134
 2 ARC 1979.5370
 2 ARC 1979.5376
 2 ARC 1979.5379
 2 ARC 1979.5407
 2 ARC 1981.5581
 2 ARC 1983.5004
 3 ARC 1979.5017
 3 ARC 1979.5367
 3 ARC 1979.5368
 3 ARC 1979.5389
 4 ARC 1979.5378
 1 ARC 1975.5075.a
 1 ARC 1969.396.1-20
 2 ARC 1975.5043.G22
 2 ARC 1975.5022.N9.1
 1 ARC 1975.5042.JpB4 46
 1 ARC 1975.5065.F105.10
 1 ARC 1975.5342.F300-17
 1 ARC 1975.5387.D405-12
 2 ARC 1975.5043.Jp E3-9
 1 ARC 1975.5045.0 201-46
 3 ARC 1975.5044.JpM 7-8B
 1 ARC 1975.5043.JpF 101-7
 1 ARC 1975.5045.F100/15AA
 2 ARC 1975.5045.F100/11CC
 2 ARC 1977.5003.F7GG74165
 2 ARC 1977.5003.F7GG74168



In [93]:

    
!grep 'BK' sortedbylinelength.txt #5 matches









    



 2 BK 465. No museum number
 1 BK 1398. No museum number
 1 BK 1400. No museum number
 3 BK 1401. No museum number
 3 BK 1496. No museum number



In [94]:

    
!grep 'E/' sortedbylinelength.txt | wc









    



  11179   22381  201356



In [95]:

    
!grep -v '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | grep -v 'ZD' | grep -v 'E/' > oddones.txt



In [97]:

    
!wc oddones.txt









    



 1036178  2655738 16474400 oddones.txt



In [98]:

    
!grep 'BRITFERN' oddones.txt | wc









    



     78     544    3623



In [99]:

    
!grep 'Malesiana' oddones.txt | wc









    



    164     842    6581



In [100]:

    
!grep 'Azores' oddones.txt | wc









    



    773    5411   33239



In [101]:

    
!grep 'JMC' oddones.txt | wc









    



   3001   13477   88038



In [104]:

    
!grep 'New Caledonia Brownlie' oddones.txt | wc









    



     57     342    1938



In [107]:

    
!grep 'Paradox loan' oddones.txt | wc









    



    644    4508   23752



In [108]:

    
!grep 'BM-BRIT-EURO' oddones.txt | wc









    



   7905   23717  168951



In [110]:

    
!grep 'PI D ' oddones.txt | wc









    



   3218   13090   45403



In [112]:

    
!grep 'PI BZ ' oddones.txt | wc









    



   4932   20959   72657



In [111]:

    
!grep 'SEM' oddones.txt | wc









    



    100     716    3066



In [117]:

    
!grep -i 'Gilbert' oddones.txt | wc #"Gilbert Qinghai Colls" on website gives 3136 records BOT  DUPLICATE CAT NUMBERS!!!









    



   2402   14412   75312

http://data.nhm.ac.uk/dataset/collection-specimens/resource/05ff2255-c38a-40c9-b657-4ccb55ab2feb?q=Gilbert+Qinghai+Colls&view_id=6b611d29-1dcf-4c60-b6b5-4cbb69fdf4fe



In [118]:

    
!grep 'Gilbert Qinghai Colls' occurrence_out.csv | wc









    



   3098  256271 2929973



In [119]:

    
!grep 'Yasuni BM specimens' oddones.txt | wc #should be 27 NOT duplicate CAT NUMBERS









    



     20     120     620

http://data.nhm.ac.uk/dataset/collection-specimens/resource/05ff2255-c38a-40c9-b657-4ccb55ab2feb?q=%22Yasuni+BM+specimens%22&view_id=6b611d29-1dcf-4c60-b6b5-4cbb69fdf4fe



In [132]:

    
!grep 'Yasuni BM specimens' occurrence_out.csv | grep -o 48[0-9][0-9][0-9] | sort #48152 is missing

missing record details: http://data.nhm.ac.uk/dataset/collection-specimens/resource/05ff2255-c38a-40c9-b657-4ccb55ab2feb/record/873394



In [135]:

    
!grep 'NHMUK:ecatalogue:4614078' occurrence.csv #alternate ID not found either



In [136]:

    
!grep -o 'Yasuni BM specimens .......' occurrence.csv | sort









    



Yasuni BM specimens - 48153
Yasuni BM specimens - 48214
Yasuni BM specimens - 48214
Yasuni BM specimens - 48217
Yasuni BM specimens - 48221
Yasuni BM specimens - 48222
Yasuni BM specimens - 48237
Yasuni BM specimens - 48238
Yasuni BM specimens - 48282
Yasuni BM specimens - 48282
Yasuni BM specimens - 48283
Yasuni BM specimens - 48286
Yasuni BM specimens - 48286
Yasuni BM specimens - 48286
Yasuni BM specimens - 48288
Yasuni BM specimens - 48289
Yasuni BM specimens - 48290
Yasuni BM specimens - 48294
Yasuni BM specimens - 48297
Yasuni BM specimens - 48304
Yasuni BM specimens - 48305
Yasuni BM specimens - 48306
Yasuni BM specimens - 48310
Yasuni BM specimens - 48310
Yasuni BM specimens - 48310
Yasuni BM specimens - 48315



In [133]:

    
!grep -o 'Yasuni BM specimens - 48152' occurrence.csv #nothing



In [121]:

    
!grep 'Gilbert Hainan Colls' oddones.txt | wc #should be 631









    



    399    2394   11896



In [122]:

    
!grep 'BelizeColl2007' oddones.txt | wc #should be 212









    



    131     524    3799

Conclusions:

2741874 records from initial export
perhaps only 2741852 were real records (export problem?)
1658195 unique catalog numbers entries (inc. all 'dirty' entries)
at least 897 unique catalog numbers are 3 digits or less
621947 match '[0-9][0-9][0-9][0-9].'
63499 match 'ZD'



In [ ]:

    
!grep '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | wc