In [6]:
# Optional one-time setup, kept commented out so that re-running the notebook
# does not reinstall anything: system build prerequisites first, then csvkit
# (the csvcut / csvlook / csvclean commands used in later cells come from it).
# NOTE(review): the pip lines are unpinned; if they are ever re-enabled,
# consider `%pip install csvkit==<version>` so the install targets the kernel's
# environment and the results stay reproducible.
#!sudo apt-get install python-dev python-pip python-setuptools build-essential
#!pip install --upgrade setuptools
#!pip install csvkit
#!pip install --upgrade csvkit
In [3]:
# List the working directory in long format with human-readable sizes, sorted
# by modification time with the oldest entry first (-t reversed by -r).
!ls -lrth
In [9]:
# Record which Python interpreter the kernel is running.
import sys

print(sys.version)
In [10]:
# Show every environment variable visible to the kernel.
# NOTE(review): the rendered output can include tokens, passwords or personal
# paths; clear or redact it before the notebook is shared or committed.
%env
Out[10]:
In [11]:
# Record the OS distribution and release of the machine running the notebook.
!lsb_release -a
In [14]:
# CPU architecture, model and core counts.
!lscpu
In [15]:
# Memory and swap usage, reported in mebibytes.
!free -m
In [17]:
# Copy the first five lines of the export (presumably the header plus four
# records) into a small file for trying out csvkit.
# NOTE(review): `head` counts physical lines, so a quoted field containing a
# newline would be cut mid-record; confirm the sample parses cleanly.
!head -5 occurrence.csv > testit.csv
In [21]:
# Pull columns 5, 12, 4, 71 and 1 (in that order) from the sample and render
# them as a readable table.
!csvcut -c 5,12,4,71,1 testit.csv | csvlook
In [65]:
# Line, word and byte counts for the raw export.
!wc occurrence.csv
In [137]:
# The same counts after dropping exact duplicate lines; a lower line count than
# in the cell above would mean some lines are repeated verbatim.
!sort -u occurrence.csv | wc
In [66]:
# Extract column 4 into its own file (presumably the catalogue number, going by
# the output file name; confirm against the CSV header).
!csvcut -c 4 occurrence.csv > catalognumbers.txt
In [67]:
# Size of the extract.
!wc catalognumbers.txt
Thanks to Christian Pietsch (@ChPietsch) for reminding me about csvclean: https://twitter.com/ChPietsch/status/594265427427106817
In [68]:
# Run csvkit's cleaner over the raw export. The next cell reads
# occurrence_out.csv, so this step is expected to leave the cleaned rows there.
# NOTE(review): csvclean's output-file behaviour differs between csvkit
# versions; confirm that occurrence_out.csv is actually produced.
!csvclean occurrence.csv
In [69]:
# Re-extract column 4 from the cleaned file, overwriting the earlier extract.
!csvcut -c 4 occurrence_out.csv > catalognumbers.txt
In [70]:
# Counts for the new extract, for comparison with the one taken from the raw file.
!wc catalognumbers.txt
In [71]:
# File size in human-readable units.
!ls -lh catalognumbers.txt
In [72]:
# First ten lines; the cell that follows removes line 1.
!head catalognumbers.txt
In [73]:
!sed -i '1d' catalognumbers.txt
In [74]:
# Check the top of the file again now that the first line has been dropped.
!head catalognumbers.txt
In [82]:
# Tally how often each distinct value occurs; `uniq -c` prefixes every line
# with its occurrence count.
!sort catalognumbers.txt | uniq -c > sortedcatalog.txt
In [83]:
# The line count here is the number of distinct values.
!wc sortedcatalog.txt
In [84]:
# Entries that sort first ...
!head sortedcatalog.txt
In [85]:
# ... and those that sort last.
!tail sortedcatalog.txt
In [86]:
!sed -i '1,9d' sortedcatalog.txt
In [81]:
!cat sortedcatalog.txt | awk '{ print length, $0 }' | sort -n | awk '{$1=""; print $0}' > sortedbylinelength.txt
In [87]:
# Look at the 50 shortest entries.
!head -50 sortedbylinelength.txt
In [89]:
# Count entries containing four consecutive digits followed by a literal dot.
!grep '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | wc
In [90]:
# Count entries containing 'ZD'.
!grep 'ZD' sortedbylinelength.txt | wc
In [92]:
# List entries containing 'ARC'; the trailing note records the count observed
# when the cell was written.
!grep 'ARC' sortedbylinelength.txt #50 matches
In [93]:
# List entries containing 'BK'; the trailing note records the observed count.
!grep 'BK' sortedbylinelength.txt #5 matches
In [94]:
# Count entries containing 'E/'.
!grep 'E/' sortedbylinelength.txt | wc
In [95]:
!grep -v '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | grep -v 'ZD' | grep -v 'E/' > oddones.txt
In [97]:
# How many entries fell outside the common patterns.
!wc oddones.txt
In [98]:
# The cells below each count the leftover entries that contain one specific
# label or prefix (case-sensitive substring match).
!grep 'BRITFERN' oddones.txt | wc
In [99]:
!grep 'Malesiana' oddones.txt | wc
In [100]:
!grep 'Azores' oddones.txt | wc
In [101]:
!grep 'JMC' oddones.txt | wc
In [104]:
!grep 'New Caledonia Brownlie' oddones.txt | wc
In [107]:
!grep 'Paradox loan' oddones.txt | wc
In [108]:
!grep 'BM-BRIT-EURO' oddones.txt | wc
In [110]:
# The trailing space inside the pattern is part of the match.
!grep 'PI D ' oddones.txt | wc
In [112]:
# Trailing space is significant here too.
!grep 'PI BZ ' oddones.txt | wc
In [111]:
!grep 'SEM' oddones.txt | wc
In [117]:
# Case-insensitive count of leftover entries mentioning 'Gilbert'. The trailing
# note records the figure seen on the website for comparison.
!grep -i 'Gilbert' oddones.txt | wc #"Gilbert Qinghai Colls" on website gives 3136 records BOT DUPLICATE CAT NUMBERS!!!
In [118]:
# Cross-check against the full cleaned file: lines mentioning that collection.
!grep 'Gilbert Qinghai Colls' occurrence_out.csv | wc
In [119]:
# Count of leftover entries for the Yasuni collection; the trailing note gives
# the expected figure.
!grep 'Yasuni BM specimens' oddones.txt | wc #should be 27 NOT duplicate CAT NUMBERS
In [132]:
!grep 'Yasuni BM specimens' occurrence_out.csv | grep -o 48[0-9][0-9][0-9] | sort #48152 is missing
In [135]:
# Look for the record by an alternate identifier in the raw export.
!grep 'NHMUK:ecatalogue:4614078' occurrence.csv #alternate ID not found either
In [136]:
# Print each occurrence of the collection label together with the seven
# characters that follow it, sorted, to see which numbers are present.
!grep -o 'Yasuni BM specimens .......' occurrence.csv | sort
In [133]:
# Search the raw export directly for the exact missing label.
!grep -o 'Yasuni BM specimens - 48152' occurrence.csv #nothing
In [121]:
# Count of leftover entries per collection label; trailing notes give the
# expected figures.
!grep 'Gilbert Hainan Colls' oddones.txt | wc #should be 631
In [122]:
!grep 'BelizeColl2007' oddones.txt | wc #should be 212
Conclusions:
In [ ]:
# Unexecuted cell: repeats the count of entries containing four digits followed
# by a dot that was already taken in the In [89] cell.
!grep '[0-9][0-9][0-9][0-9]\.' sortedbylinelength.txt | wc