In [67]:
import requests
from bs4 import BeautifulSoup
import nltk
from collections import Counter
import enchant
import numpy as np
import pandas as pd

In [3]:
# Connectivity smoke test against an unrelated site; the response bound to
# `r` here is never used (the name is rebound inside the scraping loop below).
r = requests.get('http://caporasolab.us')

In [11]:
# Codes for the LTER research sites whose pages are scraped below; the
# order here fixes the row order of the final count matrix / DataFrame.
sites = [
    'AND', 'ARC', 'BES', 'BNZ', 'CCE', 'CDR', 'CAP', 'CWT', 'FCE',
    'GCE', 'HFR', 'HBR', 'JRN', 'KBS', 'KNZ', 'LNO', 'LUQ', 'MCM',
    'MCR', 'NWT', 'NTL', 'PAL', 'PIE', 'SBC', 'SEV', 'VCR',
]

In [56]:
# Fetch each LTER site page and count word bigrams in its visible text.
# A bigram is kept only when both tokens pass the en_US dictionary check,
# which filters out markup fragments, numbers, and site-specific tokens.
url_template = "http://lternet.edu/sites/%s"
site_bigram_counts = {}   # site code -> Counter of (word, word) bigrams
english_dict = enchant.Dict("en_US")
all_bgs = set()           # union of kept bigrams across all sites
for site in sites:
    response = requests.get(url_template % site)
    # Fail loudly on a bad fetch instead of silently tokenizing an error page.
    response.raise_for_status()
    text = BeautifulSoup(response.text, 'html.parser').get_text()
    kept = []
    for bigram in nltk.bigrams(nltk.word_tokenize(text)):
        w1, w2 = bigram
        if english_dict.check(w1) and english_dict.check(w2):
            kept.append(bigram)
            all_bgs.add(bigram)
    site_bigram_counts[site] = Counter(kept)
all_bgs = list(all_bgs)

In [51]:
# Peek at the ten most frequent bigrams for the Andrews Forest (AND) site;
# the bare last expression lets the notebook render the list as cell output.
site_bigram_counts['AND'].most_common(10)


Out[51]:
[(('Read', 'more'), 6),
 (('of', 'the'), 6),
 (('.', 'Read'), 6),
 (('Andrews', 'Forest'), 4),
 (('Ecological', 'Research'), 3),
 (('Experimental', 'Forest'), 3),
 (('more', '.'), 3),
 (('Andrews', 'Experimental'), 3),
 (('in', 'the'), 3),
 (('with', 'elevation'), 2)]

In [57]:
# Number of distinct bigrams kept for the AND site.
len(site_bigram_counts['AND'])


Out[57]:
467

In [60]:
# Preallocate the site-by-bigram count matrix (rows follow `sites` order,
# columns follow `all_bgs` order); it is filled in the next cell.
data = np.zeros((len(sites), len(all_bgs)))

In [64]:
# Fill the count matrix: row i = site, column j = bigram.
# Counter lookups return 0 for missing keys and never raise KeyError,
# so the original try/except around the assignment was dead code.
for i, site in enumerate(sites):
    counts = site_bigram_counts[site]  # hoist the dict lookup out of the inner loop
    for j, bigram in enumerate(all_bgs):
        data[i, j] = counts[bigram]

In [70]:
# One row per site, one column per bigram (tokens joined with a space
# so the column labels read naturally).
bigram_labels = [' '.join(bigram) for bigram in all_bgs]
df = pd.DataFrame(data, columns=bigram_labels, index=sites)

In [83]:



Out[83]:
importance of more than the major Bonanza Creek The Long the USDA focus on Research Topics Current Ecosystem Dry Valleys ... . The Privacy Policy key research one of understanding of . These Station in to main located on landscapes .
AND 1 1 0 0 2 0 0 1 0 0 ... 2 1 1 0 1 1 0 1 0 1
ARC 0 0 0 0 2 0 0 1 0 0 ... 1 1 1 0 0 1 0 1 0 0
BES 0 0 0 0 2 0 0 1 0 0 ... 3 1 1 0 1 0 0 1 0 0
BNZ 0 0 1 5 2 0 0 1 0 0 ... 1 1 1 0 1 0 0 1 0 0
CCE 0 0 0 0 2 0 0 1 3 0 ... 2 1 1 0 0 2 0 1 0 0
CDR 0 0 0 0 2 0 0 1 0 0 ... 0 1 1 1 0 1 0 1 0 0
CAP 0 0 0 0 2 0 0 1 0 0 ... 2 1 1 1 0 0 0 1 0 1
CWT 0 0 0 0 2 2 0 1 0 0 ... 2 1 1 1 0 0 0 1 0 0
FCE 0 0 0 0 2 0 2 1 0 0 ... 2 1 1 0 0 1 0 1 0 0
GCE 1 0 0 0 2 0 0 1 0 0 ... 1 1 1 0 0 1 0 1 1 0
HFR 0 0 0 0 2 0 0 1 0 0 ... 1 1 1 1 1 1 0 1 0 0
HBR 0 0 0 0 2 2 0 1 0 0 ... 1 1 1 0 0 0 1 1 0 0
JRN 0 0 0 0 2 0 0 1 0 0 ... 1 1 1 0 1 2 0 1 0 1
KBS 1 1 0 0 2 0 0 1 0 0 ... 0 1 1 0 0 1 0 1 0 0
KNZ 0 1 0 0 2 0 0 1 0 0 ... 4 1 1 1 1 0 1 1 0 0
LNO 0 0 0 0 2 0 0 0 0 0 ... 1 1 1 0 0 0 0 1 0 0
LUQ 0 0 1 0 2 1 0 1 0 0 ... 0 1 1 0 0 1 0 1 0 0
MCM 0 0 0 0 2 0 0 1 0 5 ... 1 1 1 0 0 2 0 1 1 0
MCR 0 0 0 0 2 0 1 1 0 0 ... 0 1 1 0 1 1 0 1 1 0
NWT 0 0 0 0 2 0 0 1 0 0 ... 1 1 1 0 0 0 0 1 0 0
NTL 0 0 0 0 2 0 0 1 0 0 ... 1 1 1 0 1 0 0 1 0 1
PAL 0 0 1 0 2 0 0 1 0 0 ... 1 1 1 2 0 0 1 1 0 0
PIE 0 0 0 0 2 0 0 1 0 0 ... 6 1 1 0 2 0 0 1 0 0
SBC 0 0 0 0 2 0 0 1 0 0 ... 2 1 1 0 0 0 0 1 0 0
SEV 0 0 0 0 2 0 0 1 0 0 ... 4 1 1 0 0 0 0 1 0 0
VCR 0 0 0 0 2 0 2 1 0 0 ... 2 1 1 0 0 0 0 1 1 1

26 rows × 511 columns


In [72]:
import seaborn as sns

In [90]:
%matplotlib inline
def f(c):
    return df[c].sum() > 2

df = df.select(f, axis=1)

g = sns.clustermap(df, col_cluster=False, col_linkage=False)


/home/gregcaporaso/.conda/envs/lter/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [91]:
# Persist the filtered bigram-count matrix for download / reuse.
df.to_csv('lter-bigrams.csv')

In [92]:
# Render a clickable download link to the CSV written in the previous cell.
from IPython.display import FileLink
FileLink('lter-bigrams.csv')


Out[92]:

In [ ]: