In [67]:
import requests
from bs4 import BeautifulSoup
import nltk
from collections import Counter
import enchant
import numpy as np
import pandas as pd
In [3]:
# Exploratory request against a single page. NOTE(review): the per-site loop in
# the scraping cell rebinds `r`, so this response is not consumed by any cell
# visible here.
# An explicit timeout keeps an unresponsive host from blocking the kernel forever.
r = requests.get('http://caporasolab.us', timeout=30)
In [11]:
# Three-letter site codes; each one is substituted into the site URL template
# when the pages are fetched.
sites = [
    'AND', 'ARC', 'BES', 'BNZ', 'CCE', 'CDR', 'CAP',
    'CWT', 'FCE', 'GCE', 'HFR', 'HBR', 'JRN', 'KBS',
    'KNZ', 'LNO', 'LUQ', 'MCM', 'MCR', 'NWT', 'NTL',
    'PAL', 'PIE', 'SBC', 'SEV', 'VCR',
]
In [56]:
# Fetch each site's page, tokenize its visible text, and count the word bigrams
# whose two tokens are both accepted by the en_US spell-check dictionary.
url_template = "http://lternet.edu/sites/%s"
site_bigram_counts = {}  # site code -> Counter mapping (word1, word2) -> occurrences
d = enchant.Dict("en_US")
all_bgs = set()  # union of the retained bigrams over every site
for s in sites:
    # Bound the wait per request so one unresponsive page cannot hang the run.
    r = requests.get(url_template % s, timeout=30)
    # Fail loudly on HTTP errors; otherwise the text of an error page would be
    # tokenized and counted as if it were the site's own content.
    r.raise_for_status()
    t = BeautifulSoup(r.text, 'html.parser').get_text()
    bgs = [
        (w1, w2)
        for w1, w2 in nltk.bigrams(nltk.word_tokenize(t))
        if d.check(w1) and d.check(w2)
    ]
    all_bgs.update(bgs)
    site_bigram_counts[s] = Counter(bgs)
# Sort so the bigram (column) order is the same on every run; plain list(set)
# ordering varies between kernel sessions because of string hash randomization.
all_bgs = sorted(all_bgs)
In [51]:
# Peek at the ten most frequent bigrams counted for the 'AND' site.
site_bigram_counts['AND'].most_common(10)
Out[51]:
In [57]:
# Number of distinct bigrams counted for the 'AND' site.
len(site_bigram_counts['AND'])
Out[57]:
In [60]:
# Zero-filled count matrix: one row per site code, one column per bigram in all_bgs.
data = np.zeros(shape=(len(sites), len(all_bgs)))
In [64]:
# Copy each site's bigram counts into its row of the matrix.
# A Counter yields 0 for a key it has never seen, so a bigram that is absent
# from a site needs no KeyError guard. A site code missing from
# site_bigram_counts now raises instead of silently leaving a row of zeros.
for i, s in enumerate(sites):
    site_counts = site_bigram_counts[s]
    for j, bg in enumerate(all_bgs):
        data[i, j] = site_counts[bg]
In [70]:
# Wrap the count matrix in a labelled frame: rows are site codes, columns are
# the bigrams rendered as "word1 word2".
df = pd.DataFrame(
    data,
    index=sites,
    columns=list(map(' '.join, all_bgs)),
)
In [83]:
Out[83]:
In [72]:
import seaborn as sns
In [90]:
%matplotlib inline
def f(c):
return df[c].sum() > 2
df = df.select(f, axis=1)
g = sns.clustermap(df, col_cluster=False, col_linkage=False)
In [91]:
# Write the current `df` to 'lter-bigrams.csv' (a path relative to the working directory).
df.to_csv('lter-bigrams.csv')
In [92]:
from IPython.display import FileLink
# Render a link to 'lter-bigrams.csv' in the cell output.
FileLink('lter-bigrams.csv')
Out[92]:
In [ ]: