notebook.community

Edit and run



In [1]:

    
%load_ext autoreload



In [2]:

    
%autoreload 2



In [196]:

    
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
from utils.common.db_utils import read_all_results

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = SnowballStemmer('english',ignore_stopwords=True)



In [4]:

    
config = dict(parameters={},DEFAULT={})
config["parameters"]["input_db"] = "tier-1"
config["DEFAULT"]["tier-1"] = "/Users/jklinger/Nesta/nesta_dataflow/db_config/tier-1.cnf"
config["parameters"]["input_table"] = "course_descriptions"
results = read_all_results(config,"input_db","input_table")
doc_set = [r[1] for r in results]









    



/Users/jklinger/anaconda3/envs/py36/lib/python3.6/site-packages/pymysql/cursors.py:166: Warning: (1287, "'@@tx_isolation' is deprecated and will be removed in a future release. Please use '@@transaction_isolation' instead")
  result = self._query(query)



In [379]:

    
def generate_corpus(doc_set,extra_stops=[]):
    # list for tokenized documents in loop
    texts = []

    # loop through document list
    for doc in doc_set:
        if doc is None:
            continue
        # clean and tokenize document string
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        
        # remove stop words from tokens
        tokens = [t for t in tokens if t not in en_stop]

        # stem tokens
        tokens = [p_stemmer.stem(s) for s in tokens]
        before = len(tokens)
        
        tokens = [t for t in tokens if t not in extra_stops]
        after = len(tokens)
        
        diff = before - after
        if diff > 0:
            print(diff)
        
        # add tokens to list
        texts.append(tokens)

    print("Got",len(texts))

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    return texts,corpus,dictionary



In [211]:

    
# generate LDA model
texts,corpus,dictionary = generate_corpus(doc_set)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=1000)



In [219]:

    
ldamodel.print_topics(num_topics=30)









    Out[219]:





[(0,
  '0.030*"water" + 0.024*"english" + 0.022*"environment" + 0.009*"anim" + 0.009*"univers" + 0.009*"state" + 0.008*"world" + 0.008*"market" + 0.008*"teach" + 0.007*"mani"'),
 (1,
  '0.021*"polit" + 0.016*"special" + 0.013*"safeti" + 0.013*"control" + 0.010*"need" + 0.010*"occup" + 0.009*"teach" + 0.009*"intern" + 0.009*"student" + 0.009*"petroleum"'),
 (2,
  '0.042*"tourism" + 0.032*"geographi" + 0.030*"logist" + 0.030*"place" + 0.020*"suppli" + 0.014*"chain" + 0.012*"point" + 0.010*"origin" + 0.010*"tourist" + 0.010*"concern"'),
 (3,
  '0.023*"hr" + 0.022*"employe" + 0.020*"technolog" + 0.018*"learn" + 0.014*"train" + 0.012*"resourc" + 0.012*"field" + 0.012*"profession" + 0.012*"benefit" + 0.010*"teach"'),
 (4,
  '0.026*"process" + 0.016*"translat" + 0.012*"specif" + 0.011*"activ" + 0.011*"creat" + 0.011*"peopl" + 0.011*"capit" + 0.010*"collect" + 0.010*"refer" + 0.010*"associ"'),
 (5,
  '0.020*"multimedia" + 0.018*"network" + 0.013*"devic" + 0.011*"content" + 0.011*"ict" + 0.010*"media" + 0.010*"entrepreneurship" + 0.009*"electron" + 0.008*"school" + 0.008*"audio"'),
 (6,
  '0.036*"process" + 0.029*"project" + 0.024*"interpret" + 0.018*"manufactur" + 0.016*"industri" + 0.015*"time" + 0.013*"disciplin" + 0.013*"deal" + 0.012*"activ" + 0.012*"complex"'),
 (7,
  '0.016*"optic" + 0.012*"tourism" + 0.011*"data" + 0.010*"term" + 0.010*"light" + 0.009*"sensor" + 0.009*"integr" + 0.009*"gis" + 0.008*"signal" + 0.007*"geograph"'),
 (8,
  '0.051*"english" + 0.020*"advertis" + 0.012*"judg" + 0.012*"judici" + 0.009*"art" + 0.009*"countri" + 0.008*"visual" + 0.008*"linguist" + 0.008*"train" + 0.007*"product"'),
 (9,
  '0.020*"may" + 0.017*"intern" + 0.014*"agribusi" + 0.012*"administr" + 0.011*"social" + 0.011*"technolog" + 0.011*"academ" + 0.011*"provid" + 0.010*"sector" + 0.010*"individu"'),
 (10,
  '0.048*"psycholog" + 0.020*"clinic" + 0.019*"western" + 0.015*"centuri" + 0.014*"europ" + 0.012*"psychologist" + 0.010*"empir" + 0.009*"west" + 0.009*"appli" + 0.008*"first"'),
 (11,
  '0.037*"electr" + 0.036*"comput" + 0.021*"electron" + 0.020*"field" + 0.014*"telecommun" + 0.014*"mechan" + 0.011*"technolog" + 0.011*"disciplin" + 0.009*"power" + 0.009*"softwar"'),
 (12,
  '0.032*"water" + 0.018*"softwar" + 0.015*"clinic" + 0.009*"known" + 0.009*"resourc" + 0.009*"physic" + 0.009*"pharmacist" + 0.009*"care" + 0.009*"pharmaci" + 0.009*"health"'),
 (13,
  '0.041*"materi" + 0.017*"understand" + 0.016*"field" + 0.014*"physic" + 0.011*"term" + 0.011*"disciplin" + 0.010*"research" + 0.010*"structur" + 0.009*"chemistri" + 0.009*"new"'),
 (14,
  '0.045*"arab" + 0.023*"chemistri" + 0.013*"modern" + 0.010*"compound" + 0.010*"mani" + 0.009*"histori" + 0.008*"word" + 0.008*"result" + 0.008*"bond" + 0.008*"influenc"'),
 (15,
  '0.055*"mathemat" + 0.017*"research" + 0.013*"doctor" + 0.012*"appli" + 0.009*"administr" + 0.009*"practic" + 0.009*"interior" + 0.008*"plan" + 0.008*"b" + 0.008*"pure"'),
 (16,
  '0.052*"health" + 0.033*"environment" + 0.019*"public" + 0.015*"nurs" + 0.013*"care" + 0.013*"biolog" + 0.010*"chemic" + 0.009*"process" + 0.009*"life" + 0.009*"environ"'),
 (17,
  '0.022*"modular" + 0.020*"messag" + 0.015*"signal" + 0.015*"encod" + 0.011*"modul" + 0.010*"general" + 0.010*"specif" + 0.009*"librari" + 0.008*"mean" + 0.008*"refer"'),
 (18,
  '0.075*"account" + 0.054*"financi" + 0.018*"report" + 0.017*"term" + 0.013*"corpor" + 0.012*"firm" + 0.012*"standard" + 0.011*"public" + 0.011*"financ" + 0.011*"food"'),
 (19,
  '0.017*"program" + 0.016*"master" + 0.014*"degre" + 0.014*"unit" + 0.013*"mba" + 0.013*"histori" + 0.011*"certif" + 0.011*"state" + 0.010*"secur" + 0.009*"cissp"'),
 (20,
  '0.025*"technolog" + 0.012*"secondari" + 0.011*"linguist" + 0.011*"disciplin" + 0.009*"primari" + 0.008*"civil" + 0.008*"militari" + 0.007*"school" + 0.007*"take" + 0.006*"place"'),
 (21,
  '0.042*"univers" + 0.014*"war" + 0.010*"strateg" + 0.009*"food" + 0.009*"diseas" + 0.009*"law" + 0.008*"construct" + 0.008*"nation" + 0.008*"colleg" + 0.008*"intern"'),
 (22,
  '0.020*"technolog" + 0.016*"data" + 0.013*"statist" + 0.012*"comput" + 0.008*"web" + 0.007*"major" + 0.007*"branch" + 0.006*"electron" + 0.006*"civil" + 0.006*"sun"'),
 (23,
  '0.029*"social" + 0.016*"media" + 0.014*"urban" + 0.011*"mass" + 0.010*"public" + 0.010*"chang" + 0.010*"econom" + 0.010*"process" + 0.009*"individu" + 0.009*"research"'),
 (24,
  '0.047*"sustain" + 0.034*"law" + 0.019*"nuclear" + 0.015*"econom" + 0.014*"environment" + 0.013*"ecolog" + 0.008*"process" + 0.008*"ecosystem" + 0.007*"social" + 0.007*"individu"'),
 (25,
  '0.033*"structur" + 0.017*"compani" + 0.014*"intern" + 0.013*"econom" + 0.012*"financ" + 0.012*"good" + 0.011*"servic" + 0.011*"market" + 0.010*"earth" + 0.010*"countri"'),
 (26,
  '0.024*"market" + 0.018*"public" + 0.016*"game" + 0.016*"leadership" + 0.011*"social" + 0.010*"hydrolog" + 0.009*"digit" + 0.009*"group" + 0.009*"e" + 0.008*"pr"'),
 (27,
  '0.013*"islam" + 0.011*"persian" + 0.011*"saudi" + 0.011*"arabia" + 0.010*"stern" + 0.009*"bank" + 0.008*"radio" + 0.007*"world" + 0.007*"sens" + 0.007*"satellit"'),
 (28,
  '0.028*"dfas" + 0.019*"dod" + 0.017*"account" + 0.014*"oper" + 0.012*"servic" + 0.012*"custom" + 0.012*"depart" + 0.012*"militari" + 0.012*"financ" + 0.012*"million"'),
 (29,
  '0.027*"translat" + 0.025*"english" + 0.022*"televis" + 0.022*"advertis" + 0.010*"time" + 0.010*"literatur" + 0.010*"centuri" + 0.010*"unit" + 0.008*"commerci" + 0.008*"state"')]

Try word2vec to identify words which aren't useful



In [1]:

    
model = gensim.models.Word2Vec(texts, size=500, window=5, min_count=5, workers=4, sample=1e-3)
model.wv.init_sims()
wv = model.wv.syn0
word_mapping = {model.wv.index2word[i]:v for i,v in enumerate(wv)}









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-d89929693daf> in <module>()
----> 1 model = gensim.models.Word2Vec(texts, size=500, window=5, min_count=5, workers=4, sample=1e-3)
      2 model.wv.init_sims()
      3 wv = model.wv.syn0
      4 word_mapping = {model.wv.index2word[i]:v for i,v in enumerate(wv)}

NameError: name 'gensim' is not defined



In [294]:

    
import numpy as np
from collections import Counter

class WordPotential:
    def __init__(self):
        self.topics = []
        self.weights = []
        self.distances = []
    def append(self,distance,weight,topic):
        self.topics.append(topic)
        self.distances.append(distance)
        self.weights.append(weight)
    def forces(self):
        for d,w in self.items():
            yield w*d
    def energies(self):
        for d,w in self.items():
            yield 0.5*w*d*d
    def items(self):
        return zip(self.distances,self.weights)
    
topic_positions = []
all_forces = []
word_potentials = {}
x = []
for itopic,topic in enumerate(ldamodel.get_topics()):
    weighted_positions = []
    for idx,weight in enumerate(topic):
        word = dictionary.get(idx)
        if word in word_mapping:
            pos = word_mapping[word]
            weighted_positions.append(weight * pos)
        elif weight > 0.01:
            print(word,weight)
    topic_pos = np.average(weighted_positions,axis=0)
    #print("-------------")
    #print(topic_pos)
    topic_positions.append(topic_pos)
    
    # Plot get collection of weight*distance for this topic
    forces = []
    for idx,weight in enumerate(topic):
        word = dictionary.get(idx)
        if not word in word_mapping:
            continue
        pos = word_mapping[word]
        distance = np.linalg.norm(topic_pos - pos)
        x.append(distance)
        #print(distance)
        #print(pos)        
        #if np.abs(distance - 1.) > 1e-5:
        #    print(type(distance),distance,distance-1.0)
        #if weight > 1e-4:
        force = distance*weight
        forces.append(force)
        if word not in word_potentials:
            word_potentials[word] = WordPotential()
        word_potentials[word].append(distance,weight,itopic)
    
    #break 
    all_forces.append(forces)



In [295]:

    
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import mpld3


word_mean_potential = [max(wp.energies()) for word,wp in word_potentials.items()]
word_min_distance = [min(wp.distances) for word,wp in word_potentials.items()]
n = [counter[word] for word,wp in word_potentials.items()]
#n = [c if c < 100 else 100 for c in n]

fig,ax = plt.subplots(figsize=(12,6))
scatter = ax.scatter(word_mean_potential,word_min_distance,c=n,cmap="gist_rainbow",vmax=250)
ax.set_xlabel("Word's maximum energy",fontsize=14)
ax.set_ylabel("Closest distance to a topic",fontsize=14)

#ax.set_ylim(0.,0.0004)
#ax.set_xlim(0.0,0.05)

fig.colorbar(scatter)

labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)

#word_energy_total,np.arange(0,0.12,0.001))









    Out[295]:



In [347]:

    
word_mean_potential = [np.std(list(wp.energies())) for word,wp in word_potentials.items()]
word_max_potential = [np.sum(list(wp.energies())) for word,wp in word_potentials.items()]
n = [counter[word] for word,wp in word_potentials.items()]

fig,ax = plt.subplots(figsize=(12,6))
scatter = ax.scatter(word_mean_potential,word_max_potential,c=n,cmap="gist_rainbow",vmax=250)
ax.set_xlabel("Word's median energy (Genericity)",fontsize=14)
ax.set_ylabel("Word's maximum force (Ambiguity) ",fontsize=14)
#ax.set_ylim(0.,1.5)
#ax.set_xlim(0.0,0.018)

fig.colorbar(scatter)

labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)









    Out[347]:



In [360]:

    
sum_e = [np.std(list(wp.energies())/np.mean(list(wp.energies()))) for word,wp in word_potentials.items()]
med_e = [np.sqrt(np.sum(np.square(wp.weights),axis=0)) for word,wp in word_potentials.items()]

#sum_e = [x for _,x in sorted(zip(n,sum_e))]
#med_e = [x for _,x in sorted(zip(n,med_e))]

fig,ax = plt.subplots(figsize=(12,6))
scatter = ax.scatter(sum_e,med_e,c=n,cmap="gist_rainbow",vmax=250)
ax.set_xlabel("Std/Mean word energy (Specificity)",fontsize=14)
ax.set_ylabel("Sqrt sum of word weight^2 (Topic prevalance) ",fontsize=14)
#ax.set_ylim(0.,0.002)
#ax.set_xlim(0.0,0.018)

fig.colorbar(scatter)

labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)









    Out[360]:



In [380]:

    
extra_stops = []
for word,wp in word_potentials.items():
    specificity = np.std(list(wp.energies())/np.mean(list(wp.energies())))
    if specificity < 1.5: # and max_force < 0.35:
        extra_stops.append(word)
print(extra_stops)        
text,corpus,dictionary = generate_corpus(doc_set,extra_stops=extra_stops)









    



['establish', 'call', 'activ', 'includ', 'known', 'often', 'use', 'sever', 'relat', 'analysi', 'may', 'present', 'common', 'profession', 'general', 'various', 'unit', 'major', 'plan', 'direct', 'within', 'provid', 'also', 'requir', 'one', 'well', 'human', 'work', 'world', 'peopl', 'concern', 'develop', 'two', 'deal', 'larg', 'refer', 'scienc', 'part', 'industri', 'associ', 'describ', 'howev', 'studi', 'area', 'number', 'base', 'involv', 'produc', 'level', 'exampl', 'make', 'natur', 'consid', 'non', 'applic', 'differ', 'find', 'practic', 'theori', 'first', 'purpos', 'certain', 'increas', 'creat', 'techniqu', 'whole', 'defin', 'specif', 'person', 'case', 'generat', 'import']
25
25
21
11
22
20
36
20
20
23
16
8
30
6
9
11
8
4
3
26
19
15
17
42
3
18
6
7
7
7
18
25
26
5
46
3
13
10
10
36
8
8
6
8
10
6
1
6
6
28
22
22
30
9
17
3
54
5
10
11
22
4
9
22
21
21
6
6
26
12
47
8
10
1
16
20
16
5
5
13
7
13
11
28
17
9
32
23
9
23
23
17
12
18
12
15
4
1
7
5
15
15
54
28
3
6
13
3
14
27
19
19
5
37
17
37
11
10
23
1
13
16
31
11
1
31
31
27
17
13
1
23
8
34
25
20
35
47
16
1
69
69
27
25
11
3
20
44
11
47
5
7
19
26
8
6
18
5
18
6
11
12
11
26
17
26
25
40
6
6
11
7
32
36
17
10
9
10
20
3
21
5
6
30
6
28
75
15
21
34
19
47
5
19
10
16
27
2
34
34
34
41
26
26
26
17
26
18
14
10
10
10
26
12
25
52
11
11
1
21
11
3
8
22
18
10
10
27
3
3
5
28
41
41
28
16
16
7
4
14
7
31
30
2
4
35
48
36
10
15
34
5
12
2
3
15
5
5
12
12
45
25
6
6
7
9
37
12
27
32
9
38
27
11
11
Got 278



In [381]:

    
# generate LDA model
text,corpus,dictionary = generate_corpus(doc_set,extra_stops=extra_stops)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=1000)



In [382]:

    
ldamodel.print_topics(num_topics=30)









    Out[382]:





[(0,
  '0.028*"optic" + 0.022*"radio" + 0.021*"stern" + 0.016*"light" + 0.013*"sensor" + 0.011*"signal" + 0.009*"year" + 0.009*"million" + 0.009*"show" + 0.007*"new"'),
 (1,
  '0.022*"busi" + 0.015*"technolog" + 0.015*"econom" + 0.014*"process" + 0.013*"environment" + 0.010*"game" + 0.009*"new" + 0.009*"field" + 0.009*"entrepreneurship" + 0.008*"product"'),
 (2,
  '0.026*"manag" + 0.023*"employe" + 0.022*"hr" + 0.015*"financ" + 0.015*"train" + 0.014*"financi" + 0.013*"compani" + 0.012*"educ" + 0.012*"benefit" + 0.012*"focus"'),
 (3,
  '0.043*"engin" + 0.031*"telecommun" + 0.030*"english" + 0.016*"system" + 0.014*"telecom" + 0.013*"judg" + 0.013*"judici" + 0.012*"design" + 0.012*"communic" + 0.012*"servic"'),
 (4,
  '0.042*"technolog" + 0.024*"educ" + 0.024*"manag" + 0.022*"school" + 0.018*"innov" + 0.016*"market" + 0.013*"organ" + 0.013*"ministri" + 0.011*"indonesia" + 0.011*"societi"'),
 (5,
  '0.111*"engin" + 0.035*"design" + 0.027*"electr" + 0.018*"field" + 0.017*"electron" + 0.017*"system" + 0.015*"mechan" + 0.011*"power" + 0.010*"softwar" + 0.010*"comput"'),
 (6,
  '0.028*"languag" + 0.027*"persian" + 0.023*"interpret" + 0.012*"group" + 0.012*"arab" + 0.010*"exposur" + 0.007*"write" + 0.007*"time" + 0.007*"sourc" + 0.007*"offici"'),
 (7,
  '0.067*"design" + 0.024*"visual" + 0.017*"manag" + 0.015*"communic" + 0.015*"cultur" + 0.014*"graphic" + 0.014*"art" + 0.014*"market" + 0.012*"public" + 0.012*"account"'),
 (8,
  '0.030*"univers" + 0.022*"engin" + 0.020*"disciplin" + 0.018*"intern" + 0.015*"academ" + 0.014*"war" + 0.012*"physic" + 0.012*"nation" + 0.012*"militari" + 0.011*"field"'),
 (9,
  '0.031*"multimedia" + 0.030*"public" + 0.022*"media" + 0.021*"content" + 0.018*"devic" + 0.015*"interact" + 0.015*"languag" + 0.013*"form" + 0.010*"pr" + 0.009*"mis"'),
 (10,
  '0.038*"web" + 0.027*"health" + 0.015*"classif" + 0.012*"intern" + 0.012*"definit" + 0.012*"system" + 0.012*"allow" + 0.012*"content" + 0.012*"templat" + 0.008*"physic"'),
 (11,
  '0.037*"account" + 0.029*"mathemat" + 0.024*"financi" + 0.019*"chemistri" + 0.016*"inform" + 0.011*"report" + 0.010*"cultur" + 0.009*"standard" + 0.009*"compound" + 0.009*"measur"'),
 (12,
  '0.048*"english" + 0.031*"languag" + 0.017*"market" + 0.013*"advertis" + 0.012*"water" + 0.012*"televis" + 0.010*"countri" + 0.009*"digit" + 0.007*"form" + 0.006*"instruct"'),
 (13,
  '0.077*"communic" + 0.046*"messag" + 0.023*"encod" + 0.020*"signal" + 0.019*"logist" + 0.017*"inform" + 0.013*"transmiss" + 0.012*"mean" + 0.011*"manag" + 0.011*"receiv"'),
 (14,
  '0.133*"health" + 0.037*"care" + 0.033*"public" + 0.023*"nurs" + 0.023*"environment" + 0.020*"medic" + 0.018*"protect" + 0.016*"patient" + 0.016*"diseas" + 0.014*"hospit"'),
 (15,
  '0.031*"manag" + 0.026*"sustain" + 0.024*"polit" + 0.021*"intern" + 0.018*"organ" + 0.016*"media" + 0.013*"effect" + 0.011*"system" + 0.010*"resourc" + 0.010*"busi"'),
 (16,
  '0.022*"learn" + 0.020*"educ" + 0.014*"arabia" + 0.014*"saudi" + 0.014*"teach" + 0.012*"inform" + 0.010*"state" + 0.010*"data" + 0.010*"secur" + 0.009*"student"'),
 (17,
  '0.065*"psycholog" + 0.025*"leadership" + 0.022*"water" + 0.018*"histori" + 0.015*"psychologist" + 0.015*"appli" + 0.014*"behavior" + 0.012*"cognit" + 0.009*"entrepreneuri" + 0.009*"organiz"'),
 (18,
  '0.176*"inform" + 0.108*"system" + 0.097*"comput" + 0.059*"technolog" + 0.041*"network" + 0.034*"communic" + 0.024*"ict" + 0.017*"electron" + 0.016*"softwar" + 0.016*"storag"'),
 (19,
  '0.017*"univers" + 0.017*"sun" + 0.016*"dietitian" + 0.015*"nutrit" + 0.012*"technolog" + 0.011*"version" + 0.009*"sust" + 0.009*"bangladesh" + 0.008*"special" + 0.006*"facil"'),
 (20,
  '0.032*"materi" + 0.030*"educ" + 0.016*"univers" + 0.014*"state" + 0.012*"field" + 0.011*"engin" + 0.011*"age" + 0.011*"secondari" + 0.011*"understand" + 0.009*"emerg"'),
 (21,
  '0.048*"arab" + 0.031*"languag" + 0.017*"data" + 0.014*"statist" + 0.012*"word" + 0.010*"mass" + 0.010*"mani" + 0.009*"written" + 0.009*"standard" + 0.009*"spoken"'),
 (22,
  '0.093*"engin" + 0.052*"environment" + 0.039*"system" + 0.029*"manag" + 0.027*"process" + 0.016*"project" + 0.015*"control" + 0.014*"problem" + 0.014*"design" + 0.014*"technolog"'),
 (23,
  '0.027*"languag" + 0.016*"structur" + 0.015*"food" + 0.011*"linguist" + 0.010*"educ" + 0.010*"degre" + 0.010*"nurs" + 0.008*"physic" + 0.007*"cultur" + 0.006*"system"'),
 (24,
  '0.059*"manag" + 0.030*"busi" + 0.030*"organ" + 0.016*"administr" + 0.015*"process" + 0.014*"master" + 0.013*"law" + 0.011*"degre" + 0.011*"system" + 0.011*"sociolog"'),
 (25,
  '0.030*"tourism" + 0.020*"modular" + 0.014*"advertis" + 0.012*"place" + 0.012*"busi" + 0.009*"modul" + 0.008*"travel" + 0.007*"tourist" + 0.007*"geographi" + 0.007*"b"'),
 (26,
  '0.025*"electr" + 0.020*"earth" + 0.017*"water" + 0.015*"clinic" + 0.015*"fish" + 0.013*"hydrolog" + 0.013*"satellit" + 0.011*"sens" + 0.011*"remot" + 0.011*"geolog"'),
 (27,
  '0.028*"educ" + 0.025*"islam" + 0.017*"special" + 0.014*"agribusi" + 0.012*"bank" + 0.010*"student" + 0.010*"need" + 0.009*"product" + 0.007*"teach" + 0.007*"term"'),
 (28,
  '0.020*"social" + 0.014*"servic" + 0.013*"process" + 0.012*"oper" + 0.011*"design" + 0.011*"chemic" + 0.010*"organ" + 0.010*"dfas" + 0.010*"life" + 0.009*"manag"'),
 (29,
  '0.029*"languag" + 0.028*"translat" + 0.027*"busi" + 0.026*"social" + 0.019*"english" + 0.017*"compani" + 0.015*"communic" + 0.013*"countri" + 0.012*"market" + 0.011*"servic"')]



In [ ]: