In [1]:
import glob
from io import open
import pandas as pd
from pandas import DataFrame as df
from os import path
import re
import gensim
import numpy as np
from collections import Counter
In [3]:
import nltk
In [4]:
from nltk import word_tokenize
In [5]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
In [6]:
from tools import get_psycinfo_database
In [7]:
# Load the PsycInfo records via the project helper (schema not visible here;
# downstream cells use an `Abstract` column of free-text strings).
words_df = get_psycinfo_database()
In [8]:
# Sanity-check the first few rows of the loaded table.
words_df.head()
Out[8]:
In [9]:
# Normalization map applied to every abstract before tokenization.
# Insertion order matters (dicts preserve it in Python 3.7+): the "/" -> "-"
# rule runs first, so e.g. "bi/racial" becomes "bi-racial" and is then
# collapsed to "biracial" by the later rules. The multi-word phrases at the
# bottom are fused into single hyphenated tokens so the tokenizer keeps each
# phrase as one vocabulary item.
words_to_replace = {"/":"-", "bi-ethnic": "biethnic", "bi-racial": "biracial",
                    "mono-ethnic": "monoethnic", "poly-ethnic": "polyethnic",
                    "mono-racial": "monoracial", "multi-racial": "multiracial",
                    "inter-racial": "interracial", "mono-cultural": "monocultural",
                    "bi-cultural": "bicultural", "multi-cultural": "multicultural",
                    "other ethnicity":"other-ethnicity", "other race": "other-race",
                    "mixed race": "mixed-race", "mixed ethnicity": "mixed-ethnicity"}
In [10]:
def FixText(text, replacements=None):
    """Apply the substring-normalization map to one abstract.

    Parameters
    ----------
    text : str
        Abstract text (callers lower-case it first).
    replacements : dict, optional
        Mapping of substring -> replacement, applied in insertion order.
        Defaults to the module-level ``words_to_replace`` table, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    str
        ``text`` with every mapping applied in order. Order matters: the
        "/" -> "-" rule must run before the hyphen-collapsing rules.
    """
    if replacements is None:
        replacements = words_to_replace
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text
In [11]:
# Lower-case and normalize every abstract; missing abstracts become "".
abstracts = words_df.Abstract.fillna("").str.lower().map(FixText).tolist()
In [12]:
# Tokenize each normalized abstract into a list of word tokens.
abstract_text = list(map(word_tokenize, abstracts))
In [13]:
# Peek at the first 10 tokens of the first abstract (tokenization sanity check).
abstract_text[0][:10]
Out[13]:
In [14]:
# Corpus size summary: document count and total token count.
n_abstracts = len(abstract_text)
n_tokens = sum(len(tokens) for tokens in abstract_text)
print("Number of Abstracts", n_abstracts)
print("Number of words", n_tokens)
In [15]:
# Train a skip-gram (sg=1) Word2Vec model with hierarchical softmax (hs=1):
# 16-dimensional vectors, 5-word window, no frequency downsampling (sample=0),
# and words kept only if they occur at least twice (min_count=2).
# NOTE(review): this is the pre-4.0 gensim API (`size`); gensim >= 4.0 renamed
# it to `vector_size`. cbow_mean presumably has no effect when sg=1 — confirm
# against the gensim docs if this is tuned further.
model = gensim.models.Word2Vec(sentences=abstract_text, size=16, window=5, min_count=2, sample=0,
                               workers=10, hs=1, sg=1, cbow_mean=0)
In [104]:
# Persist the trained model to disk so the expensive training cell above can
# be skipped on later runs.
model.save("data/PsycInfo/processed/abstract_model.pkl")
In [105]:
ls -lh data/PsycInfo/processed/abstract_model.pkl
In [112]:
# Reload the saved model. `load()` is a classmethod, so call it on the class:
# calling it through the instance works but is misleading, and would fail on a
# fresh kernel where `model` is not yet defined.
model = gensim.models.Word2Vec.load("data/PsycInfo/processed/abstract_model.pkl")
In [16]:
# Vocabulary size — len() on the mapping directly; `.keys()` was redundant.
# NOTE(review): `model.vocab` is the old gensim API; newer versions expose
# model.wv.vocab / model.wv.key_to_index — confirm the installed version.
len(model.vocab)
Out[16]:
In [18]:
# Nearest neighbors of "multicultural" under the multiplicative (CosMul) measure.
model.most_similar_cosmul("multicultural")
Out[18]:
In [19]:
# Nearest neighbors of "biracial" under the multiplicative (CosMul) measure.
model.most_similar_cosmul("biracial")
Out[19]:
In [20]:
# Nearest neighbors of "monoracial" under the multiplicative (CosMul) measure.
model.most_similar_cosmul("monoracial")
Out[20]:
In [135]:
# Top-10 cosine-similarity neighbors of "bicultural".
model.most_similar("bicultural", topn=10)
Out[135]:
In [49]:
from scipy.cluster.hierarchy import dendrogram, linkage, set_link_color_palette
In [23]:
# Seed vocabulary of race/ethnicity terms to cluster.
# FIX: a Word2Vec vocabulary entry is a single token and never contains a
# space, and FixText rewrote "mixed race", "mixed ethnicity", "other race",
# and "other ethnicity" into hyphenated tokens before training — so the
# hyphenated forms below are the ones that can actually appear in the model.
# The original space-separated spellings were silently dropped by the
# `if w in model` guards downstream.
words = ['monocultural', 'monoracial', 'bicultural', 'biracial', 'biethnic', 'interracial',
         'multicultural', 'multiracial', 'multiethnic', 'polycultural', 'polyracial', 'polyethnic',
         'mixed-race', 'mixed-ethnicity', 'other-race', 'other-ethnicity', 'multiculturalism', 'polyculturalism',
         'cultural pluralism']  # NOTE(review): no single-token form exists for this phrase; it is skipped downstream
In [24]:
# Expand the seed vocabulary: gather each seed word's 50 nearest neighbors,
# then keep only terms appearing in at least three of those neighbor lists.
neighbor_counts = Counter()
for query in words:
    if query in model:
        neighbor_counts.update(term for term, _score in model.most_similar(query, topn=50))
counter = neighbor_counts  # kept under the original name for later cells
expansion = [term for term, count in neighbor_counts.items() if count > 2]
In [137]:
# Assemble the embedding matrix X and the parallel label list y for every
# seed + expansion word present in the model vocabulary.
y = []
X = []
normalized = True  # take gensim's precomputed L2-normalized vectors
for w in set(words + expansion):
    if w in model:
        # NOTE(review): model.syn0norm is only populated once a similarity
        # query has run (init_sims); the most_similar() cells above do that
        # here — confirm before reordering or skipping those cells.
        v = model.syn0norm[model.vocab[w].index, :] if normalized else model[w]
        X.append(v)
        y.append(w)
X = np.asarray(X)
# Re-normalize rows to unit length (a no-op when normalized=True above, but
# required for the raw model[w] vectors when normalized=False).
X = (X.T / np.linalg.norm(X, axis=1)).T
In [138]:
# Expect (n_selected_words, 16) — one row per word, matching size=16 above.
X.shape
Out[138]:
In [139]:
# List the matplotlib style names available on this machine.
print(plt.style.available)
In [92]:
# Restrict dendrogram link colors to three grayscale shades (matplotlib
# interprets a numeric string like "0.5" as a gray level).
set_link_color_palette([str(x) for x in [0.3, 0.5, 0.75]])
In [180]:
# NOTE(review): 'seaborn-ticks' was renamed 'seaborn-v0_8-ticks' in
# matplotlib >= 3.6 — update the name if re-running on a modern stack.
plt.style.use('seaborn-ticks')
In [181]:
# Agglomerative hierarchical clustering of the unit-norm word vectors,
# merging clusters by Ward's minimum-variance criterion.
Z = linkage(X, 'ward')
In [186]:
# Plot the dendrogram: words on the y-axis (orientation="left"), cluster
# distance on the x-axis; links above the color threshold are drawn black.
fig, ax = plt.subplots(figsize=(25, 10))
ax.set_ylabel('Words', family="Times New Roman", fontsize=20)
ax.set_xlabel('Distance', family="Times New Roman", fontsize=20)
ax.grid(False)
_ = dendrogram(Z, orientation="left", above_threshold_color='k',
               color_threshold=1.4, labels=y, leaf_rotation=0.,
               leaf_font_size=16., ax=ax)
In [ ]: