In [5]:
from os.path import join
from itertools import chain

In [6]:
file_names = ['stsa.fine.test','stsa.fine.train','stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/'
def read_file(fname=''):
    with open(join(file_path, fname)) as fr:
        lines = fr.readlines()
    lines = [line.lower().strip() for line in lines]
    # each line is "<label> <sentence>": the first character is the 0-4 label
    labels = [int(line[0]) for line in lines]
    words = [line[2:].split() for line in lines]
    return words, labels
train_X,train_y = read_file(fname='stsa.fine.train')
test_X,test_y = read_file(fname='stsa.fine.test')
dev_X,dev_y = read_file(fname='stsa.fine.dev')
print('number of train_X examples:', len(train_X))
print('number of test_X examples:', len(test_X))
print('number of dev_X examples:', len(dev_X))
print(train_X[0:10])
print(train_y[0:2])


number of train_X examples: 8544
number of test_X examples: 2210
number of dev_X examples: 1101
[['a', 'stirring', ',', 'funny', 'and', 'finally', 'transporting', 're-imagining', 'of', 'beauty', 'and', 'the', 'beast', 'and', '1930s', 'horror', 'films'], ['apparently', 'reassembled', 'from', 'the', 'cutting-room', 'floor', 'of', 'any', 'given', 'daytime', 'soap', '.'], ['they', 'presume', 'their', 'audience', 'wo', "n't", 'sit', 'still', 'for', 'a', 'sociology', 'lesson', ',', 'however', 'entertainingly', 'presented', ',', 'so', 'they', 'trot', 'out', 'the', 'conventional', 'science-fiction', 'elements', 'of', 'bug-eyed', 'monsters', 'and', 'futuristic', 'women', 'in', 'skimpy', 'clothes', '.'], ['the', 'entire', 'movie', 'is', 'filled', 'with', 'deja', 'vu', 'moments', '.'], ['this', 'is', 'a', 'visually', 'stunning', 'rumination', 'on', 'love', ',', 'memory', ',', 'history', 'and', 'the', 'war', 'between', 'art', 'and', 'commerce', '.'], ['um', ',', 'no.', '.'], ['jonathan', 'parker', "'s", 'bartleby', 'should', 'have', 'been', 'the', 'be-all-end-all', 'of', 'the', 'modern-office', 'anomie', 'films', '.'], ['campanella', 'gets', 'the', 'tone', 'just', 'right', '--', 'funny', 'in', 'the', 'middle', 'of', 'sad', 'in', 'the', 'middle', 'of', 'hopeful', '.'], ['a', 'fan', 'film', 'that', 'for', 'the', 'uninitiated', 'plays', 'better', 'on', 'video', 'with', 'the', 'sound', 'turned', 'down', '.'], ['béart', 'and', 'berling', 'are', 'both', 'superb', ',', 'while', 'huppert', '...', 'is', 'magnificent', '.']]
[4, 1]
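
As an optional sanity check (not part of the original run), the five-class label distribution can be tallied with collections.Counter, using only the lists built above:

In [ ]:
from collections import Counter
# tally how many sentences fall into each of the 5 sentiment classes (0-4)
for name, ys in [('train', train_y), ('dev', dev_y), ('test', test_y)]:
    print(name, sorted(Counter(ys).items()))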

In [8]:
word_set = set()
all_data = chain(train_X,dev_X,test_X)
for line in all_data:
    word_set.update(line)
# 19536 words in total across all splits; word forms: 16005
# the train split alone has 16579 words; word forms: 13704
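
The train-only count cited in the comment above can be reproduced with one more pass over train_X; a minimal sketch (unexecuted here):

In [ ]:
# recompute the train-only vocabulary size referenced in the comment above
train_word_set = set()
for line in train_X:
    train_word_set.update(line)
print(len(train_word_set))  # 16579 according to the note above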

In [11]:
print(len(word_set))


19536

In [9]:
from gensim.models import word2vec
word2vec_model = 'google'
embedding_dim = 300
#we_file = '/home/bruce/data/glove/twitter/glove.twitter.27B.{0}d.txt'.format(embedding_dim)
we_file = '/home/bruce/data/glove/twitter/glove.42B.300d.txt'  # unused here; re-assigned to the CommonCrawl path below
google_w2v = '/home/bruce/data/google_word2vec/GoogleNews-vectors-negative300.bin'

print('using google trained word embedding')
# note: in gensim >= 1.0 this loader is gensim.models.KeyedVectors.load_word2vec_format
google_model = word2vec.Word2Vec.load_word2vec_format(google_w2v, binary=True)
print('loading finished')


using google trained word embedding
loading finished
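
A quick shape check on the loaded vectors (a sketch; on gensim >= 4 both loading and lookup go through gensim.models.KeyedVectors instead):

In [ ]:
# each entry is a 300-dimensional numpy array
vec = google_model['like']
print(vec.shape, vec.dtype)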

In [21]:
we_file = '/home/bruce/data/glove/CommonCrawl/glove.42B.300d.txt'
sst5_word_embedding_glove_dict={}
for line in open(we_file):
    elements = line.strip().split()
    # GloVe text format: the word, then its 300 embedding values, space-separated
    if elements[0] in word_set:
        wordembedding = [float(i) for i in elements[1:]]
        sst5_word_embedding_glove_dict[elements[0]] = wordembedding
        if elements[0] == 'like':  # spot-check one common word
            print(wordembedding)


[-0.014949, 0.1791, -0.28883, -0.33999, -0.030165, 0.00057625, -4.0142, -0.56189, -0.24536, -0.90196, 0.42605, 0.30041, 0.32461, -0.094574, -0.13366, -0.049502, -0.24984, 0.023725, -0.39718, 0.17291, 0.062534, -0.34646, 0.29672, -0.11939, 0.13938, -0.50584, -0.21518, -0.73068, -0.051667, -0.069903, -0.071621, 0.38934, 0.17007, 0.19084, 0.040775, 0.0013094, 0.32719, -0.31138, 0.34084, -0.18078, -0.022643, 0.32605, 0.13847, -0.069246, 0.077059, 0.18622, -0.15879, 0.018862, 0.19157, -0.11791, 0.071837, -0.10129, 0.069355, 0.033117, 0.16184, -0.38982, -0.16356, -0.12119, 0.34246, 0.056193, -0.037565, -0.16854, 0.18214, 0.26026, -0.063287, -0.093168, -0.18505, 0.13058, 0.21269, -0.41993, -0.082688, 0.063783, 0.15828, -0.18407, -0.14867, -0.04469, 0.77545, 0.18728, 0.050465, -0.013003, -0.0708, -0.51099, -0.40859, -0.035602, -0.10428, -0.2234, 0.33998, -0.076035, -0.03264, -0.037299, -0.18082, 0.063581, 0.0051452, -0.45987, -0.21094, -0.56206, -2.3929, 0.034614, 0.40902, 0.081487, -0.0020817, 0.39564, -0.031031, -0.20894, -0.18744, -0.2481, 0.25515, -0.46924, -0.13959, 0.097956, -0.37738, 0.12116, -0.021726, -0.0059438, 0.20783, 0.20737, 0.81562, -0.67751, 0.29477, 0.18229, -0.042271, -0.26642, 0.10736, 0.22445, -0.20397, -0.49131, 0.10115, 0.12019, -0.1194, -0.016802, -0.36415, -0.080153, 0.051018, -0.44842, 0.2265, -0.021619, 0.37785, -0.24994, -0.0073165, 0.77456, 0.39436, -0.18304, 0.23201, -0.13569, -0.26692, -0.16569, -0.24404, 0.13501, 0.26217, -0.16322, 0.18089, -0.15639, 0.12229, -0.38965, 0.31833, -0.13878, 0.23247, 0.031254, -0.38539, 0.3011, -0.12461, 0.12069, -0.27977, 0.044042, -0.12104, -0.02281, -0.058538, -0.35483, 0.23713, -0.097158, -0.44894, 0.082827, 0.085183, 0.035283, -0.21025, 0.040391, -0.33818, 0.09738, 0.11225, 0.19501, -0.1562, 0.13022, -0.08153, -0.046094, 0.10796, -0.26636, 0.24266, -0.31957, 0.53819, 0.16366, 0.13988, -0.2067, 0.1314, 0.049284, 0.35513, -0.11817, 0.34368, 0.33374, -0.12097, -0.050766, 0.18658, 0.001428, -0.28846, -0.14927, 0.0037826, 0.092253, -0.079158, -0.44176, 0.45045, -0.21924, 0.17285, -0.24194, -0.18027, -0.35402, 0.018975, 0.12723, -0.25158, -0.01173, -0.21646, 0.0035812, 0.066, -0.24177, -0.23994, 0.027884, -2.7965, 0.10667, 0.13727, 0.22352, 0.17254, -0.27831, 0.080013, -0.051054, 0.63112, 0.031682, -0.42326, 0.18815, 0.38747, -0.012553, -0.21191, 0.057839, -0.057082, -0.14497, -0.38388, 0.20052, 0.055822, -0.040005, -0.051256, 0.070942, 0.027775, 0.23481, -0.26545, -0.20806, -0.093754, 0.38609, 0.040559, -0.23549, 0.30061, 0.15563, 0.046156, 0.0075508, -0.27485, -0.23408, 0.33414, -0.33647, -0.099405, -0.11208, -0.272, 0.14051, -0.18809, 0.025763, 0.038478, 0.0083357, -0.42713, -0.21649, 0.030883, -0.23209, 0.18609, 0.34209, 0.097109, -0.16616, 1.1353, 0.18662, 0.22876, 0.08164, 0.089013, -0.14447, 0.064785, 0.010417, 0.25046, -0.06917, 0.21302, -0.040679, -0.34699, -0.31536, 0.4771, -0.23487, -0.33792, 0.059178, -0.19221, 0.4562]
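
Before moving on, it is worth checking how much of the vocabulary the GloVe file covered; this sketch only assumes word_set and the dict built above:

In [ ]:
# coverage of the SST-5 vocabulary by the GloVe vectors
print(len(sst5_word_embedding_glove_dict), 'of', len(word_set), 'words found in GloVe')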

In [13]:
sst5_word_embedding_dict={}
count_in = 0
count_out = 0
for word in word_set:
    # vocabulary membership test (on gensim >= 4, go through the model's KeyedVectors)
    if word in google_model:
        count_in += 1
        sst5_word_embedding_dict[word] = google_model[word]
    else:
        count_out += 1
print('count_in = ',count_in)
print('count_out = ',count_out)


count_in =  15783
count_out =  3753
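
The 3753 out-of-vocabulary words still need vectors before training. The original notebook does not show this step; a common choice (an assumption here, following Kim-style CNN setups) is uniform random initialization:

In [ ]:
import numpy as np
# hypothetical OOV handling: random vectors for words missing from word2vec
rng = np.random.RandomState(0)
for word in word_set:
    if word not in sst5_word_embedding_dict:
        sst5_word_embedding_dict[word] = rng.uniform(-0.25, 0.25, embedding_dim)
print(len(sst5_word_embedding_dict))  # should now equal len(word_set)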

In [22]:
import pickle
sst5_word_embedding_glove_dict['like']


Out[22]:
[-0.014949,
 0.1791,
 -0.28883,
 ...,
 -0.19221,
 0.4562]

In [23]:
with open('/home/bruce/data/glove/CommonCrawl/sst5_word_embedding_glove_dict.pkl','wb') as fw:
    pickle.dump(sst5_word_embedding_glove_dict,fw)
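
To confirm the round trip, the pickled dictionary can be reloaded (a sketch):

In [ ]:
# reload the dumped dictionary and check its size
with open('/home/bruce/data/glove/CommonCrawl/sst5_word_embedding_glove_dict.pkl','rb') as fr:
    reloaded = pickle.load(fr)
print(len(reloaded))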

In [ ]: