Инициализация

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import os
import artm
import numpy as np
from sklearn.datasets import dump_svmlight_file
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
# File that receives the bag-of-words dump and is later read to build the batches.
output_file = "./output/data"
# Folder that holds the BigARTM batches and the saved dictionary.
data_folder='./batches_new'
# Name of the GEXF graph file written at the end of the notebook.
gexf_file='hcl_phi.gexf'
num_top_tokens = 20 # number of top tokens tracked per topic
num_topics = 30 # number of topics

Удаляем старые данные


In [2]:
import shutil
# Start from an empty batch folder.  On a fresh checkout the folder does not
# exist yet, so only remove it when it is actually there.
if os.path.isdir(data_folder):
    shutil.rmtree(data_folder)
os.mkdir(data_folder)

Далее вызываем CountVectorizer, чтобы отфильтровать мусор: ограничиваем документную частоту слов (min_df, max_df) и меняем регулярное выражение для токенов.


In [3]:
data_directory = '.'
emails = pd.read_csv(os.path.join(data_directory, 'Emails.csv'))
vectorizer = CountVectorizer(decode_error='replace', min_df=0.01, max_df=0.3, token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z][a-zA-Z][a-zA-Z]+\b")
X = vectorizer.fit_transform(emails.RawText).todense()

table = vectorizer.get_feature_names()
fo = open("words.txt", "w")
print >>fo, table

print X.shape


(7945L, 1993L)

In [4]:
# Make sure the folder that will hold output_file exists before writing to it.
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# Each document is labelled with its row number; features are written as
# zero-based column ids rather than as the words themselves.
y = list(range(X.shape[0]))
dump_svmlight_file(X, y, f=output_file, zero_based=True)

Создаем словарь и батчи


In [5]:
# Turn the dumped bag-of-words file into BigARTM batches stored in data_folder.
batch_vectorizer = artm.BatchVectorizer(
    data_path=output_file,
    data_format='vowpal_wabbit',
    target_folder=data_folder,
)

In [6]:
# Build the token dictionary from the batches stored in data_folder.
dictionary = artm.Dictionary()
dictionary.gather(data_path=data_folder)

In [7]:
# Save both dictionary dumps inside data_folder.  The paths are joined with
# os.path.join: plain string concatenation without a separator put the text
# dump next to the folder instead of inside it.
dictionary.save(dictionary_path=os.path.join(data_folder, 'dictionary'))
dictionary.save_text(dictionary_path=os.path.join(data_folder, 'dictionary_text'))

Создаем модель


In [8]:
topic_names = ['topic_{}'.format(i) for i in xrange(num_topics)]
# Perplexity is registered at construction time; cache_theta=True is passed so
# that the Theta matrix can be requested from the model later.
model_artm = artm.ARTM(topic_names=topic_names,
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    use_unigram_document_model=False,
                                                    dictionary=dictionary)], cache_theta=True)
# Not used: an attempt to separate the main topics from background topics did
# not work out.  The split point is computed from the list length
# (the list itself cannot be divided by a number).
half_count = len(topic_names) // 2
part1 = topic_names[:half_count]
part2 = topic_names[half_count:]

# Additional quality scores tracked during training.
model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model_artm.scores.add(artm.TopicKernelScore(name='TopicKernelScore'))
model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=num_top_tokens))

# Regularizers for the first training stage.
model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.15))

Далее учим модель в два этапа. В два этапа проще: после первого можно было что-то поменять и улучшить. Короче, я перелопатил кучу параметров в попытках получить какие-то осмысленные ядра. Но самая (на глаз) удачная модель получается с параметрами из туториала. Если же задаваться целью минимизировать перплексию, а не получить удачную на глаз модель, то порядок параметров должен быть ~0.5. Кроме того, я ещё делал разбиение на 10 основных тем и ещё 20 побочных в надежде, что 10 нужно "разредить", а 20 сгладить (topic_names=...). Но это полный бред, или я не умею это готовить!!! Ничего не помогло.


In [9]:
# First training stage: initialise the model from the dictionary and run
# 30 offline passes over the collection.
model_artm.num_document_passes = 1
model_artm.initialize(dictionary=dictionary)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=30)

def print_measures(model_artm):
    print 'Sparsity Phi: {0:.3f} (ARTM)'.format(model_artm.score_tracker['SparsityPhiScore'].last_value)
    print 'Sparsity Theta: {0:.3f} (ARTM)'.format(model_artm.score_tracker['SparsityThetaScore'].last_value)
    print 'Kernel contrast: {0:.3f} (ARTM)'.format(model_artm.score_tracker['TopicKernelScore'].last_average_contrast)
    print 'Kernel purity: {0:.3f} (ARTM)'.format(model_artm.score_tracker['TopicKernelScore'].last_average_purity)
    print 'Perplexity: {0:.3f} (ARTM)'.format(model_artm.score_tracker['PerplexityScore'].last_value)

    plt.plot(xrange(model_artm.num_phi_updates), model_artm.score_tracker['PerplexityScore'].value, 'r--', linewidth=2)
    plt.xlabel('Iterations count')
    plt.ylabel('ARTM perp. (red)')
    plt.grid(True)
    plt.show()
    
# Report the scores reached after the first training stage.
print_measures(model_artm)


Sparsity Phi: 0.639 (ARTM)
Sparsity Theta: 0.274 (ARTM)
Kernel contrast: 0.373 (ARTM)
Kernel purity: 0.848 (ARTM)
Perplexity: 709.900 (ARTM)

In [10]:
def _plot_score_history(values, ylabel):
    """Plot one tracked score against the iteration number as a red dashed line."""
    plt.plot(xrange(model_artm.num_phi_updates), values, 'r--', linewidth=2)
    plt.xlabel('Iterations count')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


# Second training stage: strengthen the regularizers registered for the first
# stage.  Their tau is updated in place instead of add()-ing new regularizers
# under names that are already taken, which also keeps this cell re-runnable.
# NOTE(review): BigARTM is expected to reject a duplicate name in add() unless
# overwriting is requested -- confirm against the installed version.
model_artm.regularizers['SparsePhi'].tau = -0.2
model_artm.regularizers['DecorrelatorPhi'].tau = 3.5e+5
model_artm.regularizers['SparseTheta'].tau = -0.2
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=20)

print_measures(model_artm)

# History of every remaining score, one figure per score.
kernel_tracker = model_artm.score_tracker['TopicKernelScore']
score_histories = [
    (model_artm.score_tracker['SparsityPhiScore'].value, 'ARTM Phi sp. (red)'),
    (model_artm.score_tracker['SparsityThetaScore'].value, 'ARTM Theta sp. (red)'),
    (kernel_tracker.average_contrast, 'ARTM KernelScore:AvgContrast sp. (red)'),
    (kernel_tracker.average_purity, 'ARTM KernelScore:AvgPurity sp. (red)'),
]
for history, label in score_histories:
    _plot_score_history(history, label)


Sparsity Phi: 0.822 (ARTM)
Sparsity Theta: 0.328 (ARTM)
Kernel contrast: 0.461 (ARTM)
Kernel purity: 0.936 (ARTM)
Perplexity: 682.705 (ARTM)

Выводим слова. Чтобы избежать проблем с кодировкой, мы записывали файл в формате vowpal с id слов вместо самих слов, поэтому теперь нужно заменить id обратно на слова.


In [11]:
def idToWord(id):
    """Return the vocabulary word stored at position ``id`` of the vectorizer.

    ``id`` may be anything ``int()`` accepts (the model hands tokens back as
    strings); it is used as an index into ``vectorizer.get_feature_names()``.
    """
    return vectorizer.get_feature_names()[int(id)]

for topic_name in model_artm.topic_names:
    print topic_name + ': ',
    print [idToWord(token) for token in model_artm.score_tracker['TopTokensScore'].last_tokens[topic_name]]


topic_0:  [u'american', u'group', u'under', u'being', u'only', u'general', u'members', u'ambassador', u'force', u'strong', u'important', u'european', u'leadership', u'europe', u'current', u'clear', u'cannot', u'threat', u'south', u'nato']
topic_1:  [u'very', u'last', u'much', u'many', u'help', u'where', u'women', u'hillary', u'working', u'best', u'great', u'because', u'family', u'question', u'doing', u'well', u'together', u'really', u'year', u'friends']
topic_2:  [u'there', u'what', u'your', u'like', u'week', u'going', u'want', u'come', u'lona', u'valmoro', u'next', u'valmorou', u'schedule', u'meet', u'dinner', u'coming', u'sure', u'night', u'tonight', u'special']
topic_3:  [u'time', u'good', u'speech', u'july', u'team', u'press', u'give', u'draft', u'send', u'questions', u'getting', u'later', u'better', u'idea', u'still', u'wrote', u'book', u'lissa', u'ready', u'minutes']
topic_4:  [u'united', u'states', u'political', u'such', u'years', u'nations', u'economic', u'between', u'term', u'relationship', u'both', u'region', u'interests', u'problems', u'commitment', u'interest', u'progress', u'different', u'leaders', u'several']
topic_5:  [u'secretary', u'office', u'meeting', u'lauren', u'conference', u'jiloty', u'update', u'staff', u'room', u'jilotylc', u'private', u'list', u'arrive', u'route', u'depart', u'assistant', u'daily', u'phone', u'residence', u'deputy']
topic_6:  [u'just', u'know', u'june', u'email', u'please', u'embassy', u'hope', u'mail', u'note', u'received', u'blackberry', u'judith', u'attachments', u'contact', u'mchale', u'thank', u'love', u'dear', u'happy', u'confidential']
topic_7:  [u'call', u'abedinh', u'sunday', u'today', u'saturday', u'tomorrow', u'august', u'talk', u'morning', u'september', u'turkey', u'text', u'thursday', u'points', u'october', u'confirmed', u'asking', u'secure', u'afternoon', u'speak']
topic_8:  [u'about', u'when', u'david', u'march', u'asked', u'wanted', u'home', u'things', u'might', u'story', u'thomas', u'heard', u'anything', u'tell', u'mark', u'something', u'saying', u'hear', u'told', u'interview']
topic_9:  [u'should', u'think', u'make', u'here', u'forward', u'below', u'anne', u'marie', u'done', u'slaughter', u'philippe', u'article', u'both', u'look', u'read', u'february', u'reines', u'internet', u'latest', u'conversation']
topic_10:  [u'government', u'country', u'military', u'while', u'support', u'effort', u'iraq', u'defense', u'must', u'local', u'conflict', u'civil', u'action', u'particularly', u'diplomatic', u'leaders', u'effective', u'diplomats', u'civilian', u'lead']
topic_11:  [u'policy', u'washington', u'pakistan', u'media', u'strategy', u'plan', u'africa', u'post', u'administration', u'program', u'chief', u'budget', u'needed', u'jones', u'financial', u'strategic', u'gates', u'including', u'dialogue', u'fund']
topic_12:  [u'more', u'clinton', u'than', u'world', u'health', u'countries', u'america', u'care', u'children', u'every', u'nation', u'education', u'half', u'families', u'child', u'small', u'less', u'need', u'often', u'cost']
topic_13:  [u'their', u'president', u'obama', u'white', u'former', u'percent', u'administration', u'campaign', u'americans', u'bush', u'republican', u'republicans', u'congress', u'democrats', u'among', u'voters', u'enough', u'economy', u'barack', u'presidential']
topic_14:  [u'would', u'vote', u'sbwhoeop', u'memo', u'likely', u'either', u'majority', u'next', u'cameron', u'short', u'perhaps', u'labour', u'whether', u'tories', u'london', u'coalition', u'legislation', u'letter', u'votes', u'until']
topic_15:  [u'people', u'into', u'those', u'rights', u'human', u'future', u'freedom', u'violence', u'history', u'anti', u'democracy', u'long', u'saudi', u'found', u'right', u'around', u'especially', u'religious', u'life', u'themselves']
topic_16:  [u'security', u'these', u'national', u'according', u'source', u'forces', u'situation', u'council', u'western', u'opinion', u'same', u'individual', u'qaddafi', u'against', u'army', u'intelligence', u'believes', u'comment', u'sources', u'groups']
topic_17:  [u'work', u'http', u'china', u'center', u'india', u'business', u'chinese', u'university', u'melanne', u'school', u'company', u'project', u'climate', u'asia', u'energy', u'board', u'foundation', u'video', u'html', u'year']
topic_18:  [u'also', u'which', u'other', u'some', u'visit', u'number', u'million', u'russia', u'food', u'problem', u'however', u'year', u'others', u'place', u'cooperation', u'legal', u'areas', u'well', u'money', u'companies']
topic_19:  [u'haiti', u'january', u'report', u'decision', u'order', u'assistance', u'honduras', u'high', u'travel', u'haitian', u'mission', u'needs', u'immediate', u'response', u'mexico', u'operations', u'central', u'zelaya', u'resolution', u'relief']
topic_20:  [u'were', u'over', u'first', u'even', u'senate', u'another', u'john', u'during', u'senator', u'never', u'little', u'committee', u'george', u'hold', u'away', u'took', u'side', u'seems', u'having', u'show']
topic_21:  [u'said', u'minister', u'news', u'told', u'says', u'officials', u'prime', u'afghanistan', u'official', u'afghan', u'police', u'reuters', u'reported', u'taliban', u'reports', u'mahogany', u'kabul', u'killed', u'october', u'month']
topic_22:  [u'sullivan', u'jacob', u'monday', u'friday', u'sullivanjj', u'richard', u'jake', u'april', u'statement', u'verma', u'burns', u'james', u'jeffrey', u'steinberg', u'william', u'preines', u'feltman', u'harold', u'jeff', u'daniel']
topic_23:  [u'party', u'deal', u'power', u'election', u'leader', u'politics', u'justice', u'against', u'gordon', u'held', u'issue', u'brown', u'parties', u'conservative', u'guardian', u'northern', u'democratic', u'ireland', u'community', u'peter']
topic_24:  [u'cheryl', u'mills', u'millscd', u'tuesday', u'wednesday', u'mailto', u'thanks', u'thursday', u'letter', u'kennedy', u'michael', u'nora', u'october', u'toiv', u'crowley', u'kelly', u'patrick', u'philip', u'daniel', u'discuss']
topic_25:  [u'been', u'most', u'take', u'issues', u'made', u'times', u'since', u'position', u'without', u'major', u'fact', u'already', u'worked', u'trade', u'always', u'germany', u'course', u'three', u'turn', u'positions']
topic_26:  [u'house', u'information', u'agreement', u'benghazi', u'sensitive', u'dept', u'produced', u'select', u'waiver', u'comm', u'foia', u'redactions', u'libya', u'libyan', u'attack', u'september', u'egypt', u'rice', u'consulate', u'stevens']
topic_27:  [u'could', u'them', u'after', u'back', u'then', u'december', u'bill', u'called', u'before', u'again', u'calls', u'trying', u'november', u'blair', u'north', u'holbrooke', u'start', u'soon', u'left', u'right']
topic_28:  [u'foreign', u'public', u'development', u'global', u'international', u'diplomacy', u'affairs', u'service', u'approach', u'agency', u'usaid', u'programs', u'planning', u'discussion', u'director', u'initiative', u'resources', u'capacity', u'goals', u'building']
topic_29:  [u'israel', u'iran', u'peace', u'israeli', u'east', u'talks', u'nuclear', u'middle', u'palestinian', u'netanyahu', u'arab', u'bank', u'west', u'reason', u'jewish', u'palestinians', u'negotiations', u'jerusalem', u'class', u'process']

Примерные темы. Часть, конечно, так себе, но я решил, что лучше взять 30 тем и иметь лишние, чем взять 25 и что-то потерять.


In [27]:
# Hand-written label for every topic, in topic index order (entry i belongs
# to topic_i).  The GEXF export cell uses these labels for the topic nodes.
real_topics_names = ["европа-нато",
"женщины",
"что-то про поход куда-то",
"про речи 1",
"политика в штатах",
"lauren??? а так кажется, что это какие-то секритарские штучки",
"бред какой-то. общие вещи 1",
"про выступления",
"бред какой-то. общие вещи 2",
"бред какой-то. общие вещи 3. что-то про Филлипины",
"про ирак",
"международная политика 1",
"здоровье клинтон/семья/дети",
"про обаму и правительство 2",
"обсуждение выборов",
"права людей",
"полика на востоке",
"бред какой-то. общие вещи 4",
"бред какой-то. общие вещи 5 (есть что-то про Россию, правда)",
"международная политика 2",
"про правительство и заседания",
"афганистан",
"куча имен. видимо какие-то переписки с людьми",
"политика и выборы",
"европа-германия",
"бред какой-то. общие вещи 6",
"про ливию",
"европа-англия",
"международная политика 3",
"ядерная программа ирана"]

Вывод матриц. Тут приведён код, который я использовал, но обычно я просто смотрел разные кусочки матрицы: её нужно всего лишь отобразить в виде картинки.


In [12]:
# Phi matrix of the trained model as a plain array; it is plotted below with
# words along the rows and topics along the columns.
phi = model_artm.get_phi().values
def plot_matrix(matrix, figsize=(25,15), xlabel='', ylabel='', title='', save=None):
    """Show ``matrix`` as an image with a colour bar.

    matrix  -- 2-D array-like passed to ``plt.imshow``.
    figsize -- size of the new figure.
    xlabel, ylabel, title -- axis labels and figure title.
    save    -- optional file path; when given, the figure is also written
               there with ``plt.savefig``.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.binary)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.colorbar()
    # The parameter used to be accepted but ignored.
    if save is not None:
        plt.savefig(save)
# The matrix can be inspected in parts; here only the first 200 words are shown.
plot_matrix(phi[:200, :], figsize=(10,10), xlabel='topic', ylabel='word', title='Phi matrix')



In [13]:
# Theta matrix of the trained model; only documents 100..199 are drawn.
theta_matrix = model_artm.get_theta().values
plot_matrix(
    theta_matrix[:, 100:200],
    figsize=(10,10),
    xlabel='document',
    ylabel='topic',
    title='theta matrix',
)



In [16]:
model = model_artm
unique, counter = {}, 1
for topic_index, topic_name in enumerate(model.topic_names):
    score_tracker = model.score_tracker['TopTokensScore']
    print topic_index
    var = score_tracker.last_tokens[topic_name]
    for i in xrange(len(var)):
        unique[idToWord(var[i])] = counter
        counter += 1


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

Если внимательно прочитать документацию, то можно узнать, что last_weights и подобные ему свойства выгружаются заново при каждом обращении, что СИЛЬНО замедляет код. Лучше сначала выгрузить их один раз в переменную, а потом обращаться к ней.


In [28]:
with open(gexf_file, 'wb') as f:
    f.write('<gexf xmlns:viz="http:///www.gexf.net/1.1draft/viz" xmlns="http://www.gexf.net/1.1draft" version="1.1">\n')
    f.write('<meta lastmodifieddate="2010-03-03+23:44">\n')
    f.write('<creator>Gephi 0.7</creator>\n')
    f.write('</meta>\n')
    f.write('<graph defaultedgetype="undirected" idtype="string" type="static">\n')
    f.write('<nodes count="{}">\n'.format(len(unique) + num_topics))
    for token, value in unique.iteritems():
        f.write('<node id="{0}" label="{1}"/>\n'.format(value, token))
        for id in xrange(num_topics):
            f.write('<node id="{0}" label="TOPIC_{1}|{2}"/>\n'.format(counter + id, id, real_topics_names[id]))
    f.write('</nodes>\n')
    edge_id = 0
    strs_to_write = []
    p = 0
    topic_tokens = []
    topic_weights = []
    for topic_index, topic_name in enumerate(model.topic_names):
        var = score_tracker.last_tokens[topic_name]
        topic_tokens.append([idToWord(item) for item in var])
        topic_weights.append(score_tracker.last_weights[topic_name])
    print topic_tokens
    for token, value in unique.iteritems():
        p += 1
        print "Process %s, %d, %d of %d" % (token, value, p, counter)
        for topic_index, topic_name in enumerate(model.topic_names):
            score_tracker = model.score_tracker['TopTokensScore']
            for i in xrange(len(topic_weights[topic_index])):
                if topic_tokens[topic_index][i] == token:
                    strs_to_write.append('<edge id="{0}" source="{1}" target="{2}" weight="{3}"/>\n'.format(
                        edge_id, counter + topic_index, value, topic_weights[topic_index][i]))
                    edge_id += 1
    f.write('<edges count="{}">\n'.format(len(strs_to_write)))
    for elem in strs_to_write:
        f.write(elem)
    f.write('</edges>\n')
    f.write('</graph>\n')
    f.write('</gexf>\n')


[[u'american', u'group', u'under', u'being', u'only', u'general', u'members', u'ambassador', u'force', u'strong', u'important', u'european', u'leadership', u'europe', u'current', u'clear', u'cannot', u'threat', u'south', u'nato'], [u'very', u'last', u'much', u'many', u'help', u'where', u'women', u'hillary', u'working', u'best', u'great', u'because', u'family', u'question', u'doing', u'well', u'together', u'really', u'year', u'friends'], [u'there', u'what', u'your', u'like', u'week', u'going', u'want', u'come', u'lona', u'valmoro', u'next', u'valmorou', u'schedule', u'meet', u'dinner', u'coming', u'sure', u'night', u'tonight', u'special'], [u'time', u'good', u'speech', u'july', u'team', u'press', u'give', u'draft', u'send', u'questions', u'getting', u'later', u'better', u'idea', u'still', u'wrote', u'book', u'lissa', u'ready', u'minutes'], [u'united', u'states', u'political', u'such', u'years', u'nations', u'economic', u'between', u'term', u'relationship', u'both', u'region', u'interests', u'problems', u'commitment', u'interest', u'progress', u'different', u'leaders', u'several'], [u'secretary', u'office', u'meeting', u'lauren', u'conference', u'jiloty', u'update', u'staff', u'room', u'jilotylc', u'private', u'list', u'arrive', u'route', u'depart', u'assistant', u'daily', u'phone', u'residence', u'deputy'], [u'just', u'know', u'june', u'email', u'please', u'embassy', u'hope', u'mail', u'note', u'received', u'blackberry', u'judith', u'attachments', u'contact', u'mchale', u'thank', u'love', u'dear', u'happy', u'confidential'], [u'call', u'abedinh', u'sunday', u'today', u'saturday', u'tomorrow', u'august', u'talk', u'morning', u'september', u'turkey', u'text', u'thursday', u'points', u'october', u'confirmed', u'asking', u'secure', u'afternoon', u'speak'], [u'about', u'when', u'david', u'march', u'asked', u'wanted', u'home', u'things', u'might', u'story', u'thomas', u'heard', u'anything', u'tell', u'mark', u'something', u'saying', u'hear', u'told', u'interview'], 
[u'should', u'think', u'make', u'here', u'forward', u'below', u'anne', u'marie', u'done', u'slaughter', u'philippe', u'article', u'both', u'look', u'read', u'february', u'reines', u'internet', u'latest', u'conversation'], [u'government', u'country', u'military', u'while', u'support', u'effort', u'iraq', u'defense', u'must', u'local', u'conflict', u'civil', u'action', u'particularly', u'diplomatic', u'leaders', u'effective', u'diplomats', u'civilian', u'lead'], [u'policy', u'washington', u'pakistan', u'media', u'strategy', u'plan', u'africa', u'post', u'administration', u'program', u'chief', u'budget', u'needed', u'jones', u'financial', u'strategic', u'gates', u'including', u'dialogue', u'fund'], [u'more', u'clinton', u'than', u'world', u'health', u'countries', u'america', u'care', u'children', u'every', u'nation', u'education', u'half', u'families', u'child', u'small', u'less', u'need', u'often', u'cost'], [u'their', u'president', u'obama', u'white', u'former', u'percent', u'administration', u'campaign', u'americans', u'bush', u'republican', u'republicans', u'congress', u'democrats', u'among', u'voters', u'enough', u'economy', u'barack', u'presidential'], [u'would', u'vote', u'sbwhoeop', u'memo', u'likely', u'either', u'majority', u'next', u'cameron', u'short', u'perhaps', u'labour', u'whether', u'tories', u'london', u'coalition', u'legislation', u'letter', u'votes', u'until'], [u'people', u'into', u'those', u'rights', u'human', u'future', u'freedom', u'violence', u'history', u'anti', u'democracy', u'long', u'saudi', u'found', u'right', u'around', u'especially', u'religious', u'life', u'themselves'], [u'security', u'these', u'national', u'according', u'source', u'forces', u'situation', u'council', u'western', u'opinion', u'same', u'individual', u'qaddafi', u'against', u'army', u'intelligence', u'believes', u'comment', u'sources', u'groups'], [u'work', u'http', u'china', u'center', u'india', u'business', u'chinese', u'university', u'melanne', u'school', u'company', 
u'project', u'climate', u'asia', u'energy', u'board', u'foundation', u'video', u'html', u'year'], [u'also', u'which', u'other', u'some', u'visit', u'number', u'million', u'russia', u'food', u'problem', u'however', u'year', u'others', u'place', u'cooperation', u'legal', u'areas', u'well', u'money', u'companies'], [u'haiti', u'january', u'report', u'decision', u'order', u'assistance', u'honduras', u'high', u'travel', u'haitian', u'mission', u'needs', u'immediate', u'response', u'mexico', u'operations', u'central', u'zelaya', u'resolution', u'relief'], [u'were', u'over', u'first', u'even', u'senate', u'another', u'john', u'during', u'senator', u'never', u'little', u'committee', u'george', u'hold', u'away', u'took', u'side', u'seems', u'having', u'show'], [u'said', u'minister', u'news', u'told', u'says', u'officials', u'prime', u'afghanistan', u'official', u'afghan', u'police', u'reuters', u'reported', u'taliban', u'reports', u'mahogany', u'kabul', u'killed', u'october', u'month'], [u'sullivan', u'jacob', u'monday', u'friday', u'sullivanjj', u'richard', u'jake', u'april', u'statement', u'verma', u'burns', u'james', u'jeffrey', u'steinberg', u'william', u'preines', u'feltman', u'harold', u'jeff', u'daniel'], [u'party', u'deal', u'power', u'election', u'leader', u'politics', u'justice', u'against', u'gordon', u'held', u'issue', u'brown', u'parties', u'conservative', u'guardian', u'northern', u'democratic', u'ireland', u'community', u'peter'], [u'cheryl', u'mills', u'millscd', u'tuesday', u'wednesday', u'mailto', u'thanks', u'thursday', u'letter', u'kennedy', u'michael', u'nora', u'october', u'toiv', u'crowley', u'kelly', u'patrick', u'philip', u'daniel', u'discuss'], [u'been', u'most', u'take', u'issues', u'made', u'times', u'since', u'position', u'without', u'major', u'fact', u'already', u'worked', u'trade', u'always', u'germany', u'course', u'three', u'turn', u'positions'], [u'house', u'information', u'agreement', u'benghazi', u'sensitive', u'dept', u'produced', 
u'select', u'waiver', u'comm', u'foia', u'redactions', u'libya', u'libyan', u'attack', u'september', u'egypt', u'rice', u'consulate', u'stevens'], [u'could', u'them', u'after', u'back', u'then', u'december', u'bill', u'called', u'before', u'again', u'calls', u'trying', u'november', u'blair', u'north', u'holbrooke', u'start', u'soon', u'left', u'right'], [u'foreign', u'public', u'development', u'global', u'international', u'diplomacy', u'affairs', u'service', u'approach', u'agency', u'usaid', u'programs', u'planning', u'discussion', u'director', u'initiative', u'resources', u'capacity', u'goals', u'building'], [u'israel', u'iran', u'peace', u'israeli', u'east', u'talks', u'nuclear', u'middle', u'palestinian', u'netanyahu', u'arab', u'bank', u'west', u'reason', u'jewish', u'palestinians', u'negotiations', u'jerusalem', u'class', u'process']]
Process chinese, 347, 1 of 601
Process diplomacy, 566, 2 of 601
Process global, 564, 3 of 601
Process month, 440, 4 of 601
Process cameron, 289, 5 of 601
Process religious, 318, 6 of 601
Process children, 249, 7 of 601
Process issues, 504, 8 of 601
Process votes, 299, 9 of 601
Process show, 420, 10 of 601
Process send, 69, 11 of 601
Process consulate, 539, 12 of 601
Process nora, 492, 13 of 601
Process program, 230, 14 of 601
Process under, 3, 15 of 601
Process women, 27, 16 of 601
Process resources, 577, 17 of 601
Process brown, 472, 18 of 601
Process confidential, 140, 19 of 601
Process very, 21, 20 of 601
Process every, 250, 21 of 601
Process jacob, 442, 22 of 601
Process school, 350, 23 of 601
Process issue, 471, 24 of 601
Process jiloty, 106, 25 of 601
Process list, 112, 26 of 601
Process wednesday, 485, 27 of 601
Process saudi, 313, 28 of 601
Process team, 65, 29 of 601
Process small, 256, 30 of 601
Process force, 9, 31 of 601
Process leaders, 216, 32 of 601
Process talks, 586, 33 of 601
Process likely, 285, 34 of 601
Process cost, 260, 35 of 601
Process video, 358, 36 of 601
Process even, 404, 37 of 601
Process afghanistan, 428, 38 of 601
Process what, 42, 39 of 601
Process power, 463, 40 of 601
Process richard, 446, 41 of 601
Process qaddafi, 333, 42 of 601
Process assistance, 386, 43 of 601
Process crowley, 495, 44 of 601
Process asia, 354, 45 of 601
Process current, 15, 46 of 601
Process international, 565, 47 of 601
Process course, 517, 48 of 601
Process public, 562, 49 of 601
Process told, 424, 50 of 601
Process philippe, 191, 51 of 601
Process friday, 444, 52 of 601
Process iran, 582, 53 of 601
Process guardian, 475, 54 of 601
Process never, 410, 55 of 601
Process here, 184, 56 of 601
Process reported, 433, 57 of 601
Process china, 343, 58 of 601
Process groups, 340, 59 of 601
Process others, 373, 60 of 601
Process november, 553, 61 of 601
Process iraq, 207, 62 of 601
Process great, 31, 63 of 601
Process reports, 435, 64 of 601
Process action, 213, 65 of 601
Process military, 203, 66 of 601
Process love, 137, 67 of 601
Process secure, 158, 68 of 601
Process campaign, 268, 69 of 601
Process deputy, 120, 70 of 601
Process egypt, 537, 71 of 601
Process africa, 227, 72 of 601
Process private, 111, 73 of 601
Process mailto, 486, 74 of 601
Process thursday, 488, 75 of 601
Process select, 528, 76 of 601
Process would, 281, 77 of 601
Process army, 335, 78 of 601
Process mexico, 395, 79 of 601
Process visit, 365, 80 of 601
Process bush, 270, 81 of 601
Process next, 288, 82 of 601
Process call, 141, 83 of 601
Process themselves, 320, 84 of 601
Process until, 300, 85 of 601
Process today, 144, 86 of 601
Process more, 241, 87 of 601
Process israel, 581, 88 of 601
Process company, 351, 89 of 601
Process phone, 118, 90 of 601
Process hold, 414, 91 of 601
Process effort, 206, 92 of 601
Process must, 209, 93 of 601
Process high, 388, 94 of 601
Process room, 109, 95 of 601
Process rights, 304, 96 of 601
Process work, 341, 97 of 601
Process coalition, 296, 98 of 601
Process believes, 337, 99 of 601
Process meet, 54, 100 of 601
Process arrive, 113, 101 of 601
Process could, 541, 102 of 601
Process history, 309, 103 of 601
Process want, 47, 104 of 601
Process give, 67, 105 of 601
Process process, 600, 106 of 601
Process december, 546, 107 of 601
Process india, 345, 108 of 601
Process states, 82, 109 of 601
Process heard, 172, 110 of 601
Process something, 176, 111 of 601
Process council, 328, 112 of 601
Process made, 505, 113 of 601
Process information, 522, 114 of 601
Process needs, 392, 115 of 601
Process united, 81, 116 of 601
Process democracy, 311, 117 of 601
Process turn, 519, 118 of 601
Process travel, 389, 119 of 601
Process kabul, 437, 120 of 601
Process sunday, 143, 121 of 601
Process usaid, 571, 122 of 601
Process interview, 180, 123 of 601
Process economy, 278, 124 of 601
Process after, 543, 125 of 601
Process different, 98, 126 of 601
Process coming, 56, 127 of 601
Process president, 262, 128 of 601
Process response, 394, 129 of 601
Process times, 506, 130 of 601
Process tories, 294, 131 of 601
Process short, 290, 132 of 601
Process effective, 217, 133 of 601
Process sensitive, 525, 134 of 601
Process chief, 231, 135 of 601
Process things, 168, 136 of 601
Process democratic, 477, 137 of 601
Process order, 385, 138 of 601
Process talk, 148, 139 of 601
Process operations, 396, 140 of 601
Process help, 25, 141 of 601
Process office, 102, 142 of 601
Process september, 536, 143 of 601
Process over, 402, 144 of 601
Process mission, 391, 145 of 601
Process years, 85, 146 of 601
Process produced, 527, 147 of 601
Process held, 470, 148 of 601
Process embassy, 126, 149 of 601
Process including, 238, 150 of 601
Process committee, 412, 151 of 601
Process preines, 456, 152 of 601
Process still, 75, 153 of 601
Process before, 549, 154 of 601
Process group, 2, 155 of 601
Process thank, 136, 156 of 601
Process better, 73, 157 of 601
Process reuters, 432, 158 of 601
Process policy, 221, 159 of 601
Process mail, 128, 160 of 601
Process might, 169, 161 of 601
Process then, 545, 162 of 601
Process them, 542, 163 of 601
Process good, 62, 164 of 601
Process civilian, 219, 165 of 601
Process food, 369, 166 of 601
Process nation, 251, 167 of 601
Process taliban, 434, 168 of 601
Process half, 253, 169 of 601
Process republican, 271, 170 of 601
Process killed, 438, 171 of 601
Process presidential, 280, 172 of 601
Process february, 196, 173 of 601
Process january, 382, 174 of 601
Process university, 348, 175 of 601
Process redactions, 532, 176 of 601
Process positions, 520, 177 of 601
Process done, 189, 178 of 601
Process side, 417, 179 of 601
Process development, 563, 180 of 601
Process financial, 235, 181 of 601
Process attachments, 133, 182 of 601
Process doing, 35, 183 of 601
Process companies, 380, 184 of 601
Process energy, 355, 185 of 601
Process idea, 74, 186 of 601
Process police, 431, 187 of 601
Process year, 372, 188 of 601
Process saturday, 145, 189 of 601
Process special, 60, 190 of 601
Process really, 38, 191 of 601
Process jerusalem, 598, 192 of 601
Process since, 507, 193 of 601
Process george, 413, 194 of 601
Process daniel, 499, 195 of 601
Process health, 245, 196 of 601
Process internet, 198, 197 of 601
Process judith, 132, 198 of 601
Process foundation, 357, 199 of 601
Process harold, 458, 200 of 601
Process million, 367, 201 of 601
Process little, 411, 202 of 601
Process reason, 594, 203 of 601
Process members, 7, 204 of 601
Process philip, 498, 205 of 601
Process wanted, 166, 206 of 601
Process care, 248, 207 of 601
Process route, 114, 208 of 601
Process david, 163, 209 of 601
Process conversation, 200, 210 of 601
Process american, 1, 211 of 601
Process place, 374, 212 of 601
Process gordon, 469, 213 of 601
Process threat, 18, 214 of 601
Process think, 182, 215 of 601
Process south, 19, 216 of 601
Process first, 403, 217 of 601
Process major, 510, 218 of 601
Process already, 512, 219 of 601
Process stevens, 540, 220 of 601
Process question, 34, 221 of 601
Process number, 366, 222 of 601
Process strategic, 236, 223 of 601
Process americans, 269, 224 of 601
Process another, 406, 225 of 601
Process such, 84, 226 of 601
Process vote, 282, 227 of 601
Process says, 425, 228 of 601
Process tomorrow, 146, 229 of 601
Process story, 170, 230 of 601
Process officials, 426, 231 of 601
Process service, 568, 232 of 601
Process their, 261, 233 of 601
Process needed, 233, 234 of 601
Process attack, 535, 235 of 601
Process draft, 68, 236 of 601
Process statement, 449, 237 of 601
Process which, 362, 238 of 601
Process white, 264, 239 of 601
Process john, 407, 240 of 601
Process discussion, 574, 241 of 601
Process interests, 93, 242 of 601
Process relationship, 90, 243 of 601
Process memo, 284, 244 of 601
Process took, 416, 245 of 601
Process immediate, 393, 246 of 601
Process hillary, 28, 247 of 601
Process western, 329, 248 of 601
Process july, 64, 249 of 601
Process than, 243, 250 of 601
Process obama, 263, 251 of 601
Process lissa, 78, 252 of 601
Process diplomatic, 215, 253 of 601
Process nations, 86, 254 of 601
Process project, 352, 255 of 601
Process future, 306, 256 of 601
Process were, 401, 257 of 601
Process russia, 368, 258 of 601
Process benghazi, 524, 259 of 601
Process jilotylc, 110, 260 of 601
Process tonight, 59, 261 of 601
Process need, 258, 262 of 601
Process affairs, 567, 263 of 601
Process agency, 570, 264 of 601
Process mills, 482, 265 of 601
Process note, 129, 266 of 601
Process majority, 287, 267 of 601
Process labour, 292, 268 of 601
Process take, 503, 269 of 601
Process forces, 326, 270 of 601
Process sure, 57, 271 of 601
Process most, 502, 272 of 601
Process germany, 516, 273 of 601
Process plan, 226, 274 of 601
Process letter, 489, 275 of 601
Process america, 247, 276 of 601
Process class, 599, 277 of 601
Process especially, 317, 278 of 601
Process clear, 16, 279 of 601
Process later, 72, 280 of 601
Process points, 154, 281 of 601
Process left, 559, 282 of 601
Process fact, 511, 283 of 601
Process saying, 177, 284 of 601
Process thomas, 171, 285 of 601
Process particularly, 214, 286 of 601
Process gates, 237, 287 of 601
Process text, 152, 288 of 601
Process planning, 573, 289 of 601
Process democrats, 274, 290 of 601
Process staff, 108, 291 of 601
Process northern, 476, 292 of 601
Process justice, 467, 293 of 601
Process should, 181, 294 of 601
Process only, 5, 295 of 601
Process going, 46, 296 of 601
Process money, 379, 297 of 601
Process rice, 538, 298 of 601
Process local, 210, 299 of 601
Process hope, 127, 300 of 601
Process assistant, 116, 301 of 601
Process soon, 558, 302 of 601
Process cannot, 17, 303 of 601
Process report, 383, 304 of 601
Process during, 408, 305 of 601
Process haiti, 381, 306 of 601
Process trade, 514, 307 of 601
Process areas, 377, 308 of 601
Process prime, 427, 309 of 601
Process countries, 246, 310 of 601
Process millscd, 483, 311 of 601
Process morning, 149, 312 of 601
Process london, 295, 313 of 601
Process where, 26, 314 of 601
Process wrote, 76, 315 of 601
Process secretary, 101, 316 of 601
Process human, 305, 317 of 601
Process intelligence, 336, 318 of 601
Process national, 323, 319 of 601
Process defense, 208, 320 of 601
Process best, 30, 321 of 601
Process israeli, 584, 322 of 601
Process said, 421, 323 of 601
Process capacity, 578, 324 of 601
Process nuclear, 587, 325 of 601
Process away, 415, 326 of 601
Process please, 125, 327 of 601
Process enough, 277, 328 of 601
Process cooperation, 375, 329 of 601
Process between, 88, 330 of 601
Process progress, 97, 331 of 601
Process approach, 569, 332 of 601
Process email, 124, 333 of 601
Process europe, 14, 334 of 601
Process august, 147, 335 of 601
Process however, 371, 336 of 601
Process comm, 530, 337 of 601
Process article, 192, 338 of 601
Process come, 48, 339 of 601
Process sullivanjj, 445, 340 of 601
Process both, 193, 341 of 601
Process climate, 353, 342 of 601
Process last, 22, 343 of 601
Process libya, 533, 344 of 601
Process always, 515, 345 of 601
Process country, 202, 346 of 601
Process region, 92, 347 of 601
Process according, 324, 348 of 601
Process against, 468, 349 of 601
Process foreign, 561, 350 of 601
Process april, 448, 351 of 601
Process senator, 409, 352 of 601
Process asked, 165, 353 of 601
Process comment, 338, 354 of 601
Process among, 275, 355 of 601
Process diplomats, 218, 356 of 601
Process jeff, 459, 357 of 601
Process sullivan, 441, 358 of 601
Process tuesday, 484, 359 of 601
Process community, 479, 360 of 601
Process anti, 310, 361 of 601
Process news, 423, 362 of 601
Process speak, 160, 363 of 601
Process turkey, 151, 364 of 601
Process valmorou, 52, 365 of 601
Process strong, 10, 366 of 601
Process west, 593, 367 of 601
Process political, 83, 368 of 601
Process three, 518, 369 of 601
Process been, 501, 370 of 601
Process mark, 175, 371 of 601
Process much, 23, 372 of 601
Process interest, 96, 373 of 601
Process meeting, 103, 374 of 601
Process dialogue, 239, 375 of 601
Process families, 254, 376 of 601
Process received, 130, 377 of 601
Process feltman, 457, 378 of 601
Process republicans, 272, 379 of 601
Process palestinian, 589, 380 of 601
Process fund, 240, 381 of 601
Process minister, 422, 382 of 601
Process child, 255, 383 of 601
Process worked, 513, 384 of 601
Process former, 265, 385 of 601
Process those, 303, 386 of 601
Process look, 194, 387 of 601
Process these, 322, 388 of 601
Process bill, 547, 389 of 601
Process budget, 232, 390 of 601
Process dept, 526, 391 of 601
Process while, 204, 392 of 601
Process valmoro, 50, 393 of 601
Process steinberg, 454, 394 of 601
Process many, 24, 395 of 601
Process situation, 327, 396 of 601
Process voters, 276, 397 of 601
Process pakistan, 223, 398 of 601
Process middle, 588, 399 of 601
Process ready, 79, 400 of 601
Process confirmed, 156, 401 of 601
Process james, 452, 402 of 601
Process perhaps, 291, 403 of 601
Process media, 224, 404 of 601
Process make, 183, 405 of 601
Process administration, 267, 406 of 601
Process same, 331, 407 of 601
Process html, 359, 408 of 601
Process speech, 63, 409 of 601
Process party, 461, 410 of 601
Process several, 100, 411 of 601
Process conflict, 211, 412 of 601
Process european, 12, 413 of 601
Process week, 45, 414 of 601
Process jeffrey, 453, 415 of 601
Process http, 342, 416 of 601
Process blair, 554, 417 of 601
Process director, 575, 418 of 601
Process opinion, 330, 419 of 601
Process afghan, 430, 420 of 601
Process toiv, 494, 421 of 601
Process center, 344, 422 of 601
Process zelaya, 398, 423 of 601
Process programs, 572, 424 of 601
Process holbrooke, 556, 425 of 601
Process well, 378, 426 of 601
Process without, 509, 427 of 601
Process contact, 134, 428 of 601
Process relief, 400, 429 of 601
Process kelly, 496, 430 of 601
Process position, 508, 431 of 601
Process latest, 199, 432 of 601
Process ambassador, 8, 433 of 601
Process just, 121, 434 of 601
Process less, 257, 435 of 601
Process being, 4, 436 of 601
Process when, 162, 437 of 601
Process sources, 339, 438 of 601
Process thanks, 487, 439 of 601
Process questions, 70, 440 of 601
Process depart, 115, 441 of 601
Process lauren, 104, 442 of 601
Process mchale, 135, 443 of 601
Process family, 33, 444 of 601
Process also, 361, 445 of 601
Process seems, 418, 446 of 601
Process discuss, 500, 447 of 601
Process barack, 279, 448 of 601
Process source, 325, 449 of 601
Process other, 363, 450 of 601
Process nato, 20, 451 of 601
Process board, 356, 452 of 601
Process east, 585, 453 of 601
Process bank, 592, 454 of 601
Process cheryl, 481, 455 of 601
Process jake, 447, 456 of 601
Process march, 164, 457 of 601
Process around, 316, 458 of 601
Process government, 201, 459 of 601
Process read, 195, 460 of 601
Process libyan, 534, 461 of 601
Process term, 89, 462 of 601
Process arab, 591, 463 of 601
Process know, 122, 464 of 601
Process press, 66, 465 of 601
Process world, 244, 466 of 601
Process burns, 451, 467 of 601
Process like, 44, 468 of 601
Process kennedy, 490, 469 of 601
Process patrick, 497, 470 of 601
Process michael, 491, 471 of 601
Process blackberry, 131, 472 of 601
Process night, 58, 473 of 601
Process initiative, 576, 474 of 601
Process security, 321, 475 of 601
Process because, 32, 476 of 601
Process often, 259, 477 of 601
Process deal, 462, 478 of 601
Process people, 301, 479 of 601
Process senate, 405, 480 of 601
Process some, 364, 481 of 601
Process back, 544, 482 of 601
Process economic, 87, 483 of 601
Process election, 464, 484 of 601
Process dear, 138, 485 of 601
Process home, 167, 486 of 601
Process peter, 480, 487 of 601
Process monday, 443, 488 of 601
Process lead, 220, 489 of 601
Process decision, 384, 490 of 601
Process legal, 376, 491 of 601
Process conservative, 474, 492 of 601
Process asking, 157, 493 of 601
Process anne, 187, 494 of 601
Process leader, 465, 495 of 601
Process either, 286, 496 of 601
Process conference, 105, 497 of 601
Process business, 346, 498 of 601
Process schedule, 53, 499 of 601
Process problems, 94, 500 of 601
Process agreement, 523, 501 of 601
Process slaughter, 190, 502 of 601
Process leadership, 13, 503 of 601
Process found, 314, 504 of 601
Process post, 228, 505 of 601
Process about, 161, 506 of 601
Process central, 397, 507 of 601
Process working, 29, 508 of 601
Process anything, 173, 509 of 601
Process getting, 71, 510 of 601
Process freedom, 307, 511 of 601
Process violence, 308, 512 of 601
Process dinner, 55, 513 of 601
Process afternoon, 159, 514 of 601
Process palestinians, 596, 515 of 601
Process clinton, 242, 516 of 601
Process civil, 212, 517 of 601
Process into, 302, 518 of 601
Process washington, 222, 519 of 601
Process commitment, 95, 520 of 601
Process right, 560, 521 of 601
Process parties, 473, 522 of 601
Process honduras, 387, 523 of 601
Process strategy, 225, 524 of 601
Process your, 43, 525 of 601
Process marie, 188, 526 of 601
Process lona, 49, 527 of 601
Process support, 205, 528 of 601
Process there, 41, 529 of 601
Process legislation, 297, 530 of 601
Process long, 312, 531 of 601
Process sbwhoeop, 283, 532 of 601
Process start, 557, 533 of 601
Process house, 521, 534 of 601
Process forward, 185, 535 of 601
Process mahogany, 436, 536 of 601
Process happy, 139, 537 of 601
Process building, 580, 538 of 601
Process north, 555, 539 of 601
Process ireland, 478, 540 of 601
Process jewish, 595, 541 of 601
Process hear, 178, 542 of 601
Process goals, 579, 543 of 601
Process trying, 552, 544 of 601
Process haitian, 390, 545 of 601
Process october, 493, 546 of 601
Process melanne, 349, 547 of 601
Process whether, 293, 548 of 601
Process official, 429, 549 of 601
Process verma, 450, 550 of 601
Process tell, 174, 551 of 601
Process below, 186, 552 of 601
Process problem, 370, 553 of 601
Process minutes, 80, 554 of 601
Process called, 548, 555 of 601
Process life, 319, 556 of 601
Process general, 6, 557 of 601
Process together, 37, 558 of 601
Process politics, 466, 559 of 601
Process education, 252, 560 of 601
Process negotiations, 597, 561 of 601
Process again, 550, 562 of 601
Process abedinh, 142, 563 of 601
Process peace, 583, 564 of 601
Process percent, 266, 565 of 601
Process book, 77, 566 of 601
Process residence, 119, 567 of 601
Process congress, 273, 568 of 601
Process june, 123, 569 of 601
Process update, 107, 570 of 601
Process william, 455, 571 of 601
Process important, 11, 572 of 601
Process netanyahu, 590, 573 of 601
Process friends, 40, 574 of 601
Process foia, 531, 575 of 601
Process jones, 234, 576 of 601
Process individual, 332, 577 of 601
Process calls, 551, 578 of 601
Process reines, 197, 579 of 601
Process daily, 117, 580 of 601
Process time, 61, 581 of 601
Process resolution, 399, 582 of 601
Process having, 419, 583 of 601
Process waiver, 529, 584 of 601

In [ ]:


In [ ]: