In [1]:
import os
import numpy as np
import pandas as pd
import ast
import re
import csv

import nltk
#from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
ps = PorterStemmer()
lemma = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer


import gensim
from gensim import corpora, models
os.chdir(os.path.expanduser('~/Codes/DL - Topic Modelling'))  # expanduser so the '~' path resolves correctly
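
The tokenizer, stop-word list and lemmatizer used below rely on NLTK corpora being available locally. A one-time download along these lines (not part of the original notebook) makes the section reproducible:

nltk.download('punkt')       # needed by nltk.tokenize.word_tokenize
nltk.download('stopwords')   # needed by stopwords.words('english')
nltk.download('wordnet')     # needed by WordNetLemmatizer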

In [2]:
dir_src = os.path.join(os.getcwd(), 'data/raw_20news/20news-18828')

dir_src_classes = list( map(lambda x: os.path.join(dir_src, x ), os.listdir(dir_src)) )

In [3]:
dat = []
dat_y = []
dat_y_cat = []

for i in range(0,len(dir_src_classes)):
    
    print('Currently loading the following topic (iteration ' + str(i) + '):\n \t' + dir_src_classes[i])
    dir_src_classes_file = list( map(lambda x: os.path.join(dir_src_classes[i], x), os.listdir(dir_src_classes[i])) )
    
    for ii in range(0, len(dir_src_classes_file)):
        
        dat_y.append(i)
        
        with open(dir_src_classes_file[ii], encoding='ISO-8859-1') as file:
            dat.append(file.read().replace('\n', ' '))


Currently loading the following topic (iteration 0):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/alt.atheism
Currently loading the following topic (iteration 1):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/comp.graphics
Currently loading the following topic (iteration 2):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/comp.os.ms-windows.misc
Currently loading the following topic (iteration 3):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/comp.sys.ibm.pc.hardware
Currently loading the following topic (iteration 4):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/comp.sys.mac.hardware
Currently loading the following topic (iteration 5):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/comp.windows.x
Currently loading the following topic (iteration 6):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/misc.forsale
Currently loading the following topic (iteration 7):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/rec.autos
Currently loading the following topic (iteration 8):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/rec.motorcycles
Currently loading the following topic (iteration 9):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/rec.sport.baseball
Currently loading the following topic (iteration 10):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/rec.sport.hockey
Currently loading the following topic (iteration 11):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/sci.crypt
Currently loading the following topic (iteration 12):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/sci.electronics
Currently loading the following topic (iteration 13):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/sci.med
Currently loading the following topic (iteration 14):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/sci.space
Currently loading the following topic (iteration 15):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/soc.religion.christian
Currently loading the following topic (iteration 16):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/talk.politics.guns
Currently loading the following topic (iteration 17):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/talk.politics.mideast
Currently loading the following topic (iteration 18):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/talk.politics.misc
Currently loading the following topic (iteration 19):
 	/home/ekhongl/Codes/DL - Topic Modelling/data/raw_20news/20news-18828/talk.religion.misc

In [4]:
# map the 20 granular newsgroups (0-19) onto 6 broader topics (0-5)
y_map = [3,0,0,0,0,0,5,1,1,1,1,2,2,2,2,3,4,4,4,3]
dat_y2 = [y_map[idx] for idx in dat_y]
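
Read against the alphabetical folder order printed above, y_map groups the newsgroups as: comp.* -> 0, rec.* -> 1, sci.* -> 2, the religion-related groups (alt.atheism, soc.religion.christian, talk.religion.misc) -> 3, talk.politics.* -> 4 and misc.forsale -> 5. A quick check (not in the original notebook) that makes the grouping explicit:

# illustrative sanity check: show which broad topic each newsgroup folder maps to
for folder, broad in zip(map(os.path.basename, dir_src_classes), y_map):
    print(broad, folder)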

In [5]:
#export data
pd.DataFrame( { '_label_granular' : dat_y, 
                '_label_overview' : dat_y2,
                'document' : [' '.join(re.sub('[^a-zA-Z]+', ' ', doc).strip().split()) for doc in dat]}). \
                to_csv('data/raw_20news/20news.csv',
                    index=False, sep=',', encoding='ISO-8859-1')
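
The regular expression keeps only letters and collapses the remaining whitespace, which is why the exported documents look like the rows displayed at the end of this notebook. A small, self-contained illustration (the sample string is hypothetical):

sample = "From: mathew <mathew@mantis.co.uk>\nSubject: Alt.Atheism FAQ"
print(' '.join(re.sub('[^a-zA-Z]+', ' ', sample).strip().split()))
# -> From mathew mathew mantis co uk Subject Alt Atheism FAQ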

In [6]:
print('------- Data cleaning -------')                
stopwords_en = stopwords.words('english')
dat_clean = []
for i in range(len(dat)):

    ''' tokenization and punctuation removal '''
    # uses nltk tokenization - e.g. shouldn't = [should, n't] instead of [shouldn, 't]
    tmp_doc = nltk.tokenize.word_tokenize(dat[i].lower())
    
    # split words separated by full stops
    tmp_doc_split = [w.split('.') for w in tmp_doc if len(w.split('.')) > 1]
    # flatten list
    tmp_doc_split = [i_sublist for i_list in tmp_doc_split for i_sublist in i_list]
    # clean split words
    tmp_doc_split = [w for w in tmp_doc_split if re.search('^[a-z]+$',w)]
    
    # drop punctuation and any other non-alphabetic tokens
    tmp_doc_clean = [w for w in tmp_doc if re.search('^[a-z]+$',w)]
    tmp_doc_clean.extend(tmp_doc_split)

    ''' stop word removal'''
    tmp_doc_clean_stop = [w for w in tmp_doc_clean if w not in stopwords_en]
    #retain only words longer than 2 characters
    tmp_doc_clean_stop = [w for w in  tmp_doc_clean_stop if len(w) >2]
    
    ''' stemming (using the Porter algorithm) '''
    tmp_doc_clean_stop_stemmed = [ps.stem(w) for w in tmp_doc_clean_stop]
    dat_clean.append(tmp_doc_clean_stop_stemmed)
    
    #print progress
    if i % 100 == 0: print( 'Current progress: ' + str(i) + '/' + str(len(dat)) )


------- Data cleaning -------
Current progress: 0/18828
Current progress: 100/18828
Current progress: 200/18828
Current progress: 300/18828
Current progress: 400/18828
Current progress: 500/18828
Current progress: 600/18828
Current progress: 700/18828
Current progress: 800/18828
Current progress: 900/18828
Current progress: 1000/18828
Current progress: 1100/18828
Current progress: 1200/18828
Current progress: 1300/18828
Current progress: 1400/18828
Current progress: 1500/18828
Current progress: 1600/18828
Current progress: 1700/18828
Current progress: 1800/18828
Current progress: 1900/18828
Current progress: 2000/18828
Current progress: 2100/18828
Current progress: 2200/18828
Current progress: 2300/18828
Current progress: 2400/18828
Current progress: 2500/18828
Current progress: 2600/18828
Current progress: 2700/18828
Current progress: 2800/18828
Current progress: 2900/18828
Current progress: 3000/18828
Current progress: 3100/18828
Current progress: 3200/18828
Current progress: 3300/18828
Current progress: 3400/18828
Current progress: 3500/18828
Current progress: 3600/18828
Current progress: 3700/18828
Current progress: 3800/18828
Current progress: 3900/18828
Current progress: 4000/18828
Current progress: 4100/18828
Current progress: 4200/18828
Current progress: 4300/18828
Current progress: 4400/18828
Current progress: 4500/18828
Current progress: 4600/18828
Current progress: 4700/18828
Current progress: 4800/18828
Current progress: 4900/18828
Current progress: 5000/18828
Current progress: 5100/18828
Current progress: 5200/18828
Current progress: 5300/18828
Current progress: 5400/18828
Current progress: 5500/18828
Current progress: 5600/18828
Current progress: 5700/18828
Current progress: 5800/18828
Current progress: 5900/18828
Current progress: 6000/18828
Current progress: 6100/18828
Current progress: 6200/18828
Current progress: 6300/18828
Current progress: 6400/18828
Current progress: 6500/18828
Current progress: 6600/18828
Current progress: 6700/18828
Current progress: 6800/18828
Current progress: 6900/18828
Current progress: 7000/18828
Current progress: 7100/18828
Current progress: 7200/18828
Current progress: 7300/18828
Current progress: 7400/18828
Current progress: 7500/18828
Current progress: 7600/18828
Current progress: 7700/18828
Current progress: 7800/18828
Current progress: 7900/18828
Current progress: 8000/18828
Current progress: 8100/18828
Current progress: 8200/18828
Current progress: 8300/18828
Current progress: 8400/18828
Current progress: 8500/18828
Current progress: 8600/18828
Current progress: 8700/18828
Current progress: 8800/18828
Current progress: 8900/18828
Current progress: 9000/18828
Current progress: 9100/18828
Current progress: 9200/18828
Current progress: 9300/18828
Current progress: 9400/18828
Current progress: 9500/18828
Current progress: 9600/18828
Current progress: 9700/18828
Current progress: 9800/18828
Current progress: 9900/18828
Current progress: 10000/18828
Current progress: 10100/18828
Current progress: 10200/18828
Current progress: 10300/18828
Current progress: 10400/18828
Current progress: 10500/18828
Current progress: 10600/18828
Current progress: 10700/18828
Current progress: 10800/18828
Current progress: 10900/18828
Current progress: 11000/18828
Current progress: 11100/18828
Current progress: 11200/18828
Current progress: 11300/18828
Current progress: 11400/18828
Current progress: 11500/18828
Current progress: 11600/18828
Current progress: 11700/18828
Current progress: 11800/18828
Current progress: 11900/18828
Current progress: 12000/18828
Current progress: 12100/18828
Current progress: 12200/18828
Current progress: 12300/18828
Current progress: 12400/18828
Current progress: 12500/18828
Current progress: 12600/18828
Current progress: 12700/18828
Current progress: 12800/18828
Current progress: 12900/18828
Current progress: 13000/18828
Current progress: 13100/18828
Current progress: 13200/18828
Current progress: 13300/18828
Current progress: 13400/18828
Current progress: 13500/18828
Current progress: 13600/18828
Current progress: 13700/18828
Current progress: 13800/18828
Current progress: 13900/18828
Current progress: 14000/18828
Current progress: 14100/18828
Current progress: 14200/18828
Current progress: 14300/18828
Current progress: 14400/18828
Current progress: 14500/18828
Current progress: 14600/18828
Current progress: 14700/18828
Current progress: 14800/18828
Current progress: 14900/18828
Current progress: 15000/18828
Current progress: 15100/18828
Current progress: 15200/18828
Current progress: 15300/18828
Current progress: 15400/18828
Current progress: 15500/18828
Current progress: 15600/18828
Current progress: 15700/18828
Current progress: 15800/18828
Current progress: 15900/18828
Current progress: 16000/18828
Current progress: 16100/18828
Current progress: 16200/18828
Current progress: 16300/18828
Current progress: 16400/18828
Current progress: 16500/18828
Current progress: 16600/18828
Current progress: 16700/18828
Current progress: 16800/18828
Current progress: 16900/18828
Current progress: 17000/18828
Current progress: 17100/18828
Current progress: 17200/18828
Current progress: 17300/18828
Current progress: 17400/18828
Current progress: 17500/18828
Current progress: 17600/18828
Current progress: 17700/18828
Current progress: 17800/18828
Current progress: 17900/18828
Current progress: 18000/18828
Current progress: 18100/18828
Current progress: 18200/18828
Current progress: 18300/18828
Current progress: 18400/18828
Current progress: 18500/18828
Current progress: 18600/18828
Current progress: 18700/18828
Current progress: 18800/18828
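
The next cell reads data/clean_20news.csv, but the step that writes that file is not shown above. A sketch of the assumed export (the column names 'label' and 'document' are inferred from how the file is read back below; storing each token list as its Python literal is what makes the later ast.literal_eval call work):

# assumed export of the cleaned token lists (not shown in the original notebook)
pd.DataFrame( { 'label'    : dat_y,
                'document' : dat_clean } ). \
                to_csv('data/clean_20news.csv', index=False, sep=',')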

In [7]:
dat = pd.read_csv('data/clean_20news.csv', sep=",")


docs = [ast.literal_eval(doc) for doc in dat['document'].tolist()]

all_words = [word for doc in docs for word in doc]
pd_all_words = pd.DataFrame({'words' : all_words})
pd_unq_word_counts = pd.DataFrame({'count' : pd_all_words.groupby('words').size()}).reset_index().sort_values(by='count', ascending = False)

# follow the reference paper's top-2000 vocabulary (an earlier version kept only words occurring more than 150 times)
pd_unq_word_counts_filtered = pd_unq_word_counts.head(2000)
list_unq_word_filtered = list( pd_unq_word_counts_filtered.iloc[:,0] )
len(list_unq_word_filtered)


Out[7]:
2000

In [8]:
# fixed vocabulary: the vectorizer only ever counts the 2,000 selected words
vec = CountVectorizer(input = 'content', lowercase = False, vocabulary = list_unq_word_filtered)

# build the document-term matrix in chunks of 500 documents
iters = list(range(0,len(docs),500))
iters.append(len(docs))
dtm = np.array([] ).reshape(0,len(list_unq_word_filtered))
for i in range(len(iters)-1):
    # each document is a list of tokens, so vectorising it treats every token as a tiny
    # "document"; summing the resulting rows gives the word counts for that document
    dtm = np.concatenate( (dtm, list(map(lambda x: vec.fit_transform(x).toarray().sum(axis=0), docs[iters[i]:iters[i+1]] )) ), axis = 0)
    print( 'Percentage completion: ' + str( (i+1) / (len(iters)-1) ) )

# prepend the label column name; build a new list so the vocabulary itself is not mutated
colnames = ['_label_'] + list_unq_word_filtered


Percentage completion: 0.02631578947368421
Percentage completion: 0.05263157894736842
Percentage completion: 0.07894736842105263
Percentage completion: 0.10526315789473684
Percentage completion: 0.13157894736842105
Percentage completion: 0.15789473684210525
Percentage completion: 0.18421052631578946
Percentage completion: 0.21052631578947367
Percentage completion: 0.23684210526315788
Percentage completion: 0.2631578947368421
Percentage completion: 0.2894736842105263
Percentage completion: 0.3157894736842105
Percentage completion: 0.34210526315789475
Percentage completion: 0.3684210526315789
Percentage completion: 0.39473684210526316
Percentage completion: 0.42105263157894735
Percentage completion: 0.4473684210526316
Percentage completion: 0.47368421052631576
Percentage completion: 0.5
Percentage completion: 0.5263157894736842
Percentage completion: 0.5526315789473685
Percentage completion: 0.5789473684210527
Percentage completion: 0.6052631578947368
Percentage completion: 0.631578947368421
Percentage completion: 0.6578947368421053
Percentage completion: 0.6842105263157895
Percentage completion: 0.7105263157894737
Percentage completion: 0.7368421052631579
Percentage completion: 0.7631578947368421
Percentage completion: 0.7894736842105263
Percentage completion: 0.8157894736842105
Percentage completion: 0.8421052631578947
Percentage completion: 0.868421052631579
Percentage completion: 0.8947368421052632
Percentage completion: 0.9210526315789473
Percentage completion: 0.9473684210526315
Percentage completion: 0.9736842105263158
Percentage completion: 1.0
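
Because the vocabulary is fixed, the per-token trick above is equivalent to joining each token list back into a single string and letting the vectorizer count it in one pass; a minimal alternative sketch using the same vec and docs:

# one-pass construction of the same document-term matrix (alternative sketch)
dtm_alt = vec.transform([' '.join(doc) for doc in docs]).toarray()
assert dtm_alt.shape == (len(docs), 2000)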

In [11]:
pd.DataFrame(data = np.c_[dat['label'].values, dtm], 
             columns = colnames). \
             to_csv( 'data/dtm_2000_20news.csv', index = False)

In [10]:
pd.DataFrame(data = np.c_[dat_y2, dtm], 
             columns = colnames). \
             to_csv( 'data/dtm_2000_20news_6class.csv', index = False)
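
For the downstream modelling steps, the exported matrix can be read straight back into a feature matrix and label vector; a short usage sketch (same file name as written above):

dtm_df = pd.read_csv('data/dtm_2000_20news.csv')
y = dtm_df['_label_'].values               # class labels (first column)
X = dtm_df.drop('_label_', axis=1).values  # 18828 x 2000 matrix of word counts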

In [30]:
list_unq_word_filtered


Out[30]:
['edu',
 'subject',
 'com',
 'would',
 'one',
 'use',
 'write',
 'articl',
 'like',
 'get',
 'peopl',
 'know',
 'think',
 'time',
 'say',
 'also',
 'make',
 'work',
 'could',
 'want',
 'good',
 'new',
 'system',
 'year',
 'right',
 'see',
 'need',
 'way',
 'even',
 'may',
 'well',
 'look',
 'thing',
 'problem',
 'god',
 'file',
 'tri',
 'much',
 'mani',
 'first',
 'max',
 'two',
 'question',
 'window',
 'take',
 'call',
 'believ',
 'post',
 'come',
 'anyon',
 'point',
 'program',
 'run',
 'said',
 'seem',
 'mean',
 'help',
 'state',
 'read',
 'pleas',
 'differ',
 'drive',
 'number',
 'thank',
 'someth',
 'find',
 'back',
 'realli',
 'game',
 'sinc',
 'includ',
 'day',
 'still',
 'inform',
 'give',
 'reason',
 'person',
 'univers',
 'christian',
 'gener',
 'go',
 'govern',
 'start',
 'part',
 'last',
 'support',
 'might',
 'sure',
 'ask',
 'let',
 'case',
 'follow',
 'law',
 'never',
 'set',
 'comput',
 'better',
 'imag',
 'interest',
 'must',
 'car',
 'power',
 'key',
 'group',
 'fact',
 'anoth',
 'without',
 'world',
 'possibl',
 'etc',
 'david',
 'name',
 'someon',
 'got',
 'tell',
 'chang',
 'avail',
 'made',
 'control',
 'put',
 'line',
 'list',
 'lot',
 'live',
 'data',
 'word',
 'space',
 'actual',
 'place',
 'book',
 'probabl',
 'exist',
 'card',
 'around',
 'happen',
 'long',
 'littl',
 'softwar',
 'howev',
 'anyth',
 'show',
 'talk',
 'play',
 'gun',
 'team',
 'bit',
 'keep',
 'opinion',
 'everi',
 'best',
 'kill',
 'claim',
 'john',
 'consid',
 'true',
 'least',
 'cours',
 'idea',
 'enough',
 'base',
 'chip',
 'second',
 'news',
 'order',
 'end',
 'version',
 'sourc',
 'great',
 'org',
 'armenian',
 'mail',
 'nation',
 'provid',
 'answer',
 'issu',
 'note',
 'public',
 'though',
 'either',
 'far',
 'human',
 'send',
 'thought',
 'exampl',
 'jesu',
 'caus',
 'requir',
 'mark',
 'life',
 'current',
 'els',
 'found',
 'refer',
 'noth',
 'hard',
 'wrong',
 'rather',
 'real',
 'old',
 'respons',
 'report',
 'origin',
 'effect',
 'discuss',
 'man',
 'engin',
 'machin',
 'allow',
 'quit',
 'price',
 'american',
 'sever',
 'gov',
 'net',
 'phone',
 'disk',
 'ye',
 'bill',
 'standard',
 'done',
 'next',
 'email',
 'free',
 'kind',
 'hope',
 'seen',
 'care',
 'other',
 'feel',
 'suggest',
 'mayb',
 'address',
 'research',
 'yet',
 'abl',
 'turn',
 'understand',
 'object',
 'win',
 'applic',
 'alway',
 'buy',
 'driver',
 'access',
 'type',
 'sun',
 'sound',
 'ever',
 'high',
 'heard',
 'mac',
 'player',
 'bad',
 'develop',
 'nasa',
 'wrote',
 'hand',
 'result',
 'user',
 'messag',
 'author',
 'less',
 'accept',
 'evid',
 'code',
 'graphic',
 'view',
 'internet',
 'manag',
 'israel',
 'home',
 'whether',
 'color',
 'three',
 'given',
 'love',
 'children',
 'design',
 'rememb',
 'open',
 'copi',
 'servic',
 'today',
 'test',
 'left',
 'michael',
 'jew',
 'away',
 'encrypt',
 'moral',
 'big',
 'info',
 'move',
 'area',
 'agre',
 'cost',
 'fire',
 'appear',
 'import',
 'display',
 'unit',
 'posit',
 'build',
 'forc',
 'check',
 'presid',
 'studi',
 'sale',
 'netcom',
 'format',
 'secur',
 'creat',
 'larg',
 'commun',
 'appl',
 'religion',
 'war',
 'local',
 'week',
 'paul',
 'ftp',
 'oper',
 'do',
 'countri',
 'major',
 'matter',
 'speed',
 'compani',
 'box',
 'side',
 'guy',
 'wonder',
 'complet',
 'offer',
 'mind',
 'technolog',
 'scienc',
 'assum',
 'stuff',
 'protect',
 'alreadi',
 'uucp',
 'church',
 'becom',
 'argument',
 'center',
 'perhap',
 'experi',
 'uiuc',
 'close',
 'rule',
 'ibm',
 'stop',
 'steve',
 'product',
 'fax',
 'save',
 'specif',
 'small',
 'death',
 'money',
 'bibl',
 'valu',
 'came',
 'memori',
 'expect',
 'mention',
 'die',
 'process',
 'ago',
 'offic',
 'organ',
 'hous',
 'recent',
 'mike',
 'told',
 'sell',
 'month',
 'packag',
 'final',
 'correct',
 'monitor',
 'pretti',
 'form',
 'rate',
 'act',
 'homosexu',
 'perform',
 'model',
 'robert',
 'andrew',
 'hit',
 'present',
 'natur',
 'receiv',
 'whole',
 'attack',
 'statement',
 'guess',
 'instal',
 'light',
 'advanc',
 'action',
 'polit',
 'bodi',
 'often',
 'deal',
 'muslim',
 'except',
 'pass',
 'went',
 'friend',
 'head',
 'member',
 'usual',
 'board',
 'comment',
 'server',
 'intern',
 'sort',
 'limit',
 'april',
 'simpli',
 'continu',
 'involv',
 'histori',
 'appreci',
 'hear',
 'command',
 'everyth',
 'plan',
 'clipper',
 'error',
 'full',
 'pay',
 'carri',
 'return',
 'network',
 'everyon',
 'jim',
 'level',
 'regard',
 'drug',
 'anyway',
 'speak',
 'job',
 'lead',
 'contact',
 'almost',
 'bike',
 'isra',
 'term',
 'convert',
 'total',
 'later',
 'certainli',
 'instead',
 'sens',
 'men',
 'hold',
 'faith',
 'situat',
 'devic',
 'fan',
 'suppos',
 'christ',
 'press',
 'arm',
 'mit',
 'white',
 'earth',
 'activ',
 'video',
 'connect',
 'although',
 'basic',
 'contain',
 'text',
 'cover',
 'leav',
 'washington',
 'coupl',
 'defin',
 'function',
 'citi',
 'nice',
 'sin',
 'quot',
 'explain',
 'scsi',
 'singl',
 'period',
 'truth',
 'turkish',
 'unless',
 'anybodi',
 'belief',
 'site',
 'decid',
 'similar',
 'clear',
 'jewish',
 'black',
 'hell',
 'size',
 'definit',
 'learn',
 'school',
 'night',
 'appli',
 'arab',
 'document',
 'page',
 'repli',
 'polic',
 'within',
 'individu',
 'watch',
 'atheist',
 'relat',
 'compar',
 'privat',
 'ground',
 'stand',
 'concern',
 'thu',
 'record',
 'attempt',
 'newsgroup',
 'detail',
 'hockey',
 'releas',
 'direct',
 'road',
 'addit',
 'face',
 'mode',
 'figur',
 'practic',
 'weapon',
 'legal',
 'jame',
 'fbi',
 'accord',
 'certain',
 'mine',
 'known',
 'event',
 'busi',
 'past',
 'stori',
 'especi',
 'clinton',
 'top',
 'took',
 'screen',
 'dave',
 'wait',
 'physic',
 'hardwar',
 'delet',
 'purpos',
 'field',
 'dead',
 'normal',
 'brian',
 'land',
 'fine',
 'saw',
 'via',
 'option',
 'among',
 'sorri',
 'common',
 'season',
 'exactli',
 'depart',
 'faq',
 'women',
 'entir',
 'notic',
 'ignor',
 'date',
 'increas',
 'goe',
 'per',
 'fast',
 'peac',
 'begin',
 'rest',
 'project',
 'shot',
 'low',
 'particular',
 'add',
 'toronto',
 'special',
 'replac',
 'describ',
 'red',
 'condit',
 'simpl',
 'port',
 'request',
 'print',
 'taken',
 'crime',
 'output',
 'polici',
 'att',
 'apr',
 'sometim',
 'million',
 'islam',
 'propos',
 'tape',
 'societi',
 'cmu',
 'medic',
 'health',
 'religi',
 'blue',
 'orbit',
 'electron',
 'due',
 'defens',
 'usa',
 'paper',
 'handl',
 'observ',
 'goal',
 'pick',
 'fail',
 'bob',
 'radio',
 'easi',
 'cut',
 'burn',
 'therefor',
 'dod',
 'front',
 'peter',
 'depend',
 'seri',
 'whatev',
 'canada',
 'algorithm',
 'scott',
 'murder',
 'inc',
 'lost',
 'produc',
 'associ',
 'unix',
 'section',
 'manual',
 'third',
 'publish',
 'font',
 'prevent',
 'market',
 'flame',
 'upon',
 'cso',
 'prefer',
 'offici',
 'absolut',
 'futur',
 'remov',
 'score',
 'frank',
 'written',
 'basebal',
 'mous',
 'air',
 'main',
 'ride',
 'doubt',
 'chanc',
 'launch',
 'method',
 'ship',
 'switch',
 'miss',
 'bu',
 'longer',
 'short',
 'four',
 'lie',
 'theori',
 'hour',
 'jpeg',
 'indic',
 'tool',
 'tax',
 'water',
 'prove',
 'account',
 'distribut',
 'king',
 'modem',
 'insur',
 'york',
 'institut',
 'select',
 'earli',
 'variou',
 'break',
 'bring',
 'leagu',
 'koresh',
 'administr',
 'resourc',
 'librari',
 'qualiti',
 'wish',
 'greek',
 'pictur',
 'serv',
 'lord',
 'san',
 'occur',
 'directori',
 'announc',
 'digit',
 'widget',
 'thoma',
 'court',
 'tom',
 'behind',
 'letter',
 'featur',
 'young',
 'solut',
 'minor',
 'rutger',
 'charg',
 'parti',
 'shall',
 'strong',
 'gif',
 'amount',
 'choic',
 'togeth',
 'sign',
 'smith',
 'keith',
 'logic',
 'comp',
 'draw',
 'knowledg',
 'fall',
 'ram',
 'agenc',
 'ad',
 'trade',
 'anonym',
 'share',
 'virginia',
 'popul',
 'meet',
 'plu',
 'interpret',
 'voic',
 'entri',
 'famili',
 'food',
 'load',
 'bitnet',
 'respect',
 'decis',
 'express',
 'will',
 'constitut',
 'father',
 'along',
 'colorado',
 'pat',
 'commit',
 'previou',
 'minut',
 'worth',
 'russian',
 'outsid',
 'remain',
 'fix',
 'citizen',
 'militari',
 'near',
 'measur',
 'averag',
 'soon',
 'appar',
 'chri',
 'wire',
 'student',
 'feder',
 'ga',
 'necessari',
 'eye',
 'printer',
 'doctor',
 'usenet',
 'fit',
 'vote',
 'technic',
 'suppli',
 'divis',
 'judg',
 'educ',
 'richard',
 'sent',
 'languag',
 'higher',
 'secret',
 'stanford',
 'class',
 'anim',
 'respond',
 'adam',
 'interfac',
 'crimin',
 'mission',
 'son',
 'child',
 'initi',
 'cabl',
 'eric',
 'mile',
 'recommend',
 'clearli',
 'station',
 'toward',
 'avoid',
 'age',
 'none',
 'archiv',
 'search',
 'ide',
 'owner',
 'diseas',
 'determin',
 'disclaim',
 'approach',
 'fight',
 'media',
 'freedom',
 'door',
 'stay',
 'georg',
 'east',
 'heart',
 'seriou',
 'purchas',
 'implement',
 'capabl',
 'convers',
 'stupid',
 'bank',
 'enforc',
 'block',
 'repres',
 'realiz',
 'otherwis',
 'drop',
 'success',
 'unfortun',
 'uunet',
 'imagin',
 'turk',
 'commerci',
 'cup',
 'joe',
 'fund',
 'materi',
 'danger',
 'patient',
 'separ',
 'pitt',
 'armenia',
 'rang',
 'serial',
 'effort',
 'hate',
 'teach',
 'late',
 'street',
 'gave',
 'henri',
 'improv',
 'independ',
 'reach',
 'manufactur',
 'insid',
 'inde',
 'berkeley',
 'ac',
 'store',
 'folk',
 'spend',
 'roger',
 'sex',
 'choos',
 'lack',
 'confer',
 'thousand',
 'kid',
 'grant',
 'mass',
 'equip',
 'happi',
 'bought',
 'locat',
 'directli',
 'pitch',
 'transfer',
 'mil',
 'matthew',
 'surpris',
 'built',
 'microsoft',
 'faster',
 'altern',
 'bear',
 'dan',
 'argu',
 'concept',
 'floppi',
 'waco',
 'upgrad',
 'basi',
 'dealer',
 'cheap',
 'sgi',
 'jon',
 'consist',
 'william',
 'troubl',
 'intend',
 'input',
 'cpu',
 'firearm',
 'green',
 'obvious',
 'suffer',
 'aid',
 'tim',
 'america',
 'civil',
 'util',
 'warn',
 'titl',
 'half',
 'turkey',
 'obtain',
 'angel',
 'room',
 'edit',
 'translat',
 'collect',
 'ray',
 'leader',
 'genocid',
 'rais',
 'equal',
 'step',
 'armi',
 'rel',
 'compress',
 'pain',
 'predict',
 'count',
 'tradit',
 'licens',
 'whose',
 'excel',
 'batteri',
 'extra',
 'difficult',
 'investig',
 'environ',
 'easili',
 'motif',
 'boston',
 'agent',
 'columbia',
 'star',
 'volum',
 'train',
 'risk',
 'compil',
 'establish',
 'batf',
 'nhl',
 'signal',
 'updat',
 'punish',
 'villag',
 'enter',
 'scriptur',
 ...]

reading the exported raw text and labels


In [76]:
df = pd.read_csv('data/raw_20news/20news.csv', sep=",")
df


Out[76]:
_label document
0 0 From mathew mathew mantis co uk Subject Alt At...
1 0 From mathew mathew mantis co uk Subject Alt At...
2 0 From I dbstu rz tu bs de Benedikt Rosenau Subj...
3 0 From mathew mathew mantis co uk Subject Re uni...
4 0 From strom Watson Ibm Com Rob Strom Subject Re...
5 0 From I dbstu rz tu bs de Benedikt Rosenau Subj...
6 0 From keith cco caltech edu Keith Allan Schneid...
7 0 From I dbstu rz tu bs de Benedikt Rosenau Subj...
8 0 From keith cco caltech edu Keith Allan Schneid...
9 0 From keith cco caltech edu Keith Allan Schneid...
10 0 From keith cco caltech edu Keith Allan Schneid...
11 0 From keith cco caltech edu Keith Allan Schneid...
12 0 From keith cco caltech edu Keith Allan Schneid...
13 0 From keith cco caltech edu Keith Allan Schneid...
14 0 From keith cco caltech edu Keith Allan Schneid...
15 0 From keith cco caltech edu Keith Allan Schneid...
16 0 Subject Re Don t more innocents die without th...
17 0 Subject Re Ancient islamic rituals From bobbe ...
18 0 Subject Re Political Atheists From bobbe vice ...
19 0 Subject Re There must be a creator Maybe From ...
20 0 Subject Re Americans and Evolution From halat ...
21 0 Subject Re Speculations From dgraham bmers bnr...
22 0 From keith cco caltech edu Keith Allan Schneid...
23 0 From keith cco caltech edu Keith Allan Schneid...
24 0 From rm ic ac uk Mr R Mellish Subject Re unive...
25 0 From kilman y fiu edu Yevgeny Gene Kilman Subj...
26 0 Subject Re islamic authority over women From l...
27 0 Subject Re Ancient islamic rituals From livese...
28 0 From anthropo carina unm edu Dominick V Zurlo ...
29 0 From keith cco caltech edu Keith Allan Schneid...
... ... ...
18798 19 From sandvik newton apple com Kent Sandvik Sub...
18799 19 From sandvik newton apple com Kent Sandvik Sub...
18800 19 From daveb pogo wv tek com Dave Butler Subject...
18801 19 From eeb quads uchicago edu E Elizabeth Bartle...
18802 19 From mcelwre cnsvax uwec edu Subject LARSONIAN...
18803 19 From lwb cs utexas edu Lance W Bledsoe Subject...
18804 19 From jmeritt mental MITRE ORG Jim Meritt Syste...
18805 19 From lwb cs utexas edu Lance W Bledsoe Subject...
18806 19 From livesey solntze wpd sgi com Jon Livesey S...
18807 19 From cutter gloster via mind org cutter Subjec...
18808 19 From arromdee jyusenkyou cs jhu edu Ken Arromd...
18809 19 From pboxrud magnus acs ohio state edu Paul D ...
18810 19 Subject Re rw Is Robert Weiss the only orthodo...
18811 19 From tbrent bank ecn purdue edu Timothy J Bren...
18812 19 From prl csis dit csiro au Peter Lamb Subject ...
18813 19 From Lynn Anderson dba lynn cs cmu edu Subject...
18814 19 From kltensme infonode ingr com Kermit Tensmey...
18815 19 From ktikkane phoenix oulu fi Kari Tikkanen Su...
18816 19 From kltensme infonode ingr com Kermit Tensmey...
18817 19 From frank D S uucp Frank O Dwyer Subject Re T...
18818 19 Subject Re Albert Sabin From rfox charlie usd ...
18819 19 From David R Sacco dsav andrew cmu edu Subject...
18820 19 From neese cerritos edu Subject Hell In the Ki...
18821 19 From sbuckley fraser sfu ca Stephen Buckley Su...
18822 19 From sbuckley fraser sfu ca Stephen Buckley Su...
18823 19 From sbuckley fraser sfu ca Stephen Buckley Su...
18824 19 From bakerj gtephx UUCP Jon Baker Subject Re A...
18825 19 From pharvey quack kfu com Paul Harvey Subject...
18826 19 From KEVXU CUNYVM BITNET Subject Re Info about...
18827 19 From pharvey quack kfu com Paul Harvey Subject...

18828 rows × 2 columns