In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
def getData(dn):
    # Read the corpus: each CSV line is one document's comma-separated term list
    corpus = []
    with open(dn + "corpus.csv") as f:
        for line in f:
            corpus.append(map(lambda s: s.rstrip(), line.split(',')))
    # Document names and the vocabulary are each a single comma-separated line
    with open(dn + 'docs.txt') as f:
        docs = f.readline().split(',')
    with open(dn + 'vocab.txt') as f:
        vocab = f.readline().split(',')
    return docs, corpus, vocab

datadir = '/Users/chanjinpark/GitHub/NRFAnalysis/data/temp/'
docs, corpus, vocab = getData(datadir)
print corpus[0][0]
print vocab[2]
print len(vocab)
vocabid = dict(zip(vocab, range(len(vocab))))  # term -> column index
def getDocTermArray(doc):
    # Bag-of-words count vector, indexed by vocabid
    dtarr = [0 for x in range(len(vocab))]
    for term in doc:
        dtarr[vocabid[term]] += 1
    return dtarr
print vocabid[vocab[2]]
corpusarr = map(getDocTermArray, corpus)  # one count vector per document
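As a quick sanity check (not part of the original notebook), the count vectors can be verified against the raw documents: each row of the count matrix should sum to the document's token count, and every token should map back through vocabid. A minimal sketch, using corpus, vocabid, and corpusarr as defined above:

In [ ]:
# Illustrative consistency check on the first few documents
for doc, row in zip(corpus[:5], corpusarr[:5]):
    # row sums must equal document length (every token counted exactly once)
    assert sum(row) == len(doc)
    # every token must be a known vocabulary entry
    assert all(term in vocabid for term in doc)
print 'count vectors consistent for first 5 documents'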
In [54]:
from sklearn.feature_extraction.text import CountVectorizer
len(corpusarr)
def countUniqueTerms(arr):
    # Number of distinct vocabulary terms appearing in the document
    cnt = 0
    for e in arr:
        if e != 0:
            cnt += 1
    return cnt

def countTerms(arr):
    # Total token count of the document
    cnt = 0
    for e in arr:
        if e > 0:
            cnt += e
    return cnt
uniqueTerms = map(lambda arr: (countUniqueTerms(arr), countTerms(arr)), corpusarr)
print uniqueTerms[0]
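A quick way to see how these per-document statistics are distributed is a histogram; matplotlib is already imported above. This is an illustrative sketch, not in the original notebook, assuming uniqueTerms as computed in the previous cell:

In [ ]:
# Illustrative: distribution of distinct-term counts across documents
uts = [u for (u, t) in uniqueTerms]
plt.hist(uts, bins=50)
plt.xlabel('distinct terms per document')
plt.ylabel('number of documents')
plt.show()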
In [56]:
from sklearn.feature_extraction.text import TfidfTransformer
# Convert the raw count matrix to tf-idf weights; result is a sparse (docs x vocab) matrix
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(corpusarr)
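To spot-check the transform, one can list the highest-weighted terms of a single document. A minimal sketch (not in the original notebook), assuming tfidf and vocab as defined above; fit_transform returns a scipy sparse matrix, so the row is densified first:

In [ ]:
# Illustrative: top 10 tf-idf terms of the first document
row = tfidf[0].toarray().ravel()
top = np.argsort(row)[::-1][:10]
for i in top:
    if row[i] > 0:
        print vocab[i], row[i]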