In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

def getData(datadir):
    # Each line of corpus.csv is one document, stored as a
    # comma-separated list of tokens.
    corpus = []
    with open(datadir + "corpus.csv") as f:
        for line in f:
            corpus.append([s.rstrip() for s in line.split(',')])

    # Document identifiers: a single comma-separated line.
    with open(datadir + 'docs.txt') as f:
        docs = f.readline().split(',')

    # Vocabulary terms: a single comma-separated line.
    with open(datadir + 'vocab.txt') as f:
        vocab = f.readline().split(',')

    return docs, corpus, vocab

datadir = '/Users/chanjinpark/GitHub/NRFAnalysis/data/temp/'
docs, corpus, vocab = getData(datadir)
print corpus[0][0]
print vocab[2]
print len(vocab)

vocabid = {term: i for i, term in enumerate(vocab)}  # term -> column index

def getDocTermArray(doc):
    # Dense bag-of-words vector: count of each vocabulary term in doc.
    dtarr = [0] * len(vocab)
    for term in doc:
        dtarr[vocabid[term]] += 1
    return dtarr

print vocabid[vocab[2]]
    
corpusarr = [getDocTermArray(doc) for doc in corpus]  # one count vector per document


우리
제의
104055
2
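
The next cell imports CountVectorizer without using it; the same document-term matrix can come straight from it. A minimal sketch, assuming each corpus row is a pre-tokenized list, passing an identity analyzer and reusing vocabid as the fixed vocabulary so the column order matches corpusarr:

In [ ]:
# Sketch: reproduce corpusarr with CountVectorizer. The identity analyzer
# and the fixed vocabulary (vocabid) keep the columns aligned with
# getDocTermArray's output.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(analyzer=lambda tokens: tokens, vocabulary=vocabid)
X = cv.fit_transform(corpus)                     # sparse (n_docs, n_terms) counts
print (X[0].toarray()[0] == corpusarr[0]).all()  # expect: True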

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

def countUniqueTerms(arr):
    # Number of distinct vocabulary terms occurring in the document.
    return sum(1 for e in arr if e != 0)

def countTerms(arr):
    # Total number of tokens in the document (counts are non-negative).
    return sum(arr)

uniqueTerms = [(countUniqueTerms(arr), countTerms(arr)) for arr in corpusarr]
print uniqueTerms[0]


(170, 262)
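
The per-document statistics above can also be computed in one vectorized pass. A minimal sketch with numpy; note that materializing a dense array over this 104055-term vocabulary is memory-hungry, so a scipy sparse matrix would be the practical choice at scale:

In [ ]:
# Sketch: unique-term and total-token counts per document, vectorized.
counts = np.array(corpusarr)       # dense (n_docs, n_terms) count matrix
uniq = (counts != 0).sum(axis=1)   # non-zero columns = distinct terms
total = counts.sum(axis=1)         # column sum = total tokens
print zip(uniq, total)[:5]         # should reproduce uniqueTerms[:5]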

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(corpusarr)  # sparse tf-idf weight matrix
print uniqueTerms[0:5]


[(170, 262), (112, 251), (105, 234), (144, 346), (233, 450)]
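
To sanity-check the transform, the highest-weighted terms of a document can be read off its row of the sparse tf-idf matrix. A minimal sketch; the top-5 cut-off is arbitrary:

In [ ]:
# Sketch: top tf-idf terms of the first document.
row = tfidf[0].toarray()[0]      # dense weight vector for document 0
top = np.argsort(row)[::-1][:5]  # column indices of the 5 largest weights
for i in top:
    print vocab[i], row[i]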
