In [2]:
%matplotlib inline
import pandas as pd
import numpy as np

In [6]:
# Load the pipe-delimited training file. The first line of the file is skipped
# and the two columns are named explicitly.
data = pd.read_csv(
    'trainning_raw.txt',
    names=['text', 'category'],
    skiprows=1,
    sep='|',
)

In [7]:
# Preview the first five rows to confirm the columns parsed as expected.
data.head(n=5)


Out[7]:
text category
0 COLES GLENSIDE SA AU Grocery Stores, Supermarkets
1 WOOLWORTHS RANDWICK RANDWICK AU Grocery Stores, Supermarkets
2 WOOLWORTHS 4390 JOONDA AU Grocery Stores, Supermarkets
3 COLES SUPERMARKETS KAL KALAMUNDA AU Grocery Stores, Supermarkets
4 COLES ERINA 0885 ERINA AUS Grocery Stores, Supermarkets

In [29]:
# Extract the `text` column as a NumPy array; this is the input to the vectorizer.
corpus = data['text'].values

In [30]:
# Echo the array of `text` values as the cell output for a quick visual check.
corpus


Out[30]:
array(['COLES  GLENSIDE SA AU', 'WOOLWORTHS RANDWICK RANDWICK AU',
       'WOOLWORTHS      4390 JOONDA AU', ..., 'COLES 0332 BOORAGOON AU',
       'COLES 0901 FORSTER AU', 'WOOLWORTHS 1316 CAHRLE CHARLESTOWN AU'], dtype=object)

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a unigram TF-IDF model over the corpus.
# - token_pattern is a raw string so the regex escape `\w` reaches the regex
#   engine untouched (a plain '...\w...' literal is an invalid string escape
#   and warns on Python 3). A token must start with an ASCII letter and be at
#   least two characters long, so purely numeric tokens (e.g. store numbers)
#   are dropped.
# - min_df=1 keeps every term that occurs in at least one document, which is
#   exactly what the previous integer 0 did; newer scikit-learn releases
#   validate parameters and do not accept an integer min_df below 1.
v = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    min_df=1,
    stop_words='english',
    token_pattern=r'[a-zA-Z]\w+',
)

# Sparse (documents x vocabulary) matrix of TF-IDF weights.
tfidf_matrix = v.fit_transform(corpus)

In [52]:
# Vocabulary terms, indexed by their column position in the TF-IDF matrix.
# `get_feature_names()` was deprecated and later removed from scikit-learn in
# favour of `get_feature_names_out()`. Prefer the new method when present and
# fall back for old installs. `.tolist()` keeps the result a plain list of
# strings, as before.
if hasattr(v, "get_feature_names_out"):
    feature_names = v.get_feature_names_out().tolist()
else:
    feature_names = v.get_feature_names()

In [53]:
# Vocabulary size: the number of terms returned by the fitted vectorizer.
len(feature_names)


Out[53]:
4068

In [54]:
# Spot-check 20 consecutive vocabulary entries (positions 50 through 69).
feature_names[50:70]


Out[54]:
[u'alexandr',
 u'alexandra',
 u'alexandria',
 u'alfa',
 u'alfred',
 u'alfredton',
 u'algester',
 u'alh',
 u'ali',
 u'alice',
 u'alipay',
 u'allambie',
 u'allenby',
 u'allens',
 u'alliance',
 u'almond',
 u'alpha',
 u'alpharetta',
 u'alphin',
 u'alsterhaus']

In [55]:
import csv
import sys

# Export every non-zero TF-IDF weight as one CSV row:
# (1-based document number, term, score).
#
# The csv module needs a binary handle on Python 2 but a text handle opened
# with newline="" on Python 3. With a plain "w" handle the writer's "\r\n" row
# terminator is translated again on Windows, producing an extra "\r" per row.
if sys.version_info[0] < 3:
    csv_file = open("tfidf_scikit.csv", "wb")
else:
    csv_file = open("tfidf_scikit.csv", "w", newline="", encoding="utf-8")

with csv_file:
    writer = csv.writer(csv_file, delimiter=",")
    writer.writerow(["Tran_Id", "Phrase", "Score"])

    # Walk the sparse rows directly instead of calling todense(), which
    # allocates a full (documents x vocabulary) array and then makes Python
    # visit every zero cell. Work on a copy so tfidf_matrix itself is left
    # untouched. sum_duplicates() puts the copy into canonical CSR form
    # (column indices sorted within each row, no repeated entries), so terms
    # are emitted in the same ascending column order as the dense loop.
    sparse_rows = tfidf_matrix.tocsr().copy()
    sparse_rows.sum_duplicates()

    for doc_id in range(sparse_rows.shape[0]):
        start = sparse_rows.indptr[doc_id]
        stop = sparse_rows.indptr[doc_id + 1]
        # .tolist() yields plain Python ints/floats so the written text matches
        # what the dense version produced.
        word_ids = sparse_rows.indices[start:stop].tolist()
        scores = sparse_rows.data[start:stop].tolist()
        for word_id, score in zip(word_ids, scores):
            if score > 0:
                word = feature_names[word_id]
                # Python 2 yields unicode terms, which its csv module cannot
                # write, so encode those. A Python 3 str must be written as-is,
                # otherwise the file would contain "b'...'" literals.
                if not isinstance(word, str):
                    word = word.encode("utf-8")
                writer.writerow([doc_id + 1, word, score])

In [ ]: