notebook.community

Edit and run



In [2]:

    
%matplotlib inline
import pandas as pd
import numpy as np



In [6]:

    
# Load the pipe-delimited training file. Its first line is skipped and the
# two columns are given explicit names instead.
data = pd.read_csv(
    'trainning_raw.txt',
    sep='|',
    skiprows=1,
    names=['text', 'category'],
)



In [7]:

    
# Preview the first rows to confirm that both columns parsed as expected.
data.head()









    Out[7]:






  
    
      
      text
      category
    
  
  
    
      0
      COLES  GLENSIDE SA AU
      Grocery Stores, Supermarkets
    
    
      1
      WOOLWORTHS RANDWICK RANDWICK AU
      Grocery Stores, Supermarkets
    
    
      2
      WOOLWORTHS      4390 JOONDA AU
      Grocery Stores, Supermarkets
    
    
      3
      COLES SUPERMARKETS KAL KALAMUNDA AU
      Grocery Stores, Supermarkets
    
    
      4
      COLES ERINA 0885 ERINA AUS
      Grocery Stores, Supermarkets



In [29]:

    
# Raw transaction descriptions as a NumPy array; this is the input to the
# vectorizer.
corpus = data['text'].values



In [30]:

    
# Echo the corpus array as a quick sanity check of the extracted text column.
corpus









    Out[30]:





array(['COLES  GLENSIDE SA AU', 'WOOLWORTHS RANDWICK RANDWICK AU',
       'WOOLWORTHS      4390 JOONDA AU', ..., 'COLES 0332 BOORAGOON AU',
       'COLES 0901 FORSTER AU', 'WOOLWORTHS 1316 CAHRLE CHARLESTOWN AU'], dtype=object)



In [51]:

    
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level, unigram TF-IDF over the transaction descriptions.
#
# * token_pattern is a raw string so that ``\w`` reaches the regex engine
#   verbatim instead of being treated as an (invalid) string escape. A token
#   must start with an ASCII letter and be at least two characters long, so
#   purely numeric pieces such as store codes produce no token.
# * min_df=1 keeps every term that occurs in at least one document, which is
#   the same vocabulary the previous integer 0 selected; recent scikit-learn
#   releases validate parameters and reject an integer min_df below 1.
v = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    min_df=1,
    stop_words='english',
    token_pattern=r'[a-zA-Z]\w+',
)

# Sparse matrix with one row per description and one column per vocabulary term.
tfidf_matrix = v.fit_transform(corpus)



In [52]:

    
# Vocabulary terms, ordered by their column index in the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the installed version provides it, and fall
# back to the old method otherwise. The result is kept as a plain list so
# slicing and display behave the same with either API.
if hasattr(v, "get_feature_names_out"):
    feature_names = list(v.get_feature_names_out())
else:
    feature_names = v.get_feature_names()



In [53]:

    
# Vocabulary size, i.e. the number of distinct terms the vectorizer kept.
len(feature_names)









    Out[53]:





4068



In [54]:

    
# Spot-check a slice of the vocabulary (entries 50 through 69).
feature_names[50:70]









    Out[54]:





[u'alexandr',
 u'alexandra',
 u'alexandria',
 u'alfa',
 u'alfred',
 u'alfredton',
 u'algester',
 u'alh',
 u'ali',
 u'alice',
 u'alipay',
 u'allambie',
 u'allenby',
 u'allens',
 u'alliance',
 u'almond',
 u'alpha',
 u'alpharetta',
 u'alphin',
 u'alsterhaus']



In [55]:

    
import csv
import sys

# Export every non-zero TF-IDF score as one CSV row: (Tran_Id, Phrase, Score).
#
# The csv module expects a different kind of file handle per major Python
# version: a text-mode handle opened with newline="" (and str cells) on
# Python 3, a binary-mode handle (and byte-string cells) on Python 2.
IS_PY3 = sys.version_info[0] >= 3
if IS_PY3:
    csv_file = open("tfidf_scikit.csv", "w", newline="", encoding="utf-8")
else:
    csv_file = open("tfidf_scikit.csv", "wb")

with csv_file:
    writer = csv.writer(csv_file, delimiter=",")
    writer.writerow(["Tran_Id", "Phrase", "Score"])

    # Read the stored entries of each sparse row directly instead of calling
    # .todense(), which would allocate a full documents x vocabulary array
    # only to skip the zeros again.
    csr = tfidf_matrix.tocsr()
    for doc_id in range(csr.shape[0]):
        start, stop = csr.indptr[doc_id], csr.indptr[doc_id + 1]
        row_entries = zip(csr.indices[start:stop], csr.data[start:stop])
        # Sort by column index so terms are emitted in feature_names order.
        for word_id, score in sorted(row_entries):
            if score > 0:
                word = feature_names[word_id]
                if not IS_PY3:
                    # Python 2's csv writer cannot handle unicode objects.
                    word = word.encode("utf-8")
                # Tran_Id is 1-based; float() writes a plain Python float
                # rather than a NumPy scalar.
                writer.writerow([doc_id + 1, word, float(score)])



In [ ]:

	text	category
0	COLES GLENSIDE SA AU	Grocery Stores, Supermarkets
1	WOOLWORTHS RANDWICK RANDWICK AU	Grocery Stores, Supermarkets
2	WOOLWORTHS 4390 JOONDA AU	Grocery Stores, Supermarkets
3	COLES SUPERMARKETS KAL KALAMUNDA AU	Grocery Stores, Supermarkets
4	COLES ERINA 0885 ERINA AUS	Grocery Stores, Supermarkets