In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
In [6]:
data = pd.read_csv('trainning_raw.txt', sep='|', skiprows=1, names=['text','category'])
In [7]:
data.head()
Out[7]:
In [29]:
corpus = data.text.values
In [30]:
corpus
Out[30]:
In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF with English stop words removed; the token pattern keeps
# tokens that start with a letter. min_df=1 (the default) keeps every term
# that appears in at least one document, which is what min_df=0 effectively did.
v = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1,
                    stop_words='english', token_pattern=r'[a-zA-Z]\w+')
tfidf_matrix = v.fit_transform(corpus)
In [52]:
feature_names = v.get_feature_names_out()  # get_feature_names() was removed in newer scikit-learn
In [53]:
len(feature_names)
Out[53]:
In [54]:
feature_names[50:70]
Out[54]:
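As a quick sanity check (not in the original notebook), the sparse rows of tfidf_matrix can be inspected directly to see the highest-weighted terms for one document. The cell below is a minimal sketch assuming the tfidf_matrix and feature_names defined above.
In [ ]:
# Sketch: top 10 TF-IDF terms for the first document, read from the sparse row.
row = tfidf_matrix[0]                                # still sparse; no full densify
top = np.argsort(row.toarray().ravel())[::-1][:10]  # indices of the largest scores
[(feature_names[i], row[0, i]) for i in top if row[0, i] > 0]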
In [55]:
import csv
with open("tfidf_scikit.csv", "w") as file:
writer = csv.writer(file, delimiter=",")
writer.writerow(["Tran_Id", "Phrase", "Score"])
doc_id = 0
for doc in tfidf_matrix.todense():
# print "Tran_Id %d" %(doc_id)
word_id = 0
for score in doc.tolist()[0]:
if score > 0:
word = feature_names[word_id]
writer.writerow([doc_id+1, word.encode("utf-8"), score])
word_id +=1
doc_id +=1
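The export loop above densifies the matrix one row at a time; for a larger corpus the same CSV can be written straight from the sparse COO representation. This is an alternative sketch, not the original notebook's code, and the output file name tfidf_scikit_sparse.csv is just an illustration.
In [ ]:
import csv

# Sketch: write (doc, term, score) rows directly from the sparse matrix.
with open("tfidf_scikit_sparse.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(["Tran_Id", "Phrase", "Score"])
    coo = tfidf_matrix.tocoo()  # nonzero entries as (row, col, value) triples
    for doc_id, word_id, score in zip(coo.row, coo.col, coo.data):
        writer.writerow([doc_id + 1, feature_names[word_id], score])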
In [ ]: