notebook.community

Edit and run



In [8]:

    
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# taken from http://stats.stackexchange.com/questions/73908/search-in-tf-idf



In [5]:

    
# Fit a TF-IDF model on a tiny toy corpus; each phrase is treated as one document.
vectorizer = TfidfVectorizer(min_df=1)

my_phrases = [
    "boring answer phrase",
    "exciting phrase",
    "phrase on stackoverflow",
    "answer on stackoverflow",
]

# Sparse matrix with one row per phrase and one column per vocabulary term.
my_features = vectorizer.fit_transform(my_phrases)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# Wrapping the result in list() keeps `feature_words` a plain list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_words = list(vectorizer.get_feature_names_out())
else:
    feature_words = vectorizer.get_feature_names()



In [6]:

    
# Build the phrase-by-term TF-IDF table.
# `.toarray()` is the documented, explicit way to densify a SciPy sparse
# matrix, and handing the dense array straight to the DataFrame constructor
# replaces the manual loop that built one Series per row.
tfidf_dense = my_features.toarray()

# Rows are phrases (document index), columns are vocabulary terms.
df_t = pd.DataFrame(tfidf_dense, columns=feature_words)

# Term-by-phrase orientation; still defined because this cell has always
# exposed both `df` and `df_t`.
df = df_t.transpose()



In [7]:

    
# Show the TF-IDF table: one row per phrase, one column per vocabulary term.
df_t









    Out[7]:






  
    
      
      answer
      boring
      exciting
      on
      phrase
      stackoverflow
    
  
  
    
      0
      0.553492
      0.702035
      0.000000
      0.000000
      0.448100
      0.000000
    
    
      1
      0.000000
      0.000000
      0.842926
      0.000000
      0.538029
      0.000000
    
    
      2
      0.000000
      0.000000
      0.000000
      0.613667
      0.496816
      0.613667
    
    
      3
      0.577350
      0.000000
      0.000000
      0.577350
      0.000000
      0.577350

	answer	boring	exciting	on	phrase	stackoverflow
0	0.553492	0.702035	0.000000	0.000000	0.448100	0.000000
1	0.000000	0.000000	0.842926	0.000000	0.538029	0.000000
2	0.000000	0.000000	0.000000	0.613667	0.496816	0.613667
3	0.577350	0.000000	0.000000	0.577350	0.000000	0.577350