In [1]:
from __future__ import division
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
In [2]:
%matplotlib inline
#%qtconsole
In [3]:
corpus_dir = 'corpus_haiku' # input directory
corpus_statistics_filename = 'haiku_multi-grams.txt' # output file name
# output-file header
text_msg = "This file contains TF-IDF-selected terms (1- to 6-grams) from Dr. Fisher's haiku\n\n"
In [ ]:
# start the output file with its header line
f = open(corpus_statistics_filename,'w')
f.write(text_msg)
f.close()

# read every .txt file in the corpus directory into memory
corpus = []
titles = []
import os
for item in os.listdir(corpus_dir):
    if item.endswith(".txt"):
        titles.append(item)
        f = open(corpus_dir + '/' + item,'r')
        corpus.append(f.read())
        f.close()
print("{:,} documents read".format(len(corpus)))
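As a quick sanity check (an added sketch, assuming the corpus loaded above is non-empty), we can preview the first title and the opening of its text before vectorizing:

In [ ]:
# spot check: first document's title and the start of its contents
print(titles[0])
print(corpus[0][:200])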
In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

set_of_words = set()
for max_g in [3, 4, 5, 6]:                   # upper bound of the n-gram range
    for max_df in np.arange(0.2, 1.0, 0.1):  # sweep the document-frequency cutoff
        tf = TfidfVectorizer(analyzer='word',
                             stop_words='english',
                             ngram_range=(1, max_g),  # unigrams up to max_g-grams
                             max_df=max_df,
                             min_df=1,
                             vocabulary=None,
                             norm='l2',
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=False)
        tfidf_matrix = tf.fit_transform(corpus)
        # get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2
        feature_names = tf.get_feature_names_out()
        dense = tfidf_matrix.todense()
        for title_idx, title in enumerate(titles):
            # indices of the 10 highest-scoring features for this document
            doc_10best_features = [i[0] for i in sorted(enumerate(dense.tolist()[title_idx]),
                                                        key=lambda x: x[1],
                                                        reverse=True)][:10]
            for idx in doc_10best_features:
                set_of_words.add(feature_names[idx])
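To see which n-grams actually drive a document's ranking, a small inspection cell can pair each top feature with its TF-IDF score. This is an illustrative sketch, reusing the `dense` and `feature_names` objects left over from the final sweep iteration above:

In [ ]:
# inspection sketch: top-10 (term, score) pairs for the first document
row = dense.tolist()[0]
top10 = sorted(enumerate(row), key=lambda x: x[1], reverse=True)[:10]
for idx, score in top10:
    print("{:<40s} {:.4f}".format(feature_names[idx], score))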
In [5]:
# append the collected terms, one per line, in sorted order
f = open(corpus_statistics_filename,'a')
for word in sorted(set_of_words):
    f.write(word)
    f.write("\n")
f.close()
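Finally, a minimal read-back (another added sketch) confirms that the header and the sorted term list landed in the output file:

In [ ]:
# verification sketch: reprint the first few lines of the output file
f = open(corpus_statistics_filename,'r')
for line in f.read().splitlines()[:10]:
    print(line)
f.close()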