In [1]:

    
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as snb
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import re



In [2]:

    
train_bio = pd.read_csv("input_light/biology.csv",encoding='utf-8')
train_cooking = pd.read_csv("input_light/cooking.csv",encoding='utf-8')
train_crypto = pd.read_csv("input_light/crypto.csv",encoding='utf-8')
train_dyi = pd.read_csv("input_light/diy.csv",encoding='utf-8')
train_robotic = pd.read_csv("input_light/robotics.csv",encoding='utf-8')
train_travel = pd.read_csv("input_light/travel.csv",encoding='utf-8')
# test_df = pd.read_csv("input_light/test.csv",encoding='utf-8')



In [12]:

    
# df_list = []
df_list = pd.concat([train_bio,train_cooking,train_crypto,train_dyi,train_robotic,train_travel])

Length of dataset



In [13]:

    
df_list.head()









    Out[13]:






  
    
      
      id
      title
      content
      tags
    
  
  
    
      0
      1
      criticality ribosome binding site relative sta...
      prokaryotic translation critical efficient tra...
      ribosome binding-sites translation synthetic-b...
    
    
      1
      2
      rnase contamination rna based experiments prev...
      anyone suggestions prevent rnase contamination...
      rna biochemistry
    
    
      2
      3
      lymphocyte sizes clustered two groups
      tortora writes principles anatomy physiology l...
      immunology cell-biology hematology
    
    
      3
      4
      long antibiotic dosed lb maintain good selection
      various people lab prepare liter lb add kanamy...
      cell-culture
    
    
      4
      5
      exon order always preserved splicing
      cases splicing machinery constructs mrna exons...
      splicing mrna spliceosome introns exons



In [18]:

    
df_list.dropna(axis=0,how='any',inplace=True)



In [24]:

    
df_concat = df_list['title'] + ' ' + df_list['content']



In [26]:

    
df_list['doc'] = df_concat.map(lambda d: d.replace('-','_'))



In [27]:

    
print "Total length = %s"%df_list.shape[0]









    



Total length = 86998



In [28]:

    
print "Length of null data = %s"%df_list[df_list['doc'].isnull()].shape[0]









    



Length of null data = 0



In [29]:

    
print "Length of null data = %s"%df_list[df_list['tags'].isnull()].shape[0]









    



Length of null data = 0

Number of unique tags in training datasets



In [10]:

    
df_list['tags'][0:10]









    Out[10]:





0    ribosome binding-sites translation synthetic-b...
1                                     rna biochemistry
2                   immunology cell-biology hematology
3                                         cell-culture
4              splicing mrna spliceosome introns exons
5                   dna biochemistry molecular-biology
6                                neuroscience synapses
7                                             plasmids
8    molecular-genetics gene-expression experimenta...
9                  evolution mitochondria chloroplasts
Name: tags, dtype: object



In [33]:

    
# Y = [item.replace("-","_").split(" ") for item in df_list['tags']]
all_tags = df_list['tags'].map(lambda d: d.replace('-','_'))



In [12]:

    
length_of_tags = pd.DataFrame(zip(*np.unique([len(tag) for tag in Y],return_counts=True)),columns=['length','count'])



In [13]:

    
%matplotlib inline
# ax = plt.subplot()
# ax.plot(length_of_tags['length'],label='')
ax = length_of_tags.set_index("length").plot(kind='bar',rot=0,title="Length of tags")



In [ ]:

Analyze type of data



In [30]:

    
type_of_tags = []

def get_post_tag(tags):
    return np.array(nltk.pos_tag(tags.split(' ')))[:,1]

for tag in df_list.tags:
    type_of_tags.extend(get_post_tag(tag))

count_tag = pd.DataFrame(zip(*np.unique(type_of_tags,return_counts=True)),columns=['tag','count'])



In [31]:

    
%matplotlib inline
count_tag.sort_values(['count'],ascending=False).iloc[0:10].set_index(['tag']).plot(kind='bar',title="Type of tag",rot=0)









    Out[31]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd1a9940f90>

Number of tags in title



In [ ]:

    
def remove_number(demo):
    a = []
    for f in demo.split(' '):
        if not(f.isdigit()):
            a.append(f)
    return a



In [94]:

    
# token_pattern='^[0-9]*'
word_title = [remove_number(item) for item in df_list['title']]
title_dict = list(set(sum(word_title,[])))



In [95]:

    
title_dict = set(title_dict)



In [96]:

    
number_of_tags_in_title = title_dict.intersection(tags)



In [97]:

    
print "Ratio of tags in title = %s %%"%(len(number_of_tags_in_title)*100/len(all_tags))









    



Ratio of tags in title = 1 %

TF-IDF + LDA



In [35]:

    
tfidfVec = TfidfVectorizer(max_df=0.95)
countVec = CountVectorizer(max_df=0.95,min_df=2,ngram_range=(1,2),token_pattern='[a-zA-Z]{2,}')



In [36]:

    
df_tf = countVec.fit_transform(df_list['doc'])



In [37]:

    
n_topics = 6



In [44]:

    
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=4111,n_jobs=2)
lda.fit(df_tf)









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-44-b4ebe801813f> in <module>()
      3                                 learning_offset=50.,
      4                                 random_state=4111,n_jobs=2)
----> 5 lda.fit(df_tf)

/usr/local/lib/python2.7/dist-packages/sklearn/decomposition/online_lda.pyc in fit(self, X, y)
    521                     for idx_slice in gen_batches(n_samples, batch_size):
    522                         self._em_step(X[idx_slice, :], total_samples=n_samples,
--> 523                                       batch_update=False, parallel=parallel)
    524                 else:
    525                     # batch update

/usr/local/lib/python2.7/dist-packages/sklearn/decomposition/online_lda.pyc in _em_step(self, X, total_samples, batch_update, parallel)
    408         # E-step
    409         _, suff_stats = self._e_step(X, cal_sstats=True, random_init=True,
--> 410                                      parallel=parallel)
    411 
    412         # M-step

/usr/local/lib/python2.7/dist-packages/sklearn/decomposition/online_lda.pyc in _e_step(self, X, cal_sstats, random_init, parallel)
    361                                               self.mean_change_tol, cal_sstats,
    362                                               random_state)
--> 363             for idx_slice in gen_even_slices(X.shape[0], n_jobs))
    364 
    365         # merge result

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.pyc in retrieve(self)
    716                     # scheduling.
    717                     ensure_ready = self._managed_backend
--> 718                     backend.abort_everything(ensure_ready=ensure_ready)
    719                 raise exception
    720 

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/_parallel_backends.pyc in abort_everything(self, ensure_ready)
    144         if ensure_ready:
    145             self.configure(n_jobs=self.parallel.n_jobs, parallel=self.parallel,
--> 146                            **self.parallel._backend_args)
    147 
    148 

/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/_parallel_backends.pyc in configure(self, n_jobs, parallel, **backend_args)
    308 
    309         # Make sure to free as much memory as possible before forking
--> 310         gc.collect()
    311         self._pool = MemmapingPool(n_jobs, **backend_args)
    312         self.parallel = parallel

KeyboardInterrupt:



In [ ]:

    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()



In [ ]:

    
print("\nTopics in LDA model:")
n_top_words = 30
tf_feature_names = countVec.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)



In [ ]:

    
idx = lda.transform(df_tf[0]).argmax()

# words_topic = zip(*(lda.components_[3],countVec.get_feature_names()))
words_topic = lda.components_[idx]
feature_names = countVec.get_feature_names()
n_words = 100

print [feature_names[i] for i in words_topic.argsort()[:-n_words - 1:-1]]



In [42]:

    
tokenizer = countVec.build_analyzer()
a = tokenizer(df_list['title'].iloc[0])
b = [(c,words_topic[countVec.vocabulary_[c]]) for c in a if countVec.vocabulary_.has_key(c)]



In [43]:

    
df_list['tags'].iloc[0]









    Out[43]:





u'ribosome binding-sites translation synthetic-biology'



In [47]:

	id	title	content	tags
0	1	criticality ribosome binding site relative sta...	prokaryotic translation critical efficient tra...	ribosome binding-sites translation synthetic-b...
1	2	rnase contamination rna based experiments prev...	anyone suggestions prevent rnase contamination...	rna biochemistry
2	3	lymphocyte sizes clustered two groups	tortora writes principles anatomy physiology l...	immunology cell-biology hematology
3	4	long antibiotic dosed lb maintain good selection	various people lab prepare liter lb add kanamy...	cell-culture
4	5	exon order always preserved splicing	cases splicing machinery constructs mrna exons...	splicing mrna spliceosome introns exons