In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
train_bio = pd.read_csv("input_light/biology.csv",encoding='utf-8')
train_cooking = pd.read_csv("input_light/cooking.csv",encoding='utf-8')
train_crypto = pd.read_csv("input_light/crypto.csv",encoding='utf-8')
train_diy = pd.read_csv("input_light/diy.csv",encoding='utf-8')
train_robotic = pd.read_csv("input_light/robotics.csv",encoding='utf-8')
train_travel = pd.read_csv("input_light/travel.csv",encoding='utf-8')
# test_df = pd.read_csv("input_light/test.csv",encoding='utf-8')

In [12]:
df_list = pd.concat([train_bio,train_cooking,train_crypto,train_diy,train_robotic,train_travel])

Length of dataset
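
Per-site row counts are a quick sanity check before working with the combined frame; a minimal sketch using the dataframes loaded above:

In [ ]:
# Sketch: rows contributed by each StackExchange site, and the combined total
for name, df in [('biology', train_bio), ('cooking', train_cooking),
                 ('crypto', train_crypto), ('diy', train_diy),
                 ('robotics', train_robotic), ('travel', train_travel)]:
    print "%s: %s" % (name, df.shape[0])
print "combined: %s" % df_list.shape[0]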


In [13]:
df_list.head()


Out[13]:
id title content tags
0 1 criticality ribosome binding site relative sta... prokaryotic translation critical efficient tra... ribosome binding-sites translation synthetic-b...
1 2 rnase contamination rna based experiments prev... anyone suggestions prevent rnase contamination... rna biochemistry
2 3 lymphocyte sizes clustered two groups tortora writes principles anatomy physiology l... immunology cell-biology hematology
3 4 long antibiotic dosed lb maintain good selection various people lab prepare liter lb add kanamy... cell-culture
4 5 exon order always preserved splicing cases splicing machinery constructs mrna exons... splicing mrna spliceosome introns exons

In [18]:
df_list.dropna(axis=0,how='any',inplace=True)

In [24]:
df_concat = df_list['title'] + ' ' + df_list['content']

In [26]:
df_list['doc'] = df_concat.map(lambda d: d.replace('-','_'))
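
The hyphen-to-underscore substitution is presumably there to normalise hyphenated terms such as binding-sites before vectorisation; a quick sketch to eyeball the effect on the first row:

In [ ]:
# Sketch: compare the raw concatenated text with the underscore-normalised doc
print df_concat.iloc[0][:80]
print df_list['doc'].iloc[0][:80]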

In [27]:
print "Total length = %s"%df_list.shape[0]


Total length = 86998

In [28]:
print "Length of null data = %s"%df_list[df_list['doc'].isnull()].shape[0]


Length of null data = 0

In [29]:
print "Length of null data = %s"%df_list[df_list['tags'].isnull()].shape[0]


Length of null data = 0

Number of unique tags in the training datasets


In [10]:
df_list['tags'][0:10]


Out[10]:
0    ribosome binding-sites translation synthetic-b...
1                                     rna biochemistry
2                   immunology cell-biology hematology
3                                         cell-culture
4              splicing mrna spliceosome introns exons
5                   dna biochemistry molecular-biology
6                                neuroscience synapses
7                                             plasmids
8    molecular-genetics gene-expression experimenta...
9                  evolution mitochondria chloroplasts
Name: tags, dtype: object

In [33]:
# Y = [item.replace("-","_").split(" ") for item in df_list['tags']]
all_tags = df_list['tags'].map(lambda d: d.replace('-','_'))
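
The heading above asks for the number of unique tags, which the notebook never actually prints; a minimal sketch, counting distinct space-separated tag tokens:

In [ ]:
# Sketch: number of distinct tag tokens across the combined datasets
print "Number of unique tags = %s" % len(set(tag for row in all_tags for tag in row.split(' ')))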

In [12]:
# Distribution of the number of tags attached to each question
length_of_tags = pd.DataFrame(zip(*np.unique([len(tags.split(' ')) for tags in all_tags],return_counts=True)),columns=['length','count'])

In [13]:
%matplotlib inline
# ax = plt.subplot()
# ax.plot(length_of_tags['length'],label='')
ax = length_of_tags.set_index("length").plot(kind='bar',rot=0,title="Length of tags")
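
A small numeric summary to go with the bar chart (a sketch, derived from the same all_tags series):

In [ ]:
# Sketch: average and maximum number of tags per question
n_tags_per_question = all_tags.map(lambda d: len(d.split(' ')))
print "Mean tags per question = %.2f, max = %d" % (n_tags_per_question.mean(), n_tags_per_question.max())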



In [ ]:

Analyze the type (part of speech) of the tags


In [30]:
type_of_tags = []

def get_pos_tag(tags):
    # Part-of-speech label (NN, JJ, ...) for each tag token
    return np.array(nltk.pos_tag(tags.split(' ')))[:,1]

for tag in df_list.tags:
    type_of_tags.extend(get_pos_tag(tag))

count_tag = pd.DataFrame(zip(*np.unique(type_of_tags,return_counts=True)),columns=['tag','count'])

In [31]:
%matplotlib inline
count_tag.sort_values(['count'],ascending=False).iloc[0:10].set_index(['tag']).plot(kind='bar',title="Type of tag",rot=0)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd1a9940f90>

Tags that also appear in question titles


In [ ]:
def remove_number(demo):
    # Drop tokens that are pure digits, keep everything else
    return [f for f in demo.split(' ') if not f.isdigit()]
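
A quick usage example on a made-up title string (illustrative only):

In [ ]:
# Example: digit-only tokens are dropped, mixed tokens such as '2x' survive
print remove_number(u"lb agar plates 37 degrees 2x selection")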

In [94]:
# token_pattern='^[0-9]*'
word_title = [remove_number(item) for item in df_list['title']]
title_dict = list(set(sum(word_title,[])))

In [95]:
title_dict = set(title_dict)

In [96]:
unique_tag_tokens = set(' '.join(df_list['tags']).split(' '))
number_of_tags_in_title = title_dict.intersection(unique_tag_tokens)

In [97]:
print "Ratio of tags in title = %s %%"%(len(number_of_tags_in_title)*100/len(all_tags))


Ratio of tags in title = 1 %

TF-IDF + LDA


In [35]:
tfidfVec = TfidfVectorizer(max_df=0.95)
countVec = CountVectorizer(max_df=0.95,min_df=2,ngram_range=(1,2),token_pattern='[a-zA-Z]{2,}')
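
A small sketch of what the CountVectorizer analyzer produces for a sample phrase: lower-cased alphabetic tokens of at least two letters, plus their bigrams (the phrase itself is illustrative):

In [ ]:
# Sketch: unigrams and bigrams emitted by the CountVectorizer analyzer
print countVec.build_analyzer()(u"prevent RNase contamination in RNA experiments")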

In [36]:
df_tf = countVec.fit_transform(df_list['doc'])
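
A quick check of the resulting document-term matrix (a sketch):

In [ ]:
# Sketch: documents x vocabulary size of the count matrix
print "Document-term matrix shape = %s x %s" % df_tf.shape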

In [37]:
n_topics = 6

In [44]:
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=4111,n_jobs=2)
lda.fit(df_tf)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-44-b4ebe801813f> in <module>()
----> 5 lda.fit(df_tf)

[... sklearn online_lda / joblib frames elided ...]

KeyboardInterrupt: 
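
The fit above was interrupted by hand, while the cells below assume a fitted model; a sketch of a single-process retry with the same hyper-parameters (the assumption being that dropping the joblib workers is enough to let it run to completion):

In [ ]:
# Sketch: refit without parallel workers so the online EM step can finish
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=4111, n_jobs=1)
lda.fit(df_tf)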

In [ ]:
def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted words for every topic
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
    print

In [ ]:
print("\nTopics in LDA model:")
n_top_words = 30
tf_feature_names = countVec.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

In [ ]:
# Index of the dominant topic for the first document
idx = lda.transform(df_tf[0]).argmax()

# words_topic = zip(*(lda.components_[3],countVec.get_feature_names()))
words_topic = lda.components_[idx]
feature_names = countVec.get_feature_names()
n_words = 100

print [feature_names[i] for i in words_topic.argsort()[:-n_words - 1:-1]]

In [42]:
tokenizer = countVec.build_analyzer()
a = tokenizer(df_list['title'].iloc[0])
# Topic weight of every title token that made it into the vocabulary
b = [(c,words_topic[countVec.vocabulary_[c]]) for c in a if c in countVec.vocabulary_]

In [43]:
df_list['tags'].iloc[0]


Out[43]:
u'ribosome binding-sites translation synthetic-biology'
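
A minimal sketch of turning the (token, topic-weight) pairs from In [42] into candidate tags for this question, to eyeball against the true tags above (the cut-off of three tokens is an arbitrary assumption):

In [ ]:
# Sketch: highest-weighted title tokens as candidate tags for the first question
print [w for w, _ in sorted(b, key=lambda wt: wt[1], reverse=True)[:3]]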

In [47]: