In [61]:
import numpy as np
import pandas as pd
import IPython.display
from six.moves import cPickle as pickle
from tqdm import tqdm
tqdm.pandas()
from IPython.display import display


def maybe_pickle(file_name, load_dataset, force=False):
    pickle_file_name = "pickle/2_" + file_name + ".pickle"
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
        
    if os.path.exists(pickle_file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % pickle_file_name)
    else:
        print('Pickling %s.' % pickle_file_name)
        dataset = load_dataset(None)
        try:
            with open(pickle_file_name, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    
    return pickle_file_name

def load_data(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    pickle_file_name = maybe_pickle(file_name, lambda x: pd.read_csv(original_file_path), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [62]:
biology = load_data("biology")
cooking = load_data("cooking")
crypto = load_data("crypto")
diy = load_data("diy")
robotics = load_data("robotics")
travel = load_data("travel")


pickle/2_biology.pickle already present - Skipping pickling.
pickle/2_cooking.pickle already present - Skipping pickling.
pickle/2_crypto.pickle already present - Skipping pickling.
pickle/2_diy.pickle already present - Skipping pickling.
pickle/2_robotics.pickle already present - Skipping pickling.
pickle/2_travel.pickle already present - Skipping pickling.

In [103]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
# build the stop word set once; membership checks against a set are much faster than against a list
english_stopwords = set(stopwords.words('english'))
punctuation_trans_table = str.maketrans({key: None for key in string.punctuation})
html_tag_regex = re.compile('<.*?>')
code_tag_regex = re.compile('<code>([^<]+)</code>', re.S)
a_tag_regex = re.compile('<a href([^<]+)</a>', re.S)

def cleaning_text(text):
    original_text_length = len(text)
    number_of_html_tag = len(re.findall(html_tag_regex, text))
    number_of_code_fragments = len(re.findall(code_tag_regex, text))
    number_of_a_href = len(re.findall(a_tag_regex, text))
    
    # convert to lowercase
    text = text.lower()
    # replace code fragments with a placeholder token
    text = re.sub(code_tag_regex, 'code_tag', text)
    # remove html tags
    text = re.sub(html_tag_regex, '', text)
    # remove \r and \n
    text = text.replace('\n', ' ').replace('\r', '')
    # remove punctuation
    text = text.translate(punctuation_trans_table)
    # split into tokens
    words = word_tokenize(text)
    # remove stop words
    words = [word for word in words if word not in english_stopwords]
    # lemmatizing, stemming
    #words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    #words = [stemmer.stem(word) for word in words]
    # join
    text = ' '.join(words)
    
    number_of_cleaned_text_tokens = len(words)
    cleaned_text_length = len(text)
    return text, [original_text_length, number_of_html_tag, number_of_code_fragments, number_of_a_href, number_of_cleaned_text_tokens, cleaned_text_length]


def cleaning(row):
    row['title'], title_meta_list = cleaning_text(row['title'])
    row['title_original_text_length'] = title_meta_list[0]
    row['title_number_of_cleaned_text_tokens'] = title_meta_list[4]
    row['title_cleaned_text_length'] = title_meta_list[5]
    
    row['content'], content_meta_list = cleaning_text(row['content'])
    row['content_original_text_length'] = content_meta_list[0]
    row['content_number_of_html_tag'] = content_meta_list[1]
    row['content_number_of_code_fragments'] = content_meta_list[2]
    row['content_number_of_a_href'] = content_meta_list[3]
    row['content_number_of_cleaned_text_tokens'] = content_meta_list[4]
    row['content_cleaned_text_length'] = content_meta_list[5]
    
    #row['cleaned_tags'], content_meta_list = cleaning_text(row['tags'])
    return row
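
A quick sanity check of cleaning_text on a small hand-written snippet (the sample string below is made up for illustration and is not taken from the dataset):

In [ ]:
# made-up example: the <code> block should become the code_tag placeholder,
# html tags and punctuation should disappear, and stop words should be dropped
sample_html = '<p>How do I parse <code>mRNA</code> sequences?\nSee <a href="http://example.com">this</a>.</p>'
cleaned_text, meta = cleaning_text(sample_html)
# meta is [original_text_length, number_of_html_tag, number_of_code_fragments,
#          number_of_a_href, number_of_cleaned_text_tokens, cleaned_text_length]
print(cleaned_text)
print(meta)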

In [107]:
def load_cleaned_df(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    df = pd.read_csv(original_file_path)
    print("total len : %d" % len(df))
    result_df = df.progress_apply(cleaning, axis=1)
    
    # feature scaling for meta columns (MinMaxScaler scales each column independently
    # and expects 2-D input, so pass the columns together as a frame)
    from sklearn import preprocessing
    meta_columns = ['title_original_text_length', 'title_number_of_cleaned_text_tokens',
                    'title_cleaned_text_length', 'content_original_text_length',
                    'content_number_of_html_tag', 'content_number_of_code_fragments',
                    'content_number_of_a_href', 'content_number_of_cleaned_text_tokens',
                    'content_cleaned_text_length']
    min_max_scaler = preprocessing.MinMaxScaler()
    result_df[meta_columns] = min_max_scaler.fit_transform(result_df[meta_columns])
    
    return result_df
    
def maybe_pickle_cleaned_df(file_name, force=False):
    pickle_file_name = maybe_pickle(file_name + "_cleaned", lambda x: load_cleaned_df(file_name), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [65]:
biology_cleaned_df = maybe_pickle_cleaned_df('biology')
cooking_cleaned_df = maybe_pickle_cleaned_df('cooking')
crypto_cleaned_df = maybe_pickle_cleaned_df('crypto')
diy_cleaned_df = maybe_pickle_cleaned_df('diy')
robotics_cleaned_df = maybe_pickle_cleaned_df('robotics')
travel_cleaned_df = maybe_pickle_cleaned_df('travel')


pickle/2_biology_cleaned.pickle already present - Skipping pickling.
pickle/2_cooking_cleaned.pickle already present - Skipping pickling.
pickle/2_crypto_cleaned.pickle already present - Skipping pickling.
pickle/2_diy_cleaned.pickle already present - Skipping pickling.
pickle/2_robotics_cleaned.pickle already present - Skipping pickling.
pickle/2_travel_cleaned.pickle already present - Skipping pickling.

In [76]:
full_df = pd.concat([biology_cleaned_df, 
                     cooking_cleaned_df, 
                     crypto_cleaned_df, 
                     diy_cleaned_df, 
                     robotics_cleaned_df, 
                     travel_cleaned_df]).reset_index(drop=True)
print(len(full_df))


87000

In [67]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
stop_words = text.ENGLISH_STOP_WORDS

full_df_vectorizer = TfidfVectorizer(stop_words=stop_words)
full_df_vectors = full_df_vectorizer.fit_transform((full_df['title'] + " " + full_df['content']).tolist())
print(len(full_df_vectorizer.get_feature_names()))


151721

In [68]:
full_df_title_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.95, min_df=2, max_features=4000)
full_df_title_vectors = full_df_title_vectorizer.fit_transform((full_df['title']).tolist())
print(len(full_df_title_vectorizer.get_feature_names()))


4000

In [69]:
# code from https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
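    # keep only candidates whose tf-idf weight exceeds 0.3, so weak matches are not proposed as tags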
    top_feats = [(features[i], row[i]) for i in topn_ids if row[i] > 0.3]
    df = pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def predict_tags(vectors, vectorizer, index):
    tfidf_df = top_feats_in_doc(vectors, vectorizer.get_feature_names(), index)
    return ' '.join(tfidf_df['feature'])

full_df_title_vectors2 = full_df_title_vectorizer.transform((full_df['title'] + ' ' + full_df['content']).tolist())
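
As a quick spot check (a minimal sketch; row 1 is just an arbitrary example), the tags proposed from tf-idf weights can be printed next to the labelled tags for a single question:

In [ ]:
print(predict_tags(full_df_title_vectors2, full_df_title_vectorizer, 1))
print(full_df.loc[1, 'tags'])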

In [80]:
# return TP, FP, FN
def compare_two_corpus(actual, predicted):
    if len(actual) == 0:
        if len(predicted) == 0:
            return 0, 0, 0
        else:
            return 0, len(predicted.split()), 0
    else:
        if len(predicted) == 0:
            return 0, 0, len(actual.split())
        else:
            actual_words = actual.split()
            predicted_words = predicted.split()
            tp = 0
            fp = 0
            fn = 0
            for actual_word in actual_words:
                if actual_word in predicted_words:
                    tp += 1
                else:
                    fn += 1
            for predicted_word in predicted_words:
                if predicted_word in actual_words:
                    pass
                else:
                    fp += 1
            
            return tp, fp, fn

def calculate_precision(tp, fp, fn):
    return tp/float(tp + fp)


def calculate_recall(tp, fp, fn):
    return tp/float(tp + fn)
        
    
def calculate_f1_score(tp, fp, fn, print_result=False):
    if tp == 0:
        if print_result:
            print('tp : %d, fp : %d, fn : %d, precision : %f, recall : %f, f1_score : %f' % (tp, fp, fn, 0., 0., 0.))
        return 0
    precision = calculate_precision(tp, fp, fn)
    recall = calculate_recall(tp, fp, fn)
    f1_score = 2*precision*recall/(precision + recall)
    if print_result:
        print('tp : %d, fp : %d, fn : %d, precision : %f, recall : %f, f1_score : %f' % (tp, fp, fn, precision, recall, f1_score))
    return f1_score
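
A small worked example of the scoring helpers: with the actual tags 'rna biochemistry' (the tags of the question at index 1) and a hypothetical prediction 'rna dna', there is one true positive, one false positive and one false negative, so precision = recall = 0.5 and F1 = 0.5.

In [ ]:
# toy check of the scoring helpers; 'rna dna' is a made-up prediction
tp, fp, fn = compare_two_corpus('rna biochemistry', 'rna dna')
calculate_f1_score(tp, fp, fn, print_result=True)  # expect tp=1, fp=1, fn=1, f1=0.5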

In [81]:
print(full_df.loc[1, 'tags'])


rna biochemistry

In [94]:
def predict_and_scoring(df, vector, vectorizer, index):
    predicted_tags_by_title_content = predict_tags(vector, vectorizer, index)
    actual_tags = df.loc[index, 'tags']
    tp, fp, fn = compare_two_corpus(actual_tags, predicted_tags_by_title_content)
    df.loc[index, 'predicted_tags'] = predicted_tags_by_title_content
    df.loc[index, 'score_tp'] = tp
    df.loc[index, 'score_fp'] = fp
    df.loc[index, 'score_fn'] = fn
    if index % 1000 == 0:
        print("%drow finished..." % index)

for i in range(0,len(full_df)):
    predict_and_scoring(full_df, full_df_title_vectors2, full_df_title_vectorizer, i)
    #calculate_f1_score(tp, fp, fn)


0row finished...
1000row finished...
...
86000row finished...

In [95]:
print(calculate_f1_score(sum(full_df['score_tp']), sum(full_df['score_fp']), sum(full_df['score_fn'])))


0.152494858297

In [109]:
# for submit
test = load_data("test")
test_cleaned_df = maybe_pickle_cleaned_df('test')


pickle/2_test.pickle already present - Skipping pickling.
pickle/2_test_cleaned.pickle already present - Skipping pickling.

In [ ]: