In [61]:
import os
import numpy as np
import pandas as pd
from six.moves import cPickle as pickle
from tqdm import tqdm
tqdm.pandas()
from IPython.display import display

def maybe_pickle(file_name, load_dataset, force=False):
    pickle_file_name = "pickle/2_" + file_name + ".pickle"
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
    if os.path.exists(pickle_file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % pickle_file_name)
    else:
        print('Pickling %s.' % pickle_file_name)
        dataset = load_dataset(None)
        try:
            with open(pickle_file_name, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    return pickle_file_name

def load_data(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    pickle_file_name = maybe_pickle(file_name, lambda x: pd.read_csv(original_file_path), force)
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)
In [62]:
biology = load_data("biology")
cooking = load_data("cooking")
crypto = load_data("crypto")
diy = load_data("diy")
robotics = load_data("robotics")
travel = load_data("travel")
In [103]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))
punctuation_trans_table = str.maketrans({key: None for key in string.punctuation})
html_tag_regex = re.compile('<.*?>')
code_tag_regex = re.compile('<code>([^<]+)</code>', re.S)
a_tag_regex = re.compile('<a href([^<]+)</a>', re.S)

def cleaning_text(text):
    original_text_length = len(text)
    number_of_html_tag = len(re.findall(html_tag_regex, text))
    number_of_code_fragments = len(re.findall(code_tag_regex, text))
    number_of_a_href = len(re.findall(a_tag_regex, text))
    # convert to lowercase
    text = text.lower()
    # replace code fragments with a placeholder token
    text = re.sub(code_tag_regex, 'code_tag', text)
    # remove html tags
    text = re.sub(html_tag_regex, '', text)
    # remove \r and \n
    text = text.replace('\n', ' ').replace('\r', '')
    # remove punctuation
    text = text.translate(punctuation_trans_table)
    # tokenize
    words = word_tokenize(text)
    # remove stop words
    words = [word for word in words if word not in english_stopwords]
    # lemmatizing, stemming (disabled)
    #words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    #words = [stemmer.stem(word) for word in words]
    # join back into a single string
    text = ' '.join(words)
    number_of_cleaned_text_tokens = len(words)
    cleaned_text_length = len(text)
    return text, [original_text_length, number_of_html_tag, number_of_code_fragments, number_of_a_href,
                  number_of_cleaned_text_tokens, cleaned_text_length]

def cleaning(row):
    row['title'], title_meta_list = cleaning_text(row['title'])
    row['title_original_text_length'] = title_meta_list[0]
    row['title_number_of_cleaned_text_tokens'] = title_meta_list[4]
    row['title_cleaned_text_length'] = title_meta_list[5]
    row['content'], content_meta_list = cleaning_text(row['content'])
    row['content_original_text_length'] = content_meta_list[0]
    row['content_number_of_html_tag'] = content_meta_list[1]
    row['content_number_of_code_fragments'] = content_meta_list[2]
    row['content_number_of_a_href'] = content_meta_list[3]
    row['content_number_of_cleaned_text_tokens'] = content_meta_list[4]
    row['content_cleaned_text_length'] = content_meta_list[5]
    #row['cleaned_tags'], content_meta_list = cleaning_text(row['tags'])
    return row
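A quick sanity check of cleaning_text on a made-up HTML snippet (a sketch; the sample string is not from the dataset):

In [ ]:
sample = '<p>How do I <a href="http://example.com">bake</a> bread?</p> <code>oven.heat(220)</code>'
cleaned, meta = cleaning_text(sample)
print(cleaned)  # expected roughly: "bake bread codetag" (tags stripped, code replaced, stop words and punctuation removed)
print(meta)     # [original length, #html tags, #code fragments, #a hrefs, #cleaned tokens, cleaned length]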
In [107]:
def load_cleaned_df(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    df = pd.read_csv(original_file_path)
    print("total len : %d" % len(df))
    result_df = df.progress_apply(cleaning, axis=1)
    # feature scaling for meta columns
    from sklearn import preprocessing
    min_max_scaler = preprocessing.MinMaxScaler()
    meta_columns = ['title_original_text_length',
                    'title_number_of_cleaned_text_tokens',
                    'title_cleaned_text_length',
                    'content_original_text_length',
                    'content_number_of_html_tag',
                    'content_number_of_code_fragments',
                    'content_number_of_a_href',
                    'content_number_of_cleaned_text_tokens',
                    'content_cleaned_text_length']
    # MinMaxScaler expects a 2D input; scaling all meta columns in one call is
    # equivalent to scaling each column separately, since scaling is per-feature.
    result_df[meta_columns] = min_max_scaler.fit_transform(result_df[meta_columns])
    return result_df

def maybe_pickle_cleaned_df(file_name, force=False):
    pickle_file_name = maybe_pickle(file_name + "_cleaned", lambda x: load_cleaned_df(file_name), force)
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)
In [65]:
biology_cleaned_df = maybe_pickle_cleaned_df('biology')
cooking_cleaned_df = maybe_pickle_cleaned_df('cooking')
crypto_cleaned_df = maybe_pickle_cleaned_df('crypto')
diy_cleaned_df = maybe_pickle_cleaned_df('diy')
robotics_cleaned_df = maybe_pickle_cleaned_df('robotics')
travel_cleaned_df = maybe_pickle_cleaned_df('travel')
In [76]:
full_df = pd.concat([biology_cleaned_df,
                     cooking_cleaned_df,
                     crypto_cleaned_df,
                     diy_cleaned_df,
                     robotics_cleaned_df,
                     travel_cleaned_df]).reset_index(drop=True)
print(len(full_df))
In [67]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
stop_words = text.ENGLISH_STOP_WORDS
full_df_vectorizer = TfidfVectorizer(stop_words=stop_words)
full_df_vectors = full_df_vectorizer.fit_transform((full_df['title'] + " " + full_df['content']).tolist())
print(len(full_df_vectorizer.get_feature_names()))
In [68]:
full_df_title_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=0.95, min_df=2, max_features=4000)
full_df_title_vectors = full_df_title_vectorizer.fit_transform((full_df['title']).tolist())
print(len(full_df_title_vectorizer.get_feature_names()))
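A quick peek at the capped 4000-term title vocabulary (a sanity-check sketch, not part of the original pipeline):

In [ ]:
import random
# sample a handful of terms to eyeball what survived the min_df/max_df/max_features filtering
print(random.sample(full_df_title_vectorizer.get_feature_names(), 10))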
In [69]:
# code from https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row (keeping only values above 0.3) and return them with their corresponding feature names. '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids if row[i] > 0.3]
    df = pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in a specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def predict_tags(vectors, vectorizer, index):
    tfidf_df = top_feats_in_doc(vectors, vectorizer.get_feature_names(), index)
    return ' '.join(tfidf_df['feature'])

full_df_title_vectors2 = full_df_title_vectorizer.transform((full_df['title'] + ' ' + full_df['content']).tolist())
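A quick look at what the tf-idf heuristic produces for a single document (a sketch; row 0 is an arbitrary choice):

In [ ]:
# predicted tags = highest-scoring tf-idf terms above the 0.3 threshold, versus the actual tags
print(predict_tags(full_df_title_vectors2, full_df_title_vectorizer, 0))
print(full_df.loc[0, 'tags'])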
In [80]:
# return TP, FP, FN
def compare_two_corpus(actual, predicted):
    if len(actual) == 0:
        if len(predicted) == 0:
            return 0, 0, 0
        else:
            return 0, len(predicted.split()), 0
    else:
        if len(predicted) == 0:
            return 0, 0, len(actual.split())
        else:
            actual_words = actual.split()
            predicted_words = predicted.split()
            tp = 0
            fp = 0
            fn = 0
            for actual_word in actual_words:
                if actual_word in predicted_words:
                    tp += 1
                else:
                    fn += 1
            for predicted_word in predicted_words:
                if predicted_word not in actual_words:
                    fp += 1
            return tp, fp, fn

def calculate_precision(tp, fp, fn):
    return tp / float(tp + fp)

def calculate_recall(tp, fp, fn):
    return tp / float(tp + fn)

def calculate_f1_score(tp, fp, fn, print_result=False):
    if tp == 0:
        if print_result:
            print('tp : %d, fp : %d, fn : %d, precision : %f, recall : %f, f1_score : %f' % (tp, fp, fn, 0., 0., 0.))
        return 0
    precision = calculate_precision(tp, fp, fn)
    recall = calculate_recall(tp, fp, fn)
    f1_score = 2 * precision * recall / (precision + recall)
    if print_result:
        print('tp : %d, fp : %d, fn : %d, precision : %f, recall : %f, f1_score : %f' % (tp, fp, fn, precision, recall, f1_score))
    return f1_score
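A toy check of the scoring helpers (a sketch with made-up tag strings, not data from the corpus):

In [ ]:
tp, fp, fn = compare_two_corpus('baking bread yeast', 'baking oven')
print(tp, fp, fn)                            # 1 true positive, 1 false positive, 2 false negatives
print(calculate_f1_score(tp, fp, fn, True))  # precision 0.5, recall ~0.33, f1 0.4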
In [81]:
print(full_df.loc[1, 'tags'])
In [94]:
def predict_and_scoring(df, vector, vectorizer, index):
    predicted_tags_by_title_content = predict_tags(vector, vectorizer, index)
    actual_tags = df.loc[index, 'tags']
    tp, fp, fn = compare_two_corpus(actual_tags, predicted_tags_by_title_content)
    df.loc[index, 'predicted_tags'] = predicted_tags_by_title_content
    df.loc[index, 'score_tp'] = tp
    df.loc[index, 'score_fp'] = fp
    df.loc[index, 'score_fn'] = fn
    if index % 1000 == 0:
        print("%d rows finished..." % index)

for i in range(len(full_df)):
    predict_and_scoring(full_df, full_df_title_vectors2, full_df_title_vectorizer, i)
    #calculate_f1_score(tp, fp, fn)
In [95]:
print(calculate_f1_score(sum(full_df['score_tp']), sum(full_df['score_fp']), sum(full_df['score_fn'])))
In [109]:
# for submit
test = load_data("test")
test_cleaned_df = maybe_pickle_cleaned_df('test')
In [ ]: