Tested on real and large data (over 10 million rows) with over 95% accuracy. Removing the dataset presently as it is under non-disclaimer. Uploading a sample data presently but the algorithm is tested on real data and performs well. This notebook shows 100% accuracy but that is because the dataset used is small and self-created.


In [1]:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics import classification_report
import os.path
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier


/usr/local/lib/python2.7/dist-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

In [2]:
df = pd.read_csv(open('sample.csv'))

In [3]:
def vectorize(df):
    vectorizer = TfidfVectorizer(stop_words="english",min_df=10,max_features=50)
    vectorizer.fit(df['title']+df['message'])
    pickle.dump(file=open("vector.pkl","w"),obj=vectorizer)

In [4]:
def cluster(df,number_of_clusters,batch_size,max_iter):
    vz = pickle.load(open('vector.pkl',"r"))
    data_points = len(df)
    class_val = []
    interia_list = []
    z = MiniBatchKMeans(n_clusters=number_of_clusters,batch_size=batch_size,max_iter=max_iter)
    for i in xrange(0,data_points,batch_size):
        features = vz.transform(df[i:i+batch_size]['title']+df[i:i+batch_size]['message'])
        z.partial_fit(features)
        interia_list.append(z.inertia_)
        class_val = class_val + list(z.labels_)
    new_df = df[0:data_points]
    new_df['class'] = class_val
    new_df.to_csv('ClusteredData.csv', index=False)

In [5]:
def classify(clusteredDataFile,test_size):
    vz = pickle.load(open('vector.pkl',"r"))
    clusteredData = pd.read_csv(clusteredDataFile)
    all_features = vz.transform(clusteredData['title']+clusteredData['message'])
    X_train, X_test, y_train, y_test = train_test_split(all_features,clusteredData['class'],test_size=test_size)
    clf = SGDClassifier()
    clf.fit(X_train,y_train)
    y_true,y_pred = y_test,clf.predict(X_test)
    pickle.dump(file=open("classifier.pkl","w"),obj=clf)

In [6]:
def get_classification_report(test_size):
    clusteredData = pd.read_csv('ClusteredData.csv')
    vz = pickle.load(open('vector.pkl',"r"))
    all_features = vz.transform(clusteredData['title']+clusteredData['message'])
    X_train, X_test, y_train, y_test = train_test_split(all_features,clusteredData['class'],test_size=test_size)
    clf = pickle.load(open("classifier.pkl","r"))
    report = classification_report(y_test,clf.predict(X_test), digits=10)
    print report

In [7]:
vectorize(df)

In [8]:
cluster(df,3,10,50)

In [9]:
classify('ClusteredData.csv',0.05)

In [11]:
get_classification_report(0.5)


             precision    recall  f1-score   support

          0  1.0000000000 1.0000000000 1.0000000000         3
          1  1.0000000000 1.0000000000 1.0000000000         4
          2  1.0000000000 1.0000000000 1.0000000000         3

avg / total  1.0000000000 1.0000000000 1.0000000000        10