Tested on real, large data (over 10 million rows) with over 95% accuracy. The real dataset has been removed because it is covered by a non-disclosure agreement; a sample dataset is uploaded in its place, but the algorithm was tested on the real data and performs well. This notebook shows 100% accuracy only because the dataset used here is small and self-created.
In [1]:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics import classification_report
import os.path
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
In [2]:
df = pd.read_csv('sample.csv')
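The pipeline assumes the CSV has at least two text columns, title and message, which are concatenated before vectorizing. A made-up sample.csv illustrating the expected shape (these rows are invented for illustration, not taken from the real data):

title,message
Login failure,Cannot log in after resetting my password
Payment error,Card was declined during checkout
Login failure,The two-factor code never arrives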
In [3]:
def vectorize(df):
    # Learn a TF-IDF vocabulary over the combined title and message text.
    vectorizer = TfidfVectorizer(stop_words="english", min_df=10, max_features=50)
    vectorizer.fit(df['title'] + df['message'])
    # Persist the fitted vectorizer; pickle files must be opened in binary mode.
    pickle.dump(vectorizer, open("vector.pkl", "wb"))
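To sanity-check what was learned, the pickled vectorizer can be reloaded and its vocabulary inspected. A quick sketch (get_feature_names_out requires scikit-learn >= 1.0; older releases call it get_feature_names):

vz = pickle.load(open('vector.pkl', 'rb'))
print(vz.get_feature_names_out()[:10])  # a peek at the (at most) 50 learned terms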
In [4]:
def cluster(df, number_of_clusters, batch_size, max_iter):
    vz = pickle.load(open('vector.pkl', "rb"))
    data_points = len(df)
    class_val = []
    inertia_list = []
    z = MiniBatchKMeans(n_clusters=number_of_clusters, batch_size=batch_size, max_iter=max_iter)
    # Stream the rows through the model batch by batch so the full feature
    # matrix never has to be held in memory at once.
    for i in range(0, data_points, batch_size):
        features = vz.transform(df[i:i + batch_size]['title'] + df[i:i + batch_size]['message'])
        z.partial_fit(features)
        inertia_list.append(z.inertia_)
        # labels_ holds the assignments for the most recent batch only.
        class_val = class_val + list(z.labels_)
    new_df = df[0:data_points].copy()  # copy to avoid SettingWithCopyWarning
    new_df['class'] = class_val
    new_df.to_csv('ClusteredData.csv', index=False)
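number_of_clusters is fixed by the caller; one common way to pick it is the elbow method on the final inertia. A rough sketch, assuming the whole sample fits in memory and that matplotlib is available (neither is guaranteed above):

import matplotlib.pyplot as plt

vz = pickle.load(open('vector.pkl', 'rb'))
features = vz.transform(df['title'] + df['message'])
ks = range(2, 10)
inertias = [MiniBatchKMeans(n_clusters=k).fit(features).inertia_ for k in ks]
plt.plot(list(ks), inertias, marker='o')
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.show()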
In [5]:
def classify(clusteredDataFile, test_size):
    vz = pickle.load(open('vector.pkl', "rb"))
    clusteredData = pd.read_csv(clusteredDataFile)
    all_features = vz.transform(clusteredData['title'] + clusteredData['message'])
    X_train, X_test, y_train, y_test = train_test_split(all_features, clusteredData['class'], test_size=test_size)
    # Train a linear classifier on the cluster ids produced by cluster().
    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    pickle.dump(clf, open("classifier.pkl", "wb"))
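GridSearchCV is imported at the top but never used; it could tune the SGDClassifier before pickling. A hedged sketch of what that might look like inside classify (the grid values are arbitrary choices, not tuned for this data):

param_grid = {'alpha': [1e-4, 1e-3, 1e-2], 'loss': ['hinge', 'modified_huber']}
gs = GridSearchCV(SGDClassifier(), param_grid, cv=3)
gs.fit(X_train, y_train)
clf = gs.best_estimator_  # pickle this instead of the default model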
In [6]:
def get_classification_report(test_size):
    clusteredData = pd.read_csv('ClusteredData.csv')
    vz = pickle.load(open('vector.pkl', "rb"))
    all_features = vz.transform(clusteredData['title'] + clusteredData['message'])
    # Note: this re-splits the data independently of classify(), so some of
    # these "test" rows may have been seen during training.
    X_train, X_test, y_train, y_test = train_test_split(all_features, clusteredData['class'], test_size=test_size)
    clf = pickle.load(open("classifier.pkl", "rb"))
    report = classification_report(y_test, clf.predict(X_test), digits=10)
    print(report)
In [7]:
vectorize(df)
In [8]:
cluster(df,3,10,50)
In [9]:
classify('ClusteredData.csv',0.05)
In [11]:
get_classification_report(0.5)
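Once vector.pkl and classifier.pkl exist, a new item can be labeled without re-running the clustering. A minimal sketch (the title and message strings are invented):

vz = pickle.load(open('vector.pkl', 'rb'))
clf = pickle.load(open('classifier.pkl', 'rb'))
text = 'Login failure' + ' ' + 'Cannot log in after resetting my password'  # title + message
print(clf.predict(vz.transform([text])))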