Available on Perforce and GitHub
In [1]:
def readfile(self, filepath, filetype, encod='ISO-8859-1', header=None):
    logger.info('Start reading file')
    if not os.path.isfile(filepath):
        logger.error("File does not exist!")
        sys.exit()
    if filetype == 'csv':
        df = pd.read_csv(filepath, encoding=encod, header=header)
    elif filetype == 'json':
        df = pd.read_json(filepath, encoding=encod, lines=True)
    elif filetype == 'xlsx':
        df = pd.read_excel(filepath, encoding=encod, header=header)
    else:
        logger.error("Extension type not accepted!")
        sys.exit()
    self.data = df  # keep the frame for later steps such as processtext
    return df
def processtext(self, column, removeSymbol=True, remove_stopwords=False, stemming=False):
    logger.info("Start data cleaning...")
    # Collapse newlines, carriage returns and tabs into single spaces
    self.data[column] = self.data[column].str.replace(r'[\n\r\t]+', ' ')
    # Remove URLs
    self.data[column] = self.data[column].str.replace(self.regex_str[3], ' ')
    tempcol = self.data[column].values.tolist()
    stops = set(stopwords.words("english"))
    # This part takes a lot of time
    printProgressBar(0, len(tempcol), prefix='Progress:', suffix='Complete', length=50)
    for i in range(len(tempcol)):
        row = BeautifulSoup(tempcol[i], 'html.parser').get_text()
        if removeSymbol:
            row = re.sub('[^a-zA-Z0-9]', ' ', row)
        words = row.split()
        if remove_stopwords:
            words = [w for w in words if w not in stops and not w.replace('.', '', 1).isdigit()]
        row = ' '.join(words)
        tempcol[i] = row.lower()
        printProgressBar(i + 1, len(tempcol), prefix='Progress:', suffix='Complete', length=50)
    print("\n")
    return tempcol
In [5]:
from SentenceParserPython3 import SentenceParser as SP
test = SP(20)
sample_text = "I @#$@have a @#$@#$@#%dog @#%@$^#$()_+%at home"
test.processline(sample_text, True, True)
Out[5]:
Term Frequency, denoted by tf, is the number of occurrences of a term t in a document D.
Inverse Document Frequency of a term t, denoted by idf, is log(N/df), where N is the total number of documents in the space and df is the number of documents that contain t. It reduces the weight of terms that appear in many documents; in other words, a word with rare occurrences carries more weight.
TF-IDF = Term Frequency * Inverse Document Frequency
Inverse Document Frequency = log(N/df)
Vocabulary size: 10,000-100,000 is the range used in this project.
Note: TF-IDF returns a sparse matrix to reduce memory use, and K-Means supports sparse input. Sometimes we need to transform it into a dense array when we actually use it in calculations.
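As a quick sanity check on the formulas above, here is a minimal worked sketch with made-up counts (it follows the plain log(N/df) definition, not scikit-learn's smoothed variant):
In [ ]:
import math

tf = 3       # term t appears 3 times in document D (made-up count)
N = 1000     # total number of documents in the space
df = 10      # number of documents that contain term t
idf = math.log(N / df)
print(idf, tf * idf)   # ~4.61 and ~13.82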
In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

def TFIDF(text, size):
    print("Vectorizing text with TF-IDF...")
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word', strip_accents='unicode', max_features=size)
    X = vectorizer.fit_transform(text)
    return vectorizer, X
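A minimal usage sketch, assuming cleaned_text is the list returned by processtext and the 100,000-term vocabulary mentioned above:
In [ ]:
# cleaned_text is assumed to be the output of processtext
vectorizer, X = TFIDF(cleaned_text, 100000)
print(X.shape)   # (num_documents, 100000), stored as a scipy.sparse matrix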
In [8]:
import pickle
from sklearn.externals import joblib
vectorizer = joblib.load('model/MSD2016NowTFIDF.pkl')
In [15]:
sample = 'GFE Share Telemetry item for OSC Hotkey Toggle'
result = vectorizer.transform([sample])
result.toarray()
Out[15]:
In [18]:
import numpy as np
from gensim.models import word2vec

def W2V(text, size):
    # Split each cleaned document into a list of tokens
    sentences = []
    for idx in range(len(text)):
        sentences.append(text[idx].split())
    num_features = size
    min_word_count = 20
    num_workers = 4
    context = 10
    downsampling = 1e-3
    model_name = "./model/w2vec.model"
    model = word2vec.Word2Vec(sentences, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)
    model.init_sims(replace=True)
    return model

def Word2VecEmbed(text, model, num_features):
    # Represent each document as the average of its in-vocabulary word vectors
    worddict = {}
    for key in model.wv.vocab.keys():
        worddict[key] = model.wv.word_vec(key)
    X = []
    for idx in range(len(text)):
        words = text[idx].split()
        counter = 0
        temprow = np.zeros(num_features)
        for word in words:
            if word in worddict:
                counter += 1
                temprow += worddict[word]
        if counter != 0:
            X.append(temprow / counter)
        else:
            X.append(temprow)
    X = np.array(X)
    return X
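A minimal usage sketch, assuming cleaned_text is the output of processtext; 300 dimensions is an illustrative feature size, not necessarily the project's setting:
In [ ]:
w2v_model = W2V(cleaned_text, 300)                    # train embeddings on the cleaned corpus
X_w2v = Word2VecEmbed(cleaned_text, w2v_model, 300)   # average word vectors per document
print(X_w2v.shape)                                    # (num_documents, 300)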
Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components (or sometimes, principal modes of variation). The number of principal components is less than or equal to the smaller of the number of original variables or the number of observations. This transformation is defined in such a way that the first principal component has the largest possible variance (that is, accounts for as much of the variability in the data as possible), and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components. The resulting vectors are an uncorrelated orthogonal basis set. PCA is sensitive to the relative scaling of the original variables.
Dimensionality reduction using truncated SVD (aka LSA).
This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. This means it can work with scipy.sparse matrices efficiently.
In our model, we reduce the dimensionality from 100,000 to 6,000 and keep 77% of the variance.
In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

def DRN(X, DRN_size):
    print("Performing dimensionality reduction using LSA")
    svd = TruncatedSVD(DRN_size)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))
    return svd, X
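A minimal usage sketch matching the 100,000 to 6,000 reduction described above (variable names are assumptions):
In [ ]:
# X is assumed to be the sparse TF-IDF matrix produced by the TFIDF step
svd, X_reduced = DRN(X, 6000)   # on this dataset the SVD step reported roughly 77% explained variance
print(X_reduced.shape)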
Clustering is the task of grouping a set of objects in such a way that objects in the same group (called a cluster) are more similar (in some sense or another) to each other than to those in other groups (clusters). It is a main task of exploratory data mining, and a common technique for statistical data analysis, used in many fields, including machine learning, pattern recognition, image analysis, information retrieval, bioinformatics, data compression, and computer graphics.
The algorithm we currently use is standard K-Means without mini-batches; mini-batch K-Means did not work as well as standard K-Means on our dataset.
K-means implementation is based on "Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup". While it introduces some overhead and many conditional clauses which are bad for CUDA, it still shows 1.6-2x speedup against the Lloyd algorithm. K-nearest neighbors employ the same triangle inequality idea and require precalculated centroids and cluster assignments, similar to the flattened ball tree.
| Benchmarks | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPUs |
|---|---|---|---|---|---|---|
| speed | 1x | 4.5x | 8.2x | 15.5x | 17.8x | 29.8x |
| memory | 1x | 2x | 2x | 0.6x | 0.6x | 0.6x |
In [ ]:
from sklearn.cluster import KMeans

def kmtrain(X, num_clusters):
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=1, verbose=1)
    print("Clustering sparse data with %s" % km)
    km.fit(X)
    return km

from libKMCUDA import kmeans_cuda

def cudatrain(X, num_clusters):
    centroids, assignments = kmeans_cuda(X, num_clusters, verbosity=1, yinyang_t=0, seed=3)
    return centroids, assignments
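A minimal usage sketch for the two trainers; the cluster count is an illustrative assumption, not the setting used in the original run. kmcuda works on dense float32 arrays, so the LSA-reduced matrix is a natural input; the returned centroid and assign are the variables used in the verification loop below.
In [ ]:
# 1000 clusters is only an illustrative value
km = kmtrain(X, 1000)   # scikit-learn K-Means accepts the sparse TF-IDF matrix directly
centroid, assign = cudatrain(np.asarray(X_reduced, dtype=np.float32), 1000)   # kmcuda needs dense float32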
In [ ]:
correct = 0.0
assignment = []
printProgressBar(0, X.shape[0], prefix='Progress:', suffix='Complete', length=50)
for idx, item in enumerate(X):
    # Assign each row to its nearest centroid and compare with kmcuda's assignment
    center = np.squeeze(np.sum(np.square(item - centroid), axis=1)).argsort()[0]
    if assign[idx] == center:
        correct += 1.0
    assignment.append(center)
    printProgressBar(idx + 1, X.shape[0], prefix='Progress:', suffix='Complete' + ' Acc:' + str(correct / (idx + 1)), length=50)
In [ ]:
count = np.bincount(assignment)
count
In [ ]:
verifier = pd.read_csv('DuplicateBugs.csv', header=None)
verifier = verifier.as_matrix()
available = []
printProgressBar(0, verifier.shape[0], prefix='Progress:', suffix='Complete', length=50)
for idx, row in enumerate(verifier):
    if not np.isnan(row).any():
        leftcomp = df.loc[df["BugId"] == int(row[0])]
        rightcomp = df.loc[df["BugId"] == int(row[1])]
        if (not leftcomp.empty) and (not rightcomp.empty):
            available.append([leftcomp.index[0], rightcomp.index[0]])
    printProgressBar(idx + 1, verifier.shape[0], prefix='Progress:', suffix='Complete', length=50)
temp = np.array(available)
In [ ]:
correctrow = 0
correctdist = []
vectorizer = joblib.load(root + divname + 'TFIDF.pkl')
X = vectorizer.transform(text)
printProgressBar(0, temp.shape[0], prefix='Progress:', suffix='Complete', length=50)
for idx, row in enumerate(temp):
    # Find the 3 clusters whose centroids are closest to the first bug of the duplicate pair
    clusterset = np.squeeze(np.sum(np.square(real_center - X[row[0]].toarray()), axis=1)).argsort()[0:3]
    dist = []
    for cluster in clusterset:
        dataset = wholeX[np.array((df["cluster"] == cluster).tolist())]
        for datarow in dataset:
            dist.append(np.sum(np.square(datarow.toarray() - wholeX[row[0]].toarray())))
    dist = np.array(dist)
    # Distance between the two known duplicates
    smalldist = np.sum(np.square(wholeX[row[1]].toarray() - wholeX[row[0]].toarray()))
    sorteddist = np.sort(dist)
    # Count as correct if the known duplicate falls within the 5 nearest neighbours in the candidate clusters
    if sorteddist.shape[0] <= 5 or smalldist <= sorteddist[5]:
        correctrow += 1
        correctdist.append(smalldist)
    printProgressBar(idx + 1, temp.shape[0], prefix='Progress:', suffix='Complete', length=50)
print("Accuracy: " + str(1.0 * correctrow / temp.shape[0]))
In [ ]:
def bugidgetter(df, cluster, loc):
    # Return the BugId of the loc-th bug inside a given cluster
    bigset = df.loc[df['cluster'] == cluster]
    return bigset.iloc[[loc], :]["BugId"].tolist()[0]

def bugindata(df, bugid):
    return not df.loc[df["BugId"] == int(bugid)].empty

def predict(text, topkclusters, topktopics):
    bugiddist = []
    row = vectorizer.transform([text])
    # Pick the topkclusters clusters whose centroids are closest to the query
    clusterset = np.squeeze(np.sum(np.square(real_center - row.toarray()), axis=1)).argsort()[0:topkclusters]
    dist = []
    print(clusterset)
    for cluster in clusterset:
        dataset = X[np.array((df["cluster"] == cluster).tolist())]
        for idx, datarow in enumerate(dataset):
            dist.append([np.sum(np.square(datarow.toarray() - row.toarray())), cluster, idx])
    dist = np.array(dist)
    # Keep the topktopics closest bugs overall
    topk = dist[dist[:, 0].argsort()][0:topktopics]
    # print(topk)
    for idx, row in enumerate(topk):
        # cluster id and position come back as floats from the numpy array, so cast them to int
        bugiddist.append({'BugId': bugidgetter(df, int(row[1]), int(row[2])), 'Distance': row[0]})
    return bugiddist
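A minimal usage sketch of predict, reusing the sample summary from the earlier TF-IDF cell; the cluster and result counts are illustrative settings, not tuned values:
In [ ]:
sample = 'GFE Share Telemetry item for OSC Hotkey Toggle'
# Search the 3 nearest clusters and return the 10 closest bugs (illustrative settings)
for candidate in predict(sample, 3, 10):
    print(candidate['BugId'], candidate['Distance'])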