First we define the model.


In [26]:
from textblob.classifiers import NaiveBayesClassifier


def correct_format_training(line):
    string = line.replace("\n", "").replace("- ", "").replace("the ", " ").replace(" and", " ").replace(" from", " ")
    label = string.split(", ")[1]
    words = string.split(", ")[0].split(" ")
    return (words, label)

def correct_format_production(line):
    string = line.replace("\n", "").replace("- ", "").replace("the ", " ").replace(" and", " ").replace(" from", " ")
    words = string.split(" ")
    return words

training_set = []
with open("training_data/training_shuffled_data.txt", "r") as ins:
    for line in ins:
        training_set.append(correct_format(line))
        

NBC = NaiveBayesClassifier(training_set)

print NBC.classify("Refactoring something or other")

validation_set = []
with open("training_data/validation_shuffled_data.txt", "r") as ins:
    for line in ins:
        validation_set.append(correct_format(line))
        
NBC.accuracy(validation_set)


Unknown
Out[26]:
0.6608478802992519

In [40]:
import pickle
f = open('my_classifier.pickle', 'wb')
pickle.dump(NBC, f)
f.close()

In [41]:
f = open('my_classifier.pickle', 'rb')
pickled_NBC = pickle.load(f)
f.close()

In [44]:
pickled_NBC.classify("Refactoring") == "Unknown"


Out[44]:
True

Then we use that model to get our analysis output, based on the user input.


In [ ]:
from github import Github
from random import randint


g = Github("username", "password")
input_string = "facebook/react" #Replaced with user given string
repo = g.get_repo(input_string, False)
root_dir = repo.get_git_tree(sha="master", recursive=True)


fileHash = {}

for file in root_dir.tree:
    fileHash[file.path] = [0,0,0,0]
    
for key in magicHash:
    commits = repo.get_commits(path=key)
    for commit in commits:
        fileHash[key][randint(0,3)]+= 1 #Replace with model results.

print g.rate_limiting