Importing modules


In [1]:
import numpy as np
from collections import Counter

Preprocessing the training data


In [2]:
def preprocess(f_name):
    # Replace tabs/carriage returns with spaces so only '\n' delimits lines
    with open(f_name, 'r') as f:
        txt1 = f.read().translate(str.maketrans("\t\r", "  "))
    txt = txt1.split('\n')
    sentence_corpora = []
    sentence_labels = []
    words = []
    # Each training example spans 4 lines: sentence, label, comment, blank
    # (8000 examples -> 32000 lines)
    for i in range(0, 32000, 4):
        # Drop the leading example id, the surrounding quotes, and periods
        txt[i] = txt[i].lstrip('0123456789').replace('\"', '').replace('.', '')
        at = str(txt[i].strip())
        # Collect lowercased tokens with the entity tags stripped
        for elem in at.split(" "):
            words.append(elem.replace("<e1>", "").replace("</e1>", "").replace("</e2>", "").replace("<e2>", "").lower())
        sentence_corpora.append(str(txt[i].strip().replace("<e1>", "").replace("</e1>", "").replace("</e2>", "").replace("<e2>", "").lower()))
        # The label is on the next line; drop its direction suffix
        sentence_labels.append(str(txt[i+1].strip().replace("(e1,e2)", "").replace("(e2,e1)", "")))
    return sentence_corpora, sentence_labels, words
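
The loop steps through the file four lines at a time, which assumes the SemEval-2010 Task 8 layout of TRAIN_FILE.TXT: a numbered, quoted sentence with <e1>/<e2> entity tags, the directed relation label, a comment line, and a blank separator. Roughly:

1	"The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>."
Component-Whole(e2,e1)
Comment:
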

In [3]:
sentence_corpora,sentence_labels,words = preprocess("TRAIN_FILE.TXT")
print(type(sentence_corpora))
print(sentence_corpora[:10])
print(len(sentence_corpora))


<class 'list'>
['the system as described above has its greatest application in an arrayed configuration of antenna elements', 'the child was carefully wrapped and bound into the cradle by means of a cord', 'the author of a keygen uses a disassembler to look at the raw assembly code', 'a misty ridge uprises from the surge', 'the student association is the voice of the undergraduate student population of the state university of new york at buffalohello sir', "this is the sprawling complex that is peru's largest producer of silver", 'the current view is that the chronic inflammation in the distal part of the stomach caused by helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach', 'people have been moving back into downtown', 'the lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces', 'the solute was placed inside a beaker and 5 ml of the solvent was pipetted into a 25 ml glass flask for each trial']
8000

In [4]:
# Map each relation label to an integer class id for the classifiers below
label_dict={"Cause-Effect": 0, 
            "Instrument-Agency": 1, 
            "Product-Producer": 2, 
            "Content-Container": 3, 
            "Entity-Origin": 4, 
            "Entity-Destination": 5, 
            "Component-Whole": 6,
            "Member-Collection": 7,
            "Message-Topic": 8,
            "Other": 9}
final_labels=[]
for elem in sentence_labels:
    final_labels.append(label_dict[elem])
final_labels = np.array(final_labels)
counts = Counter(words)
# Rank words by frequency and assign each an integer index
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}
print(len(vocab_to_int))


22954
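
vocab_to_int is built here but not used again in this notebook. A minimal sketch of how it could encode a sentence as integer ids (safe here because words was collected from exactly the same tokens):

encoded = [vocab_to_int[w] for w in sentence_corpora[0].split()]
print(encoded[:5])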

In [5]:
print(type(final_labels))
print(final_labels[:10])


<class 'numpy.ndarray'>
[6 9 1 9 7 9 0 5 3 5]

Using scikit-learn's text processing functionality


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentence_corpora)
X_train_counts.shape


Out[7]:
(8000, 19149)
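
CountVectorizer learns 19,149 features, fewer than the 22,954 raw tokens counted earlier, largely because its default tokenizer keeps only tokens of two or more word characters and splits on punctuation. The fitted vocabulary can be inspected directly; for instance, to look up a term's column index:

count_vect.vocabulary_.get('antenna')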

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


Out[8]:
(8000, 19149)
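
With its defaults (smooth_idf=True, norm='l2'), TfidfTransformer scales each raw count by a smoothed inverse document frequency and then L2-normalizes every row. A minimal sketch of the idf term it applies:

n_docs = X_train_counts.shape[0]
# CSR stores one entry per (document, term) pair, so counting column
# indices gives each term's document frequency
df = np.bincount(X_train_counts.indices, minlength=X_train_counts.shape[1])
idf = np.log((1 + n_docs) / (1 + df)) + 1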

In [9]:
print(type(X_train_tfidf))
print(X_train_tfidf[:5])


<class 'scipy.sparse.csr.csr_matrix'>
  (0, 17244)	0.0515326448649
  (0, 16941)	0.227637729439
  (0, 1394)	0.150018436856
  (0, 5004)	0.287616091385
  (0, 484)	0.275086385192
  (0, 8043)	0.149812500788
  (0, 9253)	0.196410554269
  (0, 7738)	0.302849612393
  (0, 1222)	0.314925513613
  (0, 8729)	0.0900085469942
  (0, 1032)	0.142652915518
  (0, 1354)	0.390137396568
  (0, 3933)	0.344021441672
  (0, 11852)	0.0726385312515
  (0, 1131)	0.318577951128
  (0, 5855)	0.327001414834
  (1, 17244)	0.11751280552
  (1, 11852)	0.0828208761514
  (1, 3279)	0.313647523517
  (1, 18625)	0.135351638532
  (1, 2921)	0.359071232726
  (1, 19005)	0.36779727957
  (1, 1059)	0.0997817413509
  (1, 2393)	0.392245778146
  (1, 9109)	0.151018297659
  :	:
  (2, 1491)	0.176157517664
  (2, 13967)	0.345530351138
  (2, 1428)	0.331409660897
  (2, 3591)	0.323108950532
  (3, 17244)	0.0670768196775
  (3, 11006)	0.507817440142
  (3, 14601)	0.507817440142
  (3, 18156)	0.469945090977
  (3, 7182)	0.151743653705
  (3, 16779)	0.485663536066
  (4, 17244)	0.177099340864
  (4, 11852)	0.187224564735
  (4, 1491)	0.13954208758
  (4, 16552)	0.458001515461
  (4, 1460)	0.260196555389
  (4, 9212)	0.0964885507519
  (4, 18500)	0.295569782624
  (4, 17988)	0.302145138976
  (4, 13107)	0.242686904
  (4, 16314)	0.217001853541
  (4, 18060)	0.229962537735
  (4, 11556)	0.170392010154
  (4, 19088)	0.247108509275
  (4, 2604)	0.335190809435
  (4, 15695)	0.295569782624

Preprocessing the test data


In [10]:
# TEST_FILE_CLEAN.TXT holds one numbered test sentence per line
with open('TEST_FILE_CLEAN.TXT', 'r') as f:
    txt1 = f.read().translate(str.maketrans("\t\r", "  "))
txt1 = txt1.lower()
txt = txt1.split('\n')
sentence_test=[]
words=[]
for i in range(0, 2716):
    # Drop the leading example id, the surrounding quotes, and periods
    txt[i] = txt[i].lstrip('0123456789').replace('\"', '').replace('.', '')
    at = str(txt[i].strip())
    # Collect tokens and sentences with the entity tags stripped
    for elem in at.split(" "):
        words.append(elem.replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    sentence_test.append(str(txt[i].strip().replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", "")))

In [11]:
# TEST_FILE_KEY.TXT holds one "<id> <label>" line per test sentence
with open('TEST_FILE_KEY.TXT', 'r') as f:
    txt1 = f.read().translate(str.maketrans("\t\r", "  "))
labels_test=[]
txt = txt1.split('\n')
for i in range(0, 2716):
    # Drop the leading example id, leaving just the relation label
    txt[i] = txt[i].lstrip('0123456789')
    labels_test.append(str(txt[i].strip()))

final_labels_test = np.array([label_dict[elem] for elem in labels_test])
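
Assuming TEST_FILE_CLEAN.TXT and TEST_FILE_KEY.TXT are aligned line by line, a quick sanity check:

assert len(sentence_test) == len(final_labels_test) == 2716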

Implementing a Multinomial Naive Bayes classifier


In [12]:
from sklearn.naive_bayes import MultinomialNB
# Baseline fit on the precomputed TF-IDF matrix; the Pipeline below
# repeats vectorizer -> TF-IDF -> NB end-to-end on the raw sentences
clf = MultinomialNB().fit(X_train_tfidf, final_labels)
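
To evaluate this baseline directly, the test sentences would have to pass through the same fitted transformers; a minimal sketch (the Pipeline in the next cell packages exactly these steps, which is why it is used from here on):

X_test_tfidf = tfidf_transformer.transform(count_vect.transform(sentence_test))
np.mean(clf.predict(X_test_tfidf) == final_labels_test)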

In [13]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', MultinomialNB(alpha=0.01)),
])
text_clf = text_clf.fit(sentence_corpora, final_labels)

In [14]:
predicted = text_clf.predict(sentence_test)
np.mean(predicted == final_labels_test)


Out[14]:
0.58652430044182624

Using GridSearchCV to find optimal hyperparameter values


In [15]:
from sklearn.model_selection import GridSearchCV
# Keys follow Pipeline's '<step>__<parameter>' naming convention
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
              'vect__stop_words': ('english', None)
}

In [16]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(sentence_corpora, final_labels)

In [17]:
gs_clf.best_score_  # computed but not echoed; only a cell's last expression is displayed
gs_clf.best_params_


Out[17]:
{'clf__alpha': 0.01,
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}
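
Beyond the winning combination, the score of every candidate is kept in cv_results_; for example:

gs_clf.cv_results_['mean_test_score']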

Implementing an SVM classifier


In [18]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-svm', SGDClassifier(loss='squared_hinge', penalty='l2',
                                                   alpha=2e-4, n_iter=800,
                                                   random_state=100,
                                                   learning_rate='constant',
                                                   eta0=0.0009)),
])
_ = text_clf_svm.fit(sentence_corpora, final_labels)


/home/gilgamesh/anaconda2/envs/env/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
  DeprecationWarning)
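
As the warning says, n_iter is deprecated in favour of max_iter and tol. On scikit-learn 0.21+ an equivalent spelling of the same classifier would be (a sketch; tol=None keeps the fixed 800-epoch behaviour):

SGDClassifier(loss='squared_hinge', penalty='l2', alpha=2e-4,
              max_iter=800, tol=None, random_state=100,
              learning_rate='constant', eta0=0.0009)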

Storing predictions in output.txt


In [19]:
# Invert label_dict so class ids map back to relation names
inv_label_dict = {val: label for label, val in label_dict.items()}
pred = text_clf_svm.predict(sentence_test)
# Test sentences are numbered from 8001 onwards
with open('output.txt', 'w') as f2:
    for i, p in enumerate(pred, start=8001):
        f2.write("{}\t{}\n".format(i, inv_label_dict[p]))

Calculating Accuracy


In [20]:
def preprocess_test(f_name):
    # Read a file and return its lines, split on '\n'
    with open(f_name, 'r') as f3:
        data = f3.read()
    return data.split("\n")

In [21]:
output = preprocess_test('output.txt')
test = preprocess_test('TEST_FILE_KEY.TXT')

In [22]:
# Count exact line matches between the written predictions and the gold key
# (only the first 2716 of the 2717 official test examples were processed)
count = 0
for i in range(2716):
    if output[i]==test[i]:
        count = count+1
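
The same comparison can be made on the label ids with scikit-learn, using the predictions from In [19] (accuracy over the 2716 processed examples):

from sklearn.metrics import accuracy_score
accuracy_score(final_labels_test, pred)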

Final Accuracy


In [23]:
# Dividing by the full official test-set size (2717) counts the one
# unprocessed example as incorrect
print(count/2717)


0.6205373573794627
