Importing modules


In [1]:
import numpy as np
from collections import Counter

Preprocessing the training data


In [2]:
def preprocess(f_name):
    # Replace tabs/carriage returns with spaces so only '\n' delimits lines
    with open(f_name, 'r') as f:
        txt1 = f.read().translate(str.maketrans("\t\r", "  "))
    txt = txt1.split('\n')
    sentence_corpora = []
    sentence_labels = []
    words = []
    # Each training example spans 4 lines: sentence, label, comment, blank
    # (8000 examples -> 32000 lines)
    for i in range(0, 32000, 4):
        # Drop the leading example id, the surrounding quotes, and periods
        txt[i] = txt[i].lstrip('0123456789').replace('\"', '').replace('.', '')
        at = str(txt[i].strip())
        # Collect lowercased tokens with the entity tags stripped
        for elem in at.split(" "):
            words.append(elem.replace("<e1>", "").replace("</e1>", "").replace("</e2>", "").replace("<e2>", "").lower())
        sentence_corpora.append(str(txt[i].strip().replace("<e1>", "").replace("</e1>", "").replace("</e2>", "").replace("<e2>", "").lower()))
        # The label is on the next line; drop its direction suffix
        sentence_labels.append(str(txt[i+1].strip().replace("(e1,e2)", "").replace("(e2,e1)", "")))
    return sentence_corpora, sentence_labels, words
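
The loop steps through the file four lines at a time, which assumes the SemEval-2010 Task 8 layout of TRAIN_FILE.TXT: a numbered, quoted sentence with <e1>/<e2> entity tags, the directed relation label, a comment line, and a blank separator. Roughly:

1	"The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>."
Component-Whole(e2,e1)
Comment:
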

In [3]:
sentence_corpora,sentence_labels,words = preprocess("TRAIN_FILE.TXT")
print(type(sentence_corpora))
print(sentence_corpora[:10])
print(len(sentence_corpora))


<class 'list'>
['the system as described above has its greatest application in an arrayed configuration of antenna elements', 'the child was carefully wrapped and bound into the cradle by means of a cord', 'the author of a keygen uses a disassembler to look at the raw assembly code', 'a misty ridge uprises from the surge', 'the student association is the voice of the undergraduate student population of the state university of new york at buffalohello sir', "this is the sprawling complex that is peru's largest producer of silver", 'the current view is that the chronic inflammation in the distal part of the stomach caused by helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach', 'people have been moving back into downtown', 'the lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces', 'the solute was placed inside a beaker and 5 ml of the solvent was pipetted into a 25 ml glass flask for each trial']
8000

In [4]:
# Map each relation label to an integer class id for the classifiers below
label_dict={"Cause-Effect": 0, 
            "Instrument-Agency": 1, 
            "Product-Producer": 2, 
            "Content-Container": 3, 
            "Entity-Origin": 4, 
            "Entity-Destination": 5, 
            "Component-Whole": 6,
            "Member-Collection": 7,
            "Message-Topic": 8,
            "Other": 9}
final_labels=[]
for elem in sentence_labels:
    final_labels.append(label_dict[elem])
final_labels = np.array(final_labels)
counts = Counter(words)
# Rank words by frequency and assign each an integer index
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab)}
print(len(vocab_to_int))


22954
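
vocab_to_int is built here but not used again in this notebook. A minimal sketch of how it could encode a sentence as integer ids (safe here because words was collected from exactly the same tokens):

encoded = [vocab_to_int[w] for w in sentence_corpora[0].split()]
print(encoded[:5])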

In [5]:
print(type(final_labels))
print(final_labels[:10])


<class 'numpy.ndarray'>
[6 9 1 9 7 9 0 5 3 5]

Using scikit-learn's text processing functionality


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentence_corpora)
X_train_counts.shape


Out[7]:
(8000, 19149)
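
CountVectorizer learns 19,149 features, fewer than the 22,954 raw tokens counted earlier, largely because its default tokenizer keeps only tokens of two or more word characters and splits on punctuation. The fitted vocabulary can be inspected directly; for instance, to look up a term's column index:

count_vect.vocabulary_.get('antenna')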

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


Out[8]:
(8000, 19149)
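
With its defaults (smooth_idf=True, norm='l2'), TfidfTransformer scales each raw count by a smoothed inverse document frequency and then L2-normalizes every row. A minimal sketch of the idf term it applies:

n_docs = X_train_counts.shape[0]
# CSR stores one entry per (document, term) pair, so counting column
# indices gives each term's document frequency
df = np.bincount(X_train_counts.indices, minlength=X_train_counts.shape[1])
idf = np.log((1 + n_docs) / (1 + df)) + 1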

In [9]:
print(type(X_train_tfidf))
print(X_train_tfidf[:5])


<class 'scipy.sparse.csr.csr_matrix'>
  (0, 17244)	0.0515326448649
  (0, 16941)	0.227637729439
  (0, 1394)	0.150018436856
  (0, 5004)	0.287616091385
  (0, 484)	0.275086385192
  (0, 8043)	0.149812500788
  (0, 9253)	0.196410554269
  (0, 7738)	0.302849612393
  (0, 1222)	0.314925513613
  (0, 8729)	0.0900085469942
  (0, 1032)	0.142652915518
  (0, 1354)	0.390137396568
  (0, 3933)	0.344021441672
  (0, 11852)	0.0726385312515
  (0, 1131)	0.318577951128
  (0, 5855)	0.327001414834
  (1, 17244)	0.11751280552
  (1, 11852)	0.0828208761514
  (1, 3279)	0.313647523517
  (1, 18625)	0.135351638532
  (1, 2921)	0.359071232726
  (1, 19005)	0.36779727957
  (1, 1059)	0.0997817413509
  (1, 2393)	0.392245778146
  (1, 9109)	0.151018297659
  :	:
  (2, 1491)	0.176157517664
  (2, 13967)	0.345530351138
  (2, 1428)	0.331409660897
  (2, 3591)	0.323108950532
  (3, 17244)	0.0670768196775
  (3, 11006)	0.507817440142
  (3, 14601)	0.507817440142
  (3, 18156)	0.469945090977
  (3, 7182)	0.151743653705
  (3, 16779)	0.485663536066
  (4, 17244)	0.177099340864
  (4, 11852)	0.187224564735
  (4, 1491)	0.13954208758
  (4, 16552)	0.458001515461
  (4, 1460)	0.260196555389
  (4, 9212)	0.0964885507519
  (4, 18500)	0.295569782624
  (4, 17988)	0.302145138976
  (4, 13107)	0.242686904
  (4, 16314)	0.217001853541
  (4, 18060)	0.229962537735
  (4, 11556)	0.170392010154
  (4, 19088)	0.247108509275
  (4, 2604)	0.335190809435
  (4, 15695)	0.295569782624

Preprocessing the test data


In [10]:
# TEST_FILE_CLEAN.TXT holds one numbered test sentence per line
with open('TEST_FILE_CLEAN.TXT', 'r') as f:
    txt1 = f.read().translate(str.maketrans("\t\r", "  "))
txt1 = txt1.lower()
txt = txt1.split('\n')
sentence_test=[]
words=[]
for i in range(0, 2716):
    # Drop the leading example id, the surrounding quotes, and periods
    txt[i] = txt[i].lstrip('0123456789').replace('\"', '').replace('.', '')
    at = str(txt[i].strip())
    # Collect tokens and sentences with the entity tags stripped
    for elem in at.split(" "):
        words.append(elem.replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", ""))
    sentence_test.append(str(txt[i].strip().replace("<e1>","").replace("</e1>", "").replace("</e2>", "").replace("<e2>", "")))

In [11]:
# TEST_FILE_KEY.TXT holds one "<id> <label>" line per test sentence
with open('TEST_FILE_KEY.TXT', 'r') as f:
    txt1 = f.read().translate(str.maketrans("\t\r", "  "))
labels_test=[]
txt = txt1.split('\n')
for i in range(0, 2716):
    # Drop the leading example id, leaving just the relation label
    txt[i] = txt[i].lstrip('0123456789')
    labels_test.append(str(txt[i].strip()))

final_labels_test = np.array([label_dict[elem] for elem in labels_test])
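
Assuming TEST_FILE_CLEAN.TXT and TEST_FILE_KEY.TXT are aligned line by line, a quick sanity check:

assert len(sentence_test) == len(final_labels_test) == 2716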

Implementing a Multinomial Naive Bayes classifier


In [12]:
from sklearn.naive_bayes import MultinomialNB
# Baseline fit on the precomputed TF-IDF matrix; the Pipeline below
# repeats vectorizer -> TF-IDF -> NB end-to-end on the raw sentences
clf = MultinomialNB().fit(X_train_tfidf, final_labels)
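
To evaluate this baseline directly, the test sentences would have to pass through the same fitted transformers; a minimal sketch (the Pipeline in the next cell packages exactly these steps, which is why it is used from here on):

X_test_tfidf = tfidf_transformer.transform(count_vect.transform(sentence_test))
np.mean(clf.predict(X_test_tfidf) == final_labels_test)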

In [13]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', MultinomialNB(alpha=0.01)),
])
text_clf = text_clf.fit(sentence_corpora, final_labels)

In [14]:
predicted = text_clf.predict(sentence_test)
np.mean(predicted == final_labels_test)


Out[14]:
0.58652430044182624

Using GridSearchCV to find optimal hyperparameter values


In [15]:
from sklearn.model_selection import GridSearchCV
# Keys follow Pipeline's '<step>__<parameter>' naming convention
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
              'vect__stop_words': ('english', None)
}

In [16]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(sentence_corpora, final_labels)

In [17]:
gs_clf.best_score_  # computed but not echoed; only a cell's last expression is displayed
gs_clf.best_params_


Out[17]:
{'clf__alpha': 0.01,
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}
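
Beyond the winning combination, the score of every candidate is kept in cv_results_; for example:

gs_clf.cv_results_['mean_test_score']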

Implementing an SVM classifier


In [18]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf-svm', SGDClassifier(loss='squared_hinge', penalty='l2',
                                                   alpha=2e-4, n_iter=800,
                                                   random_state=100,
                                                   learning_rate='constant',
                                                   eta0=0.0009)),
])
_ = text_clf_svm.fit(sentence_corpora, final_labels)


/home/gilgamesh/anaconda2/envs/env/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
  DeprecationWarning)
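
As the warning says, n_iter is deprecated in favour of max_iter and tol. On scikit-learn 0.21+ an equivalent spelling of the same classifier would be (a sketch; tol=None keeps the fixed 800-epoch behaviour):

SGDClassifier(loss='squared_hinge', penalty='l2', alpha=2e-4,
              max_iter=800, tol=None, random_state=100,
              learning_rate='constant', eta0=0.0009)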

Storing predictions in output.txt


In [19]:
# Invert label_dict so class ids map back to relation names
inv_label_dict = {val: label for label, val in label_dict.items()}
pred = text_clf_svm.predict(sentence_test)
# Test sentences are numbered from 8001 onwards
with open('output.txt', 'w') as f2:
    for i, p in enumerate(pred, start=8001):
        f2.write("{}\t{}\n".format(i, inv_label_dict[p]))

Calculating Accuracy


In [20]:
def preprocess_test(f_name):
    # Read a file and return its lines, split on '\n'
    with open(f_name, 'r') as f3:
        data = f3.read()
    return data.split("\n")

In [21]:
output = preprocess_test('output.txt')
test = preprocess_test('TEST_FILE_KEY.TXT')

In [22]:
# Count exact line matches between the written predictions and the gold key
# (only the first 2716 of the 2717 official test examples were processed)
count = 0
for i in range(2716):
    if output[i]==test[i]:
        count = count+1
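
The same comparison can be made on the label ids with scikit-learn, using the predictions from In [19] (accuracy over the 2716 processed examples):

from sklearn.metrics import accuracy_score
accuracy_score(final_labels_test, pred)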

Final Accuracy


In [23]:
# Dividing by the full official test-set size (2717) counts the one
# unprocessed example as incorrect
print(count/2717)


0.6205373573794627
