In [0]:
#Common imports
import pandas as pd
from IPython.display import Markdown, display, clear_output
from nltk import tokenize
from scipy import stats
from IPython.core.debugger import set_trace
from pathlib import Path
In [0]:
def printBold(string):
    #Render the string as bold Markdown (display() alone would just show the raw '**' markers)
    display(Markdown('**' + string + '**'))
In [0]:
import pickle
def dumpPickle(fileName, content):
    with open(fileName, 'wb') as pickleFile:
        pickle.dump(content, pickleFile, -1)
def loadPickle(fileName):
    with open(fileName, 'rb') as pickleFile:
        return pickle.load(pickleFile)
def pickleExists(fileName):
    return Path(fileName).is_file()
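In [ ]:
#A quick round-trip sanity check for the pickle helpers above;
#'demo.pkl' is a throwaway file name used only for this sketch.
dumpPickle('demo.pkl', {'answer': 42})
print(pickleExists('demo.pkl'))
print(loadPickle('demo.pkl'))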
In [1]:
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
print('User uploaded file "{name}" with length {length} bytes'.format(
name=fn, length=len(uploaded[fn])))
In [2]:
!unzip dev-v1.1.json.zip
In [3]:
!unzip train-v1.1.json.zip
In [0]:
!mkdir squad-v1 && mv *.json squad-v1
In [5]:
!ls #rm -rf *.zip
In [0]:
train = pd.read_json('squad-v1/train-v1.1.json', orient='columns')
dev = pd.read_json('squad-v1/dev-v1.1.json', orient='columns')
In [11]:
df = pd.concat([train, dev], ignore_index=True)
df.head()
Out[11]:
In [0]:
def showQuestion(titleId, paragraphId, questionId):
    title = df['data'][titleId]['title']
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
    qa = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]
    question = qa['question']
    answer = qa['answers'][0]['text']
    answerStart = qa['answers'][0]['answer_start']
    printBold('Title')
    print(title)
    printBold('Paragraph')
    print(paragraph)
    printBold('Question')
    print(question)
    printBold('Answer')
    print(answerStart)
    print(answer)
In [13]:
titleId = 0
paragraphId = 0
questionId = 0
showQuestion(titleId, paragraphId, questionId)
In [16]:
titlesCount = len(df['data'])
totalParagraphsCount = 0
totalQuestionsCount = 0
for titleId in range(titlesCount):
paragraphsCount = len(df['data'][titleId]['paragraphs'])
totalParagraphsCount += paragraphsCount
for paragraphId in range(paragraphsCount):
questionsCount = len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])
totalQuestionsCount += questionsCount
print('Titles', titlesCount)
print('Paragraphs', totalParagraphsCount)
print('Questions', totalQuestionsCount)
In [17]:
titles = []
for titleId in range(len(df['data'])):
titles.append(df['data'][titleId]['title'])
for i in range(20):
print(titles[i])
In [19]:
titleId = 0
paragraphId = 0
questionId = 0
showQuestion(titleId, paragraphId, questionId)
In [0]:
def extractSentence(paragraph, answerStart):
    #Find the sentence that contains the character offset answerStart
    sentences = tokenize.sent_tokenize(paragraph)
    sentenceStart = 0
    for sentence in sentences:
        if (sentenceStart + len(sentence) >= answerStart):
            return sentence
        #The +1 assumes sentences are separated by exactly one space
        sentenceStart += len(sentence) + 1
In [22]:
import nltk
nltk.download('punkt')
Out[22]:
In [23]:
paragraph = df['data'][0]['paragraphs'][0]['context']
answerStart = df['data'][0]['paragraphs'][0]['qas'][0]['answers'][0]['answer_start']
sentence = extractSentence(paragraph, answerStart)
print(sentence)
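In [ ]:
#extractSentence's +1 offset bookkeeping breaks if sentences are separated by
#anything other than a single space. A sketch of a more robust variant that
#asks the same punkt tokenizer for exact character spans instead
#(punktTokenizer and extractSentenceBySpan are names introduced here):
punktTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def extractSentenceBySpan(paragraph, answerStart):
    for start, end in punktTokenizer.span_tokenize(paragraph):
        if start <= answerStart < end:
            return paragraph[start:end]
print(extractSentenceBySpan(paragraph, answerStart))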
In [0]:
def containedInText(text, question):
    #Fraction of the question's words that also appear in the text
    questionWords = tokenize.word_tokenize(question.lower())
    textWords = tokenize.word_tokenize(text.lower())
    wordsContained = 0
    for questionWord in questionWords:
        for textWord in textWords:
            if (questionWord == textWord):
                wordsContained += 1
                break
    return wordsContained / len(questionWords)
In [26]:
question = df['data'][0]['paragraphs'][0]['qas'][0]['question']
contained = containedInText(sentence, question)
printBold('Question')
print(question)
printBold('Sentence')
print(sentence)
printBold("Contained")
print(contained)
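In [ ]:
#An equivalent sketch of containedInText that puts the text words in a set,
#turning the quadratic scan into a linear one; behavior is unchanged.
def containedInTextFast(text, question):
    questionWords = tokenize.word_tokenize(question.lower())
    textWords = set(tokenize.word_tokenize(text.lower()))
    return sum(word in textWords for word in questionWords) / len(questionWords)
print(containedInTextFast(sentence, question))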
In [0]:
#Printing the percentage completed
def printPercentage(currentStep, maxStep):
    stepSize = maxStep / 100
    #Only reprint when the integer percentage actually changes
    if (int(currentStep / stepSize) > int((currentStep - 1) / stepSize)):
        clear_output()
        print('{}%'.format(int(currentStep / stepSize)))
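In [ ]:
#An alternative to the manual percentage printer: tqdm renders a live
#progress bar over any iterable. A minimal sketch (tqdm comes preinstalled
#on Colab):
from tqdm import tqdm
for _ in tqdm(range(1000)):
    pass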
In [0]:
!mkdir pickles
In [31]:
questionContainmentDfPickleName = 'pickles/questionContainmentDf.pkl'
#If the dataframe was already generated, load it.
if (pickleExists(questionContainmentDfPickleName)):
    print("Pickle found. Saved some time.")
    questionContainmentDf = loadPickle(questionContainmentDfPickleName)
else:
    print("Result not pickled. Generating...")
    sentenceScore = []
    paragraphScore = []
    #For each title
    titlesCount = len(df['data'])
    for titleId in range(titlesCount):
        printPercentage(titleId, titlesCount)
        #For each paragraph
        for paragraphId in range(len(df['data'][titleId]['paragraphs'])):
            paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
            #For each question
            for questionId in range(len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
                qa = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]
                question = qa['question']
                answerStart = qa['answers'][0]['answer_start']
                sentence = extractSentence(paragraph, answerStart)
                sentenceScore.append(containedInText(sentence, question))
                paragraphScore.append(containedInText(paragraph, question))
    #Merge the two score columns into one dataframe
    sentenceScoreDf = pd.DataFrame(sentenceScore, columns=['sentence'])
    paragraphScoreDf = pd.DataFrame(paragraphScore, columns=['paragraph'])
    questionContainmentDf = pd.concat([sentenceScoreDf, paragraphScoreDf], axis=1)
    #Pickle the result for next time
    dumpPickle(questionContainmentDfPickleName, questionContainmentDf)
In [32]:
questionContainmentDf.describe()
Out[32]:
In [33]:
questionContainmentDf.head(10)
Out[33]:
In [0]:
def getQuestionAt(index):
    #Map a flat question index back to (titleId, paragraphId, questionId)
    currentIndex = 0
    for titleId in range(len(df['data'])):
        for paragraphId in range(len(df['data'][titleId]['paragraphs'])):
            for questionId in range(len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
                if (currentIndex == index):
                    return titleId, paragraphId, questionId
                currentIndex += 1
In [38]:
titleId, paragraphId, questionId = getQuestionAt(81)
print(titleId, paragraphId, questionId)
showQuestion(titleId, paragraphId, questionId)
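In [ ]:
#getQuestionAt rescans the whole dataset on every call. If it gets called a
#lot, a sketch that builds the flat index once and makes lookups O(1)
#(questionIndex is a name introduced here):
questionIndex = [(t, p, q)
                 for t in range(len(df['data']))
                 for p in range(len(df['data'][t]['paragraphs']))
                 for q in range(len(df['data'][t]['paragraphs'][p]['qas']))]
print(questionIndex[81] == getQuestionAt(81))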
In [39]:
questionContainmentDf[questionContainmentDf['paragraph'] == 0].head()
Out[39]:
In [41]:
titleId, paragraphId, questionId = getQuestionAt(269)
showQuestion(titleId, paragraphId, questionId)
In [42]:
getQuestionAt(505)
Out[42]:
In [43]:
titleId = 1
paragraphId = 18
questionId = 6
showQuestion(titleId, paragraphId, questionId)
In [44]:
questionContainmentDf[questionContainmentDf['sentence'] == 1]
Out[44]:
In [46]:
getQuestionAt(53226)
In [49]:
answersInText = 0
answersNotInText = 0
for titleId in range(len(df['data'])):
for paragraphId in range(len(df['data'][titleId]['paragraphs'])):
paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
for questionId in range(len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
answer = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['answers'][0]['text']
if (answer in paragraph):
answersInText += 1
else:
answersNotInText += 1
printBold('Answers in text')
print(answersInText)
printBold('Answers not in text')
print(answersNotInText)
In [0]:
answers = []
sentences = []
for titleId in range(len(df['data'])):
    for paragraphId in range(len(df['data'][titleId]['paragraphs'])):
        paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
        for questionId in range(len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
            qa = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]
            answer = qa['answers'][0]['text']
            answerStart = qa['answers'][0]['answer_start']
            sentence = extractSentence(paragraph, answerStart)
            answers.append(answer)
            sentences.append(sentence)
In [51]:
answerTextsDf = pd.DataFrame(answers, columns=['answer'])
sentenceDf = pd.DataFrame(sentences, columns=['sentence'])
answersDf = pd.concat([answerTextsDf, sentenceDf], axis=1)
answersDf.head()
Out[51]:
In [0]:
wordCount = []
for i in range(len(answersDf)):
wordCount.append(len(tokenize.word_tokenize(answersDf.iloc[i]['answer'])))
In [53]:
answersDf = pd.concat([answersDf, pd.DataFrame(wordCount, columns=['wordCount'])], axis=1)
answersDf['wordCount'].describe()
Out[53]:
In [54]:
answersDf['wordCount'].value_counts()
Out[54]:
In [55]:
answersDf[answersDf['wordCount'] == 1].sample(10, random_state=42)
Out[55]:
In [56]:
answersDf[answersDf['wordCount'] == 2].sample(n=20, random_state=5)
Out[56]:
In [57]:
answersDf[answersDf['wordCount'] == 3].sample(n=20, random_state=5)
Out[57]:
In [58]:
answersDf[answersDf['wordCount'] == 5].sample(n=20, random_state=5)
Out[58]:
In [59]:
answersDf[answersDf['wordCount'] == 42].iloc[0]['answer']
Out[59]:
In [68]:
!python -m spacy download en_core_web_md
In [0]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_md
nlp = en_core_web_md.load()
# nlp = spacy.load('en_core_web_md')
In [0]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
In [72]:
print([(X.text, X.label_) for X in doc.ents])
In [0]:
def NerForWord(text):
doc = nlp(text)
entitiesFound = len(doc.ents)
if (entitiesFound > 0):
#TODO - Could potentially find multiple entities in the text. We're returning only the first one.
return doc.ents[0].label_
else:
return ''
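In [ ]:
#Quick sanity check of NerForWord; the exact labels depend on the model
#version, but something like ORG / MONEY / '' is expected here.
print(NerForWord('Google'))
print(NerForWord('$5.1 billion'))
print(NerForWord('blue'))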
In [76]:
doc = nlp('My name is Laxmikant, My GIN is 43578811. I love coding, I do career with it.')
print([(X.text, X.label_) for X in doc.ents])
In [0]:
def isSingleToken(text):
    doc = nlp(text)
    #The entire text is a single named entity
    entitiesFound = len(doc.ents)
    if (entitiesFound == 1 and doc.ents[0].text == text):
        return True
    #The text is not a named entity, but is a single token
    tokensFound = len(doc)
    if (tokensFound == 1):
        return True
    return False
In [81]:
isSingleToken('Laxmikant Ratnaparkhi')
Out[81]:
In [82]:
singleTokenCount = 0
sampleSize = int(len(answersDf) / 10)
for i in range(sampleSize):
printPercentage(i, sampleSize)
if (isSingleToken(answersDf.iloc[i]['answer'])):
singleTokenCount += 1
In [84]:
singleTokenCount / sampleSize
Out[84]:
In [88]:
doc = nlp('James R. Scott abc2@.com')
for token in doc:
print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
token.shape_, token.is_alpha, token.is_stop, len(doc.ents), doc.ents[0].label_)
shape = doc[0].shape_
for wordIndex in range(1, len(doc)):
shape += (' ' + doc[wordIndex].shape_)
print(shape, doc[0].shape_)
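In [ ]:
#The shape string above is built token by token; the same thing as a
#one-line join (wordShape is a helper name introduced here):
def wordShape(doc):
    return ' '.join(token.shape_ for token in doc)
print(wordShape(nlp('James R. Scott')))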
In [89]:
spacy.explain('CARDINAL')
Out[89]:
In [0]:
answersDf['isSingleToken'] = False
answersDf['NER'] = ''
answersDf['POS'] = ''
answersDf['TAG'] = ''
answersDf['DEP'] = ''
answersDf['shape'] = ''
answersDf['isAlpha'] = False
answersDf['isStop'] = False
In [91]:
answersDf.head()
Out[91]:
In [92]:
sampleSize = int(len(answersDf) / 10)
for i in range(sampleSize):
    printPercentage(i, sampleSize)
    answer = answersDf.iloc[i]['answer']
    #Parse once and reuse the doc instead of calling nlp three times for the same words
    doc = nlp(answer)
    isSingleEntity = len(doc.ents) == 1 and doc.ents[0].text == answer
    if (isSingleEntity or len(doc) == 1):
        answersDf.at[i, 'isSingleToken'] = True
    answersDf.at[i, 'NER'] = doc.ents[0].label_ if len(doc.ents) > 0 else ''
    answersDf.at[i, 'POS'] = doc[0].pos_
    answersDf.at[i, 'TAG'] = doc[0].tag_
    answersDf.at[i, 'DEP'] = doc[0].dep_
    answersDf.at[i, 'isAlpha'] = doc[0].is_alpha
    answersDf.at[i, 'isStop'] = doc[0].is_stop
    answersDf.at[i, 'shape'] = ' '.join(token.shape_ for token in doc)
In [93]:
answersDf[answersDf['NER'] == 'ORG'].sample(n=10, random_state=5)
Out[93]:
In [94]:
answersDf['isStop'].value_counts()
Out[94]:
In [95]:
answersDf['isAlpha'].value_counts()
Out[95]:
In [96]:
answersDf[answersDf['POS'] == 'PROPN'].sample(n=5, random_state=16)
Out[96]:
In [97]:
answersDf[answersDf['POS'] == 'NOUN'].sample(n=5, random_state=16)
Out[97]:
In [98]:
answersDf[answersDf['POS'] == 'NUM'].sample(n=10, random_state=16)
Out[98]:
In [0]:
def highlightAnswers(titleId, paragraphId):
    paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']
    answers = df['data'][titleId]['paragraphs'][paragraphId]['qas']
    #Get answer starts and answer lengths
    answerPosition = {}
    for answer in answers:
        answerStart = answer['answers'][0]['answer_start']
        answerLength = len(answer['answers'][0]['text'])
        answerPosition[answerStart] = answerLength
    #Bold the answers
    highlightedText = ''
    currentPlaceInText = 0
    #Append the text between the previous answer and the current one, then the answer wrapped in bold markers
    for answerStart in sorted(answerPosition.keys()):
        #Skip answers that overlap the previous one to keep the Markdown well-formed
        if (answerStart < currentPlaceInText):
            continue
        highlightedText += paragraph[currentPlaceInText:answerStart]
        highlightedText += '**'
        highlightedText += paragraph[answerStart:answerStart + answerPosition[answerStart]]
        highlightedText += '**'
        currentPlaceInText = answerStart + answerPosition[answerStart]
    #Append the remaining text after the last answer
    highlightedText += paragraph[currentPlaceInText:]
    #Display the highlighted text
    display(Markdown(highlightedText))
In [102]:
titleId = 24
paragraphId = 0
highlightAnswers(titleId, paragraphId)
In [103]:
titleId = 4
paragraphId = 12
highlightAnswers(titleId, paragraphId)
In [104]:
text = df['data'][0]['paragraphs'][0]['context']
doc = nlp(text)
for noun_chunk in doc.noun_chunks:
print(noun_chunk)
In [105]:
titleId = 0
paragraphId = 0
highlightAnswers(titleId, paragraphId)