In [0]:
#Common imports 
import pandas as pd
from IPython.display import Markdown, display, clear_output
from nltk import tokenize
from scipy import stats
from IPython.core.debugger import set_trace
from pathlib import Path

In [0]:
def printBold(string):
    """Render ``string`` as bold text in the notebook output.

    The text is wrapped in ``**`` and handed to ``Markdown`` so the front end
    renders it as bold; displaying the bare ``str`` only echoes the quoted
    text, asterisks included.
    """
    display(Markdown('**' + string + '**'))

In [0]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    """Serialize ``content`` into the file ``fileName``.

    The file is opened inside a ``with`` block so the handle is closed even
    when pickling raises part-way through.
    """
    with open(fileName, 'wb') as pickleFile:
        # Protocol -1 selects the highest pickle protocol available.
        cPickle.dump(content, pickleFile, -1)

def loadPickle(fileName):
    """Load and return the object pickled in the file ``fileName``.

    The file is opened inside a ``with`` block so the handle is closed even
    when unpickling raises.

    NOTE: unpickling can execute arbitrary code; only load files produced by
    this notebook or another trusted source.
    """
    with open(fileName, 'rb') as file:
        return cPickle.load(file)
    
def pickleExists(fileName):
    """Return True when ``fileName`` points at an existing regular file."""
    return Path(fileName).is_file()

In [1]:
from google.colab import files

# Open the upload widget; the result is indexed by the key of each upload.
uploaded = files.upload()

# Report every upload together with the length of its content.
for uploadName, uploadContent in uploaded.items():
    report = 'User uploaded file "{name}" with length {length} bytes'
    print(report.format(name=uploadName, length=len(uploadContent)))


Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving dev-v1.1.json.zip to dev-v1.1.json.zip
Saving train-v1.1.json.zip to train-v1.1.json.zip
User uploaded file "dev-v1.1.json.zip" with length 1051741 bytes
User uploaded file "train-v1.1.json.zip" with length 8100535 bytes

In [2]:
!unzip dev-v1.1.json.zip


Archive:  dev-v1.1.json.zip
  inflating: dev-v1.1.json           

In [3]:
!unzip train-v1.1.json.zip


Archive:  train-v1.1.json.zip
  inflating: train-v1.1.json         

In [0]:
!mkdir squad-v1 && mv *.json squad-v1

In [5]:
!ls #rm -rf *.zip


dev-v1.1.json.zip  sample_data	squad-v1  train-v1.1.json.zip

In [0]:
# Load both SQuAD splits. 'columns' is the documented spelling of this orient
# value for pandas.read_json.
train = pd.read_json('squad-v1/train-v1.1.json', orient='columns')
dev = pd.read_json('squad-v1/dev-v1.1.json', orient='columns')

In [11]:
df = pd.concat([train, dev], ignore_index=True)
df.head()


Out[11]:
data version
0 {'title': 'University_of_Notre_Dame', 'paragra... 1.1
1 {'title': 'Beyoncé', 'paragraphs': [{'context'... 1.1
2 {'title': 'Montana', 'paragraphs': [{'context'... 1.1
3 {'title': 'Genocide', 'paragraphs': [{'context... 1.1
4 {'title': 'Antibiotics', 'paragraphs': [{'cont... 1.1

In [0]:
def showQuestion(titleId, paragraphId, questionId):
    """Print one question together with its title, paragraph and first answer.

    The three ids index into ``df['data']``: the article, then one of its
    ``paragraphs``, then one of that paragraph's ``qas`` entries. Only the
    first entry of the question's ``answers`` list is shown.
    """
    # Resolve each nesting level once instead of re-indexing from the top.
    article = df['data'][titleId]
    title = article['title']
    paragraphEntry = article['paragraphs'][paragraphId]
    paragraph = paragraphEntry['context']
    qa = paragraphEntry['qas'][questionId]
    question = qa['question']
    firstAnswer = qa['answers'][0]
    answer = firstAnswer['text']
    answerStart = firstAnswer['answer_start']

    for heading, body in (('Title', title),
                          ('Paragraph', paragraph),
                          ('Question', question)):
        printBold(heading)
        print(body)

    # The answer section shows the start offset first, then the answer text.
    printBold('Answer')
    print(answerStart)
    print(answer)

In [13]:
titleId = 0
paragraphId = 0 
questionId = 0

showQuestion(titleId, paragraphId, questionId)


'**Title**'
University_of_Notre_Dame
'**Paragraph**'
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
'**Question**'
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
'**Answer**'
515
Saint Bernadette Soubirous

In [16]:
# Walk the nested structure once, totalling paragraphs and questions.
titlesCount = len(df['data'])
totalParagraphsCount = 0
totalQuestionsCount = 0

for article in df['data']:
    articleParagraphs = article['paragraphs']
    totalParagraphsCount += len(articleParagraphs)
    totalQuestionsCount += sum(len(entry['qas']) for entry in articleParagraphs)

print('Titles', titlesCount)
print('Paragraphs', totalParagraphsCount)
print('Questions', totalQuestionsCount)


Titles 490
Paragraphs 20963
Questions 98169

In [17]:
# Collect every article title in dataset order.
titles = [article['title'] for article in df['data']]

# Preview the first 20 titles; slicing (rather than indexing 0..19) cannot
# raise IndexError when fewer than 20 titles are loaded.
for title in titles[:20]:
    print(title)


University_of_Notre_Dame
Beyoncé
Montana
Genocide
Antibiotics
Frédéric_Chopin
Sino-Tibetan_relations_during_the_Ming_dynasty
IPod
The_Legend_of_Zelda:_Twilight_Princess
Spectre_(2015_film)
2008_Sichuan_earthquake
New_York_City
To_Kill_a_Mockingbird
Solar_energy
Tajikistan
Anthropology
Portugal
Kanye_West
Buddhism
American_Idol

In [19]:
titleId = 0
paragraphId = 0 
questionId = 0

showQuestion(titleId, paragraphId, questionId)


'**Title**'
University_of_Notre_Dame
'**Paragraph**'
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
'**Question**'
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
'**Answer**'
515
Saint Bernadette Soubirous

In [0]:
def extractSentence(paragraph, answerStart):
    """Return the sentence of ``paragraph`` that covers offset ``answerStart``.

    Parameters:
        paragraph: text that is split into sentences with ``sent_tokenize``.
        answerStart: character offset into ``paragraph``.

    Returns:
        The first sentence whose end lies beyond ``answerStart``. When the
        offset lies past every sentence the last sentence is returned, and an
        empty paragraph yields ``''``, so callers always receive a string.
    """
    sentences = tokenize.sent_tokenize(paragraph)
    searchFrom = 0

    for sentence in sentences:
        # Look up where the sentence really starts. Assuming exactly one
        # separator character between sentences drifts as soon as the gap is
        # a newline run or several spaces, and then selects the wrong sentence.
        sentenceStart = paragraph.find(sentence, searchFrom)
        if sentenceStart == -1:
            # Sentence not found verbatim; fall back to the running offset.
            sentenceStart = searchFrom
        sentenceEnd = sentenceStart + len(sentence)

        # sentenceEnd is exclusive: an offset equal to it belongs to the gap
        # after this sentence, so it is attributed to the next one.
        if answerStart < sentenceEnd:
            return sentence

        searchFrom = sentenceEnd

    return sentences[-1] if sentences else ''

In [22]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Out[22]:
True

In [23]:
paragraph = df['data'][0]['paragraphs'][0]['context']
answerStart = df['data'][0]['paragraphs'][0]['qas'][0]['answers'][0]['answer_start']

sentence = extractSentence(paragraph, answerStart)
print(sentence)


It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.

In [0]:
def containedInText(text, question):
    """Return the fraction of question tokens that also occur in ``text``.

    Both inputs are lower-cased and split with ``word_tokenize``. Every
    question token counts once when it appears anywhere in the text; repeated
    question tokens are each counted.

    Returns:
        A float between 0 and 1, or 0.0 when the question has no tokens.
    """
    questionWords = tokenize.word_tokenize(question.lower())
    if not questionWords:
        # Nothing to match; also avoids dividing by zero below.
        return 0.0

    # Set membership replaces the nested scan over every text token.
    textWords = set(tokenize.word_tokenize(text.lower()))
    wordsContained = sum(1 for questionWord in questionWords
                         if questionWord in textWords)

    return wordsContained / len(questionWords)

In [26]:
question =  df['data'][0]['paragraphs'][0]['qas'][0]['question']

contained = containedInText(sentence, question)
printBold('Question')
print(question)
printBold('Sentence')
print(sentence)
printBold("Contained")
print(contained)


'**Question**'
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
'**Sentence**'
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
'**Contained**'
0.6428571428571429

In [0]:
#Print the percentage completed
def printPercentage(currentStep, maxStep):
    """Replace the cell output with the completed percentage when it advances.

    Parameters:
        currentStep: zero-based index of the step being processed.
        maxStep: total number of steps.

    The percentage is only re-printed when its whole-number value is higher
    than it was at the previous step. Nothing happens when ``maxStep`` is not
    positive, which also avoids dividing by zero.
    """
    if maxStep <= 0:
        return

    # Floor division keeps the comparison exact; dividing by a float step
    # size can truncate a value such as 10.0 down to 9.
    percent = int(currentStep * 100 // maxStep)
    previousPercent = int((currentStep - 1) * 100 // maxStep)

    if percent > previousPercent:
        clear_output()
        print('{}%'.format(percent))

In [0]:
!mkdir pickles

In [31]:
# Path of the cached result; the scores are only recomputed when it is missing.
questionContainmentDfPickleName = 'pickles/questionContainmentDf.pkl'

#If the dataframe is already generated, load it.
if (pickleExists(questionContainmentDfPickleName)):
    print("Pickle found. Saved some time.")
    questionContainmentDf = loadPickle(questionContainmentDfPickleName)
else:
    # One score per question, appended in title -> paragraph -> question order.
    sentenceScore = []
    paragraphScore = []

    #For each title
    titlesCount = len(df['data'])
    for titleId in range(titlesCount):
        # Progress is reported per title, not per question.
        printPercentage(titleId, titlesCount)

        #For each paragraph
        for paragraphId in range(len(df['data'][titleId]['paragraphs'])):
            paragraph = df['data'][titleId]['paragraphs'][paragraphId]['context']

            #For each question
            for questionId in range(len(df['data'][titleId]['paragraphs'][paragraphId]['qas'])):
                question = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['question']
                # Only the first listed answer of each question is used.
                answerStart = df['data'][titleId]['paragraphs'][paragraphId]['qas'][questionId]['answers'][0]['answer_start']
                sentence = extractSentence(paragraph, answerStart)

                # Score the question against the answer sentence and against the whole paragraph.
                sentenceScore.append(containedInText(sentence, question))
                paragraphScore.append(containedInText(paragraph, question))           
                
    #Merge dataframes into one                
    sentenceScoreDf = pd.DataFrame(sentenceScore, columns=['sentence'])
    paragraphScoreDf = pd.DataFrame(paragraphScore, columns=['paragraph'])

    questionContainmentDf = pd.concat([sentenceScoreDf, paragraphScoreDf], axis=1)
    
    #Pickle the result
    dumpPickle(questionContainmentDfPickleName, questionContainmentDf)
    
    # NOTE(review): this message is printed only after generation and pickling have finished.
    print("Result not pickled. Generating...")


Pickle found. Saved some time.

In [32]:
questionContainmentDf.describe()


Out[32]:
sentence paragraph
count 98169.000000 98169.000000
mean 0.463937 0.582157
std 0.190377 0.159055
min 0.000000 0.000000
25% 0.333333 0.500000
50% 0.461538 0.600000
75% 0.600000 0.700000
max 1.000000 1.000000

In [33]:
questionContainmentDf.head(10)


Out[33]:
sentence paragraph
0 0.642857 0.571429
1 0.636364 0.636364
2 0.533333 0.600000
3 0.375000 0.500000
4 0.333333 0.416667
5 0.272727 0.636364
6 0.300000 0.800000
7 0.363636 0.727273
8 0.000000 0.545455
9 0.266667 0.733333

In [0]:
def getQuestionAt(index):
    """Translate a flat question index into (titleId, paragraphId, questionId).

    Questions are numbered from 0 in the order met when walking ``df['data']``
    title by title and paragraph by paragraph. When ``index`` matches no
    question the function falls off the end and returns None.
    """
    articles = df['data']
    position = 0

    for titleId in range(len(articles)):
        articleParagraphs = articles[titleId]['paragraphs']
        for paragraphId, paragraphEntry in enumerate(articleParagraphs):
            for questionId in range(len(paragraphEntry['qas'])):
                if position == index:
                    return titleId, paragraphId, questionId
                position += 1

In [38]:
titleId, paragraphId, questionId = getQuestionAt(81)
print(titleId, paragraphId, questionId)
showQuestion(titleId, paragraphId, questionId)


0 16 3
'**Title**'
University_of_Notre_Dame
'**Paragraph**'
About 80% of undergraduates and 20% of graduate students live on campus. The majority of the graduate students on campus live in one of four graduate housing complexes on campus, while all on-campus undergraduates live in one of the 29 residence halls. Because of the religious affiliation of the university, all residence halls are single-sex, with 15 male dorms and 14 female dorms. The university maintains a visiting policy (known as parietal hours) for those students who live in dormitories, specifying times when members of the opposite sex are allowed to visit other students' dorm rooms; however, all residence halls have 24-hour social spaces for students regardless of gender. Many residence halls have at least one nun and/or priest as a resident. There are no traditional social fraternities or sororities at the university, but a majority of students live in the same residence hall for all four years. Some intramural sports are based on residence hall teams, where the university offers the only non-military academy program of full-contact intramural American football. At the end of the intramural season, the championship game is played on the field in Notre Dame Stadium.
'**Question**'
What amount of the graduate student body at Notre Dame live on the campus?
'**Answer**'
32
20%

In [39]:
questionContainmentDf[questionContainmentDf['paragraph'] == 0].head()


Out[39]:
sentence paragraph
269 0.0 0.0
363 0.0 0.0
505 0.0 0.0
2781 0.0 0.0
3678 0.0 0.0

In [41]:
# Unpack the ids returned for flat index 269 instead of discarding them and
# typing the ids in by hand.
titleId, paragraphId, questionId = getQuestionAt(269)

showQuestion(titleId, paragraphId, questionId)


'**Title**'
Beyoncé
'**Paragraph**'
Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
'**Question**'
When did Beyonce start becoming popular?
'**Answer**'
269
in the late 1990s

In [42]:
getQuestionAt(505)


Out[42]:
(1, 18, 6)

In [43]:
titleId = 1
paragraphId = 18 
questionId = 6

showQuestion(titleId, paragraphId, questionId)


'**Title**'
Beyoncé
'**Paragraph**'
In 2011, documents obtained by WikiLeaks revealed that Beyoncé was one of many entertainers who performed for the family of Libyan ruler Muammar Gaddafi. Rolling Stone reported that the music industry was urging them to return the money they earned for the concerts; a spokesperson for Beyoncé later confirmed to The Huffington Post that she donated the money to the Clinton Bush Haiti Fund. Later that year she became the first solo female artist to headline the main Pyramid stage at the 2011 Glastonbury Festival in over twenty years, and was named the highest-paid performer in the world per minute.
'**Question**'
When did this leak happen?
'**Answer**'
3
2011

In [44]:
questionContainmentDf[questionContainmentDf['sentence'] == 1]


Out[44]:
sentence paragraph
21911 1.0 1.0
39394 1.0 1.0
45064 1.0 1.0
48874 1.0 1.0
53226 1.0 1.0
67425 1.0 1.0

In [46]:
# Show the question itself; the bare lookup only returns the id triple.
showQuestion(*getQuestionAt(53226))


'**Title**'
Utrecht
'**Paragraph**'
Utrecht city has an active cultural life, and in the Netherlands is second only to Amsterdam. There are several theatres and theatre companies. The 1941 main city theatre was built by Dudok. Besides theatres there is a large number of cinemas including three arthouse cinemas. Utrecht is host to the international Early Music Festival (Festival Oude Muziek, for music before 1800) and the Netherlands Film Festival. The city has an important classical music hall Vredenburg (1979 by Herman Hertzberger). Its acoustics are considered among the best of the 20th-century original music halls.[citation needed] The original Vredenburg music hall has been redeveloped as part of the larger station area redevelopment plan and in 2014 has gained additional halls that allowed its merger with the rock club Tivoli and the SJU jazzpodium. There are several other venues for music throughout the city. Young musicians are educated in the conservatory, a department of the Utrecht School of the Arts. There is a specialised museum of automatically playing musical instruments.
'**Question**'
Cultural life in Utrecht is second to 
'**Answer**'
0
Utrecht city has an active cultural life, and in the Netherlands is second only to Amsterdam

In [49]:
# Check whether each question's first answer occurs verbatim in its paragraph.
answersInText = 0
answersNotInText = 0

for article in df['data']:
    for paragraphEntry in article['paragraphs']:
        paragraph = paragraphEntry['context']
        for qa in paragraphEntry['qas']:
            answer = qa['answers'][0]['text']
            if answer not in paragraph:
                answersNotInText += 1
            else:
                answersInText += 1

printBold('Answers in text')
print(answersInText)
printBold('Answers not in text')
print(answersNotInText)


'**Answers in text**'
98169
'**Answers not in text**'
0

In [0]:
# Pair each question's first answer with the sentence extracted for it,
# in title -> paragraph -> question order.
answers = []
sentences = []

for article in df['data']:
    for paragraphEntry in article['paragraphs']:
        paragraph = paragraphEntry['context']

        for qa in paragraphEntry['qas']:
            firstAnswer = qa['answers'][0]

            answers.append(firstAnswer['text'])
            sentences.append(extractSentence(paragraph, firstAnswer['answer_start']))

In [51]:
# Build the two-column frame directly from the parallel lists.
answersDf = pd.DataFrame({'answer': answers, 'sentence': sentences})
answersDf.head()


Out[51]:
answer sentence
0 Saint Bernadette Soubirous It is a replica of the grotto at Lourdes, Fran...
1 a copper statue of Christ Immediately in front of the Main Building and ...
2 the Main Building Next to the Main Building is the Basilica of t...
3 a Marian place of prayer and reflection Immediately behind the basilica is the Grotto,...
4 a golden statue of the Virgin Mary Atop the Main Building's gold dome is a golden...

In [0]:
# Number of word tokens in each answer, in row order.
wordCount = [
    len(tokenize.word_tokenize(answerText))
    for answerText in answersDf['answer']
]

In [53]:
# Assign the column instead of concatenating a new frame: re-running the cell
# then overwrites 'wordCount' rather than appending a duplicate column.
answersDf['wordCount'] = wordCount
answersDf['wordCount'].describe()


Out[53]:
count    98169.000000
mean         3.354511
std          3.731074
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         46.000000
Name: wordCount, dtype: float64

In [54]:
answersDf['wordCount'].value_counts()


Out[54]:
1     32161
2     25233
3     14350
4      7557
5      4654
6      3050
7      2222
8      1676
9      1206
10      974
11      755
12      653
13      565
14      462
15      406
16      313
18      275
17      269
19      243
20      191
21      182
23      138
22      132
25      120
24      101
26       78
28       58
27       57
29       29
30       18
31       12
32       11
33        6
38        2
34        2
35        2
36        2
37        2
42        1
46        1
Name: wordCount, dtype: int64

In [55]:
answersDf[answersDf['wordCount'] == 1].sample(10, random_state=42)


Out[55]:
answer sentence wordCount
52642 mul For example, the name for the hanja 水 is 물 수 (... 1
79457 11,000–16,000 The total Iranian casualties in the war were e... 1
88678 Saracens From these bases, the Normans eventually captu... 1
35390 microphone The second controller lacked the START and SEL... 1
34469 rarely Since Elizabeth rarely gives interviews, littl... 1
57333 1991 By the late 1980s, digital media, in the form ... 1
10684 ZigBee Many newer control systems are using wireless ... 1
43080 1990s Intergender singles bouts were first fought on... 1
43755 1870 In 1870, after France attacked Prussia, Prussi... 1
65525 Champs-Élysées As of 2013 the City of Paris had 1,570 hotels ... 1

In [56]:
answersDf[answersDf['wordCount'] == 2].sample(n=20, random_state=5)


Out[56]:
answer sentence wordCount
31777 six years A peace agreement was signed in which John ret... 2
4799 Notre Dame In 2006, Lee was awarded an honorary doctorate... 2
21766 gamma-aminobutyric acid The two neurotransmitters that are used most w... 2
28267 Thomas Aquinas During the Middle Ages, the Aristotelian view ... 2
7152 The Beatles The single, "A Moment Like This", went on to b... 2
26176 migratory species The state is also a host to a large population... 2
33975 over five For example, over five columns of text were de... 2
4851 Mockingbird groupies Local residents call them "Mockingbird groupie... 2
85540 Alan Rogerson Former members Heather and Gary Botting compar... 2
77579 Sheffield United The first ever Premier League goal was scored ... 2
33022 suppressive fire Many units are supplemented with a variety of ... 2
19486 political boundaries This claim also cannot be used to invalidate t... 2
92917 20 minutes It is connected to the city via the Metro Ligh... 2
56025 Ashe County Over the last decade, North Carolina has becom... 2
28701 Brian Labone The late centre half and former captain Brian ... 2
5075 learning investments Hence the additional costs of the incentives f... 2
22254 23.02% According to surveys conducted in 2007 and 200... 2
7646 Jennifer Hudson Other alumni have gone on to work in televisio... 2
54428 every continent Glaciers are present on every continent and ap... 2
29432 21 March The Church of Alexandria celebrated Easter on ... 2

In [57]:
answersDf[answersDf['wordCount'] == 3].sample(n=20, random_state=5)


Out[57]:
answer sentence wordCount
49157 Vasco da Gama Portugal had during the 15th century – particu... 3
28486 Copa del Generalísimo The 1960s saw the emergence of Josep Maria Fus... 3
91654 magnetic tape shortage During the following years, a magnetic tape sh... 3
95828 fear of betrayal In 1354, when Toghtogha led a large army to cr... 3
61090 Arab Umayyad Caliphate After conquering Persia, the Arab Umayyad Cali... 3
92998 keyed Northumbrian smallpipes John Dunn, inventor of keyed Northumbrian smal... 3
66068 The Weather Company On October 28, 2015, IBM announced its acquisi... 3
24543 10 February 1931 The city that was later dubbed "Lutyens' Delhi... 3
50145 political and moral He is without parallel in any age, excepting p... 3
84203 the Roku player Google made YouTube available on the Roku play... 3
72640 1792 and 1793 The guillotines used during the Reign of Terro... 3
78086 power to veto This made his person sacrosanct, gave him the ... 3
85670 George C. Marshall Next, he was appointed Assistant Chief of Staf... 3
10430 Pius V. The use of the title was reserved for the card... 3
49201 Frederick the Wise When he refused, he was placed under the ban o... 3
8290 Reporters Without Borders Reporters Without Borders organised several sy... 3
82432 mythical chullumpi bird The mythical chullumpi bird is said to mark th... 3
44100 George W. Bush New Haven is the birthplace of former presiden... 3
93328 president and CEO Noble subsequently acquired the rights to the ... 3
61195 East India Company This led to the Battle of Plassey on 23 June 1... 3

In [58]:
answersDf[answersDf['wordCount'] == 5].sample(n=20, random_state=5)


Out[58]:
answer sentence wordCount
85209 conduct surveys of party colleagues For instance, to keep their party colleagues "... 5
20338 Robert Bideleux and Ian Jeffries Significant legislative changes in the status ... 5
22367 in excess of £3.3 billion The total annual cost to support the defence e... 5
64526 Koninklijk Conservatorium Artesis Hogeschool A... She is now also professor mandolin at the musi... 5
97009 end of World War I At the end of World War I, the Rhineland was s... 5
54390 partly cold-based and partly warm-based Glaciers which are partly cold-based and partl... 5
90084 body and blood of Christ Luther insisted on the Real Presence of the bo... 5
39942 the eastern waterfront in Buceo The Museo Naval, is located on the eastern wat... 5
95598 School of Social Service Administration In 1955, Eero Saarinen was contracted to devel... 5
58652 protruded from the road surface However, the company ceased trading in 1875 af... 5
56345 scientific naturalism over natural theology Huxley wanted science to be secular, without r... 5
25781 Kraftwerk, Art of Noise This sound, also influenced by European electr... 5
38132 a great-great grandson of Jacob Jacob and his sons had lived in Canaan but wer... 5
41885 off Australia's northwestern coast On 20 May 2011, Royal Dutch Shell's final inve... 5
64614 Chris Thile of California is Chris Thile of California is a well-known play... 5
89455 in less than quadratic time Similarly, algorithms can solve the NP-complet... 5
20808 between 1000 to 1500 BC Celtic tribes settled in Switzerland between 1... 5
16151 executive director of football operations Foster appointed legendary Darrel "Mouse" Davi... 5
55832 tobacco, cotton and agriculture Impoverished by the Civil War, the state conti... 5
37697 large tumour on her liver When Lady Flora died in July, the post-mortem ... 5

In [59]:
answersDf[answersDf['wordCount'] == 42].iloc[0]['answer']


Out[59]:
'Hillary Clinton (2008), Howard Dean (2004), Gary Hart (1984 and 1988), Paul Tsongas (1992), Pat Robertson (1988) and Jerry Brown (1976, 1980, 1992).'

In [68]:
!python -m spacy download en_core_web_md


Requirement already satisfied: en_core_web_md==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0 in /usr/local/lib/python3.6/dist-packages (2.1.0)
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_md')

In [0]:
from spacy import displacy
from collections import Counter
import en_core_web_md
nlp = en_core_web_md.load()
# nlp = spacy.load('en_core_web_md')

In [0]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

In [72]:
print([(X.text, X.label_) for X in doc.ents])


[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]

In [0]:
def NerForWord(text):
    """Return the label of the first entity found in ``text``.

    ``text`` is run through ``nlp``; an empty string is returned when the
    resulting document has no entities.
    """
    entities = nlp(text).ents

    #TODO - Could potentially find multiple entities in the text. We're returning only the first one.
    return entities[0].label_ if len(entities) > 0 else ''

In [76]:
doc = nlp('My name is Laxmikant, My GIN is 43578811. I love coding, I do career with it.')
print([(X.text, X.label_) for X in doc.ents])


[('Laxmikant', 'PERSON'), ('43578811', 'DATE')]

In [0]:
def isSingleToken(text):
    """Tell whether ``text`` is a single named entity or a single token.

    Returns True when ``nlp`` finds exactly one entity whose text equals the
    whole input, or when the parsed document consists of one token; otherwise
    returns False.
    """
    doc = nlp(text)
    entities = doc.ents

    # Case 1: the entire text is one named entity.
    wholeTextIsEntity = len(entities) == 1 and entities[0].text == text

    # Case 2: not a whole-text entity, but the document has exactly one token.
    return wholeTextIsEntity or len(doc) == 1

In [81]:
isSingleToken('Laxmikant Ratnaparkhi')


Out[81]:
True

In [82]:
# Count the single-token answers among the first tenth of the rows.
sampleSize = len(answersDf) // 10
sampleAnswers = answersDf['answer'].iloc[:sampleSize]

singleTokenCount = 0
for i, answerText in enumerate(sampleAnswers):
    printPercentage(i, sampleSize)

    if isSingleToken(answerText):
        singleTokenCount += 1


99%

In [84]:
singleTokenCount / sampleSize


Out[84]:
0.5769152404237978

In [88]:
doc = nlp('James R. Scott abc2@.com')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop, len(doc.ents), doc.ents[0].label_)
    
shape = doc[0].shape_
for wordIndex in range(1, len(doc)):
    shape += (' ' + doc[wordIndex].shape_)
        
print(shape, doc[0].shape_)


James James PROPN NNP compound Xxxxx True False 1 PERSON
R. R. PROPN NNP compound X. False False 1 PERSON
Scott Scott PROPN NNP compound Xxxxx True False 1 PERSON
abc2@.com abc2@.com X ADD ROOT xxxd@.xxx False False 1 PERSON
Xxxxx X. Xxxxx xxxd@.xxx Xxxxx

In [89]:
# The notebook never imports the top-level spacy package elsewhere, so bring
# it into scope here; without it this cell raises NameError on a fresh kernel.
import spacy

spacy.explain('CARDINAL')


Out[89]:
'Numerals that do not fall under another type'

In [0]:
answersDf['isSingleToken'] = False
answersDf['NER'] = ''
answersDf['POS'] = ''
answersDf['TAG'] = ''
answersDf['DEP'] = ''
answersDf['shape'] = ''
answersDf['isAlpha'] = False
answersDf['isStop'] = False

In [91]:
answersDf.head()


Out[91]:
answer sentence wordCount isSingleToken NER POS TAG DEP shape isAlpha isStop
0 Saint Bernadette Soubirous It is a replica of the grotto at Lourdes, Fran... 3 False False False
1 a copper statue of Christ Immediately in front of the Main Building and ... 5 False False False
2 the Main Building Next to the Main Building is the Basilica of t... 3 False False False
3 a Marian place of prayer and reflection Immediately behind the basilica is the Grotto,... 7 False False False
4 a golden statue of the Virgin Mary Atop the Main Building's gold dome is a golden... 7 False False False

In [92]:
# Fill the feature columns for the single-token answers found in the first
# tenth of the rows.
sampleSize = int(len(answersDf) / 10)

for i in range(sampleSize):

    printPercentage(i, sampleSize)

    answer = answersDf.iloc[i]['answer']
    if (isSingleToken(answer)):
        # Parse once here and read the entity label from this document;
        # calling NerForWord would run the pipeline again on the same text.
        doc = nlp(answer)

        answersDf.at[i, 'isSingleToken'] = True
        answersDf.at[i, 'NER'] = doc.ents[0].label_ if len(doc.ents) > 0 else ''

        # Token-level attributes are read from the first token only.
        answersDf.at[i, 'POS'] = doc[0].pos_
        answersDf.at[i, 'TAG'] = doc[0].tag_
        answersDf.at[i, 'DEP'] = doc[0].dep_
        answersDf.at[i, 'isAlpha'] = doc[0].is_alpha
        answersDf.at[i, 'isStop'] = doc[0].is_stop

        # The shape column joins the shapes of all tokens with single spaces.
        answersDf.at[i, 'shape'] = ' '.join(token.shape_ for token in doc)


99%

In [93]:
answersDf[answersDf['NER'] == 'ORG'].sample(n=10, random_state=5)


Out[93]:
answer sentence wordCount isSingleToken NER POS TAG DEP shape isAlpha isStop
342 Sony Music In 1996, the girls began recording their debut... 2 True ORG PROPN NNP compound Xxxx Xxxxx True False
9325 HUD In his dissent to the majority report of the F... 1 True ORG PROPN NNP ROOT XXX True False
3408 Shenzhen Stock Exchange Both the Shanghai Stock Exchange and the Shenz... 3 True ORG PROPN NNP compound Xxxxx Xxxxx Xxxxx True False
2789 Foxconn Foxconn, Apple's manufacturer, initially denie... 1 True ORG PROPN NNP ROOT Xxxxx True False
7729 The Walt Disney Company On February 14, 2009, The Walt Disney Company ... 4 True ORG DET DT det Xxx Xxxx Xxxxx Xxxxx True True
3181 The London Fire Brigade The London Fire Brigade was on set to simulate... 4 True ORG DET DT det Xxx Xxxxx Xxxx Xxxxx True True
1479 FDA Possible improvements include clarification of... 1 True ORG PROPN NNP ROOT XXX True False
9702 Cabinda It is also bounded by Gabon to the west, Camer... 1 True ORG PROPN NNP ROOT Xxxxx True False
6482 Amitābha The Japanese Pure Land teacher Genshin taught ... 1 True ORG PROPN NNP ROOT Xxxxx True False
7240 Daughtry Despite being eliminated earlier in the season... 1 True ORG PROPN NNP ROOT Xxxxx True False

In [94]:
answersDf['isStop'].value_counts()


Out[94]:
False    97704
True       465
Name: isStop, dtype: int64

In [95]:
answersDf['isAlpha'].value_counts()


Out[95]:
False    94121
True      4048
Name: isAlpha, dtype: int64

In [96]:
answersDf[answersDf['POS'] == 'PROPN'].sample(n=5, random_state=16)


Out[96]:
answer sentence wordCount isSingleToken NER POS TAG DEP shape isAlpha isStop
2878 Lanayru In return, she helps Link find Ordon Village's... 1 True ORG PROPN NNP ROOT Xxxxx True False
3788 CCTV-1 All Mainland Chinese television stations (alon... 1 True ORG PROPN NNP ROOT XXXX-d False False
9634 Bantu Bantu-speaking peoples who founded tribes duri... 1 True NORP PROPN NNP ROOT Xxxxx True False
1706 French In France he used the French versions of his g... 1 True NORP PROPN NNP ROOT Xxxxx True False
9284 Financial Crisis Inquiry Commission The Financial Crisis Inquiry Commission conclu... 4 True ORG PROPN NNP compound Xxxxx Xxxxx Xxxxx Xxxxx True False

In [97]:
answersDf[answersDf['POS'] == 'NOUN'].sample(n=5, random_state=16)


Out[97]:
answer sentence wordCount isSingleToken NER POS TAG DEP shape isAlpha isStop
4206 Christianity Christianity (59%), particularly Catholicism (... 1 True NOUN NN ROOT Xxxxx True False
9611 Labour In 1976 the future Labour prime minister James... 1 True NOUN NN ROOT Xxxxx True False
8027 nets pointers and hounds), rodent control, guarding... 1 True NOUN NNS ROOT xxxx True False
8136 humans Dogs are also vulnerable to some of the same h... 1 True NOUN NNS ROOT xxxx True False
6726 life On the other hand, the rules themselves are de... 1 True NOUN NN ROOT xxxx True False

In [98]:
answersDf[answersDf['POS'] == 'NUM'].sample(n=10, random_state=16)


Out[98]:
answer sentence wordCount isSingleToken NER POS TAG DEP shape isAlpha isStop
4168 92 Throughout its history, the city has been a ma... 1 True CARDINAL NUM CD ROOT dd False False
6996 2001 The show debuted in 2001 in Britain with Lythg... 1 True DATE NUM CD ROOT dddd False False
9763 44% Net primary enrollment rate was 44% in 2005, m... 2 True PERCENT NUM CD nummod dd % False False
7528 15 Fox announced on May 11, 2015 that the fifteen... 1 True CARDINAL NUM CD ROOT dd False False
403 four Beyoncé's first solo recording was a feature o... 1 True CARDINAL NUM CD ROOT xxxx True True
4521 290 million gallons The Croton Watershed north of the city is unde... 3 True QUANTITY NUM CD compound ddd xxxx xxxx False False
4757 40 In the years since, it has sold more than 30 m... 1 True CARDINAL NUM CD ROOT dd False False
3275 75 And while Lea Seydoux doesn’t leave a huge imp... 1 True CARDINAL NUM CD ROOT dd False False
6008 2.3 million Late Registration sold over 2.3 million units ... 2 True CARDINAL NUM CD compound d.d xxxx False False
2794 2010 In 2010, a number of workers committed suicide... 1 True DATE NUM CD ROOT dddd False False

In [0]:
def _mergeAnswerSpans(spans):
    """Merge overlapping (start, end) character spans into sorted, disjoint spans.

    Spans that merely touch (one ends exactly where the next begins) are kept
    separate; only spans that share at least one character are merged.
    """
    merged = []
    for start, end in sorted(set(spans)):
        if merged and start < merged[-1][1]:
            #Overlaps the previous span: extend it instead of emitting the shared text twice
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged


def highlightAnswers(titleId, paragraphId):
    """Display a paragraph with every answer span wrapped in '**' markers.

    The paragraph and its questions are read from the global `df`
    (df['data'][titleId]['paragraphs'][paragraphId]). For each question only the
    first listed answer is used; questions without any answer are skipped.
    Answer spans that overlap (including answers that start at the same offset
    but differ in length) are merged into one highlighted span, so no part of
    the paragraph is ever repeated in the output.

    Parameters:
        titleId: index of the entry in df['data'].
        paragraphId: index of the paragraph inside that entry.

    Returns:
        None. The marked-up text is passed to `display`.
    """
    paragraphData = df['data'][titleId]['paragraphs'][paragraphId]
    paragraph = paragraphData['context']

    #Collect the (start, end) character span of the first answer of each question
    spans = []
    for question in paragraphData['qas']:
        if not question['answers']:
            continue
        firstAnswer = question['answers'][0]
        answerStart = firstAnswer['answer_start']
        spans.append((answerStart, answerStart + len(firstAnswer['text'])))

    #Text between previous answer and current answer + bold sign + answer + bold sign
    pieces = []
    currentPlaceInText = 0
    for start, end in _mergeAnswerSpans(spans):
        pieces.append(paragraph[currentPlaceInText:start])
        pieces.append('**' + paragraph[start:end] + '**')
        currentPlaceInText = end

    #Append the remaining text after the last answer
    pieces.append(paragraph[currentPlaceInText:])

    #Display the highlighted text
    #NOTE(review): a plain str is displayed, so the '**' markers show up literally;
    #wrapping in IPython's Markdown would render them - confirm which is intended.
    display('**' + ''.join(pieces) + '**')

In [102]:
# Show the highlighted answers for entry 24, paragraph 0
titleId, paragraphId = 24, 0

highlightAnswers(titleId=titleId, paragraphId=paragraphId)


'**Located approximately 250 kilometres (**160** mi) east of Puerto Rico and the nearer Virgin Islands, St. Barthélemy lies immediately southeast of the islands of Saint Martin and Anguilla. It is one of **the Renaissance** Islands. St. Barthélemy is separated from Saint Martin by **the Saint-Barthélemy Channel**. It lies northeast of Saba and St Eustatius, and north of St Kitts. Some small **satellite islets** belong to St. Barthélemy including Île Chevreau (Île Bonhomme), Île Frégate, Île Toc Vers, Île Tortue and Gros Îlets (Îlots Syndare). A much bigger islet, Île Fourchue, lies on the north of the island, in the Saint-Barthélemy Channel. Other rocky islets which include Coco, the Roques (or **little Turtle rocks**), the Goat, and the Sugarloaf.**'

In [103]:
# Show the highlighted answers for entry 4, paragraph 12
titleId, paragraphId = 4, 12

highlightAnswers(titleId=titleId, paragraphId=paragraphId)


'****Inappropriate antibiotic treatment and overuse** of antibiotics have contributed to the emergence of antibiotic-resistant bacteria. **Self prescription** of antibiotics is an example of misuse. Many antibiotics are frequently prescribed to treat symptoms or diseases that do not respond to antibiotics or that are likely to resolve without treatment. Also, incorrect or suboptimal antibiotics are prescribed for certain bacterial infections. The **overuse of antibiotics**, like penicillin and erythromycin, has been associated with emerging antibiotic resistance since the 1950s. Widespread usage of antibiotics in hospitals has also been associated with increases in bacterial strains and species that no longer respond to treatment with the most common antibiotics.**'

In [104]:
# Take the context of the first paragraph of the first entry in df['data']
text = df['data'][0]['paragraphs'][0]['context']
# `nlp` is defined earlier in the notebook (presumably a spaCy pipeline - confirm)
doc = nlp(text)

# Print each noun chunk found in the processed paragraph, one per line
for noun_chunk in doc.noun_chunks:
    print(noun_chunk)


the school
a Catholic character
the Main Building's gold dome
a golden statue
the Virgin Mary
front
the Main Building
it
a copper statue
Christ
arms
the legend
"Venite Ad Me Omnes
the Main Building
the Basilica
the Sacred Heart
the basilica
the Grotto
a Marian place
prayer
reflection
It
a replica
the grotto
Lourdes
France
the Virgin Mary
Saint Bernadette Soubirous
the end
the main drive
a direct line
3 statues
the Gold Dome
a simple, modern stone statue
Mary

In [105]:
# Show the highlighted answers for entry 0, paragraph 0
titleId, paragraphId = 0, 0

highlightAnswers(titleId=titleId, paragraphId=paragraphId)


'**Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is **a golden statue of the Virgin Mary**. Immediately in front of the Main Building and facing it, is **a copper statue of Christ** with arms upraised with the legend "Venite Ad Me Omnes". Next to **the Main Building** is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, **a Marian place of prayer and reflection**. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to **Saint Bernadette Soubirous** in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.**'