In [63]:
import pandas as pd

def preview(df):
    print("Dimensions: {0} rows x {1} columns".format(df.shape[0], df.shape[1]))
    return df.head()

jeopardy = pd.read_csv("jeopardy.csv")
preview(jeopardy)


Dimensions: 216930 rows x 7 columns
Out[63]:
Show Number Air Date Round Category Value Question Answer
0 4680 2004-12-31 Jeopardy! HISTORY $200 For the last 8 years of his life, Galileo was ... Copernicus
1 4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES $200 No. 2: 1912 Olympian; football star at Carlisl... Jim Thorpe
2 4680 2004-12-31 Jeopardy! EVERYBODY TALKS ABOUT IT... $200 The city of Yuma in this state has a record av... Arizona
3 4680 2004-12-31 Jeopardy! THE COMPANY LINE $200 In 1963, live on "The Art Linkletter Show", th... McDonald's
4 4680 2004-12-31 Jeopardy! EPITAPHS & TRIBUTES $200 Signer of the Dec. of Indep., framer of the Co... John Adams

In [64]:
print(jeopardy.columns)

# Rename columns
jeopardy.rename(columns=lambda name: name.lstrip(), inplace=True)


Index([u'Show Number', u' Air Date', u' Round', u' Category', u' Value',
       u' Question', u' Answer'],
      dtype='object')

In [65]:
print(jeopardy.columns)


Index([u'Show Number', u'Air Date', u'Round', u'Category', u'Value',
       u'Question', u'Answer'],
      dtype='object')

Normalizing text


In [66]:
import string

def norm_words(words):
    words = words.lower().translate(None, string.punctuation)
    return words

jeopardy["clean_question"] = jeopardy["Question"].apply(norm_words)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_words)

jeopardy.head()


Out[66]:
Show Number Air Date Round Category Value Question Answer clean_question clean_answer
0 4680 2004-12-31 Jeopardy! HISTORY $200 For the last 8 years of his life, Galileo was ... Copernicus for the last 8 years of his life galileo was u... copernicus
1 4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES $200 No. 2: 1912 Olympian; football star at Carlisl... Jim Thorpe no 2 1912 olympian football star at carlisle i... jim thorpe
2 4680 2004-12-31 Jeopardy! EVERYBODY TALKS ABOUT IT... $200 The city of Yuma in this state has a record av... Arizona the city of yuma in this state has a record av... arizona
3 4680 2004-12-31 Jeopardy! THE COMPANY LINE $200 In 1963, live on "The Art Linkletter Show", th... McDonald's in 1963 live on the art linkletter show this c... mcdonalds
4 4680 2004-12-31 Jeopardy! EPITAPHS & TRIBUTES $200 Signer of the Dec. of Indep., framer of the Co... John Adams signer of the dec of indep framer of the const... john adams

Normalizing columns


In [67]:
def norm_value(value):
    try:
        value = int(value.translate(None, string.punctuation))
    except:
        value = 0
    return value

jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

print(jeopardy.dtypes)
jeopardy.head()


Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object
Out[67]:
Show Number Air Date Round Category Value Question Answer clean_question clean_answer clean_value
0 4680 2004-12-31 Jeopardy! HISTORY $200 For the last 8 years of his life, Galileo was ... Copernicus for the last 8 years of his life galileo was u... copernicus 200
1 4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES $200 No. 2: 1912 Olympian; football star at Carlisl... Jim Thorpe no 2 1912 olympian football star at carlisle i... jim thorpe 200
2 4680 2004-12-31 Jeopardy! EVERYBODY TALKS ABOUT IT... $200 The city of Yuma in this state has a record av... Arizona the city of yuma in this state has a record av... arizona 200
3 4680 2004-12-31 Jeopardy! THE COMPANY LINE $200 In 1963, live on "The Art Linkletter Show", th... McDonald's in 1963 live on the art linkletter show this c... mcdonalds 200
4 4680 2004-12-31 Jeopardy! EPITAPHS & TRIBUTES $200 Signer of the Dec. of Indep., framer of the Co... John Adams signer of the dec of indep framer of the const... john adams 200

Answers in questions


In [73]:
def ans_in_q(row):
    match_count = 0
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    
    try:
        split_answer.remove("the")
    except:
        pass
    
    if len(split_answer) == 0:
        return 0
    else:
        for word in split_answer:
            if word in split_question:
                match_count += 1
        return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(ans_in_q, axis=1)
print(jeopardy["answer_in_question"].mean())
jeopardy[jeopardy["answer_in_question"] > 0].head()


0.00643064583045
Out[73]:
Show Number Air Date Round Category Value Question Answer clean_question clean_answer clean_value answer_in_question
266 4931 2006-02-06 Double Jeopardy! NOT A CURRENT NATIONAL CAPITAL $400 Ljubljana, Bratislava, Barcelona Barcelona ljubljana bratislava barcelona barcelona 400 1
272 4931 2006-02-06 Double Jeopardy! NOT A CURRENT NATIONAL CAPITAL $800 Istanbul, Ottawa, Amman Istanbul istanbul ottawa amman istanbul 800 1
278 4931 2006-02-06 Double Jeopardy! NOT A CURRENT NATIONAL CAPITAL $1200 Sofia, Sarajevo, Saigon Saigon sofia sarajevo saigon saigon 1200 1
284 4931 2006-02-06 Double Jeopardy! NOT A CURRENT NATIONAL CAPITAL $1600 Bucharest, Bonn, Bern Bonn bucharest bonn bern bonn 1600 1
290 4931 2006-02-06 Double Jeopardy! NOT A CURRENT NATIONAL CAPITAL $2000 Belize City, Guatemala City, Panama City Belize City belize city guatemala city panama city belize city 2000 1

In [85]:
jeopardy[(jeopardy["answer_in_question"] > 0) & (jeopardy["clean_question"].apply(string.split).apply(len) > 6)].head()


Out[85]:
Show Number Air Date Round Category Value Question Answer clean_question clean_answer clean_value answer_in_question
1137 1279 1990-03-08 Jeopardy! PEANUTS $200 Of a 25th, 30th or 40th anniversary, what "Pea... 40th Anniversary of a 25th 30th or 40th anniversary what peanut... 40th anniversary 200 1
1840 3113 1998-02-25 Double Jeopardy! TAKE A GUESS $600 Of a pogo stick injury, a dense winter fog or ... a dense winter fog of a pogo stick injury a dense winter fog or t... a dense winter fog 600 1
2347 4595 2004-07-23 Jeopardy! BIRD HUNTING $800 The third rail in a subway system is the one w... a rail the third rail in a subway system is the one w... a rail 800 1
2572 4220 2002-12-27 Jeopardy! THE PLANET URANUS $400 Of 84, 184 or 284, the length in years of one ... 84 of 84 184 or 284 the length in years of one or... 84 400 1
4163 4213 2002-12-18 Double Jeopardy! MUD $400 Of an artist, a fish, or a wasp, it's what a m... wasp of an artist a fish or a wasp its what a mud d... wasp 400 1

Only 0.6% of the answers appear in the questions itself. Out of this 0.6%, a sample of the questions shows that they are all multiple choice questions, which concludes that it is very unlikely that the answer will be in the question itself.

Recycled questions


In [102]:
jeopardy = jeopardy.sort_values(by="Air Date")

question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    match_count = 0
    split_question = row["clean_question"].split(" ")
    for word in split_question:
        if len(word) < 6:
            split_question.remove(word)
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= float(len(split_question))
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())


0.928425630164

In [105]:
jeopardy.tail()


Out[105]:
Show Number Air Date Round Category Value Question Answer clean_question clean_answer clean_value answer_in_question question_overlap
105940 6300 2012-01-27 Jeopardy! THE TRUTH LIES THEREIN $400 Old school GPS on a shopping mall map: "You A... here old school gps on a shopping mall map you are... here 400 0 1.000000
105933 6300 2012-01-27 Jeopardy! LESSER-KNOWN SCIENTISTS $200 In 1779 Dutch scientist Jan Ingenhousz publish... photosynthesis in 1779 dutch scientist jan ingenhousz publish... photosynthesis 200 0 0.888889
105935 6300 2012-01-27 Jeopardy! VISITING THE CITY $400 First the Royal Ontario Museum, then for lunch... Toronto first the royal ontario museum then for lunch ... toronto 400 0 1.000000
105951 6300 2012-01-27 Jeopardy! LESSER-KNOWN SCIENTISTS $800 Joseph Lagrange insisted on 10 as the basic un... the metric system joseph lagrange insisted on 10 as the basic un... the metric system 800 0 0.833333
105930 6300 2012-01-27 Jeopardy! PANTS $200 A synonym for freight, or pants with large bel... cargo pants a synonym for freight or pants with large bell... cargo pants 200 0 0.909091

Low value vs high value questions


In [106]:
def value(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value

jeopardy["high_value"] = jeopardy.apply(value, axis=1)
jeopardy.head()


Out[106]:
Show Number Air Date Round Category Value Question Answer clean_question clean_answer clean_value answer_in_question question_overlap high_value
84523 1 1984-09-10 Jeopardy! LAKES & RIVERS $100 River mentioned most often in the Bible the Jordan river mentioned most often in the bible the jordan 100 0 0.000000 0
84544 1 1984-09-10 Jeopardy! ANIMALS $500 If this species of hybrid's parents were rever... a mule if this species of hybrids parents were revers... a mule 500 0 0.000000 0
84543 1 1984-09-10 Jeopardy! LAKES & RIVERS $500 World's largest lake, nearly 5 times as big as... the Caspian Sea worlds largest lake nearly 5 times as big as s... the caspian sea 500 0 0.000000 0
84542 1 1984-09-10 Jeopardy! ACTORS & ROLES $400 The blonde preferred in the film "Gentlemen Pr... Marilyn Monroe the blonde preferred in the film gentlemen pre... marilyn monroe 400 0 0.166667 0
84553 1 1984-09-10 Double Jeopardy! NATIONAL LANDMARKS $400 When he was home, George Washington slept here Mount Vernon when he was home george washington slept here mount vernon 400 0 0.000000 0

In [ ]: