In [1]:
# This command allows plots to appear in the jupyter notebook.
%matplotlib inline
import pandas as pd
# Let dataframe cells display up to 300 characters so long clue text stays readable.
pd.set_option('max_colwidth', 300)

# Read the cleaned JSON export into the working dataframe `df`.
df = pd.read_json('JEOPARDY_QUESTIONS1_cleaned.json')

# Turn the air_date strings into real timestamps (year comes first in the text).
df['air_date'] = pd.to_datetime(df['air_date'], yearfirst=True)

# Restrict the data to the well-sampled years, 2004 through 2011 (both ends inclusive).
first_day = '2004-01-01'
last_day = '2011-12-31'
on_or_after_first_day = df['air_date'] >= first_day
on_or_before_last_day = df['air_date'] <= last_day
df = df[on_or_after_first_day & on_or_before_last_day]
In [2]:
# The fifty U.S. state names, in alphabetical order, one row per initial letter.
list_of_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico",
    "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]
In [3]:
# Keep only the rows whose answer is exactly one of the fifty state names.
answer_is_a_state = df['answer'].isin(list_of_states)
state_answers = df[answer_is_a_state]

# Tally how often each state shows up as the answer.
count_state_answers = state_answers['answer'].value_counts()

# Put the tallies in a one-column dataframe and preview the first rows.
state_data = pd.DataFrame(count_state_answers)
state_data.columns = ['total_count']
state_data.head()
Out[3]:
I got the idea of figuring out the keywords for states from an exercise I did while learning Python using Google's Python Class (which I highly recommend if you are new to Python but not new to programming). The first Python exercise, "Basic Python Exercises", involved counting the words in a document. I followed the same basic idea here in order to create a list of popular words for each state.
In [4]:
# Map each state name to the flat list of lower-cased, whitespace-separated
# words taken from every question whose answer is that state.
question_words_by_state = {}
for state_name in list_of_states:
    # Questions (a pandas Series) for the current state only.
    questions_for_state = state_answers[state_answers['answer'] == state_name]['question']
    question_words_by_state[state_name] = [
        token.lower()
        for text in questions_for_state
        for token in text.split()
    ]
In [5]:
# Print the two most common question words for the first five states.
from collections import Counter

for state in list_of_states[0:5]:
    counting = Counter(question_words_by_state[state])
    # print(...) with a single argument behaves the same on Python 2 and
    # Python 3; the bare `print x` statement is a SyntaxError on Python 3.
    print(state)
    # most_common yields (word, occurrences) pairs, most frequent first.
    for word, occurrences in counting.most_common(2):
        print(' {} {!s}'.format(occurrences, word))
Well, this is not very useful. The most common words are pretty boring.
Let's create a list, somewhat arbitrarily, of common words. This list was compiled by running my loop a few times to find the most popular words and then adding the "boring" words to the `common_words` list.
In [6]:
# Hand-picked words that are too generic to characterize any one state.
# Every cleaned question word is compared against this list (exact match).
common_words = ['the', 'in', 'this', 'of', 'state', 'a', 'is', '&', 'to', 'with', 'its',
                'saw', "state's", 'for', "u.s.", 'was', 'i', 'all', 'that', 'on', 'as',
                'only', 'state"', 'at', 'his', 'from', 'like', 'name', 'goes', 'known',
                'named', 'it', 'has', "it's", 'and', 'than', 'found', 'you', 'an', 'are', 'were',
                'by', 'my', 'can', "you're", 'if', 'had', 'but', 'have', 'or', 'some', 'where',
                'part', 'take', 'but', 'about', 'near', 'what', 'now', 'he', 'after', 'became',
                'one', "(of", 'not', 'them', 'group', 'national', 'new', 'miles', 'me', 'more']
# Set copy of the list: the membership test below runs once per word of every
# question, and a set answers it in constant time with identical results.
common_word_set = set(common_words)

# Map each state name to the list of cleaned, non-common words from every
# question whose answer is that state.
question_words_by_state = {}
for state in list_of_states:
    # Rows of state_answers whose answer is the current state.
    this_state = state_answers[state_answers['answer'] == state]
    question_list_of_words = []
    for question in this_state['question']:
        for word in question.split():
            # Lower-case, then strip punctuation in this fixed order: trailing
            # commas, trailing quotes, leading quotes, leading '(' and trailing ')'.
            # The order matters: a word ending in '")' keeps its quote, which is
            # presumably why entries such as 'state"' appear in common_words.
            cleaned_word = (word.lower()
                            .rstrip(",")
                            .rstrip('"')
                            .lstrip('"')
                            .lstrip('(')
                            .rstrip(')'))
            # Ignore common words
            if cleaned_word not in common_word_set:
                question_list_of_words.append(cleaned_word)
    question_words_by_state[state] = question_list_of_words
In [7]:
# Print the six most common (non-boring) question words for every state.
from collections import Counter

for state in list_of_states:
    counting = Counter(question_words_by_state[state])
    # print(...) with a single argument behaves the same on Python 2 and
    # Python 3; the bare `print x` statement is a SyntaxError on Python 3.
    print(state)
    # most_common yields (word, occurrences) pairs, most frequent first.
    for word, occurrences in counting.most_common(6):
        print(' {} {!s}'.format(occurrences, word))
As an Arizona native, I was a bit surprised that Prescott was so popular. Maybe these questions have to do with the fact that Prescott used to be the capital city of Arizona? Let's take a look at these questions.
In [8]:
# Show the questions answered by 'Arizona' whose text mentions Prescott.
answer_is_arizona = state_answers['answer'] == 'Arizona'
mentions_prescott = state_answers['question'].str.contains('Prescott')
state_answers[answer_is_arizona & mentions_prescott]
Out[8]:
Hmmm... These facts aren't related to the "capital-ness" of Prescott. It seems like a contestant just needs to know that Prescott is in the state of Arizona in order to be able to answer these questions.
What about the questions about Phoenix, the current capital of Arizona? What do these look like?
In [9]:
# Show every state-answer question whose text mentions Phoenix.
mentions_phoenix = state_answers['question'].str.contains('Phoenix')
state_answers[mentions_phoenix]
Out[9]:
Phoenix isn't generally used as part of the question, but maybe it's more likely to be the answer.
In [10]:
# Phoenix, the state's capital, isn't featured prominently in questions about
# the state, but perhaps that's because it more often appears as the answer
# to a question about Arizona. List every question answered by 'Phoenix'.
answer_is_phoenix = df['answer'] == 'Phoenix'
df[answer_is_phoenix]
Out[10]:
In [11]:
# Per-column count of non-null values among the questions answered by 'Phoenix'.
phoenix_questions = df[df['answer'] == 'Phoenix']
phoenix_questions.count()
Out[11]:
It looks like for Arizona, it's more important to know the name of the capital when given the state, rather than vice versa.
I wonder if this relationship holds true for other states. To do this, I'll need to compile a list of state capitals.
In [12]:
# Lookup table: state name -> name of that state's capital city.
state_capitals = {
    "Alabama": "Montgomery",
    "Alaska": "Juneau",
    "Arizona": "Phoenix",
    "Arkansas": "Little Rock",
    "California": "Sacramento",
    "Colorado": "Denver",
    "Connecticut": "Hartford",
    "Delaware": "Dover",
    "Florida": "Tallahassee",
    "Georgia": "Atlanta",
    "Hawaii": "Honolulu",
    "Idaho": "Boise",
    "Illinois": "Springfield",
    "Indiana": "Indianapolis",
    "Iowa": "Des Moines",
    "Kansas": "Topeka",
    "Kentucky": "Frankfort",
    "Louisiana": "Baton Rouge",
    "Maine": "Augusta",
    "Maryland": "Annapolis",
    "Massachusetts": "Boston",
    "Michigan": "Lansing",
    "Minnesota": "St. Paul",
    "Mississippi": "Jackson",
    "Missouri": "Jefferson City",
    "Montana": "Helena",
    "Nebraska": "Lincoln",
    "Nevada": "Carson City",
    "New Hampshire": "Concord",
    "New Jersey": "Trenton",
    "New Mexico": "Santa Fe",
    "New York": "Albany",
    "North Carolina": "Raleigh",
    "North Dakota": "Bismarck",
    "Ohio": "Columbus",
    "Oklahoma": "Oklahoma City",
    "Oregon": "Salem",
    "Pennsylvania": "Harrisburg",
    "Rhode Island": "Providence",
    "South Carolina": "Columbia",
    "South Dakota": "Pierre",
    "Tennessee": "Nashville",
    "Texas": "Austin",
    "Utah": "Salt Lake City",
    "Vermont": "Montpelier",
    "Virginia": "Richmond",
    "Washington": "Olympia",
    "West Virginia": "Charleston",
    "Wisconsin": "Madison",
    "Wyoming": "Cheyenne",
}
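Now let's compare the number of questions for the following cases:
- Case A: the question contains the capital's name and the answer is the state.
- Case B: the question contains the state's name and the answer is the capital.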
In [13]:
# For every state, count questions in both directions:
#   countA - the question text contains the capital and the answer is the state.
#   countB - the question text contains the state and the answer is the capital.
# state_capital_count maps state -> [countA, countB]; countAlist and countBlist
# hold the same numbers as two parallel lists, in dict iteration order.
state_capital_count = {}
countAlist = []
countBlist = []
for state, capital in state_capitals.items():
    # regex=False matches the name literally; with the default regex matching
    # the '.' in 'St. Paul' would match any character.
    # na=False treats a missing question as "no match" instead of letting NaN
    # leak into the boolean mask.
    # NOTE(review): this is still a plain substring match, so 'Kansas' also
    # matches inside 'Arkansas' and 'Virginia' inside 'West Virginia'.
    question_has_capital = state_answers['question'].str.contains(
        capital, regex=False, na=False)
    countA = state_answers[question_has_capital &
                           (state_answers['answer'] == state)]['answer'].count()
    question_has_state = df['question'].str.contains(state, regex=False, na=False)
    countB = df[(df['answer'] == capital) &
                question_has_state]['answer'].count()
    countAlist.append(countA)
    countBlist.append(countB)
    state_capital_count[state] = [countA, countB]
In [14]:
# Dump the state -> [countA, countB] mapping. print(...) with one argument
# works on both Python 2 and Python 3, unlike the bare print statement.
print(state_capital_count)
In [15]:
import numpy as np

# Average number of questions per state for Case A, then for Case B.
# print(...) with one argument works on both Python 2 and Python 3, unlike
# the bare print statement.
print(np.mean(countAlist))
print(np.mean(countBlist))
In [16]:
import matplotlib.pyplot as plt

# One point per state: Case A count on x, Case B count on y.
plt.scatter(countAlist, countBlist, marker='o')
plt.xlabel('Case A - Question contains capital')
plt.ylabel('Case B - Question contains state')

# Start both axes just below zero so points lying on an axis stay visible.
plt.xlim(xmin=-1)
plt.ylim(ymin=-1)

# One-to-one reference line; points above it have more Case B questions.
diagonal_ends = [-1, 20]
plt.plot(diagonal_ends, diagonal_ends);
The blue line shows the one-to-one line. If the points were all along this line then there would be just as many questions in Case A as Case B.
Therefore, it looks like the results for Arizona hold, in general, for the rest of the states, with only three outliers. A question is more likely to contain the state's name and expect the capital city as an answer, rather than vice versa.
By the way, those outliers are...
In [17]:
# List the outliers: states with more Case A questions (capital in the
# question, state as the answer) than Case B questions.
for state, counts in state_capital_count.items():
    capital = state_capitals[state]
    # counts is [countA, countB] for this state.
    if counts[0] > counts[1]:
        # print(...) with one argument works on both Python 2 and Python 3,
        # unlike the bare print statement.
        print(capital + ", " + state)