Download and install Apache Drill.
Start Apache Drill in the Apache Drill directory: bin/drill-embedded
Tweak the settings as per Querying Large CSV Files With Apache Drill so you can query against column names.
In [ ]:
#Download data file
#NOTE(review): destination is a hardcoded absolute local path - change it to suit your machine
!wget -P /Users/ajh59/Documents/parlidata/ https://zenodo.org/record/579712/files/senti_post_v2.csv
In [114]:
#Install some dependencies
!pip3 install pydrill
!pip3 install pandas
!pip3 install matplotlib
In [139]:
#Import necessary packages
import pandas as pd
#pydrill provides an HTTP client for the Apache Drill REST API
from pydrill.client import PyDrill
#Set the notebooks up for inline plotting
%matplotlib inline
In [6]:
#Get a connection to the Apache Drill server
#Assumes drill-embedded is running locally on the default REST port (8047)
drill = PyDrill(host='localhost', port=8047)
We can get a speed up on querying the CSV file by converting it to the parquet
format.
In the Apache Drill terminal, run something like the following (change the path to the CSV file as required):
CREATE TABLE dfs.tmp.`/senti_post_v2.parquet` AS SELECT * FROM dfs.`/Users/ajh59/Documents/parlidata/senti_post_v2.csv`;
(Note: running this CREATE TABLE command from the notebook may fail with a timeout, so run it in the Drill terminal instead.)
In [10]:
#Test the setup
#Query the parquet copy of the data created in the Drill terminal (see above)
drill.query(''' SELECT * from dfs.tmp.`/senti_post_v2.parquet` LIMIT 3''').to_dataframe()
Out[10]:
The Hansard data gives the date of each speech but not the session. To search for speeches within a particular session, we need the session dates. We can get these from the Parliament data API.
In [45]:
#Get Parliament session dates from Parliament API
#Each row gives a session's display name plus its start and end dates
psd=pd.read_csv('http://lda.data.parliament.uk/sessions.csv?_view=Sessions&_pageSize=50')
psd
Out[45]:
In [58]:
def getParliamentDate(session, sessions=None):
    """Return the (start date, end date) pair for a named Parliament session.

    Parameters:
        session: session display name, e.g. '2015-2016'
        sessions: optional DataFrame with 'display name', 'start date' and
            'end date' columns; defaults to the global psd table loaded above

    Raises IndexError if the session name is not found.
    """
    if sessions is None:
        sessions = psd
    #Filter once and take the first matching row, rather than filtering twice
    row = sessions[sessions['display name'] == session].iloc[0]
    return row['start date'], row['end date']
getParliamentDate('2015-2016')
Out[58]:
In [140]:
#Check the columns in the Hansard dataset, along with example values
#Pull a single row so the column list and a sample value for each can be inspected
df=drill.query(''' SELECT * from dfs.tmp.`/senti_post_v2.parquet` LIMIT 1''').to_dataframe()
print(df.columns.tolist())
df.iloc[0]
Out[140]:
In [17]:
# Example of count of speeches by person in the dataset as a whole
q='''
SELECT proper_name, COUNT(*) AS number
FROM dfs.tmp.`/senti_post_v2.parquet`
GROUP BY proper_name
'''
#Run the query via Drill and preview the first few rows
df=drill.query(q).to_dataframe()
df.head()
Out[17]:
In [35]:
# Example of count of speeches by gender in the dataset as a whole
#Backticks allow the result column name to contain spaces
q="SELECT gender, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` GROUP BY gender"
drill.query(q).to_dataframe()
Out[35]:
In [71]:
#Query within session
session='2015-2016'
start,end=getParliamentDate(session)
#Values are interpolated straight into the SQL string - acceptable here because
#they come from our own trusted session table, not user input
q='''
SELECT '{session}' AS session, gender, count(*) AS `Number of Speeches`
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY gender
'''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()
Out[71]:
In [85]:
#Count number of speeches per person
#(per member - mnis_id - within the session, keeping gender for later aggregation)
start,end=getParliamentDate(session)
q='''
SELECT '{session}' AS session, gender, mnis_id, count(*) AS `Number of Speeches`
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY mnis_id, gender
'''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe().head()
Out[85]:
In [143]:
# Example of finding the average number of speeches per person by gender in a particular session
#Inner query counts speeches per member; outer query averages those counts by gender
q='''
SELECT AVG(gcount) AS average, gender, session
FROM (SELECT '{session}' AS session, gender, mnis_id, count(*) AS gcount
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY mnis_id, gender)
GROUP BY gender, session
'''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()
#Note - the average is returned as a string not a numeric
Out[143]:
In [129]:
#We can package that query up in a Python function
def avBySession(session):
    """Average number of speeches per member, by gender, for the given session.

    Returns a DataFrame with 'average' (float), 'gender' and 'session' columns.
    """
    start, end = getParliamentDate(session)
    q = '''SELECT AVG(gcount) AS average, gender, session FROM (SELECT '{session}' AS session, gender, mnis_id, count(*) AS gcount
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY mnis_id, gender) GROUP BY gender, session
'''.format(session=session, start=start, end=end)
    dq = drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average'] = dq['average'].astype(float)
    return dq
avBySession(session)
Out[129]:
In [146]:
#Loop through sessions and create a dataframe containing gender based averages for each one
#Accumulate one frame per session in a list and concatenate once at the end -
#growing a DataFrame with concat inside the loop is quadratic
frames = []
for session in psd['display name']:
    frames.append(avBySession(session))
overall = pd.concat(frames)
#Tidy up the index
overall = overall.reset_index(drop=True)
overall.head()
Out[146]:
The data is currently in a long (tidy) format. To make it easier to plot, we can reshape it (unmelt it) by casting it into a wide format, with one row per session and the gender averages arranged by column.
In [147]:
#Reshape the dataset
#Pivot to wide format: one row per session, one column per gender
overall_wide = overall.pivot(index='session', columns='gender')
#Flatten the column names
#(pivot yields a (value, gender) MultiIndex; keep just the gender level)
overall_wide.columns = overall_wide.columns.get_level_values(1)
overall_wide
Out[147]:
Now we can plot the data - the session axis should sort in an appropriate way (alphanumerically).
In [137]:
overall_wide.plot(kind='barh');
In [138]:
overall_wide.plot();
We can generalise the approach to look at a count of split by party.
In [150]:
# Example of finding the average number of speeches per person by party in a particular session
# Simply tweak the query we used for gender...
#Note: session, start and end are reused from the earlier cells
q='''
SELECT AVG(gcount) AS average, party, session
FROM (SELECT '{session}' AS session, party, mnis_id, count(*) AS gcount
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY mnis_id, party)
GROUP BY party, session
'''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()
Out[150]:
Make a function out of that, as we did before.
In [157]:
def avByType(session, typ):
    """Average number of speeches per member in a session, grouped by a column.

    Parameters:
        session: session display name, e.g. '2015-2016'
        typ: grouping column in the Hansard data, e.g. 'party' or 'gender'

    Returns a DataFrame with 'average' (float), typ and 'session' columns.
    """
    start, end = getParliamentDate(session)
    #typ is interpolated into the SQL - only pass trusted column names
    q = '''SELECT AVG(gcount) AS average, {typ}, session
FROM (SELECT '{session}' AS session, {typ}, mnis_id, count(*) AS gcount
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY mnis_id, {typ})
GROUP BY {typ}, session
'''.format(session=session, start=start, end=end, typ=typ)
    dq = drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average'] = dq['average'].astype(float)
    return dq

def avByParty(session):
    """Convenience wrapper: per-member speech averages grouped by party."""
    return avByType(session, 'party')
avByParty(session)
Out[157]:
In [172]:
# Create a function to loop through sessions and create a dataframe containing specified averages for each one
# Note that this just generalises and packages up the code we had previously
def pivotAndFlatten(overall, typ):
    """Pivot a long session/typ/average frame into wide form.

    Parameters:
        overall: long-format DataFrame with 'session', typ and value columns
        typ: name of the column whose values become the wide columns

    Returns a DataFrame indexed by session with flattened column names.
    """
    #Tidy up the index
    overall = overall.reset_index(drop=True)
    overall_wide = overall.pivot(index='session', columns=typ)
    #Flatten the column names: drop the value level, keep the typ values
    overall_wide.columns = overall_wide.columns.get_level_values(1)
    return overall_wide
def getOverall(typ):
    """Build the wide per-session averages table for the given grouping column."""
    #Gather one frame per session, then concatenate once
    #(avoids the quadratic cost of concat inside the loop)
    frames = []
    for session in psd['display name']:
        frames.append(avByType(session, typ))
    overall = pd.concat(frames)
    return pivotAndFlatten(overall, typ)

overallParty = getOverall('party')
overallParty.head()
Out[172]:
In [236]:
#Note that the function means it's now just as easy to query on another single column
#party_group appears to be a coarser grouping of parties - TODO confirm against the data
getOverall('party_group')
Out[236]:
In [166]:
overallParty.plot(kind='barh', figsize=(20,20));
In [169]:
#Focus on the two main parties
parties=['Conservative','Labour']
overallParty[parties].plot();
We can write another query to look by gender and party.
In [175]:
def avByGenderAndParty(session):
    """Average number of speeches per member, split by both gender and party.

    Returns a DataFrame with 'average' (float), 'gender', 'party' and
    'session' columns.
    """
    start, end = getParliamentDate(session)
    q = '''SELECT AVG(gcount) AS average, gender, party, session
FROM (SELECT '{session}' AS session, gender, party, mnis_id, count(*) AS gcount
FROM dfs.tmp.`/senti_post_v2.parquet`
WHERE speech_date>='{start}' AND speech_date<='{end}'
GROUP BY mnis_id, gender, party)
GROUP BY gender, party, session
'''.format(session=session, start=start, end=end)
    dq = drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average'] = dq['average'].astype(float)
    return dq

gp = avByGenderAndParty(session)
gp
Out[175]:
In [206]:
#Collect gender/party averages for every session
#Build a list of per-session frames and concatenate once (avoids quadratic concat)
gp_frames = []
for session in psd['display name']:
    gp_frames.append(avByGenderAndParty(session))
gp_overall = pd.concat(gp_frames)
#Pivot table is more robust than pivot - missing entries handled with NA
#Also limit what parties we are interested in
gp_wide = gp_overall[gp_overall['party'].isin(parties)].pivot_table(index='session', columns=['party','gender'])
#Flatten column names
gp_wide.columns = gp_wide.columns.droplevel(0)
gp_wide
Out[206]:
In [208]:
gp_wide.plot(figsize=(20,10));
In [209]:
gp_wide.plot(kind='barh', figsize=(20,10));
In [225]:
# Go back to the full dataset, not filtered by party
gp_wide = gp_overall.pivot_table(index='session', columns=['party','gender'])
#Flatten column names (drop the value level, keep the party/gender levels)
gp_wide.columns = gp_wide.columns.droplevel(0)
gp_wide.head()
Out[225]:
In [232]:
#Melt back to long form, then pivot so rows are (session, party) and columns are gender
sp_wide = gp_wide.reset_index().melt(id_vars=['session']).pivot_table(index=['session','party'], columns=['gender'])
#Flatten column names
sp_wide.columns = sp_wide.columns.droplevel(0)
sp_wide#.dropna(how='all')
Out[232]:
In [235]:
#Sessions when F spoke more, on average, than M
#Recall, this data has been previously filtered to limit data to Con and Lab
#Tweak the precision of the display
#Use the full option name: the bare 'precision' alias was deprecated in
#pandas 1.4 and removed in pandas 2.0
pd.set_option('display.precision', 3)
#fillna(0) so sessions where one gender is missing still compare sensibly
sp_wide[sp_wide['Female'].fillna(0) > sp_wide['Male'].fillna(0) ]
Out[235]:
In [ ]: