In [32]:
import pandas
import pandasql
def select_first_50(filename):
# Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns
# by replacing spaces with underscores and setting all characters to lowercase, so the
# column names more closely resemble columns names one might find in a table.
aadhaar_data = pandas.read_csv(filename)
aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)
# Select out the first 50 values for "registrar" and "enrolment_agency"
# in the aadhaar_data table using SQL syntax.
#
# Note that "enrolment_agency" is spelled with one l. Also, the order
# of the select does matter. Make sure you select registrar then enrolment agency
# in your query.
q = """Select registrar, enrolment_agency from aadhaar_data limit 50"""
#Execute your SQL command against the pandas frame
aadhaar_solution = pandasql.sqldf(q.lower(), locals())
return aadhaar_solution
In [34]:
path = "C:\Vindico\Projects\Data\Course\Python\Udacity\Introduction to Data Science\Lesson 2\Lecture\AADHAAR\UIDAI-ENR-GEOGRAPHY-20141029.csv"
select_first_50(path)
In [44]:
import pandas
import pandasql
def aggregate_query(filename):
# Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns
# by replacing spaces with underscores and setting all characters to lowercase, so the
# column names more closely resemble columns names one might find in a table.
aadhaar_data = pandas.read_csv(filename)
aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)
# Write a query that will select from the aadhaar_data table how many men and how
# many women over the age of 50 have had aadhaar generated for them in each district
#
# Note that in this quiz, the SQL query keywords are case sensitive.
# For example, if you want to do a sum make sure you type 'sum' rather than 'SUM'.
#
# The possible columns to select from aadhaar data are:
# 1) Registrar
# 2) Enrolment Agency
# 3) State
# 4) District
# 5) Sub District
# 6) Pin Code
# 7) Gender
# 8) Age
# 9) Aadhaar generated
# 10) Enrolment Rejected
# 11) Residents providing email,
# 12) Residents providing mobile number
#
# You can download a copy of the aadhaar data that we are passing
# into this exercise below:
# https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv
q = """SELECT Gender, District,
sum(Aadhaar_generated) from aadhaar_data WHERE Age>50 GROUP BY Gender, District"""
# Execute your SQL command against the pandas frame
aadhaar_solution = pandasql.sqldf(q.lower(), locals())
return aadhaar_solution
In [45]:
path = "C:\Vindico\Projects\Data\Course\Python\Udacity\Introduction to Data Science\Lesson 2\Lecture\AADHAAR\UIDAI-ENR-GEOGRAPHY-20141029.csv"
aggregate_query(path)