notebook.community

Edit and run



In [32]:

    
import pandas
import pandasql

def select_first_50(filename):
    # Read in our aadhaar_data csv to a pandas dataframe.  Afterwards, we rename the columns
    # by replacing spaces with underscores and setting all characters to lowercase, so the
    # column names more closely resemble columns names one might find in a table.
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)

    # Select out the first 50 values for "registrar" and "enrolment_agency"
    # in the aadhaar_data table using SQL syntax. 
    #
    # Note that "enrolment_agency" is spelled with one l. Also, the order
    # of the select does matter. Make sure you select registrar then enrolment agency
    # in your query.
    q = """Select registrar, enrolment_agency from aadhaar_data limit 50"""

    #Execute your SQL command against the pandas frame
    aadhaar_solution = pandasql.sqldf(q.lower(), locals())    
    return aadhaar_solution



In [34]:

    
path = "C:\Vindico\Projects\Data\Course\Python\Udacity\Introduction to Data Science\Lesson 2\Lecture\AADHAAR\UIDAI-ENR-GEOGRAPHY-20141029.csv"
select_first_50(path)



In [44]:

    
import pandas
import pandasql

def aggregate_query(filename):
    # Read in our aadhaar_data csv to a pandas dataframe.  Afterwards, we rename the columns
    # by replacing spaces with underscores and setting all characters to lowercase, so the
    # column names more closely resemble columns names one might find in a table.
    
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)

    # Write a query that will select from the aadhaar_data table how many men and how 
    # many women over the age of 50 have had aadhaar generated for them in each district
    #
    # Note that in this quiz, the SQL query keywords are case sensitive. 
    # For example, if you want to do a sum make sure you type 'sum' rather than 'SUM'.
    #

    # The possible columns to select from aadhaar data are:
    #     1) Registrar
    #     2) Enrolment Agency
    #     3) State
    #     4) District
    #     5) Sub District
    #     6) Pin Code
    #     7) Gender
    #     8) Age
    #     9) Aadhaar generated
    #     10) Enrolment Rejected
    #     11) Residents providing email,
    #     12) Residents providing mobile number
    #
    # You can download a copy of the aadhaar data that we are passing 
    # into this exercise below:
    # https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv
        
    q = """SELECT Gender, District, 
    sum(Aadhaar_generated) from aadhaar_data WHERE Age>50 GROUP BY Gender, District"""

    # Execute your SQL command against the pandas frame
    aadhaar_solution = pandasql.sqldf(q.lower(), locals())
    return aadhaar_solution



In [45]:

    
path = "C:\Vindico\Projects\Data\Course\Python\Udacity\Introduction to Data Science\Lesson 2\Lecture\AADHAAR\UIDAI-ENR-GEOGRAPHY-20141029.csv"
aggregate_query(path)