In [1]:
#import logging
#import sys
import string
#from util import logfile
#logging.basicConfig(filename=logfile, format='%(message)s',
#level=logging.INFO, filemode='w')
def word_count(path='alice.txt'):
    """Count how often each word occurs in a text file.

    Every line is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``) and split on runs of whitespace, so
    "Hello, World!" contributes "hello" and "world".

    Args:
        path: Text file to read. Defaults to 'alice.txt' in the working
            directory, which is the file this cell originally opened.

    Returns:
        dict mapping each word to its number of occurrences. Nothing is
        printed; the caller (or the notebook) decides how to show it.
    """
    punctuation = set(string.punctuation)
    word_counts = {}
    # `with` closes the file even if reading or counting raises.
    with open(path) as book:
        for line in book:
            # Filtering characters works on Python 2 and 3 alike, unlike
            # the Python-2-only str.translate(None, chars) call.
            letters_only = ''.join(
                ch for ch in line.lower() if ch not in punctuation)
            # split() with no argument drops the empty tokens that
            # split(" ") produced for blank lines and repeated spaces.
            for word in letters_only.split():
                word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts
# Run the word count when this cell executes.
word_count()
In [ ]:
import logging
import sys
import string
from util import logfile
# Send log records (INFO and above, message text only) to the file named by
# `logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def word_count():
    """Print a dictionary of word frequencies for the text on standard input.

    Every input line is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``) and split on runs of whitespace, so
    "Hello, World!" contributes "hello" and "world".

    The dictionary is printed, not returned: in this exercise printed output
    is the final result, so debugging messages must go through
    ``logging.info(...)`` instead of ``print``. See
    https://docs.python.org/2/library/logging.html for the logging module.
    """
    punctuation = set(string.punctuation)
    word_counts = {}
    for line in sys.stdin:
        # Filtering characters works on Python 2 and 3 alike, unlike the
        # Python-2-only str.translate(None, chars) call.
        letters_only = ''.join(
            ch for ch in line.lower() if ch not in punctuation)
        # split() with no argument drops the empty tokens that split(" ")
        # produced for blank lines and repeated spaces.
        for word in letters_only.split():
            word_counts[word] = word_counts.get(word, 0) + 1
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(word_counts)
# Run the word count over standard input when this script executes.
word_count()
In [3]:
import pandas as pd
# Load the Aadhaar CSV from the working directory into a DataFrame.
data = pd.read_csv('aadhaar_data.csv')
In [8]:
# Preview the first rows of `data` as this cell's output.
data.head()
Out[8]:
In [25]:
# Demonstrate character-for-character substitution with a translation table.
try:
    from string import maketrans  # Python 2
except ImportError:
    maketrans = str.maketrans  # Python 3 provides it on str instead
intab = "aeiou"
outtab = "12345"
# Maps each vowel in `intab` to the digit at the same position in `outtab`.
trantab = maketrans(intab, outtab)
# Named `sample_text` rather than `str` so the built-in `str` type is not
# shadowed for every cell that runs afterwards.
sample_text = "this is string example....wow!!!"
print(sample_text.translate(trantab))
In [33]:
import string
filename = 'aadhaar_data.csv'
expected_fields = 12
district_col, generated_col = 3, 8
punctuation = set(string.punctuation)

# mapper: collect "district<TAB>aadhaar generated" for every well-formed row
mapped_data = []
with open(filename, 'r') as csv_file:
    csv_file.readline()  # skip header line
    for row in csv_file:
        # Local name is `fields`, not `data`, so this cell does not rebind
        # the notebook-level `data` variable.
        fields = row.strip().split(",")
        if len(fields) != expected_fields:
            continue
        # Normalise only the key. The count is passed through untouched:
        # stripping punctuation from it would drop a sign or decimal point.
        district = ''.join(
            ch for ch in fields[district_col] if ch not in punctuation).lower()
        mapped_data.append("{0}\t{1}".format(district, fields[generated_col]))

# reducer: total the counts per district
reduced_data = {}
for pair in mapped_data:
    parts = pair.split("\t")
    if len(parts) == 2:
        district, aadhar_gen = parts
        reduced_data[district] = reduced_data.get(district, 0) + int(aadhar_gen)

# Sorted so the printed order is the same on every run.
for district in sorted(reduced_data):
    print("{0}\t{1}".format(district, reduced_data[district]))
In [ ]:
import sys
import string
import logging
from util import mapper_logfile
# Send log records (INFO and above, message text only) to the file named by
# `mapper_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def mapper():
    """Emit one "district<TAB>Aadhaar generated" line per input CSV row.

    Reads comma-separated rows from standard input. The first line (the
    header row) is skipped, and so is any row that does not split into
    exactly 12 comma-separated tokens. For each remaining row the district
    (token index 3) and the Aadhaar generated value (token index 8) are
    printed, separated by a tab.

    A copy of the input data is linked from the exercise:
    https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv

    Printed lines are this program's output, so use ``logging.info(...)``
    rather than ``print`` for debugging messages.
    """
    expected_fields = 12
    district_col, generated_col = 3, 8

    rows = iter(sys.stdin)
    # Skip the header. The default argument keeps empty input from raising
    # StopIteration, which a bare next(sys.stdin) would do.
    next(rows, None)
    for row in rows:
        fields = row.strip().split(",")
        if len(fields) == expected_fields:
            print("{0}\t{1}".format(fields[district_col], fields[generated_col]))
# Run the mapper over standard input.
mapper()
# The lines below begin the reducer code: its own imports and log file.
# NOTE(review): mapper and reducer look like two separate scripts pasted into
# one cell - confirm before running them together.
import sys
import logging
from util import reducer_logfile
# Send log records (INFO and above, message text only) to the file named by
# `reducer_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def reducer():
    """Print the total Aadhaar generated for each key on standard input.

    Each input line is expected to be "key<TAB>count". Lines that do not
    split into exactly two tab-separated parts (after stripping surrounding
    whitespace) are ignored. One "key<TAB>total" line is printed per distinct
    key, in sorted key order so the output is identical on every run.

    Printed lines are the final output, so use ``logging.info(...)`` rather
    than ``print`` for debugging messages.

    Raises:
        ValueError: if a count cannot be parsed by ``int``.
    """
    totals = {}
    for line in sys.stdin:
        parts = line.strip().split("\t")
        if len(parts) != 2:
            continue
        district, generated = parts
        totals[district] = totals.get(district, 0) + int(generated)
    # Plain dict iteration order is not guaranteed on older interpreters.
    for district in sorted(totals):
        print("{0}\t{1}".format(district, totals[district]))
# Run the reducer over standard input.
reducer()
In [9]:
import string
filename = 'turnstile_data_master_with_weather.csv'
expected_fields = 22
unit_col, entries_col = 1, 6
punctuation = set(string.punctuation)

# mapper: collect "UNIT<TAB>ENTRIESn_hourly" for every well-formed row
mapped_data = []
with open(filename, 'r') as csv_file:
    csv_file.readline()  # skip header line
    for row in csv_file:
        # Local name is `fields`, not `data`, so this cell does not rebind
        # the notebook-level `data` variable.
        fields = row.strip().split(",")
        if len(fields) != expected_fields:
            continue
        unit = ''.join(ch for ch in fields[unit_col] if ch not in punctuation)
        # The value is passed through untouched. Stripping punctuation from
        # it removed the decimal point ("217.0" -> "2170"), which inflated
        # every total tenfold.
        mapped_data.append("{0}\t{1}".format(unit, fields[entries_col]))

# reducer: total the hourly entries per unit
reduced_data = {}
for pair in mapped_data:
    parts = pair.split("\t")
    if len(parts) == 2:
        unit, entries_hourly = parts
        reduced_data[unit] = reduced_data.get(unit, 0.0) + float(entries_hourly)

# Sorted so the printed order is the same on every run.
for unit in sorted(reduced_data):
    print("{0}\t{1}".format(unit, reduced_data[unit]))
In [ ]:
import sys
import string
import logging
from util import mapper_logfile
# Send log records (INFO and above, message text only) to the file named by
# `mapper_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def mapper():
    """Emit one "UNIT<TAB>ENTRIESn_hourly" line per input CSV row.

    The input on standard input is the Subway-MTA dataset in CSV form:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    The first line (header) is skipped, and so is any row that does not split
    into exactly 22 comma-separated tokens. For each remaining row the UNIT
    (token index 1) and ENTRIESn_hourly (token index 6) are printed,
    separated by a tab, for example 'R002\\t105105.0'.

    Printed lines from the mapper are the reducer's input, so use
    ``logging.info(...)`` rather than ``print`` for debugging messages. See
    https://docs.python.org/2/library/logging.html for the logging module.
    """
    expected_fields = 22
    unit_col, entries_col = 1, 6

    rows = iter(sys.stdin)
    # Skip the header. The default argument keeps empty input from raising
    # StopIteration, which a bare next(sys.stdin) would do.
    next(rows, None)
    for row in rows:
        fields = row.strip().split(",")
        if len(fields) == expected_fields:
            print("{0}\t{1}".format(fields[unit_col], fields[entries_col]))
# Run the mapper over standard input.
mapper()
# The lines below begin the reducer code: its own imports and log file.
# NOTE(review): mapper and reducer look like two separate scripts pasted into
# one cell - confirm before running them together.
import sys
import logging
from util import reducer_logfile
# Send log records (INFO and above, message text only) to the file named by
# `reducer_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def reducer():
    """Print the total ENTRIESn_hourly for each UNIT read from standard input.

    Each input line is expected to be "UNIT<TAB>entries", as printed by the
    mapper. Lines that do not split into exactly two tab-separated parts are
    ignored. One "UNIT<TAB>total" line is printed per distinct UNIT, for
    example 'R001\\t500625.0', in sorted UNIT order so the output is identical
    on every run.

    The exercise allows assuming that rows for one UNIT arrive grouped
    together; totals are kept in a dict, so that grouping is not required.

    Printed lines are the final output, so use ``logging.info(...)`` rather
    than ``print`` for debugging messages.

    Raises:
        ValueError: if an entries value cannot be parsed by ``float``.
    """
    totals = {}
    for line in sys.stdin:
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        unit, entries_hourly = parts
        # float() tolerates the trailing newline left on the last field.
        totals[unit] = totals.get(unit, 0.0) + float(entries_hourly)
    # Plain dict iteration order is not guaranteed on older interpreters.
    for unit in sorted(totals):
        print("{0}\t{1}".format(unit, totals[unit]))
# Run the reducer over standard input.
reducer()
In [46]:
import pandas as pd
# Load only the first 20000 rows of the turnstile/weather CSV.
data = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=20000)
# NOTE(review): a bare expression in the middle of a cell is not displayed;
# only the cell's last expression is - confirm this preview is still wanted.
data.head()
# Last expression of the cell: the DataFrame's column labels.
data.columns.values
Out[46]:
In [47]:
# Keep only the entries, fog and rain columns (in that order).
data = data[['ENTRIESn_hourly', 'fog', 'rain']]
# NOTE(review): a bare expression in the middle of a cell is not displayed;
# only the cell's last expression is - confirm this preview is still wanted.
data.head()
# Write the three-column table, without the index, to avg_entries.csv.
data.to_csv('avg_entries.csv', index=False)
In [41]:
# fog - rain
# 0 0
# 0 1
# 1 0
# 1 1
filename = 'avg_entries.csv'
# mapper: collect "<weather key><TAB><entries>" for every three-column row;
# `mapped_data` is left at notebook level for the reducer step.
mapped_data = []
with open(filename, 'r') as csv_file:
    csv_file.readline()  # skip header line
    for row in csv_file:
        # Local name is `fields`, not `data`, so running this cell does not
        # rebind the notebook-level `data` variable.
        fields = row.strip().split(",")
        # if proper length
        if len(fields) == 3:
            entries, fog, rain = fields
            # float() first: the raw strings "0.0" and "1.0" are both truthy.
            fog_rain = "{}fog-{}rain".format('' if float(fog) else 'no', '' if float(rain) else 'no')
            mapped_data.append("{0}\t{1}".format(fog_rain, entries))
In [42]:
# reducer
# Average the entries per weather key from the notebook-level `mapped_data`.
reduced_data = {}  # weather key -> sum of entries
count = {}         # weather key -> number of rows seen
for pair in mapped_data:
    # Local name is `parts`, not `data`, so running this cell does not rebind
    # the notebook-level `data` variable.
    parts = pair.split("\t")
    if len(parts) == 2:
        fog_rain, entries_hourly = parts
        reduced_data[fog_rain] = reduced_data.get(fog_rain, 0.0) + float(entries_hourly)
        count[fog_rain] = count.get(fog_rain, 0) + 1
# Sorted so the printed order is the same on every run.
for fog_rain in sorted(reduced_data):
    print("{0}\t{1}".format(fog_rain, reduced_data[fog_rain] / count[fog_rain]))
In [ ]:
import sys
import string
import logging
from util import mapper_logfile
# Send log records (INFO and above, message text only) to the file named by
# `mapper_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def mapper():
    """Emit one "<weather type><TAB>ENTRIESn_hourly" line per input CSV row.

    The weather type combines the fog and rain columns, e.g. 'fog-norain',
    so that a reducer can average hourly entries per weather type. Input on
    standard input is the Subway-MTA dataset in comma-separated form:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    The first line (header) is skipped, and so is any row that does not split
    into exactly 22 comma-separated tokens. For each remaining row the key is
    built from token index 14 (fog) and 15 (rain), and the value is token
    index 6 (ENTRIESn_hourly), e.g. 'fog-norain\\t12345'.

    Printed lines are this program's output, so use ``logging.info(...)``
    rather than ``print`` for debugging messages.

    Raises:
        ValueError: if a fog or rain token cannot be parsed by ``float``.
    """
    def format_key(fog, rain):
        """Return the weather key for truthy/falsy `fog` and `rain` flags.

        Accepts booleans, ints or floats. The strings '0.0' and '1.0' are
        both truthy, so callers must convert text to a number first.
        """
        return '{}fog-{}rain'.format(
            '' if fog else 'no',
            '' if rain else 'no'
        )

    expected_fields = 22
    entries_col, fog_col, rain_col = 6, 14, 15

    rows = iter(sys.stdin)
    # Skip the header. The default argument keeps empty input from raising
    # StopIteration, which a bare next(sys.stdin) would do.
    next(rows, None)
    for row in rows:
        fields = row.strip().split(",")
        if len(fields) == expected_fields:
            # Use the helper instead of repeating its format string inline.
            weather = format_key(float(fields[fog_col]), float(fields[rain_col]))
            print("{0}\t{1}".format(weather, fields[entries_col]))
# Run the mapper over standard input.
mapper()
# The lines below begin the reducer code: its own imports and log file.
# NOTE(review): mapper and reducer look like two separate scripts pasted into
# one cell - confirm before running them together.
import sys
import logging
from util import reducer_logfile
# Send log records (INFO and above, message text only) to the file named by
# `reducer_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def reducer():
    """Print the average ENTRIESn_hourly for each weather type on stdin.

    Each input line is expected to be "<weather type><TAB>entries", as
    printed by the mapper. Lines that do not split into exactly two
    tab-separated parts are ignored. One "<weather type><TAB>average" line
    is printed per distinct weather type, e.g. 'fog-norain\\t1105.32467557',
    in sorted key order so the output is identical on every run.

    The exercise allows assuming the input is grouped by weather type;
    running sums are kept in a dict, so that grouping is not required.

    Printed lines are the final output, so use ``logging.info(...)`` rather
    than ``print`` for debugging messages.

    Raises:
        ValueError: if an entries value cannot be parsed by ``float``.
    """
    # weather type -> [total riders, number of hours seen]
    totals = {}
    for line in sys.stdin:
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        fog_rain, entries_hourly = parts
        running = totals.setdefault(fog_rain, [0.0, 0])
        # float() tolerates the trailing newline left on the last field.
        running[0] += float(entries_hourly)
        running[1] += 1
    # Plain dict iteration order is not guaranteed on older interpreters.
    for fog_rain in sorted(totals):
        riders, num_hours = totals[fog_rain]
        print("{0}\t{1}".format(fog_rain, riders / num_hours))
# Run the reducer over standard input.
reducer()
In [50]:
filename = 'turnstile_data_master_with_weather.csv'
# mapper: collect "UNIT<TAB>ENTRIESn_hourly<TAB>DATEn<TAB>TIMEn" for every
# 22-column row; `mapped_data` is left at notebook level for the reducer step.
mapped_data = []
with open(filename, 'r') as csv_file:
    csv_file.readline()  # skip header line
    for row in csv_file:
        # Local name is `fields`, not `data`, so running this cell does not
        # rebind the notebook-level `data` variable.
        fields = row.strip().split(",")
        # if proper length
        if len(fields) == 22:
            mapped_data.append("{0}\t{1}\t{2}\t{3}".format(fields[1], fields[6], fields[2], fields[3]))
In [61]:
from datetime import datetime
def date_ify(date, time):
    """Combine a date string and a time string into a ``datetime``.

    Args:
        date: Date text in YYYY-MM-DD form.
        time: Time text in HH:MM:SS form.

    Returns:
        The ``datetime`` parsed from "<date> <time>".

    Raises:
        ValueError: if the combined text does not match "%Y-%m-%d %H:%M:%S".
    """
    # Strip surrounding whitespace (such as a newline left on the last
    # tab-separated field) before parsing; strptime rejects it otherwise.
    return datetime.strptime(' '.join([date, time]).strip(), "%Y-%m-%d %H:%M:%S")
# reducer
# For each unit keep the (datetime, entries text) pair with the most entries;
# ties go to the later datetime.
reduced_data = {}
for record in mapped_data:
    # Local name is `parts`, not `data`, so running this cell does not rebind
    # the notebook-level `data` variable.
    parts = record.split("\t")
    if len(parts) != 4:
        continue
    unit, entries_hourly, date_text, time_text = parts
    # Bound to `date_time`, not `datetime`: assigning to `datetime` replaced
    # the imported class with an instance for the rest of the notebook.
    date_time = date_ify(date_text, time_text)
    best = reduced_data.get(unit)
    # Tuple comparison: more entries wins, and on equal entries the later
    # datetime wins.
    if best is None or (float(entries_hourly), date_time) > (float(best[1]), best[0]):
        reduced_data[unit] = (date_time, entries_hourly)
# Sorted so the printed order is the same on every run.
for unit in sorted(reduced_data):
    print("{0}\t{1}\t{2}".format(unit, reduced_data[unit][0], reduced_data[unit][1]))
In [ ]:
import sys
import string
import logging
from util import mapper_logfile
# Send log records (INFO and above, message text only) to the file named by
# `mapper_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def mapper():
    """Emit UNIT, ENTRIESn_hourly, DATEn and TIMEn for each input CSV row.

    This feeds a reducer that finds, for each turnstile unit, the date and
    time at which the most people entered. Input on standard input is the
    Subway-MTA dataset in CSV form:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    The first line (header) is skipped, and so is any row that does not split
    into exactly 22 comma-separated tokens. For each remaining row, token
    indices 1, 6, 2 and 3 are printed in that order, separated by tabs, e.g.
    'R001\\t100000.0\\t2011-05-01\\t01:00:00'.

    Printed lines are this program's output, so use ``logging.info(...)``
    rather than ``print`` for debugging messages.
    """
    expected_fields = 22
    unit_col, entries_col, date_col, time_col = 1, 6, 2, 3

    rows = iter(sys.stdin)
    # Skip the header. The default argument keeps empty input from raising
    # StopIteration, which a bare next(sys.stdin) would do.
    next(rows, None)
    for row in rows:
        fields = row.strip().split(",")
        if len(fields) == expected_fields:
            print("{0}\t{1}\t{2}\t{3}".format(
                fields[unit_col], fields[entries_col],
                fields[date_col], fields[time_col]))
# Run the mapper over standard input.
mapper()
# The lines below begin the reducer code: its own imports and log file.
# NOTE(review): mapper and reducer look like two separate scripts pasted into
# one cell - confirm before running them together.
import sys
import logging
from util import reducer_logfile
# Send log records (INFO and above, message text only) to the file named by
# `reducer_logfile`; filemode='w' starts that file afresh on every run.
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
# The reducer parses its date and time fields with datetime.strptime.
from datetime import datetime
def reducer():
    """Print the busiest date and time for each turnstile UNIT on stdin.

    Each input line is expected to be
    "UNIT<TAB>ENTRIESn_hourly<TAB>DATEn<TAB>TIMEn", as printed by the mapper.
    Lines that do not split into exactly four tab-separated parts are ignored.
    For each UNIT the row with the most entries is kept; ties go to the later
    datetime.

    One line per UNIT is printed, in sorted UNIT order, holding the UNIT, the
    datetime (DATEn and TIMEn separated by a single space) and the entries
    text exactly as it was read, separated by tabs. For example:

        R001    2011-05-11 17:00:00    31213.0
        R002    2011-05-12 21:00:00    4295.0

    The exercise allows assuming the input is grouped by UNIT; the best row
    per UNIT is kept in a dict, so that grouping is not required.

    Printed lines are the final output, so use ``logging.info(...)`` rather
    than ``print`` for debugging messages.

    Raises:
        ValueError: if the entries value is not a number or the date/time
            text does not match "%Y-%m-%d %H:%M:%S".
    """
    def date_ify(date, time):
        """Parse "<date> <time>", ignoring surrounding whitespace."""
        return datetime.strptime(' '.join([date, time]).strip(), "%Y-%m-%d %H:%M:%S")

    # unit -> (entries as float, datetime, entries text as read)
    busiest = {}
    for line in sys.stdin:
        parts = line.split("\t")
        if len(parts) != 4:
            continue
        unit, entries_hourly, date, time = parts
        candidate = (float(entries_hourly), date_ify(date, time), entries_hourly)
        # Comparing (entries, datetime) pairs: more entries wins, and on
        # equal entries the later datetime wins.
        if unit not in busiest or candidate[:2] > busiest[unit][:2]:
            busiest[unit] = candidate
    # Plain dict iteration order is not guaranteed on older interpreters.
    for unit in sorted(busiest):
        _, date_time, entries_text = busiest[unit]
        print("{0}\t{1}\t{2}".format(unit, date_time, entries_text))
# Run the reducer over standard input.
reducer()