In [1]:
#import logging
#import sys
import string

#from util import logfile

#logging.basicConfig(filename=logfile, format='%(message)s',
                   #level=logging.INFO, filemode='w')

def word_count():
    # For this exercise, write a program that serially counts the number of occurrences
    # of each word in the book Alice in Wonderland.
    #
    # The text of Alice in Wonderland will be fed into your program line-by-line.
    # Your program needs to take each line and do the following:
    # 1) Tokenize the line into string tokens by whitespace
    #    Example: "Hello, World!" should be converted into "Hello," and "World!"
    #    (This part has been done for you.)
    #
    # 2) Remove all punctuation
    #    Example: "Hello," and "World!" should be converted into "Hello" and "World"
    #
    # 3) Make all letters lowercase
    #    Example: "Hello" and "World" should be converted to "hello" and "world"
    #
    # Store the the number of times that a word appears in Alice in Wonderland
    # in the word_counts dictionary, and then *print* (don't return) that dictionary
    #
    # In this exercise, print statements will be considered your final output. Because
    # of this, printing a debug statement will cause the grader to break. Instead, 
    # you can use the logging module which we've configured for you.
    #
    # For example:
    # logging.info("My debugging message")
    #
    # The logging module can be used to give you more control over your
    # debugging or other messages than you can get by printing them. Messages 
    # logged via the logger we configured will be saved to a
    # file. If you click "Test Run", then you will see the contents of that file
    # once your program has finished running.
    # 
    # The logging module also has other capabilities; see 
    # https://docs.python.org/2/library/logging.html
    # for more information.

    f = open('alice.txt')
    lines = f.read().splitlines()
    
    data = []
    
    word_counts = {}
    
    for line in lines:
        line = line.lower().translate(None, string.punctuation)
        data = line.strip().split(" ")
        for d in data:
            if d in word_counts:
                word_counts[d] = (word_counts[d] + 1)
            else:
                word_counts[d] = 1
        
    #print word_counts
    
    
    f.close()

word_count()

In [ ]:
import logging
import sys
import string

from util import logfile

logging.basicConfig(filename=logfile, format='%(message)s',
                   level=logging.INFO, filemode='w')


def word_count():
    # For this exercise, write a program that serially counts the number of occurrences
    # of each word in the book Alice in Wonderland.
    #
    # The text of Alice in Wonderland will be fed into your program line-by-line.
    # Your program needs to take each line and do the following:
    # 1) Tokenize the line into string tokens by whitespace
    #    Example: "Hello, World!" should be converted into "Hello," and "World!"
    #    (This part has been done for you.)
    #
    # 2) Remove all punctuation
    #    Example: "Hello," and "World!" should be converted into "Hello" and "World"
    #
    # 3) Make all letters lowercase
    #    Example: "Hello" and "World" should be converted to "hello" and "world"
    #
    # Store the the number of times that a word appears in Alice in Wonderland
    # in the word_counts dictionary, and then *print* (don't return) that dictionary
    #
    # In this exercise, print statements will be considered your final output. Because
    # of this, printing a debug statement will cause the grader to break. Instead, 
    # you can use the logging module which we've configured for you.
    #
    # For example:
    # logging.info("My debugging message")
    #
    # The logging module can be used to give you more control over your
    # debugging or other messages than you can get by printing them. Messages 
    # logged via the logger we configured will be saved to a
    # file. If you click "Test Run", then you will see the contents of that file
    # once your program has finished running.
    # 
    # The logging module also has other capabilities; see 
    # https://docs.python.org/2/library/logging.html
    # for more information.
    
    data = []
    
    word_counts = {}
    
    for line in sys.stdin:
        line = line.lower().translate(None, string.punctuation)
        data = line.strip().split(" ")
        for d in data:
            if d in word_counts:
                word_counts[d] = (word_counts[d] + 1)
            else:
                word_counts[d] = 1
        
    print word_counts


word_count()

In [3]:
import pandas as pd
data = pd.read_csv('aadhaar_data.csv')

In [8]:
data.head()


Out[8]:
Registrar Enrolment Agency State District Sub District Pin Code Gender Age Aadhaar generated Enrolment Rejected Residents providing email Residents providing mobile number
0 Allahabad Bank Tera Software Ltd Jharkhand Ranchi Namkum 834003 M 63 0 1 0 1
1 Allahabad Bank Tera Software Ltd Jharkhand Ranchi Ranchi 834004 F 36 0 1 0 1
2 Allahabad Bank Vakrangee Softwares Limited Gujarat Surat Nizar 394380 M 10 1 0 0 0
3 Allahabad Bank Vakrangee Softwares Limited Himachal Pradesh Kangra Baijnath 176081 M 44 1 0 1 1
4 Allahabad Bank Vakrangee Softwares Limited Madhya Pradesh Chhindwara Pandhurna 480334 M 35 1 0 0 0

In [25]:
from string import maketrans
intab = "aeiou"
outtab = "12345"
trantab = maketrans(intab, outtab)
#print trantab

str = "this is string example....wow!!!";
print str.translate(trantab);


th3s 3s str3ng 2x1mpl2....w4w!!!

In [33]:
import string

filename = 'aadhaar_data.csv'

with open(filename, 'r') as f:
    f.readline() # skip header line
    
    # mapper 
    mapped_data = []
    for line in f:
        data = line.strip().split(",")
        cleaned_data = []
        # if proper length
        if len(data) == 12:
            for i in data:
                cleaned_data.append(i.translate(string.maketrans("",""), string.punctuation).lower())
        
        mapped_data.append("{0}\t{1}".format(cleaned_data[3], cleaned_data[8]))
        #print "{0}\t{1}".format(cleaned_data[3], cleaned_data[8])
    
    # reducer
    reduced_data = {}
    for line in mapped_data:
       
        data = line.split("\t")
        if len(data) == 2:
            district, aadhar_gen = data
            if district in reduced_data:
                reduced_data[district] += int(aadhar_gen)
            else:
                reduced_data[district] = int(aadhar_gen)
    
    for district in reduced_data:
        print "{0}\t{1}".format(district, reduced_data[district])


hamirpur	12
imphal west	0
samastipur	19
sambhal	1
mahesana	2
koriya	6
kheri	9
palamu	221
alirajpur	2965
jaipur	1595
dausa	175
wardha	8
krishnagiri	12
barmer	13
sirohi	5
chennai	9
amroha	7
hisar	3275
almora	8
kottayam	2
mandi	2
sirmaur	2
bandipore	1
bilaspur	2
west singhbhum	4170
rajsamand	165
angul	5
kasaragod	2
siddharthnagar	9
sahebganj	2269
deoria	17
jhunjhunu	0
thoubal	1
bidar	346
murshidabad	11
pithoragarh	4
bangalore rural	141
ferozepur	416
rohtak	789
sawai madhopur	168
rohtas	22
moga	315
simdega	109
yamuna nagar	1586
banswara	1001
muzaffarpur	21
sikar	600
lalitpur	3
bokaro	428
kendujhar	9
chandrapur	8
others	62
dewas	230
katni	609
saharsa	4
seoni	299
hoshangabad	22
saharanpur	8
davangere	32
bhandara	42
barwani	764
jalaun	30
baleswar	13
harda	67
thiruvarur	2
parbhani	65
dhalai	220
ujjain	913
jalna	422
purba medinipur	26
kishanganj	8
yavatmal	469
aligarh	7
firozabad	5
gondiya	6
ranchi	748
saran	26
raigarh	252
hailakandi	1
kaithal	50
mirzapur	15
south delhi	153
khunti	158
deoghar	485
jabalpur	278
tikamgarh	2217
sant ravidas nagar	5
karauli	464
ahmedabad	9
dakshin dinajpur	0
doda	4
viluppuram	85
dhubri	1
dhemaji	1
bharuch	1
jehanabad	4
sivaganga	4
tirap	1
rae bareli	6
bhojpur	28
sonbhadra	65
shaheed bhagat singh nagar	177
satara	945
new delhi	5
sangrur	346
nalbari	1
central delhi	81
washim	282
budaun	7
shrawasti	3
ambala	70
balangir	2
west siang	1
kolkata	8
cuddalore	24
east singhbhum	619
kannauj	6
mau	17
maharajganj	1
faridabad	51
burhanpur	374
jamtara	276
pauri garhwal	9
nawada	20
sheohar	1
banka	46
chittoor	0
alappuzha	1
udham singh nagar	5
fatehabad	307
chickmagalur	160
hassan	350
chhindwara	3223
madurai	2
hanumangarh	101
kollam	5
sirsa	1812
jaunpur	19
puri	2
solapur	700
thane	2575
kaushambi	19
bhilwara	1547
bhadrak	14
raisen	1008
jalpaiguri	5
pilibhit	2
thrissur	3
shahjahanpur	5
bagalkot	33
senapati	1
north tripura	51
jashpur	2
kachchh	6
jhabua	2256
panchkula	3
alwar	303
ratnagiri	26
theni	2
kanniyakumari	7
golaghat	1
arwal	6
jhunjhunun	119
belgaum	697
dharwad	14
chamrajanagar	124
ananthapur	5
damoh	3559
vellore	8
shamli	7
kaimur bhabua	7
pathankot	287
cuddapah	4
udalguri	2
nizamabad	3
mewat	1
cachar	5
patan	1
ahmadnagar	552
kasargod	0
yadgir	69
latur	387
sitapur	4
vidisha	887
kupwara	1
nalgonda	73
dhule	892
bareilly	3
etah	7
bhopal	448
amethi	1
shimla	0
fatehgarh sahib	2
baran	101
sindhudurg	11
pune	1308
unnao	2
rajouri	4
moradabad	11
jhalawar	141
darrang	3
hyderabad	420
latehar	709
dehradun	13
ballia	32
siwan	21
agra	9
north delhi	39
mahoba	4
baloda bazar	1
kasganj	2
jajapur	12
bahraich	2
bageshwar	3
ludhiana	503
west delhi	157
dadra and nagar haveli	1
bhind	1770
mayurbhanj	5
kannur	2
gaya	40
darbhanga	23
raebareli	0
patiala	3
churu	18
osmanabad	209
ghaziabad	108
haridwar	3
baghpat	7
nashik	1532
north 24 parganas	14
tehri garhwal	5
papum pare	2
karimnagar	0
kamrup	2
supaul	5
bastar	1
chamba	4
east godavari	139
mahendragarh	1118
gopalganj	22
jyotiba phule nagar	1
rajnandgaon	2
chamoli	7
north east delhi	66
sangli	4017
tiruvannamalai	3
jorhat	2
neemuch	4
haveri	422
hazaribagh	374
jalgaon	1191
auraiya	10
bijapur	11
malappuram	4
vaishali	16
balrampur	9
sagar	4302
bathinda	570
khagaria	11
chhatarpur	4232
surendra nagar	9
sepahijala	22
etawah	9
thanjavur	1
guna	2372
dhanbad	1054
kargil	1
mumbai	937
chitrakoot	17
mainpuri	9
sidhi	2357
amritsar	269
nadia	31
banaskantha	4
coimbatore	3
karimganj	4
jammu	27
mahabubnagar	2
west tripura	392
east champaran	19
amravati	11
allahabad	137
khandwa	261
unakoti	6
champawat	0
mandya	129
krishna	0
madhubani	30
begusarai	7
gwalior	1083
udhampur	4
garhwa	129
cooch behar	2
kathua	4
darjeeling	5
kushinagar	15
balaghat	4168
giridih	167
pulwama	1
visakhapatnam	6
kangra	47
shimoga	1351
sonipat	274
godda	2459
jhajjar	2304
khowai	50
prakasam	7
ganjam	21
chandauli	10
udaipur	116
dhenkanal	1
pratapgarh	96
meerut	14
east delhi	42
punch	1
adilabad	1
rayagada	1
nainital	4
rudraprayag	2
sri muktsar sahib	358
fatehpur	9
paschim medinipur	11
una	56
tarn taran	1625
hoshiarpur	1297
ambedkar nagar	13
uttar dinajpur	2
puruliya	28
tonk	275
kapurthala	49
lakhimpur	1
beed	342
sibsagar	2
warangal	4
nagaon	2
jamnagar	8
jaisalmer	11
bara banki	0
sas nagar mohali	107
chandigarh	31
palakkad	0
sundergarh	8
the nilgiris	1
ahmed nagar	0
south west delhi	177
tiruchirappalli	2
palwal	1
katihar	11
kota	351
karaikal	4
ajmer	527
lucknow	7
dharmapuri	2
azamgarh	16
faridkot	62
gadchiroli	275
bijnor	7
kolhapur	665
valsad	30
jamui	45
ramanagar	742
sonitpur	1
bhiwani	652
ghazipur	26
kanpur dehat	6
debagarh	1
sheopur	431
varanasi	13
sant kabir nagar	7
raichur	123
junagadh	3
korba	3
basti	9
baksa	1
kodagu	6
howrah	3
uttara kannada	10
shivpuri	698
buxar	8
umaria	1992
namakkal	4
srinagar	1
chittorgarh	98
gumla	375
nandurbar	29
gomati	8
nagpur	155
rewari	193
dindori	612
akola	55
north west delhi	305
kv rangareddy	139
nayagarh	2
kolar	880
bharatpur	1161
mansa	942
vadodara	4
sambalpur	1
sehore	805
munger	11
dholpur	278
raipur	5
dhar	7190
jalandhar	50
dakshina kannada	70
muzaffarnagar	6
kanpur nagar	25
west garo hills	1
rajgarh	41
dibrugarh	2
araria	3
mathura	8
karnal	1428
faizabad	4
banda	6
bellary	1045
dumka	353
kancheepuram	12
datia	360
morena	2191
gulbarga	91
srikakulam	1
south 24 parganas	8
south tripura	4
gautam buddha nagar	26
kozhikode	3
pali	63
lohardaga	1074
ratlam	286
khargone	4020
chatra	2231
lakhisarai	5
buldhana	893
tiruvallur	7
tiruppur	3
nagaur	225
surat	1
gurdaspur	928
bangalore	7085
hazaribag	0
hingoli	64
tumkur	17
barddhaman	21
ganganagar	750
amreli	4
panna	4102
cuttack	2
shajapur	98
shahdol	3700
west champaran	9
nanded	198
ashok nagar	72
dindigul	5
puducherry	14
tirunelveli	3
gorakhpur	15
anuppur	1938
chikkaballapur	11
aurangabad	533
vizianagaram	5
bhagalpur	27
narsinghpur	863
kurnool	1
gadag	6
bulandshahr	8
pakur	3543
jind	1154
farrukhabad	3
hooghly	15
patna	50
bishnupur	1
kendrapara	8
jalor	1
rajkot	3
purnia	3
solan	5
seraikelakharsawan	826
khorda	6
bundi	44
mandla	1352
udupi	14
sheikhpura	2
bongaigaon	1
ramgarh	300
sultanpur	9
hathras	7
panipat	3362
bankura	5
gonda	11
birbhum	4
bikaner	49
nalanda	18
indore	2358
nellore	6
durg	7
bargarh	3
fazilka	154
thiruvananthapuram	3
virudhunagar	5
salem	2
koderma	2012
erode	4
rewa	6550
jhansi	8
malda	4
ernakulam	4
mysore	6
betul	1473
singrauli	2309
dungarpur	752
mandsaur	72
gurgaon	443
jodhpur	72
satna	4376
chitradurga	613
jagatsinghapur	2
rampur	2
kurukshetra	607
malkangiri	1
koppal	139
west godavari	4
rupnagar	3
hardoi	6
thoothukkudi	0
barnala	3
sitamarhi	11
kamrup metro	1

In [ ]:
import sys
import string
import logging

from util import mapper_logfile
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def mapper():

    #Also make sure to fill out the reducer code before clicking "Test Run" or "Submit".

    #Each line will be a comma-separated list of values. The
    #header row WILL be included. Tokenize each row using the 
    #commas, and emit (i.e. print) a key-value pair containing the 
    #district (not state) and Aadhaar generated, separated by a tab. 
    #Skip rows without the correct number of tokens and also skip 
    #the header row.

    #You can see a copy of the the input Aadhaar data
    #in the link below:
    #https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv

    #Since you are printing the output of your program, printing a debug 
    #statement will interfere with the operation of the grader. Instead, 
    #use the logging module, which we've configured to log to a file printed 
    #when you click "Test Run". For example:
    #logging.info("My debugging message")

    next(sys.stdin)
    for line in sys.stdin:
        data = line.strip().split(",")
        #cleaned_data = []
        # if proper length
        if len(data) == 12:
            #for i in data:
            #cleaned_data.append(i)
            print "{0}\t{1}".format(data[3], data[8])
        
        #mapped_data.append("{0}\t{1}".format(cleaned_data[3], cleaned_data[8]))
        #print "{0}\t{1}".format(cleaned_data[3], cleaned_data[8])

mapper()


import sys
import logging

from util import reducer_logfile
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def reducer():
    
    #Also make sure to fill out the mapper code before clicking "Test Run" or "Submit".

    #Each line will be a key-value pair separated by a tab character.
    #Print out each key once, along with the total number of Aadhaar 
    #generated, separated by a tab. Make sure each key-value pair is 
    #formatted correctly! Here's a sample final key-value pair: 'Gujarat\t5.0'

    #Since you are printing the output of your program, printing a debug 
    #statement will interfere with the operation of the grader. Instead, 
    #use the logging module, which we've configured to log to a file printed 
    #when you click "Test Run". For example:
    #logging.info("My debugging message")
        
    
    reduced_data = {}
    for line in sys.stdin:
       
        data = line.strip().split("\t")
        if len(data) == 2:
            district, aadhar_gen = data
            if district in reduced_data:
                reduced_data[district] += int(aadhar_gen)
            else:
                reduced_data[district] = int(aadhar_gen)
    
    for district in reduced_data:
        print "{0}\t{1}".format(district, reduced_data[district])

reducer()

In [9]:
import string

filename = 'turnstile_data_master_with_weather.csv'

with open(filename, 'r') as f:
    f.readline() # skip header line
    
    # mapper 
    mapped_data = []
    for line in f:
        data = line.strip().split(",")
        cleaned_data = []
        # if proper length
        if len(data) == 22:
            for i in data:
                cleaned_data.append(i.translate(string.maketrans("",""), string.punctuation))
        
        mapped_data.append("{0}\t{1}".format(cleaned_data[1], cleaned_data[6]))
        #print "{0}\t{1}".format(cleaned_data[1], cleaned_data[6])
    
    # reducer
    reduced_data = {}
    for line in mapped_data:
        data = line.split("\t")
        if len(data) == 2:
            unit, entries_hourly = data
            if unit in reduced_data:
                reduced_data[unit] += float(entries_hourly)
            else:
                reduced_data[unit] = float(entries_hourly)
    
    for unit in reduced_data:
        print "{0}\t{1}".format(unit, reduced_data[unit])


R550	6681830.0
R551	3890450.0
R552	6839450.0
R396	1087220.0
R397	702930.0
R394	1884980.0
R395	1362020.0
R392	1654020.0
R393	1093060.0
R259	1769190.0
R258	3271400.0
R257	3473220.0
R256	2002310.0
R255	1494650.0
R254	4754600.0
R253	1254530.0
R252	1827850.0
R251	2402910.0
R250	1866820.0
R408	2512070.0
R172	3466340.0
R173	2839010.0
R079	4965090.0
R171	2507820.0
R176	7382110.0
R177	8774920.0
R174	2760510.0
R175	10653360.0
R178	9453400.0
R179	16182610.0
R390	2778300.0
R391	1728380.0
R389	706150.0
R398	1554270.0
R399	1058140.0
R388	1955870.0
R213	2161500.0
R212	3133970.0
R211	4495020.0
R210	1113350.0
R217	2503500.0
R216	1477930.0
R215	2957350.0
R331	681620.0
R219	2369870.0
R218	3647690.0
R358	506050.0
R359	4511440.0
R352	703480.0
R353	1249270.0
R350	742500.0
R356	1963330.0
R357	346660.0
R354	439110.0
R355	674880.0
R338	44640.0
R339	953500.0
R136	3508260.0
R137	4474790.0
R134	1367180.0
R135	2023580.0
R132	7484960.0
R133	2545630.0
R130	992590.0
R131	7420310.0
R138	10382140.0
R139	4717200.0
R183	1619750.0
R182	4471870.0
R181	3186670.0
R180	4161930.0
R187	3319760.0
R186	2768400.0
R185	1862110.0
R184	1886690.0
R189	2912700.0
R188	4790640.0
R002	1765350.0
R003	359380.0
R001	7496820.0
R006	1094730.0
R007	623910.0
R004	931040.0
R005	910310.0
R008	666290.0
R009	559270.0
R439	1321190.0
R438	1350820.0
R433	653020.0
R432	822080.0
R431	1147540.0
R430	786790.0
R437	1333210.0
R436	904330.0
R435	863480.0
R434	742840.0
R330	1788690.0
R170	28879180.0
R318	1081530.0
R319	3868360.0
R316	536900.0
R317	1348520.0
R314	1219080.0
R315	1054210.0
R312	692920.0
R313	349310.0
R310	2953030.0
R311	821500.0
R070	3213430.0
R543	3945760.0
R542	1872580.0
R541	8877650.0
R540	11188470.0
R547	1273080.0
R546	4601560.0
R545	2325670.0
R544	1646700.0
R549	7218230.0
R548	1094260.0
R381	1539260.0
R380	1595440.0
R268	4102430.0
R269	1703170.0
R385	2062880.0
R384	696000.0
R387	2034060.0
R386	1824190.0
R262	982860.0
R263	353860.0
R260	2560850.0
R261	2854660.0
R266	1736260.0
R267	2408590.0
R264	931670.0
R265	1696010.0
R046	16951500.0
R047	9538220.0
R044	8973210.0
R045	5485070.0
R042	1248990.0
R043	5183240.0
R040	2244560.0
R041	5551480.0
R048	2510290.0
R049	5646240.0
R169	4003520.0
R168	9870220.0
R165	218800.0
R164	5819530.0
R167	5071440.0
R166	4493000.0
R161	2730020.0
R160	4567730.0
R163	6026510.0
R442	812970.0
R443	1714750.0
R440	697000.0
R441	787940.0
R446	1673820.0
R447	960870.0
R444	863960.0
R445	1947060.0
R448	242290.0
R449	1186360.0
R294	1771200.0
R345	942770.0
R344	836850.0
R347	1385170.0
R346	2245250.0
R341	961600.0
R340	888520.0
R343	1427230.0
R342	1326510.0
R226	1452410.0
R227	2078610.0
R224	1451570.0
R225	1147800.0
R222	4258400.0
R223	3901510.0
R220	2766200.0
R221	2523720.0
R011	15829140.0
R121	2649690.0
R120	3064490.0
R123	3428290.0
R089	976650.0
R125	1299420.0
R124	1273640.0
R127	8678390.0
R126	3407040.0
R129	2911720.0
R128	1260690.0
R080	7189410.0
R081	6580800.0
R086	4680130.0
R087	2229700.0
R084	18094230.0
R085	4684490.0
R536	1745230.0
R535	975200.0
R228	1947370.0
R229	1154860.0
R349	1155000.0
R348	386770.0
R154	2343930.0
R155	1946350.0
R156	1691610.0
R157	2104640.0
R150	1852000.0
R151	4355040.0
R152	3389170.0
R153	3808140.0
R015	5225480.0
R014	7761000.0
R017	8015120.0
R016	1824200.0
R158	6876530.0
R159	3621660.0
R013	4784630.0
R012	15647520.0
R411	598120.0
R309	1453110.0
R308	1512820.0
R301	1929870.0
R300	4840450.0
R303	2267300.0
R302	3659960.0
R304	2016180.0
R307	771580.0
R306	592370.0
R147	4555250.0
R146	2698510.0
R413	914000.0
R412	585590.0
R415	79210.0
R414	591380.0
R275	1690350.0
R274	1731780.0
R277	982760.0
R276	2628170.0
R271	791800.0
R270	977880.0
R273	2603070.0
R272	3342900.0
R416	419750.0
R279	1510900.0
R278	814900.0
R419	247930.0
R378	2606320.0
R379	1239510.0
R418	65240.0
R374	1611780.0
R375	1157250.0
R376	950050.0
R377	2227860.0
R370	953440.0
R371	1317790.0
R372	1326800.0
R373	1119430.0
R149	2616830.0
R148	1872300.0
R051	9518790.0
R050	7144100.0
R053	6174800.0
R052	2146470.0
R055	16075340.0
R054	2642470.0
R057	8959350.0
R056	2971190.0
R059	2095960.0
R058	1173230.0
R417	152100.0
R118	2834960.0
R119	3359230.0
R110	5786870.0
R111	5880920.0
R112	3088150.0
R113	3638370.0
R114	1663280.0
R115	2350360.0
R116	6175820.0
R117	1619450.0
R288	3512330.0
R289	879400.0
R280	1221480.0
R281	2385300.0
R282	2924820.0
R283	1349350.0
R284	1490640.0
R285	1126690.0
R286	1196520.0
R287	1131920.0
R088	1134170.0
R024	5724630.0
R025	10380870.0
R027	5810310.0
R020	13484140.0
R021	9217540.0
R022	17969320.0
R023	12726790.0
R028	5089070.0
R029	12932600.0
R459	72890.0
R455	237080.0
R454	295570.0
R456	500380.0
R451	1818410.0
R450	927290.0
R453	2915830.0
R452	10680040.0
R082	2717160.0
R083	5628720.0
R239	2066320.0
R238	3865070.0
R332	1393960.0
R333	1598350.0
R334	1850860.0
R335	785760.0
R336	184780.0
R337	283360.0
R231	1843930.0
R230	1089180.0
R233	1967910.0
R232	1954060.0
R235	6869940.0
R234	773160.0
R237	1264320.0
R236	2840950.0
R248	6129590.0
R249	2530420.0
R244	2965130.0
R246	1327940.0
R247	508470.0
R240	6486010.0
R241	822050.0
R242	1156710.0
R243	2473340.0
R383	1098340.0
R382	1674540.0
R068	912760.0
R069	1721210.0
R145	1385250.0
R144	6477990.0
R143	5412520.0
R142	6058260.0
R141	7137260.0
R140	2383670.0
R060	1461860.0
R061	1117960.0
R062	7174130.0
R063	2009100.0
R064	1489840.0
R065	1528650.0
R066	577720.0
R067	1558940.0
R214	1317310.0
R095	4365700.0
R094	3178390.0
R097	5899840.0
R096	4259930.0
R091	1983440.0
R090	779060.0
R093	3821680.0
R092	3610250.0
R099	4291820.0
R098	3991340.0
R464	0.0
R403	964230.0
R460	2959160.0
R461	5864400.0
R462	3825760.0
R463	6090640.0
R468	932950.0
R469	933460.0
R200	2700690.0
R201	6017050.0
R202	4018410.0
R203	3186020.0
R204	2964710.0
R205	3547260.0
R206	3760770.0
R207	3674380.0
R208	5087510.0
R209	1523160.0
R369	929450.0
R368	1311820.0
R367	1385890.0
R366	1146480.0
R365	875240.0
R364	1874490.0
R363	876830.0
R362	1349100.0
R361	2311390.0
R360	318730.0
R406	1885870.0
R407	798180.0
R404	1359250.0
R405	1335350.0
R019	5834450.0
R402	629150.0
R018	13898780.0
R400	835530.0
R401	729160.0
R109	2643090.0
R108	9369920.0
R103	2514060.0
R102	7097050.0
R101	5956300.0
R100	1022900.0
R107	1678060.0
R106	1999630.0
R105	6511580.0
R104	2483700.0
R299	874480.0
R298	1810960.0
R293	12883430.0
R292	976820.0
R291	3325690.0
R290	3377170.0
R297	1599550.0
R296	1052320.0
R295	930930.0
R010	8542430.0
R190	3875850.0
R191	3473110.0
R192	3726960.0
R193	2727570.0
R194	3576330.0
R195	8294130.0
R196	2461350.0
R197	3303040.0
R198	4359060.0
R199	1405460.0
R409	1132190.0
R037	1547290.0
R036	1384610.0
R035	6298350.0
R034	2028240.0
R033	17116630.0
R032	7915130.0
R031	8795360.0
R030	5590910.0
R039	1355280.0
R038	431300.0
R428	848090.0
R429	1846700.0
R420	1179970.0
R421	616230.0
R422	393450.0
R423	934210.0
R424	712390.0
R425	607310.0
R426	461620.0
R427	495060.0
R323	2320460.0
R322	3155750.0
R321	2030650.0
R320	1490500.0
R327	1780620.0
R326	693520.0
R325	729440.0
R324	2923340.0
R329	463980.0
R328	1714090.0
R122	4560580.0

In [ ]:
import sys
import string
import logging

from util import mapper_logfile
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def mapper():
    """
    The input to this mapper will be the final Subway-MTA dataset, the same as
    in the previous exercise.  You can check out the csv and its structure below:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    For each line of input, the mapper output should PRINT (not return) the UNIT as 
    the key, the number of ENTRIESn_hourly as the value, and separate the key and 
    the value by a tab. For example: 'R002\t105105.0'

    Since you are printing the output of your program, printing a debug 
    statement will interfere with the operation of the grader. Instead, 
    use the logging module, which we've configured to log to a file printed 
    when you click "Test Run". For example:
    logging.info("My debugging message")
    
    The logging module can be used to give you more control over your debugging
    or other messages than you can get by printing them. In this exercise, print
    statements from your mapper will go to your reducer, and print statements
    from your reducer will be considered your final output. By contrast, messages
    logged via the loggers we configured will be saved to two files, one
    for the mapper and one for the reducer. If you click "Test Run", then we
    will show the contents of those files once your program has finished running.
    The logging module also has other capabilities; see 
    https://docs.python.org/2/library/logging.html for more information.
    """
    next(sys.stdin) # skip header
    for line in sys.stdin:
        data = line.strip().split(",")
        #cleaned_data = []
        # if proper length
        if len(data) == 22:
        #for i in data:
        #cleaned_data.append(i.translate(string.maketrans("",""), string.punctuation))
        
            #mapped_data.append("{0}\t{1}".format(cleaned_data[1], cleaned_data[6]))
            print "{0}\t{1}".format(data[1], data[6])


mapper()


import sys
import logging

from util import reducer_logfile
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def reducer():
    '''
    Given the output of the mapper for this exercise, the reducer should PRINT 
    (not return) one line per UNIT along with the total number of ENTRIESn_hourly 
    over the course of May (which is the duration of our data), separated by a tab.
    An example output row from the reducer might look like this: 'R001\t500625.0'

    You can assume that the input to the reducer is sorted such that all rows
    corresponding to a particular UNIT are grouped together.

    Since you are printing the output of your program, printing a debug 
    statement will interfere with the operation of the grader. Instead, 
    use the logging module, which we've configured to log to a file printed 
    when you click "Test Run". For example:
    logging.info("My debugging message")
    '''
    reduced_data = {}
    for line in sys.stdin:
        data = line.split("\t")
        if len(data) == 2:
            unit, entries_hourly = data
            if unit in reduced_data:
                reduced_data[unit] += float(entries_hourly)
            else:
                reduced_data[unit] = float(entries_hourly)
    
    for unit in reduced_data:
        print "{0}\t{1}".format(unit, reduced_data[unit])

        
reducer()

In [46]:
import pandas as pd
data = pd.read_csv('turnstile_data_master_with_weather.csv', nrows=20000)
data.head()
data.columns.values


Out[46]:
array(['Unnamed: 0', 'UNIT', 'DATEn', 'TIMEn', 'Hour', 'DESCn',
       'ENTRIESn_hourly', 'EXITSn_hourly', 'maxpressurei', 'maxdewpti',
       'mindewpti', 'minpressurei', 'meandewpti', 'meanpressurei', 'fog',
       'rain', 'meanwindspdi', 'mintempi', 'meantempi', 'maxtempi',
       'precipi', 'thunder'], dtype=object)

In [47]:
data = data[['ENTRIESn_hourly', 'fog', 'rain']]
data.head()
data.to_csv('avg_entries.csv', index=False)

In [41]:
# fog - rain
#  0     0
#  0     1
#  1     0
#  1     1
filename = 'avg_entries.csv'

with open(filename, 'r') as f:
    f.readline() # skip header line
    
    # mapper 
    mapped_data = []
    for line in f:
        data = line.strip().split(",")
        # if proper length
        if len(data) == 3:
            fog_rain = "{}fog-{}rain".format('' if float(data[1]) else 'no', '' if float(data[2]) else 'no')
            mapped_data.append("{0}\t{1}".format(fog_rain, data[0]))
            #print "{0}\t{1}".format(fog_rain, data[0])

In [42]:
# reducer
    reduced_data = {}
    count = {}
    for line in mapped_data:
        data = line.split("\t")
        if len(data) == 2:
            fog_rain, entries_hourly = data
            if fog_rain in reduced_data:
                reduced_data[fog_rain] += float(entries_hourly)
                count[fog_rain] += 1
            else:
                reduced_data[fog_rain] = float(entries_hourly)
                count[fog_rain] = 1
    
    for fog_rain in reduced_data:
        print "{0}\t{1}".format(fog_rain, reduced_data[fog_rain]/count[fog_rain])


nofog-rain	1315.6958972
nofog-norain	1199.12483937

In [ ]:
import sys
import string
import logging

from util import mapper_logfile
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def mapper():
    '''
    For this exercise, compute the average value of the ENTRIESn_hourly column 
    for different weather types. Weather type will be defined based on the 
    combination of the columns fog and rain (which are boolean values).
    For example, one output of our reducer would be the average hourly entries 
    across all hours when it was raining but not foggy.

    Each line of input will be a row from our final Subway-MTA dataset in csv format.
    You can check out the input csv file and its structure below:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    
    Note that this is a comma-separated file.

    This mapper should PRINT (not return) the weather type as the key (use the 
    given helper function to format the weather type correctly) and the number in 
    the ENTRIESn_hourly column as the value. They should be separated by a tab.
    For example: 'fog-norain\t12345'
    
    Since you are printing the output of your program, printing a debug 
    statement will interfere with the operation of the grader. Instead, 
    use the logging module, which we've configured to log to a file printed 
    when you click "Test Run". For example:
    logging.info("My debugging message")
    '''

    # Takes in variables indicating whether it is foggy and/or rainy and
    # returns a formatted key that you should output.  The variables passed in
    # can be booleans, ints (0 for false and 1 for true) or floats (0.0 for
    # false and 1.0 for true), but the strings '0.0' and '1.0' will not work,
    # so make sure you convert these values to an appropriate type before
    # calling the function.
    def format_key(fog, rain):
        return '{}fog-{}rain'.format(
            '' if fog else 'no',
            '' if rain else 'no'
        )
  
    next(sys.stdin) # skip header
    for line in sys.stdin:
        data = line.strip().split(",")
        # if proper length
        if len(data) == 22:
            fog_rain = "{}fog-{}rain".format('' if float(data[14]) else 'no', '' if float(data[15]) else 'no')
            print "{0}\t{1}".format(fog_rain, data[6])
   

mapper()


import sys
import logging

from util import reducer_logfile
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def reducer():
    '''
    Given the output of the mapper for this assignment, the reducer should
    print one row per weather type, along with the average value of
    ENTRIESn_hourly for that weather type, separated by a tab. You can assume
    that the input to the reducer will be sorted by weather type, such that all
    entries corresponding to a given weather type will be grouped together.

    In order to compute the average value of ENTRIESn_hourly, you'll need to
    keep track of both the total riders per weather type and the number of
    hours with that weather type. That's why we've initialized the variable 
    riders and num_hours below. Feel free to use a different data structure in 
    your solution, though.

    An example output row might look like this:
    'fog-norain\t1105.32467557'

    Since you are printing the output of your program, printing a debug 
    statement will interfere with the operation of the grader. Instead, 
    use the logging module, which we've configured to log to a file printed 
    when you click "Test Run". For example:
    logging.info("My debugging message")
    '''

    riders = 0      # The number of total riders for this key
    num_hours = 0   # The number of hours with this key
    old_key = None

    reduced_data = {}
    count = {}
    for line in sys.stdin:
        data = line.split("\t")
        if len(data) == 2:
            fog_rain, entries_hourly = data
            if fog_rain in reduced_data:
                reduced_data[fog_rain] += float(entries_hourly)
                count[fog_rain] += 1
            else:
                reduced_data[fog_rain] = float(entries_hourly)
                count[fog_rain] = 1
    
    for fog_rain in reduced_data:
        print "{0}\t{1}".format(fog_rain, reduced_data[fog_rain]/count[fog_rain])

reducer()

In [50]:
filename = 'turnstile_data_master_with_weather.csv'

with open(filename, 'r') as f:
    f.readline() # skip header line
    
    # mapper 
    mapped_data = []
    for line in f:
        data = line.strip().split(",")
        # if proper length
        if len(data) == 22:
            mapped_data.append("{0}\t{1}\t{2}\t{3}".format(data[1], data[6], data[2], data[3]))
            #print "{0}\t{1}".format(fog_rain, data[0])

In [61]:
from datetime import datetime

    def date_ify(date, time):
        return datetime.strptime(' '.join([date,time]), "%Y-%m-%d %H:%M:%S")
    
    # reducer
    reduced_data = {}
    for line in mapped_data:
        data = line.split("\t")
        if len(data) == 4:
            unit, entries_hourly, date, time = data
            datetime = date_ify(date, time)
            if unit in reduced_data:
                if  float(entries_hourly) > float(reduced_data[unit][1]):
                    reduced_data[unit] = (datetime, entries_hourly)
                elif float(entries_hourly) == float(reduced_data[unit][1]):
                    if datetime > reduced_data[unit][0]:
                        reduced_data[unit] = (datetime, entries_hourly)   
            else:
                reduced_data[unit] = (datetime, entries_hourly)

    
    for unit in reduced_data:
        print "{0}\t{1}\t{2}".format(unit, reduced_data[unit][0], reduced_data[unit][1])


R550	2011-05-11 13:47:15	1637.0
R551	2011-05-16 12:37:58	1280.0
R552	2011-05-20 21:54:55	2917.0
R396	2011-05-10 09:00:00	1779.0
R397	2011-05-11 21:00:00	1553.0
R394	2011-05-02 17:00:00	4116.0
R395	2011-05-05 09:00:00	2773.0
R392	2011-05-12 09:00:00	2391.0
R393	2011-05-03 09:00:00	1920.0
R259	2011-05-04 12:00:00	3507.0
R258	2011-05-24 12:00:00	5402.0
R257	2011-05-20 12:00:00	5974.0
R256	2011-05-13 12:00:00	4194.0
R255	2011-05-05 12:00:00	3894.0
R254	2011-05-09 20:00:00	11156.0
R253	2011-05-14 00:00:00	12377.0
R252	2011-05-26 00:00:00	2725.0
R251	2011-05-02 20:00:00	4346.0
R250	2011-05-20 12:00:00	3157.0
R408	2011-05-03 17:00:00	4922.0
R172	2011-05-30 20:00:00	7377.0
R173	2011-05-07 21:00:00	5182.0
R079	2011-05-09 18:00:00	9205.0
R171	2011-05-11 21:00:00	3493.0
R176	2011-05-03 21:00:00	15703.0
R177	2011-05-17 21:00:00	14738.0
R174	2011-05-28 01:00:00	10597.0
R175	2011-05-05 21:00:00	17465.0
R178	2011-05-20 17:00:00	11883.0
R179	2011-05-11 12:00:00	23625.0
R390	2011-05-28 01:00:00	4046.0
R391	2011-05-10 09:00:00	3392.0
R389	2011-05-10 09:00:00	1348.0
R398	2011-05-16 09:00:00	3019.0
R399	2011-05-16 09:00:00	2664.0
R388	2011-05-01 13:00:00	3549.0
R213	2011-05-12 12:00:00	5409.0
R212	2011-05-03 20:00:00	8027.0
R211	2011-05-16 12:00:00	10845.0
R210	2011-05-11 12:00:00	2965.0
R217	2011-05-25 20:00:00	6611.0
R216	2011-05-18 12:00:00	3821.0
R215	2011-05-25 12:00:00	5865.0
R331	2011-05-10 09:00:00	1001.0
R219	2011-05-04 12:00:00	5618.0
R218	2011-05-19 12:00:00	8773.0
R358	2011-05-12 00:00:00	1580.0
R359	2011-05-07 01:00:00	14956.0
R352	2011-05-21 01:00:00	10190.0
R353	2011-05-04 01:00:00	1917.0
R350	2011-05-24 09:00:00	1212.0
R356	2011-05-25 12:00:00	5408.0
R357	2011-05-07 21:00:00	660.0
R354	2011-05-05 12:00:00	1354.0
R355	2011-05-11 09:00:00	1828.0
R338	2011-05-20 12:00:00	104.0
R339	2011-05-13 21:00:00	2248.0
R136	2011-05-03 09:00:00	6310.0
R137	2011-05-11 12:00:00	8536.0
R134	2011-05-09 09:00:00	3144.0
R135	2011-05-09 09:00:00	3963.0
R132	2011-05-21 21:00:00	13561.0
R133	2011-05-19 01:00:00	8816.0
R130	2011-05-16 09:00:00	1810.0
R131	2011-05-04 21:00:00	17754.0
R138	2011-05-12 21:00:00	16420.0
R139	2011-05-10 20:00:00	10959.0
R183	2011-05-26 12:00:00	2649.0
R182	2011-05-26 01:00:00	18541.0
R181	2011-05-25 12:00:00	5912.0
R180	2011-05-20 17:00:00	10230.0
R187	2011-05-16 21:00:00	5747.0
R186	2011-05-10 20:00:00	5404.0
R185	2011-05-27 21:00:00	2999.0
R184	2011-05-24 12:00:00	4710.0
R189	2011-05-09 00:00:00	4335.0
R188	2011-05-02 20:00:00	7124.0
R002	2011-05-12 21:00:00	4295.0
R003	2011-05-05 12:00:00	995.0
R001	2011-05-11 17:00:00	31213.0
R006	2011-05-25 12:00:00	2784.0
R007	2011-05-10 12:00:00	1763.0
R004	2011-05-12 12:00:00	2318.0
R005	2011-05-10 12:00:00	2705.0
R008	2011-05-12 12:00:00	1724.0
R009	2011-05-05 12:00:00	1230.0
R439	2011-05-05 09:00:00	2266.0
R438	2011-05-23 09:00:00	4029.0
R433	2011-05-10 17:00:00	1042.0
R432	2011-05-26 01:00:00	1336.0
R431	2011-05-11 09:00:00	2562.0
R430	2011-05-26 21:00:00	2053.0
R437	2011-05-11 09:00:00	2309.0
R436	2011-05-03 09:00:00	1530.0
R435	2011-05-13 13:00:00	2273.0
R434	2011-05-15 05:00:00	1456.0
R330	2011-05-18 00:00:00	5053.0
R170	2011-05-19 21:00:00	51839.0
R318	2011-05-05 00:00:00	2894.0
R319	2011-05-04 00:00:00	7758.0
R316	2011-05-04 09:00:00	754.0
R317	2011-05-13 21:00:00	2548.0
R314	2011-05-11 17:00:00	1740.0
R315	2011-05-28 01:00:00	1692.0
R312	2011-05-30 20:00:00	3455.0
R313	2011-05-25 12:00:00	841.0
R310	2011-05-11 12:00:00	7212.0
R311	2011-05-05 00:00:00	1341.0
R070	2011-05-04 20:00:00	6402.0
R543	2011-05-12 18:44:21	1418.0
R542	2011-05-12 23:11:21	2027.0
R541	2011-05-03 01:11:50	1929.0
R540	2011-05-16 21:34:11	3167.0
R547	2011-05-25 23:23:07	1247.0
R546	2011-05-24 19:11:38	2038.0
R545	2011-05-05 20:49:58	1404.0
R544	2011-05-28 03:33:11	3194.0
R549	2011-05-03 09:29:37	1237.0
R548	2011-05-06 20:19:16	1664.0
R381	2011-05-25 21:00:00	2677.0
R380	2011-05-09 17:00:00	3572.0
R268	2011-05-28 17:00:00	8495.0
R269	2011-05-26 12:00:00	3639.0
R385	2011-05-12 09:00:00	3902.0
R384	2011-05-03 09:00:00	1731.0
R387	2011-05-25 17:00:00	2732.0
R386	2011-05-05 21:00:00	3941.0
R262	2011-05-11 12:00:00	2672.0
R263	2011-05-05 00:00:00	1248.0
R260	2011-05-28 00:00:00	39380.0
R261	2011-05-18 12:00:00	5679.0
R266	2011-05-03 12:00:00	3771.0
R267	2011-05-25 09:00:00	4035.0
R264	2011-05-09 20:00:00	2058.0
R265	2011-05-10 12:00:00	3128.0
R046	2011-05-18 20:00:00	29219.0
R047	2011-05-18 21:00:00	20605.0
R044	2011-05-05 20:00:00	17269.0
R045	2011-05-11 21:00:00	16706.0
R042	2011-05-12 20:00:00	3162.0
R043	2011-05-05 18:00:00	5743.0
R040	2011-05-25 12:00:00	6212.0
R041	2011-05-12 20:00:00	14163.0
R048	2011-05-18 21:00:00	7208.0
R049	2011-05-06 20:00:00	12524.0
R169	2011-05-12 09:00:00	4993.0
R168	2011-05-12 09:00:00	17039.0
R165	2011-05-05 02:00:00	377.0
R164	2011-05-18 17:00:00	9376.0
R167	2011-05-11 21:00:00	7208.0
R166	2011-05-11 21:00:00	5306.0
R161	2011-05-18 17:00:00	4128.0
R160	2011-05-08 01:00:00	19496.0
R163	2011-05-06 04:00:00	12530.0
R442	2011-05-25 09:00:00	1494.0
R443	2011-05-11 09:00:00	3044.0
R440	2011-05-18 21:00:00	1677.0
R441	2011-05-02 13:00:00	1987.0
R446	2011-05-03 21:00:00	2590.0
R447	2011-05-03 17:00:00	1954.0
R444	2011-05-18 09:00:00	1670.0
R445	2011-05-25 09:00:00	2664.0
R448	2011-05-27 21:00:00	375.0
R449	2011-05-11 09:00:00	1758.0
R294	2011-05-11 12:00:00	2924.0
R345	2011-05-26 12:00:00	2082.0
R344	2011-05-26 12:00:00	1906.0
R347	2011-05-25 09:00:00	1969.0
R346	2011-05-10 00:00:00	5790.0
R341	2011-05-18 12:00:00	5052.0
R340	2011-05-07 01:00:00	1625.0
R343	2011-05-03 17:00:00	2441.0
R342	2011-05-12 09:00:00	1736.0
R226	2011-05-11 12:00:00	4158.0
R227	2011-05-12 20:00:00	4390.0
R224	2011-05-04 12:00:00	2935.0
R225	2011-05-02 20:00:00	2053.0
R222	2011-05-18 09:00:00	8335.0
R223	2011-05-17 12:00:00	7611.0
R220	2011-05-11 12:00:00	6265.0
R221	2011-05-10 12:00:00	5452.0
R011	2011-05-04 12:00:00	37193.0
R121	2011-05-24 20:00:00	5021.0
R120	2011-05-20 12:00:00	7227.0
R123	2011-05-11 12:00:00	6989.0
R089	2011-05-04 20:00:00	2358.0
R125	2011-05-16 21:00:00	3397.0
R124	2011-05-03 20:00:00	3010.0
R127	2011-05-12 20:00:00	20968.0
R126	2011-05-12 12:00:00	6681.0
R129	2011-05-04 21:00:00	4377.0
R128	2011-05-03 17:00:00	2570.0
R080	2011-05-18 20:00:00	13284.0
R081	2011-05-25 04:00:00	13970.0
R086	2011-05-21 20:00:00	9071.0
R087	2011-05-04 20:00:00	5834.0
R084	2011-05-10 20:00:00	32119.0
R085	2011-05-10 20:00:00	7138.0
R536	2011-05-16 21:00:00	3599.0
R535	2011-05-27 17:00:00	1436.0
R228	2011-05-14 00:00:00	10034.0
R229	2011-05-06 16:00:00	3196.0
R349	2011-05-24 01:00:00	2645.0
R348	2011-05-04 12:00:00	818.0
R154	2011-05-11 09:00:00	4086.0
R155	2011-05-10 09:00:00	3493.0
R156	2011-05-03 09:00:00	3148.0
R157	2011-05-11 09:00:00	3805.0
R150	2011-05-10 09:00:00	3043.0
R151	2011-05-30 21:00:00	18095.0
R152	2011-05-05 09:00:00	6184.0
R153	2011-05-11 09:00:00	5661.0
R015	2011-05-24 18:00:00	13970.0
R014	2011-05-24 21:00:00	16663.0
R017	2011-05-04 20:00:00	19798.0
R016	2011-05-25 00:00:00	4860.0
R158	2011-05-03 18:30:00	20295.0
R159	2011-05-13 09:00:00	10233.0
R013	2011-05-11 20:00:00	8267.0
R012	2011-05-18 12:00:00	25726.0
R411	2011-05-04 09:00:00	891.0
R309	2011-05-24 20:00:00	3819.0
R308	2011-05-19 12:00:00	3129.0
R301	2011-05-20 21:00:00	5763.0
R300	2011-05-11 00:00:00	10309.0
R303	2011-05-09 16:00:00	5193.0
R302	2011-05-25 21:00:00	9833.0
R304	2011-05-19 20:00:00	4549.0
R307	2011-05-24 00:00:00	1431.0
R306	2011-05-10 09:00:00	1774.0
R147	2011-05-20 17:00:00	11736.0
R146	2011-05-11 17:00:00	6190.0
R413	2011-05-09 13:00:00	2090.0
R412	2011-05-26 21:00:00	1785.0
R415	2011-05-12 09:00:00	219.0
R414	2011-05-20 01:00:00	915.0
R275	2011-05-05 12:00:00	3710.0
R274	2011-05-21 00:00:00	7661.0
R277	2011-05-11 12:00:00	2442.0
R276	2011-05-11 12:00:00	5839.0
R271	2011-05-11 12:00:00	1807.0
R270	2011-05-05 00:00:00	1677.0
R273	2011-05-15 16:00:00	18917.0
R272	2011-05-25 21:00:00	6542.0
R416	2011-05-14 17:00:00	1514.0
R279	2011-05-05 12:00:00	3137.0
R278	2011-05-12 12:00:00	1729.0
R419	2011-05-30 21:00:00	2398.0
R378	2011-05-03 09:00:00	3495.0
R379	2011-05-24 21:00:00	1704.0
R418	2011-05-30 21:00:00	637.0
R374	2011-05-16 09:00:00	3744.0
R375	2011-05-25 09:00:00	1950.0
R376	2011-05-10 09:00:00	1683.0
R377	2011-05-03 17:00:00	3902.0
R370	2011-05-10 12:00:00	3283.0
R371	2011-05-12 12:00:00	4180.0
R372	2011-05-12 12:00:00	4064.0
R373	2011-05-11 12:00:00	3215.0
R149	2011-05-03 09:00:00	4598.0
R148	2011-05-17 09:00:00	3855.0
R051	2011-05-05 20:00:00	19338.0
R050	2011-05-16 20:00:00	15427.0
R053	2011-05-13 12:00:00	11181.0
R052	2011-05-24 12:00:00	4137.0
R055	2011-05-18 12:00:00	26998.0
R054	2011-05-18 20:00:00	6871.0
R057	2011-05-10 20:00:00	14129.0
R056	2011-05-09 20:00:00	4852.0
R059	2011-05-19 04:00:00	5732.0
R058	2011-05-17 12:00:00	3018.0
R417	2011-05-30 21:00:00	1326.0
R118	2011-05-11 21:00:00	7152.0
R119	2011-05-25 12:00:00	5551.0
R110	2011-05-03 09:00:00	10405.0
R111	2011-05-10 20:00:00	9939.0
R112	2011-05-11 12:00:00	5204.0
R113	2011-05-18 21:00:00	10282.0
R114	2011-05-19 12:00:00	3275.0
R115	2011-05-12 12:00:00	4273.0
R116	2011-05-18 20:00:00	13830.0
R117	2011-05-25 12:00:00	4596.0
R288	2011-05-04 21:00:00	6154.0
R289	2011-05-24 09:00:00	2314.0
R280	2011-05-25 20:00:00	2708.0
R281	2011-05-10 20:00:00	4886.0
R282	2011-05-18 20:00:00	7146.0
R283	2011-05-26 05:00:00	1913.0
R284	2011-05-13 12:00:00	4832.0
R285	2011-05-12 12:00:00	2697.0
R286	2011-05-27 21:00:00	2010.0
R287	2011-05-16 12:00:00	2592.0
R088	2011-05-06 21:00:00	2265.0
R024	2011-05-03 12:00:00	11094.0
R025	2011-05-11 12:00:00	23518.0
R027	2011-05-16 20:00:00	17742.0
R020	2011-05-04 20:00:00	38186.0
R021	2011-05-24 20:00:00	23428.0
R022	2011-05-20 20:00:00	34004.0
R023	2011-05-16 20:00:00	20555.0
R028	2011-05-26 19:00:00	12471.0
R029	2011-05-10 20:00:00	27978.0
R459	2011-05-30 20:00:00	3545.0
R455	2011-05-25 20:00:00	523.0
R454	2011-05-21 00:00:00	796.0
R456	2011-05-05 12:00:00	1275.0
R451	2011-05-09 17:00:00	3425.0
R450	2011-05-14 01:00:00	9213.0
R453	2011-05-21 00:00:00	23707.0
R452	2011-05-12 21:00:00	18950.0
R082	2011-05-12 20:00:00	5862.0
R083	2011-05-05 20:00:00	11202.0
R239	2011-05-26 12:00:00	3574.0
R238	2011-05-12 16:00:00	7292.0
R332	2011-05-20 17:00:00	2132.0
R333	2011-05-11 21:00:00	4187.0
R334	2011-05-17 09:00:00	2672.0
R335	2011-05-12 12:00:00	2269.0
R336	2011-05-03 20:00:00	560.0
R337	2011-05-11 12:00:00	786.0
R231	2011-05-05 20:00:00	4455.0
R230	2011-05-11 12:00:00	3135.0
R233	2011-05-05 12:00:00	4252.0
R232	2011-05-11 12:00:00	4387.0
R235	2011-05-06 12:00:00	11600.0
R234	2011-05-11 12:00:00	1503.0
R237	2011-05-05 12:00:00	2663.0
R236	2011-05-05 12:00:00	6361.0
R248	2011-05-19 20:00:00	7881.0
R249	2011-05-25 12:00:00	5861.0
R244	2011-05-25 08:00:00	5711.0
R246	2011-05-04 12:00:00	3322.0
R247	2011-05-09 12:00:00	1102.0
R240	2011-05-27 20:00:00	10206.0
R241	2011-05-26 12:30:00	1728.0
R242	2011-05-05 12:00:00	2508.0
R243	2011-05-05 12:00:00	5235.0
R383	2011-05-11 09:00:00	2452.0
R382	2011-05-11 12:00:00	4424.0
R068	2011-05-03 12:00:00	2458.0
R069	2011-05-03 12:00:00	4159.0
R145	2011-05-03 09:00:00	2106.0
R144	2011-05-06 17:00:00	7653.0
R143	2011-05-04 21:00:00	8968.0
R142	2011-05-13 01:00:00	8830.0
R141	2011-05-03 09:00:00	12659.0
R140	2011-05-24 17:00:00	4838.0
R060	2011-05-01 20:00:00	4932.0
R061	2011-05-04 00:00:00	2361.0
R062	2011-05-19 12:00:00	18022.0
R063	2011-05-24 12:00:00	4985.0
R064	2011-05-25 12:00:00	3887.0
R065	2011-05-05 12:00:00	3609.0
R066	2011-05-05 12:00:00	1322.0
R067	2011-05-11 12:00:00	4001.0
R214	2011-05-04 12:00:00	3661.0
R095	2011-05-05 12:00:00	10670.0
R094	2011-05-05 12:00:00	7468.0
R097	2011-05-27 12:00:00	13803.0
R096	2011-05-11 12:00:00	7355.0
R091	2011-05-27 12:00:00	3980.0
R090	2011-05-03 00:00:00	2117.0
R093	2011-05-10 12:00:00	9095.0
R092	2011-05-27 16:00:00	9573.0
R099	2011-05-05 16:00:00	7757.0
R098	2011-05-04 12:00:00	8597.0
R464	2011-05-30 20:00:00	0.0
R403	2011-05-04 13:00:00	2927.0
R460	2011-05-25 21:00:00	4571.0
R461	2011-05-03 21:00:00	12872.0
R462	2011-05-05 21:00:00	6665.0
R463	2011-05-17 05:00:00	9811.0
R468	2011-05-23 21:00:00	1966.0
R469	2011-05-14 17:00:00	1874.0
R200	2011-05-11 12:00:00	6486.0
R201	2011-05-28 01:00:00	10859.0
R202	2011-05-05 12:00:00	8201.0
R203	2011-05-10 20:00:00	7424.0
R204	2011-05-05 12:00:00	6823.0
R205	2011-05-24 00:00:00	5631.0
R206	2011-05-10 01:00:00	5106.0
R207	2011-05-05 12:00:00	6069.0
R208	2011-05-11 12:00:00	12388.0
R209	2011-05-03 12:00:00	3635.0
R369	2011-05-13 01:00:00	1650.0
R368	2011-05-10 09:00:00	3501.0
R367	2011-05-03 09:00:00	2413.0
R366	2011-05-05 09:00:00	1979.0
R365	2011-05-03 09:00:00	1575.0
R364	2011-05-11 01:00:00	2643.0
R363	2011-05-25 09:00:00	1692.0
R362	2011-05-22 05:00:00	8976.0
R361	2011-05-03 09:00:00	3424.0
R360	2011-05-09 21:00:00	776.0
R406	2011-05-06 13:00:00	3948.0
R407	2011-05-04 17:00:00	2064.0
R404	2011-05-12 09:00:00	3116.0
R405	2011-05-10 09:00:00	2007.0
R019	2011-05-13 12:00:00	14418.0
R402	2011-05-11 09:00:00	1253.0
R018	2011-05-20 12:00:00	25613.0
R400	2011-05-12 17:00:00	2255.0
R401	2011-05-11 09:00:00	1303.0
R109	2011-05-13 01:00:00	5588.0
R108	2011-05-18 20:00:00	16669.0
R103	2011-05-05 12:00:00	4739.0
R102	2011-05-20 00:00:00	11166.0
R101	2011-05-11 12:00:00	9248.0
R100	2011-05-03 12:00:00	2106.0
R107	2011-05-19 04:00:00	2947.0
R106	2011-05-05 12:00:00	4863.0
R105	2011-05-16 20:00:00	9162.0
R104	2011-05-11 12:00:00	5237.0
R299	2011-05-04 17:00:00	1383.0
R298	2011-05-05 09:00:00	3008.0
R293	2011-05-17 10:00:00	24413.0
R292	2011-05-11 09:00:00	1830.0
R291	2011-05-23 20:00:00	8744.0
R290	2011-05-04 21:00:00	8958.0
R297	2011-05-13 01:00:00	2229.0
R296	2011-05-28 01:00:00	1819.0
R295	2011-05-16 08:00:00	8502.0
R010	2011-05-09 18:00:00	30916.0
R190	2011-05-16 21:00:00	6442.0
R191	2011-05-13 01:00:00	7034.0
R192	2011-05-11 01:00:00	6985.0
R193	2011-05-18 21:00:00	4613.0
R194	2011-05-26 00:00:00	7749.0
R195	2011-05-18 16:22:00	22519.0
R196	2011-05-11 12:00:00	5048.0
R197	2011-05-16 09:00:00	10080.0
R198	2011-05-11 12:00:00	8311.0
R199	2011-05-05 12:00:00	2877.0
R409	2011-05-28 01:00:00	2016.0
R037	2011-05-19 12:00:00	3300.0
R036	2011-05-11 12:00:00	3289.0
R035	2011-05-09 20:00:00	10192.0
R034	2011-05-29 20:00:00	3287.0
R033	2011-05-11 20:00:00	27329.0
R032	2011-05-18 20:00:00	19345.0
R031	2011-05-19 12:00:00	14334.0
R030	2011-05-03 20:00:00	12884.0
R039	2011-05-30 04:00:00	3280.0
R038	2011-05-17 12:00:00	1426.0
R428	2011-05-03 09:00:00	1892.0
R429	2011-05-26 12:00:00	4337.0
R420	2011-05-05 21:00:00	2349.0
R421	2011-05-03 21:00:00	1052.0
R422	2011-05-25 21:00:00	889.0
R423	2011-05-16 13:00:00	2090.0
R424	2011-05-05 12:00:00	1886.0
R425	2011-05-12 13:00:00	1207.0
R426	2011-05-17 09:00:00	736.0
R427	2011-05-05 21:00:00	1135.0
R323	2011-05-25 12:00:00	4164.0
R322	2011-05-14 00:00:00	8582.0
R321	2011-05-05 00:00:00	6266.0
R320	2011-05-07 01:00:00	5683.0
R327	2011-05-12 09:00:00	2707.0
R326	2011-05-05 09:00:00	1312.0
R325	2011-05-06 12:00:00	1769.0
R324	2011-05-14 01:00:00	4207.0
R329	2011-05-12 09:00:00	1204.0
R328	2011-05-28 01:00:00	9887.0
R122	2011-05-09 12:00:00	10288.0

In [ ]:
import sys
import string
import logging

from util import mapper_logfile
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

def mapper():
    """
    In this exercise, for each turnstile unit, you will determine the date and time 
    (in the span of this data set) at which the most people entered through the unit.
    
    The input to the mapper will be the final Subway-MTA dataset, the same as
    in the previous exercise. You can check out the csv and its structure below:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    For each line, the mapper should return the UNIT, ENTRIESn_hourly, DATEn, and 
    TIMEn columns, separated by tabs. For example:
    'R001\t100000.0\t2011-05-01\t01:00:00'

    Since you are printing the output of your program, printing a debug 
    statement will interfere with the operation of the grader. Instead, 
    use the logging module, which we've configured to log to a file printed 
    when you click "Test Run". For example:
    logging.info("My debugging message")
    """
    
    next(sys.stdin) # skip header
    for line in sys.stdin:
        data = line.strip().split(",")
        # if proper length
        if len(data) == 22:
            print "{0}\t{1}\t{2}\t{3}".format(data[1], data[6], data[2], data[3])

mapper()


import sys
import logging

from util import reducer_logfile
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
                    level=logging.INFO, filemode='w')

from datetime import datetime

def reducer():
    '''
    Write a reducer that will compute the busiest date and time (that is, the 
    date and time with the most entries) for each turnstile unit. Ties should 
    be broken in favor of datetimes that are later on in the month of May. You 
    may assume that the contents of the reducer will be sorted so that all entries 
    corresponding to a given UNIT will be grouped together.
    
    The reducer should print its output with the UNIT name, the datetime (which 
    is the DATEn followed by the TIMEn column, separated by a single space), and 
    the number of entries at this datetime, separated by tabs.

    For example, the output of the reducer should look like this:
    R001    2011-05-11 17:00:00	   31213.0
    R002	2011-05-12 21:00:00	   4295.0
    R003	2011-05-05 12:00:00	   995.0
    R004	2011-05-12 12:00:00	   2318.0
    R005	2011-05-10 12:00:00	   2705.0
    R006	2011-05-25 12:00:00	   2784.0
    R007	2011-05-10 12:00:00	   1763.0
    R008	2011-05-12 12:00:00	   1724.0
    R009	2011-05-05 12:00:00	   1230.0
    R010	2011-05-09 18:00:00	   30916.0
    ...
    ...

    Since you are printing the output of your program, printing a debug 
    statement will interfere with the operation of the grader. Instead, 
    use the logging module, which we've configured to log to a file printed 
    when you click "Test Run". For example:
    logging.info("My debugging message")
    '''

    max_entries = 0
    old_key = None

    def date_ify(date, time):
        return datetime.strptime(' '.join([date,time]).strip(), "%Y-%m-%d %H:%M:%S")

    reduced_data = {}
    for line in sys.stdin:
        data = line.split("\t")
        if len(data) == 4:
            unit, entries_hourly, date, time = data
            date_time = date_ify(date, time)
            if unit in reduced_data:
                if  float(entries_hourly) > float(reduced_data[unit][1]):
                    reduced_data[unit] = (date_time, entries_hourly)
                elif float(entries_hourly) == float(reduced_data[unit][1]):
                    if date_time > reduced_data[unit][0]:
                        reduced_data[unit] = (date_time, entries_hourly)   
            else:
                reduced_data[unit] = (date_time, entries_hourly)

    
    for unit in reduced_data:
        print "{0}\t{1}\t{2}".format(unit, reduced_data[unit][0], reduced_data[unit][1])

reducer()