Stat 133 Assignment 8

Xinyang Geng


In [57]:
library(DataComputing)

library(repr)

options(repr.plot.width=9, repr.plot.height=5)

In [2]:
address <- readRDS("address.rds")

In [3]:
head(address)


Out[3]:
addressfirst_namesex
1900 SETON DRARDALANM
22650 RIDGE AVETHOMASM
34126 N HOLLAND SYLVANIA RDRASHIDM
4456 MAGEE AVEDAVIDM
511100 EUCLID AVEJENNIFERF
612605 E 16TH AVEKEVINM

In [4]:
sampled_address = address %>%
    sample_n(size=50)
sampled_address


Out[4]:
addressfirst_namesex
5902613300 OAK LAWN AVESIMRATF
822697156 INDIANWOOD BLVD
4931342600 GREENWOOD RDCLIFTONM
1931665783 WOOSTER PIKELEONARDM
5314886565 W EMERALD STDONALDM
20245303 SECOND AVE-SUITE-20NITINM
2785394420 LAKE BOONE TRLTINAF
26210330 E HIBISCUS BLVDANETTEF
79005149 LAKE AVEINCORONATAF
830077300 W 27TH STKAILASHM
7345510600 QUIVIRA RDRHONDAF
1989935133 RIDGE RD STE 5SANDRAF
1406051300 CRANE STFM
206848155 STATE HIGHWAY 37HILLARYF
4806993900 UNIVERSITY BLVD SDAVIDM
6744473300 GALLOWS RDDANM
129531121 DEKALB AVEVASANTHAF
202381520 MARY STREETKRISTIF
3816383 W HOMESTEAD AVEROBERTM
212311313 337 W BALD EAGLE ST
325568200 WEDNESBURY LNTOVAF
4632751140 TOWN SQUARE ROAD
7653541040 LONGFIELD CTDAVIDM
996322 ASYLUM STREETTHEODOREM
5065412905 W WARNER RDGEORGEM
2777541513 EAST CLEVELAND AVENUEJANICEF
101381400 PARNASSUS AVE FL 8RICHARDM
177909100 MCGOWAN CTGWENDOLYNF
234590842 E MAIN STBRETTM
84151810 SEVERANCE CIRYIPINGF
6745362007 E US HIGHWAY 136
203832990 OAK RIDGE TPKENAWRASM
3018791630 COMMANCHE AVESHANEM
623691125 W KAGY BLVDJEFFM
38469112130 CORTEZ BLVDARTUROM
4502673401 PGA BLVDGM
4117431 LYONS STMICHAELM
2859411421 3RD AVETHOMASM
752961200 1ST ST SWJOSEPHM
1965567325 MEDICAL CENTER DRHEATHERF
3064511100 N PALM CANYON DR STE 206HETALM
4880612428 W WHITTIER BLVDHAMIDM
176551680 CENTRE STGERALDM
1631211041 HOSPITAL DRIVELEA GRACEF
3817881646 PARK RIDGE DRMARKM
716957391 W CHARLESTON BLVDHUSAMEDDINM
33947017071 FORT ST
6483323400 SPRUCE STREETPETERM
191746122 S PATTERSON AVEJOANNEF
8745731801 E MARCH LN BLDG D # 470EARLM

Testing Regex

1.

Pattern like [number] [name] [street]


In [5]:
# Match
sampled_address %>%
    filter(grepl("[0-9]+ [A-Z]+ ST", address))


Out[5]:
addressfirst_namesex
11300 CRANE STFM
2520 MARY STREETKRISTIF
322 ASYLUM STREETTHEODOREM
41 LYONS STMICHAELM
5680 CENTRE STGERALDM
617071 FORT ST
73400 SPRUCE STREETPETERM

In [6]:
# Not match
sampled_address %>%
    filter(!grepl("[0-9]+ [A-Z]+ ST", address))


Out[6]:
addressfirst_namesex
13300 OAK LAWN AVESIMRATF
2156 INDIANWOOD BLVD
32600 GREENWOOD RDCLIFTONM
45783 WOOSTER PIKELEONARDM
56565 W EMERALD STDONALDM
6303 SECOND AVE-SUITE-20NITINM
74420 LAKE BOONE TRLTINAF
8330 E HIBISCUS BLVDANETTEF
949 LAKE AVEINCORONATAF
10300 W 27TH STKAILASHM
1110600 QUIVIRA RDRHONDAF
125133 RIDGE RD STE 5SANDRAF
13155 STATE HIGHWAY 37HILLARYF
143900 UNIVERSITY BLVD SDAVIDM
153300 GALLOWS RDDANM
16121 DEKALB AVEVASANTHAF
173 W HOMESTEAD AVEROBERTM
18313 337 W BALD EAGLE ST
198200 WEDNESBURY LNTOVAF
201140 TOWN SQUARE ROAD
211040 LONGFIELD CTDAVIDM
222905 W WARNER RDGEORGEM
231513 EAST CLEVELAND AVENUEJANICEF
24400 PARNASSUS AVE FL 8RICHARDM
25100 MCGOWAN CTGWENDOLYNF
26842 E MAIN STBRETTM
2710 SEVERANCE CIRYIPINGF
282007 E US HIGHWAY 136
29990 OAK RIDGE TPKENAWRASM
301630 COMMANCHE AVESHANEM
311125 W KAGY BLVDJEFFM
3212130 CORTEZ BLVDARTUROM
333401 PGA BLVDGM
341421 3RD AVETHOMASM
35200 1ST ST SWJOSEPHM
367325 MEDICAL CENTER DRHEATHERF
371100 N PALM CANYON DR STE 206HETALM
382428 W WHITTIER BLVDHAMIDM
391041 HOSPITAL DRIVELEA GRACEF
401646 PARK RIDGE DRMARKM
417391 W CHARLESTON BLVDHUSAMEDDINM
42122 S PATTERSON AVEJOANNEF
431801 E MARCH LN BLDG D # 470EARLM

2.

Pattern like [number] th street


In [7]:
# Match
sampled_address %>%
    filter(grepl("[0-9]+ [A-Z]+ ST", address))


Out[7]:
addressfirst_namesex
11300 CRANE STFM
2520 MARY STREETKRISTIF
322 ASYLUM STREETTHEODOREM
41 LYONS STMICHAELM
5680 CENTRE STGERALDM
617071 FORT ST
73400 SPRUCE STREETPETERM

In [8]:
# Not match
sampled_address %>%
    filter(!grepl("[0-9]+ [A-Z]+ ST", address))


Out[8]:
addressfirst_namesex
13300 OAK LAWN AVESIMRATF
2156 INDIANWOOD BLVD
32600 GREENWOOD RDCLIFTONM
45783 WOOSTER PIKELEONARDM
56565 W EMERALD STDONALDM
6303 SECOND AVE-SUITE-20NITINM
74420 LAKE BOONE TRLTINAF
8330 E HIBISCUS BLVDANETTEF
949 LAKE AVEINCORONATAF
10300 W 27TH STKAILASHM
1110600 QUIVIRA RDRHONDAF
125133 RIDGE RD STE 5SANDRAF
13155 STATE HIGHWAY 37HILLARYF
143900 UNIVERSITY BLVD SDAVIDM
153300 GALLOWS RDDANM
16121 DEKALB AVEVASANTHAF
173 W HOMESTEAD AVEROBERTM
18313 337 W BALD EAGLE ST
198200 WEDNESBURY LNTOVAF
201140 TOWN SQUARE ROAD
211040 LONGFIELD CTDAVIDM
222905 W WARNER RDGEORGEM
231513 EAST CLEVELAND AVENUEJANICEF
24400 PARNASSUS AVE FL 8RICHARDM
25100 MCGOWAN CTGWENDOLYNF
26842 E MAIN STBRETTM
2710 SEVERANCE CIRYIPINGF
282007 E US HIGHWAY 136
29990 OAK RIDGE TPKENAWRASM
301630 COMMANCHE AVESHANEM
311125 W KAGY BLVDJEFFM
3212130 CORTEZ BLVDARTUROM
333401 PGA BLVDGM
341421 3RD AVETHOMASM
35200 1ST ST SWJOSEPHM
367325 MEDICAL CENTER DRHEATHERF
371100 N PALM CANYON DR STE 206HETALM
382428 W WHITTIER BLVDHAMIDM
391041 HOSPITAL DRIVELEA GRACEF
401646 PARK RIDGE DRMARKM
417391 W CHARLESTON BLVDHUSAMEDDINM
42122 S PATTERSON AVEJOANNEF
431801 E MARCH LN BLDG D # 470EARLM

Back to Street


In [9]:
pattern <- "(ST|RD|ROAD)"
sampled_address %>% 
    filter(!grepl(pattern, address),  # Not contain existing street ends
           !grepl(" APT|UNIT [[:digit:]]+$", address)) 
            # Not contain APT or UNIT followed by a space, numbers in the end of the string. (not an apt or unit number)


Out[9]:
addressfirst_namesex
13300 OAK LAWN AVESIMRATF
2156 INDIANWOOD BLVD
3303 SECOND AVE-SUITE-20NITINM
44420 LAKE BOONE TRLTINAF
5330 E HIBISCUS BLVDANETTEF
649 LAKE AVEINCORONATAF
73900 UNIVERSITY BLVD SDAVIDM
8121 DEKALB AVEVASANTHAF
98200 WEDNESBURY LNTOVAF
101040 LONGFIELD CTDAVIDM
11400 PARNASSUS AVE FL 8RICHARDM
12100 MCGOWAN CTGWENDOLYNF
1310 SEVERANCE CIRYIPINGF
142007 E US HIGHWAY 136
15990 OAK RIDGE TPKENAWRASM
161630 COMMANCHE AVESHANEM
171125 W KAGY BLVDJEFFM
1812130 CORTEZ BLVDARTUROM
193401 PGA BLVDGM
207325 MEDICAL CENTER DRHEATHERF
212428 W WHITTIER BLVDHAMIDM
221041 HOSPITAL DRIVELEA GRACEF
231646 PARK RIDGE DRMARKM
24122 S PATTERSON AVEJOANNEF
251801 E MARCH LN BLDG D # 470EARLM

We not filter all the street ends and plot out the histogram


In [92]:
# All street end appears above 1000 times in the whole dataset
# The code is pretty self-explanatory and hence the explanation is omitted.
st_ends = address %>% 
    filter(!grepl(" APT|UNIT [[:digit:]]+$", address))  %>%
    select(address) %>%
    mutate(address=gsub("\\.+$", "", address)) %>%              # Remove trailing .
    mutate(st_end=strsplit(address, " ")) %>%
    mutate(st_end=factor(sapply(st_end, tail, n=1))) %>%        # Find street ends
    filter(!grepl("[0-9]+$", st_end)) %>%                       # Not end by number
    select(st_end) %>%
    group_by(st_end) %>%
    summarise(count=n()) %>%
    filter(count >= 1000)

In [93]:
st_ends %>% ggplot(aes(st_end, count)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))


Now showing all the street ends


In [94]:
st_ends %>% arrange(desc(count))


Out[94]:
st_endcount
1ST197656
2AVE147397
3RD124995
4DR85169
5BLVD61001
6STREET19591
7PKWY14388
8ROAD11732
9HWY10702
10LN10292
11WAY9564
12DRIVE9507
13NE9189
14S9042
15AVENUE8715
16N8574
17SW6882
18NW6488
19PL6402
20SE6017
21CT5759
22E5525
23W5342
24CIR4966
25PLZ3720
26BROADWAY3713
27PIKE3665
28A2088
29TRL1945
30CTR1901
31PARKWAY1825
32LANE1787
33TPKE1646
34B1613
35BOULEVARD1487
36SOUTH1458
37CENTER1254
38HIGHWAY1242
39SQ1148
40REAL1138
41PARK1125
42FWY1011