In [57]:
library(DataComputing)
library(repr)
# Set the repr plot width/height options that control inline plot size.
options(repr.plot.width=9, repr.plot.height=5)
In [2]:
# Load the address data from the serialized RDS file "address.rds"
# (relative path, so it is resolved against the current working directory).
address <- readRDS("address.rds")
In [3]:
# Preview the first few rows of the loaded data.
head(address)
Out[3]:
In [4]:
# Draw a random sample of 50 rows so the address formats can be inspected by eye.
# NOTE(review): no seed is set, so each run produces a different sample;
# call set.seed() beforehand if the outputs below need to be reproducible.
sampled_address <- address %>%
  sample_n(size = 50)
sampled_address
Out[4]:
In [5]:
# Match: sampled rows whose address contains "<digits> <uppercase letters> ST".
# The pattern is unanchored, so it can match anywhere in the string.
filter(sampled_address, grepl("[0-9]+ [A-Z]+ ST", address))
Out[5]:
In [6]:
# Not match: sampled rows whose address does NOT contain
# "<digits> <uppercase letters> ST" anywhere in the string.
filter(sampled_address, !grepl("[0-9]+ [A-Z]+ ST", address))
Out[6]:
In [7]:
# Match: keep the sampled rows where the address contains a number, a space,
# a run of uppercase letters, a space, and then "ST".
filter(sampled_address, grepl("[0-9]+ [A-Z]+ ST", address))
Out[7]:
In [8]:
# Not match: keep the sampled rows where the address has no
# "<digits> <uppercase letters> ST" substring.
filter(sampled_address, !grepl("[0-9]+ [A-Z]+ ST", address))
Out[8]:
In [9]:
# Show sampled addresses that use none of the street endings found so far,
# to discover which other endings exist.
pattern <- "(ST|RD|ROAD)"
sampled_address %>%
  filter(
    # Does not contain one of the already-known street endings.
    !grepl(pattern, address),
    # Does not end in an apartment/unit number, i.e. " APT <digits>" or
    # " UNIT <digits>" at the end of the string. The alternation is grouped so
    # the trailing "<digits>$" applies to both APT and UNIT; ungrouped, the
    # regex would match " APT" anywhere in the address.
    !grepl(" (APT|UNIT) [[:digit:]]+$", address)
  )
Out[9]:
We now extract the street endings from the whole dataset and plot how often each one occurs.
In [92]:
# Street endings (the last space-separated word of each address) that appear
# at least 1000 times in the whole dataset, with their counts.
st_ends <- address %>%
  select(address) %>%
  # Strip trailing periods first so the "$"-anchored patterns below see the
  # real end of the address (e.g. "APT 5." becomes "APT 5").
  mutate(address = gsub("\\.+$", "", address)) %>%
  # Drop addresses ending in an apartment/unit number. The alternation is
  # grouped so the trailing "<digits>$" applies to both APT and UNIT.
  filter(!grepl(" (APT|UNIT) [[:digit:]]+$", address)) %>%
  # Take the last token of each address as its street ending. vapply() always
  # returns a character vector and fails loudly if an address has no tokens,
  # whereas sapply() would silently return a list in that case.
  mutate(
    st_end = factor(
      vapply(strsplit(address, " "), tail, FUN.VALUE = character(1), n = 1L)
    )
  ) %>%
  # Discard endings that finish with a digit (house, route or unit numbers).
  filter(!grepl("[0-9]+$", st_end)) %>%
  group_by(st_end) %>%
  summarise(count = n()) %>%
  # Keep only the frequent endings.
  filter(count >= 1000)
In [93]:
# Bar chart with one bar per street ending, bar height taken directly from
# the precomputed `count` column; x-axis labels are rotated 90 degrees.
ggplot(st_ends, aes(x = st_end, y = count)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Now we list all of these street endings, sorted from most to least frequent.
In [94]:
# Street endings ordered by descending count.
arrange(st_ends, desc(count))
Out[94]: