Stat 133

Homework 9

Part I


In [9]:
library(DataComputing)
library(rvest)
library(lubridate)
page = "List_of_nuclear_reactors"
xpath = '//*[@id="mw-content-text"]/table' 
table_list <- page %>%
  read_html() %>%
  html_nodes(xpath = xpath) %>%
  html_table(fill = TRUE)

Find the table element


In [10]:
table <- table_list[[23]]
head(table)


Out[10]:
NameReactor No.ReactorNAStatusCapacity in MWNAConstruction Start DateCommercial Operation DateClosure
1TypeModelNetGrossNANANANANANA
2Fukushima Daiichi1BWRBWR-3Inoperable43946025 July 196726 March 197119 May 2011
3Fukushima Daiichi2BWRBWR-4Inoperable7607849 June 196918 July 197419 May 2011
4Fukushima Daiichi3BWRBWR-4Inoperable76078428 December 197027 March 197619 May 2011
5Fukushima Daiichi4BWRBWR-4Shut down/ Inoperable76078412 February 197312 October 197819 May 2011
6Fukushima Daiichi5BWRBWR-4Shut down76078422 May 197218 April 197817 December 2013

In [11]:
names(table)


Out[11]:
  1. 'Name'
  2. 'Reactor No.'
  3. 'Reactor'
  4. 'NA'
  5. 'Status'
  6. 'Capacity in MW'
  7. 'NA'
  8. 'Construction Start Date'
  9. 'Commercial Operation Date'
  10. 'Closure'

Look at it using View()


In [13]:
new_names <- c("name", "reactor_no", "type", "model",
               "status", "net", "gross", "construction_start",
               "operation_start", "closure")
names(table) <- new_names
table <- table %>% filter(row_number() != 1)
head(table)


Out[13]:
namereactor_notypemodelstatusnetgrossconstruction_startoperation_startclosure
1Fukushima Daiichi2BWRBWR-4Inoperable7607849 June 196918 July 197419 May 2011
2Fukushima Daiichi3BWRBWR-4Inoperable76078428 December 197027 March 197619 May 2011
3Fukushima Daiichi4BWRBWR-4Shut down/ Inoperable76078412 February 197312 October 197819 May 2011
4Fukushima Daiichi5BWRBWR-4Shut down76078422 May 197218 April 197817 December 2013
5Fukushima Daiichi6BWRBWR-5Shut down1067110026 October 197324 October 197917 December 2013
6Fukushima Daini1BWRBWR-5Operation suspended1067110016 March 197620 April 1982

A quick visualization


In [15]:
table %>% 
  separate(construction_start, into=c("day", "month", "year"), sep=" ") %>%
  mutate(year=as.numeric(year)) %>%
  ggplot(aes(y=net, x=year, color=type)) + geom_point() + labs(x="construction start")


Warning message:
: Too few values at 3 locations: 20, 21, 22Warning message:
: Removed 3 rows containing missing values (geom_point).

Construction delays


In [16]:
table %>% 
  separate(construction_start, into=c("day1", "month1", "year1"), sep=" ") %>%
  separate(operation_start, into=c("day2", "month2", "year2"), sep=" ") %>%
  mutate(year1=as.numeric(year1), year2=as.numeric(year2)) %>%
  ggplot() + geom_segment(aes(x=year1, xend=year2, y=name, yend=name)) + labs(x="construction start", y="reactor site")


Warning message:
: Too few values at 3 locations: 20, 21, 22Warning message:
: Too few values at 5 locations: 20, 21, 22, 41, 51Warning message:
: Removed 5 rows containing missing values (geom_segment).

Part II

ls
less lifeexpectancy.csv
wc -l lifeexpectancy.csv
head -1 lifeexpectancy.csv #column 152, 177, 202
cut -f 1,152,177,202 -d ',' lifeexpectancy.csv
cut -f 1,152,177,202 -d ',' lifeexpectancy.csv | egrep "[0-9]" > lifeexpectancy.clean.csv
R CMD BATCH makemaps-1.R lifeexpectancy.clean.csv | tee message.txt
ls plots