Vahid Mirjalili, Data Scientist
2. Downloading and Reading Data
Motivation:
Raw Data -> Processing script -> tidy data -> data analysis -> data communication
In [1]:
if (!file.exists("data/")) {
    dir.create("data/")
}
download.file() downloads a file from the internet:
In [2]:
fileUrl <- "http://www.brianknaus.com/software/srtoolbox/s_4_1_sequence80.txt"
download.file(fileUrl, destfile="./data/s_4_1_sequence80.txt", method="curl")
read.table()
read.csv()
read.fwf() reads files in fixed-width format.
If the file contains quotation marks, set quote="".
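A minimal sketch (the file names here are hypothetical):
In [ ]:
## read a tab-delimited file whose text fields contain quote characters
rawData <- read.table("./data/example.txt", sep="\t", header=TRUE, quote="")
## read.fwf() takes column widths instead of a separator,
## e.g. three columns of 10, 5 and 8 characters:
fwfData <- read.fwf("./data/fixed_width.txt", widths=c(10, 5, 8))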
In [3]:
if (!file.exists("data")) { dir.create("data") }
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xlsx?accessType=DOWNLOAD"
download.file(fileUrl, destfile="./data/cameras.xlsx", method="curl")
dateDownloaded <- date()
require(xlsx)
cameraData <- read.xlsx("data/cameras.xlsx", sheetIndex=1, header=T)
head(cameraData)
Out[3]:
In [4]:
read.xlsx("data/cameras.xlsx", sheetIndex=1, header=T, colIndex=2:3, rowIndex=1:5)
Out[4]:
In [5]:
library(jsonlite)
jsonData <- fromJSON("https://api.github.com/users/mirjalil/repos")
head(names(jsonData))
names(jsonData$owner)
jsonData$owner$login
Out[5]:
In [6]:
myjson <- toJSON(iris[1:2,], pretty=T)
cat(myjson)
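fromJSON() converts the JSON string back into a data.frame; a quick round-trip sketch:
In [ ]:
## round trip: parse the JSON string back into a data.frame
iris2 <- fromJSON(myjson)
head(iris2)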
In [7]:
require(XML)
fileUrl <- "http://www.w3schools.com/xml/simple.xml"
xmlDoc <- xmlTreeParse(fileUrl, useInternal=T)
rootNode <- xmlRoot(xmlDoc)
names(rootNode)
xmlName(rootNode)
Out[7]:
In [8]:
rootNode[[1]]
rootNode[[1]][[1]]
Out[8]:
In [9]:
xmlSApply(rootNode, xmlValue)
Out[9]:
In [10]:
xpathSApply(rootNode, "//name", xmlValue)
Out[10]:
In [11]:
xpathSApply(rootNode, "//price", xmlValue)
Out[11]:
In [10]:
library(data.table)
df <- data.frame(x=rnorm(9), y=rep(c("a", "b", "c"), each=3), z=rnorm(9))
dt <- data.table(x=rnorm(9), y=rep(c("a", "b", "c"), each=3), z=rnorm(9))
## to see all the data.tables in memory
tables()
In [11]:
## subsetting
dt[1:3, list(x,y)]
dt[, table(y)]
In [12]:
## Passing a list of expressions to evaluate inside the data.table:
dt[, list(mean(x), sum(z))]
In [13]:
is.data.table(dt)
dt[,w:=z^2]
# Note: some data.table commands are still not supported in the IPython R kernel
Out[13]:
In [14]:
dt[,a:=x>0]
dt[,b:=mean(x+w), by=a]
In [15]:
dt[, m:={tmp <- x+z; log2(tmp+5)}]
In [16]:
DT <- data.table(x=rep(c("a", "b", "c"), each=100), y=rnorm(300))
setkey(DT, x)
DT['a']
In [17]:
DT1 <- data.table(x=c('a', 'a', 'b', 'dt1'), y=1:4)
DT2 <- data.table(x=c('a', 'b', 'dt2'), z=5:7)
setkey(DT1, x)
setkey(DT2, x)
merge(DT1, DT2)
In [18]:
big_df <- data.frame(x=rnorm(1E6), y=rnorm(1E6))
tfile=tempfile()
write.table(big_df, file=tfile, row.names=T, sep="\t", quote=F)
## timing by fread:
system.time(fread(tfile))
## timing by read.table
system.time(read.table(tfile, header=T, sep="\t"))
Out[18]:
In [19]:
require(RMySQL)
ucscDb <- dbConnect(MySQL(), user="genome",
                    host="genome-mysql.cse.ucsc.edu")
result_allDBs <- dbGetQuery(ucscDb, "show databases;")
dbDisconnect(ucscDb)
head(result_allDBs)
Out[19]:
In [20]:
#Specify a database:
hg19 <- dbConnect(MySQL(), user="genome", db="hg19",
                  host="genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
allTables[1:5]
Out[20]:
In [21]:
# list all the fields (column names) of a table:
dbListFields(hg19, "affyU133Plus2")
Out[21]:
In [22]:
# find out the number of rows of a specific table:
dbGetQuery(hg19, "SELECT count(*) from affyU133Plus2")
Out[22]:
In [23]:
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData[,1:8], n=3)
Out[23]:
In [24]:
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 30")
affyMis <- fetch(query)
quantile(affyMis$misMatches)
Out[24]:
In [25]:
affyMisSmall <- fetch(query, n=10)
dbClearResult(query)
dim(affyMisSmall)
Out[25]:
In [26]:
dbDisconnect(hg19)
Out[26]:
HDF == Hierarchical Data Format
groups containing datasets + metadata
datasets are multidimensional arrays of data
To install, use
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
In [27]:
library(rhdf5)
# create a new hdf5 file (the examples/ directory must exist first)
if (!file.exists("examples")) dir.create("examples")
created <- h5createFile("examples/example.h5")
# the output is just a boolean
created
Out[27]:
In [28]:
created <- h5createGroup("examples/example.h5", "foo")
created <- h5createGroup("examples/example.h5", "baa")
created <- h5createGroup("examples/example.h5", "foo/subgroup")
h5ls("examples/example.h5")
Out[28]:
In [29]:
A <- matrix(1:10, nr=5, nc=2)
h5write(A, "examples/example.h5", "foo/A")
B <- array(seq(0.1, 2.0, by=0.1), dim=c(5,2,2))
attr(B, "scale") <- "liter"
h5write(B, "examples/example.h5", "foo/subgroup/B")
h5ls("examples/example.h5")
Out[29]:
In [30]:
## Write a data.frame
df <- data.frame(1L:5L, seq(0,1,length.out=5),
                 c("ab", "cde", "fhgi", "a", "s"), stringsAsFactors=FALSE)
h5write(df, "examples/example.h5", "df")
h5ls("examples/example.h5")
Out[30]:
In [32]:
readA <- h5read("examples/example.h5", "foo/A")
readB <- h5read("examples/example.h5", "foo/subgroup/B")
read.df <- h5read("examples/example.h5", "df")
readA
Out[32]:
In [33]:
h5write(c(12,13,14), "examples/example.h5", "foo/A", index=list(1:3, 1))
h5read("examples/example.h5", "foo/A")
Out[33]:
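The "scale" attribute attached to B earlier is stored as metadata in the HDF5 file; it can be read back with h5readAttributes():
In [ ]:
## read the metadata attached to a dataset
h5readAttributes("examples/example.h5", "foo/subgroup/B")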
In [34]:
# create a connection
con <- url("http://scholar.google.com/citations?user=8YTgJ_cAAAAJ&hl=en")
# read lines from the connection
htmlCode <- readLines(con)
# close the connection
close(con)
In [19]:
library(XML)
url.address <- "http://scholar.google.com/citations?user=8YTgJ_cAAAAJ&hl=en"
html <- htmlTreeParse(url.address, useInternalNodes=T)
xpathSApply(html, "//title", xmlValue)
xpathSApply(html, "//td[@id='col-citedby']", xmlValue)
Out[19]:
In [24]:
library(httr)
html2 <- GET(url.address)
content2 <- content(html2, as="text")
parsedHtml <- htmlParse(content2, asText=TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)
Out[24]:
In [15]:
require(XML)
wcup <- htmlTreeParse("http://www.fifa.com/worldcup/matches/", useInternal=TRUE)
matches <- xpathSApply(wcup, "//span[starts-with(@class, 't-nText ')]", xmlValue)
#head(matches)
m_days <- xpathSApply(wcup, "//div[@class='mu-i-datetime']", xmlValue)
#head(m_days)
wcup.matches <- data.frame(
    team_A = matches[seq(1, 130, by=2)],
    team_B = matches[seq(2, 130, by=2)],
    datetime = m_days[1:65]
)
#head(wcup.matches)
## Print the German games:
wcup.matches[wcup.matches$team_A == "Germany" | wcup.matches$team_B == "Germany", ]
Out[15]:
Using the httr package to get data from APIs (Twitter, Facebook, GitHub, ...)
In [5]:
library(httr)
## read the api keys stored in a file
auth_keys = read.table("/home/vahid/Desktop/tw.api.keys")
myapp = oauth_app("twitter",
                  key=auth_keys[1,1], secret=auth_keys[2,1])
sig = sign_oauth1.0(myapp,
                    token=auth_keys[3,1], token_secret=auth_keys[4,1])
## Get the home timeline
homeTL = GET("https://api.twitter.com/1.1/statuses/home_timeline.json", sig)
## Extract its contents
json1 = content(homeTL)
json2 = jsonlite::fromJSON(toJSON(json1))
json2[1:2, 1:4]
Out[5]:
To learn more about other parameters of GET statuses/home_timeline, see:
https://dev.twitter.com/docs/api/1.1/get/statuses/home_timeline
Useful parameters:
since_id
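Such parameters can be passed via the query argument of GET(); a sketch reusing the sig object from above (the parameter values are made up):
In [ ]:
## request at most 5 tweets newer than a given status id (values are illustrative)
homeTL5 <- GET("https://api.twitter.com/1.1/statuses/home_timeline.json",
               sig, query=list(count=5, since_id="123456789"))
json5 <- jsonlite::fromJSON(httr::content(homeTL5, as="text"))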
Information on GitHub is useful for hiring programmers or scientists with particular skills.
To read data from the GitHub API:
In [ ]:
## register an application at https://github.com/settings/applications
## use http://localhost:1410 for the callback URL
myapp <- oauth_app("github", "74c8e68e2ca64160346d", secret="...")
myapp
> <oauth_app> github
> key: 74c8e68e2ca64160346d
> secret: <hidden>
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)
> Waiting for authentication in browser...
> Press Esc/Ctrl + C to abort
> Authentication complete.
req <- GET("https://api.github.com/users/mirjalil/repos", config(token = github_token))
stop_for_status(req)
content(req)
Use connections to interact directly with other files/sources:
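For example, a gzipped text file can be read through a connection without decompressing it first (a small sketch; the file name is hypothetical):
In [ ]:
## open a connection to a gzipped file, read a few lines, then close it
con <- gzfile("./data/example.txt.gz")
txt <- readLines(con, n=10)
close(con)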
In [11]:
# example: reading ARFF (Weka) data
library(foreign)
d.weka <- read.arff("examples/anneal.arff")
str(d.weka[,1:5])
In [8]:
# example: reading image from 101_ObjectCategories
library(jpeg)
d.img <- readJPEG("~/datasets/101_ObjectCategories/pyramid/image_0001.jpg")
str(d.img)