In [1]:
# @hidden_cell
# This function is used to setup the access of Spark to your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
setHadoopConfigWithCredentials_d3bd5b94a9334de59a55a7fed2bedeaa <- function(name) {
    # Writes the Swift connection settings for the service `name` into the
    # Hadoop configuration obtained from the Spark context `sc`, so that
    # swift:// paths addressed to that service can be read through Spark.
    #
    # name: service name; every key is stored as "fs.swift.service.<name>.<suffix>".
    # Returns (invisibly) the result of the final setBoolean call.
    #
    # NOTE: the tenant, username and password are embedded below in plain
    # text -- remove them before sharing this notebook.

    hadoop_conf <- SparkR:::callJMethod(sc, "hadoopConfiguration")

    # Build the fully qualified configuration key for one setting.
    conf_key <- function(suffix) {
        paste0("fs.swift.service.", name, ".", suffix)
    }

    # String-valued settings, applied in this order.
    string_settings <- c(
        "auth.url"             = "https://identity.open.softlayer.com/v3/auth/tokens",
        "auth.endpoint.prefix" = "endpoints",
        "tenant"               = "6aaf54352357483486ee2d4981f8ef15",
        "username"             = "c0eebedc019f4413be3f3d656821b35f",
        "password"             = "ji[T[l.(7D&gld*5",
        "region"               = "dallas"
    )
    for (suffix in names(string_settings)) {
        SparkR:::callJMethod(hadoop_conf, "set", conf_key(suffix), string_settings[[suffix]])
    }

    # The "public" flag is a boolean and therefore goes through setBoolean.
    invisible(SparkR:::callJMethod(hadoop_conf, "setBoolean", conf_key("public"), FALSE))
}

# Service name under which the Object Storage settings are registered; later
# cells reuse this global when building the swift:// URL.
name <- "keystone"
# Store the credentials for that service in the Hadoop configuration.
setHadoopConfigWithCredentials_d3bd5b94a9334de59a55a7fed2bedeaa(name)

# Initialise the SparkR session under the given application name; invisible()
# keeps the returned session object out of the cell output.
invisible(sparkR.session(appName = "test SparkSession R"))

In [2]:
# Read bearing1_1_acc_transformed_youtube.json from the "coursera" container
# of the Swift service `name` into a SparkDataFrame and preview the first rows.
# The former `source = ...CSVFileFormat` and `header = "true"` arguments were
# removed: they are CSV reader settings, while read.json always loads through
# the JSON data source, so they only obscured what the call does.
df.data.1 <- read.json(
  paste0("swift://", "coursera", ".", name, "/",
         "bearing1_1_acc_transformed_youtube.json")
)
head(df.data.1)


cluster  hacc    ts           vacc
121149   -0.018  65509065660  -0.077
121149    0.623  65509065700  -0.189
121149    0.774  65509065740  -0.424
121149    0.441  65509065780   0.749
121149    0.419  65509065820   0.080
121149    0.095  65509065860  -0.183

In [3]:
# Row count of the full SparkDataFrame, echoed as the cell result.
n <- nrow(df.data.1)
n


7175680

In [4]:
# Expose the full DataFrame to Spark SQL under the view name "data".
createOrReplaceTempView(df.data.1, "data")
# Keep roughly 10% of the rows, ordered by ts.
# rand() is seeded because df_sample is lazy: it is evaluated once when it is
# counted and again when it is collected. With an unseeded rand() each
# evaluation draws a different sample, so the reported row count would not
# describe the rows that are actually collected and plotted.
df_sample <- sql("select * from data where rand(42) <= .1 order by ts asc")

In [5]:
# Row count of the sampled SparkDataFrame, echoed as the cell result.
n <- nrow(df_sample)
n


716937

In [6]:
# Pull the sampled rows out of Spark into a local R data frame.
df_sample_rdf <- collect(df_sample)

In [7]:
# List the column names of the collected sample.
colnames(df_sample_rdf)


  1. 'cluster'
  2. 'hacc'
  3. 'ts'
  4. 'vacc'

In [8]:
# Plot hacc against ts for the local sample (points joined by lines, in blue).
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if plot() fails.
with(df_sample_rdf, plot(ts, hacc, type = "o", col = "blue"))



In [9]:
# Plot vacc against ts for the local sample (points joined by lines, in blue).
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if plot() fails.
with(df_sample_rdf, plot(ts, vacc, type = "o", col = "blue"))



In [10]:
# Per-cluster summary computed in Spark SQL over the "data" view: mean and
# population standard deviation of hacc and vacc, ordered by cluster.
df_grouped <- sql("
    select cluster,
    mean(hacc) as mhacc,
    mean(vacc) as mvacc,
    STDDEV_POP(hacc) as sdhacc,
    STDDEV_POP(vacc) as sdvacc 
    from data 
    group by cluster 
    order by cluster asc")

In [11]:
# Bring the per-cluster summary out of Spark into a local R data frame.
df_grouped_local <- collect(df_grouped)

In [12]:
# Display the local per-cluster summary table.
df_grouped_local


cluster  mhacc  mvacc  sdhacc  sdvacc
10019 0.0022476562-3.925781e-040.3627197 0.4302456
10029 -0.0022738281 1.554687e-030.3682204 0.4326750
10039 0.0008089844-1.763281e-030.3931577 0.4370482
10049 0.0054156250-5.079297e-030.3498997 0.4270153
10059 0.0041875000-1.233203e-030.3714736 0.4273195
1009 -0.0025394531 9.703125e-040.3715793 0.4294702
101019 0.0046421875-1.289063e-030.3608793 0.4670141
101029 0.0023406250-3.859375e-030.3685824 0.4518146
101039 -0.0055210938-2.347266e-030.3887017 0.4499805
101049 0.0051218750-5.710937e-040.3826731 0.4472926
101059 -0.0005714844 1.106641e-030.3822009 0.4605071
10109 0.0003808594-6.777344e-040.4062093 0.4686285
101119 -0.0010058594 1.441406e-040.3936343 0.4639471
101129 -0.0022488281-1.958203e-030.3800240 0.4624880
101139 -0.0074386719 3.247266e-030.3862844 0.4670696
101149 -0.0028761719-5.558594e-040.3726797 0.4463406
101159 0.0017535156 1.824609e-030.3674971 0.4641172
10119 0.0016638672-3.743359e-030.3726542 0.4445911
101219 0.0048628906 1.996484e-030.3774910 0.4682232
101229 0.0017558594-4.573438e-030.3836683 0.4728681
101239 -0.0030753906-2.851563e-030.3850331 0.4651347
101249 -0.0002242187-1.403516e-030.3563611 0.4686641
101259 0.0007289063 3.769531e-030.3681425 0.4516404
10129 0.0031457031 1.756641e-030.3929738 0.4387494
101319 0.0062402344 5.101563e-030.4033246 0.4616818
101329 0.0029507812-4.098047e-030.3800251 0.4707059
101339 0.0017558594 4.511719e-030.3888577 0.4761625
101349 0.0044058594 1.291406e-030.3835008 0.4615697
101359 0.0020257813-1.210937e-050.3696726 0.4483117
10139 0.0009488281-1.429688e-030.3595514 0.4386519
...............
95519 0.0004937500 5.834766e-030.3821765 0.4356149
95529 0.0058195313-1.385547e-030.3877386 0.4298971
95539 0.0028746094-1.676953e-030.3819570 0.4222066
95549 0.0038984375 2.197656e-030.3738199 0.4233148
95559 -0.0019597656 2.327734e-030.3805108 0.4372089
9559 -0.0098207031-3.695313e-040.3819993 0.4405043
95619 0.0044500000-5.664062e-030.3829254 0.4368755
95629 -0.0010800781 5.296094e-030.3802359 0.4359904
95639 -0.0003269531 5.735937e-030.3853124 0.4493293
95649 -0.0001992188 2.846875e-030.3746379 0.4333714
95659 -0.0003617187 2.906250e-030.3881346 0.4358013
9569 0.0032769531-3.337109e-030.3775068 0.4258318
95719 -0.0043085937 3.450000e-030.3888033 0.4372342
95729 -0.0042617188-6.030078e-030.4034682 0.4263787
95739 -0.0046046875-1.734766e-030.3756350 0.4279893
95749 0.0028332031 3.406250e-040.3875505 0.4226225
95759 0.0031335937 1.645703e-030.3568333 0.4305995
9579 0.0009390625-2.144141e-030.4174787 0.4371679
95819 0.0029066406 4.707422e-030.3674526 0.4285304
95829 0.0078531250 1.258984e-030.3706580 0.4230978
95839 -0.0043539063 8.280469e-030.3722328 0.4302419
95849 -0.0029601562-2.869531e-030.3687146 0.4190386
95859 -0.0007355469 4.751172e-030.3860676 0.4291548
9589 0.0006195312 2.434375e-030.3666500 0.4175381
95919 -0.0055164063 3.279297e-030.3771977 0.4456441
95929 -0.0020414063-7.812500e-070.3684894 0.4223342
95939 -0.0017132812-5.035156e-030.3765468 0.4303222
95949 0.0008652344 2.117188e-030.3683631 0.4141480
95959 0.0024847656 6.983594e-030.3700152 0.4296316
9599 -0.0057679687-2.056250e-030.3463770 0.4333110

In [13]:
# Number of rows in the per-cluster summary (one row per cluster value).
nrow(df_grouped_local)


2618

In [14]:
# Plot sdhacc against cluster for the per-cluster summary.
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if plot() fails.
with(df_grouped_local, plot(cluster, sdhacc))



In [15]:
# Plot sdvacc against cluster for the per-cluster summary.
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if plot() fails.
with(df_grouped_local, plot(cluster, sdvacc))



In [16]:
# Plot mhacc against cluster for the per-cluster summary.
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if plot() fails.
with(df_grouped_local, plot(cluster, mhacc))



In [17]:
# Plot mvacc against cluster for the per-cluster summary.
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if plot() fails.
with(df_grouped_local, plot(cluster, mvacc))



In [18]:
# Install the wavelets package only when it is not already available, so that
# re-running the notebook does not download and reinstall it every time.
if (!requireNamespace("wavelets", quietly = TRUE)) {
  install.packages("wavelets")
}


Installing package into ‘/gpfs/global_fs01/sym_shared/YPProdSpark/user/s4c2-1e12ab68a45670-980ba6aaa6c3/R/libs’
(as ‘lib’ is unspecified)

In [19]:
library(wavelets)

In [20]:
# Discrete wavelet transform of the sampled vacc column, using the Haar filter
# with periodic boundary handling.
# with() limits the column lookup to this single call; unlike attach()/detach()
# it cannot leave the data frame on the search path if dwt() fails.
wt <- with(df_sample_rdf, dwt(vacc, filter = "haar", boundary = "periodic"))

In [21]:
# Concatenate every element of the W slot with the entry of the V slot at the
# transform's level (wt@level), flatten the result into one named vector and
# show its first six values.
head(unlist(c(wt@W,wt@V[[wt@level]])))


W11
0.189504617357995
W12
-0.298399061660723
W13
0.00212132034355961
W14
-0.208596500450032
W15
0.0127279220613579
W16
0.318198051533946

In [ ]: