In [1]:
import sys
!{sys.executable} -m pip install --upgrade google-cloud-bigquery


Collecting google-cloud-bigquery
  Downloading https://files.pythonhosted.org/packages/72/e1/1ae3f8024e1d011bc567d54ec81e8c9afd08d107a326bd109e578475415d/google_cloud_bigquery-1.6.0-py2.py3-none-any.whl (83kB)
Collecting google-api-core<2.0.0dev,>=1.0.0 (from google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/e4/3a/f9a5746a4d1c03e4ae6d4fdea0b4275cd80320dde7b2a44439cf9e913e33/google_api_core-1.5.0-py2.py3-none-any.whl (62kB)
Collecting google-resumable-media>=0.2.1 (from google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/77/95/2e4020a54366423ddba715f89fb7ca456c8f048b15cada6cd6a54cf10e8c/google_resumable_media-0.3.1-py2.py3-none-any.whl
Collecting google-cloud-core<0.29dev,>=0.28.0 (from google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/0f/41/ae2418b4003a14cf21c1c46d61d1b044bf02cf0f8f91598af572b9216515/google_cloud_core-0.28.1-py2.py3-none-any.whl
Requirement not upgraded as not directly required: googleapis-common-protos<2.0dev,>=1.5.3 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (1.5.3)
Requirement not upgraded as not directly required: six>=1.10.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (1.11.0)
Requirement not upgraded as not directly required: requests<3.0.0dev,>=2.18.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2.18.4)
Requirement not upgraded as not directly required: pytz in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2018.5)
Collecting google-auth<2.0.0dev,>=0.4.0 (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/58/cb/96dbb4e50e7a9d856e89cc9c8e36ab1055f9774f7d85f37e2156c1d79d9f/google_auth-1.5.1-py2.py3-none-any.whl (65kB)
Requirement not upgraded as not directly required: setuptools>=34.0.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (39.2.0)
Requirement not upgraded as not directly required: protobuf>=3.4.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (3.6.1)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (3.0.4)
Requirement not upgraded as not directly required: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2.6)
Requirement not upgraded as not directly required: urllib3<1.23,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (1.22)
Requirement not upgraded as not directly required: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2018.10.15)
Collecting cachetools>=2.0.0 (from google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery)
  Downloading https://files.pythonhosted.org/packages/0a/58/cbee863250b31d80f47401d04f34038db6766f95dea1cc909ea099c7e571/cachetools-2.1.0-py2.py3-none-any.whl
Requirement not upgraded as not directly required: rsa>=3.1.4 in /opt/conda/lib/python3.6/site-packages (from google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (4.0)
Requirement not upgraded as not directly required: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (0.2.2)
Requirement not upgraded as not directly required: pyasn1>=0.1.3 in /opt/conda/lib/python3.6/site-packages (from rsa>=3.1.4->google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (0.4.4)
mkl-random 1.0.1 requires cython, which is not installed.
Installing collected packages: cachetools, google-auth, google-api-core, google-resumable-media, google-cloud-core, google-cloud-bigquery
Successfully installed cachetools-2.1.0 google-api-core-1.5.0 google-auth-1.5.1 google-cloud-bigquery-1.6.0 google-cloud-core-0.28.1 google-resumable-media-0.3.1
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [11]:
import sys
!{sys.executable} -m pip install cython pandas-gbq


Requirement already satisfied: cython in /opt/conda/lib/python3.6/site-packages (0.29)
Requirement already satisfied: pandas-gbq in /opt/conda/lib/python3.6/site-packages (0.7.0)
Requirement already satisfied: pandas in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (0.23.4)
Requirement already satisfied: google-auth in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (1.5.1)
Requirement already satisfied: google-cloud-bigquery>=0.32.0 in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (1.6.0)
Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (0.2.0)
Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (39.2.0)
Requirement already satisfied: python-dateutil>=2.5.0 in /opt/conda/lib/python3.6/site-packages (from pandas->pandas-gbq) (2.7.3)
Requirement already satisfied: pytz>=2011k in /opt/conda/lib/python3.6/site-packages (from pandas->pandas-gbq) (2018.5)
Requirement already satisfied: numpy>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from pandas->pandas-gbq) (1.15.2)
Requirement already satisfied: cachetools>=2.0.0 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (2.1.0)
Requirement already satisfied: rsa>=3.1.4 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (4.0)
Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (0.2.2)
Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (1.11.0)
Requirement already satisfied: google-api-core<2.0.0dev,>=1.0.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-bigquery>=0.32.0->pandas-gbq) (1.5.0)
Requirement already satisfied: google-resumable-media>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-cloud-bigquery>=0.32.0->pandas-gbq) (0.3.1)
Requirement already satisfied: google-cloud-core<0.29dev,>=0.28.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-bigquery>=0.32.0->pandas-gbq) (0.28.1)
Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.6/site-packages (from google-auth-oauthlib->pandas-gbq) (1.0.0)
Requirement already satisfied: pyasn1>=0.1.3 in /opt/conda/lib/python3.6/site-packages (from rsa>=3.1.4->google-auth->pandas-gbq) (0.4.4)
Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.5.3 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (1.5.3)
Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (2.18.4)
Requirement already satisfied: protobuf>=3.4.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (3.6.1)
Requirement already satisfied: oauthlib>=0.6.2 in /opt/conda/lib/python3.6/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib->pandas-gbq) (2.1.0)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (3.0.4)
Requirement already satisfied: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (2.6)
Requirement already satisfied: urllib3<1.23,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (1.22)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (2018.10.15)
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [2]:
from google.cloud import bigquery
client = bigquery.Client()
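
bigquery.Client() picks up Application Default Credentials and a default project from the environment, which is automatic on a Google Cloud notebook VM. Outside such an environment both can be supplied explicitly; the project ID and key-file name below are placeholders, not values from this notebook:

# Explicit alternatives (sketch; substitute real values):
client = bigquery.Client(project="my-project-id")
# or, from a service-account key file:
client = bigquery.Client.from_service_account_json("service-account-key.json")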

In [5]:
query_job = client.query("""
	WITH
	  table1 AS (
	  SELECT
	    project_short_name,
	    case_barcode,
	    IF (gender = 'FEMALE',
	      1,
	      0) AS F,
	    IF (gender = 'MALE',
	      1,
	      0) AS M
	  FROM
	    `isb-cgc.TCGA_bioclin_v0.Clinical`
	  GROUP BY
	    project_short_name,
	    case_barcode,
	    gender)
	    -- 
	    --
	SELECT
	  project_short_name,
	  SUM(M) AS M_count,
	  SUM(F) AS F_count
	FROM
	  table1
	GROUP BY
	  project_short_name
	""")

results = query_job.result()

In [6]:
for row in results:
    print("{} : {} : {}".format(row.project_short_name, row.F_count, row.M_count))


TCGA-UCEC : 548 : 0
TCGA-CESC : 307 : 0
TCGA-OV : 587 : 0
TCGA-UCS : 57 : 0
TCGA-BRCA : 1085 : 12
TCGA-CHOL : 25 : 20
TCGA-DLBC : 26 : 22
TCGA-STAD : 158 : 285
TCGA-LGG : 230 : 285
TCGA-ACC : 60 : 32
TCGA-SKCM : 180 : 290
TCGA-UVM : 35 : 45
TCGA-BLCA : 108 : 304
TCGA-KICH : 51 : 62
TCGA-THYM : 60 : 64
TCGA-MESO : 16 : 71
TCGA-PCPG : 101 : 78
TCGA-KIRC : 191 : 346
TCGA-READ : 78 : 92
TCGA-PAAD : 83 : 102
TCGA-LAML : 91 : 109
TCGA-GBM : 230 : 366
TCGA-LUSC : 131 : 373
TCGA-SARC : 142 : 119
TCGA-HNSC : 142 : 386
TCGA-TGCT : 0 : 134
TCGA-THCA : 371 : 136
TCGA-ESCA : 27 : 158
TCGA-KIRP : 77 : 214
TCGA-LUAD : 280 : 242
TCGA-COAD : 216 : 243
TCGA-PRAD : 0 : 500
TCGA-LIHC : 122 : 255

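Instead of iterating over rows, the client library can also hand back the result set as a pandas DataFrame directly (pandas must be installed); a minimal sketch, assuming the SQL string from In [5] is bound to a variable named sql:

df = client.query(sql).result().to_dataframe()
df.head()
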
In [52]:
import pandas

In [14]:
projectid = "isb-cgc-02-0001"
query = """
	WITH
	  table1 AS (
	  SELECT
	    project_short_name,
	    case_barcode,
	    IF (gender = 'FEMALE',
	      1,
	      0) AS F,
	    IF (gender = 'MALE',
	      1,
	      0) AS M
	  FROM
	    `isb-cgc.TCGA_bioclin_v0.Clinical`
	  GROUP BY
	    project_short_name,
	    case_barcode,
	    gender)
	    -- 
	    --
	SELECT
	  project_short_name,
	  SUM(M) AS M_count,
	  SUM(F) AS F_count
	FROM
	  table1
	GROUP BY
	  project_short_name
	"""
data_frame = pandas.read_gbq(query, project_id=projectid, dialect='standard')

In [15]:
data_frame.shape


Out[15]:
(33, 3)

In [16]:
data_frame


Out[16]:
project_short_name M_count F_count
0 TCGA-UCEC 0 548
1 TCGA-CESC 0 307
2 TCGA-OV 0 587
3 TCGA-UCS 0 57
4 TCGA-BRCA 12 1085
5 TCGA-CHOL 20 25
6 TCGA-DLBC 22 26
7 TCGA-STAD 285 158
8 TCGA-LGG 285 230
9 TCGA-ACC 32 60
10 TCGA-SKCM 290 180
11 TCGA-UVM 45 35
12 TCGA-BLCA 304 108
13 TCGA-KICH 62 51
14 TCGA-THYM 64 60
15 TCGA-MESO 71 16
16 TCGA-PCPG 78 101
17 TCGA-KIRC 346 191
18 TCGA-READ 92 78
19 TCGA-PAAD 102 83
20 TCGA-LAML 109 91
21 TCGA-GBM 366 230
22 TCGA-LUSC 373 131
23 TCGA-SARC 119 142
24 TCGA-HNSC 386 142
25 TCGA-TGCT 134 0
26 TCGA-THCA 136 371
27 TCGA-ESCA 158 27
28 TCGA-KIRP 214 77
29 TCGA-LUAD 242 280
30 TCGA-COAD 243 216
31 TCGA-PRAD 500 0
32 TCGA-LIHC 255 122

In [19]:
import matplotlib.pyplot as plt
# Plot the male and female counts per project as a bar chart;
# df2.plot.bar() creates its own figure, so no extra plt.figure() is needed.
df2 = pandas.DataFrame(data_frame, columns=['M_count', 'F_count'])
df2.plot.bar();


(bar chart of M_count and F_count per project, bars indexed by row number)
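
By default the bars are indexed by row number; labeling them with the project names makes the chart easier to read. A sketch:

ax = data_frame.set_index('project_short_name')[['M_count', 'F_count']].plot.bar(figsize=(12, 4))
ax.set_ylabel('case count')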

In [1]:
import sys
!{sys.executable} -m pip install pyspark findspark


Collecting pyspark
  Downloading https://files.pythonhosted.org/packages/5e/cb/d8ff49ba885e2c88b8cf2967edd84235ffa9ac301bffef657dfa5605a112/pyspark-2.3.2.tar.gz (211.9MB)
Collecting findspark
  Downloading https://files.pythonhosted.org/packages/b1/c8/e6e1f6a303ae5122dc28d131b5a67c5eb87cbf8f7ac5b9f87764ea1b1e1e/findspark-1.3.0-py2.py3-none-any.whl
Collecting py4j==0.10.7 (from pyspark)
  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
Building wheels for collected packages: pyspark
  Running setup.py bdist_wheel for pyspark ... done
  Stored in directory: /root/.cache/pip/wheels/be/7d/34/cd3cfbc75d8b6b6ae0658e5425348560b86d187fe3e53832cc
Successfully built pyspark
mkl-random 1.0.1 requires cython, which is not installed.
Installing collected packages: py4j, pyspark, findspark
Successfully installed findspark-1.3.0 py4j-0.10.7 pyspark-2.3.2
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [2]:
import findspark
findspark.init()
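
findspark.init() locates the Spark installation (via SPARK_HOME or common install paths) and adds its Python libraries to sys.path so that pyspark can be imported. If auto-detection fails, the location can be passed explicitly; on this Dataproc cluster Spark lives under /usr/lib/spark, as the tracebacks later in this notebook show:

findspark.init("/usr/lib/spark")  # explicit Spark home, if SPARK_HOME is unset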

In [4]:
from datetime import datetime
from pyspark.context import SparkContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.session import SparkSession

In [34]:
def vector_from_inputs(r):
    # Convert one row into the (label, feature vector) pair that Spark ML expects.
    return (float(r["label"]), Vectors.dense(float(r["EFGR"]),
                                             float(r["TP53"]),
                                             float(r["NOTCH1"]),
                                             float(r["GATA3"])))
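
For illustration, applying the function to a hypothetical row (the numbers here are made up) yields the (label, DenseVector) pair in the shape Spark ML expects:

row = {"label": 1, "EFGR": 5.2, "TP53": 0.4, "NOTCH1": 1.1, "GATA3": 7.8}
vector_from_inputs(row)
# -> (1.0, DenseVector([5.2, 0.4, 1.1, 7.8]))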

In [6]:
# Use Cloud Dataproc's automatically propagated configurations to get
# the Cloud Storage bucket and Google Cloud Platform project for this
# cluster.
sc = SparkContext()
spark = SparkSession(sc)

In [7]:
bucket = spark._jsc.hadoopConfiguration().get("fs.gs.system.bucket")
project = spark._jsc.hadoopConfiguration().get("fs.gs.project.id")

print(bucket)
print(project)


dataproc-6b064c10-086c-44db-b3b5-f14e410e0c13-us
isb-cgc-02-0001

In [28]:
# Set an input directory for reading data from BigQuery.
todays_date = datetime.strftime(datetime.today(), "%Y-%m-%d-%H-%M-%S")
input_directory = "gs://qotm_oct_2018" + todays_date
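
Note that the concatenation above has no separator, so the timestamp is fused directly onto the bucket name (visible in the conf printout below). A safer form keeps the bucket and a dated prefix distinct; the prefix name here is an arbitrary choice:

input_directory = "gs://qotm_oct_2018/pyspark_input_" + todays_date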

In [35]:
# Set the configuration for importing data from BigQuery.
# Specifically, make sure to set the project ID and bucket for Cloud Dataproc,
# and the project ID, dataset, and table names for BigQuery.

conf = {
    # Input Parameters
    "mapred.bq.project.id": project,
    "mapred.bq.gcs.bucket": bucket,
    "mapred.bq.temp.gcs.path": input_directory,
    "mapred.bq.input.project.id": project,
    "mapred.bq.input.dataset.id": "spark_job",
    "mapred.bq.input.table.id": "tcga_spark"
}
print(conf)


{'mapred.bq.project.id': 'isb-cgc-02-0001', 'mapred.bq.gcs.bucket': 'dataproc-6b064c10-086c-44db-b3b5-f14e410e0c13-us', 'mapred.bq.temp.gcs.path': 'gs://qotm_oct_20182018-10-29-23-47-53', 'mapred.bq.input.project.id': 'isb-cgc-02-0001', 'mapred.bq.input.dataset.id': 'spark_job', 'mapred.bq.input.table.id': 'tcga_spark'}

In [36]:
# Read the data from BigQuery into Spark as an RDD.
table_data = spark.sparkContext.newAPIHadoopRDD(
    "com.google.cloud.hadoop.io.bigquery.JsonTextBigQueryInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "com.google.gson.JsonObject",
    conf=conf)


# Extract the JSON strings from the RDD.
table_json = table_data.map(lambda x: x[1])


---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-36-8c6a49b7a66f> in <module>
      4     "org.apache.hadoop.io.LongWritable",
      5     "com.google.gson.JsonObject",
----> 6     conf=conf)
      7 
      8 

/usr/lib/spark/python/pyspark/context.py in newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter, valueConverter, conf, batchSize)
    700         jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
    701                                                    valueClass, keyConverter, valueConverter,
--> 702                                                    jconf, batchSize)
    703         return RDD(jrdd, self)
    704 

/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134 
   1135         for temp_arg in temp_args:

/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    317                 raise Py4JJavaError(
    318                     "An error occurred while calling {0}{1}{2}.\n".
--> 319                     format(target_id, ".", name), value)
    320             else:
    321                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD.
: java.io.IOException: Conflict occurred creating export directory. Path gs://qotm_oct_20182018-10-29-23-47-53 already exists
	at com.google.cloud.hadoop.io.bigquery.AbstractExportToCloudStorage.prepare(AbstractExportToCloudStorage.java:68)
	at com.google.cloud.hadoop.io.bigquery.AbstractBigQueryInputFormat.getSplits(AbstractBigQueryInputFormat.java:136)
	at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:125)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
	at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
	at org.apache.spark.api.python.SerDeUtil$.pairRDDToPython(SerDeUtil.scala:203)
	at org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:596)
	at org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
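
The Py4JJavaError reports that the connector's temporary export path already exists, typically left behind when this cell is re-run with the same input_directory (the timestamp was fixed back in In [28]). Deleting the stale path, or regenerating input_directory with a fresh timestamp, lets the import proceed. A sketch, assuming gsutil is available on the cluster:

!gsutil -m rm -r {input_directory}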

In [37]:
# Load the JSON strings as a Spark DataFrame. (This assumes the import above
# has succeeded on a re-run, after the conflicting export path was cleared.)
tcga_data = spark.read.json(table_json)

# Create a view so that Spark SQL queries can be run against the data.
tcga_data.createOrReplaceTempView("tcga_view")
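
To sanity-check the load, the inferred schema can be printed; the columns should include the label and the four gene-expression fields used below:

tcga_data.printSchema()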

In [38]:
# As a precaution, run a query in Spark SQL to ensure no NULL values exist.
sql_query = """
SELECT *
FROM tcga_view
WHERE label IS NOT NULL
  AND EFGR IS NOT NULL
  AND TP53 IS NOT NULL
  AND GATA3 IS NOT NULL
  AND NOTCH1 IS NOT NULL
"""
clean_data = spark.sql(sql_query)
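
A quick count confirms how many rows survive the NULL filter (the number depends on the table contents):

print(clean_data.count())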

In [39]:
# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(["label",
                                                             "features"])
training_data.cache()


Out[39]:
DataFrame[label: double, features: vector]

In [45]:
# Construct a new LogisticRegression object and fit the training data.
# https://spark.apache.org/docs/latest/ml-classification-regression.html#binomial-logistic-regression
lr = LogisticRegression(maxIter=5, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(training_data)
# Print the model summary.
print("Coefficients:" + str(lrModel.coefficients))
print("Intercept:" + str(lrModel.intercept))


Coefficients:[50.29267918772197,0.0,0.16224745918590844,-0.31689142394240727]
Intercept:-0.9932429393509908
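
To see the fitted model in action, its predictions on the training data can be inspected with the standard Spark ML transform API; a sketch:

predictions = lrModel.transform(training_data)
predictions.select("label", "probability", "prediction").show(5)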

In [46]:
# Get the model performance metrics from the training summary.
trainingSummary = lrModel.summary

# Print the objective (loss) value for each training iteration,
# matching the objectiveHistory output below.
print("objectiveHistory:")
for objective in trainingSummary.objectiveHistory:
    print(objective)

# Obtain the receiver operating characteristic as a DataFrame, and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-measure.
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)


objectiveHistory:
0.5835274717778136
0.5801510529881112
0.5608210301759466
0.5600787243659968
0.559400795893893
0.559111570022316
+--------------------+-------------------+
|                 FPR|                TPR|
+--------------------+-------------------+
|                 0.0|                0.0|
|0.001646090534979...|0.03111111111111111|
|0.002469135802469...|0.06444444444444444|
|0.003292181069958...|0.09777777777777778|
|0.003292181069958...|0.13333333333333333|
|0.004938271604938...|0.16444444444444445|
|0.005761316872427984|0.19777777777777777|
|0.007407407407407408| 0.2288888888888889|
| 0.00905349794238683|               0.26|
|0.009876543209876543|0.29333333333333333|
|0.009876543209876543| 0.3288888888888889|
|0.009876543209876543|0.36444444444444446|
|0.010699588477366255| 0.3977777777777778|
|0.011522633744855968| 0.4311111111111111|
|0.012345679012345678|0.46444444444444444|
|0.013991769547325103| 0.4955555555555556|
| 0.01646090534979424| 0.5244444444444445|
| 0.01728395061728395| 0.5577777777777778|
|0.018930041152263374| 0.5888888888888889|
| 0.02139917695473251| 0.6177777777777778|
+--------------------+-------------------+
only showing top 20 rows

areaUnderROC: 0.9783191586648377
Out[46]:
LogisticRegression_45c8b09097d92fa6fdcb
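
One caveat: setThreshold above configures the estimator lr, not the already-fitted lrModel (the Out[46] value is just the estimator echoing itself). In PySpark 2.3 the threshold setter lives on the estimator, so the tuned threshold takes effect on a refit:

lrModel = lr.fit(training_data)  # refit so bestThreshold is baked into the model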

In [58]:
import pandas
import matplotlib.pyplot as plt
# Plot the training ROC curve (TPR vs. FPR) as a scatter plot;
# the pandas plot call creates its own figure, so no extra plt.figure() is needed.
trainingSummary.roc.toPandas().plot.scatter('FPR', 'TPR')


Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f956166c898>
(scatter plot of the training ROC curve: TPR vs. FPR)

In [39]:
sc.stop()