For more Queries: https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html
Part of the ISB-CGC: http://www.isb-cgc.org
In [0]:
    
from google.colab import auth
auth.authenticate_user()
print('Authenticated')
    
For instructions on the BigQuery transformation see: https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html
This first query demonstrates the 'magic' %%bigquery command.
In [0]:
    
%%bigquery --project YOUR-PROJECT-ID df
select 
  reference_name,
  start_position,
  end_position
from
  `isb-cgc.QotM.1000genomes`
limit 
  10
    
    Out[0]:
In [0]:
    
df = pd.io.gbq.read_gbq('''
select 
  reference_name as chr,
  start_position,
  end_position
from
  `isb-cgc.QotM.1000genomes`
limit 
  10
''', project_id=project_id, verbose=False, dialect='standard')
df.head()
    
    Out[0]:
In [0]:
    
df = pd.io.gbq.read_gbq('''
#standardsql
SELECT
  start_position,
  reference_name,
  reference_bases AS original,
  alternate_bases[ORDINAL(1)].alt AS alt
FROM
  `isb-cgc.QotM.1000genomes` AS v
WHERE
  ARRAY_LENGTH(alternate_bases) = 1
LIMIT 10
'''
, project_id=project_id, verbose=False, dialect='standard')        
   
df.head()
    
    Out[0]:
In [0]:
    
df = pd.io.gbq.read_gbq('''
SELECT
  start_position,
  reference_name,
  reference_bases AS original,
  alternate_bases[ORDINAL(1)].alt AS changed
FROM
  `isb-cgc.QotM.1000genomes` AS v
WHERE
  ARRAY_LENGTH(alternate_bases) = 1
  AND alternate_bases[ORDINAL(1)].alt IN ('A','C','G','T')
ORDER BY start_position 
LIMIT 10
''', project_id=project_id, verbose=False, dialect='standard')        
   
df.head()
    
    Out[0]:
In [0]:
    
df = pd.io.gbq.read_gbq('''
#standardsql
WITH
table1 AS (
SELECT
  start_position,
  reference_name,
  CONCAT( reference_bases, '->', alternate_bases[ORDINAL(1)].alt) AS mutation
FROM
  `isb-cgc.QotM.1000genomes` AS v
WHERE
  ARRAY_LENGTH(alternate_bases) = 1
  AND alternate_bases[ORDINAL(1)].alt IN ('A','C','G','T')
)
SELECT
  mutation,
  COUNT(mutation) AS num_mutations
FROM
  table1
GROUP BY mutation
ORDER BY num_mutations DESC
LIMIT 5
''', project_id=project_id, verbose=False, dialect='standard')        
   
df.head()
    
    Out[0]:
In [0]:
    
df = pd.io.gbq.read_gbq('''
#standardsql
SELECT
  COUNT(reference_name) AS num_variants
FROM
  `isb-cgc.QotM.1000genomes` AS v
WHERE
  reference_name = '21'
  AND start_position BETWEEN 37365790
  AND 37517450
  ''', project_id=project_id, verbose=False, dialect='standard')
df.head()
    
    Out[0]:
In [0]:
    
df = pd.io.gbq.read_gbq('''
  SELECT
    COUNT(DISTINCT(call.name)) as num_samples
  FROM
    `isb-cgc.QotM.1000genomes`
  JOIN
  UNNEST(call) AS call
  WHERE
    reference_name = '21'
    AND (start_position BETWEEN 37365790 AND 37517450)
   
  ''', project_id=project_id, verbose=False, dialect='standard')
df.head()
    
    Out[0]:
To answer this question, we'll need to start working with records. To do that, we'll build up to answering this question through a series of queries.
Our first query will flatten the call record into three columns, the call.name, and both genotype calls (one for each chromosome), where a zero is the reference call, and an alternate call otherwise. You can see most of the listed entries are actually just homozygous refs.
In [0]:
    
df8 = pd.io.gbq.read_gbq('''
  SELECT
    call.name,
    call.genotype[OFFSET(0)] g1,
    call.genotype[OFFSET(1)] g2
  FROM
    `isb-cgc.QotM.1000genomes`
  JOIN
  UNNEST(call) AS call WITH OFFSET AS ci
  WHERE
    reference_name = '21'
    AND start_position BETWEEN 37365790
    AND 37517450
  
 LIMIT 10''', project_id=project_id, verbose=False, dialect='standard')
df8.head()
    
    Out[0]:
In the above result, we have 11369475 rows, which (as hoped for) is the product of 2535 * 4485!
What we need now, is to match up the sample IDs, and genomic positions.
Here's an example of what we're looking for:
  sample chr      pos   mut  g1 g2  
HG00096  21 37424669  C->A   0  1
See: http://www.internationalgenome.org/data-portal/sample/HG00096
Let's write a query to find it.
In [0]:
    
df9 = pd.io.gbq.read_gbq('''
WITH
t1 as (
SELECT
    reference_name,
    start_position,
    call.name,
    call.genotype[OFFSET(0)] g1,
    call.genotype[OFFSET(1)] g2
  FROM
    `isb-cgc.QotM.1000genomes`
  JOIN
  UNNEST(call) AS call WITH OFFSET AS ci
  WHERE
    reference_name = '21'
    AND start_position BETWEEN 37365790
    AND 37517450
)
select * from t1 where name = 'HG00096' and start_position = 37424669
''', project_id=project_id, verbose=False, dialect='standard')
df9.head()
    
    
    Out[0]:
In [0]:
    
df10 = pd.io.gbq.read_gbq('''
WITH
t1 AS (
SELECT
    reference_name,
    start_position,
    call.name as sample,
    call.genotype[OFFSET(0)] g1,
    call.genotype[OFFSET(1)] g2
  FROM
    `isb-cgc.QotM.1000genomes`
  JOIN
  UNNEST(call) AS call WITH OFFSET AS ci
  WHERE
    reference_name = '21'
    AND (start_position BETWEEN 37365790 AND 37517450)
),
t2 AS (
  SELECT
    sample,
    COUNT(sample) as N
  FROM
    t1
  WHERE
    g1 = 1 OR g2 = 1
  GROUP BY
    sample
  )
select * from t2 GROUP BY N, sample ORDER BY N
''', project_id=project_id, verbose=False, dialect='standard')
df10.head()
    
    Out[0]:
In the above result, we have the expected 2,535 number of rows (for each sample). The number of variants per sample ranges from 42 to 401. Let's make a histogram.
In [0]:
    
df10.hist()
    
    Out[0]:
    
In [0]:
    
df10z = pd.io.gbq.read_gbq('''
WITH
t1 AS (
SELECT
    reference_name,
    start_position,
    call.name,
    call.genotype[OFFSET(0)] g1,
    call.genotype[OFFSET(1)] g2
  FROM
    `isb-cgc.QotM.1000genomes`
  JOIN
  UNNEST(call) AS call WITH OFFSET AS ci
  WHERE
    reference_name = '21'
    AND (start_position BETWEEN 37365790 AND 37517450)
),
t2 AS (
SELECT
  start_position,
  COUNT(start_position) as N
FROM
  t1
WHERE
  (g1 = 1 OR g2 = 1)
GROUP BY
  start_position 
)
select COUNT(*) private_vars from t2 WHERE N = 1
''', project_id=project_id, verbose=False, dialect='standard')
df10z.head()
    
    Out[0]:
In [0]:
    
df11 = pd.io.gbq.read_gbq('''
WITH
t1 AS (
SELECT
    reference_name,
    start_position,
    call.name as sample,
    call.genotype[OFFSET(0)] g1,
    call.genotype[OFFSET(1)] g2
  FROM
    `isb-cgc.QotM.1000genomes`
  JOIN
  UNNEST(call) AS call WITH OFFSET AS ci
  WHERE
    reference_name = '21'
    AND (start_position BETWEEN 37365790 AND 37517450)
),
t2 AS (
  SELECT
    sample,
    COUNT(sample) as N
  FROM
    t1
  WHERE
    g1 = 1 OR g2 = 1
  GROUP BY
    sample
  ),
t3 AS (
SELECT
  AVG(N) avgn, 
  STDDEV(N) stddevn
FROM
  t2
), 
t4 AS (
  SELECT
    sample,
    N,
    avgn,
    stddevn,
    (N - avgn) / stddevn as Z_score
  FROM
    t2 CROSS JOIN t3
)
select * from t4
''', project_id=project_id, verbose=False, dialect='standard')
df11.head()
    
    Out[0]:
In [0]:
    
df11.hist()
    
    Out[0]:
    
This is adapted from the Google tutorial (https://codelabs.developers.google.com/codelabs/genomics-vcfbq/index.html?index=..%2F..index#0 ).
In [0]:
    
df13 = pd.io.gbq.read_gbq('''
#standardsql
WITH ind AS (
  -- count variants for each sample/ref/bin
  SELECT
    call.name AS sample, 
    reference_name AS ref, 
    FLOOR(start_position/1000000) AS bin, 
    COUNT(call.name) AS n
  FROM `isb-cgc.QotM.1000genomes` 
  JOIN UNNEST(call) AS call
  JOIN UNNEST(alternate_bases) AS alt
  WHERE alt.alt != '<*>'
    AND (call.genotype[OFFSET(0)] = 1 OR call.genotype[OFFSET(1)] = 1)
  GROUP BY sample, ref, bin
),
pop AS (
  -- overall all samples in ref/bin
  SELECT 
    ref, 
    bin, 
    AVG(n) AS pop_mu, 
    STDDEV(n) AS pop_sigma
  FROM ind
  GROUP BY ref, bin
),
zscore AS (
  SELECT 
    ind.sample, 
    ind.n AS ind_n,
    (ind.n-pop.pop_mu)/pop.pop_sigma AS z, 
    pop.ref, 
    pop.bin, 
    pop.pop_mu, 
    pop.pop_sigma
  FROM pop, ind
  WHERE ind.ref = pop.ref AND ind.bin = pop.bin
)
SELECT * from zscore
ORDER BY ABS(Z) DESC
''', project_id=project_id, verbose=False, dialect='standard')        
   
df13.head()
    
    Out[0]:
In [0]:
    
import matplotlib
matplotlib.rcParams['figure.figsize'] = [6, 8]
df13.hist()
    
    Out[0]:
    
In [0]:
    
df14 = pd.io.gbq.read_gbq('''
SELECT
reference_name as chr,
start_position,
reference_bases ,
--
--  if the genotype is 0, it's a ref call
--  else use the genotype call to index the alternative base
--
IF( (call.genotype[OFFSET(0)] = 0), 
     reference_bases, 
     alternate_bases[OFFSET(call.genotype[OFFSET(0)] - 1)].alt ) as alt1,
--
--
IF( (call.genotype[OFFSET(1)] = 0), 
     reference_bases, 
     alternate_bases[OFFSET(call.genotype[OFFSET(1)] - 1)].alt ) AS alt2,
--
--  then we're still unnesting a single column.
--
call.name,
call.genotype[OFFSET(0)] AS g1,
call.genotype[OFFSET(1)] AS g2
FROM
  `isb-cgc.QotM.1000genomes`
JOIN
  UNNEST(call) as call
WHERE
  reference_name = '21'
  AND call.name = 'HG00119'
  AND start_position = 34434667
''', project_id=project_id, verbose=False, dialect='standard'
)
df14.head()
    
    Out[0]: