For more Queries: https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html
Part of the ISB-CGC: http://www.isb-cgc.org
In [0]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')
For instructions on the BigQuery transformation see: https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html
This first query demonstrates the 'magic' %%bigquery command.
In [0]:
%%bigquery --project YOUR-PROJECT-ID df
select
reference_name,
start_position,
end_position
from
`isb-cgc.QotM.1000genomes`
limit
10
Out[0]:
In [0]:
df = pd.io.gbq.read_gbq('''
select
reference_name as chr,
start_position,
end_position
from
`isb-cgc.QotM.1000genomes`
limit
10
''', project_id=project_id, verbose=False, dialect='standard')
df.head()
Out[0]:
In [0]:
df = pd.io.gbq.read_gbq('''
#standardsql
SELECT
start_position,
reference_name,
reference_bases AS original,
alternate_bases[ORDINAL(1)].alt AS alt
FROM
`isb-cgc.QotM.1000genomes` AS v
WHERE
ARRAY_LENGTH(alternate_bases) = 1
LIMIT 10
'''
, project_id=project_id, verbose=False, dialect='standard')
df.head()
Out[0]:
In [0]:
df = pd.io.gbq.read_gbq('''
SELECT
start_position,
reference_name,
reference_bases AS original,
alternate_bases[ORDINAL(1)].alt AS changed
FROM
`isb-cgc.QotM.1000genomes` AS v
WHERE
ARRAY_LENGTH(alternate_bases) = 1
AND alternate_bases[ORDINAL(1)].alt IN ('A','C','G','T')
ORDER BY start_position
LIMIT 10
''', project_id=project_id, verbose=False, dialect='standard')
df.head()
Out[0]:
In [0]:
df = pd.io.gbq.read_gbq('''
#standardsql
WITH
table1 AS (
SELECT
start_position,
reference_name,
CONCAT( reference_bases, '->', alternate_bases[ORDINAL(1)].alt) AS mutation
FROM
`isb-cgc.QotM.1000genomes` AS v
WHERE
ARRAY_LENGTH(alternate_bases) = 1
AND alternate_bases[ORDINAL(1)].alt IN ('A','C','G','T')
)
SELECT
mutation,
COUNT(mutation) AS num_mutations
FROM
table1
GROUP BY mutation
ORDER BY num_mutations DESC
LIMIT 5
''', project_id=project_id, verbose=False, dialect='standard')
df.head()
Out[0]:
In [0]:
df = pd.io.gbq.read_gbq('''
#standardsql
SELECT
COUNT(reference_name) AS num_variants
FROM
`isb-cgc.QotM.1000genomes` AS v
WHERE
reference_name = '21'
AND start_position BETWEEN 37365790
AND 37517450
''', project_id=project_id, verbose=False, dialect='standard')
df.head()
Out[0]:
In [0]:
df = pd.io.gbq.read_gbq('''
SELECT
COUNT(DISTINCT(call.name)) as num_samples
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) AS call
WHERE
reference_name = '21'
AND (start_position BETWEEN 37365790 AND 37517450)
''', project_id=project_id, verbose=False, dialect='standard')
df.head()
Out[0]:
To answer this question, we'll need to start working with records. To do that, we'll build up to answering this question through a series of queries.
Our first query will flatten the call record into three columns, the call.name, and both genotype calls (one for each chromosome), where a zero is the reference call, and an alternate call otherwise. You can see most of the listed entries are actually just homozygous refs.
In [0]:
df8 = pd.io.gbq.read_gbq('''
SELECT
call.name,
call.genotype[OFFSET(0)] g1,
call.genotype[OFFSET(1)] g2
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) AS call WITH OFFSET AS ci
WHERE
reference_name = '21'
AND start_position BETWEEN 37365790
AND 37517450
LIMIT 10''', project_id=project_id, verbose=False, dialect='standard')
df8.head()
Out[0]:
In the above result, we have 11369475 rows, which (as hoped for) is the product of 2535 * 4485!
What we need now, is to match up the sample IDs, and genomic positions.
Here's an example of what we're looking for:
sample chr pos mut g1 g2
HG00096 21 37424669 C->A 0 1
See: http://www.internationalgenome.org/data-portal/sample/HG00096
Let's write a query to find it.
In [0]:
df9 = pd.io.gbq.read_gbq('''
WITH
t1 as (
SELECT
reference_name,
start_position,
call.name,
call.genotype[OFFSET(0)] g1,
call.genotype[OFFSET(1)] g2
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) AS call WITH OFFSET AS ci
WHERE
reference_name = '21'
AND start_position BETWEEN 37365790
AND 37517450
)
select * from t1 where name = 'HG00096' and start_position = 37424669
''', project_id=project_id, verbose=False, dialect='standard')
df9.head()
Out[0]:
In [0]:
df10 = pd.io.gbq.read_gbq('''
WITH
t1 AS (
SELECT
reference_name,
start_position,
call.name as sample,
call.genotype[OFFSET(0)] g1,
call.genotype[OFFSET(1)] g2
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) AS call WITH OFFSET AS ci
WHERE
reference_name = '21'
AND (start_position BETWEEN 37365790 AND 37517450)
),
t2 AS (
SELECT
sample,
COUNT(sample) as N
FROM
t1
WHERE
g1 = 1 OR g2 = 1
GROUP BY
sample
)
select * from t2 GROUP BY N, sample ORDER BY N
''', project_id=project_id, verbose=False, dialect='standard')
df10.head()
Out[0]:
In the above result, we have the expected 2,535 number of rows (for each sample). The number of variants per sample ranges from 42 to 401. Let's make a histogram.
In [0]:
df10.hist()
Out[0]:
In [0]:
df10z = pd.io.gbq.read_gbq('''
WITH
t1 AS (
SELECT
reference_name,
start_position,
call.name,
call.genotype[OFFSET(0)] g1,
call.genotype[OFFSET(1)] g2
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) AS call WITH OFFSET AS ci
WHERE
reference_name = '21'
AND (start_position BETWEEN 37365790 AND 37517450)
),
t2 AS (
SELECT
start_position,
COUNT(start_position) as N
FROM
t1
WHERE
(g1 = 1 OR g2 = 1)
GROUP BY
start_position
)
select COUNT(*) private_vars from t2 WHERE N = 1
''', project_id=project_id, verbose=False, dialect='standard')
df10z.head()
Out[0]:
In [0]:
df11 = pd.io.gbq.read_gbq('''
WITH
t1 AS (
SELECT
reference_name,
start_position,
call.name as sample,
call.genotype[OFFSET(0)] g1,
call.genotype[OFFSET(1)] g2
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) AS call WITH OFFSET AS ci
WHERE
reference_name = '21'
AND (start_position BETWEEN 37365790 AND 37517450)
),
t2 AS (
SELECT
sample,
COUNT(sample) as N
FROM
t1
WHERE
g1 = 1 OR g2 = 1
GROUP BY
sample
),
t3 AS (
SELECT
AVG(N) avgn,
STDDEV(N) stddevn
FROM
t2
),
t4 AS (
SELECT
sample,
N,
avgn,
stddevn,
(N - avgn) / stddevn as Z_score
FROM
t2 CROSS JOIN t3
)
select * from t4
''', project_id=project_id, verbose=False, dialect='standard')
df11.head()
Out[0]:
In [0]:
df11.hist()
Out[0]:
This is adapted from the Google tutorial (https://codelabs.developers.google.com/codelabs/genomics-vcfbq/index.html?index=..%2F..index#0 ).
In [0]:
df13 = pd.io.gbq.read_gbq('''
#standardsql
WITH ind AS (
-- count variants for each sample/ref/bin
SELECT
call.name AS sample,
reference_name AS ref,
FLOOR(start_position/1000000) AS bin,
COUNT(call.name) AS n
FROM `isb-cgc.QotM.1000genomes`
JOIN UNNEST(call) AS call
JOIN UNNEST(alternate_bases) AS alt
WHERE alt.alt != '<*>'
AND (call.genotype[OFFSET(0)] = 1 OR call.genotype[OFFSET(1)] = 1)
GROUP BY sample, ref, bin
),
pop AS (
-- overall all samples in ref/bin
SELECT
ref,
bin,
AVG(n) AS pop_mu,
STDDEV(n) AS pop_sigma
FROM ind
GROUP BY ref, bin
),
zscore AS (
SELECT
ind.sample,
ind.n AS ind_n,
(ind.n-pop.pop_mu)/pop.pop_sigma AS z,
pop.ref,
pop.bin,
pop.pop_mu,
pop.pop_sigma
FROM pop, ind
WHERE ind.ref = pop.ref AND ind.bin = pop.bin
)
SELECT * from zscore
ORDER BY ABS(Z) DESC
''', project_id=project_id, verbose=False, dialect='standard')
df13.head()
Out[0]:
In [0]:
import matplotlib
matplotlib.rcParams['figure.figsize'] = [6, 8]
df13.hist()
Out[0]:
In [0]:
df14 = pd.io.gbq.read_gbq('''
SELECT
reference_name as chr,
start_position,
reference_bases ,
--
-- if the genotype is 0, it's a ref call
-- else use the genotype call to index the alternative base
--
IF( (call.genotype[OFFSET(0)] = 0),
reference_bases,
alternate_bases[OFFSET(call.genotype[OFFSET(0)] - 1)].alt ) as alt1,
--
--
IF( (call.genotype[OFFSET(1)] = 0),
reference_bases,
alternate_bases[OFFSET(call.genotype[OFFSET(1)] - 1)].alt ) AS alt2,
--
-- then we're still unnesting a single column.
--
call.name,
call.genotype[OFFSET(0)] AS g1,
call.genotype[OFFSET(1)] AS g2
FROM
`isb-cgc.QotM.1000genomes`
JOIN
UNNEST(call) as call
WHERE
reference_name = '21'
AND call.name = 'HG00119'
AND start_position = 34434667
''', project_id=project_id, verbose=False, dialect='standard'
)
df14.head()
Out[0]: