In [1]:
# required packages:
import numpy as np
import pandas as pd
import sklearn
import skimage
import sqlalchemy as sa
import urllib.request
import requests
import sys
import json
import pickle
import gzip
from pathlib import Path
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
!pip install pymysql
import pymysql
In [2]:
ICGC_API = 'https://dcc.icgc.org/api/v1/download?fn=/release_18/Projects/BRCA-US/'
expression_fname = 'protein_expression.BRCA-US.tsv.gz'
if not Path(expression_fname).is_file():
    print("Downloading file", ICGC_API + expression_fname, "saving it as", expression_fname)
    urllib.request.urlretrieve(ICGC_API + expression_fname, expression_fname)
else:
    print("Local file exists:", expression_fname)
In [3]:
def get_genome_sequence_ensembl(chrom, start, end):
    """
    API described here http://rest.ensembl.org/documentation/info/sequence_region
    """
    url = 'https://rest.ensembl.org/sequence/region/human/{0}:{1}..{2}:1?content-type=application/json'.format(chrom, start, end)
    r = requests.get(url, headers={"Content-Type": "application/json"}, timeout=10)
    if not r.ok:
        print("REST Request FAILED")
        decoded = r.json()
        print(decoded['error'])
        return
    else:
        print("REST Request OK")
        decoded = r.json()
        return decoded['seq']
In [4]:
sequence = get_genome_sequence_ensembl(7, 200000,200100)
print(sequence)
In [12]:
# Loading data into pandas
E = pd.read_csv(expression_fname, delimiter='\t')
# E[E['gene_name'] == 'EGFR'].head()
E[E['gene_name'] == 'EGFR']['normalized_expression_level'].hist()
Out[12]:
In [19]:
engine = sa.create_engine('mysql+pymysql://genome@genome-mysql.cse.ucsc.edu/hg38', poolclass=sa.pool.NullPool)
In [1]:
# Reflect the snp147Common SNP table from the connected hg38 database
meta = sa.MetaData()
snp_table = sa.Table('snp147Common',
                     meta,
                     sa.PrimaryKeyConstraint('name'),
                     autoload=True,
                     autoload_with=engine,
                     extend_existing=True)
expr = sa.select([snp_table]).where(snp_table.c.chrom == 'chrY').count()
# pd.read_sql(expr, engine)
# pd_table.group_by('chrom').name.nunique()
In [ ]:
There are a number of ways to handle missing data:
If the machine learning model will be used with new data, it is important to consider the possibility of receiving records with missing values in features that were never missing in the training dataset.
The simplest approach is to remove any records that have missing data. Unfortunately, missing values are often not randomly distributed through a dataset, and removing them can introduce bias.
An alternative approach is to substitute the missing values, either with the mean of the feature across all records or with a value predicted from the other features in the dataset. Placeholder values can also be used with decision trees but do not work as well for most other algorithms.
Finally, missing values can themselves be useful features: adding an additional column that indicates when a value is missing is a common way to include this information (a minimal sketch follows below).
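As a minimal sketch of the substitution-plus-indicator idea (the small toy_df frame and the A_missing column are illustrative names, not part of this notebook's data):
In [ ]:
# Hypothetical toy example: substitute the column mean and keep an extra
# boolean column recording which values were originally missing.
import numpy as np
import pandas as pd
toy_df = pd.DataFrame({'A': [1.0, np.nan, 3.0, 4.0]})
toy_df['A_missing'] = toy_df['A'].isnull()
toy_df['A'] = toy_df['A'].fillna(toy_df['A'].mean())
print(toy_df)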
In [55]:
import sklearn.linear_model
x = np.array([[0, 0], [1, 1], [2, 2]])
y = np.array([0, 1, 2])
print(x,y)
clf = sklearn.linear_model.LinearRegression()
clf.fit(x, y)
print(clf.coef_)
x_missing = np.array([[0, 0], [1, np.nan], [2, 2]])
print(x_missing, y)
print()
try:
    clf = sklearn.linear_model.LinearRegression()
    clf.fit(x_missing, y)
    print(clf.coef_)
except ValueError as e:
    print(e)
In [56]:
x = pd.DataFrame([[0, 1, 2, 3, 4, 5, 6],
                  [2, np.nan, 7, 4, 9, 1, 3],
                  [0.1, 0.12, 0.11, 0.15, 0.16, 0.11, 0.14],
                  [100, 120, np.nan, 127, 130, 121, 124],
                  [4, 1, 7, 9, 0, 2, np.nan]]).T
x.columns = ['A', 'B', 'C', 'D', 'E']
y = pd.Series([29.0,
31.2,
63.25,
57.27,
66.3,
26.21,
48.24])
print(x, y)
In [57]:
x1 = x.dropna()
In [58]:
x.fillna(value={'A':1000,'B':2000,'C':3000,'D':4000,'E':5000})
Out[58]:
In [59]:
x.fillna(value=x.mean())
Out[59]:
Many machine learning algorithms expect features to have similar distributions and scales.
A classic example is gradient descent: if features are on different scales, some weights will update faster than others because the feature values scale the weight updates.
There are two common approaches to normalization:
Z-score standardization rescales values so that they have a mean of zero and a standard deviation of one. Specifically, we perform the following transformation:
$$z = \frac{x - \mu}{\sigma}$$
An alternative is min-max scaling, which transforms the data into the range 0 to 1. Specifically:
$$x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}}$$
Min-max scaling is less commonly used but can be useful for image data and in some neural networks.
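As a minimal sketch contrasting the two rescalings (using a made-up one-column array rather than the x defined below), StandardScaler and MinMaxScaler in sklearn.preprocessing implement them directly:
In [ ]:
# Toy example: z-score computed by hand matches StandardScaler;
# MinMaxScaler maps the same values onto the 0-1 range.
import numpy as np
import sklearn.preprocessing
toy = np.array([[1.0], [2.0], [3.0], [4.0]])
print(((toy - toy.mean()) / toy.std()).ravel())
print(sklearn.preprocessing.StandardScaler().fit_transform(toy).ravel())
print(sklearn.preprocessing.MinMaxScaler().fit_transform(toy).ravel())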
In [60]:
x_filled = x.fillna(value=x.mean())
print(x_filled)
In [61]:
x_norm = (x_filled - x_filled.min()) / (x_filled.max() - x_filled.min())
print(x_norm)
In [62]:
import sklearn.preprocessing
scaling = sklearn.preprocessing.MinMaxScaler().fit(x_filled)
scaling.transform(x_filled)
Out[62]:
In [ ]:
In [63]:
x = pd.DataFrame([[0, 1, 2, 3, 4, 5, 6],
                  [2, np.nan, 7, 4, 9, 1, 3],
                  [0.1, 0.12, 0.11, 0.15, 0.16, 0.11, 0.14],
                  [100, 120, np.nan, 127, 130, 121, 124],
                  ['Green', 'Red', 'Blue', 'Blue', 'Green', 'Red', 'Green']]).T
x.columns = ['A', 'B', 'C', 'D', 'E']
print(x)
In [64]:
x_cat = x.copy()
for val in x['E'].unique():
    x_cat['E_{0}'.format(val)] = x_cat['E'] == val
x_cat
Out[64]:
In [ ]:
Exercises:
Fill the missing values in x with the column mean and add an additional column to indicate when missing values have been substituted. The isnull method on the pandas dataframe may be useful.
Convert x to the z-scaled values. The StandardScaler method in the preprocessing module can be used or the z-scaled values calculated directly.
Convert x['C'] into a categorical variable using a threshold of 0.125.
In [ ]:
Depending on the type of task being performed, there are a variety of steps we may want to take when working with images:
Occasionally the camera used to generate an image records only 10 to 14 bits per pixel while the image is saved in a 16-bit file format. In this situation all the pixel intensities sit in the lower part of the range, and rescaling to the full range (or to 0-1) can be useful.
Further processing can be done to alter the histogram of the image.
When looking for particular features in an image, a sliding window can be used to check different locations. This can be combined with an image pyramid to detect features at different scales, which is often needed when objects can be at different distances from the camera (see the sketch after this list).
If objects are sparsely distributed in an image, a faster approach than sliding windows is to identify candidate objects with a simple threshold and then test only the bounding boxes containing them. Before running these through a model, centering based on intensity can be useful. Small offsets, rotations and skewing can be used to generate additional training data.
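As a rough sketch of the sliding-window and image-pyramid idea (the window size, step and number of pyramid layers are arbitrary choices for illustration, not values used elsewhere in this notebook):
In [ ]:
# Slide a fixed-size window over each layer of a Gaussian image pyramid;
# coarser layers correspond to searching for larger objects in the original image.
import skimage.data
import skimage.transform

def sliding_windows(image, window=(20, 20), step=10):
    """Yield (row, col, patch) for each window position."""
    for r in range(0, image.shape[0] - window[0] + 1, step):
        for c in range(0, image.shape[1] - window[1] + 1, step):
            yield r, c, image[r:r + window[0], c:c + window[1]]

img = skimage.data.page()
for layer, scaled in enumerate(skimage.transform.pyramid_gaussian(img, max_layer=2, downscale=2)):
    n_windows = sum(1 for _ in sliding_windows(scaled))
    print("pyramid layer", layer, "shape", scaled.shape, "->", n_windows, "windows")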
In [67]:
# http://scikit-image.org/docs/stable/auto_examples/color_exposure/plot_equalize.html#example-color-exposure-plot-equalize-py
matplotlib.rcParams['font.size'] = 8
import skimage.data
import skimage.exposure
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
def plot_img_and_hist(img, axes, bins=256):
    """Plot an image along with its histogram and cumulative histogram."""
    img = skimage.img_as_float(img)
    ax_img, ax_hist = axes
    ax_cdf = ax_hist.twinx()
    # Display image
    ax_img.imshow(img, cmap=plt.cm.gray)
    ax_img.set_axis_off()
    ax_img.set_adjustable('box-forced')
    # Display histogram
    ax_hist.hist(img.ravel(), bins=bins, histtype='step', color='black')
    ax_hist.ticklabel_format(axis='y', style='scientific', scilimits=(0, 0))
    ax_hist.set_xlabel('Pixel intensity')
    ax_hist.set_xlim(0, 1)
    ax_hist.set_yticks([])
    # Display cumulative distribution
    img_cdf, bins = skimage.exposure.cumulative_distribution(img, bins)
    ax_cdf.plot(bins, img_cdf, 'r')
    ax_cdf.set_yticks([])
    return ax_img, ax_hist, ax_cdf
# Load an example image
img = skimage.data.moon()
# Contrast stretching
p2, p98 = np.percentile(img, (2, 98))
img_rescale = skimage.exposure.rescale_intensity(img, in_range=(p2, p98))
# Equalization
img_eq = skimage.exposure.equalize_hist(img)
# Adaptive Equalization
img_adapteq = skimage.exposure.equalize_adapthist(img, clip_limit=0.03)
# Display results
fig = plt.figure(figsize=(8, 5))
axes = np.zeros((2, 4), dtype=object)
axes[0, 0] = fig.add_subplot(2, 4, 1)
for i in range(1, 4):
    axes[0, i] = fig.add_subplot(2, 4, 1+i, sharex=axes[0, 0], sharey=axes[0, 0])
for i in range(0, 4):
    axes[1, i] = fig.add_subplot(2, 4, 5+i)
ax_img, ax_hist, ax_cdf = plot_img_and_hist(img, axes[:, 0])
ax_img.set_title('Low contrast image')
y_min, y_max = ax_hist.get_ylim()
ax_hist.set_ylabel('Number of pixels')
ax_hist.set_yticks(np.linspace(0, y_max, 5))
ax_img, ax_hist, ax_cdf = plot_img_and_hist(img_rescale, axes[:, 1])
ax_img.set_title('Contrast stretching')
ax_img, ax_hist, ax_cdf = plot_img_and_hist(img_eq, axes[:, 2])
ax_img.set_title('Histogram equalization')
ax_img, ax_hist, ax_cdf = plot_img_and_hist(img_adapteq, axes[:, 3])
ax_img.set_title('Adaptive equalization')
ax_cdf.set_ylabel('Fraction of total intensity')
ax_cdf.set_yticks(np.linspace(0, 1, 5))
# prevent overlap of y-axis labels
fig.tight_layout()
plt.show()
In [68]:
img = skimage.data.page()
fig, ax = plt.subplots(1,1)
ax.imshow(img, cmap=plt.cm.gray)
ax.set_axis_off()
plt.show()
print(img.shape)
In [73]:
import sklearn.feature_extraction
patches = sklearn.feature_extraction.image.extract_patches_2d(img, (20, 20), max_patches=2, random_state=0)
print(patches.shape)
plt.imshow(patches[0], cmap=plt.cm.gray)
plt.show()
In [75]:
import sklearn.datasets
digits = sklearn.datasets.load_digits()
# print(digits.DESCR)
fig, ax = plt.subplots(1,1, figsize=(1,1))
ax.imshow(digits.data[0].reshape((8,8)), cmap=plt.cm.gray, interpolation='nearest')
Out[75]:
In [ ]:
When working with text the simplest approach is known as bag of words. In this approach we simply count the number of instances of each word, and then adjust the values based on how commonly the word is used.
The first task is to break a piece of text up into individual tokens. The number of occurrences of each word is then recorded. More rarely used words are likely to be more interesting and so word counts are scaled by the inverse document frequency.
We can extend this to look at not just individual words but also bigrams and trigrams.
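As a toy sketch of these ideas (the three-document corpus is made up, not drawn from the 20 newsgroups data used below), TfidfVectorizer combines the counting and inverse-document-frequency steps and can also extract bigrams:
In [ ]:
# Toy corpus: bag-of-words counts reweighted by inverse document frequency,
# with bigrams included via ngram_range.
import sklearn.feature_extraction.text
toy_corpus = ["the cat sat on the mat",
              "the dog sat on the log",
              "dogs and cats"]
vect = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2))
X = vect.fit_transform(toy_corpus)
print(X.shape)
print(sorted(vect.vocabulary_)[:8])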
In [76]:
twenty_train = sklearn.datasets.fetch_20newsgroups(subset='train',
categories=['comp.graphics', 'sci.med'], shuffle=True, random_state=0)
print(twenty_train.target_names)
count_vect = sklearn.feature_extraction.text.CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)
tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# print(X_train_tfidf.shape, X_train_tfidf[:5,:15].toarray())
In [77]:
print(twenty_train.data[0])
In [78]:
count_vect = sklearn.feature_extraction.text.CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data[0:1])
print(X_train_counts[0].toarray())
print(count_vect.vocabulary_.keys())
In [35]:
import skimage.data
import sklearn.feature_extraction
import skimage.transform
page_img = skimage.data.page()
plt.imshow(page_img, cmap=plt.cm.gray)
plt.show()
patches_s10 = sklearn.feature_extraction.image.extract_patches_2d(page_img, (10, 10), max_patches=10, random_state=0)
# print(patches_s10)
plt.imshow(patches_s10[3], cmap=plt.cm.gray)
plt.show()
print("OK")
for patch_size in (10, 20, 40):
    patches = sklearn.feature_extraction.image.extract_patches_2d(page_img, (patch_size, patch_size), max_patches=3, random_state=0)
    for i, patch in enumerate(patches):
        scaling_factor = 20.0 / patch_size
        rescaled_patch = skimage.transform.rescale(patch, scale=scaling_factor)
        plt.imshow(rescaled_patch, cmap=plt.cm.gray)
        plt.show()
In [53]:
import sklearn.datasets
twenty_train = sklearn.datasets.fetch_20newsgroups(subset='train',
categories=['comp.graphics', 'sci.med'], shuffle=True, random_state=0)
count_vect = sklearn.feature_extraction.text.CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(count_vect.get_feature_names()[-10:])
print(X_train_counts.shape)
count_vect = sklearn.feature_extraction.text.CountVectorizer(stop_words=('Hi', 'Best', 'software'))
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(count_vect.get_feature_names()[-10:])
print(X_train_counts.shape)
print()
count_vect = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1, 2))
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(count_vect.get_feature_names()[-10:])
print(X_train_counts.shape)
In [ ]: