In [1]:
# Fetch an example MSA from the Pfam database: the gzipped full alignment of
# family PF06201, saved as MSA.gz in the current working directory.
# NOTE(review): the pfam.xfam.org host may have been retired since this cell
# was last run (Pfam is now served through InterPro) — confirm the URL still
# resolves before re-running.
!wget https://pfam.xfam.org/family/PF06201/alignment/full/gzipped -O MSA.gz


--2018-07-23 23:39:35--  https://pfam.xfam.org/family/PF06201/alignment/full/gzipped
Resolving pfam.xfam.org (pfam.xfam.org)... 193.62.193.83
Connecting to pfam.xfam.org (pfam.xfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221647 (216K) [application/x-gzip]
Saving to: ‘MSA.gz’

MSA.gz              100%[===================>] 216,45K   210KB/s    in 1,0s    

2018-07-23 23:39:37 (210 KB/s) - ‘MSA.gz’ saved [221647/221647]


In [2]:
# Parse the downloaded alignment and filter its sequence records into a
# SequenceData object (sd); every conversion below is called on sd.
import skmsa
sd = skmsa.read('MSA.gz').filtered() # defaults are used here; see the docstrings of read() and filtered() for the available options

In [3]:
# Convert the sequence data to whichever container suits the task.
# For example, a pandas DataFrame (previewing only the first five rows):
df = sd.as_dataframe()
df.head()


Out[3]:
header 1 2 3 4 5 6 7 8 9 ... 141 142 143 144 145 146 147 148 149 150
1 C5DGB7_LACTC/26-197 F S F V N T P N V ... T R C F Y L E M R G
2 A0A0D8Y9P7_DICVI/124-264 - - L I D K K Q M ... T E I E K L K I F G
5 A0A095CE52_CRYGR/34-192 W S H I D R D N V ... S Q V Y F I G L K G
7 H2ZHR8_CIOSA/125-269 - - - - - - - - - ... T K I Q R I I L Y G
8 K1QGE0_CRAGI/33-175 Y T K I N M E A V ... T K I Y Y I G L K G

5 rows × 128 columns


In [4]:
# Per-column summary of the alignment: record count, number of distinct
# symbols, the most frequent symbol and how often it occurs.
df.describe()


Out[4]:
header 1 2 3 4 5 6 7 8 9 ... 141 142 143 144 145 146 147 148 149 150
count 854 854 854 854 854 854 854 854 854 854 ... 854 854 854 854 854 854 854 854 854 854
unique 854 18 18 19 9 19 21 20 19 12 ... 11 20 9 20 17 8 18 8 17 2
top A0A0C3NG11_PHLGI/9-157 Y S Q I D F D G V ... T R I D Y I G L K G
freq 1 281 148 166 580 565 148 187 169 321 ... 570 458 491 207 409 434 414 343 234 845

4 rows × 128 columns


In [5]:
# A NumPy ndarray of single-character residue symbols, one row per sequence
# (alignment gaps appear as '-'):
X = sd.as_array()
X


Out[5]:
array([['F', 'S', 'F', ..., 'M', 'R', 'G'],
       ['-', '-', 'L', ..., 'I', 'F', 'G'],
       ['W', 'S', 'H', ..., 'L', 'K', 'G'],
       ...,
       ['Y', 'G', 'T', ..., 'F', 'K', 'G'],
       ['T', 'S', 'Q', ..., 'I', 'I', 'G'],
       ['Y', 'T', 'F', ..., 'F', 'R', 'G']], dtype='<U1')

In [6]:
# A numerically encoded array:
X = sd.as_encoded() # default encoding: each residue symbol is replaced by its index in sd.alphabet (e.g. '-' -> 0)
print(sd.alphabet) # show the alphabet so the integer codes in the output can be decoded
X


-ACDEFGHIKLMNPQRSTVWY
Out[6]:
array([[ 5, 16,  5, ..., 11, 15,  6],
       [ 0,  0, 10, ...,  8,  5,  6],
       [19, 16,  7, ..., 10,  9,  6],
       ...,
       [20,  6, 17, ...,  5,  9,  6],
       [17, 16, 14, ...,  8,  8,  6],
       [20, 17,  5, ...,  5, 15,  6]])

In [7]:
# A one-hot encoded array; with sparse=True it is returned as a SciPy sparse
# matrix in CSR format.
# Note: X is rebound here, so it no longer refers to the array from the
# previous cell.
X = sd.as_one_hot(sparse=True)
X


Out[7]:
<854x2540 sparse matrix of type '<class 'numpy.float64'>'
	with 107584 stored elements in Compressed Sparse Row format>