notebook.community

Edit and run



In [1]:

    
# Fetch an example MSA from the Pfam database:
# Downloads the gzipped "full" alignment of Pfam family PF06201 and writes it
# to MSA.gz in the current working directory (wget's -O option).
# The leading "!" is an IPython shell escape, so this cell needs the `wget`
# executable on PATH and network access; re-running it overwrites MSA.gz.
# NOTE(review): the recorded output below dates from 2018 -- confirm that this
# URL is still served before relying on the cell.
!wget https://pfam.xfam.org/family/PF06201/alignment/full/gzipped -O MSA.gz









    



--2018-07-23 23:39:35--  https://pfam.xfam.org/family/PF06201/alignment/full/gzipped
Resolving pfam.xfam.org (pfam.xfam.org)... 193.62.193.83
Connecting to pfam.xfam.org (pfam.xfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221647 (216K) [application/x-gzip]
Saving to: ‘MSA.gz’

MSA.gz              100%[===================>] 216,45K   210KB/s    in 1,0s    

2018-07-23 23:39:37 (210 KB/s) - ‘MSA.gz’ saved [221647/221647]



In [2]:

    
# Parse and filter sequence records into a SequenceData object:
# `skmsa.read` is handed the gzipped file name directly (no manual
# decompression step), and `.filtered()` is called without arguments, so
# whatever defaults it defines are applied. The result is bound to `sd`,
# which every later cell in this notebook converts from.
# NOTE(review): the import lives in this cell rather than in a dedicated
# imports cell at the top of the notebook.
import skmsa
sd = skmsa.read('MSA.gz').filtered() # use defaults (check docstrings for read and filter options)



In [3]:

    
# Convert sequence data to the most appropriate object, for example:
# a pandas DataFrame:
# In the recorded output the frame has a `header` column holding the sequence
# identifiers, followed by numbered columns with one alignment symbol each
# ('-' appears where a sequence has a gap).
# NOTE(review): row labels skip values (1, 2, 5, 7, 8) and column labels run to
# 150 although the frame reports 128 columns -- presumably the original
# row/column numbering survives filtering; confirm against skmsa's docs.
df = sd.as_dataframe()
# Bare last expression: renders the first five rows as the cell output.
df.head()









    Out[3]:







  
    
      
      header
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      141
      142
      143
      144
      145
      146
      147
      148
      149
      150
    
  
  
    
      1
      C5DGB7_LACTC/26-197
      F
      S
      F
      V
      N
      T
      P
      N
      V
      ...
      T
      R
      C
      F
      Y
      L
      E
      M
      R
      G
    
    
      2
      A0A0D8Y9P7_DICVI/124-264
      -
      -
      L
      I
      D
      K
      K
      Q
      M
      ...
      T
      E
      I
      E
      K
      L
      K
      I
      F
      G
    
    
      5
      A0A095CE52_CRYGR/34-192
      W
      S
      H
      I
      D
      R
      D
      N
      V
      ...
      S
      Q
      V
      Y
      F
      I
      G
      L
      K
      G
    
    
      7
      H2ZHR8_CIOSA/125-269
      -
      -
      -
      -
      -
      -
      -
      -
      -
      ...
      T
      K
      I
      Q
      R
      I
      I
      L
      Y
      G
    
    
      8
      K1QGE0_CRAGI/33-175
      Y
      T
      K
      I
      N
      M
      E
      A
      V
      ...
      T
      K
      I
      Y
      Y
      I
      G
      L
      K
      G
    
  

5 rows × 128 columns



In [4]:

    
# Per-column summary of the alignment frame. All columns hold strings, so the
# recorded output shows count / unique / top / freq rather than numeric
# statistics: `unique` is the number of distinct symbols in a column, and
# `top` / `freq` are the most common symbol and how often it occurs.
df.describe()









    Out[4]:







  
    
      
      header
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      141
      142
      143
      144
      145
      146
      147
      148
      149
      150
    
  
  
    
      count
      854
      854
      854
      854
      854
      854
      854
      854
      854
      854
      ...
      854
      854
      854
      854
      854
      854
      854
      854
      854
      854
    
    
      unique
      854
      18
      18
      19
      9
      19
      21
      20
      19
      12
      ...
      11
      20
      9
      20
      17
      8
      18
      8
      17
      2
    
    
      top
      A0A0C3NG11_PHLGI/9-157
      Y
      S
      Q
      I
      D
      F
      D
      G
      V
      ...
      T
      R
      I
      D
      Y
      I
      G
      L
      K
      G
    
    
      freq
      1
      281
      148
      166
      580
      565
      148
      187
      169
      321
      ...
      570
      458
      491
      207
      409
      434
      414
      343
      234
      845
    
  

4 rows × 128 columns



In [5]:

    
# a NumPy ndarray:
# The recorded result is a 2-D array of single-character strings
# (dtype '<U1'); its first row matches the first row of the DataFrame shown
# earlier, minus the header column.
# NOTE(review): the name `X` is rebound by the next two cells to a numerically
# encoded array and then to a sparse matrix, so its meaning depends on which
# cell ran last.
X = sd.as_array()
X









    Out[5]:





array([['F', 'S', 'F', ..., 'M', 'R', 'G'],
       ['-', '-', 'L', ..., 'I', 'F', 'G'],
       ['W', 'S', 'H', ..., 'L', 'K', 'G'],
       ...,
       ['Y', 'G', 'T', ..., 'F', 'K', 'G'],
       ['T', 'S', 'Q', ..., 'I', 'I', 'G'],
       ['Y', 'T', 'F', ..., 'F', 'R', 'G']], dtype='<U1')



In [6]:

    
# a numerically encoded array:
# Rebinds `X` (previously the character array) to an integer array.
X = sd.as_encoded() # default encoding: map residue symbols to their index in the alignment alphabet
# Print the alphabet so the integer codes can be read back: in the recorded
# run it is '-ACDEFGHIKLMNPQRSTVWY', i.e. '-' -> 0, 'A' -> 1, ..., 'Y' -> 20
# (the first output row 5, 16, 5 corresponds to 'F', 'S', 'F').
print(sd.alphabet)
X









    



-ACDEFGHIKLMNPQRSTVWY






    Out[6]:





array([[ 5, 16,  5, ..., 11, 15,  6],
       [ 0,  0, 10, ...,  8,  5,  6],
       [19, 16,  7, ..., 10,  9,  6],
       ...,
       [20,  6, 17, ...,  5,  9,  6],
       [17, 16, 14, ...,  8,  8,  6],
       [20, 17,  5, ...,  5, 15,  6]])



In [7]:

    
# a one-hot encoded array (as a SciPy sparse matrix in CSR format):
# Rebinds `X` once more. In the recorded run the result is an 854 x 2540
# float64 CSR matrix with 107584 stored elements.
# NOTE(review): 2540 = 127 * 20 -- presumably 127 alignment columns times 20
# symbol categories, which would mean one of the 21 alphabet symbols (likely
# the gap '-') gets no indicator column; confirm against as_one_hot's docs.
X = sd.as_one_hot(sparse=True)
X









    Out[7]:





<854x2540 sparse matrix of type '<class 'numpy.float64'>'
	with 107584 stored elements in Compressed Sparse Row format>

	header	1	2	3	4	5	6	7	8	9	...	141	142	143	144	145	146	147	148	149	150
1	C5DGB7_LACTC/26-197	F	S	F	V	N	T	P	N	V	...	T	R	C	F	Y	L	E	M	R	G
2	A0A0D8Y9P7_DICVI/124-264	-	-	L	I	D	K	K	Q	M	...	T	E	I	E	K	L	K	I	F	G
5	A0A095CE52_CRYGR/34-192	W	S	H	I	D	R	D	N	V	...	S	Q	V	Y	F	I	G	L	K	G
7	H2ZHR8_CIOSA/125-269	-	-	-	-	-	-	-	-	-	...	T	K	I	Q	R	I	I	L	Y	G
8	K1QGE0_CRAGI/33-175	Y	T	K	I	N	M	E	A	V	...	T	K	I	Y	Y	I	G	L	K	G

	header	1	2	3	4	5	6	7	8	9	...	141	142	143	144	145	146	147	148	149	150
count	854	854	854	854	854	854	854	854	854	854	...	854	854	854	854	854	854	854	854	854	854
unique	854	18	18	19	9	19	21	20	19	12	...	11	20	9	20	17	8	18	8	17	2
top	A0A0C3NG11_PHLGI/9-157	Y	S	Q	I	D	F	D	G	V	...	T	R	I	D	Y	I	G	L	K	G
freq	1	281	148	166	580	565	148	187	169	321	...	570	458	491	207	409	434	414	343	234	845