In [1]:
# Fetch an example MSA from the Pfam database:
!wget https://pfam.xfam.org/family/PF06201/alignment/full/gzipped -O MSA.gz
In [2]:
# Parse the downloaded alignment and filter its sequence records into a
# SequenceData object. Both `read` and `filtered` are called with their default
# options here; check their docstrings for the available read and filter options.
import skmsa
sd = skmsa.read('MSA.gz').filtered()
In [3]:
# Convert the sequence data to whichever representation best suits the task.
# For example, as a pandas DataFrame (only the first rows are displayed):
df = sd.as_dataframe()
df.head()
Out[3]:
In [4]:
# Summarise the columns of the DataFrame built from the sequence data:
df.describe()
Out[4]:
In [5]:
# Alternatively, as a NumPy ndarray.
# NOTE: the name `X` is rebound by the encoded and one-hot conversions in the
# following cells, so re-run this cell if the plain array is needed again.
X = sd.as_array()
X
Out[5]:
In [6]:
# Alternatively, as a numerically encoded array.
# Default encoding: each residue symbol is mapped to its index in the alignment
# alphabet, which is printed below so the integer codes can be interpreted.
X = sd.as_encoded()
print(sd.alphabet)
X
Out[6]:
In [7]:
# Alternatively, as a one-hot encoded array; `sparse=True` requests it as a
# SciPy sparse matrix in CSR format:
X = sd.as_one_hot(sparse=True)
X
Out[7]: