In [2]:
from pathlib import Path
In [3]:
import pandas as pd
import numpy as np
In [4]:
# Absolute, symlink-resolved parent of the current working directory;
# every input and output path in this notebook is built from it.
root = Path('..').resolve()
In [5]:
# Display the resolved base directory so the reader can confirm where paths point.
root
Out[5]:
In [6]:
# Locations of the three inputs, all under <root>/src/raw.
raw_path = root / 'src' / 'raw' / 'Sanger' / 'GSE68950.quantile_normalized.tsv'  # expression table (TSV)
probe_path = root / 'src' / 'raw' / 'Affy_U133A_probe_info.csv'                  # probe info (CSV)
sample_path = root / 'src' / 'raw' / 'Sanger' / 'Sanger.xls'                     # sample sheet (Excel)
In [7]:
# Load the probe-info CSV; pandas accepts the Path object directly.
plat_info_df = pd.read_csv(probe_path)
In [8]:
# Preview the first rows of the probe-info table read from probe_path.
plat_info_df.head()
Out[8]:
In [9]:
# Load the sample sheet from the Excel workbook; pandas accepts the Path object directly.
sample_info_df = pd.read_excel(sample_path)
In [10]:
# Preview the first rows of the sample sheet read from sample_path.
sample_info_df.head()
Out[10]:
In [11]:
# Keep only the four descriptive columns (in this order) and spell out
# `primary_hist` as `primary_histology`; the other names are unchanged.
sample_info_df = sample_info_df[
    ['name', 'cell_line', 'primary_site', 'primary_hist']
].rename(columns={'primary_hist': 'primary_histology'})
In [12]:
# Read the tab-separated expression table (read_table is read_csv with a tab separator).
sample_raw_data_df = pd.read_csv(raw_path.as_posix(), sep='\t')
In [13]:
# Preview the first rows of the expression table exactly as read from raw_path.
sample_raw_data_df.head()
Out[13]:
In [14]:
# Use the values of the 'Unnamed: 0' column as row labels. The column itself is
# kept (drop=False) and the index is left without a name.
sample_raw_data_df = (
    sample_raw_data_df
    .set_index('Unnamed: 0', drop=False)
    .rename_axis(None)
)
In [15]:
# Remove the label column now that its values serve as the row index.
# It is dropped by name rather than by position (`iloc[:, 1:]`) so that
# re-running this cell is a no-op instead of silently discarding one more
# sample column each time. pandas names a header-less field 'Unnamed: <pos>',
# so 'Unnamed: 0' is the first column and the result of a first run is the same.
sample_raw_data_df = sample_raw_data_df.drop(columns='Unnamed: 0', errors='ignore')
In [16]:
# Trim every column header: cut the first 11 characters and the trailing
# '.CEL.gz' so the headers can be matched against sample_info_df['name'].
# NOTE(review): the fixed width of 11 presumably covers an accession-style
# prefix plus separator — confirm against the raw header.
sample_raw_data_df.columns = sample_raw_data_df.columns.str[11:-len('.CEL.gz')]

# Order the columns like the sample sheet, then order the rows like the
# de-duplicated PROBEID sequence of the probe table. Probes absent from the
# data become all-NaN rows; a sample name absent from the data raises KeyError.
sample_raw_data_df = (
    sample_raw_data_df
    .loc[:, sample_info_df['name'].tolist()]
    .reindex(plat_info_df['PROBEID'].unique())
)
In [17]:
# Preview the expression table after the column selection and row reindexing.
sample_raw_data_df.head()
Out[17]:
In [18]:
# Write the value matrix (no row or column labels) as float32 to <root>/src/sanger.npy.
npy_path = root / 'src' / 'sanger.npy'
with open(npy_path, 'wb') as f:
    np.save(f, sample_raw_data_df.values.astype(np.float32))