In [2]:
from pathlib import Path

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Absolute path of the directory one level above the current working directory.
root = Path('..').resolve()

In [5]:
# Echo the resolved root path as the cell output.
root


Out[5]:
WindowsPath('C:/Users/user/Documents/GitHub/Carkinos')

In [6]:
# Locations of the three input files, all below <root>/src/raw.
raw_dir = root / 'src' / 'raw'
raw_path = raw_dir / 'Sanger' / 'GSE68950.quantile_normalized.tsv'
probe_path = raw_dir / 'Affy_U133A_probe_info.csv'
sample_path = raw_dir / 'Sanger' / 'Sanger.xls'

In [7]:
# Load the probe annotation CSV (Affy_U133A_probe_info.csv) into a DataFrame.
plat_info_df = pd.read_csv(filepath_or_buffer=probe_path.as_posix())

In [8]:
# Preview the first five rows of the probe annotation table.
plat_info_df.head(n=5)


Out[8]:
PROBEID SYMBOL ENTREZID GENENAME
0 1007_s_at DDR1 780 discoidin domain receptor tyrosine kinase 1
1 1007_s_at MIR4640 100616237 microRNA 4640
2 1053_at RFC2 5982 replication factor C (activator 1) 2, 40kDa
3 117_at HSPA6 3310 heat shock 70kDa protein 6 (HSP70B')
4 121_at PAX8 7849 paired box 8

In [9]:
# Load the sample sheet (Sanger.xls) into a DataFrame.
sample_info_df = pd.read_excel(io=sample_path.as_posix())

In [10]:
# Preview the first five rows of the sample sheet.
sample_info_df.head(n=5)


Out[10]:
name cRNAPlate RNA-ID-CODE Affy-batch cell_line CellLineCode SupplierCode: 1-ATCC/2-RIKEN/3-DTP/4-ECCC/5-DSMZ/6-HSRRB/7-ICLC/8-Unspecified/9-JCRB Tissue:1-AdrenalGland/2-AutoGanglia/3-BiliaryTrack/4-Bone/5-Breast/6-Brain/7-Cervix/8-Endometrium/9-Eye/10-Lymphoma/11-Leukemia/12-Kidney/13-Colorectal/14-Liver/15-Lung/16-Oesophagus/17-Ovary/18-Pancreas/19-Placenta/20-Pleura/21-Prostate/22-Headneck/23-Skin/24-StomachGI/25-Testis/26-Thyroid/27-UrinaryTrack/28-Vulva/29-Muscle/30-SmoothMuscle/31-OtherSarcoma/32-Unknown Histo:1-Carcinoma/2-AdenoCarcinoma/3-Carcinoid/4-ClearCell/5-DuctalCarcinoma/6-FollicularCarcinoma/7-GiantCell/8-HCC/9-LargeCell/10-NSCLC/11-SmallCell/12-Squamous/13-Sarcoma/14-EwingsSarcoma/15-GermCell/16-Glioma/17-Leukemia/18-AML/19-ALL/20-CML/21-CLL/22-MDS/23-Myeloma/24-Lymphoma/25-BurkittsLymphoma/26-HodgekinsLymphoma/27-Rabdoid/28-Medulloblastoma/29-Melanoma/30-Mesothelioma/31-Neuroblastoma/32-Anaplastic/33-Other COSMICid primary_site SiteSubtype1 SiteSubtype2 SiteSubtype3 primary_hist HistSubtype1 HistSubtype2 HistSubtype3 Unnamed: 18
0 5500024035100021608461.G01 8 736 1 380 749 5 11 19 924101 haematopoietic and lymphoid tissue NS NS NS lymphoid neoplasm acute lymphoblastic leukaemia L2 NS NaN
1 5500024034290101707049.A01 6 494 1 697 493 5 11 19 906800 haematopoietic and lymphoid tissue NS NS NS haematopoietic neoplasm acute lymphoblastic leukaemia NS NS NaN
2 5500024052603032009483.A09 11 7 2 5637 505 1 27 1 687452 urinary tract bladder NS NS carcinoma NS NS NS NaN
3 5500024035100021608461.H01 8 746 1 22RV1 760 5 21 2 924100 prostate NS NS NS carcinoma adenocarcinoma NS NS NaN
4 5500024032848101507998.D02 5 439 1 23132-87 437 5 24 2 910924 stomach NS NS NS carcinoma adenocarcinoma NS NS NaN

In [11]:
# Keep only four sample-sheet columns and expose 'primary_hist' under the
# longer name 'primary_histology'.
# The rename is done by label and happens *before* the selection so that the
# cell can be re-run safely: once 'primary_hist' has already been renamed,
# `rename` finds nothing to change (unknown labels are ignored by default) and
# the selection by the final names still succeeds.
sample_info_df = sample_info_df.rename(
    columns={'primary_hist': 'primary_histology'}
).loc[:, ['name', 'cell_line', 'primary_site', 'primary_histology']]

In [12]:
# Read the tab-separated table at raw_path into a DataFrame.
sample_raw_data_df = pd.read_csv(raw_path.as_posix(), sep='\t')

In [13]:
# Preview the first five rows of the freshly loaded raw table.
sample_raw_data_df.head(n=5)


Out[13]:
Unnamed: 0 GSM1687570_5500024035100021608461.G01.CEL.gz GSM1687571_5500024034290101707049.A01.CEL.gz GSM1687572_5500024052603032009483.A09.CEL.gz GSM1687573_5500024035100021608461.H01.CEL.gz GSM1687574_5500024032848101507998.D02.CEL.gz GSM1687575_5500024030401071707289.D04.CEL.gz GSM1687576_5500024030401071707289.C10.CEL.gz GSM1687577_5500024052861011409506.D05.CEL.gz GSM1687578_5500024032848101507998.E02.CEL.gz ... GSM1688358_5500024032848101507998.G01.CEL.gz GSM1688359_5500024031722092907496.C11.CEL.gz GSM1688360_5500024052861011409506.E11.CEL.gz GSM1688361_5500024035100021608461.E01.CEL.gz GSM1688362_5500024032848101507998.H01.CEL.gz GSM1688363_5500024052861011409506.E09.CEL.gz GSM1688364_5500024031722092907496.B11.CEL.gz GSM1688365_5500024052861011409506.D09.CEL.gz GSM1688366_5500024032848101507000.H03.CEL.gz GSM1688367_5500024035100021608461.F01.CEL.gz
0 1007_s_at 9.525619 8.718047 9.904457 9.798405 10.932997 8.973151 10.857714 10.975397 8.635765 ... 9.429262 9.661919 9.589560 10.112593 8.408232 8.722528 10.710177 9.657667 9.176614 10.897960
1 1053_at 9.047948 8.616065 9.203023 8.572035 8.506107 8.229169 8.695021 8.915824 8.233599 ... 8.776563 9.155915 9.336024 8.824805 8.772813 8.802262 8.678147 8.625144 8.695912 8.629976
2 117_at 7.612412 8.043859 7.603927 7.714699 7.742822 7.551629 7.504556 7.720968 7.730913 ... 7.729079 11.647400 11.342093 7.850625 8.028290 8.037324 7.655276 7.888281 7.691653 7.624577
3 121_at 8.218106 8.235331 8.400500 8.405479 9.297814 8.178474 8.067798 8.176426 11.259210 ... 11.123192 8.063912 8.138332 8.213297 8.035581 8.203285 8.021791 8.373332 8.081797 8.261245
4 1255_g_at 7.392744 7.395735 7.453283 7.493720 7.474639 7.436698 7.388679 7.508520 7.432427 ... 7.445189 9.069289 8.621514 7.446887 7.512241 7.456834 7.453321 7.509484 7.315354 7.513548

5 rows × 799 columns


In [14]:
# Use the values of the 'Unnamed: 0' column as row labels. Assigning the bare
# values (rather than the Series) leaves the resulting index without a name.
sample_raw_data_df.index = sample_raw_data_df['Unnamed: 0'].values

In [15]:
# Remove the 'Unnamed: 0' column, whose values were copied into the index by
# the previous cell.
# Dropping by label instead of by position keeps this cell idempotent: a
# positional `iloc[:, 1:]` would silently discard one more sample column on
# every re-run, whereas `errors='ignore'` turns a repeated run into a no-op.
sample_raw_data_df = sample_raw_data_df.drop(
    'Unnamed: 0', axis=1, errors='ignore'
)

In [16]:
def _sample_name_from_column(col, suffix='.CEL.gz'):
    """Return the sample name embedded in a raw-table column header.

    Headers look like 'GSM1687570_5500024035100021608461.G01.CEL.gz'. The
    trailing ``suffix`` and the leading 'GSM..._' prefix (everything up to and
    including the first underscore) are removed when present.

    Each decoration is stripped only if it is actually there, so the function
    does not depend on the prefix having a fixed width, and applying it to an
    already cleaned name returns that name unchanged.
    """
    if col.endswith(suffix):
        col = col[:-len(suffix)]
    if col.startswith('GSM') and '_' in col:
        col = col.split('_', 1)[1]
    return col


# Reduce every column header to the bare sample name so that it matches the
# values in sample_info_df['name'].
sample_raw_data_df.columns = [
    _sample_name_from_column(col) for col in sample_raw_data_df.columns
]
# Put the columns in the same order as the rows of the sample sheet.
sample_raw_data_df = sample_raw_data_df.loc[:, sample_info_df['name']]
# Align the rows with the unique probe IDs of the platform annotation, in
# order of first appearance; probe IDs absent from the raw table yield NaN rows.
sample_raw_data_df = sample_raw_data_df.reindex(pd.unique(plat_info_df.PROBEID))

In [17]:
# Preview the first five rows after relabelling and reordering.
sample_raw_data_df.head(n=5)


Out[17]:
5500024035100021608461.G01 5500024034290101707049.A01 5500024052603032009483.A09 5500024035100021608461.H01 5500024032848101507998.D02 5500024030401071707289.D04 5500024030401071707289.C10 5500024052861011409506.D05 5500024032848101507998.E02 5500024052861011409506.E01 ... 5500024032848101507998.G01 5500024031722092907496.C11 5500024052861011409506.E11 5500024035100021608461.E01 5500024032848101507998.H01 5500024052861011409506.E09 5500024031722092907496.B11 5500024052861011409506.D09 5500024032848101507000.H03 5500024035100021608461.F01
1007_s_at 9.525619 8.718047 9.904457 9.798405 10.932997 8.973151 10.857714 10.975397 8.635765 8.742158 ... 9.429262 9.661919 9.589560 10.112593 8.408232 8.722528 10.710177 9.657667 9.176614 10.897960
1053_at 9.047948 8.616065 9.203023 8.572035 8.506107 8.229169 8.695021 8.915824 8.233599 9.093087 ... 8.776563 9.155915 9.336024 8.824805 8.772813 8.802262 8.678147 8.625144 8.695912 8.629976
117_at 7.612412 8.043859 7.603927 7.714699 7.742822 7.551629 7.504556 7.720968 7.730913 7.883360 ... 7.729079 11.647400 11.342093 7.850625 8.028290 8.037324 7.655276 7.888281 7.691653 7.624577
121_at 8.218106 8.235331 8.400500 8.405479 9.297814 8.178474 8.067798 8.176426 11.259210 10.265981 ... 11.123192 8.063912 8.138332 8.213297 8.035581 8.203285 8.021791 8.373332 8.081797 8.261245
1255_g_at 7.392744 7.395735 7.453283 7.493720 7.474639 7.436698 7.388679 7.508520 7.432427 7.465746 ... 7.445189 9.069289 8.621514 7.446887 7.512241 7.456834 7.453321 7.509484 7.315354 7.513548

5 rows × 798 columns


In [18]:
# Write the table's values as a float32 array to <root>/src/sanger.npy.
# Only the numeric values are stored; row and column labels are not saved.
npy_path = root / 'src' / 'sanger.npy'
with npy_path.open(mode='wb') as npy_file:
    np.save(npy_file, sample_raw_data_df.values.astype(np.float32))