In [1]:
import csv
import os
import sys

import pandas as pd

utils_path = os.path.abspath(os.path.join('../..'))
if utils_path not in sys.path:
    sys.path.append(utils_path)
    
from utils.experiment import parse_barcodes, parse_exp_config
from utils.counts import normalize

In [2]:
bc_dict = parse_barcodes('../../ref/TruSeq_Stranded_RNA-Seq.csv', bc_id="AR")
exp_df = parse_exp_config('../../ref/2017-07-10_NextSeq.csv', bc_dict)
exp_df


Out[2]:
bc_id bc_seq bcm sample temp
0 AR2 CGATGT False utRho201 10
1 AR4 TGACCA False utRho202 10
2 AR5 ACAGTG False utRho203 10
3 AR6 GCCAAT True utRho204 10
4 AR7 CAGATC True utRho205 10
5 AR12 CTTGTA True utRho206 10
6 AR13 AGTCAA False utRho207 25
7 AR14 AGTTCC False utRho208 25
8 AR15 ATGTCA False utRho209 25
9 AR16 CCGTCC True utRho210 25
10 AR18 GTCCGC True utRho211 25
11 AR19 GTGAAA True utRho212 25
12 AR2 CGATGT False utRho213 37
13 AR4 TGACCA False utRho214 37
14 AR5 ACAGTG False utRho215 37
15 AR6 GCCAAT True utRho216 37
16 AR7 CAGATC True utRho217 37
17 AR12 CTTGTA True utRho218 37
18 AR13 AGTCAA False utRho219 44
19 AR14 AGTTCC False utRho220 44
20 AR15 ATGTCA False utRho221 44
21 AR16 CCGTCC True utRho222 44
22 AR18 GTCCGC True utRho223 44
23 AR19 GTGAAA True utRho224 44

In [3]:
ct = pd.read_csv('../../results/utRho201/utRho201_sorted.counts',
                 header=None, names=['gene', 'counts'], sep='\t', skipfooter=5)
ct


/home/ilya/.venv/pydata3/lib/python3.5/site-packages/ipykernel_launcher.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support skipfooter; you can avoid this warning by specifying engine='python'.
  
Out[3]:
gene counts
0 aaaD 11
1 aaaE 38
2 aaeA 150
3 aaeB 231
4 aaeR 606
5 aaeX 57
6 aas 2698
7 aat 491
8 abgA 70
9 abgB 113
10 abgR 372
11 abgT 70
12 abrB 444
13 accA 11000
14 accB 10774
15 accC 23593
16 accD 10645
17 aceA 9785
18 aceB 10978
19 aceE 51411
20 aceF 37069
21 aceK 1903
22 ackA 8393
23 acnA 2534
24 acnB 39387
25 acpH 341
26 acpP 37777
27 acpS 683
28 acpT 443
29 acrA 8050
... ... ...
4421 ytfJ 516
4422 ytfK 1913
4423 ytfL 523
4424 ytfP 2127
4425 ytfQ 478
4426 ytfR 251
4427 ytfT 132
4428 ythA 32
4429 ytjA 112
4430 ytjB 615
4431 ytjC 1970
4432 yzgL 1809
4433 zapA 3389
4434 zapB 3118
4435 zapC 555
4436 zapD 1265
4437 zinT 78
4438 zipA 9186
4439 zitB 263
4440 zntA 1990
4441 zntR 1074
4442 znuA 3431
4443 znuB 437
4444 znuC 366
4445 zraP 325
4446 zraR 1107
4447 zraS 859
4448 zupT 944
4449 zur 607
4450 zwf 8381

4451 rows × 2 columns


In [4]:
def deseq_df(exp_df, cond, cond_names, res_dir='../results'):
    assert len(cond) == len(cond_names), 'cond and cond_names must be same length'
    df = pd.DataFrame()
    for cnd,name in zip(cond, cond_names):
        edf = exp_df[(exp_df.temp==cnd[0]) & (exp_df.bcm==cnd[1])]
        for i,row in enumerate(edf.iterrows()):
            i += 1
            _, sample = row
            filename = '{res_dir}/{sample}/{sample}_sorted.counts'.format(
                        res_dir=res_dir, sample=sample['sample'])
            ct = pd.read_csv(filename, header=None, names=['gene', 'counts'],
                            sep='\t', skipfooter=5, engine='python')
            if df.empty:
                df = ct.copy()
                df.rename(columns={'counts': '%s%i' % (name,i)}, inplace=True)
            else:
                df['%s%i' % (name,i)] = ct['counts']
    df.set_index('gene', inplace=True)
    df.index.names = [None]
    print(df)
    return df

In [5]:
d10bcm = deseq_df(exp_df, ((10, False),(10,True)), ('nobcm', 'bcm'), res_dir="../../results")
d10bcm.to_csv('../../results/2017-07-17/d10bcm.csv', index_label=False)


      nobcm1  nobcm2  nobcm3   bcm1   bcm2   bcm3
aaaD      11      15      10    208    206    182
aaaE      38      54      43   1996   1162    844
aaeA     150     253     144   5160   3001   2541
aaeB     231     421     228   9345   5553   4572
aaeR     606     892     687   6824   3627   4555
aaeX      57      99      55    971    513    567
aas     2698    4602    2875   6355   4054   5006
aat      491     708     591    897    734    946
abgA      70     127      77   2628   1776   2066
abgB     113     189     152   2314   1587   1815
abgR     372     657     372   2723   1731   2497
abgT      70     134     113   1891   1301   1596
abrB     444     723     426   2567   1366   2127
accA   11000   20394   10690  16778  10143  17477
accB   10774   17752   10209  10724   6421  11777
accC   23593   42065   23518  27651  16130  27536
accD   10645   19202   11524  11980   7658  11729
aceA    9785   15354   11406  10818   5031   8760
aceB   10978   18112   11957  13540   7058  11971
aceE   51411   83695   59183  99236  50740  70619
aceF   37069   62424   43192  75219  33983  48125
aceK    1903    2964    2376   7584   3577   4329
ackA    8393   14716   10853  14409   9792  14910
acnA    2534    4601    3146  12184   7176  10081
acnB   39387   71090   47590  41939  24289  37708
acpH     341     688     421   1460    728    941
acpP   37777   69487   39103  65901  41205  63079
acpS     683    1295     689   2646   1685   2439
acpT     443     732     539   1114    859    973
acrA    8050   14005    8406   9340   6023  10772
...      ...     ...     ...    ...    ...    ...
ytfJ     516     883     622   2511   1989   2754
ytfK    1913    3860    2094   2155   1357   2273
ytfL     523     898     701   4206   2902   3988
ytfP    2127    3724    2349   2266   1587   2754
ytfQ     478     842     591   2306   1310   1860
ytfR     251     495     314   3308   1864   2074
ytfT     132     255     156   1771   1008   1130
ythA      32      52      42    588    396    641
ytjA     112     233     149   2413   1404   1928
ytjB     615    1068     706   2571   1218   1841
ytjC    1970    3513    2383   4210   2061   3223
yzgL    1809    2951    2124   4362   3010   3903
zapA    3389    6035    3624   3796   2584   4111
zapB    3118    5649    3598   2251   1584   3142
zapC     555     925     686   2198   1371   2051
zapD    1265    1885    1509   2240   1195   1920
zinT      78     151      67   5869   4709   5440
zipA    9186   16342   10032   7877   4730   8655
zitB     263     451     290    888    613    789
zntA    1990    3157    2024   7695   4002   5713
zntR    1074    1716    1195   1191    799   1246
znuA    3431    5996    3946   7542   4928   7017
znuB     437     804     533   1833   1241   1592
znuC     366     540     375   1497    943   1182
zraP     325     478     287   3151   1876   2754
zraR    1107    1959    1266  11891   6155   6456
zraS     859    1430     889  16118   8997   9968
zupT     944    1602    1060   4830   2676   3489
zur      607    1052     711   3084   1750   2459
zwf     8381   14559    9249   8941   6033   9659

[4451 rows x 6 columns]

In [6]:
d44bcm = deseq_df(exp_df, ((44, False),(44, True)), ('nobcm', 'bcm'), res_dir="../../results")
d44bcm.to_csv('../../results/2017-07-17/d44bcm.csv', index_label=False)


      nobcm1  nobcm2  nobcm3    bcm1   bcm2    bcm3
aaaD      40      16      15     371    461     557
aaaE     194      81     110    7792   5202   10061
aaeA     704     247     450   10046   7776   13707
aaeB     756     305     501   18405  12605   23559
aaeR    1048     315     700    5722   5514    8706
aaeX     258      92     201    1294    976    1748
aas     6232    1897    3661    5249   5574    8122
aat     1125     333     686    1170   1012    1478
abgA     330     120     254    2086   1944    2983
abgB     434     174     260    2357   2227    3627
abgR     982     369     586    1976   1663    2924
abgT     170      58     125    2141   2033    3713
abrB     622     224     402    2698   2750    4253
accA   28522    8765   16365   15561  13236   20359
accB   27402    8196   15117    7379   7662    9527
accC   68814   21134   37357   23268  21818   28304
accD   41793   11821   23927   15892  13307   20153
aceA   25096    8773   15714    7405  10198   13670
aceB   33308   12334   21670   10207  12097   17422
aceE  227683   60380  112504   74025  70359   91640
aceF  153046   39347   71146   49481  44720   55816
aceK    3908    1210    2275    9080  11242   15906
ackA   27218    7243   14559    7666   8534   11777
acnA   16933    6615   11412   14301  11986   19306
acnB  138149   39344   85289   40384  39939   49438
acpH    1040     329     582    1561   1426    2016
acpP  243142   78370  147572  118937  95435  154219
acpS    2891     909    1820    3957   2861    4747
acpT    1174     341     682    1963   2043    2854
acrA   29126    9886   18087   11942  10690   16037
...      ...     ...     ...     ...    ...     ...
ytfJ    1790     590    1079    2360   2505    3869
ytfK   24500    8003   15949    7096   5508   10848
ytfL    1590     581    1074    4465   4597    6669
ytfP   13120    4248    8575    4534   4498    6636
ytfQ    3874    1991    2960    2434   2558    4420
ytfR     871     346     468    5789   5276    8747
ytfT     321     100     164    2438   2744    3953
ythA     197     113     179    1371   1169    2100
ytjA    3221    1508    2383    6931   5800    9591
ytjB    2527     805    1592    2498   2620    4040
ytjC    5936    1658    2995    3450   3385    5036
yzgL     676     126     291    2395   2547    3960
zapA   19650    6591   12584    5196   4284    7479
zapB   30146    9131   19386    7486   7271   12159
zapC    2912    1054    1833    3656   2680    5180
zapD    4522    1383    2537    2190   2167    3362
zinT     469     162     339    6971   6019   11317
zipA   32719   10746   20126    8311   7848   11739
zitB    1163     448     803    1308   1154    1832
zntA   30197   12139   20637    9610  11473   15491
zntR    2032     799    1514    1681   1402    2740
znuA    6830    2124    3718    7499   6856   10874
znuB    1029     302     564    1667   1862    2773
znuC     823     217     454    1802   1736    2648
zraP     173      64     113    2400   2179    3368
zraR    4780    1669    2874   15273  12560   18978
zraS    3213    1138    2079   16705  14185   21179
zupT    4555    1699    3204    4545   4424    7959
zur      979     298     555    3092   3201    5319
zwf    23923    7972   14616   12211  10844   16951

[4451 rows x 6 columns]

In [7]:
d37bcm = deseq_df(exp_df, ((37, False),(37,True)), ('nobcm', 'bcm'), res_dir="../../results")
d37bcm.to_csv('../../results/2017-07-17/d37bcm.csv', index_label=False)


      nobcm1  nobcm2  nobcm3   bcm1   bcm2   bcm3
aaaD      10       7      10    184    101     84
aaaE      11      11      16   4197   1837   1336
aaeA     106      85     110   4416   1939   1652
aaeB      90      85      85   8787   3960   3080
aaeR     147     184     198   4089   2511   1731
aaeX      36      40      55    623    342    219
aas     1279    1180    1577   7673   4696   3480
aat      287     286     428    668    435    382
abgA      39      60      59   1434    732    510
abgB      21      56      46   1239    732    569
abgR     214     326     318   1628   1192    886
abgT      37      27      12   1186    632    467
abrB     126     156     163   2043   1236    778
accA   10711   10702   12670  15363  12131   9683
accB   11911   12341   13740   9633   9461   7221
accC   29036   31568   35221  24325  23625  17133
accD   12828   13904   15934  12998  10444   8381
aceA    4303    5011    6321   6123   6070   3331
aceB    4540    5758    6198   8346   6991   4402
aceE   78159   82499  104573  66131  52882  39678
aceF   43244   50336   63002  41365  29955  21244
aceK     614     633    1019   6107   3995   2400
ackA    9202    9008   12412   7328   7008   5841
acnA    2000    1970    2502  11160   6620   4870
acnB   33786   36644   42701  33526  23892  16425
acpH     231     220     267   1143    721    518
acpP   64841   71760   86873  95502  82405  68198
acpS     703     903     847   2402   1686   1357
acpT     308     333     445   1571   1144    780
acrA    7155    7269    8447   8331   7036   5839
...      ...     ...     ...    ...    ...    ...
ytfJ     332     366     451   2580   1854   1667
ytfK    1774    2001    2298   3692   2637   2363
ytfL     241     220     308   3133   2241   1778
ytfP    1408    1521    1761   1965   1616   1280
ytfQ     263     510     499   1602    988    809
ytfR      78     145     127   3281   1570    965
ytfT      28      33      40   1512    838    492
ythA      15       8       3    675    383    305
ytjA     222     235     238   3428   2678   2124
ytjB     414     437     545   1572    963    647
ytjC    2148    2332    2606   3016   2469   1464
yzgL     211     134     209   3093   1841   1336
zapA    2396    2348    2854   3507   2705   2238
zapB    3969    3995    4594   3314   3195   3019
zapC     659     681     767   2515   1680   1333
zapD    1115    1094    1552   1777   1376    987
zinT      39      51      61   5580   2851   2585
zipA    5197    5995    6341   6048   4643   3646
zitB     135     152     183    767    534    381
zntA    2735    3184    3606   6893   4603   3177
zntR     232     233     254    939    759    642
znuA    1904    2073    2327   7916   5401   4358
znuB     279     309     328   1368    889    638
znuC     180     209     270   1057    574    433
zraP      45      33      40   2398   1411    992
zraR     679     882     930  12598   6074   3734
zraS     518     786     788  14982   6779   4577
zupT     422     506     577   3354   2063   1487
zur      277     222     336   3669   2248   1627
zwf     6903    7349    8329  12128   8203   6798

[4451 rows x 6 columns]

In [8]:
d37d10 = deseq_df(exp_df, ((37, False),(10,False)), ('norm', 'cs'), res_dir="../../results")
d37d10.to_csv('../../results/2017-07-17/d37d10.csv', index_label=False)


      norm1  norm2   norm3    cs1    cs2    cs3
aaaD     10      7      10     11     15     10
aaaE     11     11      16     38     54     43
aaeA    106     85     110    150    253    144
aaeB     90     85      85    231    421    228
aaeR    147    184     198    606    892    687
aaeX     36     40      55     57     99     55
aas    1279   1180    1577   2698   4602   2875
aat     287    286     428    491    708    591
abgA     39     60      59     70    127     77
abgB     21     56      46    113    189    152
abgR    214    326     318    372    657    372
abgT     37     27      12     70    134    113
abrB    126    156     163    444    723    426
accA  10711  10702   12670  11000  20394  10690
accB  11911  12341   13740  10774  17752  10209
accC  29036  31568   35221  23593  42065  23518
accD  12828  13904   15934  10645  19202  11524
aceA   4303   5011    6321   9785  15354  11406
aceB   4540   5758    6198  10978  18112  11957
aceE  78159  82499  104573  51411  83695  59183
aceF  43244  50336   63002  37069  62424  43192
aceK    614    633    1019   1903   2964   2376
ackA   9202   9008   12412   8393  14716  10853
acnA   2000   1970    2502   2534   4601   3146
acnB  33786  36644   42701  39387  71090  47590
acpH    231    220     267    341    688    421
acpP  64841  71760   86873  37777  69487  39103
acpS    703    903     847    683   1295    689
acpT    308    333     445    443    732    539
acrA   7155   7269    8447   8050  14005   8406
...     ...    ...     ...    ...    ...    ...
ytfJ    332    366     451    516    883    622
ytfK   1774   2001    2298   1913   3860   2094
ytfL    241    220     308    523    898    701
ytfP   1408   1521    1761   2127   3724   2349
ytfQ    263    510     499    478    842    591
ytfR     78    145     127    251    495    314
ytfT     28     33      40    132    255    156
ythA     15      8       3     32     52     42
ytjA    222    235     238    112    233    149
ytjB    414    437     545    615   1068    706
ytjC   2148   2332    2606   1970   3513   2383
yzgL    211    134     209   1809   2951   2124
zapA   2396   2348    2854   3389   6035   3624
zapB   3969   3995    4594   3118   5649   3598
zapC    659    681     767    555    925    686
zapD   1115   1094    1552   1265   1885   1509
zinT     39     51      61     78    151     67
zipA   5197   5995    6341   9186  16342  10032
zitB    135    152     183    263    451    290
zntA   2735   3184    3606   1990   3157   2024
zntR    232    233     254   1074   1716   1195
znuA   1904   2073    2327   3431   5996   3946
znuB    279    309     328    437    804    533
znuC    180    209     270    366    540    375
zraP     45     33      40    325    478    287
zraR    679    882     930   1107   1959   1266
zraS    518    786     788    859   1430    889
zupT    422    506     577    944   1602   1060
zur     277    222     336    607   1052    711
zwf    6903   7349    8329   8381  14559   9249

[4451 rows x 6 columns]

In [9]:
d37d44 = deseq_df(exp_df, ((37, False),(44,False)), ('norm', 'hs'), res_dir="../../results")
d37d44.to_csv('../../results/2017-07-17/d37d44.csv', index_label=False)


      norm1  norm2   norm3     hs1    hs2     hs3
aaaD     10      7      10      40     16      15
aaaE     11     11      16     194     81     110
aaeA    106     85     110     704    247     450
aaeB     90     85      85     756    305     501
aaeR    147    184     198    1048    315     700
aaeX     36     40      55     258     92     201
aas    1279   1180    1577    6232   1897    3661
aat     287    286     428    1125    333     686
abgA     39     60      59     330    120     254
abgB     21     56      46     434    174     260
abgR    214    326     318     982    369     586
abgT     37     27      12     170     58     125
abrB    126    156     163     622    224     402
accA  10711  10702   12670   28522   8765   16365
accB  11911  12341   13740   27402   8196   15117
accC  29036  31568   35221   68814  21134   37357
accD  12828  13904   15934   41793  11821   23927
aceA   4303   5011    6321   25096   8773   15714
aceB   4540   5758    6198   33308  12334   21670
aceE  78159  82499  104573  227683  60380  112504
aceF  43244  50336   63002  153046  39347   71146
aceK    614    633    1019    3908   1210    2275
ackA   9202   9008   12412   27218   7243   14559
acnA   2000   1970    2502   16933   6615   11412
acnB  33786  36644   42701  138149  39344   85289
acpH    231    220     267    1040    329     582
acpP  64841  71760   86873  243142  78370  147572
acpS    703    903     847    2891    909    1820
acpT    308    333     445    1174    341     682
acrA   7155   7269    8447   29126   9886   18087
...     ...    ...     ...     ...    ...     ...
ytfJ    332    366     451    1790    590    1079
ytfK   1774   2001    2298   24500   8003   15949
ytfL    241    220     308    1590    581    1074
ytfP   1408   1521    1761   13120   4248    8575
ytfQ    263    510     499    3874   1991    2960
ytfR     78    145     127     871    346     468
ytfT     28     33      40     321    100     164
ythA     15      8       3     197    113     179
ytjA    222    235     238    3221   1508    2383
ytjB    414    437     545    2527    805    1592
ytjC   2148   2332    2606    5936   1658    2995
yzgL    211    134     209     676    126     291
zapA   2396   2348    2854   19650   6591   12584
zapB   3969   3995    4594   30146   9131   19386
zapC    659    681     767    2912   1054    1833
zapD   1115   1094    1552    4522   1383    2537
zinT     39     51      61     469    162     339
zipA   5197   5995    6341   32719  10746   20126
zitB    135    152     183    1163    448     803
zntA   2735   3184    3606   30197  12139   20637
zntR    232    233     254    2032    799    1514
znuA   1904   2073    2327    6830   2124    3718
znuB    279    309     328    1029    302     564
znuC    180    209     270     823    217     454
zraP     45     33      40     173     64     113
zraR    679    882     930    4780   1669    2874
zraS    518    786     788    3213   1138    2079
zupT    422    506     577    4555   1699    3204
zur     277    222     336     979    298     555
zwf    6903   7349    8329   23923   7972   14616

[4451 rows x 6 columns]

In [7]:
def coldata(df, lib_type='single-end'):
    
    def set_cond(rec):
        return rec['cols'].strip('0123456789')
    
    cd = pd.DataFrame()
    cd['cols'] = df.columns.values
    cd['type'] = lib_type
    #conds = set([c.strip('0123456789') for c in df.columns.values])
    cd['condition'] = cd.apply(set_cond, axis=1)
    cd.set_index('cols', inplace=True)
    cd.index.names = [None]
    print(cd)
    return cd

In [10]:
cd = coldata(d10bcm, lib_type="paired-end")
cd.to_csv('../../results/2017-07-17/cd_10bcm.csv', index_label=False)


              type condition
nobcm1  paired-end     nobcm
nobcm2  paired-end     nobcm
nobcm3  paired-end     nobcm
bcm1    paired-end       bcm
bcm2    paired-end       bcm
bcm3    paired-end       bcm

In [11]:
cd = coldata(d44bcm, lib_type="paired-end")
cd.to_csv('../../results/2017-07-17/cd_44bcm.csv', index_label=False)


              type condition
nobcm1  paired-end     nobcm
nobcm2  paired-end     nobcm
nobcm3  paired-end     nobcm
bcm1    paired-end       bcm
bcm2    paired-end       bcm
bcm3    paired-end       bcm

In [11]:
cd = coldata(d37bcm)
cd.to_csv('../../results/2017-07-17/cd_37bcm.csv', index_label=False)


              type condition
nobcm1  single-end     nobcm
nobcm2  single-end     nobcm
nobcm3  single-end     nobcm
bcm1    single-end       bcm
bcm2    single-end       bcm
bcm3    single-end       bcm

In [12]:
cd = coldata(d37d10)
cd.to_csv('../../results/2017-07-17/cd_3710.csv', index_label=False)


             type condition
norm1  single-end      norm
norm2  single-end      norm
norm3  single-end      norm
cs1    single-end        cs
cs2    single-end        cs
cs3    single-end        cs

In [13]:
cd = coldata(d37d44)
cd.to_csv('../../results/2017-07-17/cd_3744.csv', index_label=False)


             type condition
norm1  single-end      norm
norm2  single-end      norm
norm3  single-end      norm
hs1    single-end        hs
hs2    single-end        hs
hs3    single-end        hs