In [2]:
import pandas as pd
import os
import re

In [3]:
# Where is the shortstack directory
os.chdir("/Users/mgalland/SURFdrive/trichome_team/06.results_from_xp/sRNA-Seq/20170117_srnadesc/shortstack/")

In [4]:
# Get all ShortStack directories (sample list)
pattern = re.compile("[A-Z+][0-9]+")

# Store them in a list
files = [os.getcwd() + "/" + f + "/Results.txt" for f in os.listdir("./") if pattern.search(f)]

# print first elements
files[0:2]


Out[4]:
['/Users/mgalland/SURFdrive/trichome_team/06.results_from_xp/sRNA-Seq/20170117_srnadesc/shortstack/C32/Results.txt',
 '/Users/mgalland/SURFdrive/trichome_team/06.results_from_xp/sRNA-Seq/20170117_srnadesc/shortstack/LA0407/Results.txt']

In [5]:
# Store all ShortStack results dataframes in a list
shortstack_res = [pd.read_table(f) for f in files]

# This is how it looks
shortstack_res[0].head()


Out[5]:
#Locus Name Length Reads MeanReads StdevReads UniqueReads FracTop Strand MajorRNA ... MIRNA PhaseSize PhaseScore Short Long 20 21 22 23 24
0 SL2.40ch00:3099527-3099768 Cluster_1 242 77 68.7 1.6 2 0.649 . UUACAUACUCCUGCUAAUAUGU ... N6 20 2.9 0 1 1 5 57 3 10
1 SL2.40ch00:4197032-4197202 Cluster_2 171 86 55.9 4.8 4 0.965 + CCCUUGAAAAUCCGGAGG ... N2 NP2 NaN 40 2 8 15 3 5 13
2 SL2.40ch00:5204257-5204456 Cluster_3 200 20 14.5 2.0 4 0.550 . UGUAAGUGCCCGAACAACAUCA ... N6 21 1.1 1 0 0 1 11 6 1
3 SL2.40ch00:5774256-5774769 Cluster_4 514 87 85.4 1.1 41 0.425 . UGAAAGCUCUCUAAAAAACCUCUC ... N6 20 1.8 2 2 0 1 7 8 67
4 SL2.40ch00:6571367-6571564 Cluster_5 198 48 40.0 1.8 16 0.458 . AUUUCGGGCAUAGAUUGAAGGGGU ... N6 23 1.4 1 0 2 1 0 8 36

5 rows × 23 columns


In [6]:
# Select only the MajorRNA and merge
shortstack_res = [f[["MajorRNA"]] for f in shortstack_res]
shortstack_res[0].head()


Out[6]:
MajorRNA
0 UUACAUACUCCUGCUAAUAUGU
1 CCCUUGAAAAUCCGGAGG
2 UGUAAGUGCCCGAACAACAUCA
3 UGAAAGCUCUCUAAAAAACCUCUC
4 AUUUCGGGCAUAGAUUGAAGGGGU

In [11]:
# initialize the first dataframe and merge
df = pd.merge(shortstack_res[0],shortstack_res[1],on="MajorRNA",how="outer")
print("\n")
print("#############################")
print("First merge")
print(df.head())
rows, columns = df.shape
print("It contains {0} number of lines".format(str(rows)))
print("##############################")



#############################
First merge
                 MajorRNA
0  UUACAUACUCCUGCUAAUAUGU
1      CCCUUGAAAAUCCGGAGG
2      CCCUUGAAAAUCCGGAGG
3      CCCUUGAAAAUCCGGAGG
4  UGUAAGUGCCCGAACAACAUCA
It contains 44450 number of lines
##############################

In [21]:
l = list(set(df["MajorRNA"].tolist()))
l = l[1:10]

In [20]:



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-1d81b7fbec86> in <module>()
----> 1 l = dict(l)
      2 l

ValueError: dictionary update sequence element #0 has length 24; 2 is required

In [ ]: