Visualization and Analysis

This notebook experiments with different ways to visualize and analyze the data.


In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from ggplot import *
import matplotlib.pyplot as plt

In [2]:
# ---------------------------------------------------------------
# Load data
# ---------------------------------------------------------------
# The file carries a .csv extension but is parsed as tab-delimited.
# The literal string 'NA' is treated as missing, and the first
# column is used as the row index.
all_df = pd.read_csv(
    "../data/outputs/TFBS_map_DF_all_bicoid_test.csv",
    sep="\t",
    na_values='NA',
    index_col=0,
)

In [3]:
# Drop every row that contains at least one missing value.
all_df = all_df.dropna()

# Sanity check: print the cleaned frame.
# The call form print(...) with a single argument behaves the same on
# Python 2 and Python 3, whereas the statement form `print all_df` is a
# SyntaxError on Python 3.
print(all_df)


     position      score  species  raw_position    strand motif_found  \
0          10   5.013668        0            10  positive    ataatttt   
2        -751   8.946094        0           154  negative    tcctcgcc   
1         157  10.457056        0           157  positive    ttcctcgc   
3        -684   5.243600        0           221  negative    tcgttccc   
4        -649   3.285077        0           256  negative    tattgccg   
5        -616   3.594098        0           289  negative    ttggtacc   
6        -598   6.417715        0           307  negative    ctacattt   
7         334   3.702168        0           334  positive    gaacggaa   
8         404   3.491528        0           404  positive    gcaaaagt   
9         451   3.794091        0           451  positive    gtttttgc   
10       -450   9.909568        0           455  negative    tgggatta   
12       -438   5.787577        0           467  negative    agggcttg   
11        481   4.094957        0           481  positive    ttggtacc   
14       -385   4.736640        0           520  negative    gtaccgat   
13        523   5.020957        0           523  positive    tgtaccga   
15       -338   9.909568        0           567  negative    gaagggat   
16        593   3.509995        0           593  positive    caaggcag   
17        603   3.346478        0           603  positive    agcataac   
18       -296   4.374594        0           609  negative    tgctgggt   
19       -291   5.257062        0           614  negative    ggttattt   
20       -281   4.983569        0           624  negative    tgtttttt   
21       -264   4.389700        0           641  negative    ttaaccct   
24       -242   8.959556        0           663  negative    taagccca   
22        668   5.016483        0           668  positive    aagcccag   
23        675  10.016483        0           675  positive    gcacagca   
25        690   6.349059        0           690  positive    ttttggtg   
26       -115   5.920559        0           790  negative    atgattat   
28        -59   6.380240        0           846  negative    tttcaaat   
27        859   3.391992        0           859  positive    aagtccca   
29        873   5.612093        0           873  positive    ggggccgt   
..        ...        ...      ...           ...       ...         ...   
204        89   4.894493        7            89  positive    cgaagcct   
205       169  10.457056        7           169  positive    ttaatccg   
206      -720   8.946094        7           185  negative    cagattat   
207      -656   5.243600        7           249  negative    tagatttt   
208      -621   3.391992        7           284  negative    aagttttg   
209      -589   3.594098        7           316  negative    tggctttc   
210      -571   6.417715        7           334  negative    ggaattaa   
211       342   3.702168        7           342  positive    ttaaacgg   
212      -542   4.431520        7           363  negative    aggagtag   
213       392   3.491528        7           392  positive    aaaaaccg   
214       431   3.794091        7           431  positive    ctaaccca   
215      -449   9.909568        7           456  negative    gggattag   
216       463   4.094957        7           463  positive    attagccg   
217      -437   5.787577        7           468  negative    gggcttga   
218       506   8.959556        7           506  positive    ttaagctg   
219      -382   4.736640        7           523  negative    ccgatttt   
220      -335   9.909568        7           570  negative    gggattag   
221      -307   3.374593        7           598  negative    cagcataa   
222      -268   4.389700        7           637  negative    tgacttaa   
223       645   5.016483        7           645  positive    ttaaccct   
224       651   9.909568        7           651  positive    ctaatccc   
225      -247   8.959556        7           658  negative    cagcttaa   
226       666   8.997031        7           666  positive    ttaagccc   
227      -222   3.285077        7           683  negative    gagttttg   
228      -198   3.391992        7           707  negative    aagttttg   
229      -115   5.920559        7           790  negative    atgattat   
230       -56   6.380240        7           849  negative    caaattaa   
231       857   5.505177        7           857  positive    ttaagtcc   
232       858   3.285077        7           858  positive    taagtccc   
233       899   4.422532        7           899  positive    attatctg   

     align_position  
0                10  
2               220  
1               223  
3               320  
4               355  
5               388  
6               414  
7               442  
8               530  
9               584  
10              591  
12              603  
11              617  
14              684  
13              687  
15              743  
16              771  
17              781  
18              787  
19              792  
20              809  
21              827  
24              850  
22              855  
23              867  
25              892  
26             1015  
28             1075  
27             1088  
29             1102  
..              ...  
204             100  
205             206  
206             222  
207             327  
208             362  
209             401  
210             420  
211             429  
212             450  
213             491  
214             533  
215             575  
216             582  
217             587  
218             625  
219             646  
220             698  
221             726  
222             796  
223             810  
224             817  
225             824  
226             833  
227             856  
228             892  
229             984  
230            1046  
231            1054  
232            1055  
233            1122  

[231 rows x 7 columns]

In [6]:
# Bar plot built from all_df with 'species' mapped to x and 'strand' mapped to y.
# NOTE(review): 'strand' holds text labels (positive/negative) in the printed
# frame; confirm it is the intended y aesthetic for geom_bar.
ggplot(aes(x='species', y='strand'), data=all_df) + \
    geom_bar()


Out[6]:
<ggplot: (300979097)>

In [ ]: