In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
    
In [2]:
    
# Exam scores for ten students; position i in both lists belongs to the same student.
english = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
# NOTE: `math` shadows the stdlib module name; the name is kept because later cells use it.
math = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
df_marker = pd.DataFrame({'English': english, 'Math': math})
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(df_marker.T)
    
    
In [3]:
    
# axis=0 refers to the index (row labels): order the rows by descending label, then transpose for display.
df_marker.sort_index(ascending=False, axis=0).transpose()
    
    Out[3]:
In [4]:
    
# axis=1 refers to the columns ("Math", "English"): order the columns by descending name, then transpose.
df_marker.sort_index(ascending=False, axis=1).transpose()
    
    Out[4]:
In [5]:
    
# Order the rows by the "Math" score, highest first, and transpose for display.
df_marker.sort_values(ascending=False, by="Math").transpose()
    
    Out[5]:
In [6]:
    
def tied_rank(x):
    """
    ref: https://github.com/bkgood/auc.git

    Compute the tied (average) rank of each element in x.

    Ranks are 1-based and assigned in ascending order of value. Elements
    that compare equal all receive the mean of the ranks they jointly
    occupy, e.g. ``[1, 2, 2, 3] -> [1.0, 2.5, 2.5, 4.0]``.

    Parameters
    ----------
    x : list of numbers, numpy array
        Values to rank. Must support ``len()`` and iteration.

    Returns
    -------
    score : list of float
            The tied rank of each element in x, in the same order as x.
            An empty input yields an empty list.
    """
    # Pair every value with its original position so ranks can be written
    # back in input order after sorting.
    sorted_x = sorted(zip(x, range(len(x))))
    n = len(sorted_x)
    r = [0.0] * n
    if n == 0:
        # Nothing to rank; the original code raised IndexError here.
        return r

    start = 0  # sorted position where the current run of equal values begins
    for i in range(1, n + 1):
        # A run ends when the input is exhausted or the value changes.
        if i == n or sorted_x[i][0] != sorted_x[start][0]:
            # The run covers 1-based ranks start+1 .. i; every member gets their mean.
            avg_rank = (start + 1 + i) / 2.0
            for j in range(start, i):
                r[sorted_x[j][1]] = avg_rank
            start = i
    return r
    
In [7]:
    
# Rank each subject's scores (1 = lowest score; ties share their average rank).
eng_rank = tied_rank(english)
math_rank = tied_rank(math)
# print() with a single argument works on both Python 2 and Python 3.
print(english)
print(eng_rank)
    
    
In [33]:
    
# Combine the raw scores with their tied ranks, one row per student.
df = pd.DataFrame({
    'English': english,
    'Math': math,
    'english_rank': eng_rank,
    'math_rank': math_rank,
})
# sort_values returns a new frame and leaves `df` untouched, so the sorted
# result must be the cell's last expression to be displayed. Previously the
# sorted frame was discarded and the unsorted `df` was shown instead.
df.sort_values(by="english_rank", ascending=True)
    
    Out[33]:
In [34]:
    
# Select the rows whose English score is 45. `.ix` was deprecated and then
# removed from pandas; `.loc` accepts the same boolean mask.
df.loc[df.English == 45]
    
    Out[34]:
In [42]:
    
# Keep only the rows whose English score is one of the listed values.
df.loc[df['English'].isin([45, 71])]
    
    Out[42]:
In [26]:
    
x = np.random.randint(0, 100, 10)  # 10 random integers drawn from [0, 100)
y = np.random.randn(10)            # 10 samples from the standard normal distribution
ls = [[a, b] for a, b in zip(list(x), list(y))]
test_df = pd.DataFrame(ls, columns=['Int', 'float'])  # each sub-list becomes one row
# `.ix` was deprecated and then removed from pandas; `.loc` accepts the same boolean mask.
# The values are random, so this selection is frequently empty.
test_df.loc[test_df.Int == 10]
    
    Out[26]:
In [20]:
    
# Time one evaluation of the expression with IPython's %time line magic.
# NOTE(review): under Python 2 the comprehension variable `x` leaks into the
# notebook namespace and would rebind the `x` array created in an earlier cell — confirm.
%time sum([x for x in range(10000)])
    
    
    Out[20]:
In [23]:
    
# Run the external script test.py through IPython's %run magic.
# The script is not part of this notebook; the path is relative to the working directory.
%run test.py
    
    
In [24]:
    
#%quickref
    
In [3]:
    
import pandas as pd
import os
    
In [6]:
    
# Gzipped CSV to load in the following cell.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere unchanged.
pdb_labels_file = '/home/huizhu/git_test/deepchem/datasets/muv.csv.gz'
# print() with a single argument works on both Python 2 and Python 3.
print(os.path.exists(pdb_labels_file))
    
    
In [13]:
    
# NOTE: this rebinds `df`, which earlier cells used for the exam-score frame.
df = pd.read_csv(pdb_labels_file)
# print() with a single argument works on both Python 2 and Python 3.
print(df.shape)
df.head()
    
    
    Out[13]:
In [20]:
    
count_values1 = df['MUV-466'].value_counts()  # occurrences of each distinct value
count_values2 = pd.isnull(df['MUV-466']).value_counts()  # True/False counts of the null mask (see also pd.notnull())
# The original `print count_values1; count_values2` was two statements on one
# line: a Python 2 print, then a bare expression shown as the cell output.
print(count_values1)
count_values2
    
    
    Out[20]:
In [17]:
    
# Group the rows by each combination of the two label columns.
group = df.groupby(by=['MUV-466', 'MUV-548'])
# Number of rows in each (MUV-466, MUV-548) group.
group.size()
    
    Out[17]:
In [22]:
    
# Pivot the group sizes into a table (inner grouping key becomes the columns)
# and replace the missing combinations with 0.
(
    group
    .size()
    .unstack()
    .fillna(0)
)
    
    Out[22]:
In [ ]: