learn pandas


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

两列成绩数据


In [2]:
english = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
math = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
df_marker = pd.DataFrame({'English': english, 'Math': math})
print df_marker.T


          0   1   2   3   4   5   6   7   8   9
English  56  75  45  71  62  64  58  80  76  61
Math     66  70  40  60  65  56  59  77  67  63

对数据进行排序


In [3]:
df_marker.sort_index(axis=0, ascending=False).T # axis=0 表示 index


Out[3]:
9 8 7 6 5 4 3 2 1 0
English 61 76 80 58 64 62 71 45 75 56
Math 63 67 77 59 56 65 60 40 70 66

In [4]:
df_marker.sort_index(axis=1, ascending=False).T # axis=1 表示 columns ("Math", "English")


Out[4]:
0 1 2 3 4 5 6 7 8 9
Math 66 70 40 60 65 56 59 77 67 63
English 56 75 45 71 62 64 58 80 76 61

In [5]:
df_marker.sort_values(by="Math", ascending=False).T


Out[5]:
7 1 8 0 4 9 3 6 5 2
English 80 75 76 56 62 61 71 58 64 45
Math 77 70 67 66 65 63 60 59 56 40

In [6]:
def tied_rank(x):
    """
    ref: https://github.com/bkgood/auc.git
    
    Computes the tied rank of elements in x.

    This function computes the tied rank of elements in x.

    Parameters
    ----------
    x : list of numbers, numpy array

    Returns
    -------
    score : list of numbers
            The tied rank f each element in x

    """
    sorted_x = sorted(zip(x,range(len(x))))
    r = [0 for k in x]
    cur_val = sorted_x[0][0]
    last_rank = 0
    for i in range(len(sorted_x)):
        if cur_val != sorted_x[i][0]:
            cur_val = sorted_x[i][0]
            for j in range(last_rank, i): 
                r[sorted_x[j][1]] = float(last_rank+1+i)/2.0
            last_rank = i
        if i==len(sorted_x)-1:
            for j in range(last_rank, i+1): 
                r[sorted_x[j][1]] = float(last_rank+i+2)/2.0
    return r

In [7]:
eng_rank = tied_rank(english)
math_rank = tied_rank(math)

print english
print eng_rank


[56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
[2.0, 8.0, 1.0, 7.0, 5.0, 6.0, 3.0, 10.0, 9.0, 4.0]

In [33]:
df = pd.DataFrame({'English':english, 'Math': math,
                   'english_rank': eng_rank, 'math_rank': math_rank
                  })
df.sort_values(by="english_rank", ascending=True)
df


Out[33]:
English Math english_rank math_rank
0 56 66 2.0 7.0
1 75 70 8.0 9.0
2 45 40 1.0 1.0
3 71 60 7.0 4.0
4 62 65 5.0 6.0
5 64 56 6.0 2.0
6 58 59 3.0 3.0
7 80 77 10.0 10.0
8 76 67 9.0 8.0
9 61 63 4.0 5.0

In [34]:
df.ix[df.English==45]


Out[34]:
English Math english_rank math_rank
2 45 40 1.0 1.0

In [42]:
df[df['English'].isin([45, 71])]


Out[42]:
English Math english_rank math_rank
2 45 40 1.0 1.0
3 71 60 7.0 4.0

In [26]:
x = np.random.randint(0, 100, 10)
y = np.random.randn(10)
ls = [[a, b] for a, b in zip(list(x), list(y))]
test_df = pd.DataFrame(ls, columns=['Int', 'float']) # 每一个 sub_list 表示一行
test_df
test_df.ix[test_df.Int==10]


Out[26]:
Int float

In [20]:
%time sum([x for x in range(10000)])


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.12 ms
Out[20]:
49995000

In [23]:
%run test.py


[[0 1 2]
 [2 1 0]]
[[ 1. -1.]
 [-1.  1.]]

In [24]:
#%quickref

In [3]:
import pandas as pd
import os

In [6]:
pdb_labels_file = '/home/huizhu/git_test/deepchem/datasets/muv.csv.gz'
print os.path.exists(pdb_labels_file)


True

In [13]:
df = pd.read_csv(pdb_labels_file)
print df.shape
df.head()


(93127, 19)
Out[13]:
MUV-466 MUV-548 MUV-600 MUV-644 MUV-652 MUV-689 MUV-692 MUV-712 MUV-713 MUV-733 MUV-737 MUV-810 MUV-832 MUV-846 MUV-852 MUV-858 MUV-859 mol_id smiles
0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN NaN NaN 0.0 NaN NaN NaN NaN NaN CID2999678 Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C
1 0.0 0.0 NaN NaN 0.0 0.0 0.0 NaN NaN NaN 0.0 NaN 0.0 NaN NaN 0.0 0.0 CID2999679 Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1
2 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 CID2999672 COc1cc2c(cc1NC(=O)CN1C(=O)NC3(CCc4ccccc43)C1=O...
3 NaN 0.0 0.0 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 NaN CID5390002 O=C1/C(=C/NC2CCS(=O)(=O)C2)c2ccccc2C(=O)N1c1cc...
4 0.0 NaN NaN NaN 0.0 NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN CID2999670 NC(=O)NC(Cc1ccccc1)C(=O)O

In [20]:
count_values1 = df['MUV-466'].value_counts()  # count values
count_values2 = pd.isnull(df['MUV-466']).value_counts() # isnull, pd.notnull()
print count_values1; count_values2


0.0    14817
1.0       27
Name: MUV-466, dtype: int64
Out[20]:
True     78283
False    14844
Name: MUV-466, dtype: int64

In [17]:
group = df.groupby(['MUV-466', 'MUV-548'])
group.size()


Out[17]:
MUV-466  MUV-548
0.0      0.0        4224
         1.0           4
1.0      0.0           8
dtype: int64

In [22]:
group.size().unstack().fillna(0)


Out[22]:
MUV-548 0.0 1.0
MUV-466
0.0 4224.0 4.0
1.0 8.0 0.0

In [ ]: