learn pandas



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

两列成绩数据



In [2]:

    
english = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
math = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
df_marker = pd.DataFrame({'English': english, 'Math': math})
print df_marker.T









    



          0   1   2   3   4   5   6   7   8   9
English  56  75  45  71  62  64  58  80  76  61
Math     66  70  40  60  65  56  59  77  67  63

对数据进行排序



In [3]:

    
df_marker.sort_index(axis=0, ascending=False).T # axis=0 表示 index



In [4]:

    
df_marker.sort_index(axis=1, ascending=False).T # axis=1 表示 columns ("Math", "English")



In [5]:

    
df_marker.sort_values(by="Math", ascending=False).T



In [6]:

    
def tied_rank(x):
    """
    ref: https://github.com/bkgood/auc.git
    
    Computes the tied rank of elements in x.

    This function computes the tied rank of elements in x.

    Parameters
    ----------
    x : list of numbers, numpy array

    Returns
    -------
    score : list of numbers
            The tied rank f each element in x

    """
    sorted_x = sorted(zip(x,range(len(x))))
    r = [0 for k in x]
    cur_val = sorted_x[0][0]
    last_rank = 0
    for i in range(len(sorted_x)):
        if cur_val != sorted_x[i][0]:
            cur_val = sorted_x[i][0]
            for j in range(last_rank, i): 
                r[sorted_x[j][1]] = float(last_rank+1+i)/2.0
            last_rank = i
        if i==len(sorted_x)-1:
            for j in range(last_rank, i+1): 
                r[sorted_x[j][1]] = float(last_rank+i+2)/2.0
    return r



In [7]:

    
eng_rank = tied_rank(english)
math_rank = tied_rank(math)

print english
print eng_rank









    



[56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
[2.0, 8.0, 1.0, 7.0, 5.0, 6.0, 3.0, 10.0, 9.0, 4.0]



In [33]:

    
df = pd.DataFrame({'English':english, 'Math': math,
                   'english_rank': eng_rank, 'math_rank': math_rank
                  })
df.sort_values(by="english_rank", ascending=True)
df









    Out[33]:






  
    
      
      English
      Math
      english_rank
      math_rank
    
  
  
    
      0
      56
      66
      2.0
      7.0
    
    
      1
      75
      70
      8.0
      9.0
    
    
      2
      45
      40
      1.0
      1.0
    
    
      3
      71
      60
      7.0
      4.0
    
    
      4
      62
      65
      5.0
      6.0
    
    
      5
      64
      56
      6.0
      2.0
    
    
      6
      58
      59
      3.0
      3.0
    
    
      7
      80
      77
      10.0
      10.0
    
    
      8
      76
      67
      9.0
      8.0
    
    
      9
      61
      63
      4.0
      5.0



In [34]:

    
df.ix[df.English==45]









    Out[34]:






  
    
      
      English
      Math
      english_rank
      math_rank
    
  
  
    
      2
      45
      40
      1.0
      1.0



In [42]:

    
df[df['English'].isin([45, 71])]









    Out[42]:






  
    
      
      English
      Math
      english_rank
      math_rank
    
  
  
    
      2
      45
      40
      1.0
      1.0
    
    
      3
      71
      60
      7.0
      4.0



In [26]:

    
x = np.random.randint(0, 100, 10)
y = np.random.randn(10)
ls = [[a, b] for a, b in zip(list(x), list(y))]
test_df = pd.DataFrame(ls, columns=['Int', 'float']) #　每一个　sub_list 表示一行
test_df
test_df.ix[test_df.Int==10]









    Out[26]:






  
    
      
      Int
      float



In [20]:

    
%time sum([x for x in range(10000)])









    



CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.12 ms






    Out[20]:





49995000



In [23]:

    
%run test.py









    



[[0 1 2]
 [2 1 0]]
[[ 1. -1.]
 [-1.  1.]]



In [24]:

    
#%quickref



In [3]:

    
import pandas as pd
import os



In [6]:

    
pdb_labels_file = '/home/huizhu/git_test/deepchem/datasets/muv.csv.gz'
print os.path.exists(pdb_labels_file)









    



True



In [13]:

    
df = pd.read_csv(pdb_labels_file)
print df.shape
df.head()









    



(93127, 19)






    Out[13]:






  
    
      
      MUV-466
      MUV-548
      MUV-600
      MUV-644
      MUV-652
      MUV-689
      MUV-692
      MUV-712
      MUV-713
      MUV-733
      MUV-737
      MUV-810
      MUV-832
      MUV-846
      MUV-852
      MUV-858
      MUV-859
      mol_id
      smiles
    
  
  
    
      0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      0.0
      NaN
      NaN
      NaN
      0.0
      NaN
      NaN
      NaN
      NaN
      NaN
      CID2999678
      Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C
    
    
      1
      0.0
      0.0
      NaN
      NaN
      0.0
      0.0
      0.0
      NaN
      NaN
      NaN
      0.0
      NaN
      0.0
      NaN
      NaN
      0.0
      0.0
      CID2999679
      Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1
    
    
      2
      NaN
      NaN
      0.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      0.0
      CID2999672
      COc1cc2c(cc1NC(=O)CN1C(=O)NC3(CCc4ccccc43)C1=O...
    
    
      3
      NaN
      0.0
      0.0
      NaN
      NaN
      0.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      0.0
      NaN
      CID5390002
      O=C1/C(=C/NC2CCS(=O)(=O)C2)c2ccccc2C(=O)N1c1cc...
    
    
      4
      0.0
      NaN
      NaN
      NaN
      0.0
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      CID2999670
      NC(=O)NC(Cc1ccccc1)C(=O)O



In [20]:

    
count_values1 = df['MUV-466'].value_counts()  # count values
count_values2 = pd.isnull(df['MUV-466']).value_counts() # isnull, pd.notnull()
print count_values1; count_values2









    



0.0    14817
1.0       27
Name: MUV-466, dtype: int64






    Out[20]:





True     78283
False    14844
Name: MUV-466, dtype: int64



In [17]:

    
group = df.groupby(['MUV-466', 'MUV-548'])
group.size()









    Out[17]:





MUV-466  MUV-548
0.0      0.0        4224
         1.0           4
1.0      0.0           8
dtype: int64



In [22]:

    
group.size().unstack().fillna(0)



In [ ]:

	English	Math	english_rank	math_rank
0	56	66	2.0	7.0
1	75	70	8.0	9.0
2	45	40	1.0	1.0
3	71	60	7.0	4.0
4	62	65	5.0	6.0
5	64	56	6.0	2.0
6	58	59	3.0	3.0
7	80	77	10.0	10.0
8	76	67	9.0	8.0
9	61	63	4.0	5.0

	MUV-466	MUV-548	MUV-600	MUV-644	MUV-652	MUV-689	MUV-692	MUV-712	MUV-713	MUV-733	MUV-737	MUV-810	MUV-832	MUV-846	MUV-852	MUV-858	MUV-859	mol_id	smiles
0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	CID2999678	Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C
1	0.0	0.0	NaN	NaN	0.0	0.0	0.0	NaN	NaN	NaN	0.0	NaN	0.0	NaN	NaN	0.0	0.0	CID2999679	Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1
2	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	CID2999672	COc1cc2c(cc1NC(=O)CN1C(=O)NC3(CCc4ccccc43)C1=O...
3	NaN	0.0	0.0	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	NaN	CID5390002	O=C1/C(=C/NC2CCS(=O)(=O)C2)c2ccccc2C(=O)N1c1cc...
4	0.0	NaN	NaN	NaN	0.0	NaN	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	CID2999670	NC(=O)NC(Cc1ccccc1)C(=O)O