In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Ten scores per subject; the DataFrame pairs english[i] with math[i] in row i.
english = [56, 75, 45, 71, 62, 64, 58, 80, 76, 61]
math = [66, 70, 40, 60, 65, 56, 59, 77, 67, 63]
df_marker = pd.DataFrame({'English': english, 'Math': math})
# Transposed so each subject is one row of scores.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df_marker.T)
In [3]:
# axis=0 sorts by the row labels (the index); shown transposed.
df_marker.sort_index(ascending=False, axis=0).transpose()
Out[3]:
In [4]:
# axis=1 sorts by the column labels, giving the order ("Math", "English"); shown transposed.
df_marker.sort_index(ascending=False, axis=1).transpose()
Out[4]:
In [5]:
# Order the rows by Math score, highest first; shown transposed.
df_marker.sort_values(ascending=False, by="Math").transpose()
Out[5]:
In [6]:
def tied_rank(x):
    """
    Compute the tied rank of each element in x.

    ref: https://github.com/bkgood/auc.git

    Ranks are 1-based and follow ascending order of value. Elements that
    compare equal all receive the mean of the rank positions they occupy,
    e.g. [1, 2, 2, 3] -> [1.0, 2.5, 2.5, 4.0].

    Parameters
    ----------
    x : list of numbers, numpy array

    Returns
    -------
    score : list of numbers
            The tied rank of each element in x, in the same order as x.
            An empty input yields an empty list.
    """
    # Pair each value with its original position so ranks can be written
    # back in input order after sorting.
    sorted_x = sorted(zip(x, range(len(x))))
    r = [0] * len(sorted_x)
    if not sorted_x:
        # Nothing to rank; without this guard sorted_x[0] raises IndexError.
        return r
    cur_val = sorted_x[0][0]
    last_rank = 0  # sorted position where the current run of equal values starts
    for i in range(len(sorted_x)):
        if cur_val != sorted_x[i][0]:
            # The run [last_rank, i) just ended: give every member the mean
            # of the 1-based ranks last_rank+1 .. i.
            cur_val = sorted_x[i][0]
            for j in range(last_rank, i):
                r[sorted_x[j][1]] = float(last_rank + 1 + i) / 2.0
            last_rank = i
        if i == len(sorted_x) - 1:
            # Final run [last_rank, i]: mean of ranks last_rank+1 .. i+1.
            for j in range(last_rank, i + 1):
                r[sorted_x[j][1]] = float(last_rank + i + 2) / 2.0
    return r
In [7]:
# Rank each subject's scores with tied_rank (rank 1 = lowest score).
eng_rank = tied_rank(english)
math_rank = tied_rank(math)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(english)
print(eng_rank)
In [33]:
# Raw scores alongside their tied ranks, one row per list position.
df = pd.DataFrame({'English': english, 'Math': math,
                   'english_rank': eng_rank, 'math_rank': math_rank
                   })
# sort_values returns a new frame and leaves df untouched, so the sorted view
# must be the cell's final expression to be displayed. Previously the result
# was discarded and the unsorted df was shown instead.
df.sort_values(by="english_rank", ascending=True)
Out[33]:
In [34]:
# Rows whose English score is exactly 45.
# .ix was deprecated in pandas 0.20 and removed in 1.0; a boolean mask goes through .loc.
df.loc[df.English == 45]
Out[34]:
In [42]:
# Keep the rows whose English score is one of the listed values.
df.loc[df['English'].isin([45, 71]), :]
Out[42]:
In [26]:
# Ten random integers in [0, 100) and ten standard-normal floats.
# NOTE: no seed is set in this cell, so the values (and whether any row
# matches Int == 10 below) can differ from run to run.
x = np.random.randint(0, 100, 10)
y = np.random.randn(10)
ls = [[a, b] for a, b in zip(list(x), list(y))]
test_df = pd.DataFrame(ls, columns=['Int', 'float'])  # each sub-list becomes one row
# .ix was deprecated in pandas 0.20 and removed in 1.0; a boolean mask goes through .loc.
test_df.loc[test_df.Int == 10]
Out[26]:
In [20]:
# Time one evaluation of summing a 10,000-element list built by a comprehension.
%time sum([x for x in range(10000)])
Out[20]:
In [23]:
# Execute the script test.py (given as a relative path; the file is not part of this notebook).
%run test.py
In [24]:
#%quickref
In [3]:
import pandas as pd
import os
In [6]:
# NOTE(review): absolute, machine-specific path; this cell only reports whether
# the file exists on the current machine. Consider a configurable data directory.
pdb_labels_file = '/home/huizhu/git_test/deepchem/datasets/muv.csv.gz'
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(os.path.exists(pdb_labels_file))
In [13]:
# Load the CSV at pdb_labels_file. Note that this rebinds the name df.
df = pd.read_csv(pdb_labels_file)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df.shape)
df.head()
Out[13]:
In [20]:
# Frequency of each distinct value in the column (value_counts skips NaN by default).
count_values1 = df['MUV-466'].value_counts()
# Count of null (True) versus non-null (False) entries; pd.notnull() is the inverse test.
count_values2 = pd.isnull(df['MUV-466']).value_counts()
# The first result is printed and the second is the cell's displayed value.
# The old `print a; b` form is Python 2 only.
print(count_values1)
count_values2
Out[20]:
In [17]:
# Group rows by each observed (MUV-466, MUV-548) pair, then show how many rows each group holds.
group = df.groupby(by=['MUV-466', 'MUV-548'])
group.size()
Out[17]:
In [22]:
# Pivot the group sizes into a table (inner group key becomes the columns)
# and fill combinations that never occur with 0.
(
    group.size()
    .unstack()
    .fillna(0)
)
Out[22]:
In [ ]: