In [2]:
print('Note')
import numpy as np
"""
Pandas tips and tricks
"""
print('')
In [3]:
import pandas as pd
import numpy as np
import IPython
import sys
print(sys.version, '\n')
print('{}: {}'.format('pandas', pd.__version__))
print('{}: {}'.format('numpy', np.__version__))
print('{}: {}'.format('IPython', IPython.__version__))
In [4]:
class display_df(object):
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return '\n'.join(self.template.format(a, eval(a)._repr_html_()) for a in self.args)
def __repr__(self):
return '\n\n'.join(a + '\n' + repr(eval(a)) for a in self.args)
def display_command(command):
print('> {}:\n {}\n'.format(command, eval(command)))
In [5]:
def make_toy_df(cols, ind):
data = {c: [str(c) + str(i) for i in ind] for c in cols}
return pd.DataFrame(data, ind)
# example DataFrame
make_toy_df('ABC', range(3))
Out[5]:
In [ ]:
In [6]:
df = make_toy_df('ABC', range(5))
print('type of {}: {}'.format('df', type(df)))
df
Out[6]:
In [7]:
print('type of {}: {}'.format("df['A']", type(df['A'])))
df['A']
Out[7]:
In [8]:
## dictionary-style & attribute-style to access a column
print(df['A'])
print(df.A)
print(df['A'] is df.A)
In [9]:
## add a new column
df['shape'] = list(range(df.shape[0]))
display_command("df['A'] is df.A")
display_command("df['shape'] is df.shape") # if already have this method
df
Out[9]:
In [10]:
## drop column
df = df.drop(labels='shape', axis=1)
df
Out[10]:
In [ ]:
In [11]:
## we can use column_name_list to select multi columns
display_df('df', "df[['A']]", "df[['A', 'C']]")
Out[11]:
In [12]:
## iloc & loc
display_df('df[2:4]', 'df.iloc[2:4, 1:3]', "df.loc[2:4, 'A':'C']")
Out[12]:
In [13]:
df['Q1'] = np.random.randint(10, size=len(df))
df['Q2'] = np.random.randint(10, size=len(df))
display_df('df', "df[df['Q1']>5]", "df[(df['Q1']>5) & (df['Q2']<5)]")
Out[13]:
In [14]:
display_df("df[(df['Q1']>5) & (df['Q2']<5)]", "df[(df['Q1']>5) | (df['Q2']<5)]", "df[(df['Q1']>5) ^ (df['Q2']<5)]")
Out[14]:
In [15]:
df['name'] = ['Kai', 'Jim', 'Toby', 'Frank', 'YiHsin']
boy_list = ['Kai', 'Jim', 'Toby', 'Frank']
display_df("df", "df[df['name'].isin(boy_list)]", "df[~df['name'].isin(boy_list)]")
Out[15]:
In [16]:
df['NaN_test'] = [100, None, np.nan, '', np.nan]
display_df("df", "df[df['NaN_test'].isna()]", "df[~df['NaN_test'].isna()]")
Out[16]:
In [17]:
display_command('None == None')
display_command('np.nan == np.nan')
display_command('np.nan == None')
In [ ]:
In [20]:
df1 = make_toy_df('AB', range(3))
df2 = make_toy_df('AB', range(2))
display_df('df1', 'df2', 'pd.concat([df1, df2])',
'pd.concat([df1, df2], ignore_index=True)', 'pd.concat([df1, df2], axis=1)')
Out[20]:
In [21]:
df1 = make_toy_df('AB', range(5))
df2 = make_toy_df('AC', range(2, 6))
display_df('df1', 'df2', "pd.merge(df1, df2, on='A')", "pd.merge(df1, df2, how='inner')",
"pd.merge(df1, df2, how='outer')", "pd.merge(df1, df2, how='left')")
Out[21]:
In [ ]:
In [24]:
def my_func(x, y):
return (x**2 + y**2)**0.5
## prepare data
size = 30000
df = pd.DataFrame({'x': np.random.randint(100, size=size),
'y': np.random.randint(100, size=size)})
display_command('df.shape')
df.head()
Out[24]:
In [25]:
%%timeit
distance_list = []
for i in range(0, len(df)):
d = my_func(df.iloc[i]['x'], df.iloc[i]['y'])
distance_list.append(d)
df['distance'] = distance_list
In [26]:
%%timeit
distance_list = []
for index, row in df.iterrows():
distance_list.append(my_func(row['x'], row['y']))
df['distance'] = distance_list
In [27]:
%%timeit
df['distance'] = df.apply(lambda row: my_func(row['x'], row['y']), axis=1)
In [28]:
%%timeit
df['distance'] = my_func(df['x'], df['y'])
In [29]:
%%timeit
df['distance'] = my_func(df['x'].values, df['y'].values)
In [ ]:
In [1]:
import pandas as pd
import numpy as np
data_path = './data_0627/urcos20180626.pd'
df_urcos = pd.read_pickle(data_path)
column_name = ['id', 'userid', 'username', 'prodid',
'prodname', 'score', 'type', 'season',
'skin', 'age', 'content', 'effects', 'reviewid',
'pageview', 'likes', 'pdate', 'crawl_time']
df_urcos.columns = column_name
print(df_urcos.shape)
In [2]:
import jieba
dict_path = './word_dict/try_dict.txt'
jieba.load_userdict(dict_path)
def my_func2(x):
return jieba.lcut(x)
print(my_func2('智慧捷運零元月票'))
print(my_func2('忠孝東路高架化'))
print(my_func2('忠孝東路路下化'))
print(my_func2('忠孝東路直接中斷'))
In [3]:
%%time
df_urcos['content_cut'] = df_urcos['content'].apply(lambda x: my_func2(x))
print(df_urcos['content_cut'].head(), '\n')
In [4]:
%%time
import multiprocessing
def _apply_df(args):
df, func, kwargs = args
return df.apply(func, **kwargs)
def apply_by_multiprocessing(df, func, **kwargs):
workers = kwargs.pop('workers')
pool = multiprocessing.Pool(processes=workers)
result = pool.map(_apply_df, [(d, func, kwargs) for d in np.array_split(df, workers)])
pool.close()
return pd.concat(list(result))
## Number of CPU cores on your system
cores = multiprocessing.cpu_count()
df_urcos['content_cut'] = apply_by_multiprocessing(df_urcos['content'], my_func2, workers=cores)
print(df_urcos['content_cut'].head(), '\n')
In [ ]:
In [34]:
def my_func3(x, y):
return x + y > 195
df[df.apply(lambda row: my_func3(row['x'], row['y']), axis=1)]
Out[34]:
In [ ]:
In [ ]:
In [ ]:
In [35]:
groupby_pic() # this function build at the end
In [36]:
df_friends = pd.DataFrame([['Chandler Bing','party','2017-08-04 08:00:00',51],
['Chandler Bing','coffee','2017-08-04 13:00:00',60],
['Chandler Bing','party','2017-08-04 15:00:00',59],
['Chandler Bing','coffee','2017-08-04 16:00:00',40],
['Joey Tribbiani','party','2017-08-04 09:00:00',54],
['Joey Tribbiani','party','2017-08-04 10:00:00',67],
['Joey Tribbiani','football','2017-08-04 08:00:00',84],
['Gunther','work','2017-08-04 14:00:00']],
columns=['name','activity','timestamp','money_spent'])
df_friends['timestamp'] = pd.to_datetime(df_friends['timestamp'], format='%Y-%m-%d %H:%M:%S')
df_friends
Out[36]:
In [37]:
## iter group
for index, group in df_friends.groupby(['name']):
print('index: {}'.format(index))
IPython.display.display(group)
print()
In [38]:
display_df("df_friends.groupby(['name']).count()",
"df_friends.groupby(['name']).count().reset_index()") # count() don't count the NaN
Out[38]:
In [39]:
display_df("df_friends.groupby(['name', 'activity']).mean()",
"df_friends.groupby(['name', 'activity']).mean().reset_index()")
Out[39]:
In [40]:
## add_suffix(), add_prefix()
display_df("df_friends.groupby(['name'])['money_spent'].agg([np.sum, np.mean, np.max])",
"df_friends.groupby(['name'])['money_spent'].agg([np.sum, np.mean, np.max]).add_suffix('_money').reset_index()")
Out[40]:
In [41]:
def filter_func(x):
return x['money_spent'].mean() > 60
display_df("df_friends", "df_friends.groupby(['name']).filter(filter_func)")
Out[41]:
In [42]:
def centralize(x):
x['money_centralize'] = x['money_spent'] - x['money_spent'].mean()
return x
display_df("df_friends", "df_friends.groupby(['name']).apply(lambda x: centralize(x))")
Out[42]:
In [43]:
def coffee_money(df):
df_coffee = df[df['activity']=='coffee']
df_other = df[df['activity']!='coffee']
return pd.Series([df.shape[0],
df_coffee.shape[0], df_other.shape[0],
df_coffee['money_spent'].mean(), df_other['money_spent'].mean(),
df_coffee['money_spent'].sum(), df_other['money_spent'].sum()],
index=['total_count', 'coffee_count', 'other_count',
'coffee_money_mean', 'other_money_mean',
'coffee_money_sum', 'other_money_sum'])
display_df("df_friends", "df_friends.groupby(['name']).apply(lambda x: coffee_money(x)).reset_index()")
Out[43]:
In [ ]:
In [ ]:
In [ ]:
Usually use:
%time: Time the execution of a single statement%timeit: Time repeated execution of a single statement for more accuracy%prun: Run code with the profiler%lprun: Run code with the line-by-line profiler%memit: Measure the memory use of a single statement%mprun: Run code with the line-by-line memory profilerNote: %: line-magic, %%: cell-magic
In [44]:
## list all magic commands
%lsmagic
Out[44]:
In [ ]:
In [ ]:
http://pandas.pydata.org/pandas-docs/stable
https://jakevdp.github.io/PythonDataScienceHandbook
https://towardsdatascience.com/pandas-tips-and-tricks-33bcc8a40bb9
https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6
https://gist.github.com/yong27/7869662
In [ ]:
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# from:
# https://jakevdp.github.io/PythonDataScienceHandbook/06.00-figure-code.html#Split-Apply-Combine
def draw_dataframe(df, loc=None, width=None, ax=None, linestyle=None,
textstyle=None):
loc = loc or [0, 0]
width = width or 1
x, y = loc
if ax is None:
ax = plt.gca()
ncols = len(df.columns) + 1
nrows = len(df.index) + 1
dx = dy = width / ncols
if linestyle is None:
linestyle = {'color':'black'}
if textstyle is None:
textstyle = {'size': 12}
textstyle.update({'ha':'center', 'va':'center'})
# draw vertical lines
for i in range(ncols + 1):
plt.plot(2 * [x + i * dx], [y, y + dy * nrows], **linestyle)
# draw horizontal lines
for i in range(nrows + 1):
plt.plot([x, x + dx * ncols], 2 * [y + i * dy], **linestyle)
# Create index labels
for i in range(nrows - 1):
plt.text(x + 0.5 * dx, y + (i + 0.5) * dy,
str(df.index[::-1][i]), **textstyle)
# Create column labels
for i in range(ncols - 1):
plt.text(x + (i + 1.5) * dx, y + (nrows - 0.5) * dy,
str(df.columns[i]), style='italic', **textstyle)
# Add index label
if df.index.name:
plt.text(x + 0.5 * dx, y + (nrows - 0.5) * dy,
str(df.index.name), style='italic', **textstyle)
# Insert data
for i in range(nrows - 1):
for j in range(ncols - 1):
plt.text(x + (j + 1.5) * dx,
y + (i + 0.5) * dy,
str(df.values[::-1][i, j]), **textstyle)
#----------------------------------------------------------
# Draw figure
import pandas as pd
def groupby_pic():
df = pd.DataFrame({'data': [1, 2, 3, 4, 5, 6]},
index=['A', 'B', 'C', 'A', 'B', 'C'])
df.index.name = 'key'
fig = plt.figure(figsize=(8, 6), facecolor='white')
ax = plt.axes([0, 0, 1, 1])
ax.axis('off')
draw_dataframe(df, [0, 0])
for y, ind in zip([3, 1, -1], 'ABC'):
split = df[df.index == ind]
draw_dataframe(split, [2, y])
sum = pd.DataFrame(split.sum()).T
sum.index = [ind]
sum.index.name = 'key'
sum.columns = ['data']
draw_dataframe(sum, [4, y + 0.25])
result = df.groupby(df.index).sum()
draw_dataframe(result, [6, 0.75])
style = dict(fontsize=14, ha='center', weight='bold')
plt.text(0.5, 3.6, "Input", **style)
plt.text(2.5, 4.6, "Split", **style)
plt.text(4.5, 4.35, "Apply (sum)", **style)
plt.text(6.5, 2.85, "Combine", **style)
arrowprops = dict(facecolor='black', width=1, headwidth=6)
plt.annotate('', (1.8, 3.6), (1.2, 2.8), arrowprops=arrowprops)
plt.annotate('', (1.8, 1.75), (1.2, 1.75), arrowprops=arrowprops)
plt.annotate('', (1.8, -0.1), (1.2, 0.7), arrowprops=arrowprops)
plt.annotate('', (3.8, 3.8), (3.2, 3.8), arrowprops=arrowprops)
plt.annotate('', (3.8, 1.75), (3.2, 1.75), arrowprops=arrowprops)
plt.annotate('', (3.8, -0.3), (3.2, -0.3), arrowprops=arrowprops)
plt.annotate('', (5.8, 2.8), (5.2, 3.6), arrowprops=arrowprops)
plt.annotate('', (5.8, 1.75), (5.2, 1.75), arrowprops=arrowprops)
plt.annotate('', (5.8, 0.7), (5.2, -0.1), arrowprops=arrowprops)
plt.axis('equal')
plt.ylim(-1.5, 5)
plt.show()
In [ ]:
In [ ]: