In [1]:
import pandas as pd
import numpy as np
In [2]:
# Load the tips demo dataset used throughout this tutorial.
# NOTE(review): the old raw.github.com/.../master/pandas/tests/data/tips.csv
# path 404s — the test data moved to pandas/tests/io/data/csv/ and the
# default branch is now `main`.
url = ('https://raw.githubusercontent.com/pandas-dev/pandas/'
       'main/pandas/tests/io/data/csv/tips.csv')
tips = pd.read_csv(url)
tips.head()
Out[2]:
pandas 中对 DataFrame 直接传入字段名称列表即可选取对应的列(等价于 SQL 的 SELECT 列)
In [3]:
# SQL: SELECT total_bill, tip, smoker, time FROM tips LIMIT 5
cols = ['total_bill', 'tip', 'smoker', 'time']
tips.loc[:, cols].head(5)
Out[3]:
DataFrame取数时,如果不设置字段名称,就会取所有字段 (与 SQL 的 * 等效)
DataFrames过滤方法很多,最常用的是布尔索引(boolean indexing)
In [4]:
# SQL: SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5
tips.loc[tips['time'].eq('Dinner')].head(5)
Out[4]:
这行代码是向DataFrame传递一个True/False对象的Series,返回所有带True的行。
In [5]:
# Reusable boolean mask: True where the meal is a dinner.
is_dinner = tips['time'].eq('Dinner')
In [6]:
# Tally of True/False in the mask: how many rows satisfy the condition.
is_dinner.value_counts()
Out[6]:
In [7]:
# Apply the precomputed boolean mask; .loc makes the row selection explicit.
tips.loc[is_dinner].head(5)
Out[7]:
与SQL的OR/AND类似,DataFrame 用 | (OR) 与 & (AND)
SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;
In [8]:
# SQL AND -> pandas &; each condition must be parenthesized (or named, as here).
dinner = tips['time'] == 'Dinner'
big_tip = tips['tip'] > 5.00
tips.loc[dinner & big_tip]
Out[8]:
SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;
In [9]:
# SQL OR -> pandas |.
large_party = tips['size'] >= 5
big_bill = tips['total_bill'] > 45
tips.loc[large_party | big_bill]
Out[9]:
用 notna() 与 isna() 方法检测 NULL
In [10]:
# Demo frame with missing values for the IS NULL / IS NOT NULL examples.
# NOTE: the np.NaN alias was removed in NumPy 2.0 — use np.nan.
frame = pd.DataFrame({
    'col1': ['A', 'B', np.nan, 'C', 'D'],
    'col2': ['F', np.nan, 'G', 'H', 'I']
})
In [11]:
# Display the frame, NaNs included.
frame
Out[11]:
如果用SQL筛选col2是NULL:
SELECT *
FROM frame
WHERE col2 IS NULL;
pandas用isna方法
In [12]:
# SQL: WHERE col2 IS NULL
mask_null = frame['col2'].isna()
frame.loc[mask_null]
Out[12]:
用SQL筛选col1不是NULL:
SELECT *
FROM frame
WHERE col1 IS NOT NULL;
pandas用notna()方法
In [13]:
# SQL: WHERE col1 IS NOT NULL
frame.loc[frame['col1'].notna()]
Out[13]:
SQL 的 GROUP BY 对应 pandas 的 groupby() 方法:
In [14]:
# SQL: SELECT sex, COUNT(*) FROM tips GROUP BY sex;
tips.groupby('sex').size()
Out[14]:
这里用pandas的size()方法,不是count()方法。因为count()方法会自动应用到每一字段,返回所有字段的计数。
In [15]:
# count() runs per column, so every column shows its non-null count per group.
tips.groupby('sex').count()
Out[15]:
也可以对一个字段用count()方法,实现同样效果:
In [16]:
# Equivalent form: group a single column by the sex Series, then count.
tips['total_bill'].groupby(tips['sex']).count()
Out[16]:
也可以一次使用多个聚合函数。假如,要按星期几分组,统计平均小费金额以及小费笔数,用SQL如下:
SELECT day, AVG(tip), COUNT(*)
FROM tips
GROUP BY day;
/*
Fri 2.734737 19
Sat 2.993103 87
Sun 3.255132 76
Thur 2.771452 62
*/
pandas用agg()方法实现,参数是一个Python字典(字段名称: 方法名称, ...)
In [17]:
# SQL: SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
# NOTE(review): passing NumPy callables (np.mean / np.size) to agg() is
# deprecated in pandas 2.x, and aggregating the grouping column itself
# ('day') in the dict is no longer supported — use string aliases on 'tip'.
tips.groupby('day').agg({'tip': ['mean', 'size']})
Out[17]:
SELECT smoker, day, COUNT(*), AVG(tip)
FROM tips
GROUP BY smoker, day;
/*
smoker day
No Fri 4 2.812500
Sat 45 3.102889
Sun 57 3.167895
Thur 45 2.673778
Yes Fri 15 2.714000
Sat 42 2.875476
Sun 19 3.516842
Thur 17 3.030000
*/
In [18]:
# SQL: SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
# NOTE(review): string aliases replace the deprecated np.size / np.mean
# callables (pandas 2.x FutureWarning).
tips.groupby(['smoker', 'day']).agg({'tip': ['size', 'mean']})
Out[18]:
In [19]:
# Left table for the JOIN examples: four keys with random values.
df1 = pd.DataFrame({
    'key': list('ABCD'),
    'value': np.random.randn(4),
})
In [20]:
# Right table: note the duplicated 'D' and the unmatched 'E'.
df2 = pd.DataFrame({
    'key': list('BDDE'),
    'value': np.random.randn(4),
})
In [21]:
# merge() defaults to an INNER JOIN on the shared key column.
df1.merge(df2, on='key')
Out[21]:
merge()也可以连接一个表的字段与另一个表的索引
In [22]:
# Move 'key' into the index so the next merge can join on it from the right.
indexed_df2 = df2.set_index(keys='key')
In [23]:
# Join df1's 'key' column against indexed_df2's index (inner by default).
df1.merge(indexed_df2, left_on='key', right_index=True)
Out[23]:
In [24]:
# SQL LEFT OUTER JOIN: keep every row of df1.
df1.merge(df2, how='left', on='key')
Out[24]:
In [25]:
# SQL RIGHT OUTER JOIN: keep every row of df2.
df1.merge(df2, how='right', on='key')
Out[25]:
In [26]:
# SQL FULL OUTER JOIN: keep rows from both sides.
df1.merge(df2, how='outer', on='key')
Out[26]:
In [27]:
# Two overlapping city tables for the UNION / UNION ALL examples
# ('Chicago' rank 1 appears in both).
df1 = pd.DataFrame({
    'city': ['Chicago', 'San Francisco', 'New York City'],
    'rank': [1, 2, 3],
})
df2 = pd.DataFrame({
    'city': ['Chicago', 'Boston', 'Los Angeles'],
    'rank': [1, 4, 5],
})
SELECT city, rank
FROM df1
UNION ALL
SELECT city, rank
FROM df2;
/*
city rank
Chicago 1
San Francisco 2
New York City 3
Chicago 1
Boston 4
Los Angeles 5
*/
In [28]:
# UNION ALL: stack the two frames, keeping duplicate rows.
pd.concat([df1, df2])
Out[28]:
SQL的UNION与UNION ALL类似,不过UNION会剔除重复行
SELECT city, rank
FROM df1
UNION
SELECT city, rank
FROM df2;
/*
city rank
Chicago 1
San Francisco 2
New York City 3
Boston 4
Los Angeles 5
*/
pandas需要用concat()+drop_duplicates()方法实现
In [29]:
# UNION: stack the frames, then drop exact duplicate rows.
pd.concat([df1, df2]).drop_duplicates()
Out[29]:
In [30]:
# Top-2 bills per day: sort by total_bill descending, number rows within each
# day group (cumcount() is 0-based, hence the +1), keep rn 1-2, sort for display.
tips.assign(rn=tips.sort_values(['total_bill'], ascending=False).groupby(
['day']).cumcount() + 1).query('rn < 3').sort_values(['day', 'rn'])
Out[30]:
也可以用rank(method='first')函数
In [31]:
# Same top-2-per-day result via rank(): method='first' breaks ties by order of
# appearance, so ranks are unique; keep rnk 1-2 and sort for display.
tips.assign(rnk=tips.groupby(['day'])['total_bill'].rank(
method='first', ascending=False)).query('rnk < 3').sort_values(['day', 'rnk'])
Out[31]:
找出每组消费低于2美金(tip < 2)的前两名(rank < 3),这里用rank(method='min')
SELECT * FROM (
SELECT
t.*,
RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
FROM tips t
WHERE tip < 2
)
WHERE rnk < 3
ORDER BY sex, rnk;
In [32]:
# Rows with tip < 2, ranked within each sex by tip ascending; method='min'
# gives tied values the same (minimum) rank — mirrors SQL's RANK() OVER
# (PARTITION BY sex ORDER BY tip). Keep ranks 1-2, sort for display.
tips[tips['tip'] < 2].assign(rnk_min=tips.groupby(['sex'])['tip'].rank(
method='min')).query('rnk_min < 3').sort_values(['sex', 'rnk_min'])
Out[32]:
In [33]:
# SQL UPDATE: double the tip wherever it is below 2, in place via .loc.
tips.loc[tips['tip'] < 2, 'tip'] *= 2
pandas 通过选择需要保留的行来创建新的 DataFrame,而不是直接删除数据(等价于 SQL 的 DELETE)
In [34]:
# SQL DELETE: keep only the rows we want (tip <= 9) and rebind the name.
tips = tips[tips['tip'] <= 9]