In [2]:
# 新的数据格式,csv
In [2]:
import pandas as pd
import numpy as np
In [50]:
# Read the score sheet (CSV) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
abs_path = '/Users/wangyujie/Desktop/成绩表.csv'
df = pd.read_csv(filepath_or_buffer=abs_path)
In [14]:
# Preview the first five rows of the loaded frame.
df.head(n=5)
Out[14]:
In [8]:
# Inspect the Python type of the object bound to df.
type(df)
Out[8]:
In [11]:
# Column labels first, then the row index, one per line.
print(df.columns, df.index, sep='\n')
In [13]:
# loc is label-based: fetch the row whose index label is 0.
df.loc[0]
Out[13]:
In [20]:
# Comparing an array with a scalar yields an element-wise boolean array;
# this is the mechanism behind the DataFrame filters used in this notebook.
a = np.arange(10)
a > 3
Out[20]:
In [23]:
# Keep only the rows whose 数学 (math) score is above 80.
df.loc[df['数学'] > 80]
Out[23]:
In [25]:
# Rows whose 数学 (math) score is below 70.
df[df['数学'].lt(70)]
Out[25]:
In [27]:
# Compound filter: rows where 语文, 数学 and 英语 are all above 80.
df[(df[['语文', '数学', '英语']] > 80).all(axis=1)]
Out[27]:
In [33]:
# Sort ascending by 数学, breaking ties with 语文 and then 英语; show the first five rows.
df.sort_values(by=['数学', '语文', '英语']).head(5)
Out[33]:
In [51]:
# Preview the leading rows of df.
df.head()
Out[51]:
In [39]:
# Locate a row by its index label.
df.loc[3]
Out[39]:
In [43]:
# Build a small frame by hand and label its rows with strings
# instead of the default integer positions.
scores = {
    '英语': [90, 78, 89],
    '数学': [64, 78, 45],
    '姓名': ['wong', 'li', 'sun'],
}
df = pd.DataFrame(scores)
df.index = ['one', 'two', 'three']
df
Out[43]:
In [44]:
# Show the row labels of df.
df.index
Out[44]:
In [47]:
# The index holds no integer labels at this point, so a lookup by integer
# label (the commented-out line below) does not work; use the string label.
# df.loc[1]
df.loc['one']
Out[47]:
In [49]:
# iloc is purely positional: this is literally the first row, whatever its label.
df.iloc[0]
Out[49]:
In [53]:
# DataFrame.ix used to combine label-based (loc) and position-based (iloc)
# lookup. It was deprecated in pandas 0.20 and removed in pandas 1.0, so the
# intent is spelled out explicitly: iloc[0] is the first row by position.
df.iloc[0]
Out[53]:
In [56]:
# Label-based slice up to label 2; loc slices include the end label.
# NOTE(review): presumably df has an integer index here — confirm.
df.loc[:2]
Out[56]:
In [58]:
# With an integer index the old .ix accessor behaved exactly like .loc
# (label-based, end label included). .ix was removed in pandas 1.0, so use
# .loc directly.
df.loc[:2]
Out[58]:
In [62]:
# Fetching a single row with plain brackets is an error
# (the commented-out line below shows the failing form).
# df[0]
# A slice inside plain brackets, however, does select rows.
df[:2]
Out[62]:
In [65]:
# The array that backs a DataFrame column.
df['数学'].values
Out[65]:
In [68]:
# Simple statistic: how many times each 数学 score occurs.
df['数学'].value_counts()
Out[68]:
In [70]:
# Preview the leading rows of df again.
df.head()
Out[70]:
In [78]:
# Pull out several columns at once by passing a list of labels,
# keeping only the first five rows.
new = df.loc[:, ['数学', '语文']].head(5)
new
Out[78]:
In [79]:
# Arithmetic on a frame is element-wise: every value is doubled.
new.mul(2)
Out[79]:
In [83]:
def func(score):
    """Map a numeric score to its grade label.

    Bands: ``>= 80`` -> "优秀", ``>= 70`` -> "良", ``>= 60`` -> "及格",
    anything lower -> "不及格".

    A missing score (``None`` / NaN) is returned unchanged instead of being
    graded: every comparison with NaN is False, so it would otherwise fall
    through to "不及格" and be indistinguishable from a real failing score.
    """
    if pd.isna(score):
        return score
    if score >= 80:
        return "优秀"
    elif score >= 70:
        return "良"
    elif score >= 60:
        return "及格"
    else:
        return "不及格"
# Series.map applies func to every 数学 score; the labels go into a new column.
df['数学分类'] = df['数学'].map(func)
In [84]:
# Preview the leading rows of df.
df.head()
Out[84]:
In [89]:
# Element-wise mapping applies a function to every individual cell of a
# DataFrame — a very important operation.
def func(number):
    """Return ``number`` increased by 10.

    Equivalent to the one-liner ``lambda number: number + 10``.
    """
    return number + 10


# pandas 2.1 renamed DataFrame.applymap to DataFrame.map and deprecated the
# old name. Pick whichever this pandas version provides; the check is made on
# the class so a column that happens to be named "map" cannot be mistaken for
# the method.
_elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, 'map') else pd.DataFrame.applymap
_elementwise(df, lambda x: str(x) + ' -').head(2)
Out[89]:
In [90]:
# List comprehension: add 100 to each of the integers 0..9.
[value + 100 for value in range(10)]
Out[90]:
In [94]:
def func(x):
    """Return ``x`` increased by 100."""
    return x + 100
In [96]:
# The same computation with map() and an anonymous function.
[*map(lambda value: value + 100, range(10))]
Out[96]:
In [115]:
# Build a new column from several existing ones: with axis=1, apply hands
# each row to the function.
df['new_score'] = df.apply(lambda row: row['数学'] + row['语文'], axis=1)
In [116]:
# A notebook cell renders only its final expression, so the first preview is
# wrapped in display() (provided by the IPython kernel); otherwise its result
# would be silently discarded.
# First rows
display(df.head(2))
# Last rows
df.tail(2)
Out[116]:
In [117]:
# Remove the derived column again; drop returns a new frame, which is rebound to df.
df = df.drop(columns=['new_score'])
In [119]:
# Preview the first two rows of df.
df.head(2)
Out[119]:
In [4]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
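# The line above (%matplotlib inline) makes figures render inside the notebook; recent Jupyter versions do this by default, so it is harmless but usually optional.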
In [134]:
# Sample 100 points on [0, 10] and draw sine and cosine on the same axes.
x = np.linspace(start=0, stop=10, num=100)
y = np.sin(x)
plt.plot(x, y)
plt.plot(x, np.cos(x))
Out[134]:
In [136]:
# The format string '--' draws the curve as a dashed line.
plt.plot(x, y, '--')
Out[136]:
In [137]:
# Create the figure explicitly and keep a handle to it in fig,
# then draw the dashed curve into it.
fig = plt.figure()
plt.plot(x, y, '--')
Out[137]:
In [138]:
# Write the figure held in fig to a PNG file.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
fig.savefig('/Users/wangyujie/Desktop/first_figure.png')
In [147]:
# Dashed-line style, drawn on a grid of 2 rows x 1 column of panels.
plt.subplot(2,1,2)  # make panel 2 (the lower one) current
plt.plot(x, np.sin(x), '--')
plt.subplot(2,1,1)  # switch to panel 1 (the upper one)
plt.plot(x, np.cos(x))
Out[147]:
In [146]:
# Marker style: the format string 'o' draws a circle at each of the 20 samples.
x = np.linspace(0,10,20)
plt.plot(x, np.sin(x), 'o')
Out[146]:
In [150]:
# The color keyword controls the colour of what is drawn.
x = np.linspace(0,10,20)
plt.plot(x, np.sin(x), 'o', color='red')
Out[150]:
In [162]:
# Attach a label to each curve so that it can appear in the legend.
x = np.linspace(start=0, stop=10, num=100)
y = np.sin(x)
plt.plot(
    x, y, '--',
    label='sin(x)',
)
plt.plot(
    x, np.cos(x), 'o',
    label='cos(x)',
)
# legend() renders the labels; loc chooses where the legend box is placed.
plt.legend(loc='upper right')
Out[162]:
In [163]:
# Appending ? to a name is IPython syntax for showing its documentation.
plt.legend?
# Whenever you meet an unfamiliar function, use ? freely to read its docs.
In [185]:
# plot() accepts a large number of customisation parameters.
x = np.linspace(start=0, stop=10, num=20)
y = np.sin(x)
plt.plot(
    x, y, '-d',
    color='orange',
    markersize=16,
    linewidth=2,
    markeredgecolor='gray',
    markeredgewidth=1,
)
In [181]:
# See the documentation for the full list of parameters.
plt.plot?
In [186]:
# ylim / xlim restrict the visible range of each axis.
plt.plot(
    x, y, '-d',
    color='orange',
    markersize=16,
    linewidth=2,
    markeredgecolor='gray',
    markeredgewidth=1,
)
plt.ylim(-0.5, 1.2);
plt.xlim(2, 8)
Out[186]:
In [193]:
# Scatter plot: s sets the marker size and c the marker colour.
plt.scatter(x, y, s=100, c='gray')
Out[193]:
In [206]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets with a
# "seaborn-v0_8-" prefix and 3.8 removed the old names. Prefer the new name
# when this installation has it, otherwise fall back to the original one.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'seaborn-whitegrid',
)
plt.style.use(whitegrid_style)
# 100 random points; colour and marker size are drawn at random as well.
x = np.random.randn(100)
y = np.random.randn(100)
colors = np.random.rand(100)
sizes = 1000 * np.random.rand(100)
# alpha makes the markers translucent so overlapping points stay visible.
plt.scatter(x, y, c=colors, s=sizes, alpha=0.4)
plt.colorbar();
In [211]:
# Four random walks (running sums down the rows), one line per column.
df = pd.DataFrame(
    np.random.rand(100, 4).cumsum(axis=0),
    columns=list('ABCD'),
)
df.plot()
Out[211]:
In [212]:
# Plot a single column.
df['A'].plot()
Out[212]:
In [215]:
# A 3 x 4 frame of random integers in [10, 50), drawn as grouped bars per row.
df = pd.DataFrame(
    np.random.randint(low=10, high=50, size=(3, 4)),
    columns=list('ABCD'),
    index=['one', 'two', 'three'],
)
df.plot.bar()
Out[215]:
In [219]:
# df.B.plot.bar()
In [220]:
# Same chart as df.plot.bar(), requested through the kind argument.
df.plot(kind='bar')
Out[220]:
In [221]:
# stacked=True stacks the column values of each row in a single bar.
df.plot(kind='bar', stacked=True)
Out[221]:
In [8]:
# 100 x 4 standard-normal samples; histogram of column A only.
df = pd.DataFrame(
    np.random.randn(100, 4),
    columns=list('ABCD'),
)
df.hist(column='A', figsize=(5, 4))
Out[8]:
In [223]:
# Kernel density estimate, one curve per column.
df.plot.kde()  # equivalent to df.plot(kind='kde')
Out[223]:
In [9]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np
fig = plt.figure()
# Passing keyword arguments to Figure.gca() was deprecated in Matplotlib 3.4
# and removed in 3.6; add_subplot is the supported way to create 3D axes and
# also works on older releases.
ax = fig.add_subplot(111, projection='3d')

# Make data.
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)  # distance of every grid point from the origin
Z = np.sin(R)

# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
ax.set_zlim(-1.01, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()