人生苦短，我用python

python第四课



In [2]:

    
# 新的数据格式，csv

纯文本，使用某个字符集，比如ASCII、Unicode、EBCDIC或GB2312（简体中文环境）等；
由记录组成（典型的是每行一条记录）；
每条记录被分隔符（英语：Delimiter）分隔为字段（英语：Field (computer science)）（典型分隔符有逗号、分号或制表符；有时分隔符可以包括可选的空格）；
每条记录都有同样的字段序列。



In [2]:

    
import pandas as pd
import numpy as np



In [50]:

    
abs_path = '/Users/wangyujie/Desktop/成绩表.csv'
df = pd.read_csv(abs_path)



In [14]:

    
df.head(5)



In [8]:

    
type(df)









    Out[8]:





pandas.core.frame.DataFrame

DataFrame



In [11]:

    
# 列名
print(df.columns)
# 索引
print(df.index)









    



Index(['序号', '姓名', '性别', '语文', '数学', '英语', '物理', '化学', '生物'], dtype='object')
RangeIndex(start=0, stop=37, step=1)



In [13]:

    
df.loc[0]









    Out[13]:





序号      1
姓名    凌榭辉
性别      男
语文     85
数学     60
英语     80
物理     62
化学     73
生物     80
Name: 0, dtype: object



In [20]:

    
a = np.array(range(10))
a > 3









    Out[20]:





array([False, False, False, False,  True,  True,  True,  True,  True,  True], dtype=bool)



In [23]:

    
# 筛选数学成绩大于80
df[df.数学 > 80]



In [25]:

    
df[df.数学<70]



In [27]:

    
# 复杂筛选
df[(df.语文>80) & (df.数学>80) & (df.英语>80)]

排序



In [33]:

    
df.sort_values(['数学', '语文', '英语']).head()



In [51]:

    
df.head()

访问



In [39]:

    
# 按照索引去定位
df.loc[3]









    Out[39]:





序号      4
姓名    邹新宇
性别      男
语文     90
数学     75
英语     64
物理     72
化学     72
生物     80
Name: 3, dtype: object

索引



In [43]:

    
scores = {
    '英语': [90, 78, 89],
    '数学': [64, 78, 45],
    '姓名': ['wong', 'li', 'sun']
}
df = pd.DataFrame(scores, index=['one', 'two', 'three'])
df



In [44]:

    
df.index









    Out[44]:





Index(['one', 'two', 'three'], dtype='object')



In [47]:

    
# 因为此时不存在数字索引，所以不能通过数字索引去访问
# df.loc[1]
df.loc['one']









    Out[47]:





姓名    wong
数学      64
英语      90
Name: one, dtype: object



In [49]:

    
# 实实在在的所谓的第几行
df.iloc[0]









    Out[49]:





姓名    wong
数学      64
英语      90
Name: one, dtype: object



In [53]:

    
# 合并了loc和iloc的功能
df.ix[0]









    Out[53]:





序号      1
姓名    凌榭辉
性别      男
语文     85
数学     60
英语     80
物理     62
化学     73
生物     80
Name: 0, dtype: object



In [56]:

    
df.loc[:2]



In [58]:

    
# 当索引为数字索引的时候，ix和loc是等价的
df.ix[:2]



In [62]:

    
# 访问某一行,是错误的
# df[0]

#访问多行数据是可以使用切片的
df[:2]



In [65]:

    
# dataframe中的数组
df.数学.values









    Out[65]:





array([60, 80, 80, 75, 76, 60, 78, 90, 80, 80, 80, 80, 80, 78, 80, 80, 80,
       80, 80, 80, 90, 80, 80, 80, 80, 78, 80, 78, 80, 80, 90, 80, 80, 80,
       80, 80, 80])



In [68]:

    
# 简单的统计
df.数学.value_counts()









    Out[68]:





80    26
78     4
90     3
60     2
76     1
75     1
Name: 数学, dtype: int64



In [70]:

    
df.head()



In [78]:

    
# 提取多列
new = df[['数学', '语文']].head()
new



In [79]:

    
new * 2

重点



In [83]:

    
def func(score):
    if score>=80:
        return "优秀"
    elif score>=70:
        return "良"
    elif score>=60:
        return "及格"
    else:
        return "不及格"

df['数学分类'] = df.数学.map(func)



In [84]:

    
df.head()



In [89]:

    
# applymap 对dataframe中所有的数据进行操作的一个函数，非常重要
def func(number):
    return number+10
# 等价
func = lambda number: number+10

df.applymap(lambda x: str(x) + ' -').head(2)

匿名函数



In [90]:

    
[i+ 100 for i in range(10)]









    Out[90]:





[100, 101, 102, 103, 104, 105, 106, 107, 108, 109]



In [94]:

    
def func(x):
    return x + 100



In [96]:

    
list(map(lambda x: x+100, range(10)))









    Out[96]:





[100, 101, 102, 103, 104, 105, 106, 107, 108, 109]



In [115]:

    
# 根据多列生成新的一个列的操作，用apply
df['new_score'] = df.apply(lambda x: x.数学 + x.语文, axis=1)



In [116]:

    
# 前几行
df.head(2)
# 最后几行
df.tail(2)

pandas中的dataframe的操作，很大一部分跟 numpy中的二维数组的操作是近似的

matplotlib绘图



In [117]:

    
df = df.drop(['new_score'], axis=1)



In [119]:

    
df.head(2)

绘图



In [4]:

    
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
# 上一行是必不可少的



In [134]:

    
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.plot(x, y)
plt.plot(x, np.cos(x))









    Out[134]:





[<matplotlib.lines.Line2D at 0x11cb12a58>]



In [136]:

    
plt.plot(x, y, '--')









    Out[136]:





[<matplotlib.lines.Line2D at 0x11ce24160>]



In [137]:

    
fig = plt.figure()
plt.plot(x, y, '--')









    Out[137]:





[<matplotlib.lines.Line2D at 0x11cf342b0>]



In [138]:

    
fig.savefig('/Users/wangyujie/Desktop/first_figure.png')



In [147]:

    
# 虚线样式
plt.subplot(2,1,2)
plt.plot(x, np.sin(x), '--')

plt.subplot(2,1,1)
plt.plot(x, np.cos(x))









    Out[147]:





[<matplotlib.lines.Line2D at 0x11d9a3ac8>]



In [146]:

    
# 点状样式
x = np.linspace(0,10,20)
plt.plot(x, np.sin(x), 'o')









    Out[146]:





[<matplotlib.lines.Line2D at 0x11d7d5c50>]



In [150]:

    
# color控制颜色
x = np.linspace(0,10,20)
plt.plot(x, np.sin(x), 'o', color='red')









    Out[150]:





[<matplotlib.lines.Line2D at 0x11dca9080>]



In [162]:

    
# 加label
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.plot(x, y, '--', label='sin(x)')
plt.plot(x, np.cos(x), 'o', label='cos(x)')
# legen控制label的显示效果， loc是控制label的位置的显示
plt.legend(loc='upper right')









    Out[162]:





<matplotlib.legend.Legend at 0x11e0b72b0>



In [163]:

    
plt.legend?
# 当遇到一个不熟悉的函数的时候，多使用？号，查看函数的文档



In [185]:

    
# plot函数，可定制的参数非常多
x = np.linspace(0, 10, 20)
y = np.sin(x)
plt.plot(x, y, '-d', color='orange', markersize=16, linewidth=2, markeredgecolor='gray', markeredgewidth=1)



In [181]:

    
# 具体参数可查看文档
plt.plot?



In [186]:

    
# ylim xlim 限定范围
plt.plot(x, y, '-d', color='orange', markersize=16, linewidth=2, markeredgecolor='gray', markeredgewidth=1)
plt.ylim(-0.5, 1.2);
plt.xlim(2,8)









    Out[186]:





(2, 8)



In [193]:

    
# 散点图
plt.scatter(x, y, s=100, c='gray')









    Out[193]:





<matplotlib.collections.PathCollection at 0x1201ffe10>



In [206]:

    
plt.style.use('seaborn-whitegrid')

x = np.random.randn(100)
y = np.random.randn(100)
colors = np.random.rand(100)
sizes = 1000 * np.random.rand(100)
plt.scatter(x, y, c=colors, s=sizes, alpha=0.4)
plt.colorbar();

pandas本身自带绘图

线性图



In [211]:

    
df = pd.DataFrame(np.random.rand(100,4).cumsum(0), columns=['A', 'B', 'C', 'D'])
df.plot()









    Out[211]:





<matplotlib.axes._subplots.AxesSubplot at 0x120dd4780>



In [212]:

    
df.A.plot()









    Out[212]:





<matplotlib.axes._subplots.AxesSubplot at 0x120d79d68>

柱状图



In [215]:

    
df = pd.DataFrame(np.random.randint(10,50,(3,4)),columns=['A','B','C','D'], index=['one','two','three'])
df.plot.bar()









    Out[215]:





<matplotlib.axes._subplots.AxesSubplot at 0x1205d8978>



In [219]:

    
# df.B.plot.bar()



In [220]:

    
# 等价于上面的绘制
df.plot(kind='bar')









    Out[220]:





<matplotlib.axes._subplots.AxesSubplot at 0x12197a400>



In [221]:

    
df.plot(kind='bar', stacked=True)









    Out[221]:





<matplotlib.axes._subplots.AxesSubplot at 0x121b0df60>

直方图



In [8]:

    
df = pd.DataFrame(np.random.randn(100,4),columns=['A','B','C','D'])
df.hist(column='A',figsize=(5,4))









    Out[8]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x116256eb8>]], dtype=object)

密度图



In [223]:

    
df.plot.kde() # df.plot(kind='kde')









    Out[223]:





<matplotlib.axes._subplots.AxesSubplot at 0x121f3bfd0>



In [9]:

    
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np


fig = plt.figure()
ax = fig.gca(projection='3d')

# Make data.
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)

# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
ax.set_zlim(-1.01, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()

	序号	姓名	性别	语文	数学	英语	物理	化学	生物	数学分类
0	1 -	凌榭辉 -	男 -	85 -	60 -	80 -	62 -	73 -	80 -	及格 -
1	2 -	高海 -	男 -	85 -	80 -	67 -	74 -	86 -	80 -	优秀 -

	序号	姓名	性别	语文	数学	英语	物理	化学	生物
0	1	凌榭辉	男	85	60	80	62	73	80
1	2	高海	男	85	80	67	74	86	80
2	3	潘锦乐	男	85	80	66	87	79	72
3	4	邹新宇	男	90	75	64	72	72	80
4	5	吴一中	男	80	76	69	64	72	80

	序号	姓名	性别	语文	数学	英语	物理	化学	生物
7	8	秦佳艺	女	98	90	96	93	96	80
20	21	黄金虎	男	90	90	84	91	98	80
30	31	李佳	男	90	90	78	93	92	80

	序号	姓名	性别	语文	数学	英语	物理	化学	生物
0	1	凌榭辉	男	85	60	80	62	73	80
5	6	杨烨	男	90	60	85	34	78	80
3	4	邹新宇	男	90	75	64	72	72	80
4	5	吴一中	男	80	76	69	64	72	80
13	14	李杜伟	男	85	78	80	60	89	80

	序号	姓名	性别	语文	数学	英语	物理	化学	生物	数学分类
0	1	凌榭辉	男	85	60	80	62	73	80	及格
1	2	高海	男	85	80	67	74	86	80	优秀
2	3	潘锦乐	男	85	80	66	87	79	72	优秀
3	4	邹新宇	男	90	75	64	72	72	80	良
4	5	吴一中	男	80	76	69	64	72	80	良

	序号	姓名	性别	语文	数学	英语	物理	化学	生物	数学分类	new_score
35	36	顾振楠	男	85	80	93	88	93	78	优秀	165
36	37	沈嘉辉	男	90	80	80	67	94	80	优秀	170