File IO

Let's now look at reading and writing files. This part is simple; the patterns involved are quite fixed.


In [ ]:
# Open a file
fh = open('file.txt','w')

# Here 'w' means write, 'r' means read, and 'a' means append.
# 't' means text mode (the default, as opposed to binary mode 'b'); in text mode Python translates newlines for you.
# Unix-like platforms use \n as the line separator, while Windows uses the two ASCII characters \r\n; internally Python represents newlines as \n.
# In 'rt' mode, Python converts \r\n to \n when reading text.
# In 'wt' mode, Python translates \n to the platform's line separator when writing (\r\n on Windows).
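
To see the newline translation in action, here is a small sketch (it creates a scratch file named newline_demo.txt, a name made up for this example): write Windows-style line endings in binary mode, then read the file back in text mode.

In [ ]:
# Binary mode writes the bytes exactly as given
with open('newline_demo.txt', 'wb') as f:
    f.write(b'line1\r\nline2\r\n')

# Text mode ('rt') translates \r\n back to \n while reading
with open('newline_demo.txt', 'rt') as f:
    print(repr(f.read()))  # 'line1\nline2\n'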

In [24]:
dir(fh)


Out[24]:
['_CHUNK_SIZE',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_checkClosed',
 '_checkReadable',
 '_checkSeekable',
 '_checkWritable',
 '_finalizing',
 'buffer',
 'close',
 'closed',
 'detach',
 'encoding',
 'errors',
 'fileno',
 'flush',
 'isatty',
 'line_buffering',
 'mode',
 'name',
 'newlines',
 'read',
 'readable',
 'readline',
 'readlines',
 'seek',
 'seekable',
 'tell',
 'truncate',
 'writable',
 'write',
 'writelines']

In [80]:
fh.name


Out[80]:
'file.txt'

In [81]:
fh.mode


Out[81]:
'w'

In [82]:
fh.encoding


Out[82]:
'UTF-8'

In [83]:
fh.writable()


Out[83]:
True

In [84]:
fh.write("life is so great,yes we can")


Out[84]:
27

In [85]:
fh.flush()

In [86]:
fh.close()

In [88]:
f = open('file.txt','r')

In [35]:
f.readlines()


Out[35]:
['life is so greatlife is so great,yes we can']

In [39]:
f.close()

In [95]:
ff = open('file.txt','a',encoding = 'latin-1')

In [96]:
ff.write('aaaaa')


Out[96]:
5

In [97]:
ff.close()

Note: when writing code that reads or writes files, always open the file with a with statement. That way you don't have to call close() explicitly, and the file is managed within a temporary context.


In [56]:
with open('file.txt','w') as ff:
    ff.write('aaaaa')

x = 'a'

The os module


In [9]:
import os
cur_dir = os.getcwd()

In [10]:
for file in os.listdir():
    print(file)


.DS_Store
.ipynb_checkpoints
add22.py
bench.sh
data
file.txt
podcasts.xml
potholes.xml
somefile
testdata.csv
testout.csv
Untitled.ipynb
week1st.ipynb
week2nd.ipynb
week3rd_ooDesign.ipynb

In [21]:
os.path.isfile('file.txt')


Out[21]:
True

In [22]:
os.path.exists('file.txt')


Out[22]:
True

In [23]:
os.path.isdir('somefile')


Out[23]:
False

The os module has many different functions that are very practical; you can use them to simplify a lot of file-related operations.


In [14]:
for dir_name, sub_dirs, files in os.walk(cur_dir):
    print(dir_name,'\n\t', sub_dirs,'\n\t\t', files)


/Users/hanlei/Documents/python/note/training 
	 ['.ipynb_checkpoints', 'data'] 
		 ['.DS_Store', 'add22.py', 'bench.sh', 'file.txt', 'podcasts.xml', 'potholes.xml', 'somefile', 'testdata.csv', 'testout.csv', 'Untitled.ipynb', 'week1st.ipynb', 'week2nd.ipynb', 'week3rd_ooDesign.ipynb']
/Users/hanlei/Documents/python/note/training/.ipynb_checkpoints 
	 [] 
		 ['Untitled-checkpoint.ipynb', 'week1st-checkpoint.ipynb', 'week2nd-checkpoint.ipynb', 'week3rd-checkpoint.ipynb']
/Users/hanlei/Documents/python/note/training/data 
	 [] 
		 ['person.xml']

Scanning a directory


In [15]:
for entry in os.scandir(cur_dir):
    if entry.is_dir():
        typ = 'dir'
    elif entry.is_file():
        typ = 'file'
    elif entry.is_symlink():
        typ = 'link'
    else:
        typ = 'unknown'
    print('{name} {typ}'.format(
        name=entry.name,
        typ=typ,
    ))


.DS_Store file
.ipynb_checkpoints dir
add22.py file
bench.sh file
data dir
file.txt file
podcasts.xml file
potholes.xml file
somefile file
testdata.csv file
testout.csv file
Untitled.ipynb file
week1st.ipynb file
week2nd.ipynb file
week3rd_ooDesign.ipynb file

Changing the working directory


In [ ]:
import os

print('Starting:', os.getcwd())

print('Moving up one:', os.pardir)
os.chdir(os.pardir)

print('After move:', os.getcwd())

Building paths


In [ ]:
import os.path

PATHS = [
    ('one', 'two', 'three'),
    ('/', 'one', 'two', 'three'),
    ('/one', '/two', '/three'),
]

for parts in PATHS:
    print('{} : {!r}'.format(parts, os.path.join(*parts)))

Splitting paths


In [16]:
os.path.split(os.getcwd())


Out[16]:
('/Users/hanlei/Documents/python/note', 'training')

Creating and removing directories


In [19]:
import os

dir_name = 'os_directories_example'

print('Creating', dir_name)
os.makedirs(dir_name)

file_name = os.path.join(dir_name, 'example.txt')
print('Creating', file_name)
with open(file_name, 'wt') as f:
    f.write('example file')

print('Cleaning up')
os.unlink(file_name)
os.rmdir(dir_name)


Creating os_directories_example
Creating os_directories_example/example.txt
Cleaning up

csv/json/xml


In [44]:
import csv
import json
import lxml

In [50]:
with open('testdata.csv', 'rt') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)


['Title 1', 'Title 2', 'Title 3', 'Title 4']
['1', 'a', '08/18/07', 'Ã¥']
['2', 'b', '08/19/07', 'â«']
['3', 'c', '08/20/07', 'ç']
[]

In [54]:
unicode_chars = 'å∫ç'

with open('testout.csv', 'wt') as f:
    writer = csv.writer(f)
    writer.writerow(('Title 1', 'Title 2', 'Title 3', 'Title 4'))
    for i in range(3):
        row = (
            i + 1,
            chr(ord('a') + i),
            '08/{:02d}/07'.format(i + 1),
            unicode_chars[i],
        )
        writer.writerow(row)

In [98]:
with open('testdata.csv', 'rt') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)


OrderedDict([('Title 1', '1'), ('Title 2', 'a'), ('Title 3', '08/18/07'), ('Title 4', 'Ã¥')])
OrderedDict([('Title 1', '2'), ('Title 2', 'b'), ('Title 3', '08/19/07'), ('Title 4', 'â«')])
OrderedDict([('Title 1', '3'), ('Title 2', 'c'), ('Title 3', '08/20/07'), ('Title 4', 'ç')])

In [47]:
import json

data = [{'a': 'A', 'b': (2, 4), 'c': 3.0}]
print('DATA:', repr(data))

data_string = json.dumps(data)
print('JSON:', data_string)


DATA: [{'a': 'A', 'b': (2, 4), 'c': 3.0}]
JSON: [{"a": "A", "b": [2, 4], "c": 3.0}]

In [100]:
import json

data = [{'a': 'A', 'b': (2, 4), 'c': 3.0}]
print('DATA   :', data)

data_string = json.dumps(data)
print('ENCODED:', data_string)

decoded = json.loads(data_string)
print('DECODED:', decoded)

print('ORIGINAL:', type(data[0]['b']))
print('DECODED :', type(decoded[0]['b']))


DATA   : [{'a': 'A', 'b': (2, 4), 'c': 3.0}]
ENCODED: [{"a": "A", "b": [2, 4], "c": 3.0}]
DECODED: [{'a': 'A', 'b': [2, 4], 'c': 3.0}]
ORIGINAL: <class 'tuple'>
DECODED : <class 'list'>

CSV and JSON handling is straightforward: both csv and json are part of the Python standard library, and you mostly rely on the same few fixed methods.


In [108]:
import json

data = [{'a': 'A', 'b': (2, 4), 'c': 3.0},{'a': 'A', 'b': (2, 4), 'c': 3.0},{'a': 'A', 'b': (2, 4), 'c': 3.0}]
print('DATA:', data)

# print('NORMAL:', json.dumps(data, sort_keys=True))
# print('INDENT:', json.dumps(data, sort_keys=True, indent=2))

import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(data)


DATA: [{'a': 'A', 'b': (2, 4), 'c': 3.0}, {'a': 'A', 'b': (2, 4), 'c': 3.0}, {'a': 'A', 'b': (2, 4), 'c': 3.0}]
[   {'a': 'A', 'b': (2, 4), 'c': 3.0},
    {'a': 'A', 'b': (2, 4), 'c': 3.0},
    {'a': 'A', 'b': (2, 4), 'c': 3.0}]

xml

First, you need to know the general structure of an XML document, which is easy to pick up from the many resources available online. You can then extract data from XML files with the standard-library xml.etree.ElementTree (used in the examples below) or the third-party lxml library. Generally speaking, XML is not that common in Python; for configuration files, JSON/INI/YAML are usually enough.


In [123]:
from urllib.request import urlopen
from xml.etree.ElementTree import parse

# Download the RSS feed and parse it
u = urlopen('http://planet.python.org/rss20.xml')
doc = parse(u)

# Extract and output tags of interest
for item in doc.iterfind('channel/item'):
    title = item.findtext('title')
    date = item.findtext('pubDate')
    link = item.findtext('link')

    print(title)
    print(date)
    print(link)
    print()


Patrick Kennedy: Using Docker for Flask Application Development (not just Production!)
Fri, 23 Jun 2017 15:31:24 +0000
http://www.patricksoftwareblog.com/using-docker-for-flask-application-development-not-just-production/

Django Weekly: Django Weekly Issue 44 - Django vs Flask, Translation, Kubernetes, Google Authentication and more
Fri, 23 Jun 2017 12:50:56 +0000
http://djangoweekly.com/blog/post/django-weekly-issue-44-django-vs-flask-translation-kubernetes-google-authentication-and-more

EuroPython: PyData EuroPython 2017
Fri, 23 Jun 2017 10:29:53 +0000
http://blog.europython.eu/post/162158264347

Fabio Zadrozny: mu-repo: Dealing with multiple git repositories
Fri, 23 Jun 2017 09:07:20 +0000
http://feedproxy.google.com/~r/blogspot/pydev/~3/UeW5aEJ_7VE/mu-repo-dealing-with-multiple-git.html

Brad Lucas: Python Virtualenv
Fri, 23 Jun 2017 04:00:00 +0000
http://blog.bradlucas.com/posts/2017-06-23-python-virtualenv/

Hynek Schlawack: Sharing Your Labor of Love: PyPI Quick and Dirty
Fri, 23 Jun 2017 00:00:00 +0000
https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty/

Tarek Ziade: Advanced Molotov example
Thu, 22 Jun 2017 22:00:00 +0000
https://ziade.org/2017/06/23/advanced-molotov-example/

Stephen Ferg: Unicode for dummies &#8212; Encoding
Thu, 22 Jun 2017 21:27:07 +0000
https://pythonconquerstheuniverse.wordpress.com/2012/02/01/unicode-for-dummies-encoding/

Philip Semanchuk: Analyzing the Anglo-Saxonicity of the Baby BNC
Thu, 22 Jun 2017 18:46:13 +0000
http://blog.pyspoken.com/2017/06/22/analyzing-the-anglo-saxonicity-of-the-baby-bnc/

Django Weblog: DjangoCon US Schedule Is Live
Thu, 22 Jun 2017 17:46:57 +0000
https://www.djangoproject.com/weblog/2017/jun/22/djangocon-us-schedule-live/

Mike Driscoll: Book Review: Software Architecture with Python
Thu, 22 Jun 2017 17:15:51 +0000
http://www.blog.pythonlibrary.org/2017/06/22/book-review-software-architecture-with-python/

EuroPython: EuroPython 2017: Call for on-site volunteers
Thu, 22 Jun 2017 15:29:28 +0000
http://blog.europython.eu/post/162126235297

PyCharm: PyCharm Edu 4 EAP: Integration with Stepik for Educators
Thu, 22 Jun 2017 14:52:36 +0000
http://feedproxy.google.com/~r/Pycharm/~3/-pCSrEA9er0/

Python Anywhere: The PythonAnywhere API:  beta now available for all users
Thu, 22 Jun 2017 12:12:36 +0000
http://blog.pythonanywhere.com/154/

A. Jesse Jiryu Davis: New Driver Features for MongoDB 3.6
Thu, 22 Jun 2017 08:12:59 +0000
https://emptysqua.re/blog/driver-features-for-mongodb-3-6/

eGenix.com: Python Meeting Düsseldorf - 2017-06-28
Thu, 22 Jun 2017 08:00:00 +0000
http://www.egenix.com/company/news/Python-Meeting-Duesseldorf-2017-06-28

Continuum Analytics News: It’s Getting Hot, Hot, Hot: Four Industries Turning Up The Data Science Heat
Wed, 21 Jun 2017 17:56:34 +0000
https://www.continuum.io/blog/company-blog/its-getting-hot-hot-hot-four-industries-turning-data-science-heat

PyCharm: PyCharm 2017.2 EAP 4
Wed, 21 Jun 2017 17:00:20 +0000
http://feedproxy.google.com/~r/Pycharm/~3/yOn5wW6X9z4/

Enthought: Enthought Announces Canopy 2.1: A Major Milestone Release for the Python Analysis Environment and Package Distribution
Wed, 21 Jun 2017 16:30:55 +0000
http://blog.enthought.com/enthought-canopy/enthought-announces-canopy-2-1-a-major-milestone-release-for-the-python-analysis-environment-and-package-distribution/

DataCamp: New Course: Deep Learning in Python (first Keras 2.0 online course!)
Wed, 21 Jun 2017 14:10:33 +0000
http://www.datacamp.com/community/blog/new-course-deep-learning-in-python-first-keras-2-0-online-course

EuroPython: EuroPython 2017: Conference App available
Wed, 21 Jun 2017 11:56:34 +0000
http://blog.europython.eu/post/162082203552

Kushal Das: Updates on my Python community work: 16-17
Wed, 21 Jun 2017 10:56:00 +0000
https://kushaldas.in/posts/updates-on-my-python-community-work-16-17.html

Codementor: Building an Hello World Application with Python/Django
Wed, 21 Jun 2017 10:11:00 +0000
https://www.codementor.io/abiodunhassan/building-an-hello-world-application-with-python-django-95sysyr6v

Python Bytes: #31 You should have a change log
Wed, 21 Jun 2017 08:00:00 +0000
https://pythonbytes.fm/episodes/show/31/you-should-have-a-change-log

Talk Python to Me: #117 Functional Python with Coconut
Wed, 21 Jun 2017 08:00:00 +0000
https://talkpython.fm/episodes/show/117/functional-python-with-coconut

The xml.etree.ElementTree.parse() function parses the whole XML document into a document object. You can then use methods such as find(), iterfind(), and findtext() to search for specific XML elements. The arguments to these functions are tag names, for example channel/item or title.

Every time you specify a tag, you traverse the document structure. Each search operation starts from a starting element, and the tag name you pass is a path relative to that starting element. For example, doc.iterfind('channel/item') searches for all item elements under channel elements, where doc represents the top of the document (the top-level rss element). The subsequent call item.findtext() then searches relative to the item element that was found.

Each element in the ElementTree module has a few important attributes and methods that are very useful when parsing: the tag attribute contains the tag's name, the text attribute contains the inner text, and the get() method retrieves attribute values.
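
As a quick illustration of tag, text, and get(), here is a minimal sketch using an inline XML string (the data is made up for this example):

In [ ]:
from xml.etree import ElementTree

root = ElementTree.fromstring('<channel><item id="1">hello</item></channel>')
item = root.find('item')
print(item.tag)        # 'item'  -- the tag name
print(item.text)       # 'hello' -- the inner text
print(item.get('id'))  # '1'     -- an attribute value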


In [116]:
from xml.etree import ElementTree

with open('podcasts.xml', 'rt') as f:
    tree = ElementTree.parse(f)

print(type(tree))


<class 'xml.etree.ElementTree.ElementTree'>

In [118]:
from xml.etree import ElementTree
import pprint

with open('podcasts.xml', 'rt') as f:
    tree = ElementTree.parse(f)

for node in tree.iter():
    print(node.tag)


opml
head
title
dateCreated
dateModified
body
outline
outline
outline
outline
outline

In [55]:
from xml.etree import ElementTree

with open('podcasts.xml', 'rt') as f:
    tree = ElementTree.parse(f)

for node in tree.iter('outline'):
    name = node.attrib.get('text')
    url = node.attrib.get('xmlUrl')
    if name and url:
        print('  %s' % name)
        print('    %s' % url)
    else:
        print(name)


Non-tech
  99% Invisible
    http://feeds.99percentinvisible.org/99percentinvisible
Python
  Talk Python to Me
    https://talkpython.fm/episodes/rss
  Podcast.__init__
    http://podcastinit.podbean.com/feed/

In [121]:
from xml.etree import ElementTree

with open('podcasts.xml', 'rt') as f:
    tree = ElementTree.parse(f)

for node in tree.findall('.//outline'):
    url = node.attrib.get('xmlUrl')
    if url:
        print(url)


http://feeds.99percentinvisible.org/99percentinvisible
https://talkpython.fm/episodes/rss
http://podcastinit.podbean.com/feed/

In [122]:
from xml.etree import ElementTree

with open('podcasts.xml', 'rt') as f:
    tree = ElementTree.parse(f)

for node in tree.findall('.//outline/outline'):
    url = node.attrib.get('xmlUrl')
    print(url)


http://feeds.99percentinvisible.org/99percentinvisible
https://talkpython.fm/episodes/rss
http://podcastinit.podbean.com/feed/

There are a few configuration formats we haven't covered, such as YAML and INI. YAML can be thought of as a more human-readable alternative to JSON; just pay attention to its formatting rules, especially after editing a file by hand (keep the indentation aligned, and conventionally end the file with a newline). The PyYAML library handles it directly.
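
A minimal sketch of the PyYAML workflow (it assumes a hypothetical config.yaml file and requires the third-party package: pip install pyyaml):

In [ ]:
import yaml  # third-party: PyYAML

# Reading: safe_load returns plain dicts/lists/strings, much like json.load
with open('config.yaml', 'rt') as f:
    cfg = yaml.safe_load(f)
print(cfg)

# Writing is symmetric
with open('config_out.yaml', 'wt') as f:
    yaml.safe_dump(cfg, f, default_flow_style=False)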

For INI files, the standard-library configparser is all you need, but be careful to follow the file format strictly; some of its quirks can seem baffling. For example, if you embed a dictionary literal as a value, the first element must come right after the opening brace, and there must be a space before the matching closing brace.
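
A minimal configparser sketch (assuming a hypothetical settings.ini with a [server] section containing host and port keys):

In [ ]:
import configparser

config = configparser.ConfigParser()
config.read('settings.ini')

host = config['server']['host']          # values always come back as strings
port = config.getint('server', 'port')   # typed accessors: getint, getfloat, getboolean
print(host, port)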

pickle

For data persistence, pickle is one of the available solutions. It is a standard-library module and its usage is simple and very similar to json: loads and dumps (plus load and dump for file objects).


In [2]:
# Encoding and decoding data to and from a byte string

import pickle
import pprint  # also a standard-library module, used to display data structures more readably

data = [{'a': 'A', 'b': 2, 'c': 3.0}]
print('DATA:', end=' ')
pprint.pprint(data)

data_string = pickle.dumps(data)
print('PICKLE: {!r}'.format(data_string))


DATA: [{'a': 'A', 'b': 2, 'c': 3.0}]
PICKLE: b'\x80\x03]q\x00}q\x01(X\x01\x00\x00\x00aq\x02X\x01\x00\x00\x00Aq\x03X\x01\x00\x00\x00bq\x04K\x02X\x01\x00\x00\x00cq\x05G@\x08\x00\x00\x00\x00\x00\x00ua.'

In [3]:
import pickle
import pprint

data1 = [{'a': 'A', 'b': 2, 'c': 3.0}]
print('BEFORE: ', end=' ')
pprint.pprint(data1)

data1_string = pickle.dumps(data1)

data2 = pickle.loads(data1_string)
print('AFTER : ', end=' ')
pprint.pprint(data2)

print('SAME? :', (data1 is data2))
print('EQUAL?:', (data1 == data2))


BEFORE:  [{'a': 'A', 'b': 2, 'c': 3.0}]
AFTER :  [{'a': 'A', 'b': 2, 'c': 3.0}]
SAME? : False
EQUAL?: True
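
dump and load work on file objects, just like json.dump/json.load. Here is a minimal sketch (the file name data.pickle is made up for this example; note the binary modes 'wb'/'rb'):

In [ ]:
import pickle

data = [{'a': 'A', 'b': 2, 'c': 3.0}]

with open('data.pickle', 'wb') as f:
    pickle.dump(data, f)          # serialize straight into the file

with open('data.pickle', 'rb') as f:
    restored = pickle.load(f)     # read it back

print(restored == data)           # True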


Functions

We have actually already seen many examples involving functions, but there are still a few aspects of argument passing in Python that we haven't covered:


In [6]:
def avg(first, *rest):
    print("first is {}".format(first))
    print("rest are {}".format(rest))
    return (first + sum(rest)) / (1 + len(rest))

# Sample use
print(avg(2, 2)) # 2.0
print(avg(4))    # 4.0
avg(5, 2, 3, 4)  # 3.5


first is 2
rest are (2,)
2.0
first is 4
rest are ()
4.0
first is 5
rest are (2, 3, 4)
Out[6]:
3.5

In [14]:
import html

def make_element(name, value, **attrs):
    keyvals = [' %s="%s"' % item for item in attrs.items()]
    print(keyvals)
    attr_str = ''.join(keyvals)
    print(attr_str)
    element = '<{name}{attrs}>{value}</{name}>'.format(
                name=name,
                attrs=attr_str,
                value=html.escape(value))
    return element

# Example
# Creates '<item size="large" quantity="6">Albatross</item>'
me = make_element('item', 'Albatross>>&<', size='large', quantity=6)
print(me)
# Creates '<p>&lt;spam&gt;</p>'
make_element('p', '<spam>')


[' size="large"', ' quantity="6"']
 size="large" quantity="6"
<item size="large" quantity="6">Albatross&gt;&gt;&amp;&lt;</item>
[]

Out[14]:
'<p>&lt;spam&gt;</p>'

In [22]:
def mininum(*values, clip=None):
    m = min(values)
    if clip is not None:
        m = clip if clip > m else m
    return m

print(mininum(1, 5, 2, -5, 10)) # Returns -5
print(mininum(1,5, 2,-5,10, clip=20))
print(mininum(1, 5, 2, -5, 10, clip=0)) # Returns 0


-5
20
0

In [ ]:
def mininum(clip, num, value=None, *args, **kwargs):
    pass

In [30]:
def mininum(*values, clip=None, **num):
    m = min(values)
    if clip is not None:
        m = clip if clip > m else m
    return m

print(mininum(1, 5, 2, -5, 10))


-5

In [28]:
def mininum(*values, clip=None, **num):
    m = min(values)
    if clip is not None:
        m = clip if clip > m else m
    m = num
    return m

print(mininum(1, 5, 2, -5, 10, clip = 20, res = 7))


{'res': 7}

In [37]:
def my_func():
    return 1,2,3

a,b,c = my_func()
m = my_func()
m, _, n  = my_func()
print(a,b,c)
print(m)
print(m,n)


1 2 3
1
1 3

In [38]:
def spam(a, b=51):
    print(a, b)

spam(1) # Ok. a=1, b=51
spam(1, 2) # Ok. a=1, b=2


1 51
1 2

In [41]:
add = lambda x, y: x + y
add(3,4)


Out[41]:
7

In [42]:
add('a','b')


Out[42]:
'ab'

In [43]:
def spam(a, b, c, d):
    print(a, b, c, d)

In [69]:
from functools import partial
s1 = partial(spam, 1) # a = 1
s1(2, 3, 4)


1 2 3 4

In programming-language terms, partial is related to currying (partial application) from functional programming. The idea is simple: you fix some of a function's arguments to values that won't change, and derive a new function from the original one that only takes the remaining arguments.


In [ ]:
# Roughly what partial(spam, 1) gives you: 'a' is already bound to 1
def s1(b, c, d):
    return spam(1, b, c, d)

In [71]:
s2 = partial(spam, 2,3)
s2(4,5)


2 3 4 5
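
partial can pre-bind keyword arguments as well as positional ones; here is a small illustrative sketch (parse_binary is just a name made up for this example):

In [ ]:
from functools import partial

parse_binary = partial(int, base=2)   # int(x, base=2) parses binary strings
print(parse_binary('1011'))           # 11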

In [46]:
arr = [1,2,3,4,5]
m = map((lambda x: x + 3), arr)

map can be thought of as a transform: it walks through every element of the iterable, applies the given function to each one, and yields the converted values one by one. In Python 3 it returns a lazy iterator rather than a list.
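
A quick check of this laziness (arr is redefined here so the sketch is self-contained): map() gives back a map object, and wrapping it in list() materializes the results.

In [ ]:
arr = [1, 2, 3, 4, 5]
m = map(lambda x: x + 3, arr)
print(m)          # <map object at 0x...>, nothing computed yet
print(list(m))    # [4, 5, 6, 7, 8]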


In [48]:
list(map(pow,[1,2,3],[2,3,4]))


Out[48]:
[1, 8, 81]

In [51]:
lt = [i for i in range(-5, 10, 2)]

In [52]:
list(filter((lambda x: x > 0), lt))


Out[52]:
[1, 3, 5, 7, 9]

filter can be thought of as filtering, a bit like grep in the shell: it walks through each element, keeps the ones for which the function returns a truthy value, and yields them one by one.


In [53]:
from functools import reduce
reduce((lambda x, y: x+y), [1,2,3,4])


Out[53]:
10

reduce can be thought of as folding: it collapses a sequence into a single value by repeatedly applying a two-argument function. Addition and multiplication are just the most common choices; any two-argument function works.
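
To show that reduce is not limited to + and *, here is a small sketch folding with other two-argument functions:

In [ ]:
from functools import reduce

# Folding with a "keep the larger" function gives the maximum
print(reduce(lambda x, y: x if x > y else y, [3, 8, 2, 5]))        # 8

# A third argument supplies the initial accumulator value
print(reduce(lambda acc, s: acc + len(s), ['ab', 'cde', 'f'], 0))  # 6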


In [54]:
reduce((lambda x, y: x * y), [2,3,4])


Out[54]:
24

In [124]:
[i for i in range(10) if i % 2]


Out[124]:
[1, 3, 5, 7, 9]

In [125]:
for i in range(10):
    if i % 2:
        print(i)


1
3
5
7
9

Iterators and generators

Iteration is fundamental to data processing. When scanning datasets too large to fit in memory, we need a way to fetch items lazily, that is, one at a time, on demand. This is the Iterator pattern.

In Python, every collection is iterable. Internally, iterators are used to support:

  • for loops
  • building and extending collection types
  • looping over text files line by line
  • list, dict, and set comprehensions
  • tuple unpacking
  • unpacking actual parameters with * in function calls

If you want to walk through all the elements of an iterable, you don't have to use a for/while loop:


In [ ]:
arr = [1,2,3,4]
a = iter(arr)
next(a)

In [ ]:
next(a)

In [ ]:
a

In [ ]:
next(a)

In [ ]:
next(a)

In [ ]:
next(a)

In [43]:
def manual_iter():
    with open('testdata.csv') as f:
        try:
            while True:
                line = next(f)
                print(line, end='')
        except StopIteration:
            pass
manual_iter()


"Title 1","Title 2","Title 3","Title 4"
1,"a",08/18/07,"Ã¥"
2,"b",08/19/07,"â«"
3,"c",08/20/07,"ç"


In [48]:
with open('testdata.csv') as f:
    while True:
        line = next(f, None)
        if line is None:
            break
        print(line, end='')


"Title 1","Title 2","Title 3","Title 4"
1,"a",08/18/07,"Ã¥"
2,"b",08/19/07,"â«"
3,"c",08/20/07,"ç"


In [46]:
with open('testdata.csv') as f:
    for line in iter(f):        
        print(line, end = '')


"Title 1","Title 2","Title 3","Title 4"
1,"a",08/18/07,"Ã¥"
2,"b",08/19/07,"â«"
3,"c",08/20/07,"ç"

A generator function creates a generator object that wraps the function's body. When the generator is passed to next(...), the generator function advances to the next yield statement in the body, returns the value produced, and pauses at that point in the body.

Eventually, when the function body returns, the enclosing generator object raises StopIteration, consistent with the iterator protocol.
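
A minimal sketch of this behaviour, stepping through a small generator by hand with next():

In [ ]:
def gen_123():
    yield 1
    yield 2
    yield 3

g = gen_123()
print(next(g))   # 1
print(next(g))   # 2
print(next(g))   # 3
# next(g)        # one more call would raise StopIteration, which a for loop handles for us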



How generator functions work

As long as the body of a Python function contains the yield keyword, that function is a generator function. Calling a generator function returns a generator object; in other words, a generator function is a generator factory. The only syntactic difference between an ordinary function and a generator function is that the latter has yield in its body.

Generators are built with yield, and a generator fully implements the iterator interface:


In [ ]:
def frange(start, stop, increment):
    x = start
    while x < stop:
        yield x
        x += increment

The yield here works like this: execution pauses at that point, the current state is saved, and the value to the right of yield is returned; the statements after it only run when the next iteration is triggered.

In [ ]:
list(frange(0, 1, 0.125))

In [ ]:
def gen_ab():
    print('start')
    yield 'a'
    print('continue')
    yield 'b'
    print('end')

for i in gen_ab():
    print('--->'+i)

A key characteristic of a generator function is that it only responds to the next operations used during iteration. Once the generator function returns and exits, the iteration ends. The for statement we normally use for iteration handles these details automatically.

Flattening a multiply nested sequence into a single flat list:


In [ ]:
from collections.abc import Iterable  # import from collections.abc; the plain collections import was removed in Python 3.10

def flatten(items, ignore_types=(str, bytes)):
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, ignore_types):
            yield from flatten(x)
        else:
            yield x

items = [1, 2, [3, 4, [5, 6], 7], 8]
list(flatten(items))

The yield from statement is very useful when you want to call other generators as subroutines inside a generator. Without it, you would have to write an extra for loop, for example:


In [ ]:
def flatten(items, ignore_types=(str, bytes)):
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, ignore_types):
            for i in flatten(x):
                yield i
        else:
            yield x

items = [1, 2, [3, 4, [5, 6], 7], 8]
list(flatten(items))

The itertools module

The itertools module contains many ready-made iterator functions that save you a lot of thinking; just grab them and use them directly.
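
Besides the combinatoric functions shown below, a couple of other commonly used helpers (a small illustrative sketch): chain() runs through several iterables as if they were one, count() is an infinite counter, and islice() slices any iterator lazily.

In [ ]:
import itertools

print(list(itertools.chain('ab', [1, 2, 3])))          # ['a', 'b', 1, 2, 3]
print(list(itertools.islice(itertools.count(10), 5)))  # [10, 11, 12, 13, 14]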

In [35]:
s = 'abc'

In [36]:
import itertools
for p in itertools.permutations(s):  # all possible orderings
    print(p)


('a', 'b', 'c')
('a', 'c', 'b')
('b', 'a', 'c')
('b', 'c', 'a')
('c', 'a', 'b')
('c', 'b', 'a')

In [39]:
items = ['a','b','c','d','e']
for q in itertools.combinations(items,2):
    print(q)


('a', 'b')
('a', 'c')
('a', 'd')
('a', 'e')
('b', 'c')
('b', 'd')
('b', 'e')
('c', 'd')
('c', 'e')
('d', 'e')

In [40]:
for m in itertools.combinations_with_replacement(items, 3):
    print(m)


('a', 'a', 'a')
('a', 'a', 'b')
('a', 'a', 'c')
('a', 'a', 'd')
('a', 'a', 'e')
('a', 'b', 'b')
('a', 'b', 'c')
('a', 'b', 'd')
('a', 'b', 'e')
('a', 'c', 'c')
('a', 'c', 'd')
('a', 'c', 'e')
('a', 'd', 'd')
('a', 'd', 'e')
('a', 'e', 'e')
('b', 'b', 'b')
('b', 'b', 'c')
('b', 'b', 'd')
('b', 'b', 'e')
('b', 'c', 'c')
('b', 'c', 'd')
('b', 'c', 'e')
('b', 'd', 'd')
('b', 'd', 'e')
('b', 'e', 'e')
('c', 'c', 'c')
('c', 'c', 'd')
('c', 'c', 'e')
('c', 'd', 'd')
('c', 'd', 'e')
('c', 'e', 'e')
('d', 'd', 'd')
('d', 'd', 'e')
('d', 'e', 'e')
('e', 'e', 'e')

Iterating over multiple sequences at the same time


In [41]:
a = ['a','b','c','d','e']
b = [1,2,3,4,5]

In [42]:
for k, v in zip(a, b):
    print(k, v)


a 1
b 2
c 3
d 4
e 5
