Contents:
PEP character rule
unicode: decode ascii to unicode, encode unicode to utf-8
Counter
chardet
cStringIO to load inline text table to dataframe
introspection: dir(), type(), apihelper.info(object, spacing=10, collapse=True), getattr(object, 'append') to validate
os directory and file names
pandas fill in missing rows, eg years
pandas pivot, unpivot to replace missing with zero
oauth2
rgb to hex
Anaconda environments
dict get, sort
collections counter
dict of key-value pairs to dataframe
logging
glob traverse directories recursive
split text files every n lines
accents/diacritics substituter
consonant/vowel tagger
list of pairwise combinations (fewer than permutations, unordered)
string ljust, rjust
pandas merge map apply
json, pickle io
windows to posix
delete every n files
dict to csv
download list of urls
random numbers cookbook
lambda functions
reverse dict
progress bar
traverse file tree
glob recursive
convert colors rgb hex hls
BeautifulSoup demo
Mechanize demo
In [ ]:
# 72, 79 and 80 character rule (#PEP) do not reach this ->|
#########1#########2#########3#########4#########5#########6#########7#2######9X
In [ ]:
#counter
from collections import Counter
# Tally, per search term, how many non-overlapping regex matches occur in `txt`.
# `terms`, `txt` and `re` come from elsewhere in the notebook.
c = Counter()
for term in terms:
    match_count = sum(1 for _ in re.finditer(term, txt))
    if match_count:
        # terms without any match never get a key, as before
        c[term] += match_count
In [ ]:
#chardet #detect #text #encoding
import chardet
# Print chardet's detection result for the first element of `chapters`.
# `chapters` is not defined in this cell; presumably a list of raw byte strings -- TODO confirm.
print chardet.detect(chapters[0])
In [ ]:
#unicode #encode #decode
# Encoding or decoding is always FROM unicode or TOWARDS unicode.
#
#            [decode]            [encode]
#   ASCII      --->    UNICODE     --->    UTF-8
#  1 Glyph                                1 Glyph
#     =                1 Glyph               =
#  1 Byte                                1-4 Bytes
#
# Python 2 semantics: a plain 'xxx' literal is a byte string, so it has .decode().
mystring = 'xxx'
# bytes -> unicode text, interpreting the bytes as ASCII
unicode_str = mystring.decode('ascii')
# unicode text -> bytes, serialised as UTF-8
utf8_str = unicode_str.encode('utf-8')
# Note that this example is redundant: ASCII is a subset of UTF-8, so it always works.
In [ ]:
In [ ]:
#introspection
#ipython (the `?` / `??` suffixes are IPython syntax, not plain Python):
os? #docstring
os?? #source code
# `apihelper` is not part of the standard library; it must be importable separately.
from apihelper import info
info(object, spacing=10, collapse=True) # spacing is num chars in object name, collapse=put all on one line
type(object)
dir(object)
# `li` is not defined in this cell; presumably a list -- TODO confirm.
getattr(li, "append") # returns a reference to the method because the attribute name is valid
getattr(li, "append")("Moe") # calls the method that getattr returned
getattr(li, "apend") # raises AttributeError because the attribute name is misspelled
In [1]:
#os and ntpath methods to deal with #directory and file names
#path #os #ntpath #glob #dir #filename
import ntpath
import os
# Raw string so the backslashes in the Windows path are kept literally.
path = r'C:\Program Filez (Portable)\Console2\console.chm'
# Note: I don't call this directory 'Filez' with a z because it's pirated,
# But so it will sort third after Program Files and Program Files (x86)
print 'os:'
dirname = os.path.dirname(path)
basename = os.path.basename(path)
print dirname, basename, os.path.join(dirname, basename)
# The checks on `basename` alone are resolved relative to the current working directory.
print 'exists:', os.path.exists(dirname), os.path.exists(basename), os.path.exists(path)
print 'isdir:', os.path.isdir(dirname), os.path.isdir(basename), os.path.isdir(path)
print 'isfile:', os.path.isfile(dirname), os.path.isfile(basename), os.path.isfile(path)
print 'split:', os.path.split(path)
print 'splitext:', os.path.splitext(path), os.path.splitext(dirname), os.path.splitext(basename)
# NOTE(review): os.path.splitunc looks Windows/Python 2 specific -- confirm it exists on the target interpreter.
print 'splitunc:', os.path.splitunc(path), os.path.splitunc(dirname), os.path.splitunc(basename)
print '----------'
# The trailing comma suppresses the newline after this print (Python 2 print statement).
print 'ntpath:', ntpath.dirname(path), ntpath.basename(path),
# qv http://pydoc.org/2.5.1/ntpath.html
In [8]:
#rename #random files by adding prefix
path = "H:/Music/"
import os
import glob
from random import shuffle

# Give every .mp3 directly under `path` a distinct random numeric prefix
# ("<n>_<original name>") so that sorting by name yields a shuffled order.
pathlist = glob.glob(path + '*.mp3')
# list(...) keeps this working on Python 3, where range() cannot be shuffled in place
numbers = list(range(len(pathlist)))
shuffle(numbers)
print(numbers)
print(pathlist)
for number, oldname in zip(numbers, pathlist):
    newname = os.path.dirname(oldname) + '/' + str(number) + '_' + os.path.basename(oldname)
    os.rename(oldname, newname)
In [ ]:
#pandas #add #fill in #missing #rows, e.g. #years
import pandas as pd
# Demo frame with a gap: the year 2003 is missing.
df = pd.DataFrame({'year':[2000, 2001, 2002, 2004, 2005, 2006], 'qty':[10, 20, 15, 25, 20, 30]})
tocomplete = set(df.year)
# Collect every absent year first, then add all filler rows (qty 0) with a
# single concat instead of growing the frame one append at a time.
missing_years = [year for year in range(2000, 2007) if year not in tocomplete]
if missing_years:
    filler = pd.DataFrame({'year': missing_years, 'qty': [0] * len(missing_years)})
    df = pd.concat([df, filler], ignore_index=True)
In [ ]:
#oauth2
import urlparse
import oauth2 as oauth
consumer_key = 'azkeVAunE4IK2ChXoGifruSQ4YpxLwafX1dLZrLAkzLJrc2IcE'
consumer_secret = 'DhQLUrawaPosoCaMmrSTMyvBfQjaiSUQGV1mJ9vW9yvmu2GURB'
request_token_url = 'http://www.tumblr.com/oauth/request_token'
access_token_url = 'http://www.tumblr.com/oauth/access_token'
authorize_url = 'http://www.tumblr.com/oauth/authorize'
consumer = oauth.Consumer(consumer_key, consumer_secret)
client = oauth.Client(consumer)
# Step 1: Get a request token. This is a temporary token that is used for
# having the user authorize an access token and to sign the request to obtain
# said access token.
resp, content = client.request(request_token_url, "GET")
if resp['status'] != '200':
raise Exception("Invalid response %s." % resp['status'])
request_token = dict(urlparse.parse_qsl(content))
print "Request Token:"
print " - oauth_token = %s" % request_token['oauth_token']
print " - oauth_token_secret = %s" % request_token['oauth_token_secret']
print
# Step 2: Redirect to the provider. Since this is a CLI script we do not
# redirect. In a web application you would redirect the user to the URL
# below.
print "Go to the following link in your browser:"
print "%s?oauth_token=%s" % (authorize_url, request_token['oauth_token'])
print
# After the user has granted access to you, the consumer, the provider will
# redirect you to whatever URL you have told them to redirect to. You can
# usually define this in the oauth_callback argument as well.
accepted = 'n'
while accepted.lower() == 'n':
accepted = raw_input('Have you authorized me? (y/n) ')
oauth_verifier = raw_input('What is the PIN? ')
# Step 3: Once the consumer has redirected the user back to the oauth_callback
# URL you can request the access token the user has approved. You use the
# request token to sign this request. After this is done you throw away the
# request token and use the access token returned. You should store this
# access token somewhere safe, like a database, for future use.
token = oauth.Token(request_token['oauth_token'],
request_token['oauth_token_secret'])
token.set_verifier(oauth_verifier)
client = oauth.Client(consumer, token)
resp, content = client.request(access_token_url, "POST")
access_token = dict(urlparse.parse_qsl(content))
print "Access Token:"
print " - oauth_token = %s" % access_token['oauth_token']
print " - oauth_token_secret = %s" % access_token['oauth_token_secret']
print
print "You may now access protected resources using the access tokens above."
print
In [ ]:
#pandas #pivot #unpivot to replace missing values with zero for graphing
def pivot_unpivot(df, row_name, column_name, value_name):
    """Pivot and unpivot a dataframe to replace missing values with zeroes.

    The frame is pivoted with `row_name` as index and `column_name` as
    columns (pandas' default aggregation applies to duplicates), gaps are
    filled with 0.0, and the result is flattened back to long form.

    Args:
        df: long-form DataFrame containing the three named columns.
        row_name: column whose values become the pivot index.
        column_name: column whose values become the pivot columns.
        value_name: column holding the values.

    Returns:
        DataFrame with columns `row_name`, `column_name`, `value_name` and
        one row for every (row, column) combination, ordered column by
        column. An empty DataFrame when the pivot has no columns.
    """
    pivoted = pd.DataFrame(
        pd.pivot_table(df, values=value_name, index=row_name, columns=column_name)
    ).fillna(0.0)
    row_labels = list(pivoted.index)
    # One long-form piece per pivot column; concatenated once at the end
    # instead of re-copying the result on every iteration.
    pieces = [
        pd.DataFrame({row_name: row_labels,
                      column_name: [column] * len(pivoted),
                      value_name: list(pivoted[column])})
        for column in pivoted.columns
    ]
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)
In [1]:
#Convert #rgb to #hex
def rgb(r, g, b):
    """Return the '#rrggbb' lowercase hex string for an RGB triplet.

    Args:
        r, g, b: integer channel values in the range 0-255.

    Raises:
        ValueError: if a channel is outside 0-255.
    """
    triplet = (r, g, b)
    for channel in triplet:
        if not 0 <= channel <= 255:
            raise ValueError("channel value %r is outside 0-255" % (channel,))
    # str.format works on Python 2 and 3; the old chr()/.encode('hex') trick is Python 2 only
    return '#{0:02x}{1:02x}{2:02x}'.format(*triplet)
print(rgb(127,255,0))
In [ ]:
#Anaconda environments : in shell
# The commands below are shell commands, not Python; they are parked in a
# throw-away string so that the notebook cell still evaluates.
ignore = """
see all environments:
conda info -e
conda create -n ENVIRONMENT_NAME python
activate ENVIRONMENTNAME
to deactivate:
deactivate
conda remove -n ENVIRONMENTNAME --all
"""
In [3]:
#dict get, sort
# Counting idiom: add `increment` to the value stored under `key`, starting from 0
# when the key is absent. `dict`, `key` and `increment` are placeholders here --
# substitute a real dictionary variable (do not use the builtin name `dict`).
dict[key] = dict.get(key, 0) + increment
###
# sort a dict
def sortdict(dict):
    """Return a copy of the mapping with its items ordered by key.

    Args:
        dict: the mapping to sort (the name shadows the builtin; it is kept
            so that existing keyword callers continue to work).

    Returns:
        collections.OrderedDict (a dict subclass) whose iteration order is
        the ascending key order. An empty mapping gives an empty result.
    """
    from collections import OrderedDict
    # sorted() returns a new list; list.sort() returns None, which is why the
    # previous `dict.keys().sort()` version could never iterate.
    return OrderedDict((key, dict[key]) for key in sorted(dict))
In [ ]:
#collections #counter
import collections
# Start with an empty Counter ...
counter = collections.Counter()
# ... and feed it an iterable: update() counts each element, so the
# one-character string 'a' adds one to the count for 'a'.
counter.update('a')
In [1]:
# turn dict of key-value pairs into dataframe
import pandas as pd
def dict_pairs_to_df(d, sortby='value', indexby='number'):
    """Return a two-column DataFrame ('key', 'value') built from a flat dict.

    Nested dicts are not expanded (no recursion).

    Args:
        d: dict of key-value pairs.
        sortby: 'value' sorts ascending on the value column; anything else
            sorts ascending on the key column.
        indexby: 'number' gives a fresh 0..n-1 index; anything else indexes
            by key (the 'key' column is kept as well).

    Returns:
        The sorted DataFrame; an empty dict yields an empty frame with the
        two columns.
    """
    # Build the frame in one go instead of appending one row at a time.
    rdf = pd.DataFrame({'key': list(d.keys()), 'value': list(d.values())},
                       columns=['key', 'value'])
    sort_column = 'value' if sortby == 'value' else 'key'
    # DataFrame.sort() no longer exists; sort_values() is its replacement
    rdf = rdf.sort_values(sort_column, ascending=True)
    if indexby == 'number':
        rdf = rdf.reset_index(drop=True)
    else:
        rdf = rdf.set_index('key', drop=False)
    return rdf
# Usage example with the default arguments (sort on value, numeric index).
# NOTE(review): the values mix ints with the string 'na'; sorting such a column
# presumably depends on Python 2's cross-type ordering -- confirm before reuse.
df = dict_pairs_to_df({'a':2, 'b':5, 'c':'na'})
print df
### BETTER VERSION
def dict_pairs_to_df(d, sortby='value', indexby='number'):
    """Return a two-column DataFrame ('key', 'value') built from a flat dict.

    The dict may only hold single key-value pairs; nested dicts are not
    expanded (no recursion).

    Args:
        d: dict of key-value pairs.
        sortby: one of 'value', 'value descending', 'key', 'key descending'.
        indexby: 'number' for a fresh 0..n-1 index, or 'key' to index by key
            (the 'key' column is kept as well).

    Returns:
        The sorted DataFrame; an empty dict yields an empty frame with the
        two columns.

    Raises:
        AssertionError: if `sortby` or `indexby` is not a permitted value.
    """
    assert sortby in ['value', 'value descending', 'key', 'key descending']
    # 'key descending' is a sort order, not an index choice, so it is not accepted here
    assert indexby in ['number', 'key']
    # Build the frame in one go instead of appending one row at a time.
    rdf = pd.DataFrame({'key': list(d.keys()), 'value': list(d.values())},
                       columns=['key', 'value'])
    sortasc = 'descending' not in sortby
    sort_column = 'value' if 'value' in sortby else 'key'
    # DataFrame.sort() no longer exists; sort_values() is its replacement
    rdf = rdf.sort_values(sort_column, ascending=sortasc)
    if indexby == 'number':
        rdf = rdf.reset_index(drop=True)
    else:
        rdf = rdf.set_index('key', drop=False)
    return rdf
In [ ]:
# logger, logging
import logging
#-----> USER VARIABLE <-----#
log_name = 'mylog.txt'
# curr_dir = 'C:/mypath/'
# `curr_dir` must be defined before this cell runs (see the commented example above).
log_dir = curr_dir
logger = logging.getLogger(log_name)
# Create and attach the file handler only the first time: re-running the cell
# used to open a fresh FileHandler on every run even when it was never attached.
if not logger.handlers:
    log_handler = logging.FileHandler(log_dir + '/' + log_name + '.log')
    log_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    log_handler.setFormatter(log_formatter)
    logger.addHandler(log_handler)
logger.setLevel(logging.INFO)
# %-style arguments also supply the space that was missing between the log name and the text
logger.info('%s session started in %s', log_name, curr_dir)
#-----> examples of logger use:
#logger.error('Saving of {0} did not work'.format(filename))
#logger.info('{0} saved'.format(filename))
In [ ]:
#glob traverse directories recursive
import os
# Collect every .json file below `path`, then rename each one so that its name
# carries the directory components it was found under (Windows separators assumed).
path = 'C:/Users/David/Sync2014/Py2014/111'
filelist = []
for dirpath, dirnames, files in os.walk(path):
    for f in files:
        if f.endswith('.json'):
            filelist.append(os.path.join(dirpath, f))
print(filelist[0].split('\\'))
# ['C:/Users/David/Sync2014/Py2014/111', 'votes', '2009', 'h1', 'data.json']
for fil in filelist:
    ds = fil.split('\\')
    # last three characters of the root plus the four components below it
    flattened = ds[0][-3:] + ''.join(ds[k] for k in range(1, 5))
    newname = os.path.split(fil)[0] + '\\' + flattened
    os.rename(fil, newname)
In [ ]:
#os #split #text files every n lines
#every file in a directory!
import os
# Split every .csv in `path` into chunks of `turnover` lines each, written as
# <prefix><original name><chunk number>.csv in the same directory.
path = "C:/Users/David/to_split"
os.chdir(path)
import glob
listoffiles = glob.glob('*.csv')
prefix = "all_"
turnover = 1000000
for filename in listoffiles:
    fout = None
    try:
        # open the file named by the loop variable (not the literal "filename")
        with open(filename, "rt") as f:
            # iterate lazily rather than loading the whole file with readlines()
            for i, line in enumerate(f):
                if i % turnover == 0:
                    # chunk boundary: close the previous chunk, start the next one
                    if fout is not None:
                        fout.close()
                    fout = open(prefix + filename + "%d.csv" % (i // turnover), "w")
                fout.write(line)
    finally:
        if fout is not None:
            fout.close()
In [4]:
#consonant or #vowel #tagger
def cv(name):
    """Return the consonant-vowel pattern of `name` as a 2-tuple.

    The first item tags every letter ('c' or 'v'); the second collapses
    runs, so it shows the alternations only.

    'w' and 'y' count as vowels after a/e/o when followed by a non-vowel or
    at the end of the name, and a final 'y' after a consonant is a vowel;
    otherwise they are consonants.
    """
    import re
    pattern = name.lower()
    pattern = re.sub('[bcdfghjklmnpqrstvxz]', 'c', pattern)
    pattern = re.sub('cy$', 'cv', pattern)
    pattern = re.sub('[aeo][wy][^aeiou]', 'vvc', pattern)
    pattern = re.sub('[aeo][wy]$', 'vv', pattern)
    pattern = re.sub('[wy]', 'c', pattern)
    pattern = re.sub('[aeiou]', 'v', pattern)
    collapsed = re.sub('v{2,}', 'v', pattern)
    # collapse consonant runs on the already vowel-collapsed string, so that
    # both kinds of run end up merged
    collapsed = re.sub('c{2,}', 'c', collapsed)
    return pattern, collapsed
In [6]:
# make list of pairwise combinations (not permutations)
def list_pairwise(listforfn):
    """Return every unordered pair of distinct positions in `listforfn`.

    Args:
        listforfn: the sequence to pair up.

    Returns:
        List of 2-tuples in the order of itertools.combinations, e.g.
        [1, 2, 3] -> [(1, 2), (1, 3), (2, 3)]. An empty input gives [];
        a single-element input gives None (kept for existing callers).
    """
    from itertools import combinations
    if len(listforfn) == 1:
        return None
    # combinations() never pairs an element with itself and includes the last
    # element, both of which the hand-written index loops got wrong
    return list(combinations(listforfn, 2))
print(list_pairwise([1,2,3,4]))
In [ ]:
# print ljust or rjust to align columns using whitespace
>>> def printit():
... print 'Location: 10-10-10-10'.ljust(40) + 'Revision: 1'
... print 'District: Tower'.ljust(40) + 'Date: May 16, 2012'
... print 'User: LOD'.ljust(40) + 'Time: 10:15'
...
>>> printit()
Location: 10-10-10-10                   Revision: 1
District: Tower                         Date: May 16, 2012
User: LOD                               Time: 10:15
In [ ]:
# pandas #new#column based on others #merge #map #apply
# import pandas as pd
# def calculate(s):
# a = s['path'] + 2*s['row'] # Simple calc for example
# b = s['path'] * 0.153
# return pd.Series({'col1': a, 'col2': b})
# df = df.merge(df.apply(calculate, axis=1), left_index=True, right_index=True)
# or
# Derive a new column from two existing ones by applying a row function and
# merging the resulting frame back on the index.
df2 = pd.DataFrame({'col1':[1,2,3],'col2':['a', 'b', 'c']})
def fn(one, two):
    """Return the square of `one`, as text, followed by `two`."""
    squared = str(one**2)
    return str(squared + two)
def calculate(df):
    """Build the 'col3' value for one row: text of 'col1' followed by 'col2'."""
    combined = str(df['col1'])+df['col2']
    return pd.Series({'col3':combined})
new_columns = df2.apply(calculate, axis=1)
df2 = df2.merge(new_columns, left_index=True, right_index=True)
print(df2)
# or
def add_pct(group):
    """Add a 'pct' column: each row's births as a percentage of the group total.

    The column is written onto `group` itself, which is also returned.
    """
    counts = group.births.astype(float)
    total = counts.sum()
    group['pct'] = (counts / total * 100)
    return group
# Apply add_pct within every (year, sex) group. `yob` is not defined in this
# cell; it needs at least the columns 'year', 'sex' and 'births'.
yob = yob.groupby(['year', 'sex']).apply(add_pct)
#add #rank of each name each year each sex
# ascending=False: the largest births value in a group gets rank 1
yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)
# or
# Recode the 'Sex' strings as integers via a lookup dict (female -> 0, male -> 1).
# `df_train` is defined elsewhere.
df_train['Gender'] = df_train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
# percent column
#pandas #percent column
#needs to be adjusted for any dataset
# add column 'pct': the number of births of that name and sex in that year
# divided by the total number of births of that sex in that year, multiplied by
# 100 to turn into a percentage and reduce leading zeroes
def add_pct(group):
    """Store each row's share of the group's births, in percent, as 'pct'.

    `group` is modified in place and returned.
    """
    as_float = group.births.astype(float)
    share = as_float / as_float.sum()
    group['pct'] = (share * 100)
    return group
# Apply add_pct within every (year, sex) group; the result is kept separately
# as `yobgroups`. `yob` is defined elsewhere.
yobgroups = yob.groupby(['year', 'sex']).apply(add_pct)
#add rank of each name each year each sex
# ascending=False: the largest births value in a group gets rank 1
yob['ranked'] = yobgroups.groupby(['year', 'sex'])['births'].rank(ascending=False)
### new column from boolean of existing column, leaves old values
# A conditional expression needs an `else` branch to be valid Python.
# NOTE(review): "leaves old values" is read here as "keep the existing value of
# column 'one' when it is not 'x'" -- confirm that this is the intended fallback.
myfunc = lambda s: 'newval' if (s['one'] == 'x') else s['one']
df['newcol'] = df.apply(myfunc, axis=1)
### apply
# an example of apply
def calc_letter(row):
    """Map a row's numeric 'grade' to a letter: A (>= 90), B (> 75), C (> 60), else F."""
    if row.grade >= 90:
        return 'A'
    if row['grade'] > 75:
        return 'B'
    if row['grade'] > 60:
        return 'C'
    return 'F'
# Compute the letter for every row (axis=1 passes rows, not columns).
# `df` is defined elsewhere and needs a 'grade' column.
df['ltr'] = df.apply(calc_letter, axis=1)
print ' '
# Show only the rows that were graded 'C'.
print df[df.ltr == 'C']
In [2]:
# #json and #pickle i/o
import json
d = {'a':1, 'b':2}
# Write the dict as JSON text ...
with open('eraseme.json', 'w+') as f:
    f.write(json.dumps(d))
# ... equivalent: let json.dump() write straight to the file object
with open('eraseme.json', 'w+') as f:
    json.dump(d, f)
# Read it back; `with` closes the handle, which the bare open(...).read() never did
with open('eraseme.json', 'r') as f:
    d = json.loads(f.read())
####
import pickle
# Dump an object (`forest` is defined elsewhere) to a binary file; `with`
# makes sure the handle is flushed and closed.
with open( "forest.pickle", "wb+" ) as pickle_out:
    pickle.dump( forest, pickle_out )
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open( "save.p", "rb" ) as pickle_in:
    favorite_color = pickle.load( pickle_in )
In [2]:
# change windows paths with backslashes to posix paths with forward slashes and allowed characters
# note that this does not validate that the resulting path is posix-compliant; the following function does.
def win2posix(path):
    """Return `path` with every backslash turned into a forward slash."""
    pieces = path.split("\\")
    return "/".join(pieces)
# validate posix path
def isvalidposix(path):
    """Check a path against a simple portable-filename rule set.

    A path is rejected when a component starts with '-', when it contains a
    space, or when it contains any character other than letters, digits,
    '_', '.', '-' and '/'. One message is printed per offending position.

    Args:
        path: the path string to check.

    Returns:
        True when no rule is violated (including for the empty string),
        otherwise False.
    """
    import re
    # space is deliberately in the allowed set: it gets its own message below
    allowed_char = re.compile(r"[A-Za-z0-9_.\-/ ]")
    itisvalid = True
    for i, char in enumerate(path):
        # '-' leading the whole path, or directly after a '/'
        if (i == 0 and char == '-') or path[i:i+2] == '/-':
            itisvalid = False
            print("- is not permitted as the first character of a path or file")
        if not allowed_char.match(char):
            itisvalid = False
            print(char + ' is not permitted in POSIX filenames')
        if char == ' ':
            itisvalid = False
            print("Spaces are not permitted in POSIX filenames")
    return itisvalid
# Demo calls. The Windows path is a raw string so that sequences such as "\U"
# are not treated as escapes (that is a syntax error on Python 3).
print(win2posix(r"C:\Users\Popeye\whatacool.jpg"))
print(isvalidposix("C:/Program Files/-thisfile&thatfile.exe"))
print(isvalidposix("usr/bin/data.py"))
In [3]:
# os delete every n files
import glob
import os
# Thin out a directory: keep every `multiple`-th file with the given extension
# (by sorted name) and delete the others.
multiple = 2
extension = ".jpg"
# raw string: the backslashes are kept literally on Python 2 and 3 alike
origpath = r"C:/Users\David\Downloads\Contact opening 4500 frames (8-16-2014 7-39-08 AM)"
thepath = win2posix(origpath) # see function elsewhere in this notebook
os.chdir(thepath)
if not thepath.endswith("/"):
    thepath += "/"
# glob returns names in arbitrary order; sort so that the same files survive on every run
filelist = sorted(glob.glob(thepath + "*" + extension))
print("Original count: %d files." % (len(filelist)))
for counter, filepath in enumerate(filelist):
    if counter % multiple != 0:
        os.remove(filepath)
newfilelist = glob.glob(thepath + "*" + extension)
print(" New count: %d files." % (len(newfilelist)))
In [ ]:
# change a dict into a csv,
# eg {'one':1, 'two':2} becomes:
# one,1
# two,2
def dict2csv(the_dict, csv_path, separator=',', quote_char = '|'):
    """Write a flat dict to `csv_path`, one "key,value" row per item.

    Args:
        the_dict: mapping to write; rows follow the dict's iteration order.
        csv_path: destination file, overwritten if it exists.
        separator: field delimiter.
        quote_char: character used to quote fields that need it.
    """
    import csv
    import sys
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(csv_path, 'w', newline='')
    else:
        csvfile = open(csv_path, 'wb+')
    with csvfile:
        csvwriter = csv.writer(csvfile, delimiter=separator,
                               quotechar=quote_char, quoting=csv.QUOTE_MINIMAL)
        for key, value in the_dict.items():
            csvwriter.writerow([key, value])
In [1]:
#Download from list of #urls
# import urllib
# import urllib2
# import requests
# url = 'http://www.blog.pythonlibrary.org/wp-content/uploads/2012/06/wxDbViewer.zip'
# print "downloading with urllib"
# urllib.urlretrieve(url, "code.zip")
# print "downloading with urllib2"
# f = urllib2.urlopen(url)
# data = f.read()
# with open("code2.zip", "wb") as code:
# code.write(data)
# print "downloading with requests"
# r = requests.get(url)
# with open("code3.zip", "wb") as code:
# code.write(r.content)
###
import urllib
# Files to fetch; each one is saved under the last path component of its URL.
urllist = ['http://www.oshannonland.com/wp-content/uploads/2012/06/20-Spider-Man-20.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/21-Spider-Man-21.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/22-Spider-Man-22.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/23-Spider-Man-231.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/24-Spider-Man-24.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/25-Spider-Man-25.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/26-Spider-Man-26.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-27.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-28.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-29.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-30.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-31.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-32.mp3',
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-33.mp3']
# Download every URL into the current directory, named after the last path
# component. A failed download is reported and the loop carries on, instead
# of one bad URL aborting all remaining downloads.
for url in urllist:
    filename = url.rsplit('/', 1)[-1]
    try:
        urllib.urlretrieve(url, filename)
    except IOError as err:
        print('%s could not be downloaded: %s' % (filename, err))
    else:
        print(filename + ' downloaded successfully')
In [3]:
# some uses for random numbers
import random
# create a n-digit random number string with leading zeroes
def random_digit_string(length):
    """Return a string of `length` random decimal digits (leading zeroes possible).

    The module-level generator is used as-is and is not reseeded here, so a
    caller who seeds `random` gets reproducible strings.

    Args:
        length: number of digits; 0 gives the empty string.
    """
    import random
    return ''.join(str(random.randint(0, 9)) for _ in range(length))
print "\nFour times ten random digits:"
for i in range(4):
print random_digit_string(10),
print ""
#randrange demo
print "\nrandrange demo:"
print " 10:", random.randrange(10)
print " 50-1000:", random.randrange(50,1000)
print "10-20, step 2:", random.randrange(10,20,2)
# shuffle and choice
print "\nrandom lists demo"
alist = ["one", "two", "three", "four", "five"]
print " list:", alist
print " choice:", random.choice(alist)
random.shuffle(alist)
print "shuffle:", alist
print " sample:", random.sample(alist, 2)
# random floats
print "\nrandom floats:"
print "random.random (0-1):", random.random()
print " uniform(2-7):", random.uniform(2,7)
print "20 random numbers between 10 and 99, with indicated mode, rounded:"
print "mode 20:",
for i in range(12):
print int(round(random.triangular(10,99,20), 0)),
print "\nmode 90:",
for i in range(12):
print int(round(random.triangular(10,99,90), 0)),
print ""
# gaussian distribution
print "\n20 gaussian numbers (rounded) with mean 50, std dev 10:"
for i in range(20):
print int(round(random.gauss(50, 10),0)),
print "\nWith numpy:"
import numpy as np
print np.random.normal(50, 10, 20)
In [1]:
# examples of #functions and #lambda functions
def thefunc(x):
    """Return x squared when 0 <= x <= 1 (both ends included), otherwise None."""
    return x ** 2 if 0 <= x <= 1 else None
def shortfunc(x):
    """Return x squared when 0 <= x < 1 (upper end excluded), otherwise None."""
    if 0 <= x < 1:
        return x ** 2
    return None
# The same function as an anonymous lambda: x squared for 0 <= x < 1, else None.
# NOTE(review): thefunc accepts x == 1 (<=) while shortfunc and this lambda do
# not (<) -- confirm which boundary is intended.
f = lambda x: x** 2 if 0 <= x < 1 else None
# Each line prints the result for an in-range and an out-of-range argument.
print thefunc(0.5), thefunc(2)
print shortfunc(0.5), shortfunc(2)
print f(0.5), f(2)
In [ ]:
#reverse #dict #reverse dict
# Swap keys and values of `d` (defined elsewhere). Values must be hashable;
# when several keys share a value, only the last one iterated survives.
inv_map = {v: k for k, v in d.items()}
In [4]:
# remove common #accents / #diacritics
# list is incomplete; notably missing are Hungarian diacritics
# (accented character, plain replacement) pairs, applied in this order.
# All U variants map to U/u and capital O-slash maps to capital O.
accents = [
    ('Ą', 'A'), ('ą', 'a'), ('Č', 'C'), ('č', 'c'), ('Ď', 'D'), ('ď', 'd'),
    ('Ę', 'E'), ('ę', 'e'), ('Ě', 'E'), ('ě', 'e'), ('Ĺ', 'L'), ('ĺ', 'l'),
    ('Ň', 'N'), ('ň', 'n'), ('Ŕ', 'R'), ('ŕ', 'r'), ('Ř', 'R'), ('ř', 'r'),
    ('Ť', 'T'), ('ť', 't'), ('Ů', 'U'), ('ů', 'u'), ('Ž', 'Z'), ('ž', 'z'),
    ('Á', 'A'), ('á', 'a'), ('Â', 'A'), ('â', 'a'), ('À', 'A'), ('à', 'a'),
    ('Ä', 'A'), ('ä', 'a'), ('Å', 'A'), ('å', 'a'), ('Ã', 'A'), ('ã', 'a'),
    ('Ç', 'C'), ('ç', 'c'),
    ('É', 'E'), ('é', 'e'), ('Ê', 'E'), ('ê', 'e'), ('È', 'E'), ('è', 'e'),
    ('Ë', 'E'), ('ë', 'e'),
    ('Í', 'I'), ('í', 'i'), ('Î', 'I'), ('î', 'i'), ('Ì', 'I'), ('ì', 'i'),
    ('Ï', 'I'), ('ï', 'i'),
    ('Ñ', 'N'), ('ñ', 'n'),
    ('Ó', 'O'), ('ó', 'o'), ('Ô', 'O'), ('ô', 'o'), ('Ò', 'O'), ('ò', 'o'),
    ('Ö', 'O'), ('ö', 'o'), ('Ø', 'O'), ('ø', 'o'), ('Õ', 'O'), ('õ', 'o'),
    ('Ú', 'U'), ('ú', 'u'), ('Û', 'U'), ('û', 'u'), ('Ù', 'U'), ('ù', 'u'),
    ('Ü', 'U'), ('ü', 'u'),
    ('Ý', 'Y'), ('ý', 'y'), ('ÿ', 'y'), ('Ÿ', 'Y'), ('Š', 'S'), ('š', 's'),
    ('Æ', 'Ae'), ('æ', 'ae'), ('Œ', 'Oe'), ('œ', 'oe'), ('ß', 'ss'),
]
def remove_accents(txt):
    """Return `txt` with every character listed in `accents` replaced by its plain form.

    Characters that are not in the list are left untouched.
    """
    # plain substring replacement: the pairs are literal text, no regex needed
    for accented, plain in accents:
        txt = txt.replace(accented, plain)
    return txt
examples = ['Résumé', 'encyclopædia']
for example in examples:
    print(remove_accents(example))
# to change a file
accent_change_path_in = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog.txt"
accent_change_path_out = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog_noacc.txt"
# `with` closes both handles; the bare open(...).read()/.write() calls left them open
with open(accent_change_path_in, 'r') as accented_source:
    in_file = accented_source.read()
out_file = remove_accents(in_file)
with open(accent_change_path_out, 'w+') as plain_target:
    plain_target.write(out_file)
In [16]:
#progress bar-type indicator, counts to 100.
import time
class ProgressBar:
    """Text progress indicator that prints each whole percentage as it is reached.

    Create it with the number of iterations the loop will make, call
    increment() once per iteration and finish() after the loop.

    The percentage is derived from an integer step count, so a loop that
    makes exactly `loop_length` increments always ends on 100.
    """

    def __init__(self, loop_length):
        """Start the timer and print the heading.

        Args:
            loop_length: expected number of increment() calls; 0 raises
                ZeroDivisionError.
        """
        import sys
        import time
        self.start = time.time()
        self.loop_length = loop_length
        self.increment_size = 100.0/loop_length
        self.steps = 0        # number of increment() calls so far
        self.curr_count = 0   # progress as a float percentage
        self.curr_pct = 0     # last whole percentage that was recorded
        self.overflow = False
        sys.stdout.write('% complete: ')

    def increment(self):
        """Record one finished iteration and print the percentage if it rose."""
        import sys
        import time
        self.steps += 1
        self.curr_count = self.steps * self.increment_size
        # integer arithmetic avoids the drift of repeatedly adding a float
        whole_pct = int(self.steps * 100 // self.loop_length)
        if whole_pct > self.curr_pct:
            self.curr_pct = whole_pct
            if self.curr_pct <= 100:
                sys.stdout.write('%d ' % self.curr_pct)
            elif self.overflow == False:
                # warn once only; later increments stay silent
                sys.stdout.write(
                    "\n*!* Count has gone over 100%; likely either due to:\n*!* - an error in the loop_length specified when " +
                    "progress_bar was instantiated\n*!* - an error in the placement of the increment() function" + "\n")
                sys.stdout.write('*!* Elapsed time when progress bar full: %0.1f seconds.' % (time.time() - self.start) + "\n")
                self.overflow = True

    def finish(self):
        """Print the total elapsed time; call once after the loop."""
        import sys
        import time
        if self.overflow == True:
            sys.stdout.write('*!* Elapsed time after end of loop: %0.1f seconds.\n' % (time.time() - self.start) + "\n")
        else:
            sys.stdout.write('\nElapsed time: %0.1f seconds.\n' % (time.time() - self.start) + "\n")
#examples:
# ten steps: every tenth percent is printed
progbar_ex1 = ProgressBar(10)
for step in range(10):
    progbar_ex1.increment()
progbar_ex1.finish()
#example2:
# a loop length that does not divide 100
progbar_ex2 = ProgressBar(251)
for step in range(251):
    progbar_ex2.increment()
progbar_ex2.finish()
#example3:
# deliberately wrong loop length, to show the overflow warning
progbar_ex3 = ProgressBar(10)
for step in range(4962):
    progbar_ex3.increment()
progbar_ex3.finish()
In [ ]:
#traverse #file #tree
def w2p(path):
    """Return `path` with backslashes swapped for forward slashes."""
    return ''.join('/' if char == "\\" else char for char in path)
import os
# Walk the package tree and, for every .py file, prepare its text with a
# utf-8 coding line in front. The write-back stays commented out on purpose.
for root, dirs, files in os.walk(r"C:\Python_packages\lingpy-2"):
    for filename in files:
        if filename.endswith('.py'):
            # `with` closes each file; the bare open(...).read() leaked one handle per file
            with open(w2p(root) + '/' + filename, 'r') as source:
                contents = source.read()
            contents = "# -*- coding: utf-8 -*-\n" + contents
            #open(w2p(root) + '/' + filename, 'w').write(contents) # rewrites file
In [18]:
#convert #color #rgb #hex #hls
# rgb -> hex with string formatting (two lowercase hex digits per channel)
r = 0
g = 127
b = 255
print "#{0:02x}{1:02x}{2:02x}".format(r, g, b)
############################
# hex <-> rgb via struct; the 'hex' codec on str is Python 2 only
import struct
rgbstr='aabbcc'
print struct.unpack('BBB',rgbstr.decode('hex'))
# NOTE: this assignment rebinds the name `rgb` to a tuple
rgb = (50,100,150)
print struct.pack('BBB',*rgb).encode('hex')
######
import colorsys
# note takes values from 0 to 1
r, g, b = (50, 100, 150)
# 255.0 forces float division under Python 2
print colorsys.rgb_to_hls(r/255.0, g/255.0, b/255.0)
print colorsys.hls_to_rgb(.583, .392, .5)
# example: lighten colors
# example: lighten colors
colors = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838',
          '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C',
          '#92289E', '#242D7D']
import binascii
import colorsys
import struct
new = []
for color in colors:
    # unhexlify works on Python 2 and 3; str.decode('hex') is Python 2 only
    r, g, b = struct.unpack('BBB', binascii.unhexlify(color[1:]))
    h, l, s = colorsys.rgb_to_hls(r/255.0, g/255.0, b/255.0)
    # move the lightness one sixth of the way towards white
    rgb2 = colorsys.hls_to_rgb(h, (5*l+1)/6, s)
    # round rather than truncate, so channels are not biased one step darker
    lighter = tuple(int(round(channel * 255)) for channel in rgb2)
    # original values, HLS, float result and final channels on one line
    print(' '.join(str(item) for item in (r, g, b, h, l, s, rgb2) + lighter))
    r, g, b = lighter
    new.append("#{0:02x}{1:02x}{2:02x}".format(r, g, b))
print(new)
In [ ]:
#BeautifulSoup
from bs4 import BeautifulSoup
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed, and the parse tree can differ between machines.
# `html_string` is defined elsewhere.
soup = BeautifulSoup(html_string, 'html.parser') #or soup = BeautifulSoup(open("index.html"), 'html.parser')
#to find first such tag:
tag = soup.find('p')
#attributes:
tag.name
tag['class'] # an attr
tag.contents
tag.attrs # returns a dict
#to find all tags (returns list)
tags = soup.find_all('p')
#to find <select name="HSpeaker" multiple size="4">
hspeaker = soup.find('select',{'name':'HSpeaker'})
#to create list all option tags under the above:
options = hspeaker.find_all('option')
print(soup.find(id="link3"))
#extract all urls in <a href=""> tags
for link in soup.find_all('a'):
    print(link.get('href'))
#extract all text
print(soup.get_text())
# more:
# http://omz-software.com/pythonista/docs/ios/beautifulsoup_guide.html
In [ ]:
# mechanize
import mechanize
# Create a browser object and configure how it handles robots.txt,
# http-equiv headers, redirects and referers.
br = mechanize.Browser()
br.set_handle_robots(False) # ignore robots # no cookies
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
# Follows refresh 0 but not hangs on refresh > 0:
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Candidate User-agent strings; `agent` below is the index of the one that is sent.
user_agents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6',
'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Opera/9.00 (Windows NT 5.1; U; en)']
agent = 2
br.addheaders = [('User-agent', user_agents[agent])]
# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
#alternate to use cookies
# import cookielib
# cj = cookielib.LWPCookieJar()
# br.set_cookiejar(cj)
#########################################
response = br.open('http://www.example.com') # can only be used once
# BeautifulSoup is not imported in this cell; it must already be in scope.
soup = BeautifulSoup(response.read())
br.response().info() # headers
# NOTE(review): this is a list of independent recipes rather than a runnable script.
# Loop and `if` bodies appear without indentation in this export, `msg_link` is
# never defined here, and `br.follow_link(...)` uses `...` as a placeholder.
# Restore indentation and fill in the placeholders before running any of it.
#forms
for form in br.forms():
print form
# Three alternative ways of choosing the form to work with:
br.select_form(nr=0) #first form
br.select_form("form1") # works when form has a name
br.form = list(br.forms())[0] # use when form is unnamed
# submit form
# eg if form control is <CheckboxControl(SenateSection=[*1])>
br.form['SenateSection']=1
br.submit()
print br.response().read()
#list all controls
for control in br.form.controls:
print control
print "type=%s, name=%s value=%s" % (control.type, control.name, br[control.name])
#find by name
control = br.form.find_control("controlname")
#allowed values
if control.type == "select": # means it's class ClientForm.SelectControl
for item in control.items:
print " name=%s values=%s" % (item.name, str([label.text for label in item.get_labels()]))
#select-type controls must be set with a list, even if only one item:
print control.value
print control # selected value is starred
control.value = ["ItemName"]
print control
br[control.name] = ["ItemName"] # equivalent and more normal
#text controls can be set as string
if control.type == "text": # means it's class ClientForm.TextControl
control.value = "stuff here"
br["controlname"] = "stuff here" # equivalent
# Looking at some results in link format
for link in br.links(url_regex='stockrt'):
print link
#or
print link.text, link.url
#links
# Testing presence of link (if the link is not found you would have to
# handle a LinkNotFoundError exception)
br.find_link(text='Weekend codes')
# Actually clicking the link
req = br.click_link(text='Weekend codes')
br.open(req)
print br.response().read()
print br.geturl()
# Back
br.back()
print br.response().read()
print br.geturl()
#
br.follow_link(text='Sign out')
#or
resp = br.follow_link(...)
# follow lots of links
all_links = [l for l in br.links(url_regex='\?v=c&th=')] # or text_regex
# Select the first 3
for link in all_links[0:3]:
print link
# Open each message
br.follow_link(msg_link)
#
request = br.click_link(link)
response = br.follow_link(link)
print response.geturl()
print response.get_data()
#
for link in br.links():
print link.text, link.url
# Download
f = br.retrieve('http://www.google.com.br/intl/pt-BR_br/images/logo.gif')[0]
# Proxy and user/password
br.set_proxies({"http": "joe:password@myproxy.example.com:3128"})
# Proxy
br.set_proxies({"http": "myproxy.example.com:3128"})
# Proxy password
br.add_proxy_password("joe", "password")
# more, incl password protected auth
# http://stockrt.github.io/p/emulating-a-browser-in-python-with-mechanize/
#pydoc
# http://joesourcecode.com/Documentation/mechanize0.2.5/