notebook.community

Edit and run

Contents:

PEP character rule
unicode: decode ascii to unicode, encode unicode to utf-8
Counter
chardet
cStringIO to load inline text table to dataframe
introspection: dir(), type(), apihelper.info(object, spacing=10, collapse=True), getattr(object, 'append') to validate
os directory and file names
    pandas fill in missing rows, eg years
pandas pivot, unpivot to replace missing with zero
oauth2
rgb to hex
Anaconda environments
dict get, sort
collections counter
dict of key-value pairs to dataframe
logging
glob traverse directories recursive
split text files every n lines
accents/diacritics substituter
consonant/vowel tagger
list of pairwise combinations (fewer than permutations, unordered)
string ljust, rjust
pandas merge map apply
json, pickle io
windows to posix
delete every n files
dict to csv
download list of urls
random numbers cookbook
lambda functions
reverse dict
progress bar
traverse file tree
glob recursive
convert colors rgb hex hls
BeautifulSoup demo
Mechanize demo



In [ ]:

    
# 72, 79 and 80 character rule (#PEP)                       do not reach this ->|
#########1#########2#########3#########4#########5#########6#########7#2######9X



In [ ]:

    
#counter

from collections import Counter

c = Counter()
for term in terms:
    for found in re.finditer(term, txt):
        c.update([term])



In [ ]:

    
#chardet #detect #text #encoding
import chardet
print chardet.detect(chapters[0])



In [ ]:

    
#unicode #encode #decode

# encoding or decoding is always FROM unicode or TOWARDS unicode.

#      [decode]     [encode]
# ASCII ---> UNICODE ---> UTF-8
# 1 Glyph                 1 Glyph 
#   =        1 Glyph        =
# 1 Byte                  1-4 Bytes

mystring = 'xxx'
unicode_str = mystring.decode('ascii')
utf8_str = unicode_str.encode('utf-8')

# note that this example is redundant, because ascii is a subset of UTF-8 so it always works



In [ ]:



In [ ]:

    
#introspection

#ipython:
os?   #docstring
os??  #source code

from apihelper import info
info(object, spacing=10, collapse=True) # spacing is num chars in object name, collapse=put all on one line

type(object)

dir(object)

getattr(li, "append") # returns reference to method because it's valud
getattr(li, "append")("Moe") # returns nothing because it's a valid call
getattr(li, "apend") # returns exception



In [1]:

    
#os and ntpath methods to deal with #directory and file names
#path #os #ntpath #glob #dir #filename

import ntpath
import os

path = r'C:\Program Filez (Portable)\Console2\console.chm'
# Note: I don't call this directory 'Filez' with a z because it's pirated,
# But so it will sort third after Program Files and Program Files (x86)

print 'os:'
dirname = os.path.dirname(path)
basename = os.path.basename(path)
print dirname, basename, os.path.join(dirname, basename)
print 'exists:', os.path.exists(dirname), os.path.exists(basename), os.path.exists(path)
print 'isdir:', os.path.isdir(dirname), os.path.isdir(basename), os.path.isdir(path)
print 'isfile:', os.path.isfile(dirname), os.path.isfile(basename), os.path.isfile(path)
print 'split:', os.path.split(path)
print 'splitext:', os.path.splitext(path), os.path.splitext(dirname), os.path.splitext(basename)
print 'splitunc:', os.path.splitunc(path), os.path.splitunc(dirname), os.path.splitunc(basename)

print '----------'
print 'ntpath:', ntpath.dirname(path), ntpath.basename(path),
# qv http://pydoc.org/2.5.1/ntpath.html









    



os:
C:\Program Filez (Portable)\Console2 console.chm C:\Program Filez (Portable)\Console2\console.chm
exists: True False True
isdir: True False False
isfile: False False True
split: ('C:\\Program Filez (Portable)\\Console2', 'console.chm')
splitext: ('C:\\Program Filez (Portable)\\Console2\\console', '.chm') ('C:\\Program Filez (Portable)\\Console2', '') ('console', '.chm')
splitunc: ('', 'C:\\Program Filez (Portable)\\Console2\\console.chm') ('', 'C:\\Program Filez (Portable)\\Console2') ('', 'console.chm')
----------
ntpath: C:\Program Filez (Portable)\Console2 console.chm



In [8]:

    
#rename #random files by adding prefix

path = "H:/Music/"

import os
import glob
from random import shuffle

pathlist = glob.glob(path + '*.mp3')

numbers = range(len(pathlist))
shuffle(numbers)

print numbers
print pathlist

for i in range(len(numbers)):
    newname = os.path.dirname(pathlist[i]) + '/' + str(numbers[i]) + '_' + os.path.basename(pathlist[i])
    os.rename(pathlist[i], newname)









    



[22, 18, 9, 25, 8, 7, 15, 6, 0, 14, 1, 11, 16, 17, 2, 4, 13, 5, 21, 12, 3, 24, 23, 20, 19, 10]
['H:/Music\\wits_20130719_128.mp3', 'H:/Music\\geekf-seinfeld.mp3', 'H:/Music\\crossharmar_20131220_128.mp3', 'H:/Music\\GF-Marvel_vs_DC.mp3', 'H:/Music\\cr memory.mp3', 'H:/Music\\cr squander.mp3', 'H:/Music\\crack hivemind.mp3', 'H:/Music\\cracked gen gap.mp3', 'H:/Music\\crk film change life.mp3', 'H:/Music\\geekf-superman.mp3', 'H:/Music\\geekf-trekii.mp3', 'H:/Music\\icbonusginger.mp3', 'H:/Music\\incbonusmilk.mp3', 'H:/Music\\incbonustv.mp3', 'H:/Music\\rl choice.mp3', 'H:/Music\\rl lies.mp3', 'H:/Music\\rl stochastic.mp3', 'H:/Music\\rl words.mp3', 'H:/Music\\rlglomar.mp3', 'H:/Music\\rltimeschange.mp3', 'H:/Music\\soulless-xmasstory.mp3', 'H:/Music\\Star-Trek-4--Speakeasy-Tysto.mp3', 'H:/Music\\soulless-sttmp.mp3', 'H:/Music\\cpg-helloint-14.mp3', 'H:/Music\\cr sexist.mp3', 'H:/Music\\bugle270.mp3']



In [ ]:

    
#pandas #add #fill in #missing #rows, e.g. #years

import pandas as pd
df = pd.DataFrame({'year':[2000, 2001, 2002, 2004, 2005, 2006], 'qty':[10, 20, 15, 25, 20, 30]})

tocomplete = set(df.year)

for i in range(2000, 2007):
    if i not in tocomplete:
        df = df.append(pd.DataFrame({'year':[i], 'qty':[0]}), ignore_index=True)



In [ ]:

    
#oauth2

import urlparse
import oauth2 as oauth

consumer_key = 'azkeVAunE4IK2ChXoGifruSQ4YpxLwafX1dLZrLAkzLJrc2IcE'
consumer_secret = 'DhQLUrawaPosoCaMmrSTMyvBfQjaiSUQGV1mJ9vW9yvmu2GURB'

request_token_url = 'http://www.tumblr.com/oauth/request_token'
access_token_url = 'http://www.tumblr.com/oauth/access_token'
authorize_url = 'http://www.tumblr.com/oauth/authorize'

consumer = oauth.Consumer(consumer_key, consumer_secret)
client = oauth.Client(consumer)

# Step 1: Get a request token. This is a temporary token that is used for 
# having the user authorize an access token and to sign the request to obtain 
# said access token.

resp, content = client.request(request_token_url, "GET")
if resp['status'] != '200':
    raise Exception("Invalid response %s." % resp['status'])

request_token = dict(urlparse.parse_qsl(content))

print "Request Token:"
print "    - oauth_token        = %s" % request_token['oauth_token']
print "    - oauth_token_secret = %s" % request_token['oauth_token_secret']
print 

# Step 2: Redirect to the provider. Since this is a CLI script we do not 
# redirect. In a web application you would redirect the user to the URL
# below.

print "Go to the following link in your browser:"
print "%s?oauth_token=%s" % (authorize_url, request_token['oauth_token'])
print 

# After the user has granted access to you, the consumer, the provider will
# redirect you to whatever URL you have told them to redirect to. You can 
# usually define this in the oauth_callback argument as well.
accepted = 'n'
while accepted.lower() == 'n':
    accepted = raw_input('Have you authorized me? (y/n) ')
oauth_verifier = raw_input('What is the PIN? ')

# Step 3: Once the consumer has redirected the user back to the oauth_callback
# URL you can request the access token the user has approved. You use the 
# request token to sign this request. After this is done you throw away the
# request token and use the access token returned. You should store this 
# access token somewhere safe, like a database, for future use.
token = oauth.Token(request_token['oauth_token'],
    request_token['oauth_token_secret'])
token.set_verifier(oauth_verifier)
client = oauth.Client(consumer, token)

resp, content = client.request(access_token_url, "POST")
access_token = dict(urlparse.parse_qsl(content))

print "Access Token:"
print "    - oauth_token        = %s" % access_token['oauth_token']
print "    - oauth_token_secret = %s" % access_token['oauth_token_secret']
print
print "You may now access protected resources using the access tokens above." 
print



In [ ]:

    
#pandas #pivot #unpivot to replace missing values with zero for graphing
def pivot_unpivot(df, row_name, column_name, value_name):
    """ Pivot and unpivot a dataframe to replace missing values with zeroes """
    pivoted = pd.DataFrame(pd.pivot_table(df, values=value_name, index = row_name, columns=column_name)).fillna(0.0)
    unpivoted = pd.DataFrame()
    for column in pivoted.columns:
        unpivoted = unpivoted.append(pd.DataFrame({row_name: list(pivoted.index), column_name: [column]*len(pivoted),
                                       value_name: list(pivoted[column])}), ignore_index=True)
    return unpivoted



In [1]:

    
#Convert #rgb to #hex

def rgb(r, g, b):
    triplet = (r, g, b)
    return '#'+''.join(map(chr, triplet)).encode('hex')

print rgb(127,255,0)









    



#7fff00



In [ ]:

    
#Anaconda environments : in shell
ignore = """

see all environments:
conda info -e

conda create -n ENVIRONMENT_NAME python

activate ENVIRONMENTNAME

to deactivate:
deactivate

conda remove -n ENVIRONMENTNAME --all
"""



In [3]:

    
#dict get, sort

dict[key] = dict.get(key, 0) + increment

###

# sort a dict

def sortdict(dict):
    sortedkeys = dict.keys().sort()
    newdict = {}
    for key in sortedkeys:
        newdict[key] = dict[key]
    return newdict
# not tested









    



{'x': 1}



In [ ]:

    
#collections #counter

import collections

counter = collections.Counter()
counter.update('a')



In [1]:

    
# turn dict of key-value pairs into dataframe

import pandas as pd
def dict_pairs_to_df(d, sortby='value', indexby='number'):
    """ returns pandas df of key value pairs in dict without recursiom. if sortby != 'value', will sort on key; if indexby != number, will index by key"""
    rdf = pd.DataFrame()
    for key in d.keys():
        rdf = rdf.append(pd.DataFrame({'key':[key], 'value':[d[key]]}), ignore_index=True)
    if sortby == 'value':
        rdf.sort('value', ascending=True, inplace=True)
    else:
        rdf.sort('key', ascending=True, inplace=True)
    if indexby == 'number':
        rdf.reset_index(drop=True, inplace=True)
    else:
        rdf.set_index('key', drop=False, inplace=True)
    return rdf

df = dict_pairs_to_df({'a':2, 'b':5, 'c':'na'})
print df


### BETTER VERSION

def dict_pairs_to_df(d, sortby='value', indexby='number'):
    """ returns pandas df of key value pairs in dict without recursion (i.e. dict can only be single key-value pairs, not nested).
    Permitted values for sortby argument are 'value', 'value descending', 'key', 'key descending'.
    Permitted values for indexby argument are 'number' or 'key'."""
    rdf = pd.DataFrame()
    for key in d.keys():
        rdf = rdf.append(pd.DataFrame({'key':[key], 'value':[d[key]]}), ignore_index=True)
    assert sortby in ['value', 'value descending', 'key', 'key descending']
    if sortby.find('descending') != -1:
        sortasc = False
    else:
        sortasc = True
    assert indexby in ['number', 'key', 'key descending']
    if sortby.find('value') != -1:
        rdf.sort('value', ascending=sortasc, inplace=True)
    else:
        rdf.sort('key', ascending=sortasc, inplace=True)
    if indexby == 'number':
        rdf.reset_index(drop=True, inplace=True)
    else:
        rdf.set_index('key', drop=False, inplace=True)
    return rdf









    



  key value
0   a     2
1   b     5
2   c    na



In [ ]:

    
# logger, logging

import logging

#-----> USER VARIABLE <-----#
log_name = 'mylog.txt'
# curr_dir = 'C:/mypath/'
log_dir = curr_dir

logger = logging.getLogger(log_name)
log_handler = logging.FileHandler(log_dir + '/' + log_name + '.log')
log_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
log_handler.setFormatter(log_formatter)
if logger.handlers == []:
    logger.addHandler(log_handler) 
    logger.setLevel(logging.INFO)
logger.info(log_name + 'session started in ' + curr_dir)

#-----> examples of logger use:
#logger.error('Saving of {0} did not work'.format(filename))
#logger.info('{0} saved'.format(filename))



In [ ]:

    
#glob traverse directories recursive

import os

path = 'C:/Users/David/Sync2014/Py2014/111'

filelist = [os.path.join(dirpath, f)
     for dirpath, dirnames, files in os.walk(path)
     for f in files if f.endswith('.json')]

print filelist[0].split('\\')
# ['C:/Users/David/Sync2014/Py2014/111', 'votes', '2009', 'h1', 'data.json']

for fil in filelist:
    ds = fil.split('\\')
    newname = os.path.split(fil)[0] + '\\' + ds[0][-3:] + ds[1] + ds[2] + ds[3] + ds[4]
    os.rename(fil, newname)



In [ ]:

    
#os #split #text files every n lines
#every file in a directory!

import os
path = "C:/Users/David/to_split"

os.chdir(path)
    
import glob
listoffiles = glob.glob('*.csv')

prefix = "all_"
turnover = 1000000

for filename in listoffiles:
    with open("filename", "rt") as f:
    i = 0
    fout = open(prefix + filename + "0.csv", "w")
    for line in f.readlines():
      fout.write(line)
      i+=1
      if i%turnover == 0:
        fout.close()
        fout = open(prefix + filename + "output%d.csv"%(i/turnover),"wb")
    fout.close()



In [4]:

    
#consonant or #vowel #tagger

def cv(name):
    "returns consonant-vowel pattern tuplet: every one, then alternations only"
    import re
    cv = name.lower()
    cv = re.sub('[bcdfghjklmnpqrstvxz]', 'c', cv)
    cv = re.sub('cy$', 'cv', cv)
    cv = re.sub('[aeo][wy][^aeiou]', 'vvc', cv)
    cv = re.sub('[aeo][wy]$', 'vv', cv) 
    cv = re.sub('[wy]', 'c', cv) 
    cv = re.sub('[aeiou]', 'v', cv)
    cv2 = re.sub('[v]{2,}', 'v', cv)
    cv2 = re.sub('[c]{2,}', 'c', cv)
    return cv, cv2



In [6]:

    
# make list of pairwise combinations (not permutations)

def list_pairwise(listforfn):
    ni = len(listforfn)
    if ni == 1:
        return None
    else:
        pairwiselist = []
        for i in (range(ni - 1)):
            for j in range (i + 1):
                pairwiselist.append((listforfn[i], listforfn[j]))
        return pairwiselist
    
print list_pairwise([1,2,3,4])









    



[(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]



In [ ]:

    
# print ljust or rjust to align columns using whitespace

>>> def printit():
...     print 'Location: 10-10-10-10'.ljust(40) + 'Revision: 1'
...     print 'District: Tower'.ljust(40) + 'Date: May 16, 2012'
...     print 'User: LOD'.ljust(40) + 'Time: 10:15'
...
>>> printit()
Location: 10-10-10-10                   Revision: 1
District: Tower                         Date: May 16, 2012
User: LOD                               Time: 10:15



In [ ]:

    
# pandas #new#column based on others #merge #map #apply

# import pandas as pd

# def calculate(s):
#     a = s['path'] + 2*s['row'] # Simple calc for example
#     b = s['path'] * 0.153
#     return pd.Series({'col1'=a, 'col2'=b})

# df = df.merge(df.apply(calculate, axis=1), left_index=True, right_index=True)

or

df2 = pd.DataFrame({'col1':[1,2,3],'col2':['a', 'b', 'c']})

def fn(one, two):
   return str(str(one**2) + two)

def calculate(df):
    return pd.Series({'col3':str(df['col1'])+df['col2']})

df2 = df2.merge(df2.apply(calculate, axis=1), left_index=True, right_index=True)

print df2

or

def add_pct(group):
    births = group.births.astype(float)
    group['pct'] = (births / births.sum() * 100)
    return group
yob = yob.groupby(['year', 'sex']).apply(add_pct)
#add #rank of each name each year each sex
yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)

or

df_train['Gender'] = df_train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

#

percent column

#pandas #percent column
#needs to be adjusted for any dataset

    # add column 'pct': the number of births of that name and sex in that year
    # divided by the total number of births of that sex in that year, multiplied by
    # 100 to turn into a percentage and reduce leading zeroes
    def add_pct(group):
        births = group.births.astype(float)
        group['pct'] = (births / births.sum() * 100)
        return group
    yobgroups = yob.groupby(['year', 'sex']).apply(add_pct)
    #add rank of each name each year each sex
    yob['ranked'] = yobgroups.groupby(['year', 'sex'])['births'].rank(ascending=False)

    
    ### new column from boolean of existing column, leaves old values
myfunc = lambda s: 'newval' if (s['one'] == 'x')
df['newcol'] = df.apply(myfunc, axis=1)

###

apply

   # an example of apply
def calc_letter(row):
    if row.grade >= 90:
        letter_grade = 'A'
    elif row['grade'] > 75:
        letter_grade = 'B'
    elif row['grade'] > 60:
        letter_grade = 'C'
    else:
        letter_grade = 'F'
    return letter_grade

df['ltr'] = df.apply(calc_letter, axis=1)
print ' '
print df[df.ltr == 'C']



In [2]:

    
# #json and #pickle i/o

import json

d = {'a':1, 'b':2}

with open('eraseme.json', 'w+') as f:
    f.write(json.dumps(d))

#equivalent

open('eraseme.json', 'w+').write(json.dumps(d))
     
d = json.loads(open('eraseme.json', 'r').read())

####

import pickle

pickle.dump( forest, open( "forest.pickle", "wb+" ) )

favorite_color = pickle.load( open( "save.p", "rb" ) )



In [2]:

    
# change windows paths with backslashes to posix paths with forward slashes and allowed characters
# note that this does not validate that the resulting path is posix-compliant; the following function does.

def win2posix(path):
    return path.replace("\\", "/")

# validate posix path

def isvalidposix(path):
    itisvalid = True
    import re
    for i in range(len(path)):
        if 2>0:
            if ((i == 0 and path[i] == '-') or
            (i < len(path) and path[i:i+2] == '/-')):
                itisvalid = False
                print "- is not permitted as the first character of a path or file"
        srch = re.search("[^A-Za-z0-9_\.\-// ]", path[i])
        if srch:
            itisvalid = False
            print srch.group(0) + ' is not permitted permitted in POSIX filenames'
        if path[i] == ' ':
            itisvalid = False
            print "Spaces are not permitted in POSIX filenames"
    return itisvalid

print win2posix("C:\Users\Popeye\whatacool.jpg")
print isvalidposix("C:/Program Files/-thisfile&thatfile.exe")
print isvalidposix("usr/bin/data.py")









    



C:/Users/Popeye/whatacool.jpg
: is not permitted permitted in POSIX filenames
Spaces are not permitted in POSIX filenames
- is not permitted as the first character of a path or file
& is not permitted permitted in POSIX filenames
False
True



In [3]:

    
# os delete every n files

import glob
import os

multiple = 2

extension = ".jpg"

origpath = "C:/Users\David\Downloads\Contact opening 4500 frames (8-16-2014 7-39-08 AM)"
thepath = win2posix(origpath) # see function elsewhere in this notebook
os.chdir(thepath) 

if thepath[-1] != "/":
    thepath += "/"   

filelist = glob.glob(thepath + "*" + extension)

print "Original count: %d files." % (len(filelist))

for counter in range(len(filelist)):
    if counter % multiple != 0:
        os.remove(filelist[counter])
        
newfilelist = glob.glob(thepath + "*" + extension)

print "     New count: %d files." % (len(newfilelist))









    



Original count: 227 files.
     New count: 114 files.



In [ ]:

    
# change a dict into a csv,
# eg {'one':1, 'two':2} becomes:
# one,1
# two,2

def dict2csv(the_dict, csv_path, separator=',', quote_char = '|'):
    import csv
    with open(csv_path, 'wb+') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=separator,
                                quotechar=quote_char, quoting=csv.QUOTE_MINIMAL)
        for key in the_dict.keys():
            csvwriter.writerow([key, the_dict[key]])



In [1]:

    
#Download from list of #urls

# import urllib
# import urllib2
# import requests
 
# url = 'http://www.blog.pythonlibrary.org/wp-content/uploads/2012/06/wxDbViewer.zip'
 
# print "downloading with urllib"
# urllib.urlretrieve(url, "code.zip")
 
# print "downloading with urllib2"
# f = urllib2.urlopen(url)
# data = f.read()
# with open("code2.zip", "wb") as code:
#     code.write(data)
 
# print "downloading with requests"
# r = requests.get(url)
# with open("code3.zip", "wb") as code:
#     code.write(r.content)
    
###

import urllib

urllist = ['http://www.oshannonland.com/wp-content/uploads/2012/06/20-Spider-Man-20.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/21-Spider-Man-21.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/22-Spider-Man-22.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/23-Spider-Man-231.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/24-Spider-Man-24.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/25-Spider-Man-25.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/26-Spider-Man-26.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-27.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-28.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-29.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-30.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-31.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-32.mp3', 
'http://www.oshannonland.com/wp-content/uploads/2012/06/spider-man-33.mp3']

for url in urllist:
    urllib.urlretrieve(url, url.rsplit('/', 1)[-1])
    print url.rsplit('/', 1)[-1] + ' downloaded successfully'









    



20-Spider-Man-20.mp3 downloaded successfully
21-Spider-Man-21.mp3 downloaded successfully
22-Spider-Man-22.mp3 downloaded successfully
23-Spider-Man-231.mp3 downloaded successfully
24-Spider-Man-24.mp3 downloaded successfully
25-Spider-Man-25.mp3 downloaded successfully
26-Spider-Man-26.mp3 downloaded successfully
spider-man-27.mp3 downloaded successfully
spider-man-28.mp3 downloaded successfully
spider-man-29.mp3 downloaded successfully
spider-man-30.mp3 downloaded successfully
spider-man-31.mp3 downloaded successfully
spider-man-32.mp3 downloaded successfully
spider-man-33.mp3 downloaded successfully



In [3]:

    
# some uses for random numbers
import random

# create a n-digit random number string with leading zeroes

def random_digit_string(length):
    import random
    random.seed()
    txt = ''
    for i in range(length):
        txt += str(random.randint(0, 9))
    return txt

print "\nFour times ten random digits:"
    
for i in range(4):
    print random_digit_string(10),
print ""

#randrange demo    
    
print "\nrandrange demo:"
print "           10:", random.randrange(10)
print "      50-1000:", random.randrange(50,1000)
print "10-20, step 2:", random.randrange(10,20,2)

# shuffle and choice

print "\nrandom lists demo"
alist = ["one", "two", "three", "four", "five"]
print "   list:", alist
print " choice:", random.choice(alist)
random.shuffle(alist)
print "shuffle:", alist
print " sample:", random.sample(alist, 2)

# random floats

print "\nrandom floats:"
print "random.random (0-1):", random.random()
print "       uniform(2-7):", random.uniform(2,7)
print "20 random numbers between 10 and 99, with indicated mode, rounded:"
print "mode 20:",
for i in range(12):
    print int(round(random.triangular(10,99,20), 0)),
print "\nmode 90:",
for i in range(12):
    print int(round(random.triangular(10,99,90), 0)),
print ""

# gaussian distribution

print "\n20 gaussian numbers (rounded) with mean 50, std dev 10:"
for i in range(20):
    print int(round(random.gauss(50, 10),0)),
print "\nWith numpy:"
import numpy as np
print np.random.normal(50, 10, 20)









    



Four times ten random digits:
3073740002 4645234230 0540728167 2346467625 

randrange demo:
           10: 5
      50-1000: 524
10-20, step 2: 14

random lists demo
   list: ['one', 'two', 'three', 'four', 'five']
 choice: one
shuffle: ['two', 'one', 'five', 'three', 'four']
 sample: ['five', 'four']

random floats:
random.random (0-1): 0.447620412453
       uniform(2-7): 3.64311878507
20 random numbers between 10 and 99, with indicated mode, rounded:
mode 20: 56 62 20 30 43 86 70 23 68 20 21 59 
mode 90: 68 36 63 46 42 55 70 65 66 61 71 72 

20 gaussian numbers (rounded) with mean 50, std dev 10:
51 36 52 42 58 48 34 36 31 39 25 37 39 60 47 53 53 49 64 71 
With numpy:
[ 53.56384057  33.85086527  64.396047    51.10230152  52.26108643
  34.9087399   46.621346    30.20033591  47.7112718   38.81165908
  53.24774263  53.50361748  52.72360231  57.26983098  49.13291706
  64.73273801  50.46719273  52.22616892  70.49124552  55.25475551]



In [1]:

    
# examples of #functions and #lambda functions

def thefunc(x):
    if 0 <= x <= 1:
        return x ** 2
    else:
        return None
        
def shortfunc(x):
    return x ** 2 if 0 <= x < 1 else None

f = lambda x: x** 2 if 0 <= x < 1 else None

print thefunc(0.5), thefunc(2)
print shortfunc(0.5), shortfunc(2)
print f(0.5), f(2)









    



0.25 None
0.25 None
0.25 None



In [ ]:

    
#reverse #dict #reverse dict
inv_map = {v: k for k, v in d.items()}



In [4]:

    
# remove common #accents / #diacritics

# list is incomplete; notably missing are Hungarian diacritics

accents = [(r'Ą', r'A'), (r'ą', r'a'), (r'Č', r'C'), (r'č', r'c'), (r'ď', r'd'), (r'Ę', r'E'), (r'ę', r'e'), 
           (r'Ě', r'E'), (r'ě', r'e'), (r'Ĺ', r'L'), (r'ĺ', r'l'), (r'Ň', r'N'), (r'ň', r'n'), (r'Ŕ', r'R'), 
           (r'ŕ', r'r'), (r'Ř', r'R'), (r'ř', r'r'), (r'ť', r't'), (r'Ů', r'r'), (r'ů', r'r'), (r'Ž', r'Z'), 
           (r'ž', r'z'), (r'Á', r'A'), (r'á', r'a'), (r'Â', r'A'), (r'â', r'a'), (r'Ø', r'o'), (r'õ', r'o'), 
           (r'À', r'A'), (r'à', r'a'), (r'Ä', r'A'), (r'ä', r'a'), (r'Ç', r'C'), (r'ç', r'c'), (r'É', r'E'), 
           (r'é', r'e'), (r'Ê', r'E'), (r'ê', r'e'), (r'È', r'E'), (r'è', r'e'), (r'Ë', r'E'), (r'ë', r'e'), 
           (r'Í', r'I'), (r'í', r'i'), (r'Î', r'I'), (r'î', r'i'), (r'Ì', r'I'), (r'ì', r'i'), (r'Ï', r'I'), 
           (r'ï', r'i'), (r'Ñ', r'N'), (r'ñ', r'n'), (r'Ó', r'O'), (r'ó', r'o'), (r'Ô', r'O'), (r'ô', r'o'), 
           (r'Ò', r'O'), (r'ò', r'o'), (r'Ö', r'O'), (r'ö', r'o'), (r'ø', r'o'), (r'Õ', r'O'), (r'ä', r'a'),
           (r'Ú', r'r'), (r'ú', r'r'), (r'Û', r'r'), (r'û', r'r'), (r'Ù', r'r'), (r'ù', r'r'), (r'Ü', r'r'), 
           (r'ü', r'r'), (r'Ý', r'Y'), (r'ý', r'y'), (r'Š', r'S'), (r'š', r's'), (r'ÿ', r'y'), (r'Ÿ', r'Y'), 
           (r'Å', r'A'), (r'å', r'a'), (r'Ã', r'A'), (r'ã', r'a'), (r'Ä', r'A'), 
           (r'Æ', r'Ae'), (r'æ', r'ae'), (r'Œ', r'Oe'), (r'œ', r'oe'), (r'ß', r'ss')]

def remove_accents(txt):
    import re
    for accentpair in accents:
        txt = re.sub(accentpair[0], accentpair[1], txt)
    return txt
        
examples = ['Résumé', 'encyclopædia']
for example in examples:
    print remove_accents(example)
    
# to change a file
accent_change_path_in = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog.txt"
accent_change_path_out = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog_noacc.txt"
in_file = open(accent_change_path_in, 'r').read()
out_file = remove_accents(in_file)
open(accent_change_path_out, 'w+').write(out_file)









    



Resume
encyclopaedia



In [16]:

    
#progress bar-type indicator, counts to 100.

import time

class ProgressBar: 
    def __init__(self, loop_length):
        import time
        self.start = time.time()
        self.increment_size = 100.0/loop_length
        self.curr_count = 0
        self.curr_pct = 0
        self.overflow = False
        print '% complete:',
    
    def increment(self):
        self.curr_count += self.increment_size
        if int(self.curr_count) > self.curr_pct:
            self.curr_pct = int(self.curr_count)
            if self.curr_pct <= 100:
                print self.curr_pct, 
            elif self.overflow == False:
                print "\n*!* Count has gone over 100%; likely either due to:\n*!*   - an error in the loop_length specified when " + \
                      "progress_bar was instantiated\n*!*   - an error in the placement of the increment() function"
                print '*!* Elapsed time when progress bar full: %0.1f seconds.' % (time.time() - self.start)
                self.overflow = True

    def finish(self):
        if self.curr_pct == 99:
            print "100", # this is a cheat, because rounding sometimes makes the maximum count 99. One day I'll fix this bug.
        if self.overflow == True:
            print '*!* Elapsed time after end of loop: %0.1f seconds.\n' % (time.time() - self.start)
        else:
            print '\nElapsed time: %0.1f seconds.\n' % (time.time() - self.start)
#examples:

progbar_ex1 = ProgressBar(10)
for i in range(10):
    progbar_ex1.increment()
progbar_ex1.finish()
    
#example2:
progbar_ex2 = ProgressBar(251)
for i in range(251):
    progbar_ex2.increment()
progbar_ex2.finish()
    
#example3:
progbar_ex3 = ProgressBar(10)
for i in range(4962):
    progbar_ex3.increment()
progbar_ex3.finish()









    



% complete: 10 20 30 40 50 60 70 80 90 100 
Elapsed time: 0.0 seconds.

% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 
Elapsed time: 0.0 seconds.

% complete: 10 20 30 40 50 60 70 80 90 100 
*!* Count has gone over 100%; likely either due to:
*!*   - an error in the loop_length specified when progress_bar was instantiated
*!*   - an error in the placement of the increment() function
*!* Elapsed time when progress bar full: 0.0 seconds.
*!* Elapsed time after end of loop: 0.0 seconds.



In [ ]:

    
#traverse #file #tree

def w2p(path):
    return path.replace("\\", "/")
import os
for root, dirs, files in os.walk("C:\Python_packages\lingpy-2"):
    for file in files:
        if file[-3:] == '.py':
            contents = open(w2p(root) + '/' + file, 'r').read()
            contents = "# -*- coding: utf-8 -*-\n" + contents
            #open(w2p(root) + '/' + file, 'w').write(contents) # rewrites file



In [18]:

    
#convert #color #rgb #hex #hls

r = 0
g = 127
b = 255

print "#{0:02x}{1:02x}{2:02x}".format(r, g, b)


############################


import struct
rgbstr='aabbcc'
print struct.unpack('BBB',rgbstr.decode('hex'))

rgb = (50,100,150)
print struct.pack('BBB',*rgb).encode('hex')


######

import colorsys
# note takes values from 0 to 1

r, g, b = (50, 100, 150)

print colorsys.rgb_to_hls(r/255.0, g/255.0, b/255.0)

print colorsys.hls_to_rgb(.583, .392, .5)

# example: lighten colors

colors = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838', 
                  '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C', 
                  '#92289E', '#242D7D']
import struct
new = []
for color in colors:
    r, g, b = struct.unpack('BBB',color[1:].decode('hex'))
    h, l, s = colorsys.rgb_to_hls(r/255.0, g/255.0, b/255.0)
    rgb2 = colorsys.hls_to_rgb(h, (5*l+1)/6, s)
    print r, g, b, h, l, s, rgb2,
    r, g, b = int(rgb2[0] * 255), int(rgb2[1] * 255), int(rgb2[2] * 255)
    print r, g, b
    new.append("#{0:02x}{1:02x}{2:02x}".format(r, g, b))

print new









    



#007fff
(170, 187, 204)
326496
(0.5833333333333334, 0.39215686274509803, 0.5000000000000001)
(0.19599999999999995, 0.392784, 0.5880000000000001)
187 33 20 0.0129740518962 0.405882352941 0.806763285024 (0.9043288813109785, 0.1676612674055128, 0.10547504025764887) 230 42 26
12 89 102 0.524074074074 0.223529411765 0.789473684211 (0.07430340557275539, 0.551083591331269, 0.6315789473684211) 18 140 161
186 120 20 0.100401606426 0.403921568627 0.805825242718 (0.9035471793895551, 0.5852528713750872, 0.10298876832286319) 230 149 26
68 89 171 0.632686084142 0.46862745098 0.430962343096 (0.36635491016490285, 0.4441709738288621, 0.7480241748024175) 93 113 190
107 56 56 0.0 0.319607843137 0.312883435583 (0.5684871085448494, 0.29752596335057535, 0.29752596335057535) 144 75 75
184 50 123 0.9092039801 0.458823529412 0.57264957265 (0.8072733366851014, 0.290765879001173, 0.5721468074409248) 205 74 145
43 148 127 0.466666666667 0.374509803922 0.549738219895 (0.21556650583444548, 0.7419498340348356, 0.6366731683947575) 54 189 162
13 131 181 0.549603174603 0.380392156863 0.865979381443 (0.06482042989016912, 0.6531904858163199, 0.9024998315477395) 16 166 230
104 66 135 0.75845410628 0.394117647059 0.34328358209 (0.5123402594868792, 0.32513901082821195, 0.6650570676031606) 130 82 169
140 150 44 0.182389937107 0.380392156863 0.546391752577 (0.6980661680479754, 0.7479280371942592, 0.21939222424364935) 178 190 55
146 40 158 0.816384180791 0.388235294118 0.59595959596 (0.7229154287977823, 0.19805902158843336, 0.7823331352743118) 184 50 199
36 45 125 0.649812734082 0.31568627451 0.552795031056 (0.1921812203142126, 0.24022652539276557, 0.6672959038687939) 49 61 170
['#e62a1a', '#128ca1', '#e6951a', '#5d71be', '#904b4b', '#cd4a91', '#36bda2', '#10a6e6', '#8252a9', '#b2be37', '#b832c7', '#313daa']



In [ ]:

    
#BeautifulSoup

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_string) #or soup = BeautifulSoup(open("index.html"))

#to find first such tag:
tag = soup.find('p')
#attributes:
tag.name
tag['class'] # an attr
tag.contents
tag.attrs # returns a dict

#to find all tags (returns list)
tags = soup.findAll('p')

#to find <select name="HSpeaker" multiple size="4">
hspeaker = soup.find('select',{'name':'HSpeaker'})

#to create list all option tags under the above:
options = hspeaker.findAll('option')

print soup.find(id="link3")

#extract all urls in <a href=""> tags
for link in soup.find_all('a'):
    print(link.get('href'))
    
#extract all text
print(soup.get_text())

# more:
# http://omz-software.com/pythonista/docs/ios/beautifulsoup_guide.html



In [ ]:

    
# mechanize

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)   # ignore robots          # no cookies
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
# Follows refresh 0 but not hangs on refresh > 0:
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

user_agents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6',
               'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
               'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
               'Opera/9.00 (Windows NT 5.1; U; en)']

agent = 2

br.addheaders = [('User-agent', user_agents[agent])]

# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

#alternate to use cookies
# import cookielib
# cj = cookielib.LWPCookieJar()
# br.set_cookiejar(cj)

#########################################

response = br.open('http://www.example.com') # can only be used once
soup = BeautifulSoup(response.read())
br.response().info() # headers

#forms
for form in br.forms():
    print form
br.select_form(nr=0) #first form
br.select_form("form1")         # works when form has a name
br.form = list(br.forms())[0]  # use when form is unnamed

# submit form
# eg if form control is <CheckboxControl(SenateSection=[*1])>
br.form['SenateSection']=1
br.submit()
print br.response().read()
#list all controls
for control in br.form.controls:
    print control
    print "type=%s, name=%s value=%s" % (control.type, control.name, br[control.name])
#find by name
control = br.form.find_control("controlname")
#allowed values
if control.type == "select":  # means it's class ClientForm.SelectControl
    for item in control.items:
    print " name=%s values=%s" % (item.name, str([label.text  for label in item.get_labels()]))
#select-type controls must be set with a list, even if only one item:
print control.value
print control  # selected value is starred
control.value = ["ItemName"]
print control
br[control.name] = ["ItemName"]  # equivalent and more normal
#text controls can be set as string
if control.type == "text":  # means it's class ClientForm.TextControl
    control.value = "stuff here"
br["controlname"] = "stuff here"  # equivalent
    
    

# Looking at some results in link format
for link in br.links(url_regex='stockrt'):
    print link
    #or
    print link.text, link.url

#links
# Testing presence of link (if the link is not found you would have to
# handle a LinkNotFoundError exception)
br.find_link(text='Weekend codes')
# Actually clicking the link
req = br.click_link(text='Weekend codes')
br.open(req)
print br.response().read()
print br.geturl()
# Back
br.back()
print br.response().read()
print br.geturl()
#
br.follow_link(text='Sign out')
#or
resp = br.follow_link(...)
# follow lots of links
all_links = [l for l in br.links(url_regex='\?v=c&th=')] # or text_regex
# Select the first 3
for link in all_links[0:3]:
    print link
# Open each message
br.follow_link(msg_link)
#
request = br.click_link(link)
response = br.follow_link(link)
print response.geturl()
print response.get_data()
#
for link in br.links():
    print link.text, link.url

# Download
f = br.retrieve('http://www.google.com.br/intl/pt-BR_br/images/logo.gif')[0]

# Proxy and user/password
br.set_proxies({"http": "joe:password@myproxy.example.com:3128"})
# Proxy
br.set_proxies({"http": "myproxy.example.com:3128"})
# Proxy password
br.add_proxy_password("joe", "password")

# more, incl password protected auth
# http://stockrt.github.io/p/emulating-a-browser-in-python-with-mechanize/

#pydoc
# http://joesourcecode.com/Documentation/mechanize0.2.5/