Apply lambda function to a column


In [ ]:
replace_group = lambda x: x if x in groups_to_keep else aggregate_name # function to replace non-kept column names
dfplot[grouped_column] = dfplot[grouped_column].apply(replace_group) # replace column names

bb['decade'] = bb.apply(lambda row: int(str(row.year)[:3])*10, axis=1)

In [ ]:
#Changing value instead of view in iterrows:
dfmi.loc[:,('one','second')] = x
isntead of
dfmi['one']['second'] = x

Calculate percentage in group


In [ ]:
# add column 'pct': the number of births of that name and sex in that year
# divided by the total number of births of that sex in that year, multiplied by
# 100 to turn into a percentage and reduce leading zeroes
def add_pct(group):
    births = group.births.astype(float)
    group['pct'] = (births / births.sum() * 100)
    return group
yob = yob.groupby(['year', 'sex']).apply(add_pct)
#add rank of each name each year each sex
yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)

In [ ]:
#pandas #add #fill in #missing #rows, e.g. #years

import pandas as pd
df = pd.DataFrame({'year':[2000, 2001, 2002, 2004, 2005, 2006], 'qty':[10, 20, 15, 25, 20, 30]})

tocomplete = set(df.year)

for i in range(2000, 2007):
    if i not in tocomplete:
        df = df.append(pd.DataFrame({'year':[i], 'qty':[0]}), ignore_index=True)

Using csv.writer and pandas together


In [ ]:
#calculate comorbidity matrix

import csv
with open('drugs_comorbid.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['drug1', 'drug2', 'comorbnum'])
    for dualflag in dualflags:
        drug1 = full_drugs[dualflag[0]]
        drug2 = full_drugs[dualflag[1]]
        comorbnum = df[(df[dualflag[0]]==1) &
                       (df[dualflag[1]]==1)].ANALWT_C.sum()
        writer.writerow([drug1, drug2, comorbnum])
        if drug1 != drug2:
            writer.writerow([drug2, drug1, comorbnum])

dfcmb = pd.read_csv('drugs_comorbid.csv', index_col=None)
dfcmb

new calculated column with .apply


In [ ]:
def calc_letter(row):
    if row.grade >= 90:
        letter_grade = 'A'
    elif row['grade'] > 75:
        letter_grade = 'B'
    elif row['grade'] > 60:
        letter_grade = 'C'
    else:
        letter_grade = 'F'
    return letter_grade

df['ltr'] = df.apply(calc_letter, axis=1)

# pandas #new#column based on others #merge #map #apply

# import pandas as pd

# def calculate(s):
#     a = s['path'] + 2*s['row'] # Simple calc for example
#     b = s['path'] * 0.153
#     return pd.Series({'col1'=a, 'col2'=b})

# df = df.merge(df.apply(calculate, axis=1), left_index=True, right_index=True)

or

df2 = pd.DataFrame({'col1':[1,2,3],'col2':['a', 'b', 'c']})

def fn(one, two):
   return str(str(one**2) + two)

def calculate(df):
    return pd.Series({'col3':str(df['col1'])+df['col2']})

df2 = df2.merge(df2.apply(calculate, axis=1), left_index=True, right_index=True)

print df2

or

def add_pct(group):
    births = group.births.astype(float)
    group['pct'] = (births / births.sum() * 100)
    return group
yob = yob.groupby(['year', 'sex']).apply(add_pct)
#add #rank of each name each year each sex
yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)

or

df_train['Gender'] = df_train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

#

percent column

#pandas #percent column
#needs to be adjusted for any dataset

    # add column 'pct': the number of births of that name and sex in that year
    # divided by the total number of births of that sex in that year, multiplied by
    # 100 to turn into a percentage and reduce leading zeroes
    def add_pct(group):
        births = group.births.astype(float)
        group['pct'] = (births / births.sum() * 100)
        return group
    yobgroups = yob.groupby(['year', 'sex']).apply(add_pct)
    #add rank of each name each year each sex
    yob['ranked'] = yobgroups.groupby(['year', 'sex'])['births'].rank(ascending=False)

    
    ### new column from boolean of existing column, leaves old values
myfunc = lambda s: 'newval' if (s['one'] == 'x')
df['newcol'] = df.apply(myfunc, axis=1)

###

apply

   # an example of apply
def calc_letter(row):
    if row.grade >= 90:
        letter_grade = 'A'
    elif row['grade'] > 75:
        letter_grade = 'B'
    elif row['grade'] > 60:
        letter_grade = 'C'
    else:
        letter_grade = 'F'
    return letter_grade

df['ltr'] = df.apply(calc_letter, axis=1)
print ' '
print df[df.ltr == 'C']

In [ ]:
x = """
  ____ ____  _        _             ___ ___  
 / ___/ ___|| |_ _ __(_)_ __   __ _|_ _/ _ \ 
| |   \___ \| __| '__| | '_ \ / _` || | | | |
| |___ ___) | |_| |  | | | | | (_| || | |_| |
 \____|____/ \__|_|  |_|_| |_|\__, |___\___/ 
                              |___/           """        

#cStringIO #stringio to load #inline #text #table to #dataframe #df

thecsv = """
letter,freq
a,0.081670
b,0.014920
c,0.027820
"""

from six.moves import cStringIO as cStringIO
df = pd.read_csv(cStringIO(thecsv), skiprows=1, skipinitialspace=True)

In [ ]:
#pandas #pivot #unpivot to replace missing values with zero for graphing
def pivot_unpivot(df, row_name, column_name, value_name):
    """ Pivot and unpivot a dataframe to replace missing values with zeroes """
    pivoted = pd.DataFrame(pd.pivot_table(df, values=value_name, index = row_name, columns=column_name)).fillna(0.0)
    unpivoted = pd.DataFrame()
    for column in pivoted.columns:
        unpivoted = unpivoted.append(pd.DataFrame({row_name: list(pivoted.index), column_name: [column]*len(pivoted),
                                       value_name: list(pivoted[column])}), ignore_index=True)
    return unpivoted