In [ ]:
replace_group = lambda x: x if x in groups_to_keep else aggregate_name # function to replace non-kept column names
dfplot[grouped_column] = dfplot[grouped_column].apply(replace_group) # replace column names
bb['decade'] = bb.apply(lambda row: int(str(row.year)[:3])*10, axis=1)
In [ ]:
#Changing value instead of view in iterrows:
dfmi.loc[:,('one','second')] = x
isntead of
dfmi['one']['second'] = x
In [ ]:
# add column 'pct': the number of births of that name and sex in that year
# divided by the total number of births of that sex in that year, multiplied by
# 100 to turn into a percentage and reduce leading zeroes
def add_pct(group):
births = group.births.astype(float)
group['pct'] = (births / births.sum() * 100)
return group
yob = yob.groupby(['year', 'sex']).apply(add_pct)
#add rank of each name each year each sex
yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)
In [ ]:
#pandas #add #fill in #missing #rows, e.g. #years
import pandas as pd
df = pd.DataFrame({'year':[2000, 2001, 2002, 2004, 2005, 2006], 'qty':[10, 20, 15, 25, 20, 30]})
tocomplete = set(df.year)
for i in range(2000, 2007):
if i not in tocomplete:
df = df.append(pd.DataFrame({'year':[i], 'qty':[0]}), ignore_index=True)
In [ ]:
#calculate comorbidity matrix
import csv
with open('drugs_comorbid.csv', 'wb') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['drug1', 'drug2', 'comorbnum'])
for dualflag in dualflags:
drug1 = full_drugs[dualflag[0]]
drug2 = full_drugs[dualflag[1]]
comorbnum = df[(df[dualflag[0]]==1) &
(df[dualflag[1]]==1)].ANALWT_C.sum()
writer.writerow([drug1, drug2, comorbnum])
if drug1 != drug2:
writer.writerow([drug2, drug1, comorbnum])
dfcmb = pd.read_csv('drugs_comorbid.csv', index_col=None)
dfcmb
In [ ]:
def calc_letter(row):
if row.grade >= 90:
letter_grade = 'A'
elif row['grade'] > 75:
letter_grade = 'B'
elif row['grade'] > 60:
letter_grade = 'C'
else:
letter_grade = 'F'
return letter_grade
df['ltr'] = df.apply(calc_letter, axis=1)
# pandas #new#column based on others #merge #map #apply
# import pandas as pd
# def calculate(s):
# a = s['path'] + 2*s['row'] # Simple calc for example
# b = s['path'] * 0.153
# return pd.Series({'col1'=a, 'col2'=b})
# df = df.merge(df.apply(calculate, axis=1), left_index=True, right_index=True)
or
df2 = pd.DataFrame({'col1':[1,2,3],'col2':['a', 'b', 'c']})
def fn(one, two):
return str(str(one**2) + two)
def calculate(df):
return pd.Series({'col3':str(df['col1'])+df['col2']})
df2 = df2.merge(df2.apply(calculate, axis=1), left_index=True, right_index=True)
print df2
or
def add_pct(group):
births = group.births.astype(float)
group['pct'] = (births / births.sum() * 100)
return group
yob = yob.groupby(['year', 'sex']).apply(add_pct)
#add #rank of each name each year each sex
yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)
or
df_train['Gender'] = df_train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
#
percent column
#pandas #percent column
#needs to be adjusted for any dataset
# add column 'pct': the number of births of that name and sex in that year
# divided by the total number of births of that sex in that year, multiplied by
# 100 to turn into a percentage and reduce leading zeroes
def add_pct(group):
births = group.births.astype(float)
group['pct'] = (births / births.sum() * 100)
return group
yobgroups = yob.groupby(['year', 'sex']).apply(add_pct)
#add rank of each name each year each sex
yob['ranked'] = yobgroups.groupby(['year', 'sex'])['births'].rank(ascending=False)
### new column from boolean of existing column, leaves old values
myfunc = lambda s: 'newval' if (s['one'] == 'x')
df['newcol'] = df.apply(myfunc, axis=1)
###
apply
# an example of apply
def calc_letter(row):
if row.grade >= 90:
letter_grade = 'A'
elif row['grade'] > 75:
letter_grade = 'B'
elif row['grade'] > 60:
letter_grade = 'C'
else:
letter_grade = 'F'
return letter_grade
df['ltr'] = df.apply(calc_letter, axis=1)
print ' '
print df[df.ltr == 'C']
In [ ]:
x = """
____ ____ _ _ ___ ___
/ ___/ ___|| |_ _ __(_)_ __ __ _|_ _/ _ \
| | \___ \| __| '__| | '_ \ / _` || | | | |
| |___ ___) | |_| | | | | | | (_| || | |_| |
\____|____/ \__|_| |_|_| |_|\__, |___\___/
|___/ """
#cStringIO #stringio to load #inline #text #table to #dataframe #df
thecsv = """
letter,freq
a,0.081670
b,0.014920
c,0.027820
"""
from six.moves import cStringIO as cStringIO
df = pd.read_csv(cStringIO(thecsv), skiprows=1, skipinitialspace=True)
In [ ]:
#pandas #pivot #unpivot to replace missing values with zero for graphing
def pivot_unpivot(df, row_name, column_name, value_name):
""" Pivot and unpivot a dataframe to replace missing values with zeroes """
pivoted = pd.DataFrame(pd.pivot_table(df, values=value_name, index = row_name, columns=column_name)).fillna(0.0)
unpivoted = pd.DataFrame()
for column in pivoted.columns:
unpivoted = unpivoted.append(pd.DataFrame({row_name: list(pivoted.index), column_name: [column]*len(pivoted),
value_name: list(pivoted[column])}), ignore_index=True)
return unpivoted