In [18]:
import pandas as pd
df = pd.read_excel("sheet_1_without_simple_logic.xls")
print(df)
# note the NaN (not-a-number) cells when we have no value
In [19]:
df.head(10) # this creates a Table view (non-interactive but prettier)
Out[19]:
In [20]:
print("Column names:", df.columns)
In [21]:
print("Information about each row including data types:")
print("(note - type 'object' is catch-all that includes strings)")
df.info()
In [22]:
print("\nWe can extract a column of data as a Series object:")
print(df['Feature1'])
In [23]:
row = df.ix[0]
print("\nWe can extract a row as a Python dictionary:")
print(row)
In [24]:
print("\nRow items, e.g. Feature1={feature1}".format(feature1=row['Feature1']))
In [25]:
def decision_f1_f2(row):
feature_1 = row['Feature1']
feature_2 = row['Feature2']
if feature_1 > 0.5:
if feature_2 > 0.5:
return True
return False
# we'll use apply on the entire DataFrame, axis=1 means row-wise (not column-wise)
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html
df.apply(decision_f1_f2, axis=1)
# note this doesn't change the DataFrame, it generates a new separate Series
# and here we just print it and then discard it
Out[25]:
In [27]:
df.head(10) # confirm we haven't modified the DataFrame
Out[27]:
In [29]:
# we can assign the result back to the DataFrame as a new column
new_result = df.apply(decision_f1_f2, axis=1)
df['DecisionF1F2'] = new_result
df.head(10)
Out[29]:
In [31]:
def decision_tvshow(cell):
cleaned_cell = cell
cleaned_cell = cleaned_cell.lower() # lowercase
is_hollyoaks = cleaned_cell == "hollyoaks"
return is_hollyoaks
# we can assign the result back to the DataFrame as a new column
df['Decision2'] = df['TVShow'].apply(decision_tvshow)
df.head(10)
Out[31]:
In [32]:
def decision_tvshow_is_hollyoaks(cell):
cleaned_cell = cell
cleaned_cell = cleaned_cell.lower() # lowercase
cleaned_cell = cleaned_cell.strip() # remove superflous whitespace
is_hollyoaks = cleaned_cell == "hollyoaks"
return is_hollyoaks
# we can assign the result back to the DataFrame as a new column
df['Decision2'] = df['TVShow'].apply(decision_tvshow_is_hollyoaks)
df.head(10)
Out[32]:
In [34]:
# use a different way to access the columns (using .colname - this only works for
# easy-to-read ASCII names, it won't work with funny characters)
# and use the logical and (&) to do pairwise logic, assigning the result to our new column
df['Decision3'] = df.DecisionF1F2 & df.Decision2
df.head(10)
Out[34]:
In [35]:
writer = pd.ExcelWriter('sheet_1_with_added_logic_generated_via_pandas.xlsx', engine='xlsxwriter')
df.to_excel(writer, index=False, sheet_name='Our New Sheet')
workbook = writer.book
worksheet = writer.sheets['Our New Sheet']
writer.save()
# note we could add lots of conditional formatting for Excel via:
# http://pbpython.com/improve-pandas-excel-output.html
In [47]:
from fuzzywuzzy import fuzz
# http://pbpython.com/excel-pandas-comp.html side reading on fuzzywuzzy
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ other fuzzywuzzy metrics for
# other ways of measuring similarity
In [37]:
# quick demo:
item1 = "hollyoaks"
target = "hollyoaks"
print(target, fuzz.ratio(item1, target))
target = 'holly-oaks'
print(target, fuzz.ratio(item1, target))
target = 'holly oak'
print(target, fuzz.ratio(item1, target))
target = "tv's best moments"
print(target, fuzz.ratio(item1, target))
In [41]:
def decision_tvshow_is_hollyoaks_with_smarts(cell):
cleaned_cell = cell
cleaned_cell = cleaned_cell.lower() # lowercase
cleaned_cell = cleaned_cell.strip() # remove superflous whitespace
is_hollyoaks = fuzz.ratio("hollyoaks", cleaned_cell) > 85
return is_hollyoaks
# we can assign the result back to the DataFrame as a new column
df['TVShow'].apply(decision_tvshow_is_hollyoaks_with_smarts)
Out[41]:
In [42]:
df.head(10) # show that we *haven't* yet manipulated the DataFrame
Out[42]:
In [45]:
df['Decision2'] = df['TVShow'].apply(decision_tvshow_is_hollyoaks_with_smarts)
df.head(10)
# note that we *haven't re-run Decision3*!
Out[45]:
In [46]:
df['Decision3'] = df.DecisionF1F2 & df.Decision2
df.head(10)
Out[46]:
In [44]:
# We could copy the Excel code down here and write out a new spreadsheet to disk...
In [ ]: