Since program readability is our top priority — Python aims to make the programmer's life easier — we should only optimize our code when it is actually needed.
In other words, optimizing before measuring a real bottleneck is premature.
Inย [60]:
! pip install numba numexpr
Inย [6]:
import math
import time
import warnings
from dateutil.parser import parse
import janitor
import numpy as np
import pandas as pd
from numba import jit
from sklearn import datasets
from pandas.api.types import is_datetime64_any_dtype as is_datetime
Inย [7]:
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
pd.options.display.max_columns = 999
Inย [8]:
path = 'https://raw.githubusercontent.com/FBosler/you-datascientist/master/invoices.csv'
def load_dataset(naivly=False):
    """Download the invoice data set and inflate it 20x for benchmarking.

    The CSV is fetched and cleaned once, then the cleaned frame is
    concatenated 20 times (the original re-downloaded and re-cleaned the
    remote file on every loop iteration). A 20% tip column is derived with
    a vectorized multiply instead of a per-element ``map``.

    Parameters
    ----------
    naivly : bool, default False
        When True, cast every column to ``object`` to simulate the
        worst-case (fully boxed) memory layout.

    Returns
    -------
    pandas.DataFrame
    """
    # Hoist the loop-invariant download + cleanup: one network fetch, not 20.
    base = (pd.read_csv(path)
            .clean_names()
            .remove_columns(["meal_id", "company_id"]))
    df = (pd.concat([base] * 20)
          # Vectorized column arithmetic is equivalent to the elementwise map.
          .assign(meal_tip=lambda d: d.meal_price * 0.2)
          .astype({"meal_price": int})
          .rename(columns={"meal_price": "meal_price_with_tip"}))
    if naivly:
        for col in df.columns:
            df[col] = df[col].astype(object)
    return df
Inย [9]:
df = load_dataset()
df.head()
Out[9]:
Inย [1]:
def iterrows_original_meal_price(df):
    """Row-by-row computation of the pre-tip meal price (slowest variant).

    Bug fix: the original used chained indexing ``df.loc[i][col] = ...``,
    which assigns into a temporary copy returned by ``df.loc[i]`` and
    silently leaves ``df`` unchanged. ``df.loc[i, col]`` writes into the
    frame itself.

    Returns the mutated ``df`` for convenience.
    """
    for i, row in df.iterrows():
        # Single .loc call with a (row, column) pair assigns in place.
        df.loc[i, "original_meal_price"] = row["meal_price_with_tip"] - row["meal_tip"]
    return df
Inย [5]:
%%timeit -r 1 -n 1
iterrows_original_meal_price(df)
Inย [4]:
def apply_original_meal_price(df):
    """Compute the pre-tip price via a per-row ``DataFrame.apply`` call.

    Much slower than the vectorized variant: the subtraction runs once per
    row in Python instead of once per column in C.
    """
    def _pre_tip(row):
        # Price before the tip was added.
        return row["meal_price_with_tip"] - row["meal_tip"]

    df["original_meal_price"] = df.apply(_pre_tip, axis=1)
    return df
Inย [5]:
%%timeit
apply_original_meal_price(df)
Inย [65]:
def vectorized_original_meal_price(df):
    """Compute the pre-tip price as a single vectorized column subtraction."""
    price_with_tip = df["meal_price_with_tip"]
    tip = df["meal_tip"]
    # One whole-column operation — no Python-level per-row loop.
    df["original_meal_price"] = price_with_tip - tip
    return df
Inย [66]:
%%timeit
vectorized_original_meal_price(df)
Inย [67]:
# 5000-element float64 array of ones, reused below to compare the per-dtype
# memory cost of the same values.
ones = np.ones(shape=5000)
ones
Out[67]:
Inย [84]:
# Store the same 5000 ones under each dtype; deep memory_usage then shows
# what each representation costs (boxed `object` values are by far the
# largest, narrow ints and bool the smallest).
types = ['object', 'complex128', 'float64', 'int64', 'int32', 'int16', 'int8', 'bool']
df = pd.DataFrame(dict([(t, ones.astype(t)) for t in types]))
df.memory_usage(index=False, deep=True)
Out[84]:
Inย [69]:
df = load_dataset(naivly=True)
Inย [70]:
df.memory_usage(deep=True).sum()
Out[70]:
Inย [71]:
df.memory_usage(deep=True)
Out[71]:
Inย [72]:
df.dtypes
Out[72]:
So it's pretty obvious that we should aim for the dtype with the lowest memory footprint that still provides the functionality we need.
Inย [11]:
# Downcast each column to the cheapest dtype that still represents the data:
# repeated strings -> category, small ints -> int16, flags -> bool, and
# float32 where float64 precision isn't needed for the tip amounts.
df = df.astype({'order_id': 'category',
                'date': 'category',
                'date_of_meal': 'category',
                'participants': 'category',
                'meal_price': 'int16',
                'type_of_meal': 'category',
                'heroes_adjustment': 'bool',
                'meal_tip': 'float32'})
Inย [75]:
df.memory_usage(deep=True).sum()
Out[75]:
Inย [76]:
df.memory_usage(deep=True)
Out[76]:
Inย [77]:
df.dtypes
Out[77]:
There are open-source extension types such as cyberpandas for IP-address-like data and geopandas for spatial data.
Inย [52]:
def proccess():
    """Placeholder for per-chunk processing logic; intentionally a no-op."""
Inย [53]:
def proccess_file(huge_file_path, chunksize=10 ** 6, process=None):
    """Stream a huge CSV in fixed-size chunks to bound peak memory.

    Bug fixes: the original ignored ``huge_file_path`` (it read the global
    ``path`` instead) and called the undefined name ``process`` — the
    sibling placeholder is spelled ``proccess`` and takes no arguments, so
    the function always raised ``NameError``. The chunk handler is now an
    explicit, optional parameter.

    Parameters
    ----------
    huge_file_path : str or path-like
        CSV file to read.
    chunksize : int, default 10**6
        Rows per chunk handed to ``process``.
    process : callable or None
        Called once per chunk DataFrame; chunks are skipped when None.
    """
    for chunk in pd.read_csv(huge_file_path, chunksize=chunksize):
        if process is not None:
            process(chunk)
Inย [32]:
%%timeit
df["meal_price_with_tip"].astype(object).mean()
Inย [33]:
%%timeit
df["meal_price_with_tip"].astype(float).mean()
The main benefit of eval/query is the memory saved:
NumPy allocates a full array for every intermediate step of a compound expression,
whereas eval/query can evaluate the expression element by element, without allocating full intermediate arrays.
Inย [28]:
%%timeit
df[df.type_of_meal=="Breakfast"]
Inย [29]:
%%timeit
df.query("type_of_meal=='Breakfast'")
Inย [17]:
def foo(N):
    """Sum the integers 0..N-1 in pure Python (CPU-bound benchmark workload)."""
    # sum(range(N)) iterates at C speed; result is identical to the
    # accumulate-in-a-loop version (and 0 for N <= 0).
    return sum(range(N))
Inย [18]:
%%timeit
df.meal_price_with_tip.map(foo)
Inย [19]:
%load_ext Cython
Inย [20]:
%%cython
def cython_foo(long N):
    """Sum the integers 0..N-1 using C-typed locals."""
    # cdef long: the accumulator and loop index live as native C longs, so
    # the loop compiles to plain C arithmetic with no Python object boxing.
    cdef long accumulator
    accumulator = 0
    cdef long i
    for i in range(N):
        accumulator += i
    return accumulator
Inย [21]:
%%timeit
df.meal_price_with_tip.map(cython_foo)
Using numba is really easy: it's simply a matter of adding a decorator to a function.
Inย [22]:
@jit(nopython=True)
def numba_foo(N):
    """JIT-compiled sum of 0..N-1 — same Python loop, compiled to machine code."""
    total = 0
    for k in range(N):
        total += k
    return total
Inย [23]:
%%timeit
df.meal_price_with_tip.map(numba_foo)
Inย [ย ]:
@jit(nopython=True, parallel=True)
def numba_foo(N):
    """Parallel JIT-compiled sum of 0..N-1.

    Bug fix: with ``parallel=True`` a plain ``range`` loop is NOT
    parallelized (numba even warns that no transformation was possible);
    ``numba.prange`` is required for numba to split the loop across
    threads. ``accumulator += i`` is recognized as a reduction, so the
    result is unchanged.
    """
    accumulator = 0
    for i in prange(N):
        accumulator += i
    return accumulator
Inย [ย ]:
%%timeit
df.meal_price_with_tip.map(numba_foo)
Inย [32]:
def another_foo(data):
    """Double the input (works on scalars and numpy arrays alike)."""
    doubled = data * 2
    return doubled
def foo(data):
    """Shift the input up by ten (scalar or numpy array)."""
    shifted = data + 10
    return shifted
Inย [28]:
%reload_ext memory_profiler
Inย [37]:
def load_data(n_elements=2 ** 30):
    """Allocate an array of ones, one byte per element.

    Generalized: the hard-coded 2**30 (~1 GiB) size is now a defaulted
    parameter, so the allocation can be scaled down for tests while the
    zero-argument call keeps its original behavior.

    Parameters
    ----------
    n_elements : int, default 2**30
        Number of uint8 elements to allocate.
    """
    return np.ones((n_elements,), dtype=np.uint8)
Inย [50]:
%%memit
def proccess():
    # Chained calls: foo(data) allocates a new array while `data` is still
    # referenced by this frame, so several ~1 GiB intermediates are alive
    # at once — higher peak memory than the rebinding version below.
    data = load_data()
    return another_foo(foo(data))
proccess()
Inย [51]:
%%memit
def proccess():
    data = load_data()
    # Rebinding `data` drops the reference to the previous array after each
    # step, letting it be freed before the next allocation — lower peak
    # memory than nesting the calls.
    data = foo(data)
    data = another_foo(data)
    return data
proccess()
Closing notes on other dataframe libraries: cuDF's fillna performs poorly while its groupby is excellent; Modin's performance can be inconsistent; and cuDF is comparatively hard to set up.