In [ ]:
# Print the version of the Python interpreter and of every library
# imported in this cell, one per line.
import platform
print('Python: ' + platform.python_version())
import numpy as np
print('numpy: ' + np.__version__)
import pandas as pd
print('pandas: ' + pd.__version__)
import scipy
print('scipy: ' + scipy.__version__)
import sklearn
print('scikit-learn: ' + sklearn.__version__)
# NOTE: `plt` is bound to the top-level matplotlib package here (only its
# __version__ is read below), not to matplotlib.pyplot.
import matplotlib as plt
print('matplotlib: ' + plt.__version__)
# NOTE(review): newer Flask releases reportedly deprecate flask.__version__
# -- confirm against the installed Flask version.
import flask
print('flask: ' + flask.__version__)
In [ ]:
# In case of no Internet, read the local copy instead:
# pd.read_json('data/cached_python.json')
# Load the JSON data, resample it with the '1W' rule, take the mean of each
# bin and show only the 'daily_views' column.
(
pd.read_json('https://raw.githubusercontent.com/Nozdi/first-steps-with-pandas-workshop/master/data/cached_python.json')
.resample('1W')
.mean()
['daily_views']
)
In [ ]:
# Benchmark fixtures: 999 separate rows, each holding the integers 1..99,
# once as nested Python lists and once as a DataFrame built from them.
some_data = [[*range(1, 100)] for _ in range(999)]
some_df = pd.DataFrame(data=some_data)
def standard_way(data):
    """Return a new nested list with every element of ``data`` doubled.

    ``data`` is iterated row by row and each row element by element; the
    input itself is not modified.
    """
    return [[col * 2 for col in row] for row in data]
def pandas_way(df):
    """Return ``df * 2``: the whole object doubled with a single ``*``.

    No explicit Python loop is written here; the multiplication is left
    to ``df``'s own ``*`` operator.
    """
    return df * 2
In [ ]:
%timeit standard_way(some_data)
In [ ]:
%timeit pandas_way(some_df)
In [ ]:
strengths = pd.Series([400, 200, 300, 400, 500])
strengths
In [ ]:
names = pd.Series(["Batman", "Robin", "Spiderman", "Robocop", "Terminator"])
names
In [ ]:
heroes = pd.DataFrame({
'hero': names,
'strength': strengths
})
heroes
In [ ]:
# Rows supplied as plain dicts; the second row carries no 'strength' key.
other_heroes = pd.DataFrame(data=[
    {"hero": "Hercules", "strength": 800},
    {"hero": "Conan"},
])
other_heroes
In [ ]:
another_heroes = pd.DataFrame([
pd.Series(["Wonder Woman", 10, 3], index=["hero", "strength", "cookies"]),
pd.Series(["Xena", 20, 0], index=["hero", "strength", "cookies"])
])
another_heroes
In [ ]:
another_heroes.columns
In [ ]:
another_heroes.shape
In [ ]:
another_heroes.info()
In [ ]:
another_heroes['cookies']
In [ ]:
another_heroes.cookies
In [ ]:
another_heroes[ ['hero', 'cookies'] ]
In [ ]:
another_heroes[['hero', 'cookies']][['cookies']]
In [ ]:
another_heroes[['hero', 'cookies']][['cookies']]['cookies']
Create the DataFrame presented below in 3 different ways:
movie_title imdb_score
0 Avatar 7.9
1 Pirates of the Caribbean: At World's End 7.1
2 Spectre 6.8
Help: http://pandas.pydata.org/pandas-docs/stable/dsintro.html#from-dict-of-series-or-dicts
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Uncomment and press tab..
# pd.read_
# SQL, csv, hdf
In [ ]:
# pd.read_csv?
In [ ]:
# executing bash in jupyter notebook
!head -c 500 data/cached_python.json
In [ ]:
pd.read_json('data/cached_python.json')
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
heroes
In [ ]:
heroes['strength'] == 400
In [ ]:
heroes[heroes['strength'] == 400]
In [ ]:
heroes[heroes['strength'] > 400]
In [ ]:
# Demonstrates that Python's chained comparison cannot be used on a Series:
# `200 < s < 400` expands to `(200 < s) and (s < 400)`, and `and` needs a
# single truth value for the whole Series, which pandas refuses with a
# ValueError. Combine the two masks with `&` instead.
try:
    heroes[200 < heroes['strength'] < 400]
except ValueError:
    print("This cool Python syntax ain't work :(")
In [ ]:
heroes[
(heroes['strength'] > 200) &
(heroes['strength'] < 400)
]
In [ ]:
heroes[
(heroes['strength'] <= 200) |
(heroes['strength'] >= 400)
]
In [ ]:
~(heroes['strength'] == 400)
In [ ]:
heroes['strength'] != 400
In [ ]:
heroes[~(
(heroes['strength'] <= 200) |
(heroes['strength'] >= 400)
)]
In [ ]:
heroes[
heroes['hero'].isin(['Batman', 'Robin'])
]
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
heroes.values
In [ ]:
heroes.to_dict()
In [ ]:
heroes.to_dict('records')
In [ ]:
heroes.to_json()
In [ ]:
heroes.to_json(orient='records')
In [ ]:
heroes.to_csv()
In [ ]:
heroes.to_csv(index=False)
In [ ]:
heroes.to_csv('data/heroes.csv', index=False)
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
heroes
In [ ]:
# Add a 'health' column with every row set to NaN (missing).
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed;
# np.nan is available in every NumPy version.
heroes['health'] = np.nan
heroes.head()
In [ ]:
heroes['health'] = 100
heroes.head()
In [ ]:
heroes['height'] = [180, 170, 175, 190, 185]
heroes
In [ ]:
heroes['is_hungry'] = pd.Series([True, False, False, True, True])
heroes
In [ ]:
heroes['strength'] * 2
In [ ]:
heroes['strength'] / heroes['height']
In [ ]:
heroes['strength_per_cm'] = heroes['strength'] / heroes['height']
heroes
In [ ]:
pd.Series([1, 2, 3]).map(lambda x: x**3)
In [ ]:
pd.Series(['Batman', 'Robin']).map(lambda x: x[:2])
In [ ]:
# However, the more idiomatic approach for strings is the .str accessor:
# here it slices the first two characters of every element.
pd.Series(['Batman', 'Robin']).str[:2]
In [ ]:
pd.Series(['Batman', 'Robin']).str.lower()
In [ ]:
pd.Series([
['Batman', 'Robin'],
['Robocop']
]).map(len)
In [ ]:
heroes['code'] = heroes['hero'].map(lambda name: name[:2])
heroes
In [ ]:
# effective_strength is the hero's strength when not hungry and 0 otherwise:
# `not row['is_hungry']` is a bool, and multiplying the strength by it
# either keeps the value or zeroes it. axis=1 makes apply call the lambda
# once per row.
heroes['effective_strength'] = heroes.apply(
lambda row: (not row['is_hungry']) * row['strength'],
axis=1
)
heroes.head()
In [ ]:
# Add 100 to every cell of the 'health' and 'strength' columns by applying
# the lambda element by element, then write the result back in place.
# NOTE(review): recent pandas releases reportedly deprecate
# DataFrame.applymap in favour of DataFrame.map -- confirm against the
# pandas version this notebook is run with.
heroes[['health', 'strength']] = heroes[['health', 'strength']].applymap(
lambda x: x + 100
)
heroes
In [ ]:
heroes['strength'].value_counts()
In [ ]:
heroes.sort_values('strength')
In [ ]:
heroes.sort_values(
['is_hungry', 'code'],
ascending=[False, True]
)
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
heroes
In [ ]:
heroes.describe()
In [ ]:
%matplotlib inline
In [ ]:
pd.Series([1, 2, 3]).plot()
In [ ]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot()
In [ ]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot(kind='bar')
In [ ]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot(
kind='bar',
figsize=(15, 6)
)
In [ ]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot(kind='pie')
In [ ]:
heroes.plot()
In [ ]:
indexed_heroes = heroes.set_index('hero')
indexed_heroes
In [ ]:
indexed_heroes.plot()
In [ ]:
indexed_heroes.plot(kind='barh')
In [ ]:
indexed_heroes.plot(kind='bar', subplots=True, figsize=(15, 15))
In [ ]:
indexed_heroes[['height', 'strength']].plot(kind='bar')
In [ ]:
heroes.plot(x='hero', y=['height', 'strength'], kind='bar')
In [ ]:
# alternative to subplots
heroes.plot(
x='hero',
y=['height', 'strength'],
kind='bar',
secondary_y='strength',
figsize=(10,8)
)
In [ ]:
heroes.plot(
x='hero',
y=['height', 'strength'],
kind='bar',
secondary_y='strength',
title='Super plot of super heroes',
figsize=(10,8)
)
In [ ]:
heroes.hist(figsize=(10, 10))
In [ ]:
heroes.hist(
figsize=(10, 10),
bins=2
)
In [ ]:
heroes.describe()['strength'].plot(kind='bar')
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Per-hero movie data keyed by hero name; the last hero has no speed value.
movie_heroes = pd.DataFrame(data={
    'hero': ['Batman', 'Robin', 'Spiderman', 'Robocop', 'Lex Luthor', 'Dr Octopus'],
    'movie': ['Batman', 'Batman', 'Spiderman', 'Robocop', 'Spiderman', 'Spiderman'],
    'strength': [400, 100, 400, 560, 89, 300],
    'speed': [100, 10, 200, 1, 20, None],
}).set_index('hero')
movie_heroes
In [ ]:
movie_heroes.groupby('movie')
In [ ]:
list(movie_heroes.groupby('movie'))
In [ ]:
movie_heroes.groupby('movie').size()
In [ ]:
movie_heroes.groupby('movie').count()
In [ ]:
movie_heroes.groupby('movie')['speed'].sum()
In [ ]:
movie_heroes.groupby('movie').mean()
In [ ]:
movie_heroes.groupby('movie').apply(
lambda group: group['strength'] / group['strength'].max()
)
In [ ]:
movie_heroes.groupby('movie').agg({
'speed': 'mean',
'strength': 'max',
})
In [ ]:
movie_heroes = movie_heroes.reset_index()
movie_heroes
In [ ]:
movie_heroes.groupby(['movie', 'hero']).mean()
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
movie_heroes
In [ ]:
# Hunger flags for two heroes, one dict per row.
apetite = pd.DataFrame(data=[
    {'hero': 'Spiderman', 'is_hungry': True},
    {'hero': 'Robocop', 'is_hungry': False},
])
apetite
In [ ]:
movie_heroes['is_hungry'] = apetite['is_hungry']
movie_heroes
In [ ]:
apetite.index = [2, 3]
In [ ]:
movie_heroes['is_hungry'] = apetite['is_hungry']
movie_heroes
In [ ]:
indexed_movie_heroes = movie_heroes.set_index('hero')
indexed_movie_heroes
In [ ]:
indexed_apetite = apetite.set_index('hero')
indexed_apetite
In [ ]:
# and alignment works well automagically..
indexed_movie_heroes['is_hungry'] = indexed_apetite['is_hungry']
In [ ]:
indexed_movie_heroes
In [ ]:
movie_heroes
In [ ]:
apetite
In [ ]:
# Outer-join the 'hero'/'speed' columns of movie_heroes with apetite on the
# 'hero' column. pd.merge accepts several other arguments besides
# `on` and `how`.
pd.merge(
movie_heroes[['hero', 'speed']],
apetite,
on=['hero'],
how='outer'
)
In [ ]:
# One row per meal: a timestamp string and the calories eaten.
spiderman_meals = pd.DataFrame([
    {'time': eaten_at, 'calories': kcal}
    for eaten_at, kcal in (
        ('2016-10-15 10:00', 300),
        ('2016-10-15 13:00', 900),
        ('2016-10-15 15:00', 1200),
        ('2016-10-15 21:00', 700),
        ('2016-10-16 07:00', 1600),
        ('2016-10-16 13:00', 600),
        ('2016-10-16 16:00', 900),
        ('2016-10-16 20:00', 500),
        ('2016-10-16 21:00', 300),
        ('2016-10-17 08:00', 900),
    )
])
spiderman_meals
In [ ]:
spiderman_meals.dtypes
In [ ]:
spiderman_meals['time'] = pd.to_datetime(spiderman_meals['time'])
spiderman_meals.dtypes
In [ ]:
spiderman_meals
In [ ]:
spiderman_meals = spiderman_meals.set_index('time')
spiderman_meals
In [ ]:
spiderman_meals.index
In [ ]:
# Select every meal eaten on one day. Row selection by a partial date
# string goes through .loc; plain [] with a single string is a column
# lookup and raises KeyError on pandas >= 2.0.
spiderman_meals.loc["2016-10-15"]
In [ ]:
spiderman_meals["2016-10-16 10:00":]
In [ ]:
spiderman_meals["2016-10-16 10:00":"2016-10-16 20:00"]
In [ ]:
# Select every meal eaten in one month. Row selection by a partial date
# string goes through .loc; plain [] with a single string is a column
# lookup and raises KeyError on pandas >= 2.0.
spiderman_meals.loc["2016-10"]
In [ ]:
spiderman_meals.resample('1D').sum()
In [ ]:
spiderman_meals.resample('1H').mean()
In [ ]:
spiderman_meals.resample('1H').ffill()
In [ ]:
spiderman_meals.resample('1D').first()
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Hero table with gaps: None marks a missing strength or health value.
# zip() pairs the three column lists back into one tuple per hero.
heroes_with_missing = pd.DataFrame(
    list(zip(
        ['Batman', 'Robin', 'Spiderman', 'Robocop', 'Terminator'],
        [None, None, 400, 500, 600],
        [None, 100, 90, 95, None],
    )),
    columns=['hero', 'strength', 'health'],
)
heroes_with_missing
In [ ]:
heroes_with_missing.dropna()
In [ ]:
heroes_with_missing.fillna(0)
In [ ]:
heroes_with_missing.fillna({'strength': 10, 'health': 20})
In [ ]:
heroes_with_missing.fillna(heroes_with_missing.min())
In [ ]:
# Fill each missing value with the median of its column.
# numeric_only=True restricts the median to the numeric columns: the text
# 'hero' column has no median, and pandas >= 2.0 raises instead of silently
# skipping it. 'hero' has no missing values, so the result is unchanged.
heroes_with_missing.fillna(heroes_with_missing.median(numeric_only=True))
In [ ]:
pd.DataFrame({'x': [1, 2], 'y': [10, 20]}).plot(x='x', y='y', kind='scatter')
In [ ]:
from sklearn.linear_model import LinearRegression
X=[ [1], [2] ]
y=[ 10, 20 ]
clf = LinearRegression()
clf.fit(X, y)
In [ ]:
clf.predict([ [0.5], [2], [4] ])
In [ ]:
X = np.array([ [1], [2] ])
y = np.array([ 10, 20 ])
X
In [ ]:
clf = LinearRegression()
clf.fit(X, y)
In [ ]:
clf.predict( np.array([ [0.5], [2], [4] ]) )
In [ ]:
# Two training points, given column by column instead of row by row.
train_df = pd.DataFrame({'x': [1, 2], 'y': [10, 20]}, columns=['x', 'y'])
train_df
In [ ]:
clf = LinearRegression()
clf.fit(train_df[['x']], train_df['y'])
In [ ]:
clf.predict([[0.5]])
In [ ]:
test_df = pd.DataFrame({'x': [0.5, 1.5, 4]})
test_df
In [ ]:
clf.predict(test_df[['x']])
In [ ]:
test_df['y'] = clf.predict(test_df[['x']])
test_df
In [ ]:
# Tag each frame with a plot colour, stack the two frames and draw them in
# one scatter plot (training points blue, predicted points red).
# pd.concat replaces DataFrame.append, which pandas 2.0 removed; like
# append, it keeps each frame's original index labels.
train_df['color'] = 'blue'
test_df['color'] = 'red'
all_df = pd.concat([train_df, test_df])
all_df.plot(x='x', y='y', kind='scatter', figsize=(10, 8), color=all_df['color'])
More models to try: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
In [ ]:
# Solution here
In [ ]:
# Solution here
In [ ]:
# Solution in flask_exercise.py
You can reach us:
http://66.media.tumblr.com/b0e67112232adb68180fe7b988304abd/tumblr_inline_nw063rleWO1tp3b6e_1280.jpg