In [1]:
# Print the version of every library this notebook imports, so that an
# environment mismatch is visible before any other cell runs.
import platform
print('Python: ' + platform.python_version())
import numpy as np
print('numpy: ' + np.__version__)
import pandas as pd
print('pandas: ' + pd.__version__)
import scipy
print('scipy: ' + scipy.__version__)
import sklearn
print('scikit-learn: ' + sklearn.__version__)
# Import the package under its own name: `plt` is the conventional alias of
# matplotlib.pyplot, so binding it to the top-level package is misleading.
import matplotlib
print('matplotlib: ' + matplotlib.__version__)
import flask
# flask.__version__ is deprecated in recent Flask releases; read the installed
# distribution metadata instead and fall back to the attribute on Pythons
# that do not ship importlib.metadata (< 3.8).
try:
    from importlib.metadata import version as _installed_version
    flask_version = _installed_version('flask')
except ImportError:
    flask_version = flask.__version__
print('flask: ' + flask_version)
In [2]:
# Weekly mean of the 'daily_views' column of the cached page-view data.
# In case of no Internet, use the local copy instead (note the lower-case
# file name, matching the file read later in this notebook):
# pd.read_json('data/cached_python.json')
(
    pd.read_json('https://raw.githubusercontent.com/Nozdi/first-steps-with-pandas-workshop/master/data/cached_python.json')
    ['daily_views']  # select the column first so only it gets aggregated
    .resample('1W')
    .mean()
)
Out[2]:
In [3]:
# Fixture for the timing comparison: 999 identical rows holding 1..99,
# once as nested Python lists and once as a DataFrame.
some_data = [list(range(1, 100)) for _ in range(999)]
some_df = pd.DataFrame(data=some_data)
def standard_way(data):
    """Return a new list of rows with every value of ``data`` doubled (plain Python)."""
    doubled_rows = []
    for row in data:
        doubled_rows.append([value * 2 for value in row])
    return doubled_rows
def pandas_way(df):
    """Return ``df`` with every value doubled, using one vectorised multiplication."""
    doubled = df * 2
    return doubled
In [4]:
%timeit standard_way(some_data)
In [5]:
%timeit pandas_way(some_df)
In [6]:
strengths = pd.Series([400, 200, 300, 400, 500])
strengths
Out[6]:
In [7]:
names = pd.Series(["Batman", "Robin", "Spiderman", "Robocop", "Terminator"])
names
Out[7]:
In [8]:
heroes = pd.DataFrame({
'hero': names,
'strength': strengths
})
heroes
Out[8]:
In [9]:
# Build a frame from one dict per row; a key missing from a row
# ("strength" for Conan) ends up as a missing value in that column.
other_heroes = pd.DataFrame([
    {"hero": "Hercules", "strength": 800},
    {"hero": "Conan"},
])
other_heroes
Out[9]:
In [10]:
# Build a frame from one Series per row; the Series index supplies the
# column names.
another_heroes = pd.DataFrame([
    pd.Series(row_values, index=["hero", "strength", "cookies"])
    for row_values in (["Wonder Woman", 10, 3], ["Xena", 20, 0])
])
another_heroes
Out[10]:
In [11]:
another_heroes.columns
Out[11]:
In [12]:
another_heroes.shape
Out[12]:
In [13]:
another_heroes.info()
In [14]:
another_heroes['cookies']
Out[14]:
In [15]:
another_heroes.cookies
Out[15]:
In [16]:
another_heroes[ ['hero', 'cookies'] ]
Out[16]:
In [17]:
another_heroes[['hero', 'cookies']][['cookies']]
Out[17]:
In [18]:
another_heroes[['hero', 'cookies']][['cookies']]['cookies']
Out[18]:
Create the DataFrame presented below in 3 different ways:
movie_title imdb_score
0 Avatar 7.9
1 Pirates of the Caribbean: At World's End 7.1
2 Spectre 6.8
Help: http://pandas.pydata.org/pandas-docs/stable/dsintro.html#from-dict-of-series-or-dicts
In [19]:
# Solution here
titles = pd.Series(["Avatar", "Pirates of the Caribbean: At World's End", "Spectre"])
imdb_scores = pd.Series([7.9, 7.1, 6.8])
pd.DataFrame({'movie_title': titles, 'imdb_score': imdb_scores})
Out[19]:
In [20]:
# Solution here
pd.DataFrame([
dict(movie_title="Avatar", imdb_score=7.9),
dict(movie_title="Pirates of the Caribbean: At World's End", imdb_score=7.1),
dict(movie_title="Spectre", imdb_score=6.8),
])
Out[20]:
In [21]:
# Solution here
pd.DataFrame([
pd.Series(["Avatar", 7.9], index=['movie_title', 'imdb_score']),
pd.Series(["Pirates of the Caribbean: At World's End", 7.1], index=['movie_title', 'imdb_score']),
pd.Series(["Spectre", 6.8], index=['movie_title', 'imdb_score'])
])
Out[21]:
In [22]:
# Uncomment and press tab..
# pd.read_
# SQL, csv, hdf
In [23]:
# pd.read_csv?
In [24]:
# executing bash in jupyter notebook
!head -c 500 data/cached_python.json
In [25]:
pd.read_json('data/cached_python.json')
Out[25]:
In [26]:
# Solution here
movies = pd.read_csv('data/movies.csv')
movies.head()
Out[26]:
In [27]:
# Solution here
print(movies.shape)
print(movies.columns)
In [28]:
heroes
Out[28]:
In [29]:
heroes['strength'] == 400
Out[29]:
In [30]:
heroes[heroes['strength'] == 400]
Out[30]:
In [31]:
heroes[heroes['strength'] > 400]
Out[31]:
In [32]:
try:
heroes[200 < heroes['strength'] < 400]
except ValueError:
print("This cool Python syntax ain't work :(")
In [33]:
# Select heroes with 200 < strength < 400 by combining two boolean masks
# with &. Each comparison needs its own parentheses because & binds more
# tightly than < and >.
heroes[
(heroes['strength'] > 200) &
(heroes['strength'] < 400)
]
Out[33]:
In [34]:
heroes[
(heroes['strength'] <= 200) |
(heroes['strength'] >= 400)
]
Out[34]:
In [35]:
~(heroes['strength'] == 400)
Out[35]:
In [36]:
heroes['strength'] != 400
Out[36]:
In [37]:
heroes[~(
(heroes['strength'] <= 200) |
(heroes['strength'] >= 400)
)]
Out[37]:
In [38]:
heroes[
heroes['hero'].isin(['Batman', 'Robin'])
]
Out[38]:
In [39]:
# Solution here
movies[movies['director_name'] == "Clint Eastwood"]
Out[39]:
In [40]:
# Solution here
movies[movies['gross'] > 500e6]['movie_title']
Out[40]:
In [41]:
# Solution here
movies[movies['language'] == 'Polish']['movie_title']
Out[41]:
In [42]:
# Solution here
movies[
(movies['movie_facebook_likes'] > 100000) &
(movies['imdb_score'] > 8.5)
]['movie_title']
Out[42]:
In [43]:
# Solution here
brutals = ["Jason Statham", "Sylvester Stallone"]
god = "Morgan Freeman"
movies[
(movies['actor_1_name'].isin(brutals)) |
(movies['actor_1_name'] == god)
]['movie_title'].head()
Out[43]:
In [44]:
heroes.values
Out[44]:
In [45]:
heroes.to_dict()
Out[45]:
In [46]:
heroes.to_dict('records')
Out[46]:
In [47]:
heroes.to_json()
Out[47]:
In [48]:
heroes.to_json(orient='records')
Out[48]:
In [49]:
heroes.to_csv()
Out[49]:
In [50]:
heroes.to_csv(index=False)
Out[50]:
In [51]:
heroes.to_csv('data/heroes.csv', index=False)
In [52]:
# Solution here
cols = [
'movie_title',
'actor_1_name',
'actor_2_name',
'actor_3_name',
'budget'
]
movies[movies['budget'] > 200e6][cols].to_csv("data/expensive-cast.csv", index=False)
In [53]:
# Solution here
# Title and Facebook likes of every Christopher Nolan movie, one dict per row.
cols = [
    'movie_title',
    'movie_facebook_likes'
]
# Spell the orient out: the one-letter shorthand 'r' was deprecated and then
# removed from pandas, whereas 'records' works on every version.
movies[movies['director_name'] == 'Christopher Nolan'][cols].to_dict('records')
Out[53]:
In [54]:
heroes
Out[54]:
In [55]:
# Add a 'health' column that holds no values yet.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
heroes['health'] = np.nan
heroes.head()
Out[55]:
In [56]:
heroes['health'] = 100
heroes.head()
Out[56]:
In [57]:
heroes['height'] = [180, 170, 175, 190, 185]
heroes
Out[57]:
In [58]:
heroes['is_hungry'] = pd.Series([True, False, False, True, True])
heroes
Out[58]:
In [59]:
heroes['strength'] * 2
Out[59]:
In [60]:
heroes['strength'] / heroes['height']
Out[60]:
In [61]:
heroes['strength_per_cm'] = heroes['strength'] / heroes['height']
heroes
Out[61]:
In [62]:
pd.Series([1, 2, 3]).map(lambda x: x**3)
Out[62]:
In [63]:
pd.Series(['Batman', 'Robin']).map(lambda x: x[:2])
Out[63]:
In [64]:
# however, more idiomatic approach for strings is to do..
pd.Series(['Batman', 'Robin']).str[:2]
Out[64]:
In [65]:
pd.Series(['Batman', 'Robin']).str.lower()
Out[65]:
In [66]:
pd.Series([
['Batman', 'Robin'],
['Robocop']
]).map(len)
Out[66]:
In [67]:
heroes['code'] = heroes['hero'].map(lambda name: name[:2])
heroes
Out[67]:
In [68]:
# Row-wise apply (axis=1): the lambda is called once per row.
# `not row['is_hungry']` is a bool, so the product is the hero's strength
# when the hero is not hungry and 0 otherwise.
heroes['effective_strength'] = heroes.apply(
lambda row: (not row['is_hungry']) * row['strength'],
axis=1
)
heroes.head()
Out[68]:
In [69]:
# Add 100 to every cell of the two columns using an element-wise function.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is
# deprecated since then), so call whichever this pandas version provides.
# NOTE: the cell updates `heroes` in place; re-running it adds another 100.
boosted = heroes[['health', 'strength']]
elementwise = boosted.map if hasattr(boosted, 'map') else boosted.applymap
heroes[['health', 'strength']] = elementwise(lambda x: x + 100)
heroes
Out[69]:
In [70]:
heroes['strength'].value_counts()
Out[70]:
In [71]:
heroes.sort_values('strength')
Out[71]:
In [72]:
heroes.sort_values(
['is_hungry', 'code'],
ascending=[False, True]
)
Out[72]:
In [73]:
# Solution here
movies['profitability'] = movies['gross'] / movies['budget']
movies.sort_values('profitability', ascending=False).head(10)
Out[73]:
In [74]:
# Solution here
movies['first_genre'] = movies['genres'].str.split('|').str[0]
movies.head()
Out[74]:
In [75]:
heroes
Out[75]:
In [76]:
heroes.describe()
Out[76]:
In [77]:
%matplotlib inline
In [78]:
pd.Series([1, 2, 3]).plot()
Out[78]:
In [79]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot()
Out[79]:
In [80]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot(kind='bar')
Out[80]:
In [81]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot(
kind='bar',
figsize=(15, 6)
)
Out[81]:
In [82]:
pd.Series([1, 2, 3], index=['Batman', 'Robin', 'Rambo']).plot(kind='pie')
Out[82]:
In [83]:
heroes.plot()
Out[83]:
In [84]:
indexed_heroes = heroes.set_index('hero')
indexed_heroes
Out[84]:
In [85]:
indexed_heroes.plot()
Out[85]:
In [86]:
indexed_heroes.plot(kind='barh')
Out[86]:
In [87]:
indexed_heroes.plot(kind='bar', subplots=True, figsize=(15, 15))
Out[87]:
In [88]:
indexed_heroes[['height', 'strength']].plot(kind='bar')
Out[88]:
In [89]:
heroes.plot(x='hero', y=['height', 'strength'], kind='bar')
Out[89]:
In [90]:
# alternative to subplots
heroes.plot(
x='hero',
y=['height', 'strength'],
kind='bar',
secondary_y='strength',
figsize=(10,8)
)
Out[90]:
In [91]:
heroes.plot(
x='hero',
y=['height', 'strength'],
kind='bar',
secondary_y='strength',
title='Super plot of super heroes',
figsize=(10,8)
)
Out[91]:
In [92]:
heroes.hist(figsize=(10, 10))
Out[92]:
In [93]:
heroes.hist(
figsize=(10, 10),
bins=2
)
Out[93]:
In [94]:
heroes.describe()['strength'].plot(kind='bar')
Out[94]:
In [95]:
# Solution here
nolan_movies = movies[movies['director_name'] == 'Christopher Nolan']
nolan_movies = nolan_movies.set_index('movie_title')
nolan_movies['gross'].plot(kind='bar')
Out[95]:
In [96]:
# Solution here
movies['duration'].hist(bins=25)
Out[96]:
In [97]:
# Solution here
movies['first_genre'].value_counts().plot(
kind='pie',
figsize=(15,15)
)
Out[97]:
In [98]:
# Heroes with the movie they belong to, indexed by hero name.
# Dr Octopus has no known speed (None).
movie_heroes = pd.DataFrame(
    {
        'hero': ['Batman', 'Robin', 'Spiderman', 'Robocop', 'Lex Luthor', 'Dr Octopus'],
        'movie': ['Batman', 'Batman', 'Spiderman', 'Robocop', 'Spiderman', 'Spiderman'],
        'strength': [400, 100, 400, 560, 89, 300],
        'speed': [100, 10, 200, 1, 20, None],
    }
).set_index('hero')
movie_heroes
Out[98]:
In [99]:
movie_heroes.groupby('movie')
Out[99]:
In [100]:
list(movie_heroes.groupby('movie'))
Out[100]:
In [101]:
movie_heroes.groupby('movie').size()
Out[101]:
In [102]:
movie_heroes.groupby('movie').count()
Out[102]:
In [103]:
movie_heroes.groupby('movie')['speed'].sum()
Out[103]:
In [104]:
movie_heroes.groupby('movie').mean()
Out[104]:
In [105]:
# For each movie, divide every hero's strength by the largest strength
# found within that same movie.
movie_heroes.groupby('movie').apply(
lambda group: group['strength'] / group['strength'].max()
)
Out[105]:
In [106]:
movie_heroes.groupby('movie').agg({
'speed': 'mean',
'strength': 'max',
})
Out[106]:
In [107]:
movie_heroes = movie_heroes.reset_index()
movie_heroes
Out[107]:
In [108]:
movie_heroes.groupby(['movie', 'hero']).mean()
Out[108]:
In [109]:
# Solution here
movies.groupby('title_year')['gross'].max().tail(10).plot(kind='bar')
Out[109]:
In [110]:
# Solution here
(
movies
.groupby('director_name')
['gross']
.mean()
.sort_values(ascending=False)
.head(3)
)
Out[110]:
In [111]:
movie_heroes
Out[111]:
In [112]:
apetite = pd.DataFrame([
dict(hero='Spiderman', is_hungry=True),
dict(hero='Robocop', is_hungry=False)
])
apetite
Out[112]:
In [113]:
movie_heroes['is_hungry'] = apetite['is_hungry']
movie_heroes
Out[113]:
In [114]:
apetite.index = [2, 3]
In [115]:
movie_heroes['is_hungry'] = apetite['is_hungry']
movie_heroes
Out[115]:
In [116]:
indexed_movie_heroes = movie_heroes.set_index('hero')
indexed_movie_heroes
Out[116]:
In [117]:
indexed_apetite = apetite.set_index('hero')
indexed_apetite
Out[117]:
In [118]:
# and alignment works well automagically..
indexed_movie_heroes['is_hungry'] = indexed_apetite['is_hungry']
In [119]:
indexed_movie_heroes
Out[119]:
In [120]:
movie_heroes
Out[120]:
In [121]:
apetite
Out[121]:
In [122]:
# couple of other arguments available here
# Join the hero/speed columns with `apetite` on the 'hero' column.
# how='outer' keeps heroes that appear in either frame.
pd.merge(
movie_heroes[['hero', 'speed']],
apetite,
on=['hero'],
how='outer'
)
Out[122]:
In [123]:
# Meal log: one row per meal, with the time still stored as plain text.
spiderman_meals = pd.DataFrame([
    {'time': eaten_at, 'calories': kcal}
    for eaten_at, kcal in [
        ('2016-10-15 10:00', 300),
        ('2016-10-15 13:00', 900),
        ('2016-10-15 15:00', 1200),
        ('2016-10-15 21:00', 700),
        ('2016-10-16 07:00', 1600),
        ('2016-10-16 13:00', 600),
        ('2016-10-16 16:00', 900),
        ('2016-10-16 20:00', 500),
        ('2016-10-16 21:00', 300),
        ('2016-10-17 08:00', 900),
    ]
])
spiderman_meals
Out[123]:
In [124]:
spiderman_meals.dtypes
Out[124]:
In [125]:
spiderman_meals['time'] = pd.to_datetime(spiderman_meals['time'])
spiderman_meals.dtypes
Out[125]:
In [126]:
spiderman_meals
Out[126]:
In [127]:
spiderman_meals = spiderman_meals.set_index('time')
spiderman_meals
Out[127]:
In [128]:
spiderman_meals.index
Out[128]:
In [129]:
# All meals eaten on 2016-10-15. Selecting rows by a date string must go
# through .loc: plain [] with a single string is a column lookup and raises
# KeyError on pandas >= 2.0.
spiderman_meals.loc["2016-10-15"]
Out[129]:
In [130]:
spiderman_meals["2016-10-16 10:00":]
Out[130]:
In [131]:
spiderman_meals["2016-10-16 10:00":"2016-10-16 20:00"]
Out[131]:
In [132]:
# All meals eaten in October 2016. Selecting rows by a partial date string
# must go through .loc: plain [] with a single string is a column lookup and
# raises KeyError on pandas >= 2.0.
spiderman_meals.loc["2016-10"]
Out[132]:
In [133]:
spiderman_meals.resample('1D').sum()
Out[133]:
In [134]:
spiderman_meals.resample('1H').mean()
Out[134]:
In [135]:
spiderman_meals.resample('1H').ffill()
Out[135]:
In [136]:
spiderman_meals.resample('1D').first()
Out[136]:
In [137]:
# Solution here
# Load the tweets, parsing 'created_at' as dates and using it as the index.
force_awakens_tweets = pd.read_csv(
'data/theforceawakens_tweets.csv',
parse_dates=['created_at'],
index_col='created_at'
)
force_awakens_tweets.head()
Out[137]:
In [138]:
# Solution here
force_awakens_tweets.resample('1D').count()
Out[138]:
In [139]:
# Solution here
(
force_awakens_tweets
.resample('4H')
.count()
.plot(figsize=(15, 5))
)
(
force_awakens_tweets["2016-09-29":]
.resample('1H')
.count()
.plot(figsize=(15, 5))
)
Out[139]:
In [140]:
# Sample frame with gaps: None marks an unknown strength or health value.
heroes_with_missing = pd.DataFrame(
    columns=['hero', 'strength', 'health'],
    data=[
        ('Batman', None, None),
        ('Robin', None, 100),
        ('Spiderman', 400, 90),
        ('Robocop', 500, 95),
        ('Terminator', 600, None),
    ],
)
heroes_with_missing
Out[140]:
In [141]:
heroes_with_missing.dropna()
Out[141]:
In [142]:
heroes_with_missing.fillna(0)
Out[142]:
In [143]:
heroes_with_missing.fillna({'strength': 10, 'health': 20})
Out[143]:
In [144]:
heroes_with_missing.fillna(heroes_with_missing.min())
Out[144]:
In [145]:
# Fill the gaps with each numeric column's median. numeric_only=True skips
# the text 'hero' column, for which a median is undefined (pandas >= 2.0
# raises TypeError instead of silently dropping such columns).
heroes_with_missing.fillna(heroes_with_missing.median(numeric_only=True))
Out[145]:
In [146]:
pd.DataFrame({'x': [1, 2], 'y': [10, 20]}).plot(x='x', y='y', kind='scatter')
Out[146]:
In [147]:
from sklearn.linear_model import LinearRegression
X=[ [1], [2] ]
y=[ 10, 20 ]
clf = LinearRegression()
clf.fit(X, y)
Out[147]:
In [148]:
clf.predict([ [0.5], [2], [4] ])
Out[148]:
In [149]:
X = np.array([ [1], [2] ])
y = np.array([ 10, 20 ])
X
Out[149]:
In [150]:
clf = LinearRegression()
clf.fit(X, y)
Out[150]:
In [151]:
clf.predict( np.array([ [0.5], [2], [4] ]) )
Out[151]:
In [152]:
train_df = pd.DataFrame([
(1, 10),
(2, 20),
], columns=['x', 'y'])
train_df
Out[152]:
In [153]:
clf = LinearRegression()
clf.fit(train_df[['x']], train_df['y'])
Out[153]:
In [154]:
clf.predict([[0.5]])
Out[154]:
In [155]:
test_df = pd.DataFrame({'x': [0.5, 1.5, 4]})
test_df
Out[155]:
In [156]:
clf.predict(test_df[['x']])
Out[156]:
In [157]:
test_df['y'] = clf.predict(test_df[['x']])
test_df
Out[157]:
In [158]:
# Plot training points (blue) and predicted points (red) on one scatter plot.
train_df['color'] = 'blue'
test_df['color'] = 'red'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and likewise keeps each frame's original index labels.
all_df = pd.concat([train_df, test_df])
all_df.plot(x='x', y='y', kind='scatter', figsize=(10, 8), color=all_df['color'])
Out[158]:
More models to try: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
In [159]:
# Solution here
# Fit a linear model of 'gross' on the number of votes and the IMDb score,
# then predict the gross for a few hand-picked feature combinations.
from sklearn.linear_model import LinearRegression
FEATURES = ['num_voted_users', 'imdb_score']
TARGET = 'gross'
# Keep only the rows where both features and the target are present.
movies_with_data = movies[FEATURES + [TARGET]].dropna()
X = movies_with_data[FEATURES].values
y = movies_with_data[TARGET].values
clf = LinearRegression()
clf.fit(X, y)
# Each row is [num_voted_users, imdb_score], in the same order as FEATURES.
clf.predict([
[800000, 8.0],
[400000, 8.0],
[400000, 4.0],
[ 40000, 8.0],
])
Out[159]:
In [160]:
# Solution here
def discover_similar_plot(target_keywords, threshold=0.5):
    """Return the movies whose plot keywords overlap with ``target_keywords``.

    Reads the notebook-level ``movies`` DataFrame; rows without a
    ``plot_keywords`` value are ignored.

    Parameters
    ----------
    target_keywords : iterable of str
        Keywords to look for. Duplicates are counted once.
    threshold : float, default 0.5
        Minimal fraction of the distinct target keywords that a movie's
        keywords must contain for the movie to be returned.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with two helper columns added:
        ``plot_keywords_set`` (the movie's keywords as a set) and
        ``match_count`` (how many target keywords it shares).
    """
    # Normalise once: a set removes duplicates (which would otherwise inflate
    # the required match count) and accepts any iterable, not only lists.
    wanted = set(target_keywords)
    required_matches = threshold * len(wanted)

    # Work on a copy so the helper columns never leak into `movies`.
    movies_with_plot = movies.dropna(subset=['plot_keywords']).copy()
    # 'plot_keywords' is a '|'-separated string; turn it into a set per movie.
    movies_with_plot['plot_keywords_set'] = (
        movies_with_plot['plot_keywords'].str.split('|').map(set)
    )
    movies_with_plot['match_count'] = movies_with_plot['plot_keywords_set'].map(
        lambda keywords: len(keywords & wanted)
    )
    return movies_with_plot[movies_with_plot['match_count'] >= required_matches]
discover_similar_plot(['magic', 'harry', 'wizard'])['movie_title']
Out[160]:
In [161]:
# Solution in flask_exercise.py
You can reach us:
http://66.media.tumblr.com/b0e67112232adb68180fe7b988304abd/tumblr_inline_nw063rleWO1tp3b6e_1280.jpg