In [1]:
from IPython.display import Image
Image('ML_Workflow.PNG')
Out[1]:
In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
In [3]:
pole = [1,2,3]
pole * 3
Out[3]:
In [4]:
np_pole = np.array([1,2,3])
np_pole * 3
Out[4]:
In [12]:
x = np.arange(20).reshape(4, 5) # try several dimensions
x
Out[12]:
In [13]:
x.shape
Out[13]:
In [14]:
x.ndim
Out[14]:
In [17]:
x.sum(axis=0)
Out[17]:
In [18]:
x.dtype
Out[18]:
In [19]:
a = np.array([.1,.2])
print(a)
a.dtype
Out[19]:
In [20]:
c = np.array( [ [1,2], [3,4] ], dtype=complex )
print(c)
c.dtype
Out[20]:
In [21]:
np.zeros((3,4))
Out[21]:
In [22]:
np.ones((2,5))
Out[22]:
In [23]:
np.repeat(3, 10).reshape([2,5])
Out[23]:
In [24]:
np.linspace(0, 2, 9)
Out[24]:
In [25]:
x = np.linspace( 0, 2*np.pi, 100 )
f = np.sin(x)
In [26]:
plt.plot(f)
Out[26]:
In [27]:
A = np.array( [[1,1], [0,1]] )
B = np.array( [[2,0], [3,4]] )
In [28]:
A
Out[28]:
In [29]:
B
Out[29]:
In [30]:
np.transpose(B)
Out[30]:
In [31]:
A*B
Out[31]:
In [32]:
A.dot(B) # np.dot(A, B)
Out[32]:
In [33]:
a = np.arange(10)**3
a
Out[33]:
In [34]:
a[2]
Out[34]:
In [35]:
a[2:5]
Out[35]:
In [36]:
a[2:6:2]
Out[36]:
In [37]:
a[:6:2] = -1000
a
Out[37]:
In [38]:
a[::-1]
Out[38]:
In [39]:
b = np.arange(20).reshape(4,5)
b
Out[39]:
In [40]:
b[2,3]
Out[40]:
In [41]:
b[2,]
Out[41]:
In [42]:
b[1:3,2:4]
Out[42]:
In [43]:
b[:,2:4]
Out[43]:
Have a look at the other operations on your own.
Pandas uses NumPy arrays and builds the Series and DataFrame types on top of them
In [44]:
s = pd.Series([0,1,2,3,4])
s
Out[44]:
In [45]:
# an explicit index is added on top of the NumPy array
s.index
Out[45]:
In [46]:
s.values
Out[46]:
In [47]:
s[0]
Out[47]:
In [48]:
# unlike NumPy, the index can also be something other than a number
s2 = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])
s2
Out[48]:
In [49]:
s2['c']
Out[49]:
In [50]:
s2[2]
Out[50]:
In [51]:
s2.c
Out[51]:
In [52]:
# a dictionary (associative array) can also be used to create a Series object
population = pd.Series({'Germany': 81.3, 'Belgium': 11.3, 'France': 64.3, 'United Kingdom': 64.9, 'Netherlands': 16.9})
population
Out[52]:
In [53]:
population['France']
Out[53]:
In [54]:
# since it is built on NumPy, we can do all the interesting operations
population * 1000
Out[54]:
In [55]:
# the index has an implicit ordering, so we can take a range
population['Belgium':'Netherlands']
Out[55]:
In [56]:
population.mean()
Out[56]:
Elements can be accessed the way we are used to from R
In [57]:
population[['France', 'Netherlands']]
Out[57]:
In [ ]:
population[population > 20]
And a DataFrame is essentially a multi-dimensional Series
In [67]:
data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
'population': [11.3, 64.3, 81.3, 16.9, 64.9],
'area': [30510, 671308, 357050, 41526, 244820],
'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}
countries = pd.DataFrame(data)
countries
Out[67]:
In [68]:
countries.index
Out[68]:
In [69]:
countries.columns
Out[69]:
In [70]:
countries.values
Out[70]:
In [71]:
countries.dtypes
Out[71]:
In [72]:
countries.info()
In [73]:
countries.describe()
Out[73]:
In [74]:
countries = countries.set_index('country')
countries
Out[74]:
and now we can access individual columns very easily
In [75]:
countries.area # countries['area']
Out[75]:
In [76]:
countries['population']*1000000 / countries['area'] # population density
Out[76]:
In [77]:
# we can easily create a new column
countries['density'] = countries['population']*1000000 / countries['area']
countries
Out[77]:
In [78]:
# and, for example, select rows based on it
countries[countries['density'] > 300]
Out[78]:
In [79]:
# then we can, for example, sort by it
countries.sort_values(by='density', ascending=False)
Out[79]:
In [82]:
# a very powerful feature is straightforward plotting
# countries.density.plot()
# countries.density.plot(kind='bar')
countries.plot()
Out[82]:
In [83]:
countries.plot(kind='scatter', x='population', y='area')
Out[83]:
Since a DataFrame adds the ability to select columns by name, selecting elements is a bit more complicated than in NumPy. We have to distinguish:
In [84]:
countries['area']
Out[84]:
In [85]:
countries[['area', 'density']]
Out[85]:
In [86]:
# but when we use a range, it selects rows
countries['France':'Netherlands']
Out[86]:
For more advanced selection from the table we use:
loc and iloc
In [87]:
# access a specific cell by row and column
countries.loc['Germany', 'area']
Out[87]:
In [88]:
# ranges can be used on both dimensions here
countries.loc['France':'Germany', :]
Out[88]:
In [89]:
# but also an explicit enumeration
countries.loc[countries['density']>300, ['capital', 'population']]
Out[89]:
In [ ]:
# iloc selects by position. This is similar to element access in NumPy
countries.iloc[0:2,1:3]
In [90]:
# of course, values can still be assigned
countries.loc['Belgium':'Germany', 'population'] = 10
countries
Out[90]:
In [91]:
df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b'], 'C':range(4)})
# df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b'], 'C':range(4), 'D':range(4)})
df
Out[91]:
unstack takes the values of a column and turns them into column names
this often comes in handy when the data are in a slightly different form than we need
In [92]:
Image("img/stack.png")
Out[92]:
In [93]:
df = df.set_index(['A', 'B']) # first we pick the column we will use as the index.
# The second one will supply the values for the new column names
df
Out[93]:
In [94]:
# now we say which column holds the values and let it reshape
result = df['C'].unstack()
result
Out[94]:
In [95]:
# the opposite transformation is stack: it takes the column names and turns them into values
df = result.stack().reset_index(name='C')
df
Out[95]:
In [96]:
# pivot is very similar to unstack, but lets you set the column names and there can be more of them
df = pd.DataFrame({'A':['one', 'one', 'two', 'two'], 'B':['a', 'b', 'a', 'b'], 'C':range(4)})
df
Out[96]:
In [97]:
df.pivot(index='A', columns='B', values='C')
Out[97]:
In [98]:
# pivot_table is similar to pivot, but can handle duplicate entries (repeated index/column combinations) and lets you define an aggregation function
df = pd.DataFrame({'A':['one', 'one', 'two', 'two', 'one', 'two'], 'B':['a', 'b', 'a', 'b', 'a', 'b'], 'C':range(6)})
df
Out[98]:
In [99]:
df.pivot_table(index='A', columns='B', values='C', aggfunc=np.sum) # aggfunc defaults to np.mean
Out[99]:
In [100]:
data = pd.read_csv("data/BETR8010000800100hour.1-1-1990.31-12-2012", sep='\t')
data.head()
# The data consist of measurements of some quantity in individual hours of the day.
# One row per day. Each hour has its own column, plus there is a flag column we are not interested in.
# There are some strange values that probably should not be there: -999 and -9999.
# The date will probably be the index.
# There is no header in the file.
Out[100]:
In [101]:
filename = "data/BETR8010000800100hour.1-1-1990.31-12-2012"
data = pd.read_csv(filename, sep='\t', header=None,
na_values=[-999, -9999], index_col=0)
# a lot of data cleaning can be done already at load time
data.head()
Out[101]:
In [102]:
# let's drop the flags we are not interested in; coincidentally, it is every other column
data = data.drop(data.columns[1::2], axis=1)
data.head()
Out[102]:
In [103]:
["{:02d}".format(i) for i in range(len(data.columns))]
Out[103]:
In [104]:
# the column names are somewhat messed up
data.columns = ["{:02d}".format(i) for i in range(len(data.columns))]
data.head()
Out[104]:
In [105]:
data = data.stack()
data.head()
Out[105]:
In [106]:
type(data) # the result of the reshaping is a Series with a multi-level index, not a DataFrame. I want a nice DataFrame, so we will do something about that
Out[106]:
In [107]:
# we could give the column a proper name
import os
_, fname = os.path.split(filename)
station = fname[:7]
print(filename)
print(station)
In [108]:
data = data.reset_index(name=station) # reset_index turns this into a DataFrame
# data = data.reset_index() # reset_index turns this into a DataFrame
print(type(data))
data.head()
Out[108]:
In [109]:
data = data.rename(columns = {0:'date', 'level_1':'hour'})
data.head()
Out[109]:
In [110]:
# now we build a new index from the date and the hour
data.index = pd.to_datetime(data['date'] + ' ' + data['hour'])
data.head()
Out[110]:
In [111]:
# and delete the columns we no longer need
data = data.drop(['date', 'hour'], axis=1)
data.head()
# Now we have data we can actually work with
Out[111]:
I have several of these files; each contains data from a different measuring station. To simplify the presentation, I put the previous code into a loop and moved it into a script
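The airbase script itself is not shown here. A minimal sketch of what such a loading function could look like, reusing the cleaning steps above (the file pattern and the 7-character station prefix are assumptions based on the single-file example, not the actual script):
import glob
import os
import pandas as pd

def load_data(pattern="data/*hour*"):  # hypothetical file pattern
    frames = []
    for filename in glob.glob(pattern):
        station = os.path.split(filename)[1][:7]  # station code from the file name
        data = pd.read_csv(filename, sep='\t', header=None,
                           na_values=[-999, -9999], index_col=0)
        data = data.drop(data.columns[1::2], axis=1)  # drop the flag columns
        data.columns = ["{:02d}".format(i) for i in range(len(data.columns))]
        data = data.stack().reset_index(name=station)
        data = data.rename(columns={0: 'date', 'level_1': 'hour'})
        data.index = pd.to_datetime(data['date'] + ' ' + data['hour'])
        frames.append(data.drop(['date', 'hour'], axis=1))
    return pd.concat(frames, axis=1)  # align the stations on the datetime index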
In [112]:
import airbase
no2 = airbase.load_data()
In [113]:
no2.head(3)
Out[113]:
In [114]:
no2.tail()
Out[114]:
In [115]:
no2.info()
In [116]:
no2.describe()
Out[116]:
In [117]:
no2.plot(kind='box')
Out[117]:
In [118]:
no2['BETN029'].plot(kind='hist', bins=50)
Out[118]:
In [119]:
import seaborn
In [120]:
seaborn.violinplot(data=no2)
Out[120]:
In [121]:
no2.plot(figsize=(12,6))
# I can plot the raw data, but it is questionable what that tells me
Out[121]:
In [122]:
# I can take only a smaller part
no2[-500:].plot(figsize=(12,6))
Out[122]:
or I can use more interesting time-series operations
In [123]:
no2.index # since the index consists of timestamps, I can do interesting things with it
Out[123]:
In [124]:
no2["2010-01-01 09:00": "2010-01-01 12:00"] # napriklad definovat rozsahy pomocou stringu s datumom
Out[124]:
In [125]:
no2['2012'] # or select all data from one specific year like this
# no2['2012'].head()
# no2['2012/03'] # or only the data from March
Out[125]:
In [126]:
# the date components are accessible from the index
# no2.index.hour
no2.index.year
Out[126]:
In [127]:
# and, more interestingly, I can change the sampling frequency
no2.resample('D').mean().head()
Out[127]:
In [128]:
no2.resample('M').mean().plot()
# this seems to tell us a bit more already, e.g. that there is probably some seasonality
Out[128]:
In [1]:
no2.resample('A').mean().plot()
# and maybe also some long-term trend
In [ ]:
no2['2012-3':'2012-4'].resample('D').mean().plot()
# maybe there is also some weekly seasonality
In [ ]:
# I can also use several aggregation functions and compare them
no2.loc['2009':, 'FR04037'].resample('M').agg(['mean', 'median']).plot()
# no2.loc['2009':, 'FR04037'].resample('M').agg(['mean', 'std']).plot()
In [ ]:
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]})
df
In [ ]:
df.groupby('key').aggregate('sum') # df.groupby('key').sum()
In [ ]:
no2['month'] = no2.index.month
no2.head()
In [ ]:
no2.groupby('month').mean()
In [ ]:
no2.groupby('month').mean().plot()
In [ ]:
iris_data = pd.read_csv('data/iris-data.csv')
iris_data.head()
# this is a slightly corrupted version of the flower dataset
In [ ]:
iris_data.info()
In [ ]:
iris_data.describe()
In [ ]:
seaborn.pairplot(iris_data.dropna(), hue='class')
In [ ]:
iris_data.loc[iris_data['class'] == 'versicolor', 'class'] = 'Iris-versicolor'
iris_data.loc[iris_data['class'] == 'Iris-setossa', 'class'] = 'Iris-setosa'
iris_data['class'].unique()
In [ ]:
seaborn.pairplot(iris_data.dropna(), hue='class')
In [ ]:
iris_data.loc[iris_data['class'] == 'Iris-versicolor', 'sepal_length_cm'].hist()
In [ ]:
plt.rc("lines", markeredgewidth=0.5)
iris_data.loc[iris_data['class'] == 'Iris-versicolor', 'sepal_length_cm'].plot(kind='box')
In [ ]:
iris_data.loc[(iris_data['class'] == 'Iris-versicolor') & (iris_data['sepal_length_cm'] < 1 ), 'sepal_length_cm']
In [ ]:
iris_data.loc[(iris_data['class'] == 'Iris-versicolor') & (iris_data['sepal_length_cm'] > 1 ), 'sepal_length_cm']
In [ ]:
mask = (iris_data['class'] == 'Iris-versicolor') & (iris_data['sepal_length_cm'] < 1 )
iris_data.loc[mask, 'sepal_length_cm'] = iris_data.loc[mask, 'sepal_length_cm'] * 100
In [ ]:
iris_data.loc[mask, 'sepal_length_cm']
In [ ]:
seaborn.pairplot(iris_data.dropna(), hue='class')
In [ ]:
iris_data.loc[(iris_data['sepal_length_cm'].isnull()) |
(iris_data['sepal_width_cm'].isnull()) |
(iris_data['petal_length_cm'].isnull()) |
(iris_data['petal_width_cm'].isnull())]
In [ ]:
iris_data.loc[iris_data['class'] == 'Iris-setosa', 'petal_width_cm'].hist()
In [ ]:
average_petal_width = iris_data.loc[iris_data['class'] == 'Iris-setosa', 'petal_width_cm'].mean()
iris_data.loc[(iris_data['class'] == 'Iris-setosa') &
(iris_data['petal_width_cm'].isnull()),
'petal_width_cm'] = average_petal_width
In [ ]:
seaborn.pairplot(iris_data, hue='class')
In [ ]:
from pandasql import sqldf
In [ ]:
from pandasql import load_meat, load_births
meat = load_meat()
births = load_births()
In [ ]:
type(meat)
In [ ]:
meat.head()
In [ ]:
births.head()
In [ ]:
data = {'meat': meat}
In [ ]:
sqldf('select * from meat limit 10', data)
In [ ]:
data2 = {'meat2': meat}
In [ ]:
sqldf('select * from meat2 limit 10', data2)
In [ ]:
sqldf('select * from meat limit 10', locals())
In [ ]:
sqldf('select * from births limit 10', locals())
In [ ]:
q = """
SELECT
m.date
, b.births
, m.beef
FROM
meat m
INNER JOIN
births b
on m.date = b.date
ORDER BY
m.date
LIMIT 100;
"""
joined = sqldf(q, locals())
print(joined.head())
Pandasql runs on SQLite3, so all the classic SQL operations work here as well: conditions, nested queries, joins, unions, functions, ...
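For example, a condition with a nested query and an aggregate function works out of the box (this particular query is only an illustration, not part of the original notebook):
q = """
SELECT date, beef
FROM meat
WHERE beef > (SELECT AVG(beef) FROM meat)  -- nested query + aggregate function
LIMIT 10;
"""
sqldf(q, locals())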
In [ ]:
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')
df.head()
In [ ]:
# rename selected columns
df = df.rename(columns={'P': 'points',
'GP': 'games',
'SOT': 'shots_on_target',
'G': 'goals',
'PPG': 'points_per_game',
'A': 'assists',})
df.head()
In [ ]:
df['SALARY'] = df['SALARY'].apply(lambda x: x.strip('$m'))
df.head()
In [ ]:
df['team'] = pd.Series('', index=df.index)
df['position'] = pd.Series('', index=df.index)
df.head()
In [ ]:
def process_player_col(text):
name, rest = text.split('\n')
position, team = [x.strip() for x in rest.split(' — ')]
return pd.Series([name, team, position])
df[['PLAYER', 'team', 'position']] = df.PLAYER.apply(process_player_col)
df.head()
In [ ]:
df.shape[0] - df.dropna().shape[0]
In [ ]:
df[df['assists'].isnull()]
In [ ]:
df[df['assists'].notnull()]
# df[~df['assists'].isnull()]
In [ ]:
# previously we did this manually:
# iris_data.loc[(iris_data['class'] == 'Iris-setosa') & (iris_data['petal_width_cm'].isnull()), 'petal_width_cm'] = average_petal_width
# A nice function like this can be used instead
df.fillna(value=0, inplace=True)
df
In [ ]:
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')
df = df.rename(columns={'P': 'points',
'GP': 'games',
'SOT': 'shots_on_target',
'G': 'goals',
'PPG': 'points_per_game',
'A': 'assists',})
df['SALARY'] = df['SALARY'].apply(lambda x: x.strip('$m'))
df[['PLAYER', 'team', 'position']] = df.PLAYER.apply(process_player_col)
df.head()
In [ ]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
df[['games', 'assists']] = imp.fit_transform(df[['games', 'assists']].values)
df.head()
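Note: Imputer was removed in scikit-learn 0.22; on newer versions the rough equivalent (a sketch with the same strategy, not part of the original notebook) is sklearn.impute.SimpleImputer:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # column-wise mean imputation
df[['games', 'assists']] = imp.fit_transform(df[['games', 'assists']].values)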
Beware, this imputation does not take the class into account
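If the class should be respected, a minimal sketch using groupby + transform fills each group with its own mean (here grouping by the position column of the soccer data; this is a standard pandas idiom, not something done in the original notebook):
cols = ['games', 'assists']
df[cols] = (df.groupby('position')[cols]
              .transform(lambda s: s.fillna(s.mean())))  # per-group mean imputation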
In [ ]:
df.games.mean()
In [ ]:
df[df.position == 'Forward'].games.mean()
In [ ]:
df[ (df['team'] == 'Arsenal') | (df['team'] == 'Chelsea') ]
In [ ]:
df[ (df['team'] == 'Arsenal') & (df['position'] == 'Forward') ]