Chapter_05_Part_02



In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [10]:
import pandas_datareader as pdr

In [13]:
# Fetch data for each ticker with pdr.get_data_yahoo and collect the results
# in a dict keyed by ticker symbol.
# NOTE(review): in the recorded run below the very first request (IBM) raised
# RemoteDataError ("Unable to read URL: http://ichart.finance.yahoo.com/..."),
# so the loop aborted before anything was stored in all_data.
# TODO: point this at a data source that still responds, or drop the cell —
# no later cell in the visible part of the notebook reads all_data.
all_data = {}
for ticker in ['IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = pdr.get_data_yahoo(ticker)
    
all_data


---------------------------------------------------------------------------
RemoteDataError                           Traceback (most recent call last)
<ipython-input-13-a9c6d32b2574> in <module>()
      1 all_data = {}
      2 for ticker in ['IBM', 'MSFT', 'GOOG']:
----> 3     all_data[ticker] = pdr.get_data_yahoo(ticker)
      4 
      5 all_data

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas_datareader/data.py in get_data_yahoo(*args, **kwargs)
     38 
     39 def get_data_yahoo(*args, **kwargs):
---> 40     return YahooDailyReader(*args, **kwargs).read()
     41 
     42 

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas_datareader/yahoo/daily.py in read(self)
     75     def read(self):
     76         """ read one data from specified URL """
---> 77         df = super(YahooDailyReader, self).read()
     78         if self.ret_index:
     79             df['Ret_Index'] = _calc_return_index(df['Adj Close'])

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas_datareader/base.py in read(self)
    155         if isinstance(self.symbols, (compat.string_types, int)):
    156             df = self._read_one_data(self.url,
--> 157                                      params=self._get_params(self.symbols))
    158         # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
    159         elif isinstance(self.symbols, DataFrame):

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas_datareader/base.py in _read_one_data(self, url, params)
     72         """ read one data from specified URL """
     73         if self._format == 'string':
---> 74             out = self._read_url_as_StringIO(url, params=params)
     75         elif self._format == 'json':
     76             out = self._get_response(url, params=params).json()

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas_datareader/base.py in _read_url_as_StringIO(self, url, params)
     83         Open url (and retry)
     84         """
---> 85         response = self._get_response(url, params=params)
     86         text = self._sanitize_response(response)
     87         out = StringIO()

/Users/alexkirnas/anaconda/lib/python3.6/site-packages/pandas_datareader/base.py in _get_response(self, url, params)
    118         if params is not None and len(params) > 0:
    119             url = url + "?" + urlencode(params)
--> 120         raise RemoteDataError('Unable to read URL: {0}'.format(url))
    121 
    122     def _read_lines(self, out):

RemoteDataError: Unable to read URL: http://ichart.finance.yahoo.com/table.csv?s=IBM&a=0&b=1&c=2010&d=5&e=10&f=2017&g=d&ignore=.csv

In [14]:
# Nine single-character labels with repeats, spelled out element by element.
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [15]:
# Distinct values of obj, listed in order of first appearance (see output).
obj.unique()


Out[15]:
array(['c', 'a', 'd', 'b'], dtype=object)

In [16]:
# How many times each distinct value occurs in obj.
obj.value_counts()


Out[16]:
a    3
c    3
b    2
d    1
dtype: int64

In [17]:
# Membership test: True wherever the element of obj is 'b' or 'c'.
mask = obj.isin(list('bc'))
mask


Out[17]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [18]:
# Boolean indexing: keep only the entries of obj flagged True in mask.
obj[mask]


Out[18]:
0    c
5    b
6    b
7    c
8    c
dtype: object

In [19]:
# Three integer columns (Qu1..Qu3) with five rows each.
df = DataFrame(dict(
    Qu1=[1, 3, 4, 3, 4],
    Qu2=[2, 3, 1, 2, 3],
    Qu3=[1, 5, 2, 4, 4],
))
df


Out[19]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4

In [20]:
# Count how often each value occurs in every column of df. A value that never
# appears in a given column comes back as NaN, which fillna(0) turns into 0.
# Series.value_counts is used instead of the top-level pd.value_counts
# function, which recent pandas releases deprecate; the per-column result is
# the same.
result = df.apply(Series.value_counts).fillna(0)
result


Out[20]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

In [21]:
# Four-element Series of strings in which position 2 is missing.
string_data = Series([
    'aardvark',
    'artichoke',
    np.nan,  # the missing entry
    'avocado',
])
string_data


Out[21]:
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [22]:
# Flag missing entries: True where string_data holds NaN.
string_data.isnull()


Out[22]:
0    False
1    False
2     True
3    False
dtype: bool

In [23]:
# Float Series with gaps at positions 1 and 3; dropna() yields the three
# remaining values with their original index labels.
data = Series([1, float('nan'), 3.5, float('nan'), 7])
data.dropna()


Out[23]:
0    1.0
2    3.5
4    7.0
dtype: float64

In [24]:
# Boolean indexing with the not-null mask; gives the same result as dropna().
data[data.notnull()]


Out[24]:
0    1.0
2    3.5
4    7.0
dtype: float64

In [27]:
# 4x3 float frame: row 0 is complete, row 2 is entirely NaN, and rows 1 and 3
# are partially NaN.
data = DataFrame(np.array([
    [1.0, 6.5, 3.0],
    [1.0, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.0],
]))
# With default arguments dropna() discards every row containing any NaN.
cleaned = data.dropna()
cleaned


Out[27]:
0 1 2
0 1.0 6.5 3.0

In [28]:
# data still has all four rows: dropna() above returned a new object.
data


Out[28]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0

In [29]:
# how='all' drops only rows in which every value is NaN (row 2 here).
data.dropna(how = 'all')


Out[29]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0

In [30]:
# Add a new column labelled 4 that is NaN in every row.
data[4] = np.nan
data


Out[30]:
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN

In [33]:
# axis=1 with how='all' drops columns that are entirely NaN (column 4 here).
data.dropna(axis = 1, how = 'all')


Out[33]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0

In [35]:
# 7x3 frame of standard-normal draws.
# NOTE(review): no seed is set in the visible cells, so the values change on
# every run.
df = DataFrame(np.random.standard_normal((7, 3)))
df


Out[35]:
0 1 2
0 -0.926146 -1.271959 0.777246
1 0.669697 0.726587 1.277278
2 -0.656911 0.292782 0.154870
3 -0.987786 -1.095611 1.803881
4 -1.218587 -0.315563 0.012396
5 0.511702 -0.416387 -0.238253
6 1.055907 0.010851 -0.737679

In [37]:
# Blank out rows 0-4 of column 1 and rows 0-2 of column 2.
# .loc replaces the .ix indexer, which current pandas no longer provides. With
# this integer index the slices are label-based and include the end label,
# which is how .ix resolved them here (see the recorded output).
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan
df


Out[37]:
0 1 2
0 -0.926146 NaN NaN
1 0.669697 NaN NaN
2 -0.656911 NaN NaN
3 -0.987786 NaN 1.803881
4 -1.218587 NaN 0.012396
5 0.511702 -0.416387 -0.238253
6 1.055907 0.010851 -0.737679

In [41]:
# thresh=3: keep only rows with at least 3 non-NaN values (rows 5 and 6 here).
df.dropna(thresh = 3)


Out[41]:
0 1 2
5 0.511702 -0.416387 -0.238253
6 1.055907 0.010851 -0.737679

In [43]:
# Return a new frame with every NaN replaced by 0; df itself keeps its NaNs.
df.fillna(0)


Out[43]:
0 1 2
0 -0.926146 0.000000 0.000000
1 0.669697 0.000000 0.000000
2 -0.656911 0.000000 0.000000
3 -0.987786 0.000000 1.803881
4 -1.218587 0.000000 0.012396
5 0.511702 -0.416387 -0.238253
6 1.055907 0.010851 -0.737679

In [44]:
# Dict form: the fill value is chosen per column label (column 1 -> 0.5).
# NOTE(review): key 3 matches no column (the labels are 0, 1, 2), so column 2
# keeps its NaNs in the output below — confirm whether key 2 was intended.
df.fillna({1 : 0.5, 3 : -1})


Out[44]:
0 1 2
0 -0.926146 0.500000 NaN
1 0.669697 0.500000 NaN
2 -0.656911 0.500000 NaN
3 -0.987786 0.500000 1.803881
4 -1.218587 0.500000 0.012396
5 0.511702 -0.416387 -0.238253
6 1.055907 0.010851 -0.737679

In [46]:
# inplace=True modifies df itself instead of returning a new frame, so the
# df displayed next no longer contains NaN.
df.fillna(0, inplace = True)
df


Out[46]:
0 1 2
0 -0.926146 0.000000 0.000000
1 0.669697 0.000000 0.000000
2 -0.656911 0.000000 0.000000
3 -0.987786 0.000000 1.803881
4 -1.218587 0.000000 0.012396
5 0.511702 -0.416387 -0.238253
6 1.055907 0.010851 -0.737679

In [47]:
# Fresh 6x3 frame of random values with trailing gaps: column 1 is NaN from
# row 2 onward, column 2 from row 4 onward.
# .loc replaces the .ix indexer, which current pandas no longer provides; on
# this integer index the label slices select the same rows .ix did.
df = DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = np.nan
df.loc[4:, 2] = np.nan
df


Out[47]:
0 1 2
0 -1.909524 -1.652112 -0.076914
1 -1.036873 -0.586205 -1.598724
2 0.149483 NaN -1.038821
3 -1.458301 NaN -0.416097
4 -0.117156 NaN NaN
5 -0.734223 NaN NaN

In [48]:
# Forward-fill: carry the last valid value of each column down into the NaNs
# that follow it. ffill() is the direct spelling; the fillna(method='ffill')
# form is deprecated in recent pandas releases.
df.ffill()


Out[48]:
0 1 2
0 -1.909524 -1.652112 -0.076914
1 -1.036873 -0.586205 -1.598724
2 0.149483 -0.586205 -1.038821
3 -1.458301 -0.586205 -0.416097
4 -0.117156 -0.586205 -0.416097
5 -0.734223 -0.586205 -0.416097

In [50]:
# Forward-fill, but fill at most 2 consecutive NaNs per gap; anything further
# down stays NaN. ffill(limit=...) replaces the deprecated
# fillna(method='ffill', limit=...) form.
df.ffill(limit=2)


Out[50]:
0 1 2
0 -1.909524 -1.652112 -0.076914
1 -1.036873 -0.586205 -1.598724
2 0.149483 -0.586205 -1.038821
3 -1.458301 -0.586205 -0.416097
4 -0.117156 NaN -0.416097
5 -0.734223 NaN -0.416097

In [51]:
# Fill each column's NaNs with that column's own mean (df.mean() is computed
# per column).
df.fillna(df.mean())


Out[51]:
0 1 2
0 -1.909524 -1.652112 -0.076914
1 -1.036873 -0.586205 -1.598724
2 0.149483 -1.119158 -1.038821
3 -1.458301 -1.119158 -0.416097
4 -0.117156 -1.119158 -0.782639
5 -0.734223 -1.119158 -0.782639

In [52]:
# Ten standard-normal values under a two-level index: letters on the outer
# level, integers on the inner level.
data = Series(
    np.random.standard_normal(10),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],
    ],
)
data


Out[52]:
a  1    1.372091
   2   -0.251959
   3   -2.214907
b  1   -0.351220
   2   -0.585478
   3    2.007993
c  1   -0.218729
   2   -0.629041
d  2    1.648315
   3    1.736258
dtype: float64

In [53]:
# The index is a MultiIndex built from the two label lists passed above.
data.index


Out[53]:
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [55]:
# Partial indexing on the outer level: all entries under 'b', indexed by the
# inner level only. .loc replaces the .ix indexer, which current pandas no
# longer provides.
data.loc['b']


Out[55]:
1   -0.351220
2   -0.585478
3    2.007993
dtype: float64

In [57]:
# Label slice on the outer level; both endpoints 'b' and 'c' are included.
data['b':'c']


Out[57]:
b  1   -0.351220
   2   -0.585478
   3    2.007993
c  1   -0.218729
   2   -0.629041
dtype: float64

In [58]:
# List of outer-level labels: select the 'b' and 'd' groups.
data[['b', 'd']]


Out[58]:
b  1   -0.351220
   2   -0.585478
   3    2.007993
d  2    1.648315
   3    1.736258
dtype: float64

In [59]:
# Select on the inner level: for every outer label, the entry whose inner
# label is 2.
data[:, 2]


Out[59]:
a   -0.251959
b   -0.585478
c   -0.629041
d    1.648315
dtype: float64

In [60]:
# Pivot the inner index level into columns; (outer, inner) pairs that do not
# exist in data show up as NaN.
data.unstack()


Out[60]:
1 2 3
a 1.372091 -0.251959 -2.214907
b -0.351220 -0.585478 2.007993
c -0.218729 -0.629041 NaN
d NaN 1.648315 1.736258

In [61]:
# stack() reverses unstack(): the columns go back into the inner index level
# and the NaN cells disappear, giving back the original Series.
data.unstack().stack()


Out[61]:
a  1    1.372091
   2   -0.251959
   3   -2.214907
b  1   -0.351220
   2   -0.585478
   3    2.007993
c  1   -0.218729
   2   -0.629041
d  2    1.648315
   3    1.736258
dtype: float64

In [62]:
# 4x3 frame holding 0..11, with two-level labels on both rows and columns.
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
frame


Out[62]:
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11

In [63]:
# Name the levels of the row index and of the column index; the names appear
# in the rendered frame.
frame.index.names, frame.columns.names = ['key1', 'key2'], ['state', 'color']
frame


Out[63]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11

In [65]:
# Selecting an outer column label returns the sub-frame of its inner columns.
frame['Ohio']


Out[65]:
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10

In [66]:
# Swap the order of the two row-index levels; the rows themselves stay in
# their original order (see output).
frame.swaplevel('key1', 'key2')


Out[66]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11

In [67]:
# Sort the rows by the second index level (key2). sort_index(level=...)
# replaces DataFrame.sortlevel, which current pandas no longer provides.
frame.sort_index(level=1)


Out[67]:
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11

In [68]:
# Swap the two row-index levels, then sort by the new outermost level so rows
# with the same key2 sit together. sort_index(level=...) replaces
# DataFrame.sortlevel, which current pandas no longer provides.
frame.swaplevel('key1', 'key2').sort_index(level=0)


Out[68]:
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11

In [69]:
# Sum the rows that share the same key2 label. Grouping on the index level
# replaces sum(level=...), whose level argument current pandas no longer
# accepts.
frame.groupby(level='key2').sum()


Out[69]:
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16

In [70]:
# Sum the columns that share the same 'color' label. sum(level=..., axis=1) is
# no longer accepted by current pandas, and column-wise groupby (axis=1) is
# deprecated as well, so transpose, group the (now row) level, and transpose
# back.
frame.T.groupby(level='color').sum().T


Out[70]:
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10

In [75]:
# Seven-row frame: 'a' counts up from 0, 'b' counts down from 7, 'c' is a
# string label and 'd' is an integer counter.
frame = DataFrame({
    'a': np.arange(7),
    'b': np.arange(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3],
})
frame


Out[75]:
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3

In [76]:
# Build a new frame whose row index is a two-level index made from columns
# 'c' and 'd'; those columns no longer appear among the data columns.
frame.set_index(['c', 'd'])


Out[76]:
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1

In [77]:
# drop=False keeps 'c' and 'd' as regular columns in addition to using them
# as the index levels.
frame.set_index(['c', 'd'], drop=False)


Out[77]:
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3

In [78]:
# frame is unchanged: the set_index calls above returned new objects rather
# than modifying it.
frame


Out[78]:
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3

In [ ]: