In [2]:
# Import numpy and pandas
import numpy as np
import pandas as pd
# Imports for working with dates
# NOTE(review): the `from datetime import datetime, date` line below rebinds the
# name `datetime` to the class, so the module bound by `import datetime` is no
# longer reachable under that name after this cell runs.
import datetime
from datetime import datetime, date

# Configure how pandas renders objects on screen:
# plain-text repr instead of HTML, at most 8 columns and 10 rows,
# and a display width of 60.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 8
pd.options.display.max_rows = 10
pd.options.display.width = 60

# matplotlib is used for the plots; the magic below selects the inline backend
import matplotlib.pyplot as plt
%matplotlib inline

# Load the data, reading only the columns at positions 0, 2, 3 and 7,
# and use the 'Symbol' column as the row index.
sp500 = pd.read_csv(
    "tratamiento_datos/data/sp500.csv",
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

In [3]:
# Build a random DataFrame to practise with: one million uniform random
# values in 'foo' and consecutive integer keys starting at 100 in 'key'.
np.random.seed(123456)
df = pd.DataFrame(
    {
        'foo': np.random.random(1000000),
        'key': range(100, 1000100),
    }
)

In [4]:
# Fetch a single row: the one whose 'key' column equals 10099
key_matches = df['key'] == 10099
df[key_matches]


Out[4]:
           foo    key
9999  0.272283  10099

In [5]:
# Measure how long the selection by key takes with a boolean mask
%timeit df[df.key==10099]


1.3 ms ± 57.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [6]:
# Promote the 'key' column to be the row index, then show the first five rows
df_with_index = df.set_index('key')
df_with_index.head()


Out[6]:
          foo
key          
100  0.126970
101  0.966718
102  0.260476
103  0.897237
104  0.376750

In [7]:
# Look up the row labelled 10099 through the index
df_with_index.loc[10099]


Out[7]:
foo    0.272283
Name: 10099, dtype: float64

In [8]:
# Now time the same lookup done with .loc on the indexed frame
%timeit df_with_index.loc[10099]


89.7 µs ± 1.03 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [9]:
# Basic indexes: build a small frame and print it along with its column index
city_temperatures = {
    'city': ["Missoula", "Philadelphia"],
    "Temperature": [70, 80],
}
temps = pd.DataFrame(city_temperatures)
print(temps, temps.columns, sep='\n')


   Temperature          city
0           70      Missoula
1           80  Philadelphia
Index(['Temperature', 'city'], dtype='object')

In [10]:
# Working with indexes: a frame holding 10..19 labelled by the integers 0..9.
# Print its first five rows and then the index object itself.
df_i64 = pd.DataFrame(data=np.arange(10, 20), index=np.arange(10))
print(df_i64.head())
print(df_i64.index)


    0
0  10
1  11
2  12
3  13
4  14
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [12]:
# A frame labelled by floats 0.0, 0.5, ... with values 0, 5, 10, ...;
# display its first five rows by position.
df_f64 = pd.DataFrame(
    data=np.arange(0, 1000, 5),
    index=np.arange(0.0, 100.0, 0.5),
)
df_f64.head()


Out[12]:
      0
0.0   0
0.5   5
1.0  10
1.5  15
2.0  20

In [13]:
# Display the index object of the float-labelled frame
df_f64.index


Out[13]:
Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,
               3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0,
              98.5, 99.0, 99.5],
             dtype='float64', length=200)

In [15]:
# Build a frame whose rows are labelled by consecutive intervals
# derived from the break points below, then display it.
interval_breaks = [0, 0.5, 1.0, 1.5, 2.0]
interval_labels = pd.IntervalIndex.from_breaks(interval_breaks)
df_interval = pd.DataFrame({"A": [1, 2, 3, 4]}, index=interval_labels)
df_interval


Out[15]:
            A
(0.0, 0.5]  1
(0.5, 1.0]  2
(1.0, 1.5]  3
(1.5, 2.0]  4

In [16]:
# Display the index object of the interval-labelled frame
df_interval.index


Out[16]:
IntervalIndex([(0.0, 0.5], (0.5, 1.0], (1.0, 1.5], (1.5, 2.0]]
              closed='right',
              dtype='interval[float64]')

In [18]:
# Frame with an integer column 'A' and a single-letter string column 'B'.
# NOTE(review): despite the variable name, no conversion of 'B' to a
# categorical dtype or index is visible in this cell - confirm whether a
# step is missing.
letters = ['a', 'a', 'b', 'b', 'c', 'a']
df_categorical = pd.DataFrame({'A': np.arange(6), 'B': letters})
print(df_categorical)


   A  B
0  0  a
1  1  a
2  2  b
3  3  b
4  4  c
5  5  a

In [19]:
# Show the first five rows of the loaded data
sp500.head()


Out[19]:
                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897

In [20]:
# Move the current index back into the frame as an ordinary column
# and show the first five rows of the result.
index_mov_to_column = sp500.reset_index()
index_mov_to_column.head()


Out[20]:
  Symbol                  Sector   Price  Book Value
0    MMM             Industrials  141.14      26.668
1    ABT             Health Care   39.60      15.573
2   ABBV             Health Care   53.95       2.954
3    ACN  Information Technology   79.79       8.326
4    ACE              Financials  102.91      86.897

In [21]:
# Use the 'Sector' column as the index and show the first five rows
index_mov_to_column.set_index('Sector').head()


Out[21]:
                       Symbol   Price  Book Value
Sector                                           
Industrials               MMM  141.14      26.668
Health Care               ABT   39.60      15.573
Health Care              ABBV   53.95       2.954
Information Technology    ACN   79.79       8.326
Financials                ACE  102.91      86.897

In [23]:
# Reindex the rows against an explicit list of labels and display the result
wanted_symbols = ['MMM', 'ABBV', 'FOO']
reindexed = sp500.reindex(index=wanted_symbols)
reindexed


Out[23]:
             Sector   Price  Book Value
Symbol                                 
MMM     Industrials  141.14      26.668
ABBV    Health Care   53.95       2.954
FOO             NaN     NaN         NaN

In [24]:
# Reindex the columns against an explicit list of names and show five rows
wanted_columns = ['Price', 'Book Value', 'NewCol']
sp500.reindex(columns=wanted_columns).head()


Out[24]:
         Price  Book Value  NewCol
Symbol                            
MMM     141.14      26.668     NaN
ABT      39.60      15.573     NaN
ABBV     53.95       2.954     NaN
ACN      79.79       8.326     NaN
ACE     102.91      86.897     NaN

In [ ]: