In [6]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
In [2]:
# A Series built from a plain list gets a default integer index;
# .values exposes the underlying data
obj = Series(data=[1, 2, 3])
obj.values
Out[2]:
In [3]:
# Inspect the index labels of the Series
obj.index
Out[3]:
In [4]:
# Same kind of Series, this time with explicit string labels
obj = Series([4, 5, 6], index=['a', 'b', 'c'])
obj
Out[4]:
In [5]:
# A list of labels selects several entries at once
obj[['a','b']]
Out[5]:
In [7]:
# Apply NumPy's exponential function to the Series
np.exp(obj)
Out[7]:
In [9]:
# Re-label obj onto a, b, e; 'e' is not one of obj's labels
st = ['a', 'b', 'e']
x2 = obj.reindex(st)
x2
Out[9]:
In [10]:
# Boolean mask marking the missing entries of x2
x2.isnull()
Out[10]:
In [11]:
# Keep only the entries of x2 that are not missing
x2.dropna()
Out[11]:
In [12]:
# Attach a name to the Series, then display it
obj.name = 'test'
obj
Out[12]:
In [13]:
# Relabel the Series by assigning a new list of index labels
obj.index = ['g','h','i']
In [15]:
# Data Frames
In [16]:
# Column-oriented raw data: one key per future DataFrame column
data = dict(
    st=['ct', 'ny', 'ca'],
    year=[2008, 2009, 2010],
    pop=[1.5, 1.2, 1.8],
)
data
Out[16]:
In [17]:
# Build a DataFrame from the dict of equal-length lists
df = DataFrame(data)
df
Out[17]:
In [18]:
# Re-arrange columns by passing the desired order through columns=
DataFrame(data, columns=['year','st','pop'])
Out[18]:
In [24]:
# Choose the column order and give the rows explicit labels a, b, c
df = DataFrame(
    data,
    index=list('abc'),
    columns=['year', 'st', 'pop'],
)
In [25]:
# Attribute-style access to the 'year' column
df.year
Out[25]:
In [26]:
# Dictionary-style access to the same 'year' column
df['year']
Out[26]:
In [27]:
# Accessing rows by integer position with iloc.
# (The old .ix indexer guessed between labels and positions; it was
# deprecated in pandas 0.20 and removed in 1.0.)
df.iloc[0]
Out[27]:
In [28]:
# Accessing a row by its index label with loc (replaces the removed .ix)
df.loc['a']
Out[28]:
In [29]:
# Assign the scalar 100 to a new column named 'extra'
df['extra'] = 100
df
Out[29]:
In [31]:
# Overwrite 'extra' with an array holding one value per row (0, 1, 2)
df['extra'] = np.arange(3)
df
Out[31]:
In [33]:
# Derived boolean column: True where the state code is "ct"
df['isct'] = df['st'].eq("ct")
df
Out[33]:
In [37]:
# Conform df to the row labels a, b, e ('e' is not an existing label)
df.reindex(index=['a', 'b', 'e'])
Out[37]:
In [38]:
# Name the row index and the column index, then display the frame
df.index.name = "alpha"
df.columns.name = "info"
df
Out[38]:
In [39]:
# Raw values of the frame, without the row and column labels
df.values
Out[39]:
In [40]:
# Inspect the row labels
df.index
Out[40]:
In [41]:
# Positional lookup of the first row label
df.index[0]
Out[41]:
In [42]:
# Index objects are immutable: item assignment raises TypeError.
# The failure is the point of this cell, so it is caught and shown
# rather than left as a traceback that would stop "Run All".
try:
    df.index[0] = "new"
except TypeError as err:
    print("TypeError: %s" % err)
In [43]:
# Inspect the column labels
df.columns
Out[43]:
In [44]:
# Membership test against the column labels
'pop' in df.columns
Out[44]:
In [45]:
# Select the 'pop' column
df['pop']
Out[45]:
In [47]:
# Select columns through a boolean mask over the column labels.
# The previous form indexed df.columns with the single bool
# `'pop' in df.columns`, which is not a per-column mask and therefore
# never selected 'pop' by name.
df.loc[:, df.columns == 'pop']
Out[47]:
In [48]:
# Current row labels, shown before the re-ordering examples
df.index
Out[48]:
In [49]:
# Re-arrange rows by listing the existing labels in the desired order
df.reindex(['c','b','a'])
Out[49]:
In [50]:
# Reindex with an extra label 'e' that df does not have, passing fill_value=0
df.reindex(['c','b','a','e'], fill_value=0)
Out[50]:
In [51]:
# Labels 1, 3 and 5 are absent from obj3; method="ffill" forward-fills
# them from the preceding label
obj3 = Series(
    ['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
obj3.reindex(range(6), method="ffill")
Out[51]:
In [61]:
# Drop the row labelled 'c' (the result is only displayed; df is not reassigned)
df.drop('c')
Out[61]:
In [62]:
# axis=1 makes drop look for 'isct' among the columns instead of the rows
df.drop('isct', axis=1)
Out[62]:
In [64]:
# Label-based row slice: unlike positional slicing,
# the end label 'b' is part of the result
df.loc['a':'b']
Out[64]:
In [65]:
# A list of column names selects several columns
df[['year','st']]
Out[65]:
In [66]:
# Indexing by rows + columns with the label-based loc indexer
# (.ix was removed in pandas 1.0). The row slice includes the end label 'b'.
df.loc['a':'b', ['year', 'st', 'pop']]
Out[66]:
In [68]:
# Arithmetic aligns on the index: labels found in only one operand
# ('c' and 'd' here) are marked with NaN in the result
a1 = pd.DataFrame([1, 2, 3], index=list('abc'))
a2 = pd.DataFrame([10, 20, 30], index=list('abd'))
a1.add(a2)
Out[68]:
In [69]:
# Applying a function on 1D arrays to each column.
# A seeded generator keeps the random frame identical on every run.
obj = DataFrame(np.random.RandomState(0).randn(4, 3), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])


def f(x):
    """Return the spread (maximum minus minimum) of a 1D array-like."""
    return x.max() - x.min()


obj
Out[69]:
In [70]:
# Call f on each column of obj (similar to R's apply)
obj.apply(f)
Out[70]:
In [71]:
def two_decimals(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x


# Element-wise formatting of every cell. The helper no longer shadows the
# builtin format(), and mapping column by column avoids the deprecated
# DataFrame.applymap.
obj.apply(lambda col: col.map(two_decimals))
Out[71]:
In [72]:
# Sort the rows by their index labels
obj.sort_index()
Out[72]:
In [74]:
# Sort by column labels (axis=1) in descending order
obj.sort_index(axis=1, ascending=False)
Out[74]:
In [76]:
# To sort a Series by its values, use sort_values
# (Series.order was removed from pandas)
obj['e'].sort_values()
Out[76]:
In [78]:
# To sort a DataFrame by the values of one or more columns, use
# sort_values(by=...); sort_index no longer accepts a `by` argument.
# Rows are ordered by 'e' first, then by 'd'.
obj.sort_values(by=['e', 'd'])
Out[78]:
In [81]:
# Rank the values of column 'e', passing method="first" for tie handling
obj['e'].rank(method="first") # Rank with tie break
Out[81]:
In [84]:
# The label 'a' appears twice in this index;
# is_unique reports whether all labels are distinct
ser = Series(
    [1, 1, 2, 10, 3, 5],
    index=['a', 'a', 'b', 'c', 'd', 'e'],
)
ser.index.is_unique
Out[84]:
In [85]:
# Re-display obj before the aggregation examples
obj
Out[85]:
In [86]:
# Sum with the default axis
obj.sum()
Out[86]:
In [88]:
# Sum along axis=1, explicitly skipping missing values
obj.sum(axis=1, skipna=True)
Out[88]:
In [89]:
# Summary statistics for obj
obj.describe()
Out[89]:
In [91]:
# Unstack obj (a frame with a single-level index at this point)
obj.unstack()
Out[91]:
In [94]:
# Show the Series, a separator, then how often each value occurs.
# print is called as a function so the cell runs on Python 3
# (the single-argument call form is also valid on Python 2).
print(ser)
print('----')
ser.value_counts()
Out[94]:
In [95]:
from numpy import nan as NA

# A float Series in which two of the five entries are missing
data = Series([1.0, NA, 3.5, NA, 7.0])
data
Out[95]:
In [96]:
# Drop the missing entries from the Series
data.dropna()
Out[96]:
In [98]:
# But dropna drops any row that has an NA.
# With DataFrames, the requirement might be different.
data = DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
data
Out[98]:
In [100]:
# Drop a row (axis=0) only if all of its values are NA
data.dropna(axis=0, how='all')
Out[100]:
In [101]:
# Replace missing values with 0 (the result is displayed; data is not rebound)
data.fillna(0)
Out[101]:
In [103]:
# fillna returns a new object, so rebind the name to keep the filled frame.
# (The earlier `_ = data.fillna(0, inplace=True)` form only stored the
# call's None return value in `_`, overwriting IPython's last-output
# variable; the in-place behaviour came from inplace=True, not from `_`.)
data = data.fillna(0)
data
Out[103]:
In [104]:
# Hierarchical indexing: two parallel label lists build a two-level index.
# A seeded generator keeps the random values identical on every run.
data = Series(np.random.RandomState(0).randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data
Out[104]:
In [105]:
# Inspect the index: it was built from the two nested label lists,
# so data has a multi-level index
data.index
Out[105]:
In [107]:
# Shows all values under the outer index label 'a'
data['a',]
Out[107]:
In [109]:
# For every outer label, select the entry whose inner-level label is 1
# (the inner label 1 occurs under a, b and c in this index)
data[:,1]
Out[109]:
In [110]:
# Unstack the two-level Series
data.unstack()
Out[110]:
In [111]:
# Hierarchical indexing in DataFrames: both the rows and the columns
# are given two levels of labels
df = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
df
Out[111]:
In [112]:
# The hierarchical levels can have names (as strings or any Python objects).
# If so, these show up in the displayed output (don't confuse the index
# names with the axis labels!):
df.index.names = ['key1','key2']
df.columns.names = ['state', 'colour']
df
Out[112]:
In [113]:
# Sort the rows by the second index level (level 1).
# This reorders rows, not the levels themselves. DataFrame.sortlevel was
# removed from pandas; sort_index(level=...) is its replacement.
df.sort_index(level=1)
Out[113]:
In [114]:
# Swap the two row-index levels, referring to them by name
df.swaplevel('key1','key2')
Out[114]:
In [115]:
# Sum the rows that share the same 'key2' label.
# The `level` argument of sum() was removed in pandas 2.0;
# grouping by the index level is the supported equivalent.
df.groupby(level='key2').sum()
Out[115]:
In [116]:
# Sum the columns that share the same 'colour' label.
# sum(level=..., axis=1) was removed in pandas 2.0; transposing, grouping
# on the column level and transposing back gives the same table without
# relying on groupby's deprecated axis=1.
df.T.groupby(level='colour').sum().T
Out[116]:
In [118]:
# Back to obj, shown before the set_index examples
obj
Out[118]:
In [119]:
# Setting an index from a column name
# (the result is only displayed here; obj is not reassigned)
obj.set_index('b')
Out[119]:
In [120]:
# Same, but drop=False asks to keep 'b' as a column too
obj.set_index('b', drop=False)
Out[120]:
In [121]:
# Display obj again: neither set_index call above was assigned back to it
obj
Out[121]:
In [122]:
# Make 'b' the index of obj by rebinding the name to the returned frame.
# This replaces `_ = obj.set_index('b', inplace=True)`, which stored None
# in `_` (overwriting IPython's last-output variable) and mutated obj in place.
obj = obj.set_index('b')
In [123]:
# obj now uses the former 'b' column as its index
obj
Out[123]:
In [124]:
In [ ]: