In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
The two primary data structures of pandas are Series and DataFrame.
A Series is a one-dimensional array-like object containing a sequence of values and an associated array of labels, called its index.
In [6]:
from pandas import Series, DataFrame
# No index is passed, so the Series gets the default integer index 0..3.
s = Series(data=[3, -1, 0, 5])
s
Out[6]:
Get the index object of a Series via its `index` attribute, or supply our own index when creating the Series:
In [10]:
# Show the index object that labels the entries of s.
s.index
Out[10]:
In [16]:
# Same kind of object as before, but with explicit string labels a..d.
s2 = Series(data=[13, -3, 5, 9], index=list("abcd"))
s2
Out[16]:
In [19]:
# Scalar arithmetic is element-wise and returns a new Series; s is unchanged.
s + 3
Out[19]:
In [20]:
# Element-wise multiplication; the index labels are carried over to the result.
s2 * 3
Out[20]:
In [22]:
# Boolean mask: keep only the entries whose value is greater than 0.
s[s > 0]
Out[22]:
In [23]:
# Passing a list of labels selects several entries at once.
s2[["b","c"]]
Out[23]:
In [26]:
# First set of balances, labelled by account holder.
names1 = ["Ann", "Bob", "Carl", "Doris"]
balance1 = [200, 100, 300, 400]
account1 = Series(data=balance1, index=names1)
account1
Out[26]:
In [29]:
# Second set of balances: same holders as account1, listed in a different order.
names2 = ["Carl", "Doris", "Ann", "Bob"]
balance2 = [20, 10, 30, 40]
account2 = Series(data=balance2, index=names2)
account2
Out[29]:
In [30]:
# Automatic alignment by index: values are matched by label (the holder's
# name), not by position, so the different ordering of the two Series
# does not matter.
account1 + account2
Out[30]:
A DataFrame has both a row index and a column index.
The easiest way to construct a DataFrame is from a dictionary of equal-length lists or NumPy arrays. Example:
In [54]:
# Dict keys become the column names; each value is one column
# (all lists must have the same length).
data = {
    "Name": ["Ann", "Bob", "Carl", "Doris"],
    "HW1": [90, 85, 70, 100],
    "HW2": [80, 70, 90, 90],
}
# Build the data frame from the dictionary.
grades = DataFrame(data=data)
grades
Out[54]:
In [56]:
# Pass `columns` to state the column order explicitly.
grades = DataFrame(data, columns = ["Name", "HW1", "HW2"])
grades
Out[56]:
In [57]:
# Select a single column by name; the result is a Series.
grades["Name"]
Out[57]:
In [58]:
# Select one row by integer position (position 2 is the third row).
grades.iloc[2]
Out[58]:
In [59]:
# Assigning a scalar to a new column name creates the column and
# broadcasts the value to every row. This modifies `grades` in place.
grades["HW3"] = 0
grades
Out[59]:
In [60]:
# Scores exist only for row labels 1 and 3. Column assignment aligns on the
# row index, so rows without a matching label receive NaN.
HW3 = Series(data=[70, 90], index=[1, 3])
grades["HW3"] = HW3
grades
Out[60]:
In [61]:
# Transpose: rows become columns and columns become rows.
grades.T
Out[61]:
In [62]:
# The data of the frame as a NumPy ndarray (row and column labels are dropped).
grades.values
Out[62]:
In [65]:
# NOTE: `grades` is rebound here from a DataFrame to a Series.
grades = Series(data=[60, 90, 80, 75], index=list("abcd"))
grades
Out[65]:
In [64]:
# Same scores as above, this time labelled by student name.
grades = Series(
    data=[60, 90, 80, 75],
    index=["Bob", "Tom", "Ann", "Jane"],
)
grades
Out[64]:
In [71]:
# Integer labels with gaps (0, 3, 5), used to demonstrate reindexing.
a = Series(data=list("ABC"), index=[0, 3, 5])
a
Out[71]:
In [74]:
# Reindex onto labels 0..5. Labels absent from `a` (1, 2, 4) are
# forward-filled with the value of the closest preceding label.
a.reindex(range(6), method="ffill")
Out[74]:
In [85]:
# Values 0..4 labelled a..e; used for the indexing examples that follow.
s = Series(data=np.arange(5), index=list("abcde"))
s
Out[85]:
In [86]:
# Access a single value by its index label.
s["c"]
Out[86]:
In [87]:
# Fourth element by position. The index holds string labels, so use .iloc:
# a bare integer key (s[3]) relies on a positional fallback that pandas
# has deprecated for Series with a non-integer index.
s.iloc[3]
Out[87]:
In [88]:
# Positional slice: positions 1 and 2 (the end position 3 is excluded).
s.iloc[1:3]
Out[88]:
In [89]:
# Boolean mask: keep the entries whose value is at least 2.
s[s >= 2]
Out[89]:
In [90]:
# Select by a list of labels; the result follows the order of the list.
s[["b", "e", "a"]]
Out[90]:
In [92]:
# Unlike positional slices, a label slice includes BOTH endpoints, so this
# sets the entries "b", "c" and "d" to 33. The assignment modifies s in place.
s["b" : "d"] = 33
s
Out[92]:
In [94]:
# 4x4 frame holding 0..15 row by row: one row per student, one column per homework.
# NOTE: this rebinds `grades` again (it was a Series in the cells above).
grades = DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Andy", "Brad", "Carla", "Donna"],
    columns=[f"HW{k}" for k in range(1, 5)],
)
grades
Out[94]:
In [95]:
# A single column name returns that column as a Series.
grades["HW1"]
Out[95]:
In [96]:
# A list of column names returns a DataFrame with just those columns.
grades[["HW2", "HW3"]]
Out[96]:
In [99]:
# Same 0..15 layout as before, under a different set of row labels.
g = DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ann", "Bob", "Carl", "Donna"],
    columns=[f"HW{k}" for k in range(1, 5)],
)
# Boolean row filter: keep the rows whose HW3 value exceeds 6.
g[g["HW3"] > 6]
Out[99]:
In [101]:
# Slicing a DataFrame with [] selects ROWS by position: this returns the
# rows at positions 0 and 1 (the end position 2 is not included).
g[:2]
Out[101]:
In [106]:
# Two Series whose labels only partly overlap ("a", "c", "e" are shared).
a = Series(data=[5, 4, 0, 7], index=list("acde"))
b = Series(data=[-1, 3, 4, -2, 1], index=list("acefg"))
a
Out[106]:
In [107]:
# Display the second Series for comparison with `a`.
b
Out[107]:
In [103]:
# The result carries the union of both indexes; a label present in only
# one operand ("d", "f", "g") yields NaN.
a + b
Out[103]:
In [108]:
# fill_value=0 stands in for a label missing from one operand, so labels
# found in only one Series keep their value instead of becoming NaN.
a.add(b, fill_value=0)
Out[108]:
In [116]:
# One offset per homework column, labelled with the column names HW1..HW4.
st0 = Series(
    data=[0, 1, 2, 3],
    index=[f"HW{k}" for k in range(1, 5)],
)
st0
Out[116]:
In [117]:
# DataFrame + Series: the Series index is matched against the frame's
# column labels and the values are added to every row.
# NOTE(review): assumes `grades` is still the HW1..HW4 DataFrame; the name
# is rebound several times in this notebook, so cell order matters.
grades + st0
Out[117]:
In [120]:
# Read the worksheet "Sheet1" of the workbook into a DataFrame.
# ExcelFile holds the workbook open, so use it as a context manager to make
# sure the file handle is released once the sheet has been parsed.
with pd.ExcelFile('Excel_table.xlsx') as xls_file:
    t = xls_file.parse("Sheet1")
t
Out[120]:
In [121]:
# 3x3 frame of standard-normal samples. The generator is seeded so that the
# frame is identical on every Restart & Run All; the seed value is arbitrary.
d = DataFrame(
    np.random.default_rng(seed=0).standard_normal((3, 3)),
    columns=list("xyz"),
    index=["A", "B", "C"],
)
d
Out[121]:
In [122]:
# NumPy element-wise functions accept a DataFrame: absolute value of every entry.
np.abs(d)
Out[122]:
In [123]:
def minmax(t):
    """Return the smallest and largest value of ``t`` as a two-element Series.

    Parameters
    ----------
    t : object providing ``.min()`` and ``.max()``
        For example a Series, such as one column of a DataFrame.

    Returns
    -------
    Series
        Labelled ``"min"`` and ``"max"``, in that order.
    """
    return Series([t.min(), t.max()], index=["min", "max"])
# apply() calls minmax once per column (the default axis) and combines the
# returned Series into a frame with rows "min" and "max".
d.apply(minmax)
Out[123]:
In [125]:
# Sorting by label returns a new Series; `a` itself is left untouched.
# String comparison is case-sensitive: uppercase letters order before
# lowercase ones, so "john" ends up last.
a = Series(
    data=range(5),
    index=["Bob", "john", "Jane", "Ann", "Cathy"],
)
a.sort_index()
Out[125]:
In [126]:
# Row and column labels are deliberately out of alphabetical order so the
# effect of sort_index is visible in the following cells.
df = DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["B", "Q", "M", "A"],
    columns=["p", "s", "e", "b"],
)
df
Out[126]:
In [127]:
# Sort the rows alphabetically by their row labels (axis=0).
# Returns a new DataFrame; df is not modified.
df.sort_index(axis=0)
Out[127]:
In [129]:
# Sort the columns alphabetically by their column labels (axis=1).
# Returns a new DataFrame; df is not modified.
df.sort_index(axis=1)
Out[129]:
In [130]:
# Sort by value (ascending) rather than by label; each value keeps its
# original index label in the result.
s = Series(data=[7, -2, 0, 8, -1])
s.sort_values()
Out[130]:
In [133]:
# Missing values (NaN) are placed at the end of the sorted result.
# Only the last expression of a cell is displayed, so the former bare `s2`
# line in the middle of this cell showed nothing and has been removed.
s2 = Series([7, np.nan, -2, 0, np.nan, 8, -1])
s2.sort_values()
Out[133]:
In [134]:
# Small 3x3 frame holding 0..8 row by row, used for the summary statistics below.
df = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    columns=["x", "y", "z"],
    index=list("ABC"),
)
df
Out[134]:
In [135]:
# Sum down each column (the default axis); the result is indexed by column label.
df.sum()
Out[135]:
In [136]:
# Sum across each row (axis=1); the result is indexed by row label.
df.sum(axis=1)
Out[136]:
In [137]:
# For each column, the row label at which the maximum value occurs.
df.idxmax()
Out[137]:
In [138]:
# For each column, the row label at which the minimum value occurs.
df.idxmin()
Out[138]:
In [ ]: