Key Features:
In [ ]:
from IPython.core.display import HTML
HTML("<iframe src=http://pandas.pydata.org width=800 height=350></iframe>")
In [ ]:
%matplotlib inline
import pandas as pd
import numpy as np
In [ ]:
counts = pd.Series([223, 43, 53, 24, 43])
counts
In [ ]:
type(counts)
In [ ]:
counts[0]
In [ ]:
counts[1:4]
In [ ]:
counts.values
In [ ]:
counts.index
In [ ]:
fruit = pd.Series([223, 43, 53, 24, 43],
index=['apple', 'orange', 'banana', 'pears', 'lemon'])
fruit
In [ ]:
fruit.index
In [ ]:
fruit['apple']
In [ ]:
fruit[['apple', 'lemon']]
In [ ]:
fruit.name = 'counts'
fruit.index.name = 'fruit'
fruit
In [ ]:
fruit > 50
In [ ]:
fruit[fruit > 50]
In [ ]:
fruit2 = pd.Series([11, 12, 13, 14, 15],
index=fruit.index)
fruit2
In [ ]:
fruit2 = fruit2.drop('apple')
fruit2
In [ ]:
fruit2['grape'] = 18
fruit2
In [ ]:
fruit3 = fruit + fruit2
fruit3
In [ ]:
fruit3.dropna()
In [ ]:
fruit3
In [ ]:
fruit3.isnull()
In [ ]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year':[2000, 2001, 2002, 2001, 2003],
'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data)
df
In [ ]:
len(df) # Get the number of rows in the dataframe
In [ ]:
df.shape # Get the (rows, cols) of the dataframe
In [ ]:
df.T
In [ ]:
df.columns # get the index of columns
In [ ]:
df.index # get the index of the row
In [ ]:
df.dtypes
In [ ]:
df.describe()
In [ ]:
df
In [ ]:
df['state'] # indexing by label
In [ ]:
df[['state', 'year']] # indexing by a list of label
In [ ]:
df[:2] # numpy-style indexing
In [ ]:
df.iloc[0, 0]
In [ ]:
df.iloc[0, :]
In [ ]:
df.iloc[:, 1]
In [ ]:
df.iloc[:2, 1:3]
In [ ]:
df.loc[:, 'state']
In [ ]:
df.loc[:, ['state', 'year']]
In [ ]:
df['debt'] = np.random.randn(len(df))
df['rain'] = np.abs(np.random.randn(len(df)))
df
In [ ]:
df = df.drop('debt', axis=1)
df
In [ ]:
row1 = pd.Series([4.5, 'Nevada', 2005, 2.56], index=df.columns)
df.append(row1,ignore_index=True)
In [ ]:
df.drop([0, 1])
In [ ]:
df['pop'] < 2
In [ ]:
df
In [ ]:
df.loc[df['pop'] < 2, 'pop'] = 2
df
In [ ]:
df['year'] == 2001
In [ ]:
(df['pop'] > 3) | (df['year'] == 2001)
In [ ]:
df.loc[(df['pop'] > 3) | (df['year'] == 2001), 'pop'] = 3
df
In [ ]:
df.sort_index(ascending=False)
In [ ]:
df.sort_index(axis=1, ascending=False)
In [ ]:
df
In [ ]:
df.loc[:, ['pop', 'rain']].sum()
In [ ]:
df.loc[:,['pop', 'rain']].mean()
In [ ]:
df.loc[:, ['pop', 'rain']].var()
In [ ]:
df.loc[:, ['pop', 'rain']].cumsum()
In [ ]:
df
In [ ]:
df.loc[:, ['pop', 'rain']].apply(lambda x: x.max() - x.min()) # apply new functions to each row
In [ ]:
df
In [ ]:
df.groupby(df['state']).mean()
In [ ]:
df.groupby(df['state'])[['pop', 'rain']].apply(lambda x: x.max() - x.min())
In [ ]:
grouped = df.groupby(df['state'])
group_list = []
for name, group in grouped:
print(name)
print(group)
print('\n')
In [ ]:
df
In [ ]:
df_h = df.set_index(['state', 'year'])
df_h
In [ ]:
df_h.index.is_unique
In [ ]:
df_h.loc['Ohio', :].max() - df_h.loc['Ohio', :].min()
In [ ]:
df
In [ ]:
df.to_csv('test_csv_file.csv',index=False)
In [ ]:
%more test_csv_file.csv
In [ ]:
df_csv = pd.read_csv('test_csv_file.csv')
df_csv
In [ ]:
writer = pd.ExcelWriter('test_excel_file.xlsx')
df.to_excel(writer, 'sheet1', index=False)
writer.save()
In [ ]:
df_excel = pd.read_excel('test_excel_file.xlsx', sheetname='sheet1')
df_excel
In [ ]:
pd.read_table??
In [ ]:
df = pd.DataFrame([[1, 6.5, 3.], [1., np.nan, np.nan],
[np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
df
In [ ]:
cleaned = df.dropna() # delete rows with Nan value
cleaned
In [ ]:
df.dropna(how='all') # delete rows with all Nan value
In [ ]:
df.dropna(thresh=2) # keep the rows with at least thresh non-Nan value
In [ ]:
df.fillna(0) # fill Nan with a constant
In [ ]:
variables = pd.DataFrame({'normal': np.random.normal(size=100),
'gamma': np.random.gamma(1, size=100),
'poisson': np.random.poisson(size=100)})
variables.head()
In [ ]:
variables.shape
In [ ]:
variables.cumsum().plot()
In [ ]:
variables.cumsum().plot(subplots=True)
Write a Python program to create and display a DataFrame from a specified dictionary data which has the index labels. Sample Python dictionary data and list labels:
exam_data = {'name': ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas'], 'score': [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19], 'attempts': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1], 'qualify': ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']} labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
In [ ]:
In [ ]:
In [ ]:
In [ ]: