In [27]:
import pandas as pd
import numpy as np
In [12]:
# pandas Series: a one-dimensional labelled array (here holding mixed value types)
goods = pd.Series({'hat': 1, 'gloves': 2, 'cup': 4, 'mouse': 'ba'})
print(goods)
# Basic attributes of the Series
print('goods dimentions: ', goods.ndim)
print('goods shape: ', goods.shape)
print('goods index: ', goods.index)
print('goods values: ', goods.values)
# Membership is checked against the index labels here, not against the values
print('check index banana in goods: ', 'banana' in goods.index)
In [19]:
# Accessing and deleting elements of a Series
# loc:  label-based lookup ("location")
# iloc: position-based lookup ("integer location")
# Series.drop(label, inplace=True) removes an element in place
print(goods['hat'])
print()
print(goods.loc[['hat', 'cup']])
print()
print('access by index:\n ', goods.iloc[[0,2]])
print()
# Positional access must go through .iloc: using plain [] with integers on a
# string-labelled Series is deprecated in pandas 2.x (integers are treated as labels).
print(goods.iloc[[0, 2]])
In [20]:
# Pandas Series arithmetic operations
# Make sure the operation is valid for every data type stored in the Series
vegies = pd.Series({'kale': 2, 'cabbage': 78, 'carroat': 45, 'lettice': 42})
In [21]:
vegies * 2
Out[21]:
In [22]:
vegies / 2
Out[22]:
In [25]:
vegies ** 2
Out[25]:
In [28]:
np.sqrt(vegies)
Out[28]:
In [29]:
np.exp(vegies)
Out[29]:
In [30]:
vegies['kale'] * 2
Out[30]:
In [44]:
vegies[vegies < 2]
Out[44]:
In [46]:
# pandas DataFrame
# Create a DataFrame from a dictionary of Series: each key becomes a column and
# the Series are aligned on their index labels (labels missing from a Series give NaN).
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}
df = pd.DataFrame(items)
# Only the last expression of a cell is echoed, so the intermediate results are
# printed explicitly instead of being left as silently discarded expressions.
print(df)
print('shape:', df.shape)
print('ndim: ', df.ndim)
print('type: ', type(df))
df.size
Out[46]:
In [47]:
df.index
Out[47]:
In [48]:
df.columns
Out[48]:
In [51]:
# create a df with part of data
df2 = pd.DataFrame(items, columns=['Alice'])
df2
Out[51]:
In [52]:
# create a df with part of the row
df3 = pd.DataFrame(items, index=['bike', 'watch'])
df3
Out[52]:
In [53]:
# Create a DataFrame from a plain list of values (a single column labelled 0)
df4 = pd.DataFrame(list(np.arange(3)))
df4
Out[53]:
In [74]:
# Create a DataFrame from a list of dictionaries with explicit row labels:
# dictionary keys become columns, and keys absent from a row give NaN.
# NOTE(review): the 'price:' key carries a trailing colon - possibly a typo; confirm.
df5 = pd.DataFrame(
    [{'price:': 2}, {'weight': 2}],
    index=['colary', 's'],
)
df5
Out[74]:
In [34]:
# List of dictionaries: each dictionary becomes one row and its keys become columns
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35},
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5},
          {'watches2': 10, 'glasses2': 50, 'bikes2': 15, 'pants2':5}]
# We create a DataFrame (rows get the default integer labels 0, 1, 2)
store_items = pd.DataFrame(items2)
store_items
Out[34]:
In [80]:
# access element in dataframe
store_items[['watches2']]
Out[80]:
In [91]:
# access index with loc or iloc
store_items.iloc[[1]]
Out[91]:
In [92]:
# acess indidvidual elements in df, column comes first
store_items['bikes'][0]
Out[92]:
In [93]:
# add a column
store_items['cup'] = [12,45,69]
In [94]:
store_items
Out[94]:
In [99]:
store_items + 9
Out[99]:
In [101]:
# A single-row frame for an extra store, labelled 'store3'
new_items = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4},
]
new_store = pd.DataFrame(data=new_items, index=['store3'])
new_store
Out[101]:
In [102]:
# Add a row: DataFrame.append() was removed in pandas 2.0, so stack the frames
# with pd.concat instead. The result is a new frame; store_items itself is unchanged.
pd.concat([store_items, new_store])
Out[102]:
In [103]:
# insert a column with df.insert(loc, lable, data)
store_items.insert(4, 'plants', [4,5,9])
In [104]:
store_items
Out[104]:
In [111]:
# delete a column and row
#store_items.pop('cup') # will remove the last column
store_items
Out[111]:
In [113]:
# drop will remove a column by specify axis=1
store_items.drop('glasses2', axis=1)
Out[113]:
In [114]:
# remove a row with axis=0
store_items.drop(1, axis=0)
Out[114]:
In [115]:
# rename a row or column
store_items.rename({'bikes': 'renameed_bikes' }, axis=1)
Out[115]:
In [117]:
store_items.rename({2: 'last_store'}, axis=0)
Out[117]:
In [120]:
# Dealing with NaN
# An item key that is missing from a store's dictionary shows up as NaN in that row
items2 = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10},
]
df6 = pd.DataFrame(data=items2, index=['store1', 'store2', 'store3'])
df6
Out[120]:
In [125]:
df6.isnull()
Out[125]:
In [126]:
df6.isnull().sum()
Out[126]:
In [127]:
df.isnull().sum().sum()
Out[127]:
In [128]:
# count the not NAN values
df6.count()
Out[128]:
In [129]:
# drop nan on columns or rows out of place, if needed specify inplace=True
df6.dropna(axis=0) # drop nan on rows out of place
Out[129]:
In [130]:
# drop nana on columns out of place
df6.dropna(axis=1)
Out[130]:
In [132]:
# fill na with fillna(method='ffill', axis) out of place, specify inplace=True if needed
df6
df6.fillna(method='ffill', axis=0) # ues the previes value on the column to fill nan
Out[132]:
In [133]:
df6.fillna(method='backfill', axis=0)
Out[133]:
In [134]:
df6
Out[134]:
In [135]:
# linear fill with interpolate(method, axis)
df6.interpolate(method='linear', axis=0)
Out[135]:
In [136]:
df6.interpolate(method='linear', axis=1)
Out[136]:
In [137]:
# fillna with mean on the column
df6.fillna(df6.mean(), axis=0)
Out[137]:
In [138]:
# Example (book_ratings is not defined in this notebook):
# best_rated = book_ratings[(book_ratings == 5).any(axis = 1)]['Book Title'].values
# The line above returns a NumPy ndarray that contains only the titles of the books that received a rating of 5.
In [139]:
# loading data into a DataFrame
df7 = pd.read_csv('goog-1.csv')
df7.head(3)
Out[139]:
In [140]:
df7.tail(3)
Out[140]:
In [142]:
df7.isnull().any()
Out[142]:
In [144]:
# Get detailed info with describe()
df7.describe()
Out[144]:
In [146]:
df7['Open'].describe()
Out[146]:
In [147]:
# Pairwise correlation between columns: values near +1 or -1 indicate a strong
# positive or negative linear relationship, values near 0 indicate none.
# Select the numeric columns explicitly: since pandas 2.0, corr() no longer drops
# non-numeric columns silently and raises on them instead.
# NOTE(review): assumes goog-1.csv may carry a non-numeric column (e.g. a date) - confirm.
df7.select_dtypes(include='number').corr()
Out[147]:
In [152]:
# Groupby
df7.groupby(['Adj Close']).sum().head(2)
Out[152]:
In [157]:
df7['Open'].rolling(150).mean()
Out[157]:
In [158]:
import matplotlib.pyplot as plt
%matplotlib inline
In [167]:
# Plot the 150-row rolling mean together with the raw 'Open' series.
# Each line carries its own label so the legend cannot drift out of sync with the
# plotting order (the positional legend list had the two labels swapped).
plt.plot(df7['Open'].rolling(150).mean(), label='Open rolling 150')
plt.plot(df7['Open'], label='Open')
plt.legend()
plt.show()
In [ ]: